# Setup

In [None]:
%pip install -q -U google-generativeai

In [None]:
%pip install requests-html

Collecting requests-html
  Downloading requests_html-0.10.0-py3-none-any.whl.metadata (15 kB)
Collecting pyquery (from requests-html)
  Downloading pyquery-2.0.1-py3-none-any.whl.metadata (9.0 kB)
Collecting fake-useragent (from requests-html)
  Downloading fake_useragent-2.0.3-py3-none-any.whl.metadata (17 kB)
Collecting parse (from requests-html)
  Downloading parse-1.20.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting bs4 (from requests-html)
  Downloading bs4-0.0.2-py2.py3-none-any.whl.metadata (411 bytes)
Collecting w3lib (from requests-html)
  Downloading w3lib-2.3.1-py3-none-any.whl.metadata (2.3 kB)
Collecting pyppeteer>=0.0.14 (from requests-html)
  Downloading pyppeteer-2.0.0-py3-none-any.whl.metadata (7.1 kB)
Collecting appdirs<2.0.0,>=1.4.3 (from pyppeteer>=0.0.14->requests-html)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting pyee<12.0.0,>=11.0.0 (from pyppeteer>=0.0.14->requests-html)
  Downloading pyee-11.1.1-py3-none-any.whl.metadata (2.8

In [None]:
!pip install lxml_html_clean

Collecting lxml_html_clean
  Downloading lxml_html_clean-0.4.1-py3-none-any.whl.metadata (2.4 kB)
Downloading lxml_html_clean-0.4.1-py3-none-any.whl (14 kB)
Installing collected packages: lxml_html_clean
Successfully installed lxml_html_clean-0.4.1


In [None]:
from vertexai.generative_models import Image
from glob import glob
import re
import pandas as pd
import lxml_html_clean
from requests_html import HTMLSession, AsyncHTMLSession
import nest_asyncio
from bs4 import BeautifulSoup
import requests

# nest_asyncio patches asyncio to allow nested (re-entrant) event loops.
# NOTE(review): presumably needed because the notebook already runs its own loop
# while the scraping cells drive AsyncHTMLSession - confirm it is still required.
nest_asyncio.apply()

In [None]:
# Log in: authenticate this session with Google when running inside Colab.
# The sys.modules check makes the cell a no-op when google.colab is not loaded.

import sys

if "google.colab" in sys.modules:
    from google.colab import auth

    auth.authenticate_user()

In [None]:
# Obtain the stored keys via google.colab.userdata.
# NOTE(review): unlike the login cell, this import is not guarded, so the cell only runs inside Colab.
# PROJECT_ID is passed to vertexai.init; API_KEY is not referenced by any other visible cell.

from google.colab import userdata
API_KEY = userdata.get('GOOGLE_API_KEY')
PROJECT_ID = userdata.get('GOOGLE_PROJ_ID1')


In [None]:
# Understand images: set up Vertex AI and the Gemini model that reads the swatch photos.
# Need to log in first (run the login cell that calls auth.authenticate_user()).

import vertexai
from vertexai.generative_models import GenerativeModel, Part, GenerationConfig

vertexai.init(project=PROJECT_ID, location="us-central1")

model = GenerativeModel("gemini-1.5-flash-002")

In [None]:
# Mount Google Drive at /content/drive so that we can access the photos.
# Open question: not sure whether this works directly from the shared folder,
# or whether the photos need to be in our own drives.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## TODO: Set folder and brand

In [None]:
# TODO: Find the correct folder where the photos are stored and replace the folder name here.
# You may need to make a shortcut in your drive to the shared folder to access photos there.
# Keep the trailing '/': later cells build paths by plain string concatenation with `folder`.
folder = '/content/drive/MyDrive/photos1/new/behr/'

In [None]:
# TODO: Set which set of colors you're tagging ("valspar" or "behr").
# It is likely not possible for Gemini to tell the brands apart on its own,
# and the brand decides which site the RGB values are scraped from.
# brand = "valspar"
brand = "behr"

# Extract color names with Gemini

In [None]:
# Empty table that the extraction loop fills with one row per photo:
# the photo path, the name and code Gemini read, and a filename built from them.
file_mapping = pd.DataFrame(columns=['original_filename', 'color_name', 'color_code', 'new_filename'])

In [None]:
import time

# The prompt is the same for every photo, so build it once outside the loop.
# TODO: probably should test if this can detect color code and name when the image is turned in any way
prompt = "Detect the name and color code for this paint swatch. The color code is the string of numbers and/or letters underneath the name. Give me just the name and color code as strings with single quotes, comma-separated. Nothing else."

# Have Gemini find the color name and code for every .jpg in `folder`.
# sorted() keeps the row order reproducible between runs; glob's own order is arbitrary.
for i, image in enumerate(sorted(glob(folder + '*.jpg'))):
  image_part = Part.from_image(Image.load_from_file(image))

  # Query the model
  response = model.generate_content([image_part, prompt])

  # Collect the single-quoted strings from the reply: name first, then code.
  # The lazy pattern pairs quotes in order, so an apostrophe inside a color name
  # throws the pairing off - spot check such rows in the table afterwards.
  parts = re.findall(r"'(.*?)'", response.text)

  if len(parts) >= 2:
    color_name, color_code = parts[0], parts[1]
    file_mapping.loc[i] = [image, color_name, color_code, f"{color_name.replace(' ', '-')}_{color_code}"]
  else:
    # A reply without two quoted strings used to raise IndexError and abort the
    # whole run; report the photo and carry on with the rest instead.
    print(f"Skipping {image}: could not parse a name and code from {response.text!r}")

  # Pause between requests (presumably to stay under the API rate limit).
  time.sleep(2)


In [None]:
# Check that it looks okay - the most likely error is a misread color code
file_mapping

Unnamed: 0,original_filename,color_name,color_code,new_filename
0,/content/drive/MyDrive/photos1/new/behr/202503...,New Bamboo,PPU10-04P,New-Bamboo_PPU10-04P
1,/content/drive/MyDrive/photos1/new/behr/202503...,I Pink I Can,P140-4M,I-Pink-I-Can_P140-4M
2,/content/drive/MyDrive/photos1/new/behr/202503...,Luck of the Irish,P380-78,Luck-of-the-Irish_P380-78
3,/content/drive/MyDrive/photos1/new/behr/202503...,Sultana,P100-7,Sultana_P100-7
4,/content/drive/MyDrive/photos1/new/behr/202503...,Tropics,P450-60,Tropics_P450-60
...,...,...,...,...
138,/content/drive/MyDrive/photos1/new/behr/202503...,The Peak Teal,2119-40,The-Peak-Teal_2119-40
139,/content/drive/MyDrive/photos1/new/behr/202503...,Little Princess,P130-3u,Little-Princess_P130-3u
140,/content/drive/MyDrive/photos1/new/behr/202503...,Bella Vista,P470-6,Bella-Vista_P470-6
141,/content/drive/MyDrive/photos1/new/behr/202503...,Salamander,P370-6,Salamander_P370-6


In [None]:
# Save this first mapping if desired; for now it is written to the same folder as the images
file_mapping.to_csv(f'{folder}file_mapping.csv', index=False)

# Scrape RGB values

In [None]:
# get colors for the Behr site, which loads the colors dynamically with javascript after the rest of the page
async def get_behr_rgb(color_code):
    """Scrape the R, G and B values shown on the Behr detail page for one color.

    Args:
        color_code: Color code appended to the ColorDetailView URL.

    Returns:
        A tuple ``(R, G, B)`` holding the text found in the page's ``rgb_R``,
        ``rgb_G`` and ``rgb_B`` spans, or ``(None, None, None)`` when the page
        cannot be fetched, rendered or parsed. Failures are printed, not raised.
    """
    site_stem = "https://www.behr.com/consumer/ColorDetailView/"
    url = site_stem + color_code
    session = AsyncHTMLSession()
    try:
        try:
            response = await session.get(url)
            await response.html.arender(sleep=1, timeout=20)  # Render JavaScript (adjust sleep and timeout as needed)
            raw_html = response.html.raw_html
        finally:
            # Close the session even when get/arender fails, so it is not leaked.
            await session.close()

        soup = BeautifulSoup(raw_html, "html.parser")

        def channel_text(css_class):
            # Fail with a readable message instead of "'NoneType' object has no
            # attribute 'contents'" when the page has no such span.
            span = soup.find("span", class_=css_class)
            if span is None:
                raise ValueError(f"no span with class '{css_class}' found at {url}")
            return span.contents[1].text

        R_val = channel_text("rgb_R")
        G_val = channel_text("rgb_G")
        B_val = channel_text("rgb_B")

        return R_val, G_val, B_val

    except Exception as e:
        # Best effort: report the problem and let the caller record empty values.
        print(f"An error on color {color_code} occurred: {e}")
        return None, None, None

In [None]:
async def get_valspar_rgb(color_name):
  """Scrape the R, G and B values for one Valspar color, looked up by name.

  The color is first searched for on the Valspar site; the first color result's
  link is then fetched and its RGB info box is parsed.

  Args:
      color_name: Color name used as the search query.

  Returns:
      A tuple ``(r, g, b)`` of stripped strings taken from the comma-separated
      RGB text on the color page, or ``(None, None, None)`` when any step fails.
      Failures are printed, not raised.
  """
  # Local import so this cell does not depend on an edit to the imports cell.
  from urllib.parse import quote_plus

  site_stem = "https://www.valspar.com"
  # quote_plus still turns spaces into '+', and additionally escapes characters
  # such as '&' or '#' that would otherwise corrupt the query string.
  color_query = quote_plus(color_name)

  session = AsyncHTMLSession()
  try:
    # valspar site includes the color category (e.g. red, blue, etc.) in url
    # so need to search for the color first to get the right link without just guessing the color category
    # since we have the exact name, we're assuming that will be enough to get the correct color first try - spot check them though
    try:
      response = await session.get(f"{site_stem}/en/search?q={color_query}")
      await response.html.arender(sleep=1, timeout=20)  # Render JavaScript (adjust sleep and timeout as needed)
      raw_html = response.html.raw_html
    finally:
      # Close the session even when get/arender fails, so it is not leaked.
      await session.close()

    # parse search results to get the actual color page link, then get the rgb vals
    soup = BeautifulSoup(raw_html, "html.parser")
    swatch = soup.find("div", class_="cbg-cmp-searchresult--color cbg-cmp-card__swatch-container swatch-container")
    if swatch is None:
      raise ValueError("no color result found in the search page")
    link = swatch.find("a", href=True)
    if link is None:
      raise ValueError("search result has no link to a color page")
    color_path = link["href"]

    # NOTE: this is a blocking call inside an async function; the timeout keeps a
    # stalled connection from hanging the whole run.
    page = requests.get(f"{site_stem}{color_path}", timeout=20)
    page.raise_for_status()
    soup = BeautifulSoup(page.content, "html.parser")

    rgb_box = soup.find("div", class_="info-box rgb")
    rgb_value = rgb_box.find("span", class_="info-value") if rgb_box is not None else None
    if rgb_value is None:
      raise ValueError(f"no RGB value found at {site_stem}{color_path}")
    rgb = rgb_value.text
    # Unpacking raises (and is reported below) unless exactly three values are present.
    r, g, b = (val.strip() for val in rgb.strip().split(','))

    return r, g, b
  except Exception as e:
    # Best effort: report the problem and let the caller record empty values.
    print(f"An error on color {color_name} occurred: {e}")
    return None, None, None

In [None]:
# Empty table for the scraping loop's results: the columns of file_mapping plus
# the verified code and the R, G, B values (new_filename is rebuilt to include them).
new_file_mapping = pd.DataFrame(columns=['original_filename', 'color_name', 'color_code', 'verified_code', 'R', 'G', 'B', 'new_filename'])

In [None]:
from collections import defaultdict
color_counts = defaultdict(int)

for i, row in file_mapping.iterrows():
  # if row['color_name'] in new_file_mapping['color_name'].values:
  #   # skip colors that have already been done, in case of any errors that stop it

  #   continue

  # TODO: could do something to pull the filename w/ color
  if brand.lower() == "valspar":
    verified_code = row['color_code']
    R_val, G_val, B_val = await get_valspar_rgb(row['color_name'])

  elif brand.lower() == "behr":
    # gemini had some issues before with the behr color codes
    # this helped catch errors before trying to request the site
    code_regex = r"[A-Z]\d{3}-\d"
    verified_code = re.search(code_regex, row['color_code'])
    if verified_code:
      verified_code = verified_code.group(0)
      R_val, G_val, B_val = await get_behr_rgb(verified_code)
    else:
      # no matching color code, so can't get to the site
      R_val, G_val, B_val = None, None, None
  else:
    print("Error: brand must be either 'valspar' or 'behr'")

  color_counts[row['color_name']] += 1
  # TODO: Change the naming convention here
  new_filename = f"{str.replace(row['color_name'], ' ', '-')}_{verified_code}_{R_val}_{G_val}_{B_val}_{color_counts[row['color_name']]}"
  new_file_mapping.loc[i] = [row['original_filename'], row['color_name'], row['color_code'], verified_code, R_val, G_val, B_val, new_filename]


An error on color P450-8 occurred: 'NoneType' object has no attribute 'contents'


In [None]:
# Inspect the result: empty R/G/B values mark rows whose code failed validation
# or whose scrape reported an error.
new_file_mapping

Unnamed: 0,original_filename,color_name,color_code,verified_code,R,G,B,new_filename
0,/content/drive/MyDrive/photos1/new/behr/202503...,New Bamboo,PPU10-04P,,,,,New-Bamboo_None_None_None_None_1
1,/content/drive/MyDrive/photos1/new/behr/202503...,I Pink I Can,P140-4M,P140-4,212,127,141,I-Pink-I-Can_P140-4_212_127_141_1
2,/content/drive/MyDrive/photos1/new/behr/202503...,Luck of the Irish,P380-78,P380-7,84,120,57,Luck-of-the-Irish_P380-7_84_120_57_1
3,/content/drive/MyDrive/photos1/new/behr/202503...,Sultana,P100-7,P100-7,103,70,104,Sultana_P100-7_103_70_104_1
4,/content/drive/MyDrive/photos1/new/behr/202503...,Tropics,P450-60,P450-6,0,155,142,Tropics_P450-6_0_155_142_1
...,...,...,...,...,...,...,...,...
138,/content/drive/MyDrive/photos1/new/behr/202503...,The Peak Teal,2119-40,,,,,The-Peak-Teal_None_None_None_None_2
139,/content/drive/MyDrive/photos1/new/behr/202503...,Little Princess,P130-3u,P130-3,230,170,193,Little-Princess_P130-3_230_170_193_6
140,/content/drive/MyDrive/photos1/new/behr/202503...,Bella Vista,P470-6,P470-6,0,142,154,Bella-Vista_P470-6_0_142_154_6
141,/content/drive/MyDrive/photos1/new/behr/202503...,Salamander,P370-6,P370-6,137,164,87,Salamander_P370-6_137_164_87_6


In [None]:
# Save the final mapping to the same folder as the images
new_file_mapping.to_csv(f'{folder}new_file_mapping.csv', index=False)

In [None]:
import os

# rename files if desired
# Safe to re-run: photos that are already gone (e.g. renamed by an earlier run) are
# skipped instead of raising FileNotFoundError, and an existing target is never
# replaced (os.rename can silently overwrite an existing file on POSIX systems).
for i, row in new_file_mapping.iterrows():
  source_path = row['original_filename']
  target_path = f"{folder}{row['new_filename']}.jpg"

  if not os.path.exists(source_path):
    print(f"Skipping {source_path}: file not found (already renamed?)")
    continue
  if os.path.exists(target_path):
    print(f"Skipping {source_path}: {target_path} already exists")
    continue

  os.rename(source_path, target_path)