# Scraping Products from INCIDecoder

In [3]:
import requests
from bs4 import BeautifulSoup
import re
import json

In [4]:
# Paths under /products/ that are site actions rather than product pages.
exclude_links = {"/products/new", "/products/create"}

# Accumulates relative product paths across every scraping cell below.
all_product_links = set()

## Scrape product links one listing page (or search query) at a time

In [52]:
# Listing page to scrape; change the number and re-run this cell to accumulate
# links from further pages into all_product_links.
page = 3
url = f"https://incidecoder.com/products?page={page}"
# A timeout keeps the cell from hanging forever if the server stops responding.
r = requests.get(url, timeout=30)
soup = BeautifulSoup(r.text, "html.parser")
product_links = soup.find_all('a')
for a in product_links:
    href = a.get("href")
    # Anchors without an href attribute yield None, which has no .startswith().
    if not href:
        continue
    if href.startswith("/products/") and href not in exclude_links and "discontinued" not in href:
        all_product_links.add(href)

In [294]:
# Scrape the results of a search query to collect additional product links;
# change the query string and re-run this cell to accumulate more.

url = "https://incidecoder.com/search?query=exfoliant"
# A timeout keeps the cell from hanging forever if the server stops responding.
r = requests.get(url, timeout=30)
soup = BeautifulSoup(r.text, "html.parser")
div = soup.find("div", class_="search-content")
# soup.find returns None when the page has no search-content container
# (e.g. an error page), so fall back to an empty result list.
product_links = div.find_all('a') if div is not None else []
for a in product_links:
    href = a.get("href")
    # Anchors without an href attribute yield None, which has no .startswith().
    if not href:
        continue
    if href.startswith("/products/") and href not in exclude_links and "discontinued" not in href:
        all_product_links.add(href)

In [295]:
# Number of unique product paths collected so far across all scraping cells.
print(len(all_product_links))

6010


In [296]:
# Display the collected product paths (the whole set is rendered as cell output).
all_product_links

{'/products/la-mer-the-intensive-revitalizing-mask',
 '/products/drgl-toner-sensitive',
 '/products/uni-love-vegan-baby-cream',
 '/products/baby-boo-bamboo-natural-baby-lotion',
 '/products/the-ordinary-caffeine-solution-5-egcg',
 '/products/dr-zenovia-skincare-10-benzoyl-peroxide-acne-cleanser',
 '/products/wiley-body-hand-body-wash',
 '/products/bio-essence-bio-water-foamy-cleanser',
 '/products/olaplex-shampoo',
 '/products/olay-prox-spot-fading-essence',
 '/products/lemon-bottle-',
 '/products/lacura-foaming-cleanser',
 '/products/valuge-vansame-spray',
 '/products/loye-mineral-foundation',
 '/products/biosilk-silk-therapy-treatment',
 '/products/tula-mineral-magic',
 '/products/scinic-snail-aio-ampoule',
 '/products/elmore-deep-nourishment-glycerin-body-lotion',
 '/products/alverde-express-handcreme-bio-rosmarin-bio-zitronenmelisse',
 '/products/bee-naturals-nail-cuticle-balm',
 '/products/equate-beauty-foaming-scrub',
 '/products/epionce-melanolyte-pigment-perfection-serum',
 '/p

In [297]:
def _clean_description(raw_text):
    """Collapse whitespace and drop bracketed tokens such as [more] / [less]."""
    collapsed = " ".join(raw_text.split())
    without_tokens = re.sub(r"\[.*?\]", "", collapsed)
    # Removing a token can leave a double space behind, so collapse once more.
    return re.sub(r"\s+", " ", without_tokens).strip()


def _fetch_soup(url, timeout):
    """Download ``url`` and parse the response body with html.parser."""
    response = requests.get(url, timeout=timeout)
    return BeautifulSoup(response.text, "html.parser")


def scrape_each_product(product_links:list, timeout:float=30):
    """Scrape every product page and the ingredient-function pages it links to.

    Args:
        product_links: Iterable of site-relative product paths
            (e.g. "/products/olaplex-shampoo"). Records are numbered in
            iteration order, so pass an ordered sequence for stable docids.
        timeout: Seconds to wait on each HTTP request before requests raises
            a timeout error instead of blocking indefinitely.

    Returns:
        A tuple ``(all_products_json, ingredients_json)``:
        * ``all_products_json`` - one dict per link with ``docid`` (1-based)
          and ``product_url``, plus ``title``, ``brand``, ``desc`` and
          ``img_url`` for each corresponding element found on the page, and
          ``key_ingredient_func`` / ``other_ingredient_func`` lists of
          function ids when those sections are present.
        * ``ingredients_json`` - one dict per distinct function title with
          ``func_id`` (0-based, equal to its list index), ``title`` and
          ``desc`` (text of the first <p> on the function page, or "" when
          the page has none).
    """
    base_url = "https://incidecoder.com"
    # (element id on the product page, key used in the output record)
    field_by_element_id = (
        ("product-title", "title"),
        ("product-brand-title", "brand"),
        ("product-details", "desc"),
        ("product-main-image", "img_url"),
    )

    all_products_json = []
    ingredients_json = []
    # function title -> func_id, so each function page is fetched only once
    func_id_by_title = {}

    for docid, link in enumerate(product_links, start=1):
        url = f"{base_url}{link}"
        soup = _fetch_soup(url, timeout)
        prod_json = {"docid": docid, "product_url": url}

        for element_id, key in field_by_element_id:
            element = soup.find(id=element_id)
            if element is None:
                # Missing elements are simply left out of the record.
                continue

            if element_id == "product-main-image":
                product_img = element.find("img")
                # .get avoids a KeyError when the <img> carries no src attribute.
                prod_json[key] = product_img.get("src") if product_img is not None else None
            elif element_id == "product-details":
                # The description contains stray whitespace and [more]/[less] tokens.
                prod_json[key] = _clean_description(element.get_text())
            else:
                prod_json[key] = element.get_text().strip()

        # Ingredient functions, grouped on the page into titled sections.
        for block in soup.find_all("div", class_="ingredlist-by-function-block"):
            heading = block.find("h3")
            if not heading:
                continue
            section_title = heading.get_text(strip=True).lower()

            func_ids_list = []
            for a in block.find_all("a", class_="func-link"):
                func_url = a.get("href")
                if not func_url:
                    continue
                function = a.get_text(strip=True)

                # First time this function is seen: fetch its description page.
                if function not in func_id_by_title:
                    func_soup = _fetch_soup(f"{base_url}{func_url}", timeout)
                    first_paragraph = func_soup.find("p")
                    # find() returns None when the page has no <p>; store an
                    # empty description rather than crashing mid-scrape.
                    function_desc = (
                        first_paragraph.get_text(strip=True)
                        if first_paragraph is not None
                        else ""
                    )
                    func_id = len(ingredients_json)
                    func_id_by_title[function] = func_id
                    ingredients_json.append(
                        {"func_id": func_id, "title": function, "desc": function_desc}
                    )

                func_ids_list.append(func_id_by_title[function])

            if "key" in section_title:
                prod_json['key_ingredient_func'] = func_ids_list
            if "other" in section_title:
                prod_json['other_ingredient_func'] = func_ids_list

        all_products_json.append(prod_json)
    return all_products_json, ingredients_json
    

## Write to JSON Lines files (one JSON object per line)

In [298]:
def write_json(products, ingredients):
    """Dump products and ingredient functions to two JSON Lines files.

    Each record is serialised as one JSON object per line (non-ASCII
    characters kept as-is) into ``inci_products_new.jsonl`` and
    ``inci_ingredient_functions_new.jsonl`` in the current working directory.
    Existing files are overwritten.
    """
    outputs = (
        ("inci_products_new.jsonl", products),
        ("inci_ingredient_functions_new.jsonl", ingredients),
    )
    for path, records in outputs:
        with open(path, "w", encoding="utf-8") as out:
            out.writelines(
                json.dumps(record, ensure_ascii=False) + "\n" for record in records
            )

In [299]:
# Report how many unique product paths were collected by the cells above.
print("Scraping Product Links")

print("Statistics of Products Scraped")
print(f"Total Number of Products = {len(all_product_links)}")

Scraping Product Links
Statistics of Products Scraped
Total Number of Products = 6010


In [300]:
print("Building JSON products and ingredient functions")
# all_product_links is a set of strings, whose iteration order can change
# between kernel sessions; sorting makes the docid assigned to each product
# reproducible and passes the list the function signature asks for.
all_products, all_ingredients = scrape_each_product(sorted(all_product_links))

Building JSON products and ingredient functions


In [301]:
# Summarise the scraped records, then persist both collections to disk.
print("Statistics of Product JSON and Ingredients JSON")
print(f"Total Number of Products = {len(all_products)} (must be the same as number of products scraped)")
print(f"Total Number of Ingredient Functions = {len(all_ingredients)}")

print("Writing to JSON file")
write_json(all_products, all_ingredients)

Statistics of Product JSON and Ingredients JSON
Total Number of Products = 6010 (must be the same as number of products scraped)
Total Number of Ingredient Functions = 21
Writing to JSON file
