In [1]:
# import statements
import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from lxml import html
from bs4 import BeautifulSoup
import requests
import time
import re

In [2]:
# Load the YouTube video table (Title, Links, VideoLink columns) and preview its first two rows
df = pd.read_csv(
    filepath_or_buffer="../../data/youtube_data/brandy_melville_youtube_data.csv",
)
df.head(n=2)

Unnamed: 0,Title,Links,VideoLink
0,VLOG: spring disney display + brandy melville ...,https://bit.ly/3udRW3H\nhttps://www.amazon.com...,https://www.youtube.com/watch?v=ZOAabQAwk1Y
1,HUGE $400 Brandy Melville Try On Haul (So many...,https://us.brandymelville.com/products/christy...,https://www.youtube.com/watch?v=OvxqmlWQNYQ


In [44]:
# initial function
def scrape_brandy(url):
    """Scrape the title and description text of a Brandy Melville product page.

    The page is loaded in headless Chrome and its HTML is parsed with
    Beautiful Soup.

    Args:
        url: Address of the page to load.

    Returns:
        A ``(title, description)`` tuple of strings when the page has both an
        ``<h1 class="product__title">`` and a
        ``<div class="product__description rte">``. The description is
        normalised so that each "Fabrics:", "Measurement(s):" and "Made in:"
        label is preceded by exactly one space.
        The string ``"Data element not found on the page."`` when either
        element is missing (e.g. the link is not a product page).
        The string ``"An error occurred: <message>"`` when anything raises
        while loading or parsing; errors are returned, never raised.
    """
    try:
        # Set Chrome options for headless mode
        chrome_options = Options()
        chrome_options.add_argument("--headless")

        # The context manager shuts the browser down on exit, so no explicit
        # driver.quit() is needed inside the block.
        with webdriver.Chrome(options=chrome_options) as driver:
            driver.get(url)
            page_source = driver.page_source

        # Parse the page source with Beautiful Soup
        soup = BeautifulSoup(page_source, "html.parser")
        data_element = soup.find("div", class_="product__description rte")
        title_element = soup.find("h1", class_="product__title")

        # Check both lookups before touching .text: on non-product pages they
        # are None, and dereferencing them would surface as a confusing
        # "'NoneType' object has no attribute 'text'" error string.
        if data_element is None or title_element is None:
            return "Data element not found on the page."

        title = title_element.text.strip()
        data_text = data_element.text.replace('\xa0', ' ').strip()

        # Put a single space in front of every section label. The labels are
        # matched case-insensitively and with an optional plural "s" because
        # the site uses "Measurement:", "Measurements:", "Made in:" and
        # "Made In:" interchangeably.
        data_text = re.sub(
            r'\s*((?:Fabrics|Measurements?|Made in):)',
            r' \1',
            data_text,
            flags=re.IGNORECASE,
        ).strip()
        return title, data_text

    except Exception as e:
        return f"An error occurred: {str(e)}"

# Smoke-test the scraper on one known product page and show what it returns
url = "https://us.brandymelville.com/products/chloe-radio-silence-top-3"
data = scrape_brandy(url=url)
print(data)

('Chloe Radio Silence Top', 'Soft and lightweight crewneck cotton top in black with the Radio Silence graphic on the front in white. Fabrics: 100% cotton Measurement: 19" (48cm) length, 17" (43cm) bust Made in: Europe')


In [10]:
# Try the scraper on every link stored in the second row (index 1) of the table
top_row = df.iloc[1]
links_list = top_row["Links"].split('\n')

for link in links_list:
    scraped_data = scrape_brandy(link)
    if scraped_data:
        # One blank line after each result keeps consecutive outputs apart
        print(scraped_data, end="\n\n")
    # Pause between requests
    time.sleep(3)

Data element not found on the page.

Bra style tank top with a crossover v-neckline, and adjustable strapsAll intimates are final sale.Fabrics: 96% cotton, 4% elastaneMeasurements: 10" (25 cm) length, 12" (30 cm) bust Made in: ﻿Europe

Bra style tank top with a crossover v-neckline, and adjustable strapsAll intimates are final sale.Fabrics: 96% cotton, 4% elastaneMeasurements: 10" (25 cm) length, 12" (30 cm) bust Made in: ﻿Europe

Soft, striped, cotton shorts with an elastic waistband and three buttons.

Fabrics: 96% cotton, 4% elastane


Measurement: 8" (20 cm) rise, 2.5" (6 cm) inseam, 26" (66 cm) waist (stretches)


Made in: Europe

Soft, striped, cotton shorts with an elastic waistband and three buttons.

Fabrics: 96% cotton, 4% elastane


Measurement: 8" (20 cm) rise, 2.5" (6 cm) inseam, 26" (66 cm) waist (stretches)


Made in: Europe

Data element not found on the page.

Soft cotton blend sweatpants with elastic waistband, side pockets, and elastic cuffs.

Fabrics: 75% cotton, 25

In [77]:
# works!!! for items with singular component
import re
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup

def scrape_brandy(url):
    """Scrape a product page and return its fabric composition as a dict.

    The page is loaded in headless Chrome and parsed with Beautiful Soup. The
    description text is printed as a side effect.

    Only a flat fabric list directly after "Fabrics:" is understood, e.g.
    "Fabrics: 96% cotton, 4% elastane". A component-prefixed list such as
    "Fabrics: Shell 100% Viscose, Lining 100% Cotton" produces no fabric keys.

    Args:
        url: Address of the page to load.

    Returns:
        ``{'item': <title>, <fabric>: <percent>, ...}`` with lower-cased fabric
        names and integer percentages when the page has both the product
        title and the product description element.
        The string ``"Data element not found on the page."`` when either
        element is missing.
        The string ``"An error occurred: <message>"`` when anything raises;
        errors are returned, never raised.
    """
    try:
        # Set Chrome options for headless mode
        chrome_options = Options()
        chrome_options.add_argument("--headless")

        # Initialize the WebDriver; the context manager closes the browser
        with webdriver.Chrome(options=chrome_options) as driver:
            driver.get(url)
            page_source = driver.page_source

        # Parse the page source with Beautiful Soup
        soup = BeautifulSoup(page_source, "html.parser")
        data_element = soup.find("div", class_="product__description rte")
        title_element = soup.find("h1", class_="product__title")

        # Guard before using .text: on non-product pages both lookups are
        # None, which previously surfaced as a misleading
        # "'NoneType' object has no attribute 'text'" error string.
        if data_element is None or title_element is None:
            return "Data element not found on the page."

        title = title_element.text.strip()

        # Extract text content and remove HTML tags
        data_text = data_element.get_text(separator=' ', strip=True)
        print(data_text)

        final_data = {'item': title}

        # Capture the run of "<number> [%] <word>[,]" entries that directly
        # follows the "Fabrics:" label.
        fabric_match = re.search(r'Fabrics:\s*((?:\d+\s*%?\s*\w+\s*,?\s*)+)\s*', data_text, re.IGNORECASE)
        if fabric_match:
            fabric_string = fabric_match.group(1)
            for percent, fabric in re.findall(r'(\d+)\s*%?\s*(\w+)', fabric_string):
                final_data[fabric.lower()] = int(percent)

        return final_data

    except Exception as e:
        return f"An error occurred: {str(e)}"

# Try the scraper on a product whose fabric list is split into components (shell / lining)
url = "https://us.brandymelville.com/products/nadia-skirt?pr_prod_strat=jac&pr_rec_id=4c9989488&pr_rec_pid=7282719391953&pr_ref_pid=5605883871396&pr_seq=uniform"
data = scrape_brandy(url=url)
print(data)


Long, flowy, prairie skirt with a tie waist and ruffle seams. Fabrics: Shell 100% Viscose, Lining 100% Cotton Measurements: 36" (91 cm) length, 28" (71 cm) waist (stretches) Made in: China
{'item': 'Nadia Skirt'}


# USE THIS NEXT CODE CHUNK AS AN EXAMPLE OF WHAT I WANT TO DO!

In [8]:
## WORKS FOR ITEMS WITH MULTIPLE COMPONENTS!!! 
def _extract_materials(description):
    """Return ``{fabric: percent}`` parsed from the "Fabrics:" section of a description."""
    materials = {}
    # Split the text into "Label: content" sections; each content runs lazily
    # up to the next "word:" label or the end of the text.
    sections = re.findall(r'(\w+):\s*(.*?)\s*(?=(?:\w+:)|$)', description, re.DOTALL)
    for label, content in sections:
        if label.lower() != 'fabrics':
            continue
        # The number may carry a decimal part and the fabric name is letters
        # only, so "97.5% cotton" is read as one entry instead of being split
        # into digit fragments.
        for percent, fabric in re.findall(r'(\d+(?:\.\d+)?)\s*%?\s*([^\W\d_]+)', content):
            materials[fabric.lower()] = float(percent) if '.' in percent else int(percent)
    return materials


def scrape_brandy_done(url):
    """Scrape a product page and return its fabric composition as a dict.

    The page is loaded in headless Chrome and parsed with Beautiful Soup. The
    description text is printed as a side effect. Component-prefixed fabric
    lists ("Shell 100% Viscose, Lining 100% Cotton") are supported; component
    names are dropped, and a fabric that appears in several components keeps
    the last percentage seen.

    Args:
        url: Address of the page to load.

    Returns:
        ``{'item': <title>, <fabric>: <percent>, ...}`` with lower-cased fabric
        names when the page has both the product title and the product
        description element. Percentages are ``int``, or ``float`` when the
        page states a decimal value.
        The string ``"Data element not found on the page."`` when either
        element is missing (e.g. the link is not a product page).
        The string ``"An error occurred: <message>"`` when anything raises;
        errors are returned, never raised.
    """
    try:
        # Set Chrome options for headless mode
        chrome_options = Options()
        chrome_options.add_argument("--headless")

        # Initialize the WebDriver; the context manager closes the browser
        with webdriver.Chrome(options=chrome_options) as driver:
            driver.get(url)
            page_source = driver.page_source

        # Parse the page source with Beautiful Soup
        soup = BeautifulSoup(page_source, "html.parser")
        data_element = soup.find("div", class_="product__description rte")
        title_element = soup.find("h1", class_="product__title")

        # Guard before using .text: on non-product pages both lookups are
        # None, which previously surfaced as a misleading
        # "'NoneType' object has no attribute 'text'" error string.
        if data_element is None or title_element is None:
            return "Data element not found on the page."

        # Extract text content and remove HTML tags
        data_text = data_element.get_text(separator=' ', strip=True)
        print(data_text)

        # Construct the final dictionary
        final_data = {'item': title_element.text.strip()}
        final_data.update(_extract_materials(data_text))
        return final_data

    except Exception as e:
        return f"An error occurred: {str(e)}"

# Same multi-component product page, now run through the multi-component scraper
url = "https://us.brandymelville.com/products/nadia-skirt?pr_prod_strat=jac&pr_rec_id=4c9989488&pr_rec_pid=7282719391953&pr_ref_pid=5605883871396&pr_seq=uniform"

data = scrape_brandy_done(url=url)
print(data)


Long, flowy, prairie skirt with a tie waist and ruffle seams. Fabrics: Shell 100% Viscose, Lining 100% Cotton Measurements: 36" (91 cm) length, 28" (71 cm) waist (stretches) Made in: China
{'item': 'Nadia Skirt', 'viscose': 100, 'cotton': 100}


In [7]:
# You don't have to do this; it is only for getting the links from the CSV
def scrape_and_update(row, delay=3):
    """Scrape every link listed in a row's "Links" field and combine the results.

    Intended for ``df.apply(scrape_and_update, axis=1)``.

    Args:
        row: Mapping or Series with a "Links" entry holding newline-separated
            URLs.
        delay: Seconds to sleep after each scraped link. Defaults to 3.

    Returns:
        One line per scraped link (``str(result)`` followed by a newline),
        concatenated in link order. An empty string when "Links" is not a
        string (e.g. NaN for an empty CSV cell) or holds no non-blank lines.
    """
    raw_links = row["Links"]
    # A missing value arrives as NaN/None, which has no .split(); treat it as
    # "no links" so one empty cell does not abort the whole apply().
    if not isinstance(raw_links, str):
        return ""

    scraped_parts = []
    for link in raw_links.split('\n'):
        link = link.strip()
        # Skip blank lines so no browser is launched on an empty URL
        if not link:
            continue
        print(f"Scraping data from link: {link}")
        scraped_data = scrape_brandy_done(link)
        if scraped_data:
            scraped_parts.append(f"{scraped_data}\n")
        time.sleep(delay)

    return "".join(scraped_parts)

In [91]:
# Scrape every row's links and store the combined text in a new "ScrapedData" column
df["ScrapedData"] = df.apply(func=scrape_and_update, axis="columns")

Scraping data from link: https://bit.ly/3udRW3H
Scraping data from link: https://www.amazon.com/shop/cassiscastle?ref=ac_inf_hm_vp
Scraping data from link: https://us.brandymelville.com/products/christy-miami-hoodie?variant=41395820495057
Scraping data from link: https://us.brandymelville.com/products/polina-bra-top?variant=41447319077073
Bra style tank top with a crossover v-neckline, and adjustable straps All intimates are final sale. Fabrics: 96 % cotton, 4% elastane Measurements: 10 " (25 cm) length, 12" (30 cm) bust Made in: ﻿ Europe
Scraping data from link: https://us.brandymelville.com/products/polina-bra-top?variant=41480992129233
Bra style tank top with a crossover v-neckline, and adjustable straps All intimates are final sale. Fabrics: 96 % cotton, 4% elastane Measurements: 10 " (25 cm) length, 12" (30 cm) bust Made in: ﻿ Europe
Scraping data from link: https://us.brandymelville.com/products/keira-stripe-shorts?variant=41314058797265
Soft, striped, cotton shorts with an elast

In [92]:
# Save the table, including the scraped column, without writing the row index
df.to_csv(path_or_buf='brandy_materials.csv', index=False)

In [107]:
# Repeat the scrape on the second YouTube table
df_small = pd.read_csv(
    filepath_or_buffer="../../data/youtube_data/brandy_youtube_data.csv",
)

df_small["ScrapedData"] = df_small.apply(func=scrape_and_update, axis="columns")


Scraping data from link: https://open.spotify.com/user/1zp0jj4ii37zggvjy6wavhawl?si=1ce7025ca4d0428b
Scraping data from link: http://bit.ly/itsrlife
Scraping data from link: https://us.brandymelville.com/products/copy-of-bonnie-top-july-2022-ok?variant=41127180107985
Cotton blend off the shoulder long-sleeve top with a raw hem. Fabrics: 96% cotton, 4% elastane Measurement: 19"(48 cm) length, 18"(46 cm)  bust Made in: Italy
Scraping data from link: https://us.brandymelville.com/products/hailie-basic-top-2
Regular fit, basic, short sleeve, cotton top with a crewneck collar. Fabrics: 100% cotton Measurement: 20" (51 cm) length, 16" (41 cm) bust Made In: China
Scraping data from link: https://us.brandymelville.com/products/black-radio-silence-graphic-thin-shirt?pr_prod_strat=use_description&pr_rec_id=a7a338f07&pr_rec_pid=6552046764241&pr_ref_pid=5833669443793&pr_seq=uniform
Scraping data from link: https://us.brandymelville.com/products/copy-of-brianna-cotton-thick-stripe-sweater-july-2022

In [108]:
# Save the second table, including the scraped column, without writing the row index
df_small.to_csv(path_or_buf="brandy_small_materials.csv", index=False)