# Part 1: Get sustainability ratings of popular brands from Good On You's website

## Step 1: Create a list of brands

In [1]:
# Display names of the brands to look up in Good On You's directory.
# Names are written as a person would type them (spaces, '&', '+', apostrophes,
# accents); they are converted to URL form only when a brand page is requested.
brands = [
    'Princess Polly',
    'Brandy Melville',
    'Shein',
    'Nike',
    'Abercrombie & Fitch',
    'Amazon',
    'ASOS',
    'Forever 21', 
    'American Eagle',
    'Alo',
    'Reformation',
    'Acne Studios',
    'Alice + Olivia',
    'Sandy Liang',
    'Billabong',
    'Adidas',
    'Aritzia',
    'Uniqlo',
    'Area',
    'Balenciaga',
    'Bottega Veneta',
    'Brooks Brothers',
    'Burberry',
    'Chanel',
    'Coach',
    'Fendi',
    'Gucci',
    'Hermes',
    'Louis Vuitton',
    'Prada',
    'Ralph Lauren',
    'Saint Laurent',
    'Stella McCartney',
    'Telfar',
    'The Row',
    'Theory',
    'Tom Ford',
    'Tory Burch',
    'Valentino',
    '7 For All Mankind',
    "Arc'teryx",
    'aventura',
    'Banana Republic',
    'Boden',
    'Buck Mason',
    'Calvin Klein',
    'Carhartt',
    'Christy Dawn',
    'Columbia',
    'Cotopaxi',
    'Dickies',
    'Djerf Avenue',
    'Doen',
    'Edikted',
    'Everlane',
    'Faithfull the Brand',
    'Frankies Bikinis',
    'Girlfriend Collective',
    'Good American',
    'House of Sunny',
    'J.Crew',
    "Levi's",
    'Madewell',
    'Organic Basics',
    'Pact',
    'Patagonia',
    'prAna',
    'Quince',
    'RE/DONE',
    'Réalisation Par',
    'REI',
    'Sezane',
    'Spanx',
    'Summersalt',
    'tentree',
    'The North Face',
    'Tommy Hilfiger',
    'True Religion',
    'Wrangler',
    'Yes Friends',
    'Aeropostale',
    'Boohoo',
    'Cider',
    'Fashion Nova',
    'GUESS',
    'Hollister',
    'Hot Topic',
    'House of CB',
    'Mango',
    'Missguided',
    'Nasty Gal',
    'PacSun',
    'PrettyLittleThing',
    'Primark',
    'Romwe',
    'Temu',
    'Topshop',
    'Torrid',
    'Under Armour',
    "Victoria's Secret",
    'Yesstyle',
    'Ann Taylor',
    'Aerie', 
    'Garage',
    'Pink'
]

## Step 2: Create a function to get data (ratings and description) for each brand

In [2]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os

In [3]:
def _parse_subrating(raw_text: str, category: str):
    """Extract one sub-rating from the text of a rating element.

    The text is expected to contain the category name followed by the rating.

    Args:
        raw_text: full text of the rating element.
        category: category label that precedes the rating ('Planet', 'People' or 'Animals').

    Returns:
        '' when the rating is 'Not applicable', otherwise the integer that
        precedes the first space of the rating text.

    Raises:
        IndexError: if ``category`` does not occur in ``raw_text``.
        ValueError: if the rating text does not start with an integer.
    """
    rating = raw_text.split(category)[1]
    if rating == 'Not applicable':
        return ''
    return int(rating.split(' ')[0])


def scrape_brand_rating(brand: str, timeout: float = 10):
    """Scrape one brand's ratings and description from Good On You's directory.

    Args:
        brand: display name of the brand, e.g. 'Abercrombie & Fitch'.
        timeout: seconds to wait for the HTTP response before giving up.

    Returns:
        A dict with the keys 'brand', 'overall_rating' (1-5, or None when the
        label on the page is not one of the known labels), 'planet_score',
        'people_score', 'animals_score' (int, or '' when missing / not
        applicable) and 'description' ('' when missing).
        None when the brand page could not be fetched or parsed; a message
        explaining why is printed in that case.
    """
    url = 'https://directory.goodonyou.eco/brand/'

    # overall rating labels converted to numbers (scale of 1 to 5)
    numerical_scores = {
        'We avoid': 1,
        'Not good enough': 2,
        "It's a start": 3,
        'Good': 4,
        'Great': 5
    }

    # (old, new) replacements applied in this order to build the URL form of the name
    url_replacements = (
        (' ', '-'),    # replace spaces with dashes
        ('&', 'and'),  # replace '&' with 'and'
        ('+', '-'),    # replace '+' with '-'
        ("'", ''),     # remove apostrophes
        ('.', ''),     # remove periods
        ('/', ''),     # remove slashes
        ('é', 'e'),    # remove accent
    )

    try:
        # convert brand name to url format
        brand_converted = brand.lower()
        for old, new in url_replacements:
            brand_converted = brand_converted.replace(old, new)

        # get data on the brand's page; the timeout keeps one stalled request
        # from hanging the whole scraping loop
        response = requests.get(url + brand_converted, timeout=timeout)
        soup = BeautifulSoup(response.text, 'html.parser')

        # get overall rating; a page without this element is treated as
        # "brand not listed"
        overall_tag = soup.find('p', id='brand-rating')
        if overall_tag is None:
            print(f"{brand} is not in Good On You's directory")
            return None
        overall_rating = overall_tag.text.split(': ')[-1]

        # get subratings for planet, people, and animals
        # NOTE(review): these class names look auto-generated and may change
        # when the site is redeployed - confirm they still match
        rating_tags = soup.find_all('div', class_='id__RatingSingle-sc-12z6g46-9 ksJKxw')

        if not rating_tags:
            subratings = ['', '', '']
        else:
            # indexing (rather than zip) so a page with fewer than three
            # rating elements is reported as a parse failure
            subratings = [
                _parse_subrating(rating_tags[position].text, category)
                for position, category in enumerate(('Planet', 'People', 'Animals'))
            ]

        # get description/justification
        body = soup.find('div', class_='id__BodyText-sc-12z6g46-15 eUqrmK')
        text = '' if body is None else body.text

        # create a dictionary with the brand info
        return {
            'brand': brand,
            'overall_rating': numerical_scores.get(overall_rating),
            'planet_score': subratings[0],
            'people_score': subratings[1],
            'animals_score': subratings[2],
            'description': text
        }
    except requests.RequestException as err:
        # network problem: the brand may well exist, so do not claim otherwise
        print(f"Could not fetch Good On You's page for {brand}: {err}")
        return None
    except (AttributeError, IndexError, ValueError, TypeError) as err:
        # the page was fetched but did not have the expected structure
        print(f"Could not parse Good On You's page for {brand}: {err!r}")
        return None

## Step 3: Create a DataFrame, add brand information, and export it to a CSV file

In [4]:
def add_brand_to_dataset(brand: str, csv_path: str = '../data/brand_info.csv') -> 'pd.DataFrame | None':
    """Add one brand's Good On You data to the CSV dataset if it is not there yet.

    The dataset is read from ``csv_path`` (or started empty when the file does
    not exist). A brand that is already present is not scraped again.

    Args:
        brand: display name of the brand, matched exactly against the 'brand' column.
        csv_path: location of the CSV dataset; its parent directory is created if needed.

    Returns:
        The dataset as a DataFrame (including the new brand when one was
        added), or None when the brand is new but could not be scraped.
    """
    columns = ['brand', 'overall_rating', 'planet_score', 'people_score', 'animals_score', 'description']
    file_name = os.path.basename(csv_path)

    if os.path.exists(csv_path):
        brand_df = pd.read_csv(csv_path)
    else:
        brand_df = pd.DataFrame(columns=columns)

    # check the dataset first so an already-known brand costs no network request
    if (brand_df['brand'] == brand).any():
        print(f'{brand} is already in {file_name}')
        return brand_df

    brand_info = scrape_brand_rating(brand)
    if not brand_info:
        return None

    new_brand = pd.DataFrame([brand_info], columns=brand_df.columns)
    brand_df = pd.concat([brand_df, new_brand], ignore_index=True)

    # make sure the target folder exists before the first write
    parent_dir = os.path.dirname(csv_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    brand_df.to_csv(csv_path, index=False)
    print(f'{brand} has been added to {file_name}')

    return brand_df

In [5]:
# scrape every brand in the list; each call prints whether the brand was added,
# was already in the CSV file, or could not be scraped
for brand in brands:
    add_brand_to_dataset(brand)

Princess Polly has been added to brand_info.csv
Brandy Melville has been added to brand_info.csv
Shein has been added to brand_info.csv
Nike has been added to brand_info.csv
Abercrombie & Fitch has been added to brand_info.csv
Amazon has been added to brand_info.csv
ASOS has been added to brand_info.csv
Forever 21 has been added to brand_info.csv
American Eagle has been added to brand_info.csv
Alo has been added to brand_info.csv
Reformation has been added to brand_info.csv
Acne Studios has been added to brand_info.csv
Alice + Olivia has been added to brand_info.csv
Sandy Liang has been added to brand_info.csv
Billabong has been added to brand_info.csv
Adidas has been added to brand_info.csv
Aritzia has been added to brand_info.csv
Uniqlo has been added to brand_info.csv
Area has been added to brand_info.csv
Balenciaga has been added to brand_info.csv
Bottega Veneta has been added to brand_info.csv
Brooks Brothers has been added to brand_info.csv
Burberry has been added to brand_info.c

In [6]:
# a brand that is not in the CSV file yet can be added with a single call;
# the updated DataFrame is returned (None if the brand cannot be scraped)
brand_df = add_brand_to_dataset('Esprit')

Esprit has been added to brand_info.csv


In [7]:
# a brand that is already in the CSV file is not added a second time;
# the DataFrame read from the file is returned unchanged
brand_df = add_brand_to_dataset('Aritzia')

Aritzia is already in brand_info.csv


In [8]:
# display the DataFrame returned by the previous call
brand_df

Unnamed: 0,brand,overall_rating,planet_score,people_score,animals_score,description
0,Princess Polly,2,2.0,2.0,4.0,Our “Planet” rating evaluates brands based on ...
1,Brandy Melville,1,1.0,1.0,0.0,This brand provides insufficient relevant info...
2,Shein,1,1.0,1.0,2.0,Our “Planet” rating evaluates brands based on ...
3,Nike,3,3.0,3.0,2.0,Our “Planet” rating evaluates brands based on ...
4,Abercrombie & Fitch,2,2.0,2.0,2.0,Abercrombie & Fitch is owned by Abercrombie Ab...
...,...,...,...,...,...,...
100,Ann Taylor,2,,,,
101,Aerie,2,,,,
102,Garage,2,,,,
103,Pink,2,,,,


# Part 2: Get all brands from Good On You's website and create a dataset of their ratings

## Step 1: Get brands recommended by Good On You

In [10]:
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

# directory categories whose result pages are scraped for brand names and links
categories = [
    'tops',
    'dresses',
    'basics',
    'bottoms',
    'denim',
    'outerwear',
    'knitwear',
    'activewear',
    'sleepwear'
]

# maximum number of result cards read per category page
MAX_RESULTS_PER_CATEGORY = 100
# seconds to let the page load more results after each scroll
SCROLL_PAUSE_SECONDS = 2
# maximum seconds to wait for a single result card to become visible
ELEMENT_WAIT_SECONDS = 5

# NOTE: this rebinds `brands` (the list of brand names from Part 1) to a set of
# (brand name, brand page link) tuples; a set is used so a brand listed in
# several categories is stored once
brands = set()

# Chrome options. No arguments are added here, so a visible browser window is
# opened; to run without a window add e.g. chrome_options.add_argument('--headless')
chrome_options = Options()

# initialize WebDriver
driver = webdriver.Chrome(options=chrome_options)

# try/finally so the browser is closed even when scraping fails part-way
try:
    for category in categories:
        # open webpage
        driver.get('https://directory.goodonyou.eco/categories/' + category)

        # get initial height of the page
        last_height = driver.execute_script("return document.body.scrollHeight")

        while True:
            # scroll
            # source: https://stackoverflow.com/questions/73792388/how-to-scroll-to-the-bottom-of-the-page-with-selenium-python
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

            # give the page time to load further results
            time.sleep(SCROLL_PAUSE_SECONDS)

            # get new height of page
            new_height = driver.execute_script("return document.body.scrollHeight")

            # stop once the page no longer grows, i.e. the bottom is reached
            if new_height == last_height:
                break

            # update height
            last_height = new_height

        # the page is expected to show up to MAX_RESULTS_PER_CATEGORY results now
        for i in range(1, MAX_RESULTS_PER_CATEGORY + 1):
            # get xpath for each brand (the i-th result card)
            brand_xpath = f'//*[@id="__next"]/div/div[4]/div/div[2]/div/div/div[{i}]/div/div/div[2]/h5/a'

            # get brand element, waiting up to ELEMENT_WAIT_SECONDS for it to be visible
            try:
                brand_element = WebDriverWait(driver, ELEMENT_WAIT_SECONDS).until(
                    EC.visibility_of_element_located((By.XPATH, brand_xpath))
                )
            except TimeoutException:
                # no i-th result: the category has fewer results than the maximum
                # (or the page layout changed); keep what was collected and move on
                print(f'{category}: stopped after {i - 1} results')
                break

            # get brand name and link
            # source for link: https://stackoverflow.com/questions/54862426/python-selenium-get-href-value
            brand_name = brand_element.text
            brand_link = brand_element.get_attribute('href')

            brands.add((brand_name, brand_link))
finally:
    # close the WebDriver
    driver.quit()

## Step 2: Get data (ratings and description) from each brand

In [18]:
# create list of all recommended brands + their info
recommended = []
cols = ['brand', 'overall_rating', 'planet_score', 'people_score', 'animals_score', 'description']

# (brand name, reason) for every brand whose page could not be fetched or parsed
skipped = []

# overall rating labels converted to numbers (scale of 1 to 5)
numerical_scores = {
    'We avoid': 1,
    'Not good enough': 2,
    "It's a start": 3,
    'Good': 4,
    'Great': 5
}

# `brands` is expected to hold (brand name, brand page link) tuples
for brand_name, brand_href in brands:
    try:
        # get data on each brand's page; the timeout keeps one stalled request
        # from hanging the whole loop
        response = requests.get(brand_href, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')

        # get overall rating
        overall_rating = soup.find('p', id='brand-rating').text.split(': ')[-1]

        # get subratings for planet, people, and animals
        rating_tags = soup.find_all('div', class_='id__RatingSingle-sc-12z6g46-9 ksJKxw')

        subratings = []
        for position, category in enumerate(('Planet', 'People', 'Animals')):
            # remove category name from text
            rating = rating_tags[position].text.split(category)[1]

            # if there's a rating, convert to int (makes it easier to analyze later)
            if rating == 'Not applicable':
                subratings.append('')
            else:
                subratings.append(int(rating.split(' ')[0]))

        # get description/justification
        text = soup.find('div', class_='id__BodyText-sc-12z6g46-15 eUqrmK').text

        # add current brand's info (same order as `cols`) to overall list
        recommended.append([
            brand_name,
            numerical_scores.get(overall_rating),
            subratings[0],
            subratings[1],
            subratings[2],
            text,
        ])
    except requests.RequestException as err:
        # network problem: skip this brand but remember why
        skipped.append((brand_name, f'request failed: {err}'))
    except (AttributeError, IndexError, ValueError, TypeError) as err:
        # page is missing a rating/description element or has an unexpected format
        skipped.append((brand_name, f'unexpected page content: {err!r}'))

print(f'Scraped {len(recommended)} brands; skipped {len(skipped)} (see `skipped` for details)')

## Step 3: Create dataframe

In [19]:
# one row per scraped brand; each inner list of `recommended` supplies the
# values for the columns named in `cols`
recommended_df = pd.DataFrame(recommended, columns=cols)

# display the resulting DataFrame
recommended_df

Unnamed: 0,brand,overall_rating,planet_score,people_score,animals_score,description
0,Enfant Terrible,4,5,4,4,Enfant Terrible's environment rating is 'great...
1,milo+nicki,4,5,3,4,milo+nicki's environment rating is 'great'. It...
2,Birdsong,4,5,5,3,Birdsong's environment rating is 'great'. It u...
3,DAYWEARLAB,4,5,3,3,DAYWEARLAB's environment rating is 'great'. It...
4,LOVETRUST,4,4,3,5,LOVETRUST's environment rating is 'good'. It u...
...,...,...,...,...,...,...
449,Pour Les Femmes,3,2,4,4,Our “Planet” rating evaluates brands based on ...
450,Purusha People,4,5,4,,Purusha People's environment rating is 'great'...
451,Princesse tam.tam,3,3,3,,Princesse tam.tam is owned by Fast Retailing.O...
452,Ayten Gasson,4,5,4,,Our “Planet” rating evaluates brands based on ...


## Step 4: Export to a CSV file

In [20]:
# this CSV file is saved in the 'data' folder (path is relative to the
# notebook's working directory); the DataFrame index is not written
recommended_df.to_csv('../data/gou_recommended.csv', index=False)