# Getting ratings for a hand-picked list of brands

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
# import string

# Scrape Good On You's directory page for each hand-picked brand and collect
# the overall rating, the planet/people/animals subratings and the description.
all_brand_info = []
cols = ['brand', 'overall_rating', 'planet_score', 'people_score', 'animals_score', 'description']

url = 'https://directory.goodonyou.eco/brand/'

brands = [
    'Princess Polly',
    'Brandy Melville',
    'Shein',
    'Nike',
    'Abercrombie & Fitch',
    'Amazon',
    'ASOS',
    'Forever 21',
    'American Eagle'
]

for brand in brands:
    # convert brand name to url format
    brand_converted = brand.lower().replace('&', 'and')  # lowercase, '&' -> 'and'
    brand_converted = '-'.join(brand_converted.split())  # whitespace -> single dashes
    # strip any remaining punctuation such as apostrophes (e.g., "Levi's" -> "levis")
    # while keeping the dashes added above
    brand_converted = ''.join(ch for ch in brand_converted if ch.isalnum() or ch == '-')

    # get data on each brand's page; the timeout stops a stalled connection from
    # hanging the notebook, and raise_for_status() avoids parsing an error page
    response = requests.get(url + brand_converted, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # note: when finding classes on GOU's website, you might need to disable JavaScript (since BeautifulSoup can't load dynamic content)

    # get overall rating (the heading text is split on ': ' and the part after it is kept)
    overall_heading = soup.find('h6', class_='StyledHeading-sc-1rdh4aw-0 jNSEQB id__OverallRating-sc-12z6g46-7 cjSjNJ')
    if overall_heading is None:
        raise ValueError(f"Overall rating not found for {brand!r}; the page's class names may have changed")
    overall_rating = overall_heading.text.split(': ')[1]

    # get subratings for planet, people, and animals
    rating_divs = soup.find_all('div', class_='id__RatingSingle-sc-12z6g46-9 ksJKxw')
    if len(rating_divs) < 3:
        raise ValueError(f"Expected 3 subratings for {brand!r}, found {len(rating_divs)}")

    subratings = []
    for rating_div, category in zip(rating_divs, ('Planet', 'People', 'Animals')):
        # remove category name from text
        rating = rating_div.text.split(category)[1]
        # if there's a rating, convert to int (makes it easier to analyze later);
        # 'Not applicable' is kept as text
        if rating != 'Not applicable':
            rating = int(rating.split(' ')[0])
        subratings.append(rating)

    # get description/justification
    description_div = soup.find('div', class_='id__BodyText-sc-12z6g46-15 eUqrmK')
    if description_div is None:
        raise ValueError(f"Description not found for {brand!r}; the page's class names may have changed")
    text = description_div.text

    # add the current brand's row (same order as `cols`) to the overall list
    brand_info = [brand, overall_rating, subratings[0], subratings[1], subratings[2], text]
    all_brand_info.append(brand_info)

brand_df = pd.DataFrame(all_brand_info, columns=cols)

# replace Good On You's categories with numerical ratings
# source: https://saturncloud.io/blog/how-to-convert-categorical-data-to-numerical-data-with-pandas
rating_scale = {
    'We avoid': 1,
    'Not good enough': 2,
    "It's a start": 3,
    'Good': 4,
    'Great': 5
}
# map() with a fallback leaves unknown labels untouched, as replace() did, without
# relying on the implicit downcasting in replace() that newer pandas deprecates
brand_df['overall_rating'] = brand_df['overall_rating'].map(
    lambda label: rating_scale.get(label, label)
)

brand_df

Unnamed: 0,brand,overall_rating,planet_score,people_score,animals_score,description
0,Princess Polly,2,2,2,4,Our “Planet” rating evaluates brands based on ...
1,Brandy Melville,1,1,1,0,This brand provides insufficient relevant info...
2,Shein,1,1,1,2,Our “Planet” rating evaluates brands based on ...
3,Nike,3,3,3,2,Our “Planet” rating evaluates brands based on ...
4,Abercrombie & Fitch,2,2,2,2,Abercrombie & Fitch is owned by Abercrombie Ab...
5,Amazon,2,2,2,2,Amazon's rating applies to the Amazon house ap...
6,ASOS,2,2,2,2,Our “Planet” rating evaluates brands based on ...
7,Forever 21,1,1,1,2,Forever 21 is owned by Authentic Brands Group....
8,American Eagle,2,2,2,2,American Eagle is owned by AEO Inc.Our “Planet...


In [2]:
# write the scraped brand ratings to a CSV file in the data folder
brand_df.to_csv(path_or_buf='../data/brand_info.csv')

# Getting the brands listed on the "tops" category page of Good On You's (GOU) website and creating a dataset of their ratings

In [3]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

# Collect (brand name, brand page link) pairs from the "tops" category page.
brands = []

# Chrome options
# NOTE: no headless flag is set, so a visible browser window opens; to run
# headless, add e.g. chrome_options.add_argument('--headless=new')
chrome_options = Options()

# initialize WebDriver
driver = webdriver.Chrome(options=chrome_options)

# try/finally guarantees the browser is closed even if scraping fails part-way
# (e.g., a WebDriverWait timeout), so no Chrome process is left running
try:
    # open webpage
    driver.get('https://directory.goodonyou.eco/categories/tops')

    # get initial height of the page
    last_height = driver.execute_script("return document.body.scrollHeight")

    while True:
        # scroll
        # source: https://stackoverflow.com/questions/73792388/how-to-scroll-to-the-bottom-of-the-page-with-selenium-python
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # wait for 2 seconds so newly loaded results can extend the page
        time.sleep(2)

        # get new height of page
        new_height = driver.execute_script("return document.body.scrollHeight")

        # stop once the height no longer grows (i.e., the bottom was reached)
        if new_height == last_height:
            break

        # update height
        last_height = new_height

    # 100 results are now shown on the page
    # (the loop relies on that count: a missing result makes the wait below time out)
    for i in range(1, 101):
        # get xpath for each brand
        brand_xpath = f'//*[@id="__next"]/div/div[4]/div/div[2]/div/div/div[{i}]/div/div/div[2]/h5/a'

        # get brand element (waits up to 5 seconds for it to become visible)
        brand_element = WebDriverWait(driver, 5).until(
            EC.visibility_of_element_located((By.XPATH, brand_xpath))
        )

        # get brand name and link
        # source for link: https://stackoverflow.com/questions/54862426/python-selenium-get-href-value
        brand_name = brand_element.text
        brand_link = brand_element.get_attribute('href')

        brands.append((brand_name, brand_link))
finally:
    # close the WebDriver
    driver.quit()

In [4]:
# Scrape the rating details for every (name, link) pair collected in `brands`.
# NOTE: the parsing below mirrors the loop used for the hand-picked brands at the
# top of the notebook; consider extracting a shared function if either changes.
recommended = []

for brand in brands:
    brand_name, brand_href = brand

    # get data on each brand's page; the timeout stops a stalled connection from
    # hanging the notebook, and raise_for_status() avoids parsing an error page
    response = requests.get(brand_href, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'html.parser')

    # get overall rating (the heading text is split on ': ' and the part after it is kept)
    overall_heading = soup.find('h6', class_='StyledHeading-sc-1rdh4aw-0 jNSEQB id__OverallRating-sc-12z6g46-7 cjSjNJ')
    if overall_heading is None:
        raise ValueError(f"Overall rating not found for {brand_name!r}; the page's class names may have changed")
    overall_rating = overall_heading.text.split(': ')[1]

    # get subratings for planet, people, and animals
    rating_divs = soup.find_all('div', class_='id__RatingSingle-sc-12z6g46-9 ksJKxw')
    if len(rating_divs) < 3:
        raise ValueError(f"Expected 3 subratings for {brand_name!r}, found {len(rating_divs)}")

    subratings = []
    for rating_div, category in zip(rating_divs, ('Planet', 'People', 'Animals')):
        # remove category name from text
        rating = rating_div.text.split(category)[1]
        # if there's a rating, convert to int (makes it easier to analyze later);
        # 'Not applicable' is kept as text
        if rating != 'Not applicable':
            rating = int(rating.split(' ')[0])
        subratings.append(rating)

    # get description/justification
    description_div = soup.find('div', class_='id__BodyText-sc-12z6g46-15 eUqrmK')
    if description_div is None:
        raise ValueError(f"Description not found for {brand_name!r}; the page's class names may have changed")
    text = description_div.text

    # add the current brand's row (same order as `cols`) to the overall list
    brand_info = [brand_name, overall_rating, subratings[0], subratings[1], subratings[2], text]
    recommended.append(brand_info)

recommended_df = pd.DataFrame(recommended, columns=cols)

# replace Good On You's categories with numerical ratings
# source: https://saturncloud.io/blog/how-to-convert-categorical-data-to-numerical-data-with-pandas
rating_scale = {
    'We avoid': 1,
    'Not good enough': 2,
    "It's a start": 3,
    'Good': 4,
    'Great': 5
}
# map() with a fallback leaves unknown labels untouched, as replace() did, without
# relying on the implicit downcasting in replace() that newer pandas deprecates
recommended_df['overall_rating'] = recommended_df['overall_rating'].map(
    lambda label: rating_scale.get(label, label)
)

recommended_df

Unnamed: 0,brand,overall_rating,planet_score,people_score,animals_score,description
0,Yes Friends,5,5,5,5,Yes Friends' environment rating is 'great'. It...
1,Monsoon Blooms,5,5,5,5,Monsoon Blooms's environment rating is 'great'...
2,Outland Denim,5,5,5,5,Our “Planet” rating evaluates brands based on ...
3,STANLEY/STELLA,5,5,4,5,Our “Planet” rating evaluates brands based on ...
4,ÉTICA,5,5,5,5,ÉTICA's environment rating is 'great'. It uses...
...,...,...,...,...,...,...
95,Enfant Terrible,4,5,4,4,Enfant Terrible's environment rating is 'great...
96,Pareto,4,5,3,5,Our “Planet” rating evaluates brands based on ...
97,Sotela,4,5,4,4,Our “Planet” rating evaluates brands based on ...
98,Dear Denier,4,5,4,4,Our “Planet” rating evaluates brands based on ...


In [5]:
# write the scraped ratings of the listed brands to a CSV file in the data folder
recommended_df.to_csv(path_or_buf='../data/gou_recommended.csv')