Web Scraping & Data Processing <br>
SAMBATH Sïndoumady <br>
DIA2

#### <center><font  align="center"> Project : Sustainable Nike Sneaker Marketplace 👟</font></center>
# <center><font  align="center"> Vestiaire Collective Scraping </font></center>


The following code presents the method for retrieving data from Vestiaire Collective, focusing on Nike brand shoes. The get_vestiaire_data function adapts to user preferences through parameters such as gender, colour and size, so that the search can be tailored to specific criteria. The function also retrieves each seller's location and calculates the approximate carbon footprint of shipping the item between that location and Paris.

In [9]:
from geopy.geocoders import Nominatim
from geopy.distance import geodesic
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, StaleElementReferenceException
import pandas as pd

# Initialization of the geocoder: one module-level Nominatim client that
# calculate_approximate_carbon_footprint reuses for every location lookup.
geolocator = Nominatim(user_agent="vestiaire_collective_carbon_footprint")

#Calculation of the carbon footprint of the journey from Paris to the seller's location
def calculate_approximate_carbon_footprint(location):
    """Estimate the carbon footprint of the journey between `location` and Paris.

    The location is geocoded with the module-level `geolocator`, the geodesic
    distance to Paris is computed in kilometres and multiplied by a constant
    emission rate.

    Args:
        location: Free-text place name (for example the seller location
            scraped from a product card).

    Returns:
        The estimated footprint as a float, or None when `location` is empty,
        cannot be geocoded, or the geocoding call fails.
    """
    # Imported locally: the file only does `from geopy... import ...`, so the
    # bare name `geopy` is never bound and an except clause spelled
    # `geopy.exc.GeocoderTimedOut` raises NameError as soon as any error occurs.
    # GeocoderServiceError is the parent class of GeocoderTimedOut.
    from geopy.exc import GeocoderServiceError

    if not location:
        return None

    try:
        location_info = geolocator.geocode(location, timeout=10)
        if not location_info:
            # The geocoder found no match for this place name.
            return None

        location_coordinates = (location_info.latitude, location_info.longitude)
        paris_coordinates = (48.8566, 2.3522)
        distance_km = geodesic(location_coordinates, paris_coordinates).kilometers
        # presumably kg CO2 per km, going by the DataFrame column names — TODO confirm
        emission_rate_per_km = 0.2
        return distance_km * emission_rate_per_km

    except (ValueError, TypeError, GeocoderServiceError):
        return None

#Building of the site url according to the input query criteria
# Each filter value has an identifier specific to the Vestiaire Collective site
# ("%23" is the URL-encoded "#" that separates the label from its identifier).
_VESTIAIRE_GENDER_FILTERS = {"femme": "Femme%231", "homme": "Homme%232", "enfant": "Enfant%233"}

_VESTIAIRE_COLOR_FILTERS = {
    "beige": "Beige%232", "blanc": "Blanc%231", "bleu": "Bleu%239", "bordeaux": "Bordeaux%23220",
    "camel": "Camel%23222", "gris": "Gris%2313", "marron": "Marron%238", "noir": "Noir%2314",
    "orange": "Orange%234", "rouge": "Rouge%236", "vert": "Vert%2311", "anthracite": "Anthracite%2353",
    "argenté": "Argenté%2315", "doré": "Doré%2316", "jaune": "Jaune%233", "kaki": "Kaki%2312",
    "marine": "Marine%23224", "multicolore": "Multicolore%2317", "rose": "Rose%235", "violet": "Violet%237",
    "turquoise": "Turquoise%23225", "métallisé": "Métallisé%23221",
}

_VESTIAIRE_SIZES_HOMME = {
    "37": "37%23262", "37.5": "37%2C5%23263", "38": "38%23261", "38.5": "38.5%23260", "39": "39%2346", "40": "40%2348",
    "41": "41%2350", "42": "42%2352", "43": "43%2354", "44": "44%2356", "45": "45%2358", "46": "46%2360", "47": "47%2362",
    "47.5": "47.5%23259", "48": "48%23258",
}

_VESTIAIRE_SIZES_FEMME = {
    "35": "35%2363", "36": "36%2331", "37": "37%2333", "38": "38%2335", "39": "39%2337",
    "40": "40%2339", "41": "41%2341", "42": "42%2343", "43": "43%2345",
}

_VESTIAIRE_SIZES_ENFANT = {
    "16": "16%2383", "17": "17%2365", "18": "18%2366", "19": "19%2367", "20": "20%2368", "21": "21%2369",
    "22": "22%2370", "23": "23%2371", "24": "24%2372", "25": "25%2373", "26": "26%2374", "27": "27%2375",
    # NOTE(review): the entry for "30" originally had no identifier; 78 is
    # inferred from the 29 -> 77 / 31 -> 79 sequence — TODO confirm on the site.
    "28": "28%2376", "29": "29%2377", "30": "30%2378", "31": "31%2379", "32": "32%2380", "33": "33%2381",
    "34": "34%2382", "35": "35%23232", "36": "36%23233", "37": "37%23234", "38": "38%23235",
}

# gender -> (name of the size filter in the URL, size table for that gender)
_VESTIAIRE_SIZE_FILTERS = {
    "femme": ("size3", _VESTIAIRE_SIZES_FEMME),
    "homme": ("size4", _VESTIAIRE_SIZES_HOMME),
    "enfant": ("size6", _VESTIAIRE_SIZES_ENFANT),
}


def construct_vestiaire_url(query, gender=None, color=None, size=None):
    """Build the Vestiaire Collective search URL for a query and optional filters.

    Filters are appended as a URL fragment of the form
    ``#gender=..._color=..._size3=...``. A filter whose value is not in the
    lookup tables is left out instead of being emitted with an empty value.

    Args:
        query: Free-text search terms; URL-encoded before use.
        gender: "femme", "homme" or "enfant" (case-insensitive), or None.
        color: French colour name (case-insensitive), or None.
        size: Shoe size as a string or number (e.g. "40", 40, "37.5"). It is
            only applied when a known gender is given, because the size filter
            name and identifiers depend on the gender.

    Returns:
        The search URL as a string.
    """
    from urllib.parse import quote_plus

    base_url = "https://fr.vestiairecollective.com/search/?q="

    # Normalise once so that every lookup below agrees on the same key
    # (previously "Femme" matched the gender filter but not the size filter).
    gender_key = gender.strip().lower() if gender else None
    color_key = color.strip().lower() if color else None

    filters = []

    gender_value = _VESTIAIRE_GENDER_FILTERS.get(gender_key)
    if gender_value:
        filters.append(f"gender={gender_value}")

    color_value = _VESTIAIRE_COLOR_FILTERS.get(color_key)
    if color_value:
        filters.append(f"color={color_value}")

    if size is not None and gender_key in _VESTIAIRE_SIZE_FILTERS:
        size_param, size_table = _VESTIAIRE_SIZE_FILTERS[gender_key]
        # str() lets callers pass 40 as well as "40".
        size_value = size_table.get(str(size).strip())
        if size_value:
            filters.append(f"{size_param}={size_value}")

    query_url = f"{base_url}{quote_plus(query)}"
    if filters:
        # The fragment always starts with "#", even when no gender is given.
        query_url += "#" + "_".join(filters)
    return query_url

#Scraping Vestiaire Collective to retrieve data
def _extract_product_row(container):
    """Read one product card and return its fields as a dict.

    Returns None when the card has no title element or no product link, so the
    caller can skip it. Any other missing element raises
    NoSuchElementException, which the caller reports and skips.
    """
    try:
        title_element = container.find_element(By.XPATH, './/h2[@data-cy="productCard__productLink"]')
    except NoSuchElementException:
        return None

    brand_element = title_element.find_element(By.XPATH, './/span[@data-cy="productCard__text__brand"]')
    name_element = title_element.find_element(By.XPATH, './/span[@data-cy="productCard__text__name"]')
    size_element = title_element.find_element(By.XPATH, './/p[@data-cy="productCard__text__size"]')
    price_element = container.find_element(By.XPATH, './/span[@data-cy="productCard__text__price__discount"]')
    location_element = container.find_element(By.XPATH, './/div[@data-cy="productCard__text__location"]')

    try:
        link_element = container.find_element(By.XPATH, './/a[@class="product-card_productCard__image__40WNk"]')
        href = link_element.get_attribute('href')
    except NoSuchElementException:
        return None

    img_element = container.find_element(By.XPATH, './/img[@class="vc-images_image__TfKYE"]')

    brand = brand_element.text.strip()
    return {
        "Title": f"{brand} {name_element.text.strip()}",
        "Price": price_element.text.strip(),
        "Brand": brand,
        "Size": size_element.text.strip(),
        "Link": href,
        "Image Source": img_element.get_attribute('src'),
        "Localisation": location_element.text.strip(),
    }


def get_vestiaire_data(query, gender=None, color=None, size=None):
    """Scrape Vestiaire Collective search results into a DataFrame.

    Opens the search URL built by `construct_vestiaire_url` in Firefox,
    dismisses the cookie banner when present, then walks through every result
    page and collects one row per product card. The browser is always closed
    before returning, including when an error interrupts the scrape.

    Args:
        query: Free-text search terms.
        gender: Optional gender filter passed to `construct_vestiaire_url`.
        color: Optional colour filter passed to `construct_vestiaire_url`.
        size: Optional size filter passed to `construct_vestiaire_url`.

    Returns:
        A pandas DataFrame with the columns Title, Price, Brand, Size, Link,
        Image Source, Localisation and Approximate Carbon Footprint. It holds
        whatever was collected if a page fails to load part-way through.
    """
    from selenium.common.exceptions import TimeoutException

    vestiaire_url = construct_vestiaire_url(query, gender=gender, color=color, size=size)

    # Use Firefox Driver
    # NOTE(review): `executable_path` is deprecated in Selenium 4 in favour of
    # a Service object — confirm the installed Selenium version before changing.
    firefox_path ="C:/tools/geckodriver.exe"
    driver = webdriver.Firefox(executable_path=firefox_path)

    data = {"Title": [], "Price": [], "Brand": [], "Size": [], "Link": [], "Image Source": [], "Localisation": [], "Approximate Carbon Footprint": []}
    catalog_locator = (By.CLASS_NAME, 'product-search_catalog__flexContainer__Dg0eL')
    # Many items share the same location: geocode each distinct one only once.
    footprint_by_location = {}

    try:
        driver.get(vestiaire_url)

        # Cookies handling (best effort: the banner may not be shown)
        try:
            cookie_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.ID, 'popin_tc_privacy_button_2'))
            )
            cookie_button.click()
        except Exception:
            print("Cookie button not found or timed out")

        # Start to scrape articles, one iteration per result page
        while True:
            # Scroll to the bottom to see the entire page
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            try:
                WebDriverWait(driver, 10).until(
                    EC.presence_of_all_elements_located(catalog_locator)
                )
                product_containers = driver.find_element(*catalog_locator)
                li_elements = product_containers.find_elements(By.TAG_NAME, 'li')
            except (TimeoutException, NoSuchElementException, StaleElementReferenceException):
                # Stop here: retrying the same page would loop forever.
                print("Product containers not found on the current page.")
                break

            # Iterate on each article
            for container in li_elements:
                try:
                    row = _extract_product_row(container)
                    if row is None:
                        continue

                    localisation = row["Localisation"]
                    if localisation not in footprint_by_location:
                        footprint_by_location[localisation] = calculate_approximate_carbon_footprint(localisation)
                    row["Approximate Carbon Footprint"] = footprint_by_location[localisation]

                    # Append all fields together so the columns stay aligned.
                    for column, value in row.items():
                        data[column].append(value)

                except StaleElementReferenceException:
                    print("Stale Element Reference Exception - Skipping container.")
                    continue
                except ValueError as ve:
                    print(f"ValueError: {ve}")
                    continue
                except Exception as e:
                    print(f"An unexpected error occurred: {e}")
                    continue

            # Go to the next page; the scroll and wait at the top of the loop
            # then handle the newly loaded page.
            try:
                next_button = driver.find_element(By.XPATH, "//button[@data-cy='pagination-right-arrow-btn']")
                if next_button.get_attribute("disabled"):
                    break  # 'next' button is disabled (last page)
                next_button.click()
            except NoSuchElementException:
                break  # 'next' button is not found (no more pages)
    finally:
        # Always release the browser, even when the scrape is interrupted.
        driver.quit()

    return pd.DataFrame(data)

#Data pre-processing
def _parse_lowest_price(raw_price):
    """Return the lowest amount found in a price label, or NaN if there is none.

    A label may hold several amounts separated by the euro sign (for example
    "120,00 €96,60 €"). Whitespace inside an amount (thousands separators,
    non-breaking spaces) is removed and the decimal comma becomes a dot.
    """
    amounts = []
    for chunk in str(raw_price).split('€'):
        cleaned = "".join(chunk.split()).replace(',', '.')
        if not cleaned:
            continue
        try:
            amounts.append(float(cleaned))
        except ValueError:
            # Not a number (e.g. stray text): ignore this chunk.
            continue
    return min(amounts) if amounts else float("nan")


def process_data_vestiaire(vestiaire_data):
    """Clean the scraped Price and Size columns in place.

    Args:
        vestiaire_data: DataFrame with a "Price" column of price labels and a
            "Size" column of strings containing "Taille : <number>".

    Returns:
        The same DataFrame, with "Price" converted to a float (the lowest
        amount of each label, NaN when nothing can be parsed) and "Size"
        reduced to the numeric part as a string (NaN when the pattern is
        absent).
    """
    vestiaire_data["Price"] = vestiaire_data["Price"].apply(_parse_lowest_price)
    # expand=False returns a Series, which is what a single column needs.
    vestiaire_data["Size"] = vestiaire_data["Size"].str.extract(r'Taille : (\d+\.?\d*)', expand=False)
    return vestiaire_data
   
#Main
# Search criteria for the example run.
query, gender, color, size = "nike air force 1", "femme", "blanc", "40"

# Scrape the listings, then clean the Price and Size columns.
vestiaire_data = process_data_vestiaire(
    get_vestiaire_data(query, gender=gender, color=color, size=size)
)
vestiaire_data

Unnamed: 0,Title,Price,Brand,Size,Link,Image Source,Localisation,Approximate Carbon Footprint
0,NIKE Air Force 1 Baskets,45.15,NIKE,40,https://fr.vestiairecollective.com/chaussures-...,https://images.vestiairecollective.com/images/...,Italie,211.591248
1,NIKE Air Force 1 Baskets,45.15,NIKE,40,https://fr.vestiairecollective.com/chaussures-...,https://images.vestiairecollective.com/cdn-cgi...,France,50.586158
2,NIKE Air Force 1 en cuir Baskets,96.60,NIKE,40,https://fr.vestiairecollective.com/chaussures-...,https://images.vestiairecollective.com/images/...,Italie,211.591248
3,NIKE Air Force 1 Baskets,47.25,NIKE,40,https://fr.vestiairecollective.com/chaussures-...,https://images.vestiairecollective.com/cdn-cgi...,Monaco,138.114190
4,NIKE Air Force 1 en cuir Baskets,93.44,NIKE,40,https://fr.vestiairecollective.com/chaussures-...,https://images.vestiairecollective.com/cdn-cgi...,Pologne,1048.754100
...,...,...,...,...,...,...,...,...
64,NIKE En cuir Baskets,94.50,NIKE,40.5,https://fr.vestiairecollective.com/chaussures-...,https://images.vestiairecollective.com/cdn-cgi...,Portugal,263.698433
65,NIKE Air Force 1 en cuir Baskets,107.25,NIKE,40,https://fr.vestiairecollective.com/chaussures-...,https://images.vestiairecollective.com/cdn-cgi...,Suisse,99.193266
66,NIKE Baskets Air Force 1 en cuir,125.86,NIKE,7,https://fr.vestiairecollective.com/chaussures-...,https://images.vestiairecollective.com/cdn-cgi...,Royaume-Uni,151.407889
67,NIKE Baskets Air Force 1 en cuir,104.07,NIKE,8.5,https://fr.vestiairecollective.com/chaussures-...,https://images.vestiairecollective.com/cdn-cgi...,Etats-Unis,1522.772797


In the analysis, a constant emission rate per kilometer was assigned, and the carbon footprint for each item's journey to Paris was calculated. Additionally, a fixed value for Nike's carbon footprint from ordering on their website was introduced, set at 405 kg CO2 per ton (explained in our report).
<br>
The "Carbon Profit" column in the DataFrame reflects the environmental benefit attained by ordering through Vestiaire Collective. It is the reduction in carbon footprint compared to ordering directly from Nike's website.

In [10]:
# Adding columns to the DataFrame for the approximate carbon footprint and Nike's fixed value.
# The footprint was already computed for every item while scraping, so it is
# reused here instead of geocoding each location a second time.
vestiaire_data["Approximate Carbon Footprint (kg CO2)"] = vestiaire_data["Approximate Carbon Footprint"]
vestiaire_data["Approximative Carbon Print from Ordering on Nike (kg CO2/ton)"] = 405

# Adding a column for Carbon Profit: Nike's fixed value minus the item's footprint.
# NOTE(review): the column names give different units (kg CO2/ton vs kg CO2) —
# confirm the two values are meant to be compared directly.
vestiaire_data["Carbon Profit (kg CO2)"] = vestiaire_data["Approximative Carbon Print from Ordering on Nike (kg CO2/ton)"] - vestiaire_data["Approximate Carbon Footprint (kg CO2)"]

# Display the enriched DataFrame
vestiaire_data

Unnamed: 0,Title,Price,Brand,Size,Link,Image Source,Localisation,Approximate Carbon Footprint,Approximate Carbon Footprint (kg CO2),Approximative Carbon Print from Ordering on Nike (kg CO2/ton),Carbon Profit (kg CO2)
0,NIKE Air Force 1 Baskets,45.15,NIKE,40,https://fr.vestiairecollective.com/chaussures-...,https://images.vestiairecollective.com/images/...,Italie,211.591248,211.591248,405,193.408752
1,NIKE Air Force 1 Baskets,45.15,NIKE,40,https://fr.vestiairecollective.com/chaussures-...,https://images.vestiairecollective.com/cdn-cgi...,France,50.586158,50.586158,405,354.413842
2,NIKE Air Force 1 en cuir Baskets,96.60,NIKE,40,https://fr.vestiairecollective.com/chaussures-...,https://images.vestiairecollective.com/images/...,Italie,211.591248,211.591248,405,193.408752
3,NIKE Air Force 1 Baskets,47.25,NIKE,40,https://fr.vestiairecollective.com/chaussures-...,https://images.vestiairecollective.com/cdn-cgi...,Monaco,138.114190,138.114190,405,266.885810
4,NIKE Air Force 1 en cuir Baskets,93.44,NIKE,40,https://fr.vestiairecollective.com/chaussures-...,https://images.vestiairecollective.com/cdn-cgi...,Pologne,1048.754100,1048.754100,405,-643.754100
...,...,...,...,...,...,...,...,...,...,...,...
64,NIKE En cuir Baskets,94.50,NIKE,40.5,https://fr.vestiairecollective.com/chaussures-...,https://images.vestiairecollective.com/cdn-cgi...,Portugal,263.698433,263.698433,405,141.301567
65,NIKE Air Force 1 en cuir Baskets,107.25,NIKE,40,https://fr.vestiairecollective.com/chaussures-...,https://images.vestiairecollective.com/cdn-cgi...,Suisse,99.193266,99.193266,405,305.806734
66,NIKE Baskets Air Force 1 en cuir,125.86,NIKE,7,https://fr.vestiairecollective.com/chaussures-...,https://images.vestiairecollective.com/cdn-cgi...,Royaume-Uni,151.407889,151.407889,405,253.592111
67,NIKE Baskets Air Force 1 en cuir,104.07,NIKE,8.5,https://fr.vestiairecollective.com/chaussures-...,https://images.vestiairecollective.com/cdn-cgi...,Etats-Unis,1522.772797,1522.772797,405,-1117.772797


To assess potential cost savings, a process was initiated to scrape all available prices for the specified query on the Nike website. This allows for the calculation of the average price of the desired item by aggregating the individual prices. 

In [13]:
from selenium import webdriver

from selenium.webdriver.common.by import By

def accept_cookies(driver):
    """Click the cookie-consent button if it is on the page; otherwise do nothing.

    Args:
        driver: Selenium WebDriver positioned on the page showing the banner.
    """
    # Imported locally so this cell does not rely on imports made elsewhere.
    from selenium.common.exceptions import WebDriverException

    try:
        # Accept cookies
        accept_cookies_button = driver.find_element(By.CLASS_NAME, 'btn-primary-dark')
        accept_cookies_button.click()
    except WebDriverException:
        # Deliberate best effort: the banner may be absent or not clickable.
        # Only Selenium errors are ignored, so unrelated bugs still surface.
        pass

def _parse_euro_price(text):
    """Convert a price label such as "119,99 €" to a float, or None if it is not a single amount."""
    cleaned = "".join(text.replace('€', '').split()).replace(',', '.')
    try:
        return float(cleaned)
    except ValueError:
        return None


def get_nike_prices(search_query):
    """Return the average price listed on nike.com/fr for a search query.

    Args:
        search_query: Free-text search terms; URL-encoded before use.

    Returns:
        The mean of all prices that could be parsed from the result page, or
        None when no price was found or an error occurred (a message is
        printed in both cases).
    """
    from urllib.parse import quote

    # Set up the Firefox driver
    firefox_path ="C:/tools/geckodriver.exe"
    driver = webdriver.Firefox(executable_path=firefox_path)

    try:
        # Go to nike.fr and accept cookies
        driver.get("https://www.nike.com/fr/")
        accept_cookies(driver)

        # Construct the URL with the search query (spaces become %20)
        url = f"https://www.nike.com/fr/w?q={quote(search_query)}"

        # Navigate to the URL with the search query
        driver.get(url)

        # Wait for the prices to load (you may need to adjust the waiting time)
        driver.implicitly_wait(10)

        # Retrieve prices, skipping labels that are empty or not a single amount
        # so that one odd label does not discard the whole page.
        price_elements = driver.find_elements(By.CLASS_NAME, 'product-price')
        parsed = (_parse_euro_price(element.text) for element in price_elements)
        prices = [price for price in parsed if price is not None]

        if not prices:
            print("An error occurred:", "no price found for the query")
            return None

        # Calculate the average price
        return sum(prices) / len(prices)

    except Exception as e:
        print("An error occurred:", e)
        return None

    finally:
        # Close the browser window
        driver.quit()

# Average Nike price for the same query that was searched on Vestiaire Collective.
average_nike_price = get_nike_prices(search_query=query)
print(average_nike_price)

120.80857142857137


A mechanism is implemented to calculate the potential profit made by purchasing items on Vestiaire Collective compared to the average price of the same product on the Nike website. The function computes the profit made for each item. The resulting DataFrame, vestiaire_data, is then enriched with additional columns, providing insights into the potential financial benefit of opting for Vestiaire Collective over the average market price on Nike.

In [14]:
def calculate_profit_made(row, average_nike_price):
    """Return the saving of one listing against the average Nike price.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with a 'Price' entry.
        average_nike_price: Reference price, or None when it is unavailable.

    Returns:
        `average_nike_price` minus the item price as a float, or None when
        either value is missing or not numeric.
    """
    if average_nike_price is None:
        # get_nike_prices yields None when scraping failed: nothing to compare.
        return None

    try:
        item_price = float(row['Price'])
        return average_nike_price - item_price
    except (ValueError, TypeError):
        # ValueError: non-numeric text; TypeError: None or a non-numeric type.
        return None

# Order the listings from cheapest to most expensive (in place).
vestiaire_data.sort_values(by='Price', inplace=True)

# Attach the Nike reference price and each listing's saving against it.
vestiaire_data['Average price of the product on Nike'] = average_nike_price
vestiaire_data['Profit Made'] = vestiaire_data.apply(
    calculate_profit_made, axis=1, args=(average_nike_price,)
)

vestiaire_data

Unnamed: 0,Title,Price,Brand,Size,Link,Image Source,Localisation,Approximate Carbon Footprint,Approximate Carbon Footprint (kg CO2),Approximative Carbon Print from Ordering on Nike (kg CO2/ton),Carbon Profit (kg CO2),Average price of the product on Nike,Profit Made
31,NIKE Baskets Air Force 1 en cuir,22.05,NIKE,40,https://fr.vestiairecollective.com/chaussures-...,https://images.vestiairecollective.com/cdn-cgi...,Croatie,216.010977,216.010977,405,188.989023,120.808571,98.758571
22,NIKE Air Force 1 en cuir Baskets,26.25,NIKE,40,https://fr.vestiairecollective.com/chaussures-...,https://images.vestiairecollective.com/cdn-cgi...,Italie,211.591248,211.591248,405,193.408752,120.808571,94.558571
43,NIKE Baskets Air Force 1 en cuir,33.60,NIKE,38.5,https://fr.vestiairecollective.com/chaussures-...,https://images.vestiairecollective.com/cdn-cgi...,Italie,211.591248,211.591248,405,193.408752,120.808571,87.208571
61,NIKE Baskets Air Force 1 en toile,34.93,NIKE,40,https://fr.vestiairecollective.com/chaussures-...,https://images.vestiairecollective.com/cdn-cgi...,Royaume-Uni,151.407889,151.407889,405,253.592111,120.808571,85.878571
25,NIKE Air Force 1 en toile Baskets,36.79,NIKE,40,https://fr.vestiairecollective.com/chaussures-...,https://images.vestiairecollective.com/cdn-cgi...,Roumanie,341.737957,341.737957,405,63.262043,120.808571,84.018571
...,...,...,...,...,...,...,...,...,...,...,...,...,...
57,NIKE Baskets Air Force 1 en cuir,241.50,NIKE,40,https://fr.vestiairecollective.com/chaussures-...,https://images.vestiairecollective.com/cdn-cgi...,France,50.586158,50.586158,405,354.413842,120.808571,-120.691429
52,NIKE Baskets Air Force 1 en cuir,261.45,NIKE,40,https://fr.vestiairecollective.com/chaussures-...,https://images.vestiairecollective.com/cdn-cgi...,Pays-Bas,87.928759,87.928759,405,317.071241,120.808571,-140.641429
58,NIKE Baskets Air Force 1 en cuir,262.50,NIKE,40.5,https://fr.vestiairecollective.com/chaussures-...,https://images.vestiairecollective.com/cdn-cgi...,France,50.586158,50.586158,405,354.413842,120.808571,-141.691429
54,NIKE Baskets Air Force 1 en cuir,331.64,NIKE,40,https://fr.vestiairecollective.com/chaussures-...,https://images.vestiairecollective.com/cdn-cgi...,Suède,287.361879,287.361879,405,117.638121,120.808571,-210.831429
