## 🔎 Scraping Decathlon Product Reviews

### I. Importing libraries

In [1]:
from bs4 import BeautifulSoup as bs
from tqdm import tqdm
import pandas as pd
import requests



Function that extracts the reviews listed on one page of a product's reviews URL

In [49]:
def extract_reviews(url):
    """
    Scrape one page of product reviews and return it as a DataFrame.

    The page at ``url`` is downloaded with ``requests`` and parsed with
    BeautifulSoup's ``html.parser``. Names, dates, titles and descriptions
    are collected page-wide by tag and CSS class; the score is read from the
    ``title`` attribute of the ``span.rating`` found inside each
    ``div.oyreviews-content-item`` block.

    Parameters
    ----------
    url : str
        Address of a single reviews page.

    Returns
    -------
    pandas.DataFrame
        One row per review with the columns ``cust_name``, ``review_date``,
        ``review_title``, ``review_description``, ``range_used`` and
        ``review_score``. Missing ``range_used`` / ``review_score`` values
        are stored as the literal string ``"NaN"``. A page without any
        review yields an empty frame that still has the six columns.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    ValueError
        Raised by pandas when the numbers of names, dates, titles,
        descriptions and review blocks found on the page differ.
    """
    # Literal string placeholder (not a float NaN), kept for compatibility
    # with the files already produced by this function.
    missing = "NaN"

    # Without a timeout a stalled connection would block the cell forever.
    page = requests.get(url, timeout=30)
    page.raise_for_status()

    soup = bs(page.content, 'html.parser')

    # The reviewer name is intentionally left unstripped, as before.
    cust_name = _tag_texts(soup, 'span', 'reviewer', strip=False)
    review_date = _tag_texts(soup, 'time', 'dtreviewed')
    review_title = _tag_texts(soup, 'span', 'fn')
    review_description = _tag_texts(soup, 'blockquote', 'description')

    review_items = soup.find_all('div', class_='oyreviews-content-item')

    # A review block without a rating span (or without a title attribute)
    # gets the placeholder instead of raising IndexError/KeyError.
    review_score = []
    for item in review_items:
        rating = item.find('span', class_='rating')
        review_score.append(missing if rating is None else rating.get('title', missing))

    range_used = _range_used_per_review(soup, review_items, missing)

    df = pd.DataFrame({
        'cust_name': cust_name,
        'review_date': review_date,
        'review_title': review_title,
        'review_description': review_description,
        'range_used': range_used,
        'review_score': review_score
    })

    return df


def _tag_texts(soup, tag, css_class, strip=True):
    """Return the text of every ``tag`` carrying ``css_class``, in document order."""
    texts = [node.get_text() for node in soup.find_all(tag, class_=css_class)]
    return [text.strip() for text in texts] if strip else texts


def _range_used_per_review(soup, review_items, missing):
    """Return one "used since" text per review block, filling gaps with ``missing``."""
    page_level = _tag_texts(soup, 'span', 'range-used-since')
    per_item = [item.find('span', class_='range-used-since') for item in review_items]
    found_in_items = sum(span is not None for span in per_item)

    if found_in_items == len(page_level):
        # Every span on the page belongs to exactly one review block, so the
        # placeholder can be put on the very review that lacks the value.
        return [missing if span is None else span.get_text().strip() for span in per_item]

    # The spans are not nested in the review blocks: keep the page order and
    # pad at the end up to the number of reviews actually found (the page may
    # hold fewer than a full page of reviews, or none at all).
    return page_level + [missing] * (len(review_items) - len(page_level))

### 1. 🏋🏻 KIT HALTÈRES DE MUSCULATION 20KG (CORENGTH)

In [20]:
# Product : KIT HALTÈRES DE MUSCULATION 20KG (Ref. 8018574)
HALTERES_MODEL_ID = 8018574
# Number of review pages requested for this product
HALTERES_N_PAGES = 30

# One DataFrame per scraped page; they are concatenated in a later cell
halteres_dfs = []

for page in tqdm(range(1, HALTERES_N_PAGES + 1)):
    url = f"https://www.decathlon.ma/modules/openvoice/openvoice-render.php?id_model={HALTERES_MODEL_ID}&reviews_page={page}"
    # Append directly instead of binding the page to `df_halteres`: that name
    # is used for the full concatenated dataset, and with out-of-order
    # execution it would otherwise silently hold only the last page.
    halteres_dfs.append(extract_reviews(url))

100%|██████████| 30/30 [00:11<00:00,  2.51it/s]


In [7]:
# Stack the per-page frames into one table with a fresh 0..n-1 index
df_halteres = pd.concat(
    objs=halteres_dfs,
    axis=0,
    ignore_index=True,
)
df_halteres.shape

(300, 6)

In [11]:
# Preview the first five scraped reviews
df_halteres.head(n=5)

Unnamed: 0,cust_name,review_date,review_title,review_description,range_used,review_score
0,Ouioui,20/02/2023,Tres bon article,Merci Decathlon,Utilise ce produit depuis 2 à 8 semaines,5
1,Khalid,04/01/2023,Le prix,399dh c'est le juste prix. Pas moyen que quelq...,Utilise ce produit depuis 2 à 8 semaines,3
2,Mohamed,12/12/2022,beatiful product,very useful product with good quality i advice...,Utilise ce produit depuis 1 semaine ou moins,5
3,Ouzzine,08/12/2022,رائعة,ثمن مناسب جدا,Utilise ce produit depuis 2 à 8 semaines,5
4,Hatim,01/12/2022,Bien,Bien,Utilise ce produit depuis 1 semaine ou moins,5


In [9]:
# Folder that receives the raw scraped reviews
data_path = '../data/reviews/raw/'

# Write the reviews without the positional index column
df_halteres.to_csv(
    path_or_buf=data_path + 'kit-halteres-reviews.csv',
    index=False,
)

### 2. 👟 SANDALES DE RANDONNÉE - NH100 - HOMME (QUECHUA)

In [50]:
# Product : SANDALES DE RANDONNÉE - NH100 - HOMME (Ref. 8664487)
SANDALES_MODEL_ID = 8664487
# Number of review pages requested for this product
SANDALES_N_PAGES = 30

# One DataFrame per scraped page; they are concatenated in a later cell
sandales_dfs = []

for page in tqdm(range(1, SANDALES_N_PAGES + 1)):
    url = f"https://www.decathlon.ma/modules/openvoice/openvoice-render.php?id_model={SANDALES_MODEL_ID}&reviews_page={page}"
    # Append directly instead of binding the page to `df_sandales`: that name
    # is used for the full concatenated dataset, and with out-of-order
    # execution it would otherwise silently hold only the last page.
    sandales_dfs.append(extract_reviews(url))

100%|██████████| 30/30 [00:11<00:00,  2.55it/s]


In [51]:
# Stack the per-page frames into one table with a fresh 0..n-1 index
df_sandales = pd.concat(
    objs=sandales_dfs,
    axis=0,
    ignore_index=True,
)
df_sandales.shape

(300, 6)

In [52]:
# Preview the first five scraped reviews
df_sandales.head(n=5)

Unnamed: 0,cust_name,review_date,review_title,review_description,range_used,review_score
0,Edriaa,30/04/2023,Bien,Bien,Utilise ce produit depuis 1 semaine ou moins,5
1,Badr belmaachi,11/02/2023,jdida,Salam,Utilise ce produit depuis Plus de 2 ans,5
2,Khalid,02/01/2023,42,1,Utilise ce produit depuis 3 à 6 mois,5
3,Youssef,08/10/2022,Comfortable,It is comfortable and practical,Utilise ce produit depuis 2 à 8 semaines,5
4,Youssef,02/10/2022,Sandales,C'est bien,Utilise ce produit depuis 2 à 8 semaines,4


In [53]:
# Folder that receives the raw scraped reviews
data_path = '../data/reviews/raw/'

# Write the reviews without the positional index column
df_sandales.to_csv(
    path_or_buf=data_path + 'sandales-randonnee-reviews.csv',
    index=False,
)