In [83]:
# Importing libraries
import requests
from bs4 import BeautifulSoup
import plotly.express as px
import pandas as pd
import numpy as np
import re


In [84]:
# Download the page content

page = requests.get('https://aziza.tn/fr/home')
soup = BeautifulSoup(page.content, 'html.parser')

In [85]:
# Find all product items
products = soup.find_all('li', class_='product-item')

In [86]:
data = []

for product in products:
    # Extract Product Name (only process items that have a product name)
    name_tag = product.find('a', class_='product-item-link')
    if not name_tag:
        continue  # Skip non-product items
    product_name = name_tag.get_text(strip=True)
    
    # Category: Not available set default to "N/A"
    category = "N/A"


    # Extract Price:
    # The price number is split between text and a <span class="units"> element.
    price_wrapper = product.find('span', class_='ok price-wrapper')
    if price_wrapper:
        price_span = price_wrapper.find('span', class_='price')
        price_text = price_span.get_text(separator=' ', strip=True) if price_span else "N/A"
        # Extract currency (e.g., TND)
        currency_tag = price_wrapper.find('span', class_='currency')
        currency = currency_tag.get_text(strip=True) if currency_tag else ""
        price = f"{price_text} {currency}"
    else:
        price = "N/A"
    

    # Extract Availability:
    # Look for the button text, which in this case is "Voir disponibilité"
    avail_button = product.find('button', class_='tocart primary dispo')
    if avail_button:
        avail_link = avail_button.find('a')
        availability = avail_link.get_text(strip=True) if avail_link else "N/A"
    else:
        availability = "N/A"
    

    # Extract Promotional Details:
    # Check if there is a <div class="super"> indicating a promotion (e.g., an image badge)
    promo_div = product.find('div', class_='super')
    if promo_div:
        promo_img = promo_div.find('img')
        promo = promo_img['src'] if promo_img and promo_img.has_attr('src') else "N/A"
    else:
        promo = "N/A"
   

    
    # Append the data as a dictionary to the list
    data.append({
        "Product Name": product_name,
        "Category": category,
        "Price": price,
        "Availability": availability,
        "Promotional Details": promo
    })

In [87]:
# Convert the list of dictionaries to a DataFrame for further analysis or export
df_products = pd.DataFrame(data)
print(df_products.head())

              Product Name Category       Price        Availability  \
0  Tomate double concentré      N/A  3, 750 TND  Voir disponibilité   
1              Riz basmati      N/A  4, 780 TND  Voir disponibilité   
2         Boite de cookies      N/A  1, 100 TND  Voir disponibilité   
3        Lot 2 pots opaque      N/A  1, 150 TND  Voir disponibilité   
4       Tasse en plastique      N/A  1, 900 TND  Voir disponibilité   

                Promotional Details  
0  ../media/wysiwyg/superfriday.svg  
1  ../media/wysiwyg/superfriday.svg  
2                               N/A  
3                               N/A  
4                               N/A  


### Data Cleaning

In [88]:
# general data info
df_products.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 266 entries, 0 to 265
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Product Name         266 non-null    object
 1   Category             266 non-null    object
 2   Price                266 non-null    object
 3   Availability         266 non-null    object
 4   Promotional Details  266 non-null    object
dtypes: object(5)
memory usage: 10.5+ KB


In [89]:
# Identifying missing values
df_products.isna().sum()

Product Name           0
Category               0
Price                  0
Availability           0
Promotional Details    0
dtype: int64

In [90]:
# Replace ambiguous "N/A" strings with np.nan so they're recognized as missing values.
df_products.replace("N/A", np.nan, inplace=True)


Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`



In [91]:
# 2. Standardize text fields:
# Clean Product Names: remove extra whitespace and convert to title case.
df_products["Product Name"] = df_products["Product Name"].str.strip().str.title()

In [92]:
# If a Category column exists, standardize it similarly.
if "Category" in df_products.columns:
    df_products["Category"] = df_products["Category"].astype(str).str.strip().str.title()


In [None]:
# Clean Price Data:
# If Price column contains values like "3,750 TND". We'll extract the numeric part.
def clean_price(price_str):
    if pd.isnull(price_str):
        return np.nan

In [94]:
df_products.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 266 entries, 0 to 265
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Product Name         266 non-null    object
 1   Category             266 non-null    object
 2   Price                266 non-null    object
 3   Availability         100 non-null    object
 4   Promotional Details  6 non-null      object
dtypes: object(5)
memory usage: 10.5+ KB


### Data Transformation

Convert price data into numerical formats for analysis.

In [95]:
# Turning price data into float values
# Inspect unique values in the Price column to check for irregularities
print(df_products["Price"].unique())

['3, 750 TND' '4, 780 TND' '1, 100 TND' '1, 150 TND' '1, 900 TND'
 '2, 240 TND' '2, 790 TND' '2, 980 TND' '2, 990 TND' '3, 490 TND'
 '3, 880 TND' '3, 990 TND' '4, 900 TND' '4, 990 TND' '5, 000 TND'
 '5, 550 TND' '5, 900 TND' '5, 990 TND' '6, 750 TND' '6, 970 TND'
 '6, 990 TND' '7, 450 TND' '7, 980 TND' '7, 990 TND' '8, 690 TND'
 '8, 990 TND' '9, 990 TND' '11, 490 TND' '11, 780 TND' '11, 990 TND'
 '13, 990 TND' '16, 990 TND' '18, 990 TND' '19, 990 TND' '39, 990 TND'
 '49, 980 TND' '55, 900 TND' '0, 390 TND' '0, 490 TND' '0, 620 TND'
 '0, 650 TND' '0, 690 TND' '0, 710 TND' '0, 750 TND' '0, 780 TND'
 '0, 850 TND' '0, 890 TND' '1, 350 TND' '1, 360 TND' '1, 450 TND'
 '1, 490 TND' '1, 550 TND' '1, 690 TND' '1, 960 TND' '1, 990 TND'
 '2, 050 TND' '2, 260 TND' '2, 640 TND' '2, 690 TND' '2, 770 TND'
 '3, 390 TND' '3, 890 TND' '4, 150 TND' '4, 490 TND' '4, 590 TND'
 '4, 790 TND' '5, 730 TND' '5, 980 TND' '6, 490 TND' '6, 690 TND'
 '8, 590 TND' '9, 980 TND' '10, 420 TND' '10, 590 TND' '11, 650 TN

In [96]:
# Remove currency symbols and commas, then convert to float
df_products["Price"] = (
    df_products["Price"]
    .astype(str)  # Ensure all values are strings for processing
    .str.replace(r"[^\d.]", "", regex=True)  # Remove non-numeric characters except dot
    .astype(float)  # Convert to float
)

In [97]:
print(df_products["Price"].dtype)
print(df_products.head())

float64
              Product Name Category   Price        Availability  \
0  Tomate Double Concentré      Nan  3750.0  Voir disponibilité   
1              Riz Basmati      Nan  4780.0  Voir disponibilité   
2         Boite De Cookies      Nan  1100.0  Voir disponibilité   
3        Lot 2 Pots Opaque      Nan  1150.0  Voir disponibilité   
4       Tasse En Plastique      Nan  1900.0  Voir disponibilité   

                Promotional Details  
0  ../media/wysiwyg/superfriday.svg  
1  ../media/wysiwyg/superfriday.svg  
2                               NaN  
3                               NaN  
4                               NaN  


Categorize products into hierarchical groups (e.g., Electronics > Mobile Phones > Smartphones).

In [98]:
# Assigning categories
# Define a function to assign hierarchical categories based on product name keywords
def assign_category(product_name):
    name = product_name.lower()
    
    # Food / Grocery-related products
    if any(keyword in name for keyword in ["tomate", "riz", "yaourt", "biscuit", "cake", "salami", "thon", "jus", "boisson", "fromage", "pain"]):
        return "Food > Grocery"
    
    # Kitchenware / Tableware
    elif any(keyword in name for keyword in ["mug", "tasse", "verre", "bol", "carafe", "pichet", "boite de conservation", "boite de rangement"]):
        return "Kitchenware > Tableware"
    
    # Cookware and related items
    elif any(keyword in name for keyword in ["casserole", "poele", "plat à four", "planche à découper", "presse agrume", "hachoir"]):
        return "Cookware & Kitchen Tools"
    
    # Home appliances and organization
    elif any(keyword in name for keyword in ["lampe", "chargeur", "etagère", "organisateur", "poubelle", "coffret"]):
        return "Home & Appliances"
    
    # Personal care items
    elif any(keyword in name for keyword in ["shampoing", "gel douche", "dentifrice", "après shampoing"]):
        return "Personal Care"
    
    # Fall-back category for any products that don't match above
    else:
        return "Other"

# Apply the function to the Product Name column to populate the Category column
df_products["Category"] = df_products["Product Name"].apply(assign_category)

# Check the result by displaying a few rows
print(df_products[["Product Name", "Category"]].head(10))

                                   Product Name                 Category
0                       Tomate Double Concentré           Food > Grocery
1                                   Riz Basmati           Food > Grocery
2                              Boite De Cookies                    Other
3                             Lot 2 Pots Opaque                    Other
4                            Tasse En Plastique  Kitchenware > Tableware
5  Lot De 3 Boites De Conservation En Plastique                    Other
6                                           Bol  Kitchenware > Tableware
7                                     Lampe Led        Home & Appliances
8                Boite De Conservation En Verre  Kitchenware > Tableware
9             Tasse En Verre Coloré Avec Paille  Kitchenware > Tableware


### Data Exploration and Visualization

In [99]:
#Average Pricing by Category

# Compute the average price per category.
avg_price_by_category = df_products.groupby("Category")["Price"].mean().reset_index()

# Create a bar chart for average price by category.
fig_category = px.bar(avg_price_by_category,
                      x="Category", 
                      y="Price",
                      title="Average Price by Product Category",
                      labels={"Price": "Average Price", "Category": "Product Category"})
fig_category.show()

In [100]:
# Detecting Promotional Pricing Patterns

# Create a flag: True if a product has promotional details (i.e., not missing), False otherwise.
df_products["On_Promotion"] = df_products["Promotional Details"].notna()

# Compute average price for promotional and non-promotional items.
avg_price_promo = df_products.groupby("On_Promotion")["Price"].mean().reset_index()

# Create a bar chart to compare.
fig_promo = px.bar(avg_price_promo,
                   x="On_Promotion", 
                   y="Price",
                   title="Average Price: Promotional vs. Non-Promotional Products",
                   labels={"On_Promotion": "On Promotion", "Price": "Average Price"})

fig_promo.show()

In [101]:
# Assess product availability trends over time.

# Group by Category and Availability, then count the occurrences.
availability_by_category = df_products.groupby(['Category', 'Availability']).size().reset_index(name='Count')
print(availability_by_category)

fig_avail_cat = px.bar(
    availability_by_category,
    x='Category',
    y='Count',
    color='Availability',   # Different colors for different availability statuses
    title="Availability Grouped by Product Category",
    labels={'Count': 'Number of Products', 'Category': 'Product Category'},
    barmode='stack'  # This stacks the counts for each availability status within each category
)

fig_avail_cat.show()

                   Category        Availability  Count
0  Cookware & Kitchen Tools  Voir disponibilité      7
1            Food > Grocery  Voir disponibilité      4
2         Home & Appliances  Voir disponibilité     10
3   Kitchenware > Tableware  Voir disponibilité     40
4                     Other  Voir disponibilité     39
