In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px


In [2]:
products_df = pd.read_csv("../data/products.csv")
reviews_df = pd.read_csv("../data/reviews_en.csv")


In [3]:
# Set option to display all columns (None means no limit)
# pd.set_option('display.max_columns', None)

# Set option to display all rows (None means no limit)
# pd.set_option('display.max_rows', None)

# Increase the width of columns to prevent wrapping
# pd.set_option('display.max_colwidth', None)


### 1. Data cleaning


In [4]:
products_df.columns


Index(['Unnamed: 0', 'asin', 'thumbnail', 'title', 'avg_rating', 'brand',
       'category', 'feature_bullets', 'num_reviews', 'price', 'unities'],
      dtype='object')

In [5]:
products_df.head()


Unnamed: 0.1,Unnamed: 0,asin,thumbnail,title,avg_rating,brand,category,feature_bullets,num_reviews,price,unities
0,0,B07BFMNKBJ,https://m.media-amazon.com/images/I/71MyAkpL3P...,"Nett Original Tampon sans Applicateur, Super P...",4.7,Nett,Tampons,['Son ouverture en corolle permet une adaptati...,223.0,2.66,32.0
1,1,B07YQG6J8C,https://m.media-amazon.com/images/I/71z+FTDoel...,"Nett® ProComfort® Normal, Tampon Sans Applicat...",4.6,Nett,Tampons,"[""Les tampons Nett ProComfort sont ultra confo...",303.0,3.79,24.0
2,2,B07DX91LR6,https://m.media-amazon.com/images/I/81aKn18Bqs...,"Tampax Compak Pearl, Super, 18 Tampons Avec Ap...",4.7,Always,Tampons,"['La combinaison n°\xa01 de confort, protectio...",859.0,3.1,18.0
3,3,B082VVC4ZB,https://m.media-amazon.com/images/I/71IHCEr+Kr...,"Nett Original Tampon sans Applicateur, Super, ...",4.7,Nett,Tampons,['Son ouverture en corolle permet une adaptati...,144.0,2.81,32.0
4,4,B0BQJQW12P,https://m.media-amazon.com/images/I/61omM04gC2...,"Tampax Cotton Protection, Super Plus, 18 Tampo...",4.7,Always,Tampons,['Tampax offre une protection longue durée ave...,30.0,4.5,18.0


In [6]:
products_df.drop(columns=["Unnamed: 0"], inplace=True)


In [7]:
products_df.isna().sum()


asin                 0
thumbnail            0
title                0
avg_rating         179
brand                2
category             3
feature_bullets     66
num_reviews        179
price               44
unities            703
dtype: int64

In [8]:
products_df.dtypes


asin                object
thumbnail           object
title               object
avg_rating         float64
brand               object
category            object
feature_bullets     object
num_reviews        float64
price              float64
unities            float64
dtype: object

In [9]:
products_df["category"] = products_df["category"].astype(str).fillna("Unknown")


In [10]:
products_df["category"].unique()


array(['Tampons', 'nan', 'Serviettes hygièniques', 'Lames', 'Shorties',
       'Déodorant', 'Culottes', 'Culottes et slips',
       'Culottes et sous-vêtements de protection', 'Femme',
       'Hygiène et soins intimes', 'Coupes menstruelles',
       'Hygiène et épilation', 'Vitamines, minéraux et compléments',
       'Protège-slips',
       'Protections, serviettes hygiéniques, couches pour adultes, protège-slips',
       'Vêtements de contention médicaux', 'Incontinence', 'Visage',
       'Coussinets pour chaussures', 'Beauté et soins',
       "Produits d'hygiène féminine", 'Épilation à la cire', 'Bannetons',
       'Bien-être', "Diffuseurs d'huiles essentielles"], dtype=object)

In [11]:
cats_to_keep = [
    "Tampons",
    "Serviettes hygièniques",
    "Culottes et sous-vêtements de protection",
    "Coupes menstruelles",
    "Protections, serviettes hygiéniques, couches pour adultes, protège-slips",
]


In [12]:
products_right_cats = products_df[products_df["category"].isin(cats_to_keep)]
products_right_cats.head()


Unnamed: 0,asin,thumbnail,title,avg_rating,brand,category,feature_bullets,num_reviews,price,unities
0,B07BFMNKBJ,https://m.media-amazon.com/images/I/71MyAkpL3P...,"Nett Original Tampon sans Applicateur, Super P...",4.7,Nett,Tampons,['Son ouverture en corolle permet une adaptati...,223.0,2.66,32.0
1,B07YQG6J8C,https://m.media-amazon.com/images/I/71z+FTDoel...,"Nett® ProComfort® Normal, Tampon Sans Applicat...",4.6,Nett,Tampons,"[""Les tampons Nett ProComfort sont ultra confo...",303.0,3.79,24.0
2,B07DX91LR6,https://m.media-amazon.com/images/I/81aKn18Bqs...,"Tampax Compak Pearl, Super, 18 Tampons Avec Ap...",4.7,Always,Tampons,"['La combinaison n°\xa01 de confort, protectio...",859.0,3.1,18.0
3,B082VVC4ZB,https://m.media-amazon.com/images/I/71IHCEr+Kr...,"Nett Original Tampon sans Applicateur, Super, ...",4.7,Nett,Tampons,['Son ouverture en corolle permet une adaptati...,144.0,2.81,32.0
4,B0BQJQW12P,https://m.media-amazon.com/images/I/61omM04gC2...,"Tampax Cotton Protection, Super Plus, 18 Tampo...",4.7,Always,Tampons,['Tampax offre une protection longue durée ave...,30.0,4.5,18.0


In [13]:
reviews_df.head()


Unnamed: 0.1,Unnamed: 0,asin,rating,title,country,date,body,body_en
0,0,B07BFMNKBJ,1,Pas reçue,France,2020-12-02,Ont me dis que je les reçue alors que non !,I was told that I received them when I didn&#3...
1,1,B07BFMNKBJ,1,Sans commentaire,France,2018-10-11,Sans commentaire,No comment
2,2,B07BFMNKBJ,5,Parfait 👌,France,2022-01-27,"J'utilise ce modèle depuis des années, je ne c...","I have been using this model for years, I no l..."
3,3,B07BFMNKBJ,4,Bien mais,France,2021-11-21,Légèrement moins cher qu’en grande surface mai...,Slightly cheaper than supermarkets but Pro Con...
4,4,B07BFMNKBJ,4,Bons tampons,France,2022-01-29,Rien de particulier à signaler.,Nothing special to report.


In [14]:
reviews_df.drop(columns="Unnamed: 0", inplace=True)


In [15]:
reviews_df.describe()


Unnamed: 0,rating
count,12634.0
mean,4.2183
std,1.233517
min,1.0
25%,4.0
50%,5.0
75%,5.0
max,5.0


In [16]:
reviews_df.isna().sum()


asin        0
rating      0
title      19
country     0
date        0
body        0
body_en     0
dtype: int64

In [17]:
reviews_df["date"] = pd.to_datetime(reviews_df["date"], format="%Y-%m-%d")


In [18]:
reviews_df.dtypes


asin               object
rating              int64
title              object
country            object
date       datetime64[ns]
body               object
body_en            object
dtype: object

In [19]:
products_right_cats["asin"] = products_right_cats["asin"].str.strip().str.upper()
reviews_df["asin"] = reviews_df["asin"].str.strip().str.upper()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  products_right_cats['asin'] = products_right_cats['asin'].str.strip().str.upper()


Our analysis being reviews-centric, I decided to remove products without reviews.


In [20]:
# Checking if all asins of product are in reviews and vice versa
asin_in_reviews_not_in_products = set(reviews_df["asin"]) - set(
    products_right_cats["asin"]
)

asin_in_products_not_in_reviews = set(products_right_cats["asin"]) - set(
    reviews_df["asin"]
)

print(f"ASINs in reviews not in products: {asin_in_reviews_not_in_products}")
print(f"ASINs in products not in reviews: {asin_in_products_not_in_reviews}")


ASINs in reviews not in products: {'B0B5HMR5MW', 'B07XM84QGQ', 'B07QBCGYXQ', 'B01LW2OYC1', 'B09VMHGQFG', 'B08ZHFYP3T', 'B0BSV7TVJR', 'B07TFHKHM4', 'B07WYYXGL5', 'B07FCCFV15', 'B07CG98SVR', 'B084WWBCNG', 'B00B3S2QHS', 'B08KWR5R95', 'B00CP4FKVS', 'B00XAEXV1I', 'B01IDA4MFM', 'B0BV2WJZ3N', 'B073ZGL2NC', 'B07DX95FCL', 'B086N95H6B', 'B00TYY31U4', 'B00OPX2RMW', 'B08L5SWSQM', 'B0853BWTSN', 'B00I83ZD2Q', 'B00NJNJ6O6', 'B07SC8SH6W', 'B07JGTJ56P', 'B09WN36G4G', 'B09VT936MM', 'B0983JCQS9', 'B001NZHAZY', 'B00T4GNI1O', 'B07YQDXWG1', 'B00XAH04X8', 'B0052ED6T6', 'B00ET7PFFA', 'B09KHDQJWK', 'B08YLH3MB2', 'B01FUURB2O', 'B08LM4H7CP', 'B07XTKX2M2', 'B07NCRTG3L', 'B01IDA4FOU', 'B06ZY7BR3V', 'B0BW8V2HSN', 'B07XYGBR28', 'B0B77XZR88', 'B00C4UE9TC', 'B000W6KKMK', 'B07V1G729W', 'B07DXGFW54', 'B097B1NFH5', 'B01BV4XLN6', 'B00DBWR7RS', 'B07Q1C27K2', 'B00LN3RDDA', 'B09D3L71W6', 'B09NTPXH26', 'B07SB4NBKY', 'B07C94P79Q', 'B086X2MBZF', 'B07BPXDNDC', 'B07Q758P4F', 'B00BN7Y0T6', 'B084DW23M5', 'B00BSXSQP4', 'B00KH7CLA8',

In [21]:
products = products_right_cats[products_right_cats["asin"].isin(reviews_df["asin"])]
reviews = reviews_df[reviews_df["asin"].isin(products_right_cats["asin"])]


In [22]:
asin_in_reviews_not_in_products = set(reviews["asin"]) - set(products["asin"])

asin_in_products_not_in_reviews = set(products["asin"]) - set(reviews["asin"])

print(f"ASINs in reviews not in products: {asin_in_reviews_not_in_products}")
print(f"ASINs in products not in reviews: {asin_in_products_not_in_reviews}")


ASINs in reviews not in products: set()
ASINs in products not in reviews: set()


### 2. Summary Statistics


In [23]:
products.describe()


Unnamed: 0,avg_rating,num_reviews,price,unities
count,405.0,405.0,387.0,112.0
mean,4.300741,601.580247,16.980155,15.794643
std,0.452905,1384.832038,11.014215,26.71543
min,1.0,1.0,1.55,1.0
25%,4.2,31.0,8.485,1.0
50%,4.4,122.0,14.99,2.0
75%,4.6,575.0,23.295,20.0
max,5.0,14694.0,59.97,200.0


In [24]:
products_df.describe()


Unnamed: 0,avg_rating,num_reviews,price,unities
count,690.0,690.0,825.0,166.0
mean,4.302174,553.118841,15.805515,11.975904
std,0.520002,1576.638048,11.428267,23.19194
min,1.0,1.0,1.13,1.0
25%,4.1,15.0,7.64,1.0
50%,4.4,82.5,12.5,1.0
75%,4.6,382.75,20.87,16.0
max,5.0,18631.0,69.9,200.0


The removal of products without reviews does not significantly impact the average rating and price of the products but does affect the average number of reviews and quantities (unities). The datasets are largely consistent in terms of spread and maximum values, with slight variations in the mean and quartiles for number of reviews and unities, reflecting the broader range of products included in the unfiltered dataset.


### 3. Data Visualization


##### **A. Average ratings Analysis**


In [25]:
fig = px.histogram(
    products_right_cats,
    x="avg_rating",
    color="category",
    title="Distribution of Average Ratings by Category",
    labels={"avg_rating": "Average Rating"},
    color_discrete_sequence=px.colors.qualitative.Plotly,
)

fig.update_layout(bargap=0.2)  # Adjust the gap between bars
fig.show()


ps: just click on the legend to filter its awesome!


In [26]:
products_right_cats.groupby("category")["avg_rating"].mean()


category
Coupes menstruelles                                                         3.783333
Culottes et sous-vêtements de protection                                    4.266667
Protections, serviettes hygiéniques, couches pour adultes, protège-slips    4.566667
Serviettes hygièniques                                                      4.423913
Tampons                                                                     4.232636
Name: avg_rating, dtype: float64

Across the categories, menstrual products on Amazon generally receive high average ratings, ranging between 4.23 and 4.57. Notably, "Serviettes hygiéniques" (sanitary pads) and the broader category of "Protections, serviettes hygiéniques, couches pour adultes, protège-slips" emerge as the top-rated categories. This suggests that customers are particularly satisfied with these products, reflecting their quality, effectiveness, and possibly the value they provide to users. The high ratings across all categories highlight a general customer satisfaction with menstrual products available on the platform, with sanitary pads and related protective products standing out for their exceptional reviews.


In [27]:
# Sort products within each category by average rating in descending order
sorted_products = products_right_cats.sort_values(
    ["category", "avg_rating"], ascending=[True, False]
)

# Group by category and take the top 10 products from each category
top_products_per_category = sorted_products.groupby("category").head(10)


In [28]:
top_products_per_category["product"] = (
    top_products_per_category["brand"] + "-" + top_products_per_category["asin"]
)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [29]:
shades_of_pink = [
    "#FFC0CB",
    "#FFB6C1",
    "#FF69B4",
    "#FF1493",
    "#DB7093",
    "#C71585",
    "#FFAEB9",
    "#FF82AB",
    "#FF34B3",
    "#FF3E96",
]

fig8 = px.bar(
    top_products_per_category,
    x="avg_rating",
    y="product",
    color="category",
    title="Top 10 Products with Highest Average Rating per Category",
    labels={"avg_rating": "Average Rating", "product": "Product"},
    orientation="h",
    color_discrete_sequence=shades_of_pink,
)

fig8.update_xaxes(range=[3.5, 5])

fig8.update_layout(
    yaxis={"categoryorder": "total ascending"},
    xaxis_title="Average Rating",
    yaxis_title="Product Title",
    coloraxis_colorbar={"title": "Category"},
)


fig8.show()


##### **B. Number of reviews Analysis**


In [30]:
nreviews_per_cat = (
    products_right_cats.groupby("category")["num_reviews"].sum().reset_index()
)
nreviews_per_cat.sort_values("num_reviews", ascending=False, inplace=True)


In [31]:
fig2 = px.bar(
    nreviews_per_cat,
    y="num_reviews",
    x="category",
    title="Number of reviews per category",
    labels={"num_reviews": "Number of Reviews"},
    color_discrete_sequence=shades_of_pink,
)

fig2.update_layout(
    width=800, height=600
)  # Example: Set width to 800 pixels and height to 600 pixels

fig2.show()


When plotting the number of reviews per product to identify popular products, it's observed that "Serviettes hygiéniques" (sanitary pads) and "Tampons" dominate in terms of review volume, significantly outnumbering the reviews received by products in other categories. This disparity suggests that sanitary pads and tampons are the most engaged-with and possibly the most purchased or used menstrual products among the range available on Amazon.


In [32]:
reviews.head()


Unnamed: 0,asin,rating,title,country,date,body,body_en
0,B07BFMNKBJ,1,Pas reçue,France,2020-12-02,Ont me dis que je les reçue alors que non !,I was told that I received them when I didn&#3...
1,B07BFMNKBJ,1,Sans commentaire,France,2018-10-11,Sans commentaire,No comment
2,B07BFMNKBJ,5,Parfait 👌,France,2022-01-27,"J'utilise ce modèle depuis des années, je ne c...","I have been using this model for years, I no l..."
3,B07BFMNKBJ,4,Bien mais,France,2021-11-21,Légèrement moins cher qu’en grande surface mai...,Slightly cheaper than supermarkets but Pro Con...
4,B07BFMNKBJ,4,Bons tampons,France,2022-01-29,Rien de particulier à signaler.,Nothing special to report.


In [33]:
avg_nreviews = (
    products_right_cats.groupby("category")["num_reviews"].mean().reset_index()
)
avg_nreviews


Unnamed: 0,category,num_reviews
0,Coupes menstruelles,3247.0
1,Culottes et sous-vêtements de protection,1524.666667
2,"Protections, serviettes hygiéniques, couches p...",1459.833333
3,Serviettes hygièniques,592.951087
4,Tampons,395.610879


In [34]:
fig8 = px.bar(
    avg_nreviews,
    x="category",
    y="num_reviews",
    orientation="v",
    labels={"num_reviews": "Average Number of Reviews"},
    title="Average number of reviews per category",
    color="category",
    color_discrete_sequence=shades_of_pink,
    text="num_reviews",
)

# Customize text by updating traces
fig8.update_traces(
    texttemplate="%{text:.2s}",  # This template shows the text. '.2s' formats numbers with 2 significant digits
    textposition="outside",
)

# Adjust bar width (value between 0 and 1; lower values make bars thinner)
fig8.update_layout(
    margin=dict(l=20, r=20, t=30, b=20)
)  # Adjust left, right, top, bottom margins
fig8.update_layout(width=800, height=600)  # Adjust the figure size

fig8.show()


- **High Engagement with Menstrual Cups:** The "Coupes menstruelles" category significantly outpaces others in average review count, suggesting a high level of user interaction and possibly a strong community of users who share their experiences. This could indicate high satisfaction or interest in menstrual cups as a sustainable alternative to traditional menstrual products.

- **Protective Underwear and Broad Protection Category:** "Culottes et sous-vêtements de protection" and the broader category of protection products, including sanitary pads and incontinence products, also show relatively high engagement. This could reflect the growing market demand for comfortable and reliable protection, especially in categories that offer innovative solutions or cater to specific needs.

- **Lower Engagement with Sanitary Pads and Tampons:** The comparatively lower average review counts for "Serviettes hygiéniques" and "Tampons" might suggest a mature market with established products, where customers feel less compelled to leave reviews. Alternatively, this could indicate a wider acceptance and usage of these products, making individual reviews less common as the products are considered standard and users feel less need to share experiences.

Overall Implications:

- **Product Innovation and Sustainability:** The high engagement in the menstrual cups category might reflect a consumer trend towards sustainable and reusable menstrual products. It could also point to a community that actively discusses and promotes these alternatives.

- **Market Maturity and Consumer Habits:** The lower engagement in traditional categories like sanitary pads and tampons may highlight market saturation or stable consumer habits, where the incentive to review is lower unless the product stands out significantly in terms of innovation or quality.

- **Opportunities for Growth:** The varied levels of engagement across categories indicate opportunities for brands to innovate and engage customers, particularly in categories with lower average review counts. Highlighting benefits, engaging in community building, and encouraging reviews could be effective strategies.


##### **C. Price Analysis**


In [35]:
price_v_cat = products.groupby("category")["price"].mean().reset_index()


In [36]:
fig3 = px.histogram(
    products_right_cats,
    x="price",
    color="category",
    title="Price distribution per Category",
    hover_data=["unities"],
)
fig3.show()


In [37]:
fig4 = px.scatter(
    products_right_cats,
    x="price",
    y="num_reviews",
    color="category",
    title="Price distribution per Category",
    labels={"num_reviews": "Number of Reviews", "price": "Price"},
    hover_data=["unities"],
)
fig4.show()


In [38]:
avg_unit = products_right_cats[products_right_cats["price"] <= 20]
avg_unit["unities"] = avg_unit["unities"].astype(float)
avg_unit = avg_unit.groupby("category")["unities"].mean()
avg_unit




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



category
Coupes menstruelles                                                          1.000000
Culottes et sous-vêtements de protection                                          NaN
Protections, serviettes hygiéniques, couches pour adultes, protège-slips          NaN
Serviettes hygièniques                                                      12.000000
Tampons                                                                     14.651376
Name: unities, dtype: float64

In [42]:
fig9 = px.box(
    products_right_cats, x="category", y="price", color_discrete_sequence=["#C71585"]
)

fig9.update_layout(width=800, height=800)

fig9.show()


**Tampons** broader price range extends up to 60 euros, 50% of tampon prices are concentrated between 2 and 20 euros. This indicates a significant portion of the tampon market is accessible and affordable, catering to the essential needs of consumers. The lower half of the price spectrum underscores the availability of cost-effective options, likely appealing to a wide demographic seeking reliable menstrual care without a high price tag. The extension of the price range up to 60 euros reveals the presence of premium offerings within the tampon category. These higher-priced options may include tampons made from organic materials, those boasting specialized applicators for enhanced comfort.

**Serviettes hygiéniques** (sanitary pads) show a slightly broader price range, from 1.5 to 20 euros, with an average packaging size of 12 units for products within this interval. This implies that sanitary pads are generally available in smaller pack sizes compared to tampons, possibly reflecting distinct usage patterns, consumer preferences, or manufacturers' product positioning strategies.
Additionally, the analysis brings attention to products such as:

**Coupes menstruelles** (menstrual cups), which are highlighted for their higher price points ranging from 17 to 32 euros. However, it's important to note their reusable nature, which offers long-term value and sustainability benefits, potentially justifying the higher upfront cost.

**Culottes et sous-vêtements de protection**(protective underwear) also feature higher prices, ranging from 22 to 31 euros. This category includes products offering innovative solutions for menstrual and incontinence protection, reflecting a growing consumer interest in comfort, reliability, and sustainability.

**Protections, serviettes hygiéniques, couches pour adultes, protège-slips** (a broad category encompassing various sanitary and incontinence products) have price ranges from 16 to 30 euros, indicating a wide market offering catering to different needs and preferences.


However, most variation in price could be explained by the number of units per package. The solve this bias, lets normalize the data with a price per unit column.


In [40]:
products_right_cats["unit_price"] = (
    products_right_cats["price"] / products_right_cats["unities"]
)
products_right_cats.head()




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



Unnamed: 0,asin,thumbnail,title,avg_rating,brand,category,feature_bullets,num_reviews,price,unities,unit_price
0,B07BFMNKBJ,https://m.media-amazon.com/images/I/71MyAkpL3P...,"Nett Original Tampon sans Applicateur, Super P...",4.7,Nett,Tampons,['Son ouverture en corolle permet une adaptati...,223.0,2.66,32.0,0.083125
1,B07YQG6J8C,https://m.media-amazon.com/images/I/71z+FTDoel...,"Nett® ProComfort® Normal, Tampon Sans Applicat...",4.6,Nett,Tampons,"[""Les tampons Nett ProComfort sont ultra confo...",303.0,3.79,24.0,0.157917
2,B07DX91LR6,https://m.media-amazon.com/images/I/81aKn18Bqs...,"Tampax Compak Pearl, Super, 18 Tampons Avec Ap...",4.7,Always,Tampons,"['La combinaison n°\xa01 de confort, protectio...",859.0,3.1,18.0,0.172222
3,B082VVC4ZB,https://m.media-amazon.com/images/I/71IHCEr+Kr...,"Nett Original Tampon sans Applicateur, Super, ...",4.7,Nett,Tampons,['Son ouverture en corolle permet une adaptati...,144.0,2.81,32.0,0.087813
4,B0BQJQW12P,https://m.media-amazon.com/images/I/61omM04gC2...,"Tampax Cotton Protection, Super Plus, 18 Tampo...",4.7,Always,Tampons,['Tampax offre une protection longue durée ave...,30.0,4.5,18.0,0.25


#### **D. Reviews Analysis**


In [43]:
reviews.head()


Unnamed: 0,asin,rating,title,country,date,body,body_en
0,B07BFMNKBJ,1,Pas reçue,France,2020-12-02,Ont me dis que je les reçue alors que non !,I was told that I received them when I didn&#3...
1,B07BFMNKBJ,1,Sans commentaire,France,2018-10-11,Sans commentaire,No comment
2,B07BFMNKBJ,5,Parfait 👌,France,2022-01-27,"J'utilise ce modèle depuis des années, je ne c...","I have been using this model for years, I no l..."
3,B07BFMNKBJ,4,Bien mais,France,2021-11-21,Légèrement moins cher qu’en grande surface mai...,Slightly cheaper than supermarkets but Pro Con...
4,B07BFMNKBJ,4,Bons tampons,France,2022-01-29,Rien de particulier à signaler.,Nothing special to report.


In [44]:
avg_ratings = reviews.groupby("asin").agg({"rating": "mean", "body": "size"})
avg_ratings.rename(columns={"rating": "avg_rating", "body": "num_review"}, inplace=True)
avg_ratings


Unnamed: 0_level_0,avg_rating,num_review
asin,Unnamed: 1_level_1,Unnamed: 2_level_1
B00028O5RY,3.977528,89
B000E3DXCA,3.862745,51
B000FAG6X0,4.500000,10
B0012ZNZF4,3.333333,15
B0013CA9KK,4.621622,37
...,...,...
B0CL3VMWJR,5.000000,1
B0CL56DM4L,5.000000,1
B0CL56VDZV,4.000000,2
B0CL9VMF6K,4.000000,1


In [45]:
fig6 = px.scatter(
    avg_ratings,
    x="avg_rating",
    y="num_review",
    title="Correlation between Average Rating and Number of Reviews",
    labels={"avg_rating": "Average Rating", "num_review": "Number of Reviews"},
)

# Set all markers to a specific shade of pink
fig6.update_traces(marker=dict(color="#FF69B4"))  # Example: Using Hot Pink

fig6.show()


In [46]:
correlation = avg_ratings["avg_rating"].corr(avg_ratings["num_review"])
print(
    f"The Pearson correlation coefficient between average rating and number of reviews is: {correlation}"
)


The Pearson correlation coefficient between average rating and number of reviews is: 0.059693198536587756


The analysis of the correlation between the average rating of a product and the number of reviews it has reveals a Pearson correlation coefficient close to 0. This outcome indicates a no or weak linear relationship between these two variables. In practical terms, this suggests that the average rating a product receives on Amazon does not strongly influence how many reviews it gets. This could imply that customers are not necessarily more inclined to leave reviews for products with higher or lower ratings. Instead, factors other than the average rating—such as product visibility, marketing strategies, or consumer interest—might play a more significant role in determining the number of reviews a product receives. This finding highlights the complexity of consumer behavior on e-commerce platforms and suggests that encouraging customer reviews may require strategies beyond simply aiming for higher product ratings.


In [47]:
import scipy.stats as stats

# Assuming avg_ratings is your DataFrame with 'avg_rating' and 'review_count' columns
spearman_corr, spearman_pvalue = stats.spearmanr(
    avg_ratings["avg_rating"], avg_ratings["num_review"]
)

print(f"Spearman's rank correlation coefficient: {spearman_corr}")
print(f"P-value: {spearman_pvalue}")


Spearman's rank correlation coefficient: -0.043851631375280115
P-value: 0.3787548217055793


The results from Spearman's rank correlation analysis reveal a coefficient of -0.04385, suggesting a very weak inverse relationship between the average rating of a product and the number of reviews it receives. The near-zero value indicates that as the average rating slightly decreases, the number of reviews slightly increases, or vice versa, but this relationship is very weak and likely not meaningful in practice.

Moreover, the p-value of 0.37875 exceeds common significance levels (e.g., 0.05), indicating that the observed correlation (or lack thereof) could very well be due to chance. In statistical terms, we fail to reject the null hypothesis of no correlation, meaning there isn't statistically significant evidence to suggest a meaningful non-linear relationship between these two variables based on Spearman's rank correlation analysis.

Conclusion:
Both linear (Pearson) and non-linear (Spearman) correlation analyses suggest that there is no strong or statistically significant relationship between the average rating of a product and the number of reviews it has.


#### **E. Price Sensitivity Analysis**


In [48]:
fig9 = px.scatter(
    products_df,
    x="avg_rating",
    y="price",
    title="Correlation between Average Rating and price of product",
    labels={"avg_rating": "Average Rating", "price": "Price"},
)

# Set all markers to a specific shade of pink
fig9.update_traces(marker=dict(color="#FF69B4"))  # Example: Using Hot Pink

fig9.show()


In [49]:
corr = products_df.dropna()


In [50]:
import scipy.stats as stats

# Assuming avg_ratings is your DataFrame with 'avg_rating' and 'review_count' columns
spearman_corr, spearman_pvalue = stats.spearmanr(
    corr["avg_rating"], corr["num_reviews"]
)

print(f"Spearman's rank correlation coefficient: {spearman_corr}")
print(f"P-value: {spearman_pvalue}")


Spearman's rank correlation coefficient: 0.13668414841201879
P-value: 0.14522481713308405


While there appears to be a slight positive relationship between the average rating of products and the number of reviews they receive, this correlation is not strong and not statistically significant based on the data provided.


##### **F. Brand Analysis**


In [51]:
brand_counts = products_df.groupby("brand").size().reset_index()
brand_counts.rename(columns={0: "n_products"}, inplace=True)
brand_counts = brand_counts.sort_values(by="n_products", ascending=False)
top5_global_brands = brand_counts.head()


In [52]:
fig10 = px.bar(
    top5_global_brands,
    x="brand",
    y="n_products",
    title="Top 5 brands with the most products",
    labels={"n_products": "Number of Product", "brand": "Brand"},
)

fig10.update_traces(marker=dict(color="#FF69B4"))

fig10.show()


- **Always with 92 products** stands out significantly with the highest variety of products available. This brand's extensive range indicates its dominant presence in the market, catering to a broad spectrum of consumer needs and preferences.
  LUNA with 33 products.

- **Luna, holding the second position** offers a considerably lower number of products compared to Always but still maintains a strong market presence. This suggests that Luna has a focused yet diverse product line, appealing to specific consumer segments.

- **Carefree with 28 products** Carefree secures the third spot with a close range to Luna, indicating it also has a substantial variety of offerings. This brand's presence suggests a solid market position with a commitment to meeting diverse consumer needs.

The significant gap between the first brand (Always) and the second (LUNA) highlights the dominant market position of Always, reflecting its widespread consumer acceptance and trust. LUNA and Carefree, while having fewer products, still show strong brand strength and consumer loyalty, likely catering to niche markets or specific consumer needs with their product offerings. This analysis underlines the importance of product diversity in brand strength and market presence.


In [53]:
brand_cat_size = (
    products_right_cats.groupby(["brand", "category"])
    .size()
    .reset_index(name="n_product")
)

# Get top 5 brands within each category
top5_brand_cat = (
    brand_cat_size.groupby("category")
    .apply(lambda x: x.nlargest(5, "n_product"))
    .reset_index(drop=True)
)

# Sorting for better visualization
top5_brand_cat.sort_values(
    by=["category", "n_product"], inplace=True, ascending=[True, False]
)


In [59]:
fig11 = px.bar(
    top5_brand_cat,
    x="brand",
    y="n_product",
    labels={"brand": "Brand", "n_product": "Number Of Products"},
    title="Top 5 brands by number of products for each category",
    color="category",
    color_discrete_sequence=shades_of_pink,
    barmode="group",
    category_orders={"category": top5_brand_cat["category"].unique()},
)  # Optional: to maintain the group ordering by category

fig11.update_layout(
    xaxis={"categoryorder": "total descending"}
)  # Optional: sort bars within groups
fig11.show()


The analysis of product distribution across various feminine hygiene categories reveals distinct market leaders for each specific category, underscoring the diverse preferences and needs of consumers.

In the category of `menstrual cups`, **LOKBY** emerges as the top brand, indicating a strong consumer preference for their product offerings in this segment.

Similarly, **Fulidngzg** leads the category for `protective underwear`, highlighting its dominance in providing solutions for menstrual protection beyond traditional methods.

For `protèges-slips` (panty liners), **Tena** stands out as the market leader, reflecting its success in catering to daily feminine hygiene needs.

In the highly competitive `pads` category, **Always** and **Nana** are the top two brands, respectively, demonstrating their strong market presence and consumer trust in their sanitary pad products.

In the `tampons` category, **Luna**, **Tampax**, and **Cup** rank as the top three brands, showcasing a varied consumer preference within this segment. Each of these brands has successfully established itself in a niche market, catering to the specific preferences and needs of their customer base.

This diversity in leading brands across different categories highlights the importance of targeted product development and marketing strategies in the feminine hygiene market. It reflects the varied consumer preferences that exist within this market and the importance of offering a range of products to meet the diverse needs of consumers.


# **Testing new categorization**


In [55]:
import pandas as pd
import inflect
import unidecode

# Initialize the inflect engine
p = inflect.engine()


# Keywords dictionary
keywords_dict = {
    "Tampons": ["Tampon", "Tampons"],
    "Culottes et sous-vêtements de protection": ["Culotte", "Culottes"],
    "Coupes menstruelles": [
        "Coupe menstruelle",
        "Coupes menstruelles",
        "Menstrual Cup",
        "Cup menstruelle",
        "cup menstruelle",
    ],
    "Serviettes hygièniques": [
        "Serviette hygiénique",
        "Sanitary Napkin",
        "Serviettes hygièniques",
    ],
    "Protège-slips": ["Protège-slip", "Protège slip", "protege slip"],
}


def normalize_text(text):
    if not isinstance(text, str):
        return ""  # Return an empty string if the input is not a string
    return unidecode.unidecode(text).lower().replace("-", " ")


In [56]:
def find_category(title):
    print(f"Processing title: {title} (Type: {type(title)})")  # Debugging line
    if pd.isna(title) or not isinstance(title, str):
        return None
    if pd.isna(title) or not isinstance(title, str):
        return None
    normalized_title = normalize_text(title)
    for category, keywords in keywords_dict.items():
        for keyword in keywords:
            normalized_keyword = normalize_text(keyword)
            if (
                normalized_keyword in normalized_title
                or normalize_text(p.plural(keyword)) in normalized_title
                or normalize_text(p.singular_noun(keyword)) in normalized_title
            ):
                return category
    return None


In [57]:
for index, row in products_df.iterrows():
    title = row["title"]
    category = find_category(title)
    print(f"Title: {title} => Category: {category}")
    # Optionally, directly assign the category to the DataFrame for verification
    products_df.at[index, "true_cat"] = category


Processing title: Nett Original Tampon sans Applicateur, Super Plus, Boite de 24 Tampons (Type: <class 'str'>)
Title: Nett Original Tampon sans Applicateur, Super Plus, Boite de 24 Tampons => Category: Tampons
Processing title: Nett® ProComfort® Normal, Tampon Sans Applicateur, pour Flux Normal, Boite 24 Tampons (Type: <class 'str'>)
Title: Nett® ProComfort® Normal, Tampon Sans Applicateur, pour Flux Normal, Boite 24 Tampons => Category: Tampons
Processing title: Tampax Compak Pearl, Super, 18 Tampons Avec Applicateur, Flux Moyens à Abondants, Triple Confort avec Applicateur doux, MotionFit, Voile Douceur, Certifié Oeko-Tex (Type: <class 'str'>)
Title: Tampax Compak Pearl, Super, 18 Tampons Avec Applicateur, Flux Moyens à Abondants, Triple Confort avec Applicateur doux, MotionFit, Voile Douceur, Certifié Oeko-Tex => Category: Tampons
Processing title: Nett Original Tampon sans Applicateur, Super, Boite de 32 Tampons (Type: <class 'str'>)
Title: Nett Original Tampon sans Applicateur, Su