In [2]:
import pandas as pd
import re
from collections import Counter

In [3]:
# 1) Charger les données
df = pd.read_csv("Headphone_Dataset.csv")


In [23]:
# 2) Fonctions d'extraction depuis la colonne "Comment"

def extract_title(text):
    lines = text.splitlines()
    return lines[0].strip() if lines else None

def extract_country(text):
    match = re.search(r"Reviewed in (.*?) on", text)
    return match.group(1).strip() if match else None

def extract_date(text):
    match = re.search(r"Reviewed in .*? on (.*)", text)
    if match:
        line = match.group(1).splitlines()[0]
        return line.strip()
    return None

def extract_color(text):
    for line in text.splitlines():
        if line.startswith("Color:"):
            color = line.replace("Color:", "").strip()
            
            # enlever Verified Purchase si collé
            color = color.replace("Verified Purchase", "").strip()
            
            # couper tout après un |
            if "|" in color:
                color = color.split("|")[0].strip()
            
            # couper tout après un mot qui n'est pas une couleur
            # ici on prend juste le premier mot alphabétique
            color = color.split()[0].strip()
            
            return color
    return None




def extract_verified(text):
    return "Verified Purchase" in text

def extract_useful_votes(text):
    match = re.search(r"(\d+)\s+people found this helpful", text)
    return int(match.group(1)) if match else 0

def extract_review_text(text):
    lines = text.splitlines()
    if len(lines) <= 3:
        return None

    body_lines = []
    for line in lines[3:]:
        if re.search(r"\d+\s+people found this helpful", line):
            break
        if line.strip() in ("Helpful", "Report"):
            break
        if line.strip() == "":
            continue
        body_lines.append(line.strip())

    return " ".join(body_lines) if body_lines else None


In [5]:

df["title"] = df["Comment"].apply(extract_title)
df["country"] = df["Comment"].apply(extract_country)
df["date"] = df["Comment"].apply(extract_date)
df["color"] = df["Comment"].apply(extract_color)
df["verified"] = df["Comment"].apply(extract_verified)
df["useful_votes"] = df["Comment"].apply(extract_useful_votes)
df["review_text"] = df["Comment"].apply(extract_review_text)

In [6]:
print(df[["title", "country", "date", "color", "verified", "useful_votes"]].head())

print("\n--- APERÇU COMMENTAIRE ---")
print(df[["title", "review_text"]].head())


                                               title            country  \
0                 Faulty product. Beware of warranty  the United States   
1         Maybe bad electronics, maybe bad shipping?  the United States   
2  DO NOT BUY THESE!!!! They are trying to trick ...  the United States   
3  Newest version is verified by Soundcore to be ...  the United States   
4                            Good specs but FRAGILE!  the United States   

                 date  color  verified  useful_votes  
0        June 9, 2024  Black      True            36  
1       April 8, 2024  Black      True            24  
2    January 12, 2024  Black      True           122  
3  September 20, 2024  Black      True             0  
4    December 7, 2022  Black      True           157  

--- APERÇU COMMENTAIRE ---
                                               title  \
0                 Faulty product. Beware of warranty   
1         Maybe bad electronics, maybe bad shipping?   
2  DO NOT BUY THESE!!!!

In [7]:
###Structure du dataset

print("\n--- DIMENSIONS ---")
print(df.shape)

print("\n--- TYPES ---")
print(df.dtypes)



--- DIMENSIONS ---
(10498, 9)

--- TYPES ---
Comment         object
Star             int64
title           object
country         object
date            object
color           object
verified          bool
useful_votes     int64
review_text     object
dtype: object


In [8]:
###Qualité des données

print("\n--- MISSING VALUES ---")
print(df.isna().sum())


--- MISSING VALUES ---
Comment         0
Star            0
title           0
country         0
date            0
color           0
verified        0
useful_votes    0
review_text     3
dtype: int64


In [9]:
###Distribution des notes
print("\n--- STAR DISTRIBUTION ---")
print(df["Star"].value_counts().sort_index())



--- STAR DISTRIBUTION ---
Star
1    1505
2    1186
3    1451
4    2166
5    4190
Name: count, dtype: int64


In [10]:
###Répartition par pays
print("\n--- TOP COUNTRIES ---")
print(df["country"].value_counts().head(10))


--- TOP COUNTRIES ---
country
the United States     8781
the United Kingdom     962
Canada                 474
Germany                133
Spain                   37
France                  34
Australia               27
Italy                   23
Mexico                  13
Brazil                   6
Name: count, dtype: int64


In [24]:
####Répartition couleur

df["color"] = df["Comment"].apply(extract_color)

print("\n--- COLORS ---")
print(df["color"].value_counts().head(20))



--- COLORS ---
color
Black         9124
Silver         652
Blue           613
Pink            63
SilverVine      21
BlackVine       20
BlackEarly       5
Name: count, dtype: int64


In [20]:
print(df["Comment"].iloc[0])


Faulty product. Beware of warranty
Reviewed in the United States on June 9, 2024
Color: BlackVerified Purchase
I've supported Anker products for 8 or 9 years now. I started buying their flashlights and nightlights years ago when my children were small, because they were well built and fair priced, so I began buying their other products as well, but never anything really expensive. I decided to give these headphones a try, rather than a more expensive name brand and was pleasantly surprised by their sound quality and noise cancelling ability. That is, until I wasn't. The right earphone began demonstrating a static sound that sounded almost like a radio station that doesn't have anything playing on it. It was even worse if you touched the earphone or moved/tilted your head. Obviously something inside of the earphone failed, so being what I thought was a great company, I looked up Ankers warranty, which was 18 months. I thought, great! Now when did I buy these? As it turned out, my warran

In [12]:
###Longueur des commentaires
df["comment_length"] = df["review_text"].apply(lambda x: len(str(x)))

print("\n--- COMMENT LENGTH STATS ---")
print(df["comment_length"].describe())



--- COMMENT LENGTH STATS ---
count    10498.000000
mean       513.358735
std        670.222564
min          3.000000
25%        187.250000
50%        346.000000
75%        611.000000
max      15503.000000
Name: comment_length, dtype: float64


In [14]:

###Mots les plus fréquents 
import re
from collections import Counter

def clean_for_freq(text):
    if text is None:
        return ""
    text = text.lower()
    text = re.sub(r"[^a-z\s]", " ", text)
    return text


df["clean_text"] = df["review_text"].apply(clean_for_freq)

all_words = " ".join(df["clean_text"]).split()
counter = Counter(all_words)

print("\n--- MOST COMMON WORDS ---")
print([w for w in counter.most_common(30) if len(w[0])>3])



--- MOST COMMON WORDS ---
[('they', 11772), ('these', 11311), ('headphones', 11059), ('that', 8692), ('with', 8641), ('them', 8530), ('noise', 8478), ('sound', 8191), ('this', 7345), ('have', 7186), ('good', 5822)]
