In [45]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from numpy.linalg import norm

In [46]:
path = "data/data_with_best_model_sentiment_analysis.csv"
df = pd.read_csv(path, encoding='latin-1')


# Embedding des labels

In [47]:
labels=[
  'customer service', 'service', 'service_experience', 'service quality', 'service refusal',
  'staff behavior', 'staffing', 'staff attitude', 'manager_interaction', 'staff friendliness', 'staff_behavior', 'management', 'behavior',
  'food quality', 'food_quality', 'product consistency',
  'operating hours', 'closing time',
  'order accuracy', 'order_accuracy', 'order resolution', 'order', 'order issue', 'wrong_order',
  'location', 'location size', 'decor','atmosphere',
  'dining experience', 'general experience', 'general_experience', 'satisfaction', 'overall experience', 'overall_experience', 'experience',
  'cleanliness',
  'refund_policy',
  'recommendation',
  'drive-thru experience', 'drive_through_experience',
  'safety concerns', 'security',
  'advice',
  'receipt issue',
  'menu options', 'dissatisfaction with menu', 'menu_options',
  'customer loyalty', 'app_rewards',
  'language barrier',
  'long wait time', 'wait_time', 'service speed', 'service_speed', 'speed', 'slow_service',
  'left without ordering',
  'product availability',
  'convenience',
  'affordability',
  'busyness',
  'request',
 ]


In [48]:
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2') #embeding à 384 dimentions, max token = 256
labels_embedded = model.encode(labels)

In [49]:
# Group definitions
label_groups = {
    "customer service": ['customer service', 'service', 'service_experience', 'service quality', 'service refusal'],
    "staff": ['staff behavior', 'staffing', 'staff attitude', 'manager_interaction', 'staff friendliness', 'staff_behavior', 'management', 'behavior'],
    "food quality": ['food quality', 'food_quality', 'product consistency'],
    "operating hours": ['operating hours', 'closing time'],
    "order accuracy": ['order accuracy', 'order_accuracy', 'order resolution', 'order', 'order issue', 'wrong_order'],
    "location": ['location', 'location size', 'decor', 'atmosphere'],
    "general experience": ['dining experience', 'general experience', 'general_experience', 'satisfaction', 'overall experience', 'overall_experience', 'experience'],
    "cleanliness": ['cleanliness'],
    "refund_policy": ['refund_policy'],
    "recommendation": ['recommendation'],
    "drive-thru experience": ['drive-thru experience', 'drive_through_experience'],
    "security": ['safety concerns', 'security'],
    "advice": ['advice'],
    "receipt issue": ['receipt issue'],
    "menu": ['menu options', 'dissatisfaction with menu', 'menu_options'],
    "customer loyalty": ['customer loyalty', 'app_rewards'],
    "language barrier": ['language barrier'],
    "waiting time": ['long wait time', 'wait_time', 'service speed', 'service_speed', 'speed', 'slow_service'],
    "left without ordering": ['left without ordering'],
    "product availability": ['product availability'],
    "convenience": ['convenience'],
    "affordability": ['affordability'],
    "busyness": ['busyness'],
    "request": ['request'],
}

# Inverse mapping: label -> group
label_to_group = {}
for group, group_labels in label_groups.items():
    for label in group_labels:
        label_to_group[label] = group


In [50]:
len(label_groups)

24

# Embedding et dot product sur tout le dataset

In [51]:
from tqdm.notebook import tqdm

# Remplacer les valeurs manquantes par une chaîne vide
df['tokenized_reviews'] = df['tokenized_reviews'].fillna('')

# Charger le modèle SentenceTransformer
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

# Encoder les avis avec une barre de progression
reviews_embedded = []
for review in tqdm(df["tokenized_reviews"], desc="Encoding reviews"):
    reviews_embedded.append(model.encode(review))

# Ajouter les embeddings au DataFrame
df['review_embedded'] = reviews_embedded

Encoding reviews:   0%|          | 0/22250 [00:00<?, ?it/s]

In [52]:
df.to_csv("data/data_with_topics.csv", index=False)
df= pd.read_csv("data/data_with_topics.csv", encoding='latin-1')

In [53]:
from collections import defaultdict
import numpy as np
from numpy.linalg import norm

# Prepare list of list of tuples: top 5 (group, score) per review
top_group_scores = []

for review in reviews_embedded:
    group_scores = defaultdict(list)

    for i, label in enumerate(labels):
        group = label_to_group.get(label, label)
        score = np.dot(review, labels_embedded[i]) / (norm(review) * norm(labels_embedded[i]))
        group_scores[group].append(score)

    # For each group, keep the highest score
    group_max_scores = [(group, max(scores)) for group, scores in group_scores.items()]
    group_max_scores.sort(key=lambda x: x[1], reverse=True)

    # Only top 5 tuples
    top_group_scores.append(group_max_scores[:5])

# Assign each top tuple to its own column
for i in range(5):
    df[f"top_{i+1}"] = [
        group_scores[i] if i < len(group_scores) else (None, None)
        for group_scores in top_group_scores
    ]



In [54]:
# List of all unique group labels
all_group_labels = list(set(label_to_group.values()))

# This will be a list of dicts, one per review
group_score_dicts = []

for review in reviews_embedded:
    group_scores = defaultdict(list)

    for i, label in enumerate(labels):
        group = label_to_group.get(label, label)
        score = np.dot(review, labels_embedded[i]) / (norm(review) * norm(labels_embedded[i]))
        group_scores[group].append(score)

    # Take max score per group
    group_max_scores = {group: max(scores) for group, scores in group_scores.items()}
    group_score_dicts.append(group_max_scores)

# Convert list of dicts to DataFrame (each group becomes a column)
group_scores_df = pd.DataFrame(group_score_dicts)

# Merge into your main DataFrame
df = pd.concat([df.reset_index(drop=True), group_scores_df], axis=1)


In [55]:
df[['review',
       'actual_sentiment', 'Roberta_label',
       'top_1', 'top_2', 'top_3', 'top_4', 'top_5', 'customer service',
       'staff', 'food quality', 'operating hours', 'order accuracy',
       'location', 'general experience', 'cleanliness', 'refund_policy',
       'recommendation', 'drive-thru experience', 'security', 'advice',
       'receipt issue', 'menu', 'customer loyalty', 'language barrier',
       'waiting time', 'left without ordering', 'product availability',
       'convenience', 'affordability', 'busyness', 'request']].head()

Unnamed: 0,review,actual_sentiment,Roberta_label,top_1,top_2,top_3,top_4,top_5,customer service,staff,...,menu,customer loyalty,language barrier,waiting time,left without ordering,product availability,convenience,affordability,busyness,request
0,Why does it look like someone spit on my food?...,negative,negative,"(food quality, 0.31562406)","(cleanliness, 0.27512178)","(general experience, 0.23939143)","(staff, 0.23229381)","(busyness, 0.19266683)",0.155685,0.232294,...,0.158993,0.074503,0.112069,0.141913,0.13921,0.112632,0.095136,0.010976,0.192667,0.132173
1,It'd McDonalds. It is what it is as far as the...,positive,positive,"(general experience, 0.5023371)","(food quality, 0.4219886)","(staff, 0.42071745)","(customer service, 0.35618728)","(customer loyalty, 0.33402154)",0.356187,0.420717,...,0.233368,0.334022,0.090067,0.185619,0.071343,0.103034,0.268924,0.173768,0.214261,-0.001474
2,Made a mobile order got to the speaker and che...,negative,negative,"(customer service, 0.44631246)","(order accuracy, 0.43302262)","(refund_policy, 0.38280222)","(receipt issue, 0.3628762)","(staff, 0.27365482)",0.446312,0.273655,...,0.194078,0.263592,0.146208,0.235918,0.259681,0.184906,0.142167,0.091285,0.226593,0.101869
3,My mc. Crispy chicken sandwich was ..............,positive,positive,"(customer service, 0.39361775)","(food quality, 0.3920789)","(customer loyalty, 0.29757115)","(general experience, 0.27215844)","(order accuracy, 0.26110753)",0.393618,0.216679,...,0.202433,0.297571,-0.000655,0.221744,0.139198,0.162949,0.133394,0.095757,0.143017,0.158758
4,"I repeat my order 3 times in the drive thru, a...",negative,negative,"(order accuracy, 0.43826103)","(drive-thru experience, 0.41875166)","(general experience, 0.3367521)","(food quality, 0.29102868)","(busyness, 0.2879883)",0.258598,0.262975,...,0.277522,0.119351,0.226186,0.234919,0.234002,0.115486,0.218603,0.180055,0.287988,0.026835


In [56]:
df=df[['reviewer_id',
       'store_address', 'City', 'State','longitude', 'latitude',
       'review_date', 'review', 'clean_reviews','tokenized_reviews',
       'rating', 'actual_sentiment', 'Roberta_label','Roberta_score',
       'top_1', 'top_2', 'top_3', 'top_4','top_5',
       'customer service', 'staff', 'food quality', 'operating hours',
       'order accuracy', 'location', 'general experience', 'cleanliness',
       'refund_policy', 'recommendation', 'drive-thru experience', 'security',
       'advice', 'receipt issue', 'menu', 'customer loyalty',
       'language barrier', 'waiting time', 'left without ordering',
       'product availability', 'convenience', 'affordability', 'busyness',
       'request' ]]
df.head()

Unnamed: 0,reviewer_id,store_address,City,State,longitude,latitude,review_date,review,clean_reviews,tokenized_reviews,...,menu,customer loyalty,language barrier,waiting time,left without ordering,product availability,convenience,affordability,busyness,request
0,1,13749 US-183 Hwy,Austin,TX,-97.792874,30.460718,2025-01-15,Why does it look like someone spit on my food?...,why does it look like someone spit on my food ...,look like someone spit food normal transaction...,...,0.158993,0.074503,0.112069,0.141913,0.13921,0.112632,0.095136,0.010976,0.192667,0.132173
1,2,13749 US-183 Hwy,Austin,TX,-97.792874,30.460718,2025-04-10,It'd McDonalds. It is what it is as far as the...,itd mcdonalds it is what it is as far as the f...,itd mcdonalds far food atmosphere go staff mak...,...,0.233368,0.334022,0.090067,0.185619,0.071343,0.103034,0.268924,0.173768,0.214261,-0.001474
2,3,13749 US-183 Hwy,Austin,TX,-97.792874,30.460718,2025-04-10,Made a mobile order got to the speaker and che...,made a mobile order got to the speaker and che...,made mobile order got speaker checked line mov...,...,0.194078,0.263592,0.146208,0.235918,0.259681,0.184906,0.142167,0.091285,0.226593,0.101869
3,4,13749 US-183 Hwy,Austin,TX,-97.792874,30.460718,2025-03-15,My mc. Crispy chicken sandwich was ..............,my mc crispy chicken sandwich was customer ser...,mc crispy chicken sandwich customer service qu...,...,0.202433,0.297571,-0.000655,0.221744,0.139198,0.162949,0.133394,0.095757,0.143017,0.158758
4,5,13749 US-183 Hwy,Austin,TX,-97.792874,30.460718,2025-02-15,"I repeat my order 3 times in the drive thru, a...",i repeat my order times in the drive thru and ...,repeat order times drive thru still manage mes...,...,0.277522,0.119351,0.226186,0.234919,0.234002,0.115486,0.218603,0.180055,0.287988,0.026835


In [57]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22250 entries, 0 to 22249
Data columns (total 43 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   reviewer_id            22250 non-null  int64  
 1   store_address          22250 non-null  object 
 2   City                   22250 non-null  object 
 3   State                  22250 non-null  object 
 4   longitude              22250 non-null  float64
 5   latitude               22250 non-null  float64
 6   review_date            22250 non-null  object 
 7   review                 22250 non-null  object 
 8   clean_reviews          22250 non-null  object 
 9   tokenized_reviews      22179 non-null  object 
 10  rating                 22250 non-null  int64  
 11  actual_sentiment       22250 non-null  object 
 12  Roberta_label          22250 non-null  object 
 13  Roberta_score          22250 non-null  float64
 14  top_1                  22250 non-null  object 
 15  to

In [58]:
df.to_csv("data/data_with_topics.csv", index=False)

In [59]:
# Convertir les colonnes de labels en numériques
for label in df.columns[19:43]:
    df[label] = pd.to_numeric(df[label], errors='coerce')

In [60]:
df.to_csv("data/data_with_topics.csv", index=False)

In [61]:
contains_str = df.applymap(lambda x: isinstance(x, str)).any()
contains_str[contains_str==True]

  contains_str = df.applymap(lambda x: isinstance(x, str)).any()


store_address        True
City                 True
State                True
review_date          True
review               True
clean_reviews        True
tokenized_reviews    True
actual_sentiment     True
Roberta_label        True
dtype: bool