# SETUP

In [62]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import spacy, re

# Fonctions

In [113]:
def remove_emojis(input_text):
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F700-\U0001F77F"  # alchemical symbols
        u"\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
        u"\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
        u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
        u"\U0001FA00-\U0001FA6F"  # Chess Symbols
        u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
        u"\U00002702-\U000027B0"  # Dingbats
        u"\U000024C2-\U0001F251" 
        "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', input_text)

# Créer une fonction pour nettoyer le texte
def clean_text(text):
    # Supprimer les espaces blancs
    text = text.strip()
    # Supprimer les retours à la ligne
    text = text.replace('\n', ' ')
    # Supprimer les espaces multiples
    text = re.sub(' +', ' ', text)
    # Supprimer les caractères spéciaux
    text = re.sub('[^A-Za-z0-9]+', ' ', text)
    # Supprimer les emojis
    text = remove_emojis(text)
    return text


# Traite du csv

In [114]:
nlp = spacy.load("en_core_web_sm")

# Load csv file
df = pd.read_csv('sample_data/Restaurant_reviews.csv', sep=',', encoding='utf-8')

# delete columns that are not needed
df.drop(['Reviewer', 'Metadata', 'Time', 'Pictures','7514'], axis=1, inplace=True)

# delete rows where restaurant, rating, and review are missing
df.dropna(subset=['Restaurant', 'Rating', 'Review'], inplace=True)

# Appliquer la fonction de nettoyage au texte
df['Review'] = df['Review'].apply(clean_text)

# Conversion de la colonne Rating en type entier
df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce')

# Ajouter une nouvelle colonne pour l'étiquette
df['Label'] = np.where(df['Rating'] > 3, 1, 0)

# Create a new CSV file with cleaned data
df.to_csv('sample_data/reviews_clean.csv', index=False)

# Infomartion sur le dataset

In [115]:
positive = df[df['Label'] == 1]
negative = df[df['Label'] == 0]

print('Total reviews: ', len(df))
print('Positive reviews: ', len(positive))
print('Negative reviews: ', len(negative))
    
df = pd.concat([positive, negative], ignore_index=True)
df = df.sample(frac=1).reset_index(drop=True)

# Create a new CSV file with balanced data
df.to_csv('sample_data/reviews_balanced.csv', index=False)

print('Total reviews équilibré: ', len(df))
print('Positive reviews: ', len(positive))
print('Negative reviews: ', len(negative))

Total reviews:  9955
Positive reviews:  6268
Negative reviews:  3687
Total reviews équilibré:  7374
Positive reviews:  3687
Negative reviews:  3687


In [116]:
pos_labels_train = np.ones(len(positive))
neg_labels_train = np.zeros(len(negative))

# concaténation de ces deux array pour créer la colonne de labels
Y = np.hstack((pos_labels_train, neg_labels_train))

In [117]:
nlp_data = pd.DataFrame({"text":df['Review'], "label":Y})
nlp_data

Unnamed: 0,text,label
0,Delicious food Amazing place Reminds me of my ...,1.0
1,Food is not bad but not like northeast Service...,1.0
2,Best north Indian food in Hyderabad aloo and p...,1.0
3,If you crave for some Mumbai style chaat then ...,1.0
4,Great food Esp the Know Suey Starters were goo...,1.0
...,...,...
7369,Very good at quality guys even packaging is ex...,0.0
7370,Visited on Christmas eve I must say overall am...,0.0
7371,worst taste buffet is not at all good service ...,0.0
7372,The worst place I went there with a frnd today...,0.0


# Utilisation de la lib nlp

# Connexion et insertion en base