## Import the dataframe "french_to_english_product.csv"

In [None]:
# Install dependencies as needed:
# pip install kagglehub[pandas-datasets]
from pathlib import Path

import kagglehub
from kagglehub import KaggleDatasetAdapter

# Name of the file to load from the Kaggle dataset
file_path = "french_to_english_product.csv"

# `kagglehub.load_dataset` is deprecated (this call emitted a warning when the
# notebook was run); `kagglehub.dataset_load` is its replacement and takes the
# same arguments. Fall back to the old name on kagglehub versions that predate it.
_load_kaggle_dataset = getattr(kagglehub, "dataset_load", None) or kagglehub.load_dataset

# Load the latest version as a pandas DataFrame
fr_en_kaggle_dataset = _load_kaggle_dataset(
  KaggleDatasetAdapter.PANDAS,
  "dargolex/french-reviews-on-amazon-items-and-en-translation",
  file_path,
  # Provide any additional arguments like
  # sql_query or pandas_kwargs. See the
  # documentation for more information:
  # https://github.com/Kaggle/kagglehub/blob/main/README.md#kaggledatasetadapterpandas
)

# Persist a local copy so other notebooks can reuse it
DATA_PATH = Path("../data/raw/french_to_english_product.csv")
DATA_PATH.parent.mkdir(parents=True, exist_ok=True)
fr_en_kaggle_dataset.to_csv(DATA_PATH, index=False)
print(f"Saved dataset to: {DATA_PATH.resolve()}")

# Print the label on its own line so the DataFrame header stays aligned with its columns
print("First 5 records:")
print(fr_en_kaggle_dataset.head())

  fr_en_kaggle_dataset = kagglehub.load_dataset(


Downloading from https://www.kaggle.com/api/v1/datasets/download/dargolex/french-reviews-on-amazon-items-and-en-translation?dataset_version_number=1&file_name=french_to_english_product.csv...


100%|██████████| 23.7M/23.7M [00:00<00:00, 47.6MB/s]

Extracting zip of french_to_english_product.csv...





First 5 records:    rating                                             review  \
0       1  A déconseiller - Article n'a fonctionné qu'une...   
1       1  Si vous voulez être déçu achetez le produit ! ...   
2       1  Écran de mauvaise qualité, car il s'use en peu...   
3       1  Cet engin ne sert à rien les sons sont pourris...   
4       1  Très beau produit mais la grue n'a pas fonctio...   

                                         translation  
0  A discouragement - article Na worked that once...  
1  If you want to be disappointed buy the product...  
2  Screen of poor quality because it suses in a s...  
3  This machine does not serve the sounds are rot...  
4  Very nice product but the crane did not work v...  


In [None]:
# Size of the full dataset (one row per review)
print("Total records:", fr_en_kaggle_dataset.shape[0])

# Reviews per value of the 'rating' column, ordered by rating
print(
    "\nCount of reviews per rating:",
    fr_en_kaggle_dataset['rating'].value_counts().sort_index(),
    sep="\n",
)


Total records: 200000

Count of reviews per rating:
rating
1    40000
2    40000
3    40000
4    40000
5    40000
Name: count, dtype: int64


## Reduce the dataset size

In [None]:
import pandas as pd

# Rows to draw for each rating. Insertion order is the order in which the
# per-rating samples are concatenated (1, 2, 4, 5, then 3), which the seeded
# shuffle below depends on.
sample_size_by_rating = {1: 5000, 2: 5000, 4: 5000, 5: 5000, 3: 10000}

# Seeded sample of the requested size from each rating group
samples = [
    fr_en_kaggle_dataset[fr_en_kaggle_dataset['rating'] == star].sample(n=n_rows, random_state=42)
    for star, n_rows in sample_size_by_rating.items()
]

# Stack the groups, shuffle the rows with a fixed seed, and renumber the index
fr_en_dataset = pd.concat(samples).sample(frac=1, random_state=42).reset_index(drop=True)


In [None]:
# Display the sampled, shuffled frame (the notebook renders a truncated view: first and last rows only)
fr_en_dataset

Unnamed: 0,rating,review,translation
0,1,ne peux pas l'enfiler et seule personne pour aide,can not lenfiler and only person for help
1,3,La protection ne couvre pas entièrement l'écra...,Protection does not cover entirely on the phon...
2,3,article clinquant en plastique alors que je pe...,Flying plastic article As I thought he was in ...
3,3,A première vue ce produit semble correspondre ...,At first glance this product seems to match my...
4,1,"Je suis très déçu du produit, ce n'est pas un ...","I am very disappointed with the product, it is..."
...,...,...,...
29995,3,J'avais cru qu'il s'agissait de la housse + la...,I thought he sacked with the cover + the kettl...
29996,2,J'ai acheté ce produit sur la base des comment...,I bought this product based on virtually all u...
29997,1,Je n’ai jamais reçu la table a repasser. Je n’...,I have never received the table to iron.I only...
29998,5,Très bien je l'utilise pour mes spray cheveux ...,Very good I use for my homemade hair spray


### Convert ratings (1–5) to sentiment classes (0 = negative, 1 = neutral, 2 = positive)

In [None]:
# Target: the 'rating' column. Features: every remaining column.
y = fr_en_dataset['rating']
X = fr_en_dataset.drop("rating", axis="columns")

def convert_rating_to_sentiment(rating):
    """Map a rating to a sentiment class.

    Returns 0 (negative) for ratings 1 and 2, 1 (neutral) for rating 3,
    and 2 (positive) for any other value.
    """
    if rating == 3:
        return 1  # Neutral
    # Negative for 1 or 2; everything else counts as positive
    return 0 if rating in (1, 2) else 2

# Map every rating in y to its sentiment class (0 = negative, 1 = neutral, 2 = positive)
new_y = y.apply(convert_rating_to_sentiment)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer


# First cut: 70% training data, 30% held out for the validation and test sets.
# Stratifying keeps the class proportions of new_y in both parts.
X_train, X_test_val, y_train, y_test_val = train_test_split(
    X,
    new_y,
    test_size=0.30,
    random_state=42,
    stratify=new_y,
)

# Second cut: halve the held-out part into validation (15%) and test (15%)
X_val, X_test, y_val, y_test = train_test_split(
    X_test_val,
    y_test_val,
    test_size=0.50,
    random_state=42,
    stratify=y_test_val,
)


## Data Preprocessing
*   Removing stop words (removes common words, e.g., the, is, in, et, le, la)
*   Removing punctuation and special characters (eliminates noise from the text)
*   Lemmatization (converts words to their base form)
*   Tokenization (splits text into words or phrases)
*   Lowercase conversion (standardizes text by converting everything to lowercase)

In [None]:
!python -m spacy download fr_core_news_sm

Collecting fr-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/fr_core_news_sm-3.8.0/fr_core_news_sm-3.8.0-py3-none-any.whl (16.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m16.3/16.3 MB[0m [31m82.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fr-core-news-sm
Successfully installed fr-core-news-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('fr_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report


import re
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources (stop-word lists and the punkt tokenizer data)
import nltk
for nltk_resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)


# Stop-word sets used to filter tokens, one per language
stop_words_en, stop_words_fr = (
    set(stopwords.words(language)) for language in ('english', 'french')
)

# spaCy pipelines used for lemmatization; the parser and NER components are disabled
_disabled_components = ["parser", "ner"]
nlp_en = spacy.load("en_core_web_sm", disable=_disabled_components)
nlp_fr = spacy.load("fr_core_news_sm", disable=_disabled_components)

# Function to clean and preprocess text
def preprocess_text(text, lang):
    """Normalize one review into a single space-separated string of lemmas.

    Steps: lowercase the text, replace every character that is not a Latin
    letter or whitespace with a space, run the spaCy pipeline for ``lang``,
    and keep the lemma of each token that is neither a stop word nor whitespace.

    Args:
        text: Raw review text. Non-string values (e.g. None or NaN for a
            missing review) are treated as an empty document.
        lang: "english" selects the English pipeline and stop words; any
            other value selects the French ones.

    Returns:
        The processed text as one string (BoW and TF-IDF vectorizers expect
        strings, not token lists).
    """
    # A missing review has no .lower(); treat it as an empty document
    if not isinstance(text, str):
        return ""

    # Lowercase
    text = text.lower()

    # Replace punctuation & special characters with a space instead of deleting
    # them, so elided forms split apart ("l'écran" -> "l écran") rather than
    # fusing into a non-word ("lécran"). "œ"/"Œ" lie outside the À-ÿ range and
    # are listed explicitly so words such as "cœur" keep all their letters.
    text = re.sub(r'[^a-zA-ZÀ-ÿœŒ\s]', ' ', text)

    if lang == "english":
        nlp, stop_words = nlp_en, stop_words_en
    else:
        nlp, stop_words = nlp_fr, stop_words_fr

    # Skip whitespace tokens: runs of spaces left by the substitution above
    # would otherwise be emitted as "lemmas".
    tokens = [
        token.lemma_
        for token in nlp(text)  # Process entire text
        if token.text not in stop_words and not token.is_space
    ]

    return " ".join(tokens)  # Return processed text


# Preprocess the training split: French originals and their English translations
preprocessed_train_fr = [preprocess_text(text, "french") for text in X_train['review']]
preprocessed_train_en = [preprocess_text(text, "english") for text in X_train['translation']]

# Show only the first few documents; printing the full lists floods the notebook output
print("French Processed:", preprocessed_train_fr[:5])
print("English Processed:", preprocessed_train_en[:5])


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.




In [None]:
# Apply the same cleaning to the held-out splits, one language column at a time.

# Validation split
preprocessed_val_fr = [preprocess_text(review, "french") for review in X_val['review']]
preprocessed_val_en = [preprocess_text(translation, "english") for translation in X_val['translation']]

# Test split
preprocessed_test_fr = [preprocess_text(review, "french") for review in X_test['review']]
preprocessed_test_en = [preprocess_text(translation, "english") for translation in X_test['translation']]


## Data Vectorization using TF-IDF and Bag of Words

### 1) BoW

In [None]:
# One vectorizer per language. Refitting a single shared instance on the French
# corpus discards the English vocabulary, so the English features could no
# longer be reproduced for new text.
en_bag_of_word_vectorizer = CountVectorizer(strip_accents="unicode")
fr_bag_of_word_vectorizer = CountVectorizer(strip_accents="unicode")

# The vocabulary is learned from the training split only; test and validation reuse it
en_X_train_bag_of_word = en_bag_of_word_vectorizer.fit_transform(preprocessed_train_en)
en_X_test_bag_of_word = en_bag_of_word_vectorizer.transform(preprocessed_test_en)
en_X_val_bag_of_word = en_bag_of_word_vectorizer.transform(preprocessed_val_en)

fr_X_train_bag_of_word = fr_bag_of_word_vectorizer.fit_transform(preprocessed_train_fr)
fr_X_test_bag_of_word = fr_bag_of_word_vectorizer.transform(preprocessed_test_fr)
fr_X_val_bag_of_word = fr_bag_of_word_vectorizer.transform(preprocessed_val_fr)

# Kept for backward compatibility: this name used to end up bound to the
# French-fitted vectorizer after the cell ran.
bag_of_word_vectorizer = fr_bag_of_word_vectorizer

### 2) TF-IDF

In [None]:
# One vectorizer per language. Refitting a single shared instance on the French
# corpus discards the English vocabulary and IDF weights, so the English
# features could no longer be reproduced for new text.
en_tf_idf_vectorizer = TfidfVectorizer(strip_accents="unicode")
fr_tf_idf_vectorizer = TfidfVectorizer(strip_accents="unicode")

# Vocabulary and IDF weights are learned from the training split only; test and validation reuse them
en_X_train_tf_idf = en_tf_idf_vectorizer.fit_transform(preprocessed_train_en)
en_X_test_tf_idf = en_tf_idf_vectorizer.transform(preprocessed_test_en)
en_X_val_tf_idf = en_tf_idf_vectorizer.transform(preprocessed_val_en)

fr_X_train_tf_idf = fr_tf_idf_vectorizer.fit_transform(preprocessed_train_fr)
fr_X_test_tf_idf = fr_tf_idf_vectorizer.transform(preprocessed_test_fr)
fr_X_val_tf_idf = fr_tf_idf_vectorizer.transform(preprocessed_val_fr)

# Kept for backward compatibility: this name used to end up bound to the
# French-fitted vectorizer after the cell ran.
tf_idf_vectorizer = fr_tf_idf_vectorizer

### 3) Word2Vec

Word2Vec is a word embedding algorithm that transforms words into numerical vectors, offering a more context-aware representation than traditional methods like TF-IDF and Bag-of-Words (BoW). To evaluate its impact on our data and on the performance of various machine learning models, we applied pretrained word embeddings to the English translations of the reviews. Note that the vectors loaded in the code below (`fasttext-wiki-news-subwords-300`) are pretrained fastText vectors; they are used in the same way Word2Vec vectors would be, by averaging the word vectors of each review. Applying the same approach to the original French reviews was not feasible because of the large size of the pretrained French model, which requires significant time and bandwidth to download. Therefore, we limited the embedding-based vectorization to the English dataset.



In [None]:
!pip install gensim

Collecting gensim
  Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (8.1 kB)
Collecting numpy<2.0,>=1.18.5 (from gensim)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scipy<1.14.0,>=1.7.0 (from gensim)
  Downloading scipy-1.13.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.6/60.6 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
Downloading gensim-4.3.3-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m26.7/26.7 MB[0m [31m65.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.3 MB)
[2K   [90m━━━━━━━━━━━

In [None]:
!pip install --upgrade numpy
!pip install --upgrade gensim

In [None]:
import gensim
import gensim.downloader as api
from gensim.models import KeyedVectors
import numpy as np

# Load pretrained word vectors for the English reviews through the gensim downloader.
# NOTE(review): the model name indicates fastText vectors rather than Word2Vec; no French model is loaded here.

en_word2vec_vectors = api.load('fasttext-wiki-news-subwords-300')


# Sentence embedding = mean of the vectors of its known words
def get_sentence_vector(sentence, model, vector_size):
    """Average the vectors of the whitespace-separated words found in ``model``.

    Args:
        sentence: Text whose words are separated by whitespace.
        model: Lookup supporting ``word in model`` and ``model[word]``.
        vector_size: Length of the zero vector returned when no word is found.

    Returns:
        The element-wise mean of the found word vectors, or a zero vector of
        length ``vector_size`` when none of the words is in ``model``.
    """
    known_vectors = [model[token] for token in sentence.split() if token in model]

    if not known_vectors:
        return np.zeros(vector_size)
    return np.mean(known_vectors, axis=0)

# Length passed to get_sentence_vector for the zero-vector fallback
en_vector_size = 300
# fr_vector_size = 300

# English: one averaged vector per preprocessed document, stacked into one matrix per split
en_X_train_w2v, en_X_test_w2v, en_X_val_w2v = (
    np.array([get_sentence_vector(doc, en_word2vec_vectors, en_vector_size) for doc in docs])
    for docs in (preprocessed_train_en, preprocessed_test_en, preprocessed_val_en)
)

ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [None]:
# The Naive Bayes models used later expect non-negative inputs, so the embedding
# values are rescaled with a MinMaxScaler. The scaler is fitted on the training
# matrix only and the same transform is then applied to all three splits.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler().fit(en_X_train_w2v)
en_X_train_w2v, en_X_val_w2v, en_X_test_w2v = (
    scaler.transform(split) for split in (en_X_train_w2v, en_X_val_w2v, en_X_test_w2v)
)

#### French Word2Vec model code, shown to illustrate how it could be applied (we did not use it)
[Reference](https://fasttext.cc/docs/en/crawl-vectors.html)

In [None]:
# !git clone https://github.com/facebookresearch/fastText.git
# !cd fastText
# !sudo python setup.py install

In [None]:
# !pip install fasttext

In [None]:
# import fasttext.util
# fasttext.util.download_model('fr', if_exists='ignore')  # French
# ft = fasttext.load_model('cc.fr.300.bin')

In [None]:
# fr_word2vec_vectors = KeyedVectors.load_word2vec_format("cc.fr.300.vec", encoding='utf-8')

# # French
# fr_X_train_w2v = np.array([get_sentence_vector(sent, fr_word2vec_vectors, fr_vector_size) for sent in preprocessed_train_fr])
# fr_X_test_w2v = np.array([get_sentence_vector(sent, fr_word2vec_vectors, fr_vector_size) for sent in preprocessed_test_fr])
# fr_X_val_w2v = np.array([get_sentence_vector(sent, fr_word2vec_vectors, fr_vector_size) for sent in preprocessed_val_fr])

In [None]:
# from sklearn.model_selection import GridSearchCV
# C=[0.05,0.1,0.2,0.3,0.25,0.4,0.5,0.6,0.7,0.8,0.9,1]
# gamma=[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1.0]
# kernel=['rbf','linear']
# hyper={'kernel':kernel,'C':C,'gamma':gamma}
# gd=GridSearchCV(estimator=svm.SVC(),param_grid=hyper,verbose=True)
# gd.fit(X,Y)
# print(gd.best_score_)
# print(gd.best_estimator_)

### Lists collecting the accuracies of all models for French and English reviews

In [None]:
# Accumulators for per-model result records: English, French, and combined
en_results, fr_results, fr_en_results = [], [], []

## Baseline Model: KNN + TF-IDF

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Baseline: 5-nearest-neighbour classifiers on the TF-IDF features, one per
# language, both scored on the validation split.

# Fit one classifier per language
en_tf_idf_knn = KNeighborsClassifier(n_neighbors=5).fit(en_X_train_tf_idf, y_train)
fr_tf_idf_knn = KNeighborsClassifier(n_neighbors=5).fit(fr_X_train_tf_idf, y_train)

# Validation-set predictions
en_y_val_pred_tf_idf_knn = en_tf_idf_knn.predict(en_X_val_tf_idf)
fr_y_val_pred_tf_idf_knn = fr_tf_idf_knn.predict(fr_X_val_tf_idf)

# Validation accuracy
en_accuracy_tf_idf_knn = accuracy_score(y_val, en_y_val_pred_tf_idf_knn)
fr_accuracy_tf_idf_knn = accuracy_score(y_val, fr_y_val_pred_tf_idf_knn)

# Side-by-side accuracy table
tf_idf_knn_results_df = pd.DataFrame(
    {
        "Model": ["KNN-English", "KNN-French"],
        "Accuracy (TF-IDF)": [en_accuracy_tf_idf_knn, fr_accuracy_tf_idf_knn],
    }
)
print("\nAccuracy Comparison Table:")
print(tf_idf_knn_results_df)

# Record each accuracy in the list that collects results for its language
en_results.append(
    {"Model": "KNN", "Word embedding": "TF-IDF", "language": "En", "Accuracy": en_accuracy_tf_idf_knn}
)
fr_results.append(
    {"Model": "KNN", "Word embedding": "TF-IDF", "language": "Fr", "Accuracy": fr_accuracy_tf_idf_knn}
)

# ------------------------
# Per-class precision / recall / F1 on the validation split
print("\nEvaluation for English KNN:", classification_report(y_val, en_y_val_pred_tf_idf_knn), sep="\n")

print("\nEvaluation for French KNN:", classification_report(y_val, fr_y_val_pred_tf_idf_knn), sep="\n")


Accuracy Comparison Table:
         Model  Accuracy (TF-IDF)
0  KNN-English           0.497556
1   KNN-French           0.522222

Evaluation for English KNN:
              precision    recall  f1-score   support

           0       0.51      0.62      0.56      1500
           1       0.41      0.33      0.37      1500
           2       0.55      0.53      0.54      1500

    accuracy                           0.50      4500
   macro avg       0.49      0.50      0.49      4500
weighted avg       0.49      0.50      0.49      4500


Evaluation for French KNN:
              precision    recall  f1-score   support

           0       0.54      0.65      0.59      1500
           1       0.42      0.32      0.36      1500
           2       0.58      0.60      0.59      1500

    accuracy                           0.52      4500
   macro avg       0.51      0.52      0.51      4500
weighted avg       0.51      0.52      0.51      4500



## Naive Bayes Models
Sklearn provides 5 types of Naive Bayes:
- GaussianNB
- CategoricalNB
- BernoulliNB
- MultinomialNB
- ComplementNB

Based on the table below, and because our data is balanced, we apply MultinomialNB and ComplementNB to our data and choose the one with the better validation accuracy.


| **Classifier**     | **Features**                      | **Use Cases**                                           | **Details**                                                       | **Text Data** |
|--------------------|-----------------------------------|---------------------------------------------------------|-------------------------------------------------------------------|---------------|
| **GaussianNB**     | continuous                        | Sensor data, medical measurements                       |                                                                   | No            |
| **CategoricalNB**  | categorical (discrete variables)  |                                                         |                                                                   | No            |
| **BernoulliNB**    | binary                            | Text classification with binary BoW, spam detection     | Models only word presence/absence, not word frequency.           | Yes           |
| **MultinomialNB**  | multinomial (discrete variables)  | Sentiment analysis, spam detection                      | Considers word frequency, unlike BernoulliNB.                    | Yes           |
| **ComplementNB**   | multinomial (discrete variables)  | Sentiment analysis, spam detection                      | Similar to MultinomialNB, designed for imbalanced datasets       | Yes           |


In [None]:
from sklearn.naive_bayes import ComplementNB, MultinomialNB


### 1) Naive Bayes with BoW for English texts

In [None]:
# Fit both Naive Bayes variants on the English BoW features and score them on the validation set.

# ComplementNB
en_bag_of_word_complementNB = ComplementNB()
en_bag_of_word_complementNB.fit(en_X_train_bag_of_word, y_train)

# Predict on the validation set
en_y_val_pred_bow_complementNB = en_bag_of_word_complementNB.predict(en_X_val_bag_of_word)

# Compute classification accuracy
en_accuracy_bow_complementNB = accuracy_score(y_val, en_y_val_pred_bow_complementNB)

# MultinomialNB
en_bag_of_word_multinomialNB = MultinomialNB()
en_bag_of_word_multinomialNB.fit(en_X_train_bag_of_word, y_train)

# Predict on the validation set
en_y_val_pred_bow_multinomialNB = en_bag_of_word_multinomialNB.predict(en_X_val_bag_of_word)

# Compute classification accuracy
en_accuracy_bow_multinomialNB = accuracy_score(y_val, en_y_val_pred_bow_multinomialNB)


# Create accuracy table
en_bow_NB_results_df = pd.DataFrame({
    "Model": ["ComplementNB", "MultinomialNB"],
    "Accuracy (BoW - English)": [en_accuracy_bow_complementNB, en_accuracy_bow_multinomialNB]
})

print("\n Accuracy Comparison Table:")
print(en_bow_NB_results_df)

# Add model accuracy data to list of all models.
# The keys use the same capitalised names ("Model", "Word embedding", "Accuracy")
# as the KNN and TF-IDF entries, so all records share one set of columns.
en_results.append({"Model": "ComplementNB", "Word embedding": "BoW", "language": "En", "Accuracy": en_accuracy_bow_complementNB})
en_results.append({"Model": "MultinomialNB", "Word embedding": "BoW", "language": "En", "Accuracy": en_accuracy_bow_multinomialNB})

# ------------------------
# Select best model based on accuracy (ComplementNB is kept on a tie)
if en_accuracy_bow_multinomialNB > en_accuracy_bow_complementNB:
    en_bow_selected_name = "MultinomialNB"
    en_bow_selected_pred = en_y_val_pred_bow_multinomialNB
else:
    en_bow_selected_name = "ComplementNB"
    en_bow_selected_pred = en_y_val_pred_bow_complementNB

print(f"\n The {en_bow_selected_name} model is selected based on better accuracy.")
print(classification_report(y_val, en_bow_selected_pred))

# Compare the true labels with the predictions of the model that was actually
# selected (previously this always used the MultinomialNB predictions).
en_bow_comparison_df = pd.DataFrame({
    "Actual Rating": y_val,
    "Predicted Rating": en_bow_selected_pred
})

# Show the first few rows
print("\n Actual vs. Predicted Ratings:")
print(en_bow_comparison_df.head(10))



 Accuracy Comparison Table:
           Model  Accuracy (BoW - English)
0   ComplementNB                  0.583111
1  MultinomialNB                  0.588222

 The MultinomialNB model is selected based on better accuracy.
              precision    recall  f1-score   support

           0       0.63      0.65      0.64      1500
           1       0.48      0.49      0.49      1500
           2       0.66      0.62      0.64      1500

    accuracy                           0.59      4500
   macro avg       0.59      0.59      0.59      4500
weighted avg       0.59      0.59      0.59      4500


 Actual vs. Predicted Ratings:
       Actual Rating  Predicted Rating
24058              1                 0
12620              1                 0
8167               2                 2
23570              0                 0
5548               2                 2
23723              2                 1
21400              1                 0
19004              1                 1
18255         

### 2) Naive Bayes with BoW for French texts

In [None]:
# Fit both Naive Bayes variants on the French BoW features and score them on the validation set.

# ComplementNB
fr_bag_of_word_complementNB = ComplementNB()
fr_bag_of_word_complementNB.fit(fr_X_train_bag_of_word, y_train)

# Predict on the validation set
fr_y_val_pred_bow_complementNB = fr_bag_of_word_complementNB.predict(fr_X_val_bag_of_word)

# Compute classification accuracy
fr_accuracy_bow_complementNB = accuracy_score(y_val, fr_y_val_pred_bow_complementNB)

# MultinomialNB
fr_bag_of_word_multinomialNB = MultinomialNB()
fr_bag_of_word_multinomialNB.fit(fr_X_train_bag_of_word, y_train)

# Predict on the validation set
fr_y_val_pred_bow_multinomialNB = fr_bag_of_word_multinomialNB.predict(fr_X_val_bag_of_word)

# Compute classification accuracy
fr_accuracy_bow_multinomialNB = accuracy_score(y_val, fr_y_val_pred_bow_multinomialNB)


# Create accuracy table
fr_bow_results_df = pd.DataFrame({
    "Model": ["ComplementNB", "MultinomialNB"],
    "Accuracy (BoW - French)": [fr_accuracy_bow_complementNB, fr_accuracy_bow_multinomialNB]
})

print("\n Accuracy Comparison Table:")
print(fr_bow_results_df)

# Add model accuracy data to list of all models.
# The keys use the same capitalised names ("Model", "Word embedding", "Accuracy")
# as the KNN and TF-IDF entries, so all records share one set of columns.
fr_results.append({"Model": "ComplementNB", "Word embedding": "BoW", "language": "Fr", "Accuracy": fr_accuracy_bow_complementNB})
fr_results.append({"Model": "MultinomialNB", "Word embedding": "BoW", "language": "Fr", "Accuracy": fr_accuracy_bow_multinomialNB})

# ------------------------
# Select best model based on accuracy (ComplementNB is kept on a tie)
if fr_accuracy_bow_multinomialNB > fr_accuracy_bow_complementNB:
    fr_bow_selected_name = "MultinomialNB"
    fr_bow_selected_pred = fr_y_val_pred_bow_multinomialNB
else:
    fr_bow_selected_name = "ComplementNB"
    fr_bow_selected_pred = fr_y_val_pred_bow_complementNB

print(f"\n The {fr_bow_selected_name} model is selected based on better accuracy.")
print(classification_report(y_val, fr_bow_selected_pred))

# Compare the true labels with the predictions of the model that was actually
# selected (previously this always used the MultinomialNB predictions).
fr_bow_comparison_df = pd.DataFrame({
    "Actual Rating": y_val,
    "Predicted Rating": fr_bow_selected_pred
})

# Show the first few rows
print("\n Actual vs. Predicted Ratings:")
print(fr_bow_comparison_df.head(10))


 Accuracy Comparison Table:
           Model  Accuracy (BoW - French)
0   ComplementNB                 0.609333
1  MultinomialNB                 0.616667

 The MultinomialNB model is selected based on better accuracy.
              precision    recall  f1-score   support

           0       0.66      0.65      0.66      1500
           1       0.50      0.49      0.49      1500
           2       0.68      0.71      0.70      1500

    accuracy                           0.62      4500
   macro avg       0.61      0.62      0.62      4500
weighted avg       0.61      0.62      0.62      4500


 Actual vs. Predicted Ratings:
       Actual Rating  Predicted Rating
24058              1                 0
12620              1                 0
8167               2                 2
23570              0                 1
5548               2                 2
23723              2                 2
21400              1                 0
19004              1                 0
18255            

### 3) Naive Bayes with TF-IDF for English texts

In [None]:
# Fit both Naive Bayes variants on the English TF-IDF features and score them on the validation set.

# ComplementNB
en_tf_idf_complementNB = ComplementNB()
en_tf_idf_complementNB.fit(en_X_train_tf_idf, y_train)

# Predict on the validation set
en_y_val_pred_tf_idf_complementNB = en_tf_idf_complementNB.predict(en_X_val_tf_idf)

# Compute classification accuracy
en_accuracy_tf_idf_complementNB = accuracy_score(y_val, en_y_val_pred_tf_idf_complementNB)

# MultinomialNB
en_tf_idf_multinomialNB = MultinomialNB()
en_tf_idf_multinomialNB.fit(en_X_train_tf_idf, y_train)

# Predict on the validation set
en_y_val_pred_tf_idf_multinomialNB = en_tf_idf_multinomialNB.predict(en_X_val_tf_idf)

# Compute classification accuracy
en_accuracy_tf_idf_multinomialNB = accuracy_score(y_val, en_y_val_pred_tf_idf_multinomialNB)


# Create accuracy table
en_tf_idf_results_df = pd.DataFrame({
    "Model": ["ComplementNB", "MultinomialNB"],
    "Accuracy (TF-IDF - English)": [en_accuracy_tf_idf_complementNB, en_accuracy_tf_idf_multinomialNB]
})

print("\n Accuracy Comparison Table:")
print(en_tf_idf_results_df)

# Add model accuracy data to list of all models
en_results.append({"Model": "ComplementNB", "Word embedding": "TF-IDF", "language": "En", "Accuracy": en_accuracy_tf_idf_complementNB})
en_results.append({"Model": "MultinomialNB", "Word embedding": "TF-IDF", "language": "En", "Accuracy": en_accuracy_tf_idf_multinomialNB})


# ------------------------
# Select best model based on accuracy (ComplementNB is kept on a tie)
if en_accuracy_tf_idf_multinomialNB > en_accuracy_tf_idf_complementNB:
    en_tf_idf_selected_name = "MultinomialNB"
    en_tf_idf_selected_pred = en_y_val_pred_tf_idf_multinomialNB
else:
    en_tf_idf_selected_name = "ComplementNB"
    en_tf_idf_selected_pred = en_y_val_pred_tf_idf_complementNB

print(f"\n The {en_tf_idf_selected_name} model is selected based on better accuracy.")
print(classification_report(y_val, en_tf_idf_selected_pred))

# Compare the true labels with the predictions of the model that was actually
# selected (previously this always used the MultinomialNB predictions).
en_tf_idf_comparison_df = pd.DataFrame({
    "Actual Rating": y_val,
    "Predicted Rating": en_tf_idf_selected_pred
})

# Show the first few rows
print("\n Actual vs. Predicted Ratings:")
print(en_tf_idf_comparison_df.head(10))



 Accuracy Comparison Table:
           Model  Accuracy (TF-IDF - English)
0   ComplementNB                     0.578000
1  MultinomialNB                     0.580667

 The MultinomialNB model is selected based on better accuracy.
              precision    recall  f1-score   support

           0       0.63      0.65      0.64      1500
           1       0.47      0.50      0.48      1500
           2       0.66      0.58      0.62      1500

    accuracy                           0.58      4500
   macro avg       0.59      0.58      0.58      4500
weighted avg       0.59      0.58      0.58      4500


 Actual vs. Predicted Ratings:
       Actual Rating  Predicted Rating
24058              1                 0
12620              1                 0
8167               2                 2
23570              0                 0
5548               2                 1
23723              2                 1
21400              1                 0
19004              1                 1
18255

### 4) Naive Bayes with TF-IDF for French texts

In [None]:
# Fit both Naive Bayes variants on the French TF-IDF features and score them on the validation set.

# ComplementNB
fr_tf_idf_complementNB = ComplementNB()
fr_tf_idf_complementNB.fit(fr_X_train_tf_idf, y_train)

# Predict on the validation set
fr_y_val_pred_tf_idf_complementNB = fr_tf_idf_complementNB.predict(fr_X_val_tf_idf)

# Compute classification accuracy
fr_accuracy_tf_idf_complementNB = accuracy_score(y_val, fr_y_val_pred_tf_idf_complementNB)

# MultinomialNB
fr_tf_idf_multinomialNB = MultinomialNB()
fr_tf_idf_multinomialNB.fit(fr_X_train_tf_idf, y_train)

# Predict on the validation set
fr_y_val_pred_tf_idf_multinomialNB = fr_tf_idf_multinomialNB.predict(fr_X_val_tf_idf)

# Compute classification accuracy
fr_accuracy_tf_idf_multinomialNB = accuracy_score(y_val, fr_y_val_pred_tf_idf_multinomialNB)


# Create accuracy table
fr_tf_idf_results_df = pd.DataFrame({
    "Model": ["ComplementNB", "MultinomialNB"],
    "Accuracy (TF-IDF - French)": [fr_accuracy_tf_idf_complementNB, fr_accuracy_tf_idf_multinomialNB]
})

print("\n Accuracy Comparison Table:")
print(fr_tf_idf_results_df)

# Add model accuracy data to list of all models.
# These are French results, so they go into fr_results (they were previously
# appended to en_results, mixing "Fr" records into the English list).
fr_results.append({"Model": "ComplementNB", "Word embedding": "TF-IDF", "language": "Fr", "Accuracy": fr_accuracy_tf_idf_complementNB})
fr_results.append({"Model": "MultinomialNB", "Word embedding": "TF-IDF", "language": "Fr", "Accuracy": fr_accuracy_tf_idf_multinomialNB})

# ------------------------
# Select best model based on accuracy (ComplementNB is kept on a tie)
if fr_accuracy_tf_idf_multinomialNB > fr_accuracy_tf_idf_complementNB:
    fr_tf_idf_selected_name = "MultinomialNB"
    fr_tf_idf_selected_pred = fr_y_val_pred_tf_idf_multinomialNB
else:
    fr_tf_idf_selected_name = "ComplementNB"
    fr_tf_idf_selected_pred = fr_y_val_pred_tf_idf_complementNB

print(f"\n The {fr_tf_idf_selected_name} model is selected based on better accuracy.")
print(classification_report(y_val, fr_tf_idf_selected_pred))

# Compare the true labels with the predictions of the model that was actually
# selected (previously this always used the MultinomialNB predictions).
fr_tf_idf_comparison_df = pd.DataFrame({
    "Actual Rating": y_val,
    "Predicted Rating": fr_tf_idf_selected_pred
})

# Show the first few rows
print("\n Actual vs. Predicted Ratings:")
print(fr_tf_idf_comparison_df.head(10))


 Accuracy Comparison Table:
           Model  Accuracy (TF-IDF - French)
0   ComplementNB                    0.607556
1  MultinomialNB                    0.616444

 The MultinomialNB model is selected based on better accuracy.
              precision    recall  f1-score   support

           0       0.66      0.66      0.66      1500
           1       0.50      0.52      0.51      1500
           2       0.70      0.67      0.68      1500

    accuracy                           0.62      4500
   macro avg       0.62      0.62      0.62      4500
weighted avg       0.62      0.62      0.62      4500


 Actual vs. Predicted Ratings:
       Actual Rating  Predicted Rating
24058              1                 0
12620              1                 0
8167               2                 2
23570              0                 1
5548               2                 2
23723              2                 2
21400              1                 0
19004              1                 1
18255   

### 5) Naive Bayes with Word2Vec for English texts

In [None]:
# Fit both Naive Bayes variants on the scaled English embedding features and score them on the validation set.

# ComplementNB
en_w2v_complementNB = ComplementNB()
en_w2v_complementNB.fit(en_X_train_w2v, y_train)

# Predict on the validation set
en_y_val_pred_w2v_complementNB = en_w2v_complementNB.predict(en_X_val_w2v)

# Compute classification accuracy
en_accuracy_w2v_complementNB = accuracy_score(y_val, en_y_val_pred_w2v_complementNB)

# MultinomialNB
en_w2v_multinomialNB = MultinomialNB()
en_w2v_multinomialNB.fit(en_X_train_w2v, y_train)

# Predict on the validation set
en_y_val_pred_w2v_multinomialNB = en_w2v_multinomialNB.predict(en_X_val_w2v)

# Compute classification accuracy
en_accuracy_w2v_multinomialNB = accuracy_score(y_val, en_y_val_pred_w2v_multinomialNB)


# Create accuracy table
en_w2v_results_df = pd.DataFrame({
    "Model": ["ComplementNB", "MultinomialNB"],
    "Accuracy (W2V - English)": [en_accuracy_w2v_complementNB, en_accuracy_w2v_multinomialNB]
})

print("\n Accuracy Comparison Table:")
print(en_w2v_results_df)

# Add model accuracy data to list of all models
en_results.append({"Model": "ComplementNB", "Word embedding": "W2V", "language": "En", "Accuracy": en_accuracy_w2v_complementNB})
en_results.append({"Model": "MultinomialNB", "Word embedding": "W2V", "language": "En", "Accuracy": en_accuracy_w2v_multinomialNB})


# ------------------------
# Select best model based on accuracy (ComplementNB is kept on a tie)
if en_accuracy_w2v_multinomialNB > en_accuracy_w2v_complementNB:
    en_w2v_selected_name = "MultinomialNB"
    en_w2v_selected_pred = en_y_val_pred_w2v_multinomialNB
else:
    en_w2v_selected_name = "ComplementNB"
    en_w2v_selected_pred = en_y_val_pred_w2v_complementNB

print(f"\n The {en_w2v_selected_name} model is selected based on better accuracy.")
print(classification_report(y_val, en_w2v_selected_pred))

# Compare the true labels with the predictions of the model that was actually
# selected (previously this always used the MultinomialNB predictions).
en_w2v_comparison_df = pd.DataFrame({
    "Actual Rating": y_val,
    "Predicted Rating": en_w2v_selected_pred
})

# Show the first few rows
print("\n Actual vs. Predicted Ratings:")
print(en_w2v_comparison_df.head(10))



 Accuracy Comparison Table:
           Model  Accuracy (W2V - English)
0   ComplementNB                  0.497111
1  MultinomialNB                  0.507111

 The MultinomialNB model is selected based on better accuracy.
              precision    recall  f1-score   support

           0       0.51      0.59      0.55      1500
           1       0.44      0.41      0.42      1500
           2       0.57      0.51      0.54      1500

    accuracy                           0.51      4500
   macro avg       0.51      0.51      0.51      4500
weighted avg       0.51      0.51      0.51      4500


 Actual vs. Predicted Ratings:
       Actual Rating  Predicted Rating
24058              1                 0
12620              1                 0
8167               2                 2
23570              0                 0
5548               2                 1
23723              2                 2
21400              1                 0
19004              1                 2
18255         

## References:

1.   https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
2.   https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html
3.   https://medium.com/data-science/why-how-to-use-the-naive-bayes-algorithms-in-a-regulated-industry-with-sklearn-python-code-dbd8304ab2cf


