### Look if there is a difference in accurcay when change the encoding of the sentences

In [1]:
import pandas as pd

# Reload the dataset
training_data = pd.read_csv('/home/nathan/OneDrive/GitHub/Nvidia/Data/training_data.csv')

# Check for missing values and basic statistics
training_data.head()

Unnamed: 0,id,sentence,difficulty
0,0,Les coûts kilométriques réels peuvent diverger...,C1
1,1,"Le bleu, c'est ma couleur préférée mais je n'a...",A1
2,2,Le test de niveau en français est sur le site ...,A1
3,3,Est-ce que ton mari est aussi de Boston?,A1
4,4,"Dans les écoles de commerce, dans les couloirs...",B1


In [2]:
import re
import string
import spacy

# Load the French language model for spaCy
nlp = spacy.load("fr_core_news_sm")

# Text Cleaning Function
def clean_text(text):
    # Lowercasing the text
    text = text.lower()
    # Removing punctuations
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Removing numbers and other non-letter characters
    text = re.sub(r'[^a-zàâçéèêëîïôûùüÿñæœ]', ' ', text)
    return text

# POS Tagging Function
def pos_tagging(text):
    doc = nlp(text)
    pos_tags = [token.pos_ for token in doc]
    return pos_tags

# Applying the cleaning function to the dataset
training_data['cleaned_sentence'] = training_data['sentence'].apply(clean_text)

# Feature Engineering
# Adding sentence length and word count
training_data['sentence_length'] = training_data['cleaned_sentence'].apply(len)
training_data['word_count'] = training_data['cleaned_sentence'].apply(lambda x: len(x.split()))

# Adding POS tagging
training_data['pos_tags'] = training_data['cleaned_sentence'].apply(pos_tagging)

# Displaying the first few rows of the updated dataset
training_data.head()


2023-11-19 19:31:47.999986: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-19 19:31:48.000650: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-19 19:31:48.249425: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-19 19:31:48.743181: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-11-19 19:31:54.149130: E external/local_xla/xla/

Unnamed: 0,id,sentence,difficulty,cleaned_sentence,sentence_length,word_count,pos_tags
0,0,Les coûts kilométriques réels peuvent diverger...,C1,les coûts kilométriques réels peuvent diverger...,247,38,"[DET, NOUN, ADJ, ADJ, VERB, VERB, ADV, ADP, NO..."
1,1,"Le bleu, c'est ma couleur préférée mais je n'a...",A1,le bleu cest ma couleur préférée mais je naime...,58,12,"[DET, NOUN, VERB, DET, NOUN, VERB, CCONJ, PRON..."
2,2,Le test de niveau en français est sur le site ...,A1,le test de niveau en français est sur le site ...,64,13,"[DET, NOUN, ADP, NOUN, ADP, NOUN, VERB, ADP, D..."
3,3,Est-ce que ton mari est aussi de Boston?,A1,estce que ton mari est aussi de boston,38,8,"[NOUN, SCONJ, PROPN, NOUN, AUX, ADV, ADP, PROPN]"
4,4,"Dans les écoles de commerce, dans les couloirs...",B1,dans les écoles de commerce dans les couloirs ...,200,32,"[ADP, DET, NOUN, ADP, NOUN, ADP, DET, NOUN, AD..."


: 

In [None]:
# Encode the difficulty levels
from sklearn.calibration import LabelEncoder


label_encoder = LabelEncoder()
training_data['difficulty_encoded'] = label_encoder.fit_transform(training_data['difficulty'])

# Display the first few rows of the modified dataframe
df_encoded_head = training_data.head()
encoded_classes = label_encoder.classes_

df_encoded_head, encoded_classes

In [None]:
from transformers import CamembertTokenizer, CamembertModel
import torch
import numpy as np

# Initialize the tokenizer and model
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
model = CamembertModel.from_pretrained("camembert-base")

# Function to encode sentences in batches
def encode_sentences_in_batches(sentences, batch_size=64):
    model.eval()  # Set the model to evaluation mode
    batched_embeddings = []

    for i in range(0, len(sentences), batch_size):
        batch_sentences = sentences[i:i + batch_size]
        inputs = tokenizer(batch_sentences, padding=True, truncation=True, return_tensors="pt", max_length=512)
        
        with torch.no_grad():
            outputs = model(**inputs)
        embeddings = outputs.last_hidden_state[:, 0, :].numpy()
        batched_embeddings.append(embeddings)

    return np.vstack(batched_embeddings)

# Tokenize and encode sentences in batches
encoded_sentences = encode_sentences_in_batches(training_data['sentence'].tolist())

# Use encoded_sentences for training the logistic regression model


In [None]:
from sklearn.model_selection import train_test_split


X_train, X_val, y_train, y_val = train_test_split(encoded_sentences, training_data['difficulty_encoded'], test_size=0.2, random_state=42)

In [None]:
from sklearn.linear_model import LogisticRegression
logistic_regression_model = LogisticRegression(max_iter=1000)
logistic_regression_model.fit(X_train, y_train)

In [None]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Logistic Regression
lr_predictions = logistic_regression_model.predict(X_val)
print("Logistic Regression:")
print(classification_report(y_val, lr_predictions))
print("Accuracy:", accuracy_score(y_val, lr_predictions))
print("Confusion Matrix:\n", confusion_matrix(y_val, lr_predictions))
