# IMPORT THE DATA

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd

# Adjust the path based on where you've stored the files
training_data = '/content/drive/MyDrive/Colab Notebooks/2024_Data_science/Final_Project_Kaggle_Competition/training_data.csv'
unlabelled_data = '/content/drive/MyDrive/Colab Notebooks/2024_Data_science/Final_Project_Kaggle_Competition/unlabelled_test_data.csv'
sample_submission = '/content/drive/MyDrive/Colab Notebooks/2024_Data_science/Final_Project_Kaggle_Competition/sample_submission.csv'

training_data_pd = pd.read_csv(training_data)
unlabelled_data_pd = pd.read_csv(unlabelled_data)
sample_submission_pd = pd.read_csv(sample_submission)


In [None]:
display(training_data_pd)
display(unlabelled_data_pd)
display(sample_submission_pd)

Unnamed: 0,id,sentence,difficulty
0,0,Les coûts kilométriques réels peuvent diverger...,C1
1,1,"Le bleu, c'est ma couleur préférée mais je n'a...",A1
2,2,Le test de niveau en français est sur le site ...,A1
3,3,Est-ce que ton mari est aussi de Boston?,A1
4,4,"Dans les écoles de commerce, dans les couloirs...",B1
...,...,...,...
4795,4795,"C'est pourquoi, il décida de remplacer les hab...",B2
4796,4796,Il avait une de ces pâleurs splendides qui don...,C1
4797,4797,"Et le premier samedi de chaque mois, venez ren...",A2
4798,4798,Les coûts liés à la journalisation n'étant pas...,C2


Unnamed: 0,id,sentence
0,0,Nous dûmes nous excuser des propos que nous eû...
1,1,Vous ne pouvez pas savoir le plaisir que j'ai ...
2,2,"Et, paradoxalement, boire froid n'est pas la b..."
3,3,"Ce n'est pas étonnant, car c'est une saison my..."
4,4,"Le corps de Golo lui-même, d'une essence aussi..."
...,...,...
1195,1195,C'est un phénomène qui trouve une accélération...
1196,1196,Je vais parler au serveur et voir si on peut d...
1197,1197,Il n'était pas comme tant de gens qui par pare...
1198,1198,Ils deviennent dangereux pour notre économie.


Unnamed: 0,id,difficulty
0,0,A1
1,1,A1
2,2,A1
3,3,A1
4,4,A1
...,...,...
1195,1195,A1
1196,1196,A1
1197,1197,A1
1198,1198,A1


# LOGISTIC REGRESSION

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
import string
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')  # Ensure the tokenizer is downloaded

# Function to preprocess text
def preprocess_text(text):
    # Convert to lower case
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize
    tokens = word_tokenize(text)
    # Join back into a single string
    return ' '.join(tokens)

# Applying preprocessing to the sentences in the training data
training_data_pd['processed_sentence'] = training_data_pd['sentence'].apply(preprocess_text)

# Vectorization
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(training_data_pd['processed_sentence'])
y = training_data_pd['difficulty']

# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# We will use these splits for training and evaluating the logistic regression model


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Initialize the Logistic Regression model
model = LogisticRegression(random_state=42, max_iter=1000)

# Train the model
model.fit(X_train, y_train)

# Predict on the testing set
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print("Accuracy of the model:", accuracy)
print("Classification Report:\n", class_report)


Accuracy of the model: 0.465625
Classification Report:
               precision    recall  f1-score   support

          A1       0.52      0.78      0.62       166
          A2       0.36      0.38      0.37       158
          B1       0.41      0.33      0.36       166
          B2       0.48      0.39      0.43       153
          C1       0.45      0.42      0.43       152
          C2       0.55      0.48      0.52       165

    accuracy                           0.47       960
   macro avg       0.46      0.46      0.46       960
weighted avg       0.46      0.47      0.46       960



# KNN

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

# Assuming 'X_train', 'X_test', 'y_train', 'y_test' are already defined and ready to use from previous steps

# Initialize the KNN model
# You can adjust the 'n_neighbors' parameter based on your validation results or use GridSearchCV to find the optimal value
knn_model = KNeighborsClassifier(n_neighbors=5)

# Train the model
knn_model.fit(X_train, y_train)

# Predict on the testing set
y_pred_knn = knn_model.predict(X_test)

# Evaluate the model
accuracy_knn = accuracy_score(y_test, y_pred_knn)
class_report_knn = classification_report(y_test, y_pred_knn)

print("Accuracy of the KNN model:", accuracy_knn)
print("Classification Report for KNN:\n", class_report_knn)


Accuracy of the KNN model: 0.19479166666666667
Classification Report for KNN:
               precision    recall  f1-score   support

          A1       0.21      0.96      0.34       166
          A2       0.10      0.06      0.07       158
          B1       0.11      0.04      0.05       166
          B2       0.14      0.02      0.03       153
          C1       0.11      0.01      0.01       152
          C2       0.80      0.05      0.09       165

    accuracy                           0.19       960
   macro avg       0.24      0.19      0.10       960
weighted avg       0.25      0.19      0.10       960



In [None]:
# Assuming 'vectorizer' is already fitted on the training data

# Preprocess and vectorize the unlabelled data
X_unlabelled_knn = vectorizer.transform(unlabelled_data_pd['processed_sentence'])

# Make predictions with KNN
predictions_knn = knn_model.predict(X_unlabelled_knn)

# Prepare the submission DataFrame
submission_knn = pd.DataFrame({
    'id': unlabelled_data_pd['id'],
    'difficulty': predictions_knn
})
"""
# Save the DataFrame to a CSV file for submission
submission_knn.to_csv('/content/drive/My Drive/Colab Notebooks/2024_Data_science/Final_Project_Kaggle_Competition/submission_knn.csv', index=False)
"""
display(submission_knn)


Unnamed: 0,id,difficulty
0,0,C2
1,1,A1
2,2,A1
3,3,A1
4,4,B2
...,...,...
1195,1195,A1
1196,1196,A1
1197,1197,A2
1198,1198,A1


# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Assuming 'X_train', 'X_test', 'y_train', 'y_test' are already defined and ready to use from the previous steps

# Initialize the Decision Tree model
dt_model = DecisionTreeClassifier(random_state=42)

# Train the model
dt_model.fit(X_train, y_train)

# Predict on the testing set
y_pred_dt = dt_model.predict(X_test)

# Evaluate the model
accuracy_dt = accuracy_score(y_test, y_pred_dt)
class_report_dt = classification_report(y_test, y_pred_dt)

print("Accuracy of the Decision Tree model:", accuracy_dt)
print("Classification Report for Decision Tree:\n", class_report_dt)


Accuracy of the Decision Tree model: 0.31666666666666665
Classification Report for Decision Tree:
               precision    recall  f1-score   support

          A1       0.47      0.52      0.50       166
          A2       0.24      0.23      0.24       158
          B1       0.28      0.22      0.25       166
          B2       0.29      0.35      0.32       153
          C1       0.26      0.30      0.28       152
          C2       0.35      0.27      0.30       165

    accuracy                           0.32       960
   macro avg       0.31      0.32      0.31       960
weighted avg       0.32      0.32      0.31       960



In [None]:
# Assuming 'vectorizer' is already fitted on the training data

# Preprocess and vectorize the unlabelled data
X_unlabelled_dt = vectorizer.transform(unlabelled_data_pd['processed_sentence'])

# Make predictions with the Decision Tree
predictions_dt = dt_model.predict(X_unlabelled_dt)

# Prepare the submission DataFrame
submission_dt = pd.DataFrame({
    'id': unlabelled_data_pd['id'],
    'difficulty': predictions_dt
})
"""
# Save the DataFrame to a CSV file for submission
submission_dt.to_csv('/content/drive/My Drive/Colab Notebooks/2024_Data_science/Final_Project_Kaggle_Competition/submission_dt.csv', index=False)
"""
display(submission_dt)


Unnamed: 0,id,difficulty
0,0,C1
1,1,A2
2,2,A1
3,3,A2
4,4,C2
...,...,...
1195,1195,B1
1196,1196,A1
1197,1197,C2
1198,1198,A1


# Random Forest Model

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Assuming 'X_train', 'X_test', 'y_train', 'y_test' are already defined and ready to use from the previous vectorization and splitting steps

# Initialize the Random Forest model
# You can adjust 'n_estimators' and 'max_depth' among other parameters
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
rf_model.fit(X_train, y_train)

# Predict on the testing set
y_pred_rf = rf_model.predict(X_test)

# Evaluate the model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
class_report_rf = classification_report(y_test, y_pred_rf)

print("Accuracy of the Random Forest model:", accuracy_rf)
print("Classification Report for Random Forest:\n", class_report_rf)


Accuracy of the Random Forest model: 0.34270833333333334
Classification Report for Random Forest:
               precision    recall  f1-score   support

          A1       0.35      0.86      0.50       166
          A2       0.29      0.22      0.25       158
          B1       0.28      0.17      0.21       166
          B2       0.31      0.29      0.30       153
          C1       0.32      0.26      0.28       152
          C2       0.52      0.25      0.34       165

    accuracy                           0.34       960
   macro avg       0.35      0.34      0.31       960
weighted avg       0.35      0.34      0.32       960



In [None]:
# Assuming 'vectorizer' is already fitted on the training data

# Preprocess and vectorize the unlabelled data
X_unlabelled_rf = vectorizer.transform(unlabelled_data_pd['processed_sentence'])

# Make predictions with the Random Forest
predictions_rf = rf_model.predict(X_unlabelled_rf)

# Prepare the submission DataFrame
submission_rf = pd.DataFrame({
    'id': unlabelled_data_pd['id'],
    'difficulty': predictions_rf
})
"""
# Save the DataFrame to a CSV file for submission
submission_rf.to_csv('/content/drive/My Drive/Colab Notebooks/2024_Data_science/Final_Project_Kaggle_Competition/submission_rf.csv', index=False)
"""
display(submission_rf)


Unnamed: 0,id,difficulty
0,0,A1
1,1,B1
2,2,A1
3,3,A1
4,4,C2
...,...,...
1195,1195,B1
1196,1196,A1
1197,1197,C2
1198,1198,A1


# Other technique: CamenBERT from **BERT**

In [None]:
!pip install --no-cache-dir accelerate==0.29.3
!pip install --no-cache-dir transformers[torch]==4.40.1

In [None]:
from transformers import CamembertTokenizer, CamembertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.model_selection import train_test_split
from datasets import Dataset, load_metric
import numpy as np

# Load the tokenizer
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')

# Function to encode data
def encode_data(tokenizer, df):
    texts = df['sentence'].tolist()
    labels = df['difficulty'].map({'A1': 0, 'A2': 1, 'B1': 2, 'B2': 3, 'C1': 4, 'C2': 5}).tolist()
    encodings = tokenizer(texts, truncation=True, padding='max_length', max_length=128)
    return Dataset.from_dict({
        'input_ids': encodings['input_ids'],
        'attention_mask': encodings['attention_mask'],
        'labels': labels
    })

# Split the DataFrame into training and validation sets
train_data, val_data = train_test_split(training_data_pd, test_size=0.4, random_state=42)

# Tokenize and prepare datasets
train_dataset = encode_data(tokenizer, train_data)
val_dataset = encode_data(tokenizer, val_data)

# Load the model
model = CamembertForSequenceClassification.from_pretrained('camembert-base', num_labels=6)

# Metric for evaluation
def compute_metrics(eval_pred):
    metric = load_metric("accuracy")
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = metric.compute(predictions=predictions, references=labels)
    return {"accuracy": accuracy['accuracy']}

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=6,  # Slight adjustment if needed based on results
    per_device_train_batch_size=8,  # A bit smaller to adjust gradient updates
    warmup_steps=500,  # Adjust based on the total number of steps (num_epochs * total_data / batch_size)
    weight_decay=0.05,  # Good for regularization
    logging_dir='./logs',
    logging_steps=50,  # Less frequent to reduce logging overhead
    learning_rate=5e-5,  # Adjusted for potentially better convergence
    fp16=True,  # Make sure your hardware supports FP16 for this to be effective
    evaluation_strategy="epoch",  # Change to steps if you need more frequent evaluation
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model='accuracy',
    greater_is_better=True
)

# Initialize the Trainer with added compute_metrics for dynamic metric calculation
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,  # Ensure metrics are computed during evaluation
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]

)

# Train the model
trainer.train()


In [None]:

from transformers import CamembertTokenizer
import numpy as np

# Load the tokenizer
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')

# Tokenize the sentences from the 'sentence' column, specifying max_length
max_length = 512  # Typical max length for BERT-based models
unlabelled_encodings = tokenizer(list(unlabelled_data_pd['sentence']), truncation=True, padding=True, max_length=max_length, return_tensors="pt")

# Assuming 'trainer' is your trained model's Trainer instance
from torch.utils.data import Dataset, DataLoader

class SimpleDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: val[idx] for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

# Create a dataset from the tokenized data
dataset = SimpleDataset(unlabelled_encodings)

# Predict using the trainer
predictions = trainer.predict(dataset)
predicted_labels = np.argmax(predictions.predictions, axis=1)

# Define a mapping from numeric labels to categories
difficulty_levels = {0: 'A1', 1: 'A2', 2: 'B1', 3: 'B2', 4: 'C1', 5: 'C2'}
predicted_difficulties = [difficulty_levels[label] for label in predicted_labels]

import pandas as pd

# Create a DataFrame for submission
submission_df = pd.DataFrame({
    'id': unlabelled_data_pd['id'],
    'difficulty': predicted_difficulties
})

# Save the DataFrame to a CSV file for submission
submission_df.to_csv('/content/drive/My Drive/Colab Notebooks/2024_Data_science/Final_Project_Kaggle_Competition/submission_CamemBERT_15_22_0001.csv', index=False)


# Comparaison table of the different models

## Which is the best model?

## Confusion Matrix

## Exemples of some erroneous predictions

## Other Analysis