## Load the data

In [58]:
import pandas as pd
import nltk
import re
from transformers import CamembertTokenizer, CamembertModel
import torch


# Load the training data
training_data_path = 'Data/training_data.csv'
training_data = pd.read_csv(training_data_path)

# Display the first few rows of the dataset
training_data.head()

Unnamed: 0,id,sentence,difficulty
0,0,Les coûts kilométriques réels peuvent diverger...,C1
1,1,"Le bleu, c'est ma couleur préférée mais je n'a...",A1
2,2,Le test de niveau en français est sur le site ...,A1
3,3,Est-ce que ton mari est aussi de Boston?,A1
4,4,"Dans les écoles de commerce, dans les couloirs...",B1


### Text Cleaning: 
We'll remove special characters, numbers, and unnecessary punctuation from the sentences using regular expressions.
### Tokenization: 
We'll use nltk to tokenize the sentences. This involves breaking down each sentence into individual words.
### Text Embedding: 
We'll use a pre-trained French language model from the transformers library for embedding.

In [59]:
# Text cleaning function
def clean_text(text):
    text = re.sub(r'[^A-Za-zÀ-ÖØ-öø-ÿ\s]', '', text)  # Remove non-alphabetic characters
    return text.lower().strip()

# Clean the sentences
training_data['cleaned_sentence'] = training_data['sentence'].apply(clean_text)

# Initialize the tokenizer and model for embedding
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
model = CamembertModel.from_pretrained("camembert-base")

# Initialize the tokenizer and PyTorch model for embedding
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
pytorch_model = CamembertModel.from_pretrained("camembert-base")

# Tokenization and Embedding function
def embed_sentence(sentence):
    # Tokenize the sentence
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=512)
    
    # Embedding with the pre-trained PyTorch model
    with torch.no_grad():
        outputs = pytorch_model(**inputs)
    return outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

# Embed all sentences in the dataset --> takes +- 5 minutes
#full_embedded = training_data['cleaned_sentence'].apply(embed_sentence)

In [None]:
trainingdataafterbert = pd.read_csv('Data/training_data_after_bert.csv')

### How much time will it take to embedd all entries

In [60]:
import time

# Select a small sample of sentences
sample_data = training_data['cleaned_sentence'].head(50)

# Start the timer
start_time = time.time()

# Embed the sample sentences
sample_embedded = sample_data.apply(embed_sentence)

# Calculate the time taken
time_taken = time.time() - start_time
average_time_per_sentence = time_taken / 50
estimated_total_time = average_time_per_sentence * 4800

print(f"Estimated total time for embedding 4800 sentences: {estimated_total_time} seconds")


Estimated total time for embedding 4800 sentences: 222.41665649414065 seconds


### Feature preperation, label encoding and splitting data in train and test sets

In [70]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import numpy as np



# Assuming 'sample_embedded' contains the embedded vectors for a subset of your data
# Since the elements are already numpy arrays, we can directly use them
features = np.array([embedding for embedding in full_embedded])

# Label Encoding
label_encoder = LabelEncoder()
training_data['difficulty_encoded'] = label_encoder.fit_transform(training_data['difficulty'])

# Extract the labels
labels = training_data['difficulty_encoded'].values

# Splitting the Dataset into Training and Validation Sets
X_train, X_val, y_train, y_val = train_test_split(features, labels, test_size=0.01)



### Models training

#### LogReg

In [62]:
from sklearn.linear_model import LogisticRegression
logistic_regression_model = LogisticRegression(max_iter=1000)
logistic_regression_model.fit(X_train, y_train)
from sklearn.neighbors import KNeighborsClassifier
knn_model = KNeighborsClassifier(n_neighbors=11)  # You can tune 'n_neighbors'
knn_model.fit(X_train, y_train)
from sklearn.tree import DecisionTreeClassifier
decision_tree_model = DecisionTreeClassifier()
decision_tree_model.fit(X_train, y_train)
from sklearn.ensemble import RandomForestClassifier
random_forest_model = RandomForestClassifier(n_estimators=100)
random_forest_model.fit(X_train, y_train)
from sklearn.svm import SVC
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)

### Performance Statistics + confusion matrix

In [63]:
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Logistic Regression
lr_predictions = logistic_regression_model.predict(X_val)
print("Logistic Regression:")
print(classification_report(y_val, lr_predictions))
print("Accuracy:", accuracy_score(y_val, lr_predictions))
print("Confusion Matrix:\n", confusion_matrix(y_val, lr_predictions))

# k-Nearest Neighbors
knn_predictions = knn_model.predict(X_val)
print("\nk-Nearest Neighbors:")
print(classification_report(y_val, knn_predictions))
print("Accuracy:", accuracy_score(y_val, knn_predictions))
print("Confusion Matrix:\n", confusion_matrix(y_val, knn_predictions))

# Decision Tree
dt_predictions = decision_tree_model.predict(X_val)
print("\nDecision Tree:")
print(classification_report(y_val, dt_predictions))
print("Accuracy:", accuracy_score(y_val, dt_predictions))
print("Confusion Matrix:\n", confusion_matrix(y_val, dt_predictions))

# Random Forest
rf_predictions = random_forest_model.predict(X_val)
print("\nRandom Forest:")
print(classification_report(y_val, rf_predictions))
print("Accuracy:", accuracy_score(y_val, rf_predictions))
print("Confusion Matrix:\n", confusion_matrix(y_val, rf_predictions))

# Support Vector Machine
svm_predictions = svm_model.predict(X_val)
print("\nSupport Vector Machine:")
print(classification_report(y_val, svm_predictions))
print("Accuracy:", accuracy_score(y_val, svm_predictions))
print("Confusion Matrix:\n", confusion_matrix(y_val, svm_predictions))


Logistic Regression:
              precision    recall  f1-score   support

           0       1.00      0.58      0.74        12
           1       0.29      0.57      0.38         7
           2       0.40      0.22      0.29         9
           3       0.40      0.29      0.33         7
           4       0.20      1.00      0.33         2
           5       1.00      0.64      0.78        11

    accuracy                           0.50        48
   macro avg       0.55      0.55      0.47        48
weighted avg       0.66      0.50      0.53        48

Accuracy: 0.5
Confusion Matrix:
 [[7 5 0 0 0 0]
 [0 4 3 0 0 0]
 [0 5 2 1 1 0]
 [0 0 0 2 5 0]
 [0 0 0 0 2 0]
 [0 0 0 2 2 7]]

k-Nearest Neighbors:
              precision    recall  f1-score   support

           0       1.00      0.42      0.59        12
           1       0.50      0.86      0.63         7
           2       0.50      0.44      0.47         9
           3       0.25      0.14      0.18         7
           4       

### Hyperparameter gridsearch 

#### LogReg

In [64]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Parameter grid
param_grid_lr = {
    'C': [0.001, 0.01, 0.1, 1, 10],
    'solver': ['lbfgs', 'liblinear']
}

# Model
logistic_regression_model = LogisticRegression(max_iter=10)
grid_search_lr = GridSearchCV(logistic_regression_model, param_grid_lr, cv=5, scoring='accuracy')
grid_search_lr.fit(X_train, y_train)

# Best parameters
print("Best parameters for Logistic Regression:", grid_search_lr.best_params_)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Best parameters for Logistic Regression: {'C': 1, 'solver': 'liblinear'}




#### KNN

In [65]:
from sklearn.neighbors import KNeighborsClassifier

# Parameter grid
param_grid_knn = {
    'n_neighbors': [3, 5, 7, 9 , 11, 13, 15, 17, 19],
    'weights': ['uniform', 'distance']
}

# Model
knn_model = KNeighborsClassifier()
grid_search_knn = GridSearchCV(knn_model, param_grid_knn, cv=5, scoring='accuracy')
grid_search_knn.fit(X_train, y_train)

# Best parameters
print("Best parameters for kNN:", grid_search_knn.best_params_)


Best parameters for kNN: {'n_neighbors': 11, 'weights': 'uniform'}


#### Decision tree

In [None]:
#from sklearn.tree import DecisionTreeClassifier

# Parameter grid
#param_grid_dt = {
#    'max_depth': [10, 20, 30, None],
#    'min_samples_split': [2, 5, 10],
#    'min_samples_leaf': [1, 2, 4]
#}

# Model
#decision_tree_model = DecisionTreeClassifier()
#grid_search_dt = GridSearchCV(decision_tree_model, param_grid_dt, cv=5, scoring='accuracy')
#grid_search_dt.fit(X_train, y_train)

# Best parameters
#print("Best parameters for Decision Tree:", grid_search_dt.best_params_)


#### Random Forest ==> takes to long

#### SVM 

In [67]:
from sklearn.svm import SVC

# Parameter grid
param_grid_svm = {
    'C': [0.1, 1, 10],
    'kernel': ['linear', 'rbf']
}

# Model
svm_model = SVC()
grid_search_svm = GridSearchCV(svm_model, param_grid_svm, cv=5, scoring='accuracy')
grid_search_svm.fit(X_train, y_train)

# Best parameters
print("Best parameters for SVM:", grid_search_svm.best_params_)


Best parameters for SVM: {'C': 10, 'kernel': 'rbf'}


### Retraining the models

In [71]:
logistic_regression_optimized = LogisticRegression(**grid_search_lr.best_params_)
logistic_regression_optimized.fit(X_train, y_train)
knn_optimized = KNeighborsClassifier(**grid_search_knn.best_params_)
knn_optimized.fit(X_train, y_train)
#decision_tree_optimized = DecisionTreeClassifier(**grid_search_dt.best_params_)
# Takes to long to train
#decision_tree_optimized.fit(X_train, y_train)
#random_forest_optimized = RandomForestClassifier(**grid_search_rf.best_params_)
# Takes to long to train
#random_forest_optimized.fit(X_train, y_train)
svm_optimized = SVC(**grid_search_svm.best_params_)
svm_optimized.fit(X_train, y_train)

### Evaluate Model Performance Again

In [72]:
from sklearn.metrics import classification_report, accuracy_score

# Evaluate each model
models = [logistic_regression_optimized, knn_optimized, svm_optimized]
model_names = ["Logistic Regression", "k-Nearest Neighbors", "SVM"]

for model, name in zip(models, model_names):
    predictions = model.predict(X_val)
    print(f"Model: {name}")
    print(classification_report(y_val, predictions))
    print("Accuracy:", accuracy_score(y_val, predictions))
    print("\n")


Model: Logistic Regression
              precision    recall  f1-score   support

           0       0.40      1.00      0.57         4
           1       0.57      0.44      0.50         9
           2       0.80      0.53      0.64        15
           3       0.38      1.00      0.55         3
           4       0.67      0.57      0.62         7
           5       0.57      0.40      0.47        10

    accuracy                           0.56        48
   macro avg       0.56      0.66      0.56        48
weighted avg       0.63      0.56      0.56        48

Accuracy: 0.5625


Model: k-Nearest Neighbors
              precision    recall  f1-score   support

           0       0.25      0.25      0.25         4
           1       0.33      0.22      0.27         9
           2       0.56      0.60      0.58        15
           3       0.00      0.00      0.00         3
           4       0.30      0.43      0.35         7
           5       0.50      0.40      0.44        10

    

: 

## Apply the model on the unlabeled data

In [None]:
# Step 1: Load Unlabeled Data
unlabelled_test_data = pd.read_csv('Data/unlabelled_test_data.csv')

# Step 2: Preprocess and Embed the Unlabeled Data

# Clean the sentences in the unlabeled data
unlabelled_test_data['cleaned_sentence'] = unlabelled_test_data['sentence'].apply(clean_text)

# Embed all sentences in the unlabeled dataset
# Note: This might take some time depending on the size of your dataset
unlabeled_test_features = unlabelled_test_data['cleaned_sentence'].apply(embed_sentence)


In [None]:
import numpy as np

# Convert the series of arrays into a 2D NumPy array
unlabeled_test_features_array = np.array([feature for feature in unlabeled_test_features])

# Now use the SVM model to make predictions
# Ensure unlabeled_test_features_array is in the correct format expected by the SVM model
predictions = svm_optimized.predict(unlabeled_test_features_array)

# Convert Predictions back to Difficulty Labels
predicted_difficulties = label_encoder.inverse_transform(predictions)

# Step 5: Output the Predictions
unlabelled_test_data['difficulty'] = predicted_difficulties
unlabelled_test_data[['id', 'difficulty']].to_csv('Data/Nvidia_submission.csv', index=False)