# Data loading

In [1]:
import pandas as pd

# Load the dataset
file_path_training = "Data/training_data.csv"
file_path_test = "Data/unlabelled_test_data.csv"
training_data = pd.read_csv(file_path_training)
test_data = pd.read_csv(file_path_test)


In [2]:
# Display the first few rows of the DataFrame
print("First few rows of the DataFrame:")
print(training_data.head(20))
print(test_data.head())

First few rows of the DataFrame:
    id                                           sentence difficulty
0    0  Les coûts kilométriques réels peuvent diverger...         C1
1    1  Le bleu, c'est ma couleur préférée mais je n'a...         A1
2    2  Le test de niveau en français est sur le site ...         A1
3    3           Est-ce que ton mari est aussi de Boston?         A1
4    4  Dans les écoles de commerce, dans les couloirs...         B1
5    5  voilà une autre histoire que j'ai beaucoup aimée.         A2
6    6  Les médecins disent souvent qu'on doit boire u...         A2
7    7  Il est particulièrement observé chez les perso...         B2
8    8  J'ai retrouvé le plaisir de manger un oeuf à l...         A2
9    9  Nous allons bien, nous habitons dans une petit...         B1
10  10                            Bonjour et bonne année.         A1
11  11  La presse s'est abondamment fait l'écho des ef...         B2
12  12  Pour que le rocher s'ouvre, il faut le toucher...         B1
1

## Data cleaning

In [3]:
import re

def preprocess_text(sentence):
    # Convert to lowercase
    sentence = sentence.lower()
    # Remove unnecessary punctuation while retaining French accents and special characters
    sentence = re.sub(r'[^a-zàâçéèêëîïôûùüÿñæœ\s]', '', sentence)
    return sentence

# Apply the preprocessing to each sentence
training_data['processed_sentence'] = training_data['sentence'].apply(preprocess_text)
test_data['processed_sentence'] = test_data['sentence'].apply(preprocess_text)
# Display the first few rows of the dataframe after preprocessing
print(training_data.head())
print(test_data.head())

   id                                           sentence difficulty  \
0   0  Les coûts kilométriques réels peuvent diverger...         C1   
1   1  Le bleu, c'est ma couleur préférée mais je n'a...         A1   
2   2  Le test de niveau en français est sur le site ...         A1   
3   3           Est-ce que ton mari est aussi de Boston?         A1   
4   4  Dans les écoles de commerce, dans les couloirs...         B1   

                                  processed_sentence  
0  les coûts kilométriques réels peuvent diverger...  
1  le bleu cest ma couleur préférée mais je naime...  
2  le test de niveau en français est sur le site ...  
3             estce que ton mari est aussi de boston  
4  dans les écoles de commerce dans les couloirs ...  
   id                                           sentence  \
0   0  Nous dûmes nous excuser des propos que nous eû...   
1   1  Vous ne pouvez pas savoir le plaisir que j'ai ...   
2   2  Et, paradoxalement, boire froid n'est pas la b...   
3  

## Feature engineering

In [4]:
from transformers import CamembertTokenizer, CamembertModel
import torch
import textstat
import pyphen

# Initialize CamemBERT tokenizer and model
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')
model = CamembertModel.from_pretrained('camembert-base')
# Initialize Pyphen for syllable counting
dic = pyphen.Pyphen(lang='fr')

def get_camembert_embedding(sentence):
    # Tokenize and encode the sentence for CamemBERT
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=512)
    # Get the embeddings
    with torch.no_grad():
        outputs = model(**inputs)
    # Extract the embeddings for the [CLS] token (representing the entire sentence)
    embeddings = outputs.last_hidden_state[:, 0, :].squeeze().numpy()
    return embeddings


def count_syllables(word):
    # Count the hyphens as an approximation of syllable count
    hyphenated = dic.inserted(word)
    return hyphenated.count('-') + 1


# Apply feature engineering and concatenate CamemBERT embeddings
def add_features(df):
    # Adding features based on text properties
    df['LEN'] = df['processed_sentence'].apply(lambda x: len(x.split()))
    df['UNIQUE_WORD_COUNT'] = df['processed_sentence'].apply(lambda x: len(set(x.split())))
    df['AVG_WORD_LENGTH'] = df['processed_sentence'].apply(lambda x: sum(len(word) for word in x.split()) / len(x.split()) if x.split() else 0)
    df['DCRS'] = df['processed_sentence'].apply(lambda x: textstat.dale_chall_readability_score(x))
    df['FKG'] = df['processed_sentence'].apply(lambda x: textstat.flesch_kincaid_grade(x))
    df['ARI'] = df['processed_sentence'].apply(lambda x: textstat.automated_readability_index(x))
    df['SYLLABLE_COUNT'] = df['processed_sentence'].apply(lambda x: sum(count_syllables(word) for word in x.split()))

    # Adding CamemBERT embeddings
    embeddings = df['processed_sentence'].apply(get_camembert_embedding).tolist()
    # Generate column names for embeddings
    embedding_column_names = [f'embedding_{i}' for i in range(len(embeddings[0]))]
    embeddings_df = pd.DataFrame(embeddings, columns=embedding_column_names)
    df = pd.concat([df, embeddings_df], axis=1)

    return df
# Apply feature engineering
training_data = add_features(training_data)
test_data = add_features(test_data)

# Encoding the difficulty levels
difficulty_encoding = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}
training_data['difficulty_encoded'] = training_data['difficulty'].map(difficulty_encoding)

# Display the dataframe with new features
print(training_data.head())
print(test_data.head())

## Add binairy difficulty column
This way the difficulty is first divided in easy and difficult and later the CEFR label is given.

In [None]:
def map_difficulty_to_binary(difficulty):
    # Mapping lower difficulties (A1 to B1) to 0 and higher difficulties (B2 to C2) to 1
    return 0 if difficulty in ['A1', 'A2', 'B1'] else 1

# Apply the mapping to create a new boolean column
training_data['difficulty_binary'] = training_data['difficulty'].apply(map_difficulty_to_binary)

# Display the dataframe with the new column
training_data.head(20)


## Create another column to split the difficulty per letter

In [None]:
def map_difficulty_to_group(difficulty):
    if difficulty in ['A1', 'A2']:
        return 0
    elif difficulty in ['B1', 'B2']:
        return 1
    else: # Assuming remaining are 'C1' and 'C2'
        return 2

# Apply the mapping to create the new column
training_data['difficulty_group'] = training_data['difficulty'].apply(map_difficulty_to_group)

# Display the first few rows to verify the new column
training_data.head()

Unnamed: 0,id,sentence,difficulty,processed_sentence,LEN,UNIQUE_WORD_COUNT,AVG_WORD_LENGTH,SYLLABLE_COUNT,FKG,ARI,...,embedding_759,embedding_760,embedding_761,embedding_762,embedding_763,embedding_764,embedding_765,embedding_766,embedding_767,difficulty_group
0,0,Les coûts kilométriques réels peuvent diverger...,C1,les coûts kilométriques réels peuvent diverger...,38,29,5.526316,65,18.1,23.6,...,-0.067223,-0.065459,0.1343,-0.01138,0.022304,0.008905,-0.214924,0.004331,-0.054215,2
1,1,"Le bleu, c'est ma couleur préférée mais je n'a...",A1,le bleu cest ma couleur préférée mais je naime...,12,11,3.916667,15,0.9,3.0,...,-0.152003,-0.081141,0.126171,0.05308,0.040183,0.09243,-0.128848,-0.03445,0.043938,0
2,2,Le test de niveau en français est sur le site ...,A1,le test de niveau en français est sur le site ...,13,11,4.0,18,3.6,3.9,...,-0.074644,-0.177957,0.087216,0.031253,0.014485,0.073736,-0.044085,0.000471,-0.027738,0
3,3,Est-ce que ton mari est aussi de Boston?,A1,estce que ton mari est aussi de boston,8,8,3.875,11,2.9,0.8,...,-0.096884,-0.183525,0.121947,-0.008364,0.084223,0.092719,-0.132519,0.058858,0.007657,0
4,4,"Dans les écoles de commerce, dans les couloirs...",B1,dans les écoles de commerce dans les couloirs ...,32,24,5.09375,45,13.4,18.5,...,-0.074465,-0.070021,0.052438,0.013443,0.033107,0.002078,-0.140131,0.009309,-0.019106,1


## Save the data in a CSV file

In [None]:
import pandas as pd
training_data.to_csv('Data/3model_training_data.csv', index= False)
test_data.to_csv('Data/3model_test_data.csv', index= False)

## Load the data

In [None]:
training_data = pd.read_csv('Data/3model_training_data.csv')
test_data = pd.read_csv('Data/3model_test_data.csv')

In [None]:
print(training_data.head())
print(test_data.head())

## Prepare data for the first training
Now we going to train a model that can classify the training in 2 categories: easy and hard

In [28]:
from sklearn.model_selection import cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import numpy as np

# Extract features and labels
X = training_data.drop(['id', 'sentence', 'difficulty', 'processed_sentence', 'difficulty_encoded', 'difficulty_binary', 'difficulty_group'], axis=1)
y = training_data['difficulty_group']

# Ensure all column names are of type string
X.columns = X.columns.astype(str)

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)



CV Scores: [0.73854167 0.74166667 0.74583333 0.75208333 0.740625  ]
Average CV Score: 0.74375


In [29]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

# Initialize the SVM classifier
model = SVC(random_state=42)

# Perform cross-validation
cv_scores = cross_val_score(model, X_scaled, y, cv=5)

# Print the cross-validation results
print("Cross-Validation Scores:", cv_scores)
print("Average CV Score:", cv_scores.mean())


Cross-Validation Scores: [0.75625    0.76666667 0.74375    0.74479167 0.73958333]
Average CV Score: 0.7502083333333334


## Hyperparameter tuning

In [30]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'C': [0.1, 1, 10], 
    'gamma': ['scale', 'auto'], 
    'kernel': ['rbf', 'poly']
}

# Create a GridSearchCV object
grid_search = GridSearchCV(SVC(random_state=42), param_grid, cv=5, scoring='accuracy', verbose=2)

# Perform grid search
grid_search.fit(X_scaled, y)

# Print the best parameters and the corresponding score
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)


Fitting 5 folds for each of 24 candidates, totalling 120 fits
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=  16.6s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=  15.3s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=  17.0s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=  16.2s
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time=  15.8s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=  17.5s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=  17.3s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=  17.2s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=  17.4s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=  17.4s
[CV] END ....................C=0.1, gamma=scale, kernel=poly; total time=  20.7s
[CV] END ....................C=0.1, gamma=scale