# Data loading

In [1]:
import pandas as pd

# Load the dataset
file_path_training = "Data/training_data.csv"
file_path_test = "Data/unlabelled_test_data.csv"
training_data = pd.read_csv(file_path_training)
test_data = pd.read_csv(file_path_test)


In [2]:
# Display the first few rows of the DataFrame
print("First few rows of the DataFrame:")
print(training_data.head(20))

First few rows of the DataFrame:
    id                                           sentence difficulty
0    0  Les coûts kilométriques réels peuvent diverger...         C1
1    1  Le bleu, c'est ma couleur préférée mais je n'a...         A1
2    2  Le test de niveau en français est sur le site ...         A1
3    3           Est-ce que ton mari est aussi de Boston?         A1
4    4  Dans les écoles de commerce, dans les couloirs...         B1
5    5  voilà une autre histoire que j'ai beaucoup aimée.         A2
6    6  Les médecins disent souvent qu'on doit boire u...         A2
7    7  Il est particulièrement observé chez les perso...         B2
8    8  J'ai retrouvé le plaisir de manger un oeuf à l...         A2
9    9  Nous allons bien, nous habitons dans une petit...         B1
10  10                            Bonjour et bonne année.         A1
11  11  La presse s'est abondamment fait l'écho des ef...         B2
12  12  Pour que le rocher s'ouvre, il faut le toucher...         B1
1

## Data cleaning

In [3]:
import re

def preprocess_text(sentence):
    # Convert to lowercase
    sentence = sentence.lower()
    # Remove unnecessary punctuation while retaining French accents and special characters
    sentence = re.sub(r'[^a-zàâçéèêëîïôûùüÿñæœ\s]', '', sentence)
    return sentence

# Apply the preprocessing to each sentence
training_data['processed_sentence'] = training_data['sentence'].apply(preprocess_text)
test_data['processed_sentence'] = test_data['sentence'].apply(preprocess_text)
# Display the first few rows of the dataframe after preprocessing
print(training_data.head())
print(test_data.head())

   id                                           sentence difficulty  \
0   0  Les coûts kilométriques réels peuvent diverger...         C1   
1   1  Le bleu, c'est ma couleur préférée mais je n'a...         A1   
2   2  Le test de niveau en français est sur le site ...         A1   
3   3           Est-ce que ton mari est aussi de Boston?         A1   
4   4  Dans les écoles de commerce, dans les couloirs...         B1   

                                  processed_sentence  
0  les coûts kilométriques réels peuvent diverger...  
1  le bleu cest ma couleur préférée mais je naime...  
2  le test de niveau en français est sur le site ...  
3             estce que ton mari est aussi de boston  
4  dans les écoles de commerce dans les couloirs ...  
   id                                           sentence  \
0   0  Nous dûmes nous excuser des propos que nous eû...   
1   1  Vous ne pouvez pas savoir le plaisir que j'ai ...   
2   2  Et, paradoxalement, boire froid n'est pas la b...   
3  

## Feature engineering

In [4]:
from transformers import CamembertTokenizer, CamembertModel
import torch
import textstat
import pyphen

# Initialize CamemBERT tokenizer and model
tokenizer = CamembertTokenizer.from_pretrained('camembert-base')
model = CamembertModel.from_pretrained('camembert-base')
# Initialize Pyphen for syllable counting
dic = pyphen.Pyphen(lang='fr')

def get_camembert_embedding(sentence):
    # Tokenize and encode the sentence for CamemBERT
    inputs = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=512)
    # Get the embeddings
    with torch.no_grad():
        outputs = model(**inputs)
    # Extract the embeddings for the [CLS] token (representing the entire sentence)
    embeddings = outputs.last_hidden_state[:, 0, :].squeeze().numpy()
    return embeddings


def count_syllables(word):
    # Count the hyphens as an approximation of syllable count
    hyphenated = dic.inserted(word)
    return hyphenated.count('-') + 1


# Apply feature engineering and concatenate CamemBERT embeddings
def add_features(df):
    # Adding features based on text properties
    df['LEN'] = df['processed_sentence'].apply(lambda x: len(x.split()))
    df['UNIQUE_WORD_COUNT'] = df['processed_sentence'].apply(lambda x: len(set(x.split())))
    df['AVG_WORD_LENGTH'] = df['processed_sentence'].apply(lambda x: sum(len(word) for word in x.split()) / len(x.split()) if x.split() else 0)
    df['DCRS'] = df['processed_sentence'].apply(lambda x: textstat.dale_chall_readability_score(x))
    df['FKG'] = df['processed_sentence'].apply(lambda x: textstat.flesch_kincaid_grade(x))
    df['ARI'] = df['processed_sentence'].apply(lambda x: textstat.automated_readability_index(x))
    df['SYLLABLE_COUNT'] = df['processed_sentence'].apply(lambda x: sum(count_syllables(word) for word in x.split()))

    # Adding CamemBERT embeddings
    embeddings = df['processed_sentence'].apply(get_camembert_embedding).tolist()
    # Generate column names for embeddings
    embedding_column_names = [f'embedding_{i}' for i in range(len(embeddings[0]))]
    embeddings_df = pd.DataFrame(embeddings, columns=embedding_column_names)
    df = pd.concat([df, embeddings_df], axis=1)

    return df
# Apply feature engineering
training_data = add_features(training_data)
test_data = add_features(test_data)

# Encoding the difficulty levels
difficulty_encoding = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}
training_data['difficulty_encoded'] = training_data['difficulty'].map(difficulty_encoding)

# Display the dataframe with new features
print(training_data.head())
print(test_data.head())

   id                                           sentence difficulty  \
0   0  Les coûts kilométriques réels peuvent diverger...         C1   
1   1  Le bleu, c'est ma couleur préférée mais je n'a...         A1   
2   2  Le test de niveau en français est sur le site ...         A1   
3   3           Est-ce que ton mari est aussi de Boston?         A1   
4   4  Dans les écoles de commerce, dans les couloirs...         B1   

                                  processed_sentence  LEN  UNIQUE_WORD_COUNT  \
0  les coûts kilométriques réels peuvent diverger...   38                 29   
1  le bleu cest ma couleur préférée mais je naime...   12                 11   
2  le test de niveau en français est sur le site ...   13                 11   
3             estce que ton mari est aussi de boston    8                  8   
4  dans les écoles de commerce dans les couloirs ...   32                 24   

   AVG_WORD_LENGTH   DCRS   FKG   ARI  ...  embedding_759  embedding_760  \
0         5.5263

## Add binairy difficulty column
This way the difficulty is first divided in easy and difficult and later the CEFR label is given.

In [5]:
def map_difficulty_to_binary(difficulty):
    # Mapping lower difficulties (A1 to B1) to 0 and higher difficulties (B2 to C2) to 1
    return 0 if difficulty in ['A1', 'A2', 'B1'] else 1

# Apply the mapping to create a new boolean column
training_data['difficulty_binary'] = training_data['difficulty'].apply(map_difficulty_to_binary)

# Display the dataframe with the new column
training_data.head(20)


Unnamed: 0,id,sentence,difficulty,processed_sentence,LEN,UNIQUE_WORD_COUNT,AVG_WORD_LENGTH,DCRS,FKG,ARI,...,embedding_760,embedding_761,embedding_762,embedding_763,embedding_764,embedding_765,embedding_766,embedding_767,difficulty_encoded,difficulty_binary
0,0,Les coûts kilométriques réels peuvent diverger...,C1,les coûts kilométriques réels peuvent diverger...,38,29,5.526316,17.57,18.1,23.6,...,-0.065459,0.1343,-0.01138,0.022304,0.008905,-0.214924,0.004331,-0.054215,5,1
1,1,"Le bleu, c'est ma couleur préférée mais je n'a...",A1,le bleu cest ma couleur préférée mais je naime...,12,11,3.916667,18.71,0.9,3.0,...,-0.081141,0.126171,0.05308,0.040183,0.09243,-0.128848,-0.03445,0.043938,1,0
2,2,Le test de niveau en français est sur le site ...,A1,le test de niveau en français est sur le site ...,13,11,4.0,16.43,3.6,3.9,...,-0.177957,0.087216,0.031253,0.014485,0.073736,-0.044085,0.000471,-0.027738,1,0
3,3,Est-ce que ton mari est aussi de Boston?,A1,estce que ton mari est aussi de boston,8,8,3.875,17.85,2.9,0.8,...,-0.183525,0.121947,-0.008364,0.084223,0.092719,-0.132519,0.058858,0.007657,1,0
4,4,"Dans les écoles de commerce, dans les couloirs...",B1,dans les écoles de commerce dans les couloirs ...,32,24,5.09375,16.57,13.4,18.5,...,-0.070021,0.052438,0.013443,0.033107,0.002078,-0.140131,0.009309,-0.019106,3,0
5,5,voilà une autre histoire que j'ai beaucoup aimée.,A2,voilà une autre histoire que jai beaucoup aimée,8,8,5.0,19.82,2.9,6.1,...,-0.136071,0.123643,0.066058,0.008499,0.042318,-0.13474,0.015552,-0.023559,2,0
6,6,Les médecins disent souvent qu'on doit boire u...,A2,les médecins disent souvent quon doit boire un...,15,14,4.466667,19.12,3.2,7.1,...,-0.07771,0.077807,0.024878,0.099138,0.01385,-0.133629,0.008823,-0.001436,2,0
7,7,Il est particulièrement observé chez les perso...,B2,il est particulièrement observé chez les perso...,31,27,5.612903,18.42,17.7,20.5,...,-0.093944,0.137133,0.076429,0.055389,0.041365,-0.03301,0.055074,-0.050868,4,1
8,8,J'ai retrouvé le plaisir de manger un oeuf à l...,A2,jai retrouvé le plaisir de manger un oeuf à la...,11,11,3.818182,18.54,1.7,2.1,...,-0.137939,0.020195,0.013345,0.054082,0.04753,-0.168198,-0.061109,0.011835,2,0
9,9,"Nous allons bien, nous habitons dans une petit...",B1,nous allons bien nous habitons dans une petite...,15,14,4.866667,19.12,6.8,9.0,...,-0.168125,0.132555,0.172277,0.040058,0.115313,-0.087904,0.057913,-0.012454,3,0


## Create another column to split the difficulty per letter

In [6]:
def map_difficulty_to_group(difficulty):
    if difficulty in ['A1', 'A2']:
        return 0
    elif difficulty in ['B1', 'B2']:
        return 1
    else: # Assuming remaining are 'C1' and 'C2'
        return 2

# Apply the mapping to create the new column
training_data['difficulty_group'] = training_data['difficulty'].apply(map_difficulty_to_group)

# Display the first few rows to verify the new column
training_data.head()

Unnamed: 0,id,sentence,difficulty,processed_sentence,LEN,UNIQUE_WORD_COUNT,AVG_WORD_LENGTH,DCRS,FKG,ARI,...,embedding_761,embedding_762,embedding_763,embedding_764,embedding_765,embedding_766,embedding_767,difficulty_encoded,difficulty_binary,difficulty_group
0,0,Les coûts kilométriques réels peuvent diverger...,C1,les coûts kilométriques réels peuvent diverger...,38,29,5.526316,17.57,18.1,23.6,...,0.1343,-0.01138,0.022304,0.008905,-0.214924,0.004331,-0.054215,5,1,2
1,1,"Le bleu, c'est ma couleur préférée mais je n'a...",A1,le bleu cest ma couleur préférée mais je naime...,12,11,3.916667,18.71,0.9,3.0,...,0.126171,0.05308,0.040183,0.09243,-0.128848,-0.03445,0.043938,1,0,0
2,2,Le test de niveau en français est sur le site ...,A1,le test de niveau en français est sur le site ...,13,11,4.0,16.43,3.6,3.9,...,0.087216,0.031253,0.014485,0.073736,-0.044085,0.000471,-0.027738,1,0,0
3,3,Est-ce que ton mari est aussi de Boston?,A1,estce que ton mari est aussi de boston,8,8,3.875,17.85,2.9,0.8,...,0.121947,-0.008364,0.084223,0.092719,-0.132519,0.058858,0.007657,1,0,0
4,4,"Dans les écoles de commerce, dans les couloirs...",B1,dans les écoles de commerce dans les couloirs ...,32,24,5.09375,16.57,13.4,18.5,...,0.052438,0.013443,0.033107,0.002078,-0.140131,0.009309,-0.019106,3,0,1


## Save the data in a CSV file

In [7]:
import pandas as pd
training_data.to_csv('Data/3model_training_data.csv', index= False)
test_data.to_csv('Data/3model_test_data.csv', index= False)

## Load the data

In [8]:
training_data = pd.read_csv('Data/3model_training_data.csv')
test_data = pd.read_csv('Data/3model_test_data.csv')

In [9]:
print(training_data.head())
print(test_data.head())

   id                                           sentence difficulty  \
0   0  Les coûts kilométriques réels peuvent diverger...         C1   
1   1  Le bleu, c'est ma couleur préférée mais je n'a...         A1   
2   2  Le test de niveau en français est sur le site ...         A1   
3   3           Est-ce que ton mari est aussi de Boston?         A1   
4   4  Dans les écoles de commerce, dans les couloirs...         B1   

                                  processed_sentence  LEN  UNIQUE_WORD_COUNT  \
0  les coûts kilométriques réels peuvent diverger...   38                 29   
1  le bleu cest ma couleur préférée mais je naime...   12                 11   
2  le test de niveau en français est sur le site ...   13                 11   
3             estce que ton mari est aussi de boston    8                  8   
4  dans les écoles de commerce dans les couloirs ...   32                 24   

   AVG_WORD_LENGTH   DCRS   FKG   ARI  ...  embedding_761  embedding_762  \
0         5.5263

## Prepare data for the first training
Now we going to train a model that can classify the training in 2 categories: easy and hard

In [10]:
from sklearn.model_selection import cross_val_score, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import numpy as np

# Extract features and labels
X = training_data.drop(['id', 'sentence', 'difficulty', 'processed_sentence', 'difficulty_encoded', 'difficulty_binary', 'difficulty_group'], axis=1)
y = training_data['difficulty_group']

# Ensure all column names are of type string
X.columns = X.columns.astype(str)

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [11]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

# Initialize the SVM classifier
model = SVC(random_state=42)

# Perform cross-validation
cv_scores = cross_val_score(model, X_scaled, y, cv=5)

# Print the cross-validation results
print("Cross-Validation Scores:", cv_scores)
print("Average CV Score:", cv_scores.mean())


Cross-Validation Scores: [0.75729167 0.765625   0.74375    0.74583333 0.740625  ]
Average CV Score: 0.7506250000000001


## Hyperparameter tuning

In [12]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'C': [0.1, 1, 10], 
    'gamma': ['scale', 'auto'], 
    'kernel': ['rbf', 'poly']
}

# Create a GridSearchCV object
grid_search = GridSearchCV(SVC(random_state=42), param_grid, cv=5, scoring='accuracy', verbose=2)

# Perform grid search
grid_search.fit(X_scaled, y)

# Print the best parameters and the corresponding score
print("Best parameters:", grid_search.best_params_)
print("Best cross-validation score:", grid_search.best_score_)


Fitting 5 folds for each of 12 candidates, totalling 60 fits
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   8.8s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   8.7s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   8.7s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   8.8s
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time=   8.7s
[CV] END ....................C=0.1, gamma=scale, kernel=poly; total time=  10.3s
[CV] END ....................C=0.1, gamma=scale, kernel=poly; total time=  10.0s
[CV] END ....................C=0.1, gamma=scale, kernel=poly; total time=  10.6s
[CV] END ....................C=0.1, gamma=scale, kernel=poly; total time=   9.4s
[CV] END ....................C=0.1, gamma=scale, kernel=poly; total time=   9.5s
[CV] END ......................C=0.1, gamma=auto, kernel=rbf; total time=   8.6s
[CV] END ......................C=0.1, gamma=auto

## Final model evaluation
For the first part

In [13]:
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Train the final model
final_model = SVC(**grid_search.best_params_, random_state=42)
final_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = final_model.predict(X_test)

# Print classification report and confusion matrix
print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.79      0.85      0.82       324
           1       0.63      0.58      0.60       319
           2       0.79      0.80      0.80       317

    accuracy                           0.74       960
   macro avg       0.74      0.74      0.74       960
weighted avg       0.74      0.74      0.74       960

Confusion Matrix:
[[274  49   1]
 [ 69 185  65]
 [  2  61 254]]


## Three part classification

In [52]:
# Assuming final_model is your trained model and X_scaled contains all features
group_predictions = final_model.predict(X_scaled)

## Sub-classificiation within each group

### Group 0 = A 

In [53]:
# Filter data for group 0 (A1-A2)
group_0_data = training_data[(group_predictions == 0) & (training_data['difficulty_encoded'].isin([1, 2]))]
X_group_0 = group_0_data.drop(['id', 'sentence', 'difficulty', 'processed_sentence', 'difficulty_binary', 'difficulty_group', 'difficulty_encoded'], axis=1)
y_group_0 = group_0_data['difficulty_encoded']  # 'difficulty_encoded' with values 1 and 2


In [54]:
print(X_group_0.head(50))

     LEN  UNIQUE_WORD_COUNT  AVG_WORD_LENGTH   DCRS  FKG   ARI  \
1     12                 11         3.916667  18.71  0.9   3.0   
2     13                 11         4.000000  16.43  3.6   3.9   
3      8                  8         3.875000  17.85  2.9   0.8   
5      8                  8         5.000000  19.82  2.9   6.1   
6     15                 14         4.466667  19.12  3.2   7.1   
8     11                 11         3.818182  18.54  1.7   2.1   
10     4                  4         4.750000  19.62  3.7   2.9   
13    10                 10         4.000000  19.92  0.1   2.4   
17    15                 14         3.733333  18.07  3.2   3.6   
19     8                  8         3.750000  19.82  2.9   0.2   
24     6                  6         4.833333  19.72 -1.5   4.3   
26    11                 10         4.636364  18.54  4.0   5.9   
27     6                  6         4.000000  17.09  0.9   0.4   
37     4                  4         3.500000  19.62 -2.3  -3.0   
40     6  

In [55]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X_group_0_scaled = scaler.transform(X_group_0)  # Use the same scaler as before

# Train-test split for group 0
X_train, X_test, y_train, y_test = train_test_split(X_group_0_scaled, y_group_0, test_size=0.2, random_state=42)

# Initialize and train a new model for group 0
model_group_0 = SVC(**grid_search.best_params_,random_state=42)
model_group_0.fit(X_train, y_train)

# Evaluate the model for group 0
y_pred_group_0 = model_group_0.predict(X_test)
print("Accuracy for group 0 (A1-A2):", accuracy_score(y_test, y_pred_group_0))


Accuracy for group 0 (A1-A2): 0.7339743589743589


### Group 1 = B

In [56]:
# Filter data for group 1 (B1-B2)
group_1_data = training_data[(group_predictions == 1) & (training_data['difficulty_encoded'].isin([3, 4]))]
X_group_1 = group_1_data.drop(['id', 'sentence', 'difficulty', 'processed_sentence', 'difficulty_binary', 'difficulty_group', 'difficulty_encoded'], axis=1)
y_group_1 = group_1_data['difficulty_encoded']  # 'difficulty_encoded' with values 3 and 4

In [57]:
print(X_group_1.head(50))

     LEN  UNIQUE_WORD_COUNT  AVG_WORD_LENGTH   DCRS   FKG   ARI  \
4     32                 24         5.093750  16.57  13.4  18.5   
7     31                 27         5.612903  18.42  17.7  20.5   
9     15                 14         4.866667  19.12   6.8   9.0   
11    17                 16         5.176471  19.34   7.6  11.5   
14    15                 14         5.333333  18.07   9.1  11.2   
16    16                 16         5.250000  19.23   7.2  11.3   
18    23                 22         4.913043  19.19  11.1  13.2   
20     9                  9         5.777778  18.12  11.5  10.3   
28    10                 10         4.300000  18.34   6.0   3.8   
32    16                 16         5.250000  20.22   7.2  11.3   
35    38                 29         5.078947  17.16  16.9  21.5   
36    14                 13         5.142857  18.99   5.2   9.8   
47    13                 12         5.153846  18.86   6.0   9.3   
51    26                 25         5.000000  18.89  13.4  15.

In [28]:
# Scale features for group 1
X_group_1_scaled = scaler.transform(X_group_1)  # Use the same scaler as before

# Train-test split for group 1
X_train, X_test, y_train, y_test = train_test_split(X_group_1_scaled, y_group_1, test_size=0.2, random_state=42)

# Initialize and train a new model for group 1
model_group_1 = SVC(**grid_search.best_params_,random_state=42)
model_group_1.fit(X_train, y_train)

# Evaluate the model for group 1
y_pred_group_1 = model_group_1.predict(X_test)
print("Accuracy for group 1 (B1-B2):", accuracy_score(y_test, y_pred_group_1))

Accuracy for group 1 (B1-B2): 0.8379310344827586


### Group 2 = C

In [31]:
# Filter data for group 2 (C1-C2)
group_2_data = training_data[(group_predictions == 2) & (training_data['difficulty_encoded'].isin([5, 6]))]
X_group_2 = group_2_data.drop(['id', 'sentence', 'difficulty', 'processed_sentence', 'difficulty_binary', 'difficulty_group', 'difficulty_encoded'], axis=1)
y_group_2 = group_2_data['difficulty_encoded']  # 'difficulty_encoded' with values 5 and 6

In [None]:
print(X_group_2.head(50))

In [58]:
# Scale features for group 2
X_group_2_scaled = scaler.transform(X_group_2)  # Use the same scaler as before

# Train-test split for group 2
X_train, X_test, y_train, y_test = train_test_split(X_group_2_scaled, y_group_2, test_size=0.2, random_state=42)

# Initialize and train a new model for group 2
model_group_2 = SVC(**grid_search.best_params_,random_state=42)
model_group_2.fit(X_train, y_train)

# Evaluate the model for group 2
y_pred_group_2 = model_group_2.predict(X_test)
print("Accuracy for group 2 (C1-C2):", accuracy_score(y_test, y_pred_group_2))

Accuracy for group 2 (C1-C2): 0.7605177993527508


# Predicting on the unseen data

In [59]:
# Load the test data
test_data = pd.read_csv('Data/3model_test_data.csv')

In [38]:
print(test_data.head())

   id                                           sentence  \
0   0  Nous dûmes nous excuser des propos que nous eû...   
1   1  Vous ne pouvez pas savoir le plaisir que j'ai ...   
2   2  Et, paradoxalement, boire froid n'est pas la b...   
3   3  Ce n'est pas étonnant, car c'est une saison my...   
4   4  Le corps de Golo lui-même, d'une essence aussi...   

                                  processed_sentence  LEN  UNIQUE_WORD_COUNT  \
0  nous dûmes nous excuser des propos que nous eû...   10                  8   
1  vous ne pouvez pas savoir le plaisir que jai d...   14                 14   
2  et paradoxalement boire froid nest pas la bonn...    9                  9   
3  ce nest pas étonnant car cest une saison mysté...    9                  9   
4  le corps de golo luimême dune essence aussi su...   72                 56   

   AVG_WORD_LENGTH   DCRS   FKG   ARI  SYLLABLE_COUNT  ...  embedding_758  \
0         5.000000  16.76   4.8   7.1              15  ...      -0.091074   
1   

In [60]:
X_new = test_data.drop(['id','sentence', 'processed_sentence'], axis=1)

# Scale the features
X_new_scaled = scaler.transform(X_new)

## Predict Group Classification and Exact CEFR Level

In [61]:
# Predict group classification
group_predictions_new = final_model.predict(X_new_scaled)

In [64]:
# Example for group 0 (A1-A2)
group_0_indices = group_predictions_new == 0
X_group_0_new = X_new_scaled[group_0_indices]
predicted_cefr_group_0 = model_group_0.predict(X_group_0_new)

In [65]:
# Predict exact CEFR level within Group 1
group_1_indices = group_predictions_new == 1
X_group_1_new = X_new_scaled[group_1_indices]
predicted_cefr_group_1 = model_group_1.predict(X_group_1_new)


In [66]:
# Predict exact CEFR level within Group 2
group_2_indices = group_predictions_new == 2
X_group_2_new = X_new_scaled[group_2_indices]
predicted_cefr_group_2 = model_group_2.predict(X_group_2_new)


## Combine the predictions again

In [67]:
# Initialize an array to store all predictions
combined_predictions = np.empty(len(test_data), dtype=object)

# Assign predictions from each group to the combined array
combined_predictions[group_0_indices] = predicted_cefr_group_0
combined_predictions[group_1_indices] = predicted_cefr_group_1
combined_predictions[group_2_indices] = predicted_cefr_group_2


### Reverse encoding

In [68]:
def map_numeric_to_cefr(numeric_value):
    cefr_mapping = {1: 'A1', 2: 'A2', 3: 'B1', 4: 'B2', 5: 'C1', 6: 'C2'}
    return cefr_mapping.get(numeric_value, "Unknown")

# Apply the mapping to the predictions of each group
predicted_cefr_group_0_mapped = [map_numeric_to_cefr(num) for num in predicted_cefr_group_0]
predicted_cefr_group_1_mapped = [map_numeric_to_cefr(num) for num in predicted_cefr_group_1]
predicted_cefr_group_2_mapped = [map_numeric_to_cefr(num) for num in predicted_cefr_group_2]

# Initialize an array to store all mapped predictions
combined_predictions_mapped = np.empty(len(test_data), dtype=object)

# Assign mapped predictions from each group to the combined array
combined_predictions_mapped[group_0_indices] = predicted_cefr_group_0_mapped
combined_predictions_mapped[group_1_indices] = predicted_cefr_group_1_mapped
combined_predictions_mapped[group_2_indices] = predicted_cefr_group_2_mapped

# Add predicted CEFR level to the original data
test_data['difficulty'] = combined_predictions_mapped

# Output the final DataFrame with the original order of sentences and predicted CEFR levels
final_output = test_data[['sentence', 'difficulty']]


In [69]:
print(final_output.head())

                                            sentence difficulty
0  Nous dûmes nous excuser des propos que nous eû...         C2
1  Vous ne pouvez pas savoir le plaisir que j'ai ...         B1
2  Et, paradoxalement, boire froid n'est pas la b...         B1
3  Ce n'est pas étonnant, car c'est une saison my...         A1
4  Le corps de Golo lui-même, d'une essence aussi...         C2


In [51]:
test_data[['id', 'difficulty']].to_csv('Data/3model_Nvidia_submission.csv', index=False)