## Data loading

In [1]:
import pandas as pd

# Load the dataset
file_path = "Data/training_data.csv"
data = pd.read_csv(file_path)

# Display the first few rows of the dataset to understand its structure
data.head()

Unnamed: 0,id,sentence,difficulty
0,0,Les coûts kilométriques réels peuvent diverger...,C1
1,1,"Le bleu, c'est ma couleur préférée mais je n'a...",A1
2,2,Le test de niveau en français est sur le site ...,A1
3,3,Est-ce que ton mari est aussi de Boston?,A1
4,4,"Dans les écoles de commerce, dans les couloirs...",B1


## Data Preprocessing

In [2]:
from sklearn.preprocessing import LabelEncoder

# Check for missing values
missing_values = data.isnull().sum()

# Encode the 'difficulty' column
label_encoder = LabelEncoder()
data['difficulty_encoded'] = label_encoder.fit_transform(data['difficulty'])

# Display missing values and the first few rows after encoding
missing_values, data.head()

(id            0
 sentence      0
 difficulty    0
 dtype: int64,
    id                                           sentence difficulty  \
 0   0  Les coûts kilométriques réels peuvent diverger...         C1   
 1   1  Le bleu, c'est ma couleur préférée mais je n'a...         A1   
 2   2  Le test de niveau en français est sur le site ...         A1   
 3   3           Est-ce que ton mari est aussi de Boston?         A1   
 4   4  Dans les écoles de commerce, dans les couloirs...         B1   
 
    difficulty_encoded  
 0                   4  
 1                   0  
 2                   0  
 3                   0  
 4                   2  )

## Test set loading

In [3]:
test_set = pd.read_csv("Data/unlabelled_test_data.csv")
test_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        1200 non-null   int64 
 1   sentence  1200 non-null   object
dtypes: int64(1), object(1)
memory usage: 18.9+ KB


## Clean the data

In [4]:
# Load the French language model
import re
import spacy


nlp = spacy.load("fr_core_news_sm")

def clean_french_sentences(sentence):
    if isinstance(sentence, str):
        # Apply cleaning steps
        sentence = re.sub(r'[^a-zA-ZéèàêâôûùçÉÈÀÊÂÔÛÙÇ\s]', '', sentence)
        sentence = sentence.lower()
        doc = nlp(sentence)
        return ' '.join([token.lemma_ for token in doc])
    return sentence  # Return as-is if not a string

data = clean_french_sentences(data)
test_set = clean_french_sentences(test_set)
print(data.head())

2023-12-07 21:42:50.829198: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-07 21:42:50.829366: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-07 21:42:50.831296: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-07 21:42:51.118673: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-12-07 21:42:58.093996: I tensorflow/compiler/

   id                                           sentence difficulty  \
0   0  Les coûts kilométriques réels peuvent diverger...         C1   
1   1  Le bleu, c'est ma couleur préférée mais je n'a...         A1   
2   2  Le test de niveau en français est sur le site ...         A1   
3   3           Est-ce que ton mari est aussi de Boston?         A1   
4   4  Dans les écoles de commerce, dans les couloirs...         B1   

   difficulty_encoded  
0                   4  
1                   0  
2                   0  
3                   0  
4                   2  


### Function of feature engineering

In [5]:
import torch
import pandas as pd
from transformers import CamembertTokenizer, CamembertModel
import spacy
import pyphen
from nltk.tokenize import word_tokenize, sent_tokenize
from lexicalrichness import LexicalRichness
import numpy as np
import textstat

# Load Spacy French model
nlp = spacy.load("fr_core_news_sm")

# Initialize Camembert
tokenizer = CamembertTokenizer.from_pretrained('camembert/camembert-large')
model = CamembertModel.from_pretrained('camembert/camembert-large')

# Function to calculate features for a given text
def calculate_features(text):
    # Tokenize the text into words and sentences
    words = word_tokenize(text, language='french')
    sentences = sent_tokenize(text, language='french')

    # Initialize Pyphen for syllable counting
    dic = pyphen.Pyphen(lang='fr')

    # Compute text embeddings
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

    # Lexical Diversity Measures
    lex = LexicalRichness(text)
    mtld = lex.mtld(threshold=0.72)

    # Syntactic Complexity Measures
    doc = nlp(text)
    num_subordinate_clauses = sum(1 for sent in doc.sents for token in sent if token.dep_ in ['csubj', 'csubjpass', 'advcl'])
    average_verbs_per_sentence = sum(1 for token in doc if token.pos_ == 'VERB') / len(sentences)

    # Readability Scores
    dcrs = textstat.dale_chall_readability_score(text)
    fkg = textstat.flesch_kincaid_grade(text)
    ari = textstat.automated_readability_index(text)
    cli = textstat.coleman_liau_index(text)

    return {
        'LEN': len(words),
        'AWL': np.mean([len(word) for word in words]),
        'TTR': len(set(words)) / len(words),
        'ASL': np.mean([len(word_tokenize(sentence, language='french')) for sentence in sentences]),
        'AVPS': average_verbs_per_sentence,
        'ASL.AVPS': np.mean([len(word_tokenize(sentence, language='french')) for sentence in sentences]) * average_verbs_per_sentence,
        'embeddings': embeddings.tolist(),  # Convert to list for easier handling
        'mtld': mtld,
        'num_subordinate_clauses': num_subordinate_clauses,
        'DCRS': dcrs,
        'FKG': fkg,
        'ARI': ari,
        'CLI': cli
    }

### Applied on train dataset

In [6]:
features_df = pd.DataFrame(data['sentence'].apply(calculate_features).tolist())
df = data.join(features_df)

# Example of accessing features of the first text
print(df.head())
# Save to csv file
df.to_csv('Data/Cleaned_Enhanced_training.csv', index=False)

   id                                           sentence difficulty  \
0   0  Les coûts kilométriques réels peuvent diverger...         C1   
1   1  Le bleu, c'est ma couleur préférée mais je n'a...         A1   
2   2  Le test de niveau en français est sur le site ...         A1   
3   3           Est-ce que ton mari est aussi de Boston?         A1   
4   4  Dans les écoles de commerce, dans les couloirs...         B1   

   difficulty_encoded  LEN       AWL       TTR   ASL  AVPS  ASL.AVPS  \
0                   4   44  4.954545  0.704545  44.0   4.0     176.0   
1                   0   14  3.642857  1.000000  14.0   2.0      28.0   
2                   0   14  3.857143  0.928571  14.0   1.0      14.0   
3                   0    9  3.666667  1.000000   9.0   1.0       9.0   
4                   2   39  4.564103  0.794872  39.0   4.0     156.0   

                                          embeddings       mtld  \
0  [0.02159704454243183, -0.21623103320598602, -0...  44.888889   
1  [0.

### Applied on the testset

In [7]:
test_set_df = pd.DataFrame(test_set['sentence'].apply(calculate_features).tolist())
test_df = test_set.join(test_set_df)

# Example of accessing features of the first text
print(test_df.head())
# Save to csv file

   id                                           sentence  LEN       AWL  \
0   0  Nous dûmes nous excuser des propos que nous eû...   10  5.000000   
1   1  Vous ne pouvez pas savoir le plaisir que j'ai ...   15  4.400000   
2   2  Et, paradoxalement, boire froid n'est pas la b...   12  4.166667   
3   3  Ce n'est pas étonnant, car c'est une saison my...   10  4.700000   
4   4  Le corps de Golo lui-même, d'une essence aussi...   78  4.987179   

        TTR   ASL  AVPS  ASL.AVPS  \
0  0.900000  10.0   3.0      30.0   
1  1.000000  15.0   4.0      60.0   
2  0.916667  12.0   1.0      12.0   
3  1.000000  10.0   0.0       0.0   
4  0.756410  78.0   8.0     624.0   

                                          embeddings       mtld  \
0  [-0.07058697938919067, -0.17462338507175446, -...  12.000000   
1  [0.06289323419332504, -0.09950374811887741, 0....  15.000000   
2  [0.1979207843542099, 0.25786763429641724, -0.0...  10.000000   
3  [-0.002974431961774826, -0.009397363290190697,...  33.8

In [8]:
test_df.to_csv('Data/Cleaned_Enhanced_test.csv', index=False)