## Data loading

In [3]:
import pandas as pd

# Location of the training set, relative to the notebook's working directory
file_path = "Data/training_data.csv"

# Parse the CSV file into a DataFrame
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the leading rows to check the columns and their values
data.head()

FileNotFoundError: [Errno 2] No such file or directory: 'Data/training_data.csv'

## Data Preprocessing

In [None]:
from sklearn.preprocessing import LabelEncoder

# Count the null entries in every column before transforming anything
missing_values = data.isna().sum()

# Map each 'difficulty' label to an integer code, stored in a new column
# so the original labels stay available next to the encoded ones
label_encoder = LabelEncoder()
data['difficulty_encoded'] = label_encoder.fit_transform(data['difficulty'])

# Show the per-column null counts together with a preview of the encoded frame
missing_values, data.head()

(id            0
 sentence      0
 difficulty    0
 dtype: int64,
    id                                           sentence difficulty  \
 0   0  Les coûts kilométriques réels peuvent diverger...         C1   
 1   1  Le bleu, c'est ma couleur préférée mais je n'a...         A1   
 2   2  Le test de niveau en français est sur le site ...         A1   
 3   3           Est-ce que ton mari est aussi de Boston?         A1   
 4   4  Dans les écoles de commerce, dans les couloirs...         B1   
 
    difficulty_encoded  
 0                   4  
 1                   0  
 2                   0  
 3                   0  
 4                   2  )

### Feature engineering function

In [None]:
import torch
import pandas as pd
from transformers import CamembertTokenizer, CamembertModel
import spacy
import pyphen
from nltk.tokenize import word_tokenize, sent_tokenize
from lexicalrichness import LexicalRichness
import numpy as np

# spaCy pipeline for French, used for the dependency-based features
nlp = spacy.load("fr_core_news_sm")

# CamemBERT tokenizer and encoder, both loaded from the same checkpoint
# (tokenizer first, then the model)
tokenizer, model = (
    loader.from_pretrained('camembert/camembert-large')
    for loader in (CamembertTokenizer, CamembertModel)
)

def _mean_or_zero(values):
    """Return ``np.mean(values)``, or 0.0 when ``values`` is empty."""
    # np.mean([]) yields NaN plus a RuntimeWarning; 0.0 is a usable feature value
    return np.mean(values) if values else 0.0


# Function to calculate features for a given text
def calculate_features(text):
    """Compute length, lexical, syntactic and embedding features for one text.

    Relies on the module-level ``tokenizer`` and ``model`` (CamemBERT) and
    ``nlp`` (spaCy) objects.

    Parameters
    ----------
    text : str
        The text to analyse.

    Returns
    -------
    dict
        A dictionary with the keys:

        - ``'total_length'``: number of tokens returned by ``word_tokenize``.
        - ``'average_word_length'``: mean character length of those tokens.
        - ``'ttr'``: number of distinct tokens divided by the number of tokens.
        - ``'average_sentence_length'``: mean token count per sentence
          returned by ``sent_tokenize``.
        - ``'average_syllables_per_word'``: mean number of hyphen-separated
          pieces that Pyphen produces per token.
        - ``'mtld'``: ``LexicalRichness.mtld`` with a threshold of 0.72.
        - ``'num_subordinate_clauses'``: number of spaCy tokens whose
          dependency label is ``'csubj'``, ``'csubjpass'`` or ``'advcl'``.
        - ``'embeddings'``: the model's last hidden state averaged over
          dim 1, as a plain list.

        When the text yields no tokens, every ratio and mean is 0.0 instead
        of raising ``ZeroDivisionError`` or returning NaN.
    """
    # Tokenize the text into words and sentences
    words = word_tokenize(text, language='french')
    sentences = sent_tokenize(text, language='french')

    # Initialize Pyphen for syllable counting
    dic = pyphen.Pyphen(lang='fr')

    # Compute text embeddings; no_grad because this is inference only
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()

    # Lexical diversity. LexicalRichness tokenizes the text on its own, so its
    # word count (not ``words``) decides whether MTLD can be computed.
    # NOTE(review): mtld presumably fails on a text with zero words — confirm.
    lex = LexicalRichness(text)
    mtld = lex.mtld(threshold=0.72) if lex.words else 0.0

    # Syntactic complexity: count tokens that head a clausal subject or an
    # adverbial clause.
    # NOTE(review): the French spaCy parser presumably emits 'csubj:pass'
    # rather than 'csubjpass', in which case that entry never matches —
    # confirm against nlp.get_pipe("parser").labels before changing the list.
    doc = nlp(text)
    num_subordinate_clauses = sum(1 for sent in doc.sents for token in sent if token.dep_ in ['csubj', 'csubjpass', 'advcl'])

    return {
        'total_length': len(words),
        'average_word_length': _mean_or_zero([len(word) for word in words]),
        # Guard the division: an empty or whitespace-only text has no tokens
        'ttr': len(set(words)) / len(words) if words else 0.0,
        'average_sentence_length': _mean_or_zero([len(word_tokenize(sentence, language='french')) for sentence in sentences]),
        'average_syllables_per_word': _mean_or_zero([len(dic.inserted(word).split("-")) for word in words]),
        'mtld': mtld,
        'num_subordinate_clauses': num_subordinate_clauses,
        'embeddings': embeddings.tolist()  # Convert to list for easier handling
    }
