## Data Loading

In [7]:
import os

import pandas as pd

# Location of the training CSV (raw file in the project's GitHub repository).
DATA_URL = 'https://raw.githubusercontent.com/nathanvdv/Nvidia/main/Data/training_data.csv'

# SECURITY: the access token used to be hardcoded in this URL. A token that has
# been saved in a notebook should be treated as leaked and revoked. It is now
# read from the GITHUB_RAW_TOKEN environment variable and only appended to the
# URL when that variable is set.
raw_token = os.environ.get('GITHUB_RAW_TOKEN')
file_path = f'{DATA_URL}?token={raw_token}' if raw_token else DATA_URL

# Load the dataset
data = pd.read_csv(file_path)

# Display the first few rows of the dataset to understand its structure
data.head()

Unnamed: 0,id,sentence,difficulty
0,0,Les coûts kilométriques réels peuvent diverger...,C1
1,1,"Le bleu, c'est ma couleur préférée mais je n'a...",A1
2,2,Le test de niveau en français est sur le site ...,A1
3,3,Est-ce que ton mari est aussi de Boston?,A1
4,4,"Dans les écoles de commerce, dans les couloirs...",B1


## Data Preprocessing

In [8]:
from sklearn.preprocessing import LabelEncoder

# Per-column tally of null entries, taken before the encoded column is added
missing_values = data.isna().sum()

# Turn the 'difficulty' labels into integer codes; the fitted encoder stays
# available as `label_encoder`
label_encoder = LabelEncoder()
label_encoder.fit(data['difficulty'])
data['difficulty_encoded'] = label_encoder.transform(data['difficulty'])

# Show the null tally together with a preview of the frame after encoding
(missing_values, data.head())

(id            0
 sentence      0
 difficulty    0
 dtype: int64,
    id                                           sentence difficulty  \
 0   0  Les coûts kilométriques réels peuvent diverger...         C1   
 1   1  Le bleu, c'est ma couleur préférée mais je n'a...         A1   
 2   2  Le test de niveau en français est sur le site ...         A1   
 3   3           Est-ce que ton mari est aussi de Boston?         A1   
 4   4  Dans les écoles de commerce, dans les couloirs...         B1   
 
    difficulty_encoded  
 0                   4  
 1                   0  
 2                   0  
 3                   0  
 4                   2  )

In [9]:
# Tokenise every sentence once on whitespace and reuse the token lists below
# (previously each sentence was split up to four times across these features).
words_per_sentence = data['sentence'].apply(lambda s: s.split())

# Calculate sentence length (characters), word count, and average word length
data['sentence_length'] = data['sentence'].apply(len)
data['word_count'] = words_per_sentence.apply(len)
# A sentence without any words gets 0 instead of dividing by zero
data['average_word_length'] = words_per_sentence.apply(
    lambda words: sum(len(word) for word in words) / len(words) if words else 0
)

# Display the first few rows with the new features
data.head()

Unnamed: 0,id,sentence,difficulty,difficulty_encoded,sentence_length,word_count,average_word_length
0,0,Les coûts kilométriques réels peuvent diverger...,C1,4,255,38,5.736842
1,1,"Le bleu, c'est ma couleur préférée mais je n'a...",A1,0,62,12,4.25
2,2,Le test de niveau en français est sur le site ...,A1,0,66,13,4.153846
3,3,Est-ce que ton mari est aussi de Boston?,A1,0,40,8,4.125
4,4,"Dans les écoles de commerce, dans les couloirs...",B1,2,209,34,5.176471


In [10]:
import spacy
from collections import Counter

# spaCy pipeline for French ("fr_core_news_sm"); `nlp` is the shared instance
# used by the POS-tagging helper below.
# NOTE(review): presumably the model package must already be installed in the
# kernel's environment - confirm in the setup instructions.
nlp = spacy.load(name="fr_core_news_sm")

# Function to count POS tags in a sentence
def pos_tag_count(sentence):
    """Tally the part-of-speech tags of the tokens in ``sentence``.

    The text is run through the module-level ``nlp`` pipeline and each
    token's ``pos_`` value is counted.

    Args:
        sentence: Text to analyse.

    Returns:
        A ``collections.Counter`` mapping each ``pos_`` value to the number
        of tokens carrying it.
    """
    return Counter(token.pos_ for token in nlp(sentence))

# Apply POS tagging to each sentence.
# nlp.pipe processes the texts in batches, which is considerably faster than
# calling nlp() once per row through .apply. The documents come back in input
# order, so the resulting list lines up with the rows of `data`.
data['pos_tags'] = [
    Counter(token.pos_ for token in doc)
    for doc in nlp.pipe(data['sentence'])
]


  return torch._C._cuda_getDeviceCount() > 0
2023-11-20 17:20:23.760119: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-20 17:20:23.760396: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-20 17:20:23.763196: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-20 17:20:24.054602: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-

In [11]:
# Count of unique words: distinct whitespace-separated tokens (case-sensitive,
# punctuation stays attached to the token it touches)
data['unique_word_count'] = data['sentence'].apply(lambda s: len(set(s.split())))

# Type-Token Ratio (TTR): unique words divided by total words.
# A sentence with no words would yield 0/0 = NaN; use 0 instead, matching the
# zero-word fallback of average_word_length.
data['type_token_ratio'] = (
    data['unique_word_count'] / data['word_count']
).where(data['word_count'] > 0, 0.0)


In [13]:
# Syntactic complexity proxy: total number of commas, semicolons and colons
data['clause_count'] = data['sentence'].apply(
    lambda text: sum(text.count(mark) for mark in (',', ';', ':'))
)

# Lexical richness proxy: number of whitespace-separated tokens longer than 7 characters
data['long_word_count'] = data['sentence'].apply(
    lambda text: sum(1 for token in text.split() if len(token) > 7)
)

# Readability proxy: copy the existing word count, then flag sentences with
# more than 15 words as long (1) and all others as 0
data['sentence_length_words'] = data['word_count']
data['long_sentence'] = data['sentence_length_words'].apply(
    lambda n_words: int(n_words > 15)
)

# Preview the frame with the added columns
data.head()


Unnamed: 0,id,sentence,difficulty,difficulty_encoded,sentence_length,word_count,average_word_length,pos_tags,unique_word_count,type_token_ratio,clause_count,long_word_count,sentence_length_words,long_sentence
0,0,Les coûts kilométriques réels peuvent diverger...,C1,4,255,38,5.736842,"{'DET': 3, 'NOUN': 14, 'ADJ': 3, 'VERB': 4, 'A...",29,0.763158,5,12,38,1
1,1,"Le bleu, c'est ma couleur préférée mais je n'a...",A1,0,62,12,4.25,"{'DET': 3, 'NOUN': 2, 'PUNCT': 2, 'PRON': 2, '...",12,1.0,1,1,12,0
2,2,Le test de niveau en français est sur le site ...,A1,0,66,13,4.153846,"{'DET': 3, 'NOUN': 6, 'ADP': 4, 'VERB': 1, 'PU...",12,0.923077,0,3,13,0
3,3,Est-ce que ton mari est aussi de Boston?,A1,0,40,8,4.125,"{'NOUN': 3, 'PRON': 1, 'SCONJ': 1, 'VERB': 1, ...",8,1.0,0,0,8,0
4,4,"Dans les écoles de commerce, dans les couloirs...",B1,2,209,34,5.176471,"{'ADP': 11, 'DET': 4, 'NOUN': 10, 'PUNCT': 5, ...",28,0.823529,2,8,34,1
