## Import libraries

In [1]:
from collections import defaultdict
import re
import unicodedata
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import word_tokenize
from textstat import textstat
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import spacy
import nltk
from sklearn.discriminant_analysis import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
import torch
from transformers import CamembertTokenizer, CamembertModel

2023-11-28 15:12:45.835943: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-28 15:12:45.835983: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-28 15:12:45.836026: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-28 15:12:45.852214: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-11-28 15:12:49.324099: I tensorflow/compiler/

## load data

In [2]:
# Load the dataset
file_path = "/home/nathan/OneDrive/GitHub/Nvidia/data/training_data.csv"
data = pd.read_csv(file_path)

test_set = pd.read_csv("/home/nathan/OneDrive/GitHub/Nvidia/data/unlabelled_test_data.csv")
test_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        1200 non-null   int64 
 1   sentence  1200 non-null   object
dtypes: int64(1), object(1)
memory usage: 18.9+ KB


## Clean data

In [3]:
nlp = spacy.load("fr_core_news_sm")

def clean_french_sentence(sentence):
    if isinstance(sentence, str):
        # Normalize unicode characters to decompose accents
        #sentence = unicodedata.normalize('NFD', sentence).encode('ascii', 'ignore').decode('utf-8')
        # Remove punctuation
        sentence = re.sub(r'[^\w\s]', '', sentence)
        # Remove extra spaces
        sentence = re.sub(r'\s+', ' ', sentence).strip()
        doc = nlp(sentence)
        return ' '.join([token.lemma_ for token in doc])
    return sentence

test_set['sentence'] = test_set['sentence'].apply(clean_french_sentence)
data['sentence'] = data['sentence'].apply(clean_french_sentence)

In [4]:
print(data['sentence'].head(15))

0     le coût kilométrique réel pouvoir diverger sen...
1     le bleu cest mon couleur préférer mais je naim...
2     le test de niveau en français être sur le site...
3               estce que ton mari être aussi de Boston
4     dans le école de commerce dans le couloir de p...
5        voilà un autre histoire que jai beaucoup aimer
6     le médecin dire souvent quon devoir boire un v...
7     il être particulièrement observer chez le pers...
8     Jai retrouver le plaisir de manger un oeuf à l...
9     nous aller bien nous habiter dans un petit mai...
10                                 Bonjour et bon année
11    le presse sest abondamment faire lécho de effe...
12    pour que le rocher souvre il falloir le touche...
13       Jhabite un bel ville dans le nord de le France
14    certes il devoir répondre à goût de consommate...
Name: sentence, dtype: object


## Label difficulty level

In [5]:
# Check for missing values
missing_values = data.isnull().sum()

# Encode the 'difficulty' column
label_encoder = LabelEncoder()
data['difficulty_encoded'] = label_encoder.fit_transform(data['difficulty'])

In [6]:
print(data.head(15))

    id                                           sentence difficulty  \
0    0  le coût kilométrique réel pouvoir diverger sen...         C1   
1    1  le bleu cest mon couleur préférer mais je naim...         A1   
2    2  le test de niveau en français être sur le site...         A1   
3    3            estce que ton mari être aussi de Boston         A1   
4    4  dans le école de commerce dans le couloir de p...         B1   
5    5     voilà un autre histoire que jai beaucoup aimer         A2   
6    6  le médecin dire souvent quon devoir boire un v...         A2   
7    7  il être particulièrement observer chez le pers...         B2   
8    8  Jai retrouver le plaisir de manger un oeuf à l...         A2   
9    9  nous aller bien nous habiter dans un petit mai...         B1   
10  10                               Bonjour et bon année         A1   
11  11  le presse sest abondamment faire lécho de effe...         B2   
12  12  pour que le rocher souvre il falloir le touche...       

## Text embedding

In [7]:
from transformers import CamembertTokenizer, CamembertModel
import torch
import numpy as np

# Initialize Camembert
tokenizer = CamembertTokenizer.from_pretrained('camembert/camembert-large')
model = CamembertModel.from_pretrained('camembert/camembert-large')

def get_text_embeddings(texts, batch_size=32):
    embeddings = []
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i:i + batch_size]
        try:
            inputs = tokenizer(batch_texts, return_tensors="pt", padding=True, truncation=True, max_length=512)
            with torch.no_grad():
                outputs = model(**inputs)
            batch_embeddings = outputs.last_hidden_state.mean(dim=1).squeeze()
            embeddings.append(batch_embeddings.cpu().numpy() if batch_embeddings.is_cuda else batch_embeddings.numpy())
        except Exception as e:
            print(f"Error in batch {i // batch_size}: {e}")
    
    # Concatenate all batch embeddings
    return np.concatenate(embeddings, axis=0)



## CEFR labeling for AJCV

In [8]:
import pandas as pd

def assign_cefr_level(row):
    # Assign a CEFR level based on the highest frequency across levels
    levels = ['A1', 'A2', 'B1', 'B2', 'C1', 'C2']
    freqs = [row[f'freq_{level}'] for level in levels]
    max_freq = max(freqs)
    
    # Return the level with the highest frequency, default to the highest level if all frequencies are zero
    return levels[freqs.index(max_freq)] if max_freq > 0 else 'C2'

def load_flelex_data(file_path):
    # Load the CSV file with tab delimiter
    fle_lex_data = pd.read_csv(file_path, sep='\t')

    # Assign CEFR levels to each word based on frequency data
    fle_lex_data['cefr_level'] = fle_lex_data.apply(assign_cefr_level, axis=1)

    # Create a dictionary mapping words to their CEFR levels
    cefrj_word_list = dict(zip(fle_lex_data['word'], fle_lex_data['cefr_level']))

    return cefrj_word_list

# Replace with the actual path of your FLELex_TreeTagger file
file_path = '/home/nathan/OneDrive/GitHub/Nvidia/data/FLELex_TreeTagger.csv'
cefrj_word_list = load_flelex_data(file_path)

# Function to convert CEFR-J levels to numeric values
def cefrj_level_to_numeric(level):
    mapping = {'A1': 1, 'A2': 2, 'B1': 3, 'B2': 4, 'C1': 5, 'C2': 6}
    return mapping.get(level, 0)  # Default to 0 if level not found

# Function to calculate AJCV
def ajcv_feature(text, cefrj_word_list):
    tokens = word_tokenize(text, language='french')
    words = [word.lower() for word in tokens if word.isalpha()]

    cefrj_values = []
    for word in words:
        if word in cefrj_word_list:
            cefrj_level = cefrj_word_list[word]
            numeric_value = cefrj_level_to_numeric(cefrj_level)
            cefrj_values.append(numeric_value)

    return sum(cefrj_values) / len(cefrj_values) if cefrj_values else 0


In [9]:
def bpera_feature(text, cefrj_word_list):
    tokens = nltk.word_tokenize(text, language='french')
    words = [word.lower() for word in tokens if word.isalpha()]

    a_level_count = 0
    b_level_count = 0

    for word in words:
        cefr_level = cefrj_word_list.get(word, None)
        if cefr_level in ['A1', 'A2']:
            a_level_count += 1
        elif cefr_level in ['B1', 'B2']:
            b_level_count += 1

    # Calculate BPERA, handle case where there are no A-level words
    return b_level_count / a_level_count if a_level_count > 0 else 0

## Feature engineering

In [18]:
from nltk.tokenize import word_tokenize
import pyphen
import textstat
from nltk.tokenize import sent_tokenize
from lexicalrichness import LexicalRichness

# Feature engineering function
def feature_engineering(text, cefrj_word_list):
    # Tokenization and word processing
    tokens = word_tokenize(text, language='french')
        # Tokenize the text into words and sentences
    words = word_tokenize(text, language='french')
    sentences = sent_tokenize(text, language='french')

    words = [word.lower() for word in tokens if word.isalpha()]
    num_sentences = text.count('.') + text.count('!') + text.count('?')
    num_words = len(words)
    word_lengths = [len(word) for word in words]
    # Get text embedding
    text_embedding = get_text_embeddings(text, batch_size=8)
    # Initialize Pyphen for syllable counting
    dic = pyphen.Pyphen(lang='fr')
    # Syntactic Complexity Measures
    doc = nlp(text)
    num_subordinate_clauses = sum(1 for sent in doc.sents for token in sent if token.dep_ in ['csubj', 'csubjpass', 'advcl'])
    average_verbs_per_sentence = sum(1 for token in doc if token.pos_ == 'VERB') / len(sentences)
        # Lexical Diversity Measures
    lex = LexicalRichness(text)
    mtld = lex.mtld(threshold=0.72)
    # Calculating features
    features = {
        #'AJCV': ajcv_feature(text, cefrj_word_list),  # Implement AJCV feature calculation
        #'ARI': textstat.automated_readability_index(text), # Implement ARI feature calculation
        #'ASL': len(words) / num_sentences if num_sentences > 0 else 0, # Implement ASL feature calculation
        #'AWL': sum(word_lengths) / num_words if num_words > 0 else 0,   # Implement AWL feature calculation
        #'BPERA': bpera_feature(text, cefrj_word_list),  # B per A ratio
        'CLI': textstat.coleman_liau_index(text), # Implement CLI feature calculation
        'DCRS': textstat.dale_chall_readability_score(text), # Implement DCRS feature calculation
        'FKG': textstat.flesch_kincaid_grade(text), # Implement FKG feature calculation
        'FRE': textstat.flesch_reading_ease(text), # Implement FRE feature calculation
        'LEN': len(text), # Implement LEN feature calculation
        #'embeddings': text_embedding.tolist(),  # Implement embeddings feature calculation
        #'TTR': len(set(words)) / num_words if num_words > 0 else 0 # Implement TTR feature calculation
        #'DIC': dic,
        #'num_subordinate_clauses': num_subordinate_clauses,
        #'AVPS': average_verbs_per_sentence,
        #'mtld': mtld
    }

    return features

## Applied on training data

In [19]:
# Apply the feature engineering function to each sentence and expand the results into separate columns
features_df = data['sentence'].apply(lambda x: feature_engineering(x, cefrj_word_list)).apply(pd.Series)

# Concatenate the original dataframe with the new features dataframe
df = pd.concat([data, features_df], axis=1)

# Display the updated DataFrame with separate columns for each feature
print(df.head())


KeyboardInterrupt: 

## applied on test data

In [None]:
# Apply the feature engineering function to each sentence and expand the results into separate columns
features_test = test_set['sentence'].apply(lambda x: feature_engineering(x, cefrj_word_list)).apply(pd.Series)

# Concatenate the original dataframe with the new features dataframe
df_test = pd.concat([test_set, features_test], axis=1)

# Display the updated DataFrame with separate columns for each feature
print(df_test.head())

## flatten embeddings

In [None]:
import re

def convert_string_to_list(string):
    # Check if the input is a string
    if isinstance(string, str):
        # Extract all floating-point numbers (including negatives) using a regular expression
        return [float(num) for num in re.findall(r"-?\d+\.\d+|-?\d+", string)]
    else:
        # Handle non-string inputs: return as-is, or handle differently if needed
        return string

# Apply this conversion to the embeddings column
df['embeddings'] = df['embeddings'].apply(convert_string_to_list)
df_test['embeddings'] = df_test['embeddings'].apply(convert_string_to_list)

# Flatten the embeddings
num_embedding_features = len(df['embeddings'].iloc[0])
for i in range(num_embedding_features):
    df[f'emb_{i}'] = df['embeddings'].apply(lambda x: x[i] if i < len(x) else None)
    df_test[f'emb_{i}'] = df_test['embeddings'].apply(lambda x: x[i] if i < len(x) else None)


In [None]:
print(df_test.head())

## Drop unused features

In [None]:
# Drop the original embeddings column and other non-feature columns
df.drop(['embeddings', 'sentence', 'id', 'difficulty'], axis=1, inplace=True)
df_test.drop(['embeddings', 'sentence', 'id'], axis=1, inplace=True)

## prepare for training and evaluation

In [None]:
# Preparing the data for training and validation
X = df.drop('difficulty_encoded', axis=1)
y = df['difficulty_encoded']

# Standardizing the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

## Hyperparameter tuning, training and validation of training data

In [None]:
from sklearn.metrics import classification_report


param_grid = {
    'C': [0.1, 1, 10, 100],  # Regularization parameter
    'gamma': ['scale', 'auto'],  # Kernel coefficient
}

# Create a GridSearchCV object for an SVM with RBF kernel
grid_search = GridSearchCV(SVC(kernel='rbf'), param_grid, cv=5, verbose=2, n_jobs=-1)

# Perform grid search
grid_search.fit(X_train, y_train)

# Print best parameters
print("Best parameters found: ", grid_search.best_params_)

# Use the best estimator for validation
best_svm = grid_search.best_estimator_

# Classification Metrics on Validation Set
val_predictions = best_svm.predict(X_val)
print("Classification Report on Validation Set:")
print(classification_report(y_val, val_predictions))