In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (


In [2]:
# Specify the file path
file_path_test = "unlabelled_test_data.csv"
file_path_training = "training_data.csv"

# Read the CSV file
unlabelled_test_data = pd.read_csv(file_path_test)
training_data = pd.read_csv(file_path_training)

In [3]:
import string
import spacy
# Load the French spaCy model

nlp = spacy.load("fr_core_news_sm")

def preprocess_text(text):

    """

    Preprocess the text by:

    - Lowercasing

    - Removing punctuation

    - Lemmatizing (using French spaCy model)

    """

    # Lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Lemmatization using spaCy
    doc = nlp(text)
    lemmatized_text = " ".join([token.lemma_ for token in doc])

    return lemmatized_text

# Apply the preprocessing to the 'sentence' column

training_data['processed_sentence'] = training_data['sentence'].apply(preprocess_text)

# Display the first few rows of the updated dataframe
training_data.head()

Unnamed: 0,id,sentence,difficulty,processed_sentence
0,0,Les coûts kilométriques réels peuvent diverger...,C1,le coût kilométrique réel pouvoir diverger sen...
1,1,"Le bleu, c'est ma couleur préférée mais je n'a...",A1,le bleu cest mon couleur préférer mais je naim...
2,2,Le test de niveau en français est sur le site ...,A1,le test de niveau en français être sur le site...
3,3,Est-ce que ton mari est aussi de Boston?,A1,estce que ton mari être aussi de boston
4,4,"Dans les écoles de commerce, dans les couloirs...",B1,dans le école de commerce dans le couloir de p...


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Using TF-IDF Vectorizer
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(training_data['sentence'])
y = training_data['difficulty']

In [5]:
# Splitting the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)


In [6]:
from sklearn.linear_model import LogisticRegression

# Initialize and train the Logistic Regression model
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X_train, y_train)

In [7]:
from sklearn.metrics import classification_report, accuracy_score

# Predict on the test set
y_pred = logistic_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(classification_rep)

Accuracy: 0.4625
Classification Report:
              precision    recall  f1-score   support

          A1       0.51      0.68      0.58       166
          A2       0.37      0.33      0.35       158
          B1       0.44      0.31      0.37       166
          B2       0.46      0.43      0.44       153
          C1       0.44      0.44      0.44       152
          C2       0.51      0.57      0.54       165

    accuracy                           0.46       960
   macro avg       0.46      0.46      0.45       960
weighted avg       0.46      0.46      0.45       960



In [8]:
# Step 1: Predict the difficulty levels
X_unlabelled = vectorizer.transform(unlabelled_test_data['sentence'])
predicted_difficulties = logistic_model.predict(X_unlabelled)

# Step 2: Create a new DataFrame
predictions_df = pd.DataFrame({
    'id': unlabelled_test_data['id'],
    'predicted_difficulty': predicted_difficulties
})

# Step 3: Export to CSV
#output_file_path = 'predicted_difficulties.csv'  
#predictions_df.to_csv(output_file_path, index=False)

In [9]:
#I want to extract some of the features from the training data so I can use them to train my model
#training_data['type_token_ratio'] - This ratio compares the number of unique words (types) to the total number of words (tokens) in a text.
#training_data['syntactic_complexity'] - counts commas as a proxy for complexity.

In [10]:
import numpy as np
import nltk
nltk.download('punkt')

def sentence_length(sentence):
    return len(sentence.split())

def average_word_length(sentence):
    words = sentence.split()
    return np.mean([len(word) for word in words]) if words else 0

def type_token_ratio(sentence):
    words = sentence.split()
    return len(set(words)) / len(words) if words else 0

def syntactic_complexity(sentence):
    return sentence.count(',')

def pos_tag_distribution(sentence):
    doc = nlp(sentence)
    pos_counts = {}
    for token in doc:
        pos = token.pos_
        pos_counts[pos] = pos_counts.get(pos, 0) + 1
    return pos_counts

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/oanaalexandrablanc/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [11]:
training_data['sentence_length'] = training_data['sentence'].apply(sentence_length)
training_data['avg_word_length'] = training_data['sentence'].apply(average_word_length)
training_data['type_token_ratio'] = training_data['sentence'].apply(type_token_ratio)
training_data['syntactic_complexity'] = training_data['sentence'].apply(syntactic_complexity)
training_data['pos_tags'] = training_data['sentence'].apply(pos_tag_distribution)

In [12]:
unique_pos_tags = set()
for pos_tags_dict in training_data['pos_tags']:
    unique_pos_tags.update(pos_tags_dict.keys())

print(unique_pos_tags)

{'NUM', 'X', 'PROPN', 'VERB', 'SCONJ', 'INTJ', 'PRON', 'ADP', 'AUX', 'DET', 'ADV', 'NOUN', 'PUNCT', 'ADJ', 'CCONJ'}


In [13]:
# Initialize columns for each POS tag with default value 0
for tag in ['PUNCT', 'ADV', 'CCONJ', 'X', 'AUX', 'DET', 'PRON', 'NUM', 'NOUN', 'INTJ', 'ADP', 'ADJ', 'VERB', 'PROPN', 'SCONJ']:
    training_data[tag] = 0

# Populate the columns with counts
for index, row in training_data.iterrows():
    for tag, count in row['pos_tags'].items():
        if tag in training_data.columns:
            training_data.at[index, tag] = count

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.sparse as sp
from sklearn.preprocessing import PolynomialFeatures

# Initialize the TF-IDF Vectorizer with n-gram range
vectorizer = TfidfVectorizer(ngram_range=(1, 3), max_features=5000)

# Fit and transform the text data using the updated TF-IDF vectorizer
X_text_features = vectorizer.fit_transform(training_data['sentence'])

# Existing code for your linguistic features
X_linguistic_features = sp.csr_matrix(training_data[['sentence_length', 'avg_word_length', 'type_token_ratio', 'syntactic_complexity', 'PUNCT', 'ADV', 'CCONJ', 'X', 'AUX', 'DET', 'PRON', 'NUM', 'NOUN', 'INTJ', 'ADP', 'ADJ', 'VERB', 'PROPN', 'SCONJ']])

# Combine the updated TF-IDF features with the linguistic features
X_combined = sp.hstack([X_text_features, X_linguistic_features])

# Create interaction terms
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_interactions = poly.fit_transform(X_combined.toarray())  # Convert to dense array for PolynomialFeatures

# Now proceed with splitting the data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_interactions, training_data['difficulty'], test_size=0.2)

: 

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Split the feature set with interaction terms into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_interactions, y, test_size=0.2, random_state=42)

# Train the Logistic Regression model using the new feature set
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Predict on the test set
y_pred = logistic_model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_rep = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(classification_rep)


Accuracy: 0.4895833333333333
              precision    recall  f1-score   support

          A1       0.58      0.77      0.66       166
          A2       0.44      0.43      0.43       158
          B1       0.46      0.40      0.43       166
          B2       0.43      0.42      0.42       153
          C1       0.45      0.47      0.46       152
          C2       0.54      0.45      0.49       165

    accuracy                           0.49       960
   macro avg       0.48      0.49      0.48       960
weighted avg       0.49      0.49      0.48       960



In [None]:
#extract the same features for the unlabelled test data

In [None]:
unlabelled_test_data['sentence_length'] = unlabelled_test_data['sentence'].apply(sentence_length)
unlabelled_test_data['avg_word_length'] = unlabelled_test_data['sentence'].apply(average_word_length)
unlabelled_test_data['type_token_ratio'] = unlabelled_test_data['sentence'].apply(type_token_ratio)
unlabelled_test_data['syntactic_complexity'] = unlabelled_test_data['sentence'].apply(syntactic_complexity)
unlabelled_test_data['pos_tags'] = unlabelled_test_data['sentence'].apply(pos_tag_distribution)

# Initialize columns for each POS tag with default value 0
for tag in ['PUNCT', 'ADV', 'CCONJ', 'X', 'AUX', 'DET', 'PRON', 'NUM', 'NOUN', 'INTJ', 'ADP', 'ADJ', 'VERB', 'PROPN', 'SCONJ']:
    unlabelled_test_data[tag] = 0

# Populate the columns with counts
for index, row in unlabelled_test_data.iterrows():
    for tag, count in row['pos_tags'].items():
        if tag in unlabelled_test_data.columns:
            unlabelled_test_data.at[index, tag] = count

# Transform the sentences using the same TF-IDF vectorizer
X_unlabelled_text = vectorizer.transform(unlabelled_test_data['sentence'])
X_unlabelled_linguistic = sp.csr_matrix(unlabelled_test_data[['sentence_length', 'avg_word_length', 'type_token_ratio', 'syntactic_complexity', 'PUNCT', 'ADV', 'CCONJ', 'X', 'AUX', 'DET', 'PRON', 'NUM', 'NOUN', 'INTJ', 'ADP', 'ADJ', 'VERB', 'PROPN', 'SCONJ']])
X_unlabelled_combined = sp.hstack([X_unlabelled_text, X_unlabelled_linguistic])

# Step 3: Predict the difficulty levels
predicted_difficulties = logistic_model.predict(X_unlabelled_combined)

# Step 4: Create a new DataFrame with predictions
predictions_df = pd.DataFrame({
    'id': unlabelled_test_data['id'],
    'predicted_difficulty': predicted_difficulties
})

# Step 5: Export to CSV
#predictions_df.to_csv('predicted_difficulties2.csv', index=False)

In [None]:
#Now I'll try text embeddings
#CamemBERT -> pip install transformers torch, pip install sentencepiece

In [None]:
from transformers import CamembertModel, CamembertTokenizer
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import pandas as pd

# Load tokenizer and model for CamemBERT
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
model = CamembertModel.from_pretrained("camembert-base")

# Tokenize and encode sentences in the dataset
inputs = tokenizer(list(training_data['sentence']), padding=True, truncation=True, max_length=128, return_tensors="pt")
input_ids = inputs['input_ids']

# Create a DataLoader for batch processing
batch_size = 8  # Adjust based on your system's capability
dataset = TensorDataset(input_ids)
dataloader = DataLoader(dataset, batch_size=batch_size)

# Generate embeddings in batches
embeddings = []
model.eval()
with torch.no_grad():
    for batch in dataloader:
        input_ids = batch[0]
        outputs = model(input_ids)
        batch_embeddings = outputs.last_hidden_state.mean(dim=1)
        embeddings.append(batch_embeddings)
embeddings = torch.cat(embeddings, dim=0).numpy()

# Dimensionality Reduction on Embeddings
pca = PCA(n_components=50)  # Adjust n_components based on your dataset
reduced_embeddings = pca.fit_transform(embeddings)

# Normalize linguistic features
scaler = StandardScaler()
normalized_linguistic_features = scaler.fit_transform(training_data[['sentence_length', 'avg_word_length', 'type_token_ratio', 'syntactic_complexity', 'PUNCT', 'ADV', 'CCONJ', 'X', 'AUX', 'DET', 'PRON', 'NUM', 'NOUN', 'INTJ', 'ADP', 'ADJ', 'VERB', 'PROPN', 'SCONJ']].values)

# Combine features
combined_features = np.hstack((reduced_embeddings, normalized_linguistic_features))

# Split data for training and testing
X_train, X_test, y_train, y_test = train_test_split(combined_features, training_data['difficulty'], test_size=0.2)

# Compute class weights
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
# Map the computed weights to the corresponding class labels
class_weights_dict = dict(zip(np.unique(y_train), class_weights))

# Train logistic regression model with class weights
logistic_model = LogisticRegression(max_iter=1000, class_weight=class_weights_dict)
logistic_model.fit(X_train, y_train)

# Predict and evaluate
y_pred = logistic_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Accuracy: 0.49895833333333334


In [None]:
from sklearn.model_selection import GridSearchCV, cross_val_score

# Hyperparameter Tuning
parameters = {'C': [0.01, 0.1, 1, 10, 100], 
              'penalty': ['l1', 'l2'],
              'solver': ['liblinear']}
logistic_model = LogisticRegression(max_iter=1000, class_weight=class_weights_dict)
clf = GridSearchCV(logistic_model, parameters, cv=5, error_score='raise')
clf.fit(X_train, y_train)

# Best hyperparameters
print("Best Hyperparameters:", clf.best_params_)

# Cross-Validation
cross_val_scores = cross_val_score(clf, combined_features, training_data['difficulty'], cv=5)
print("Cross-Validation Scores:", cross_val_scores)

Best Hyperparameters: {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}
Cross-Validation Scores: [0.50729167 0.48854167 0.47604167 0.48125    0.47291667]


In [None]:
#The cross-validation scores indicate that the model's performance is not yet optimal, 
#as the accuracy scores across the different folds are hovering around 48% to 51%.

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform

# Define the hyperparameter search space
param_distributions = {
    'C': uniform(loc=0, scale=4),
    'penalty': ['l2']
}

# Setup the randomized search with cross-validation
random_search = RandomizedSearchCV(
    LogisticRegression(max_iter=1000, class_weight=class_weights_dict),
    param_distributions=param_distributions,
    n_iter=100,
    cv=5,
    verbose=2,
    random_state=42,
    n_jobs=-1
)

# Run the hyperparameter search
random_search.fit(X_train, y_train)

# Print the best parameters and best score
print("Best Hyperparameters:", random_search.best_params_)
print("Best Score:", random_search.best_score_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core.computation.check import NUMEXPR_INSTALLED
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (
  from pandas.core import (


[CV] END .....................C=1.49816047538945, penalty=l2; total time=   4.9s
[CV] END .....................C=1.49816047538945, penalty=l2; total time=   5.0s
[CV] END .....................C=1.49816047538945, penalty=l2; total time=   5.0s
[CV] END .....................C=1.49816047538945, penalty=l2; total time=   5.1s
[CV] END .....................C=1.49816047538945, penalty=l2; total time=   5.0s
[CV] END ...................C=3.8028572256396647, penalty=l2; total time=   5.4s
[CV] END ...................C=3.8028572256396647, penalty=l2; total time=   5.5s
[CV] END ...................C=3.8028572256396647, penalty=l2; total time=   5.5s
[CV] END ...................C=2.9279757672456204, penalty=l2; total time=   1.4s
[CV] END ...................C=2.9279757672456204, penalty=l2; total time=   1.5s
[CV] END ...................C=2.9279757672456204, penalty=l2; total time=   1.4s
[CV] END ...................C=3.8028572256396647, penalty=l2; total time=   1.7s
[CV] END ...................