In [None]:
# Next approach: sentence embeddings from CamemBERT.
# Requirements: pip install transformers torch sentencepiece

In [None]:
from transformers import CamembertModel, CamembertTokenizer
import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd

# Train a difficulty classifier on CamemBERT sentence embeddings combined with
# four hand-crafted text features.
#
# Names this cell expects to already exist in the kernel:
#   - training_data: DataFrame with 'sentence' and 'difficulty' columns
#   - sentence_length, average_word_length, type_token_ratio,
#     syntactic_complexity: per-sentence feature functions

# Fixed seed so the train/test split (and the reported accuracy) is reproducible.
RANDOM_STATE = 42

# Compute additional features and add them to the DataFrame
training_data['sentence_length'] = training_data['sentence'].apply(sentence_length)
training_data['avg_word_length'] = training_data['sentence'].apply(average_word_length)
training_data['type_token_ratio'] = training_data['sentence'].apply(type_token_ratio)
training_data['syntactic_complexity'] = training_data['sentence'].apply(syntactic_complexity)
feature_columns = ['sentence_length', 'avg_word_length', 'type_token_ratio', 'syntactic_complexity']

# Load tokenizer and model for CamemBERT
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
model = CamembertModel.from_pretrained("camembert-base")

# Tokenize and encode sentences in the dataset.
# padding=True pads every sentence to the longest one, so the attention mask
# is required to tell real tokens apart from padding.
inputs = tokenizer(list(training_data['sentence']), padding=True, truncation=True, max_length=128, return_tensors="pt")
input_ids = inputs['input_ids']
attention_mask = inputs['attention_mask']

# Create a DataLoader for batch processing; ids and masks are batched together
batch_size = 8  # Adjust based on your system's capability
dataset = TensorDataset(input_ids, attention_mask)
dataloader = DataLoader(dataset, batch_size=batch_size)

# Generate embeddings in batches
embeddings = []
model.eval()
with torch.no_grad():
    for batch_input_ids, batch_attention_mask in dataloader:
        # Passing the mask stops the model from attending to pad tokens.
        outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask)
        token_states = outputs.last_hidden_state

        # Masked mean pooling: average only over real tokens. A plain
        # .mean(dim=1) would also average the pad positions, making each
        # sentence vector depend on the longest sentence in the dataset.
        # Any data scored later with this classifier must be pooled the same way.
        token_mask = batch_attention_mask.unsqueeze(-1).to(token_states.dtype)
        summed_states = (token_states * token_mask).sum(dim=1)
        token_counts = token_mask.sum(dim=1).clamp(min=1)  # guard against division by zero
        embeddings.append(summed_states / token_counts)
embeddings = torch.cat(embeddings, dim=0).numpy()

# Combine CamemBERT embeddings with additional features
combined_features = np.hstack((embeddings, training_data[feature_columns].values))

# Split data for training and testing (seeded for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    combined_features,
    training_data['difficulty'],
    test_size=0.2,
    random_state=RANDOM_STATE,
)

# Train logistic regression model
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X_train, y_train)

# Predict and evaluate on the held-out 20%
y_pred = logistic_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

In [None]:
# Embed the unlabelled sentences with CamemBERT, append the hand-crafted
# features, and predict difficulty with the trained classifier.
#
# Names this cell expects to already exist in the kernel:
#   - unlabelled_test_data: DataFrame with a 'sentence' column
#   - logistic_model: the fitted classifier
#   - sentence_length, average_word_length, type_token_ratio,
#     syntactic_complexity: per-sentence feature functions

# Initialize CamemBERT tokenizer
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")

# Tokenize and encode sentences in the dataset.
# padding=True pads to the longest sentence of THIS dataset, so the attention
# mask is needed to keep padding out of the embeddings.
inputs = tokenizer(list(unlabelled_test_data['sentence']), padding=True, truncation=True, max_length=128, return_tensors="pt")
input_ids = inputs['input_ids']
attention_mask = inputs['attention_mask']

# Create a DataLoader for batch processing; ids and masks are batched together
batch_size = 8  # Adjust based on your system's capability
dataset = TensorDataset(input_ids, attention_mask)
dataloader = DataLoader(dataset, batch_size=batch_size)

# Load the CamemBERT model
model = CamembertModel.from_pretrained("camembert-base")

# Generate embeddings in batches
embeddings = []
model.eval()
with torch.no_grad():
    for batch_input_ids, batch_attention_mask in dataloader:
        # Passing the mask stops the model from attending to pad tokens.
        outputs = model(input_ids=batch_input_ids, attention_mask=batch_attention_mask)
        token_states = outputs.last_hidden_state

        # Masked mean pooling: average only over real tokens, so the sentence
        # vector does not depend on how much padding this dataset needed.
        # NOTE: the classifier must have been trained on embeddings pooled the
        # same way, otherwise train and inference features will not match.
        token_mask = batch_attention_mask.unsqueeze(-1).to(token_states.dtype)
        summed_states = (token_states * token_mask).sum(dim=1)
        token_counts = token_mask.sum(dim=1).clamp(min=1)  # guard against division by zero
        embeddings.append(summed_states / token_counts)
embeddings = torch.cat(embeddings, dim=0).numpy()

# Compute additional features for the unlabelled test data
unlabelled_test_data['sentence_length'] = unlabelled_test_data['sentence'].apply(sentence_length)
unlabelled_test_data['avg_word_length'] = unlabelled_test_data['sentence'].apply(average_word_length)
unlabelled_test_data['type_token_ratio'] = unlabelled_test_data['sentence'].apply(type_token_ratio)
unlabelled_test_data['syntactic_complexity'] = unlabelled_test_data['sentence'].apply(syntactic_complexity)
feature_columns = ['sentence_length', 'avg_word_length', 'type_token_ratio', 'syntactic_complexity']

# Combine CamemBERT embeddings with additional features
# (embedding columns first, then the hand-crafted features)
combined_features = np.hstack((embeddings, unlabelled_test_data[feature_columns].values))

# Make predictions using the trained logistic regression model
predictions = logistic_model.predict(combined_features)

In [None]:
# Assemble the submission table: one row per test sentence, pairing its id
# with the difficulty predicted for it.
submission_columns = {}
submission_columns['id'] = unlabelled_test_data['id']
submission_columns['predicted_difficulty'] = predictions
output_df = pd.DataFrame(submission_columns)

# Write the table to disk without the DataFrame index column.
output_path = 'predicted_difficulties_camembert.csv'
output_df.to_csv(output_path, index=False)