In [10]:
from transformers import AutoTokenizer
import pandas as pd
import re

# Load the dataset
file_path = '/home/nathan/OneDrive/GitHub/Nvidia/Data/training_data.csv'
data = pd.read_csv(file_path)

# Data Cleaning
# Removing unnecessary characters while preserving French-specific characters
def clean_text(text):
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove digits and punctuation except for French-specific characters and apostrophes
    text = re.sub(r'[^a-zA-ZéèêëîïôœùûüçàâÉÈÊËÎÏÔŒÙÛÜÇÀÂ\'\s]', '', text)
    # Remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

data['cleaned_sentence'] = data['sentence'].apply(clean_text)

# Convert difficulty levels to numerical labels
# Define the correct order of difficulty levels
difficulty_order = ['A1', 'A2', 'B1', 'B2', 'C1', 'C2']

# Create a mapping from difficulty level to a numerical label
difficulty_mapping = {level: i for i, level in enumerate(difficulty_order)}

# Apply the mapping to the dataset
data['difficulty_label'] = data['difficulty'].map(difficulty_mapping)

# Check the first few rows to confirm the labels are correctly assigned
data[['sentence', 'difficulty', 'difficulty_label']].head()

# Load the tokenizer for a French BERT model
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

# Tokenize the sentences
tokenized_data = tokenizer(list(data['cleaned_sentence']), padding=True, truncation=True, max_length=128)

# Prepare the dataset for PyTorch
import torch
from torch.utils.data import Dataset, DataLoader

class FrenchSentenceDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Create the dataset
dataset = FrenchSentenceDataset(tokenized_data, list(data['difficulty_label']))


In [11]:
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np

# Split the dataset into training and validation sets
train_dataset, val_dataset = train_test_split(dataset, test_size=0.2)


In [2]:
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np

# Split the dataset into training and validation sets
train_dataset, val_dataset = train_test_split(dataset, test_size=0.2)

# Load the pre-trained BERT model for sequence classification
model = AutoModelForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=len(difficulty_mapping))

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory for model checkpoints
    evaluation_strategy="epoch",     # evaluation is done at the end of each epoch
    learning_rate=2e-5,              # learning rate
    per_device_train_batch_size=8,   # batch size for training
    per_device_eval_batch_size=16,   # batch size for evaluation
    num_train_epochs=3,              # number of training epochs
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,                # log training information every 10 steps
)

# Initialize the Trainer
trainer = Trainer(
    model=model,                         # the instantiated model to be trained
    args=training_args,                  # training arguments
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset             # evaluation dataset
)

# Train the model
trainer.train()

# Evaluate the model
results = trainer.evaluate()

# Save the model
model.save_pretrained('./First_iteration_model')


2023-11-19 20:19:32.253373: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-19 20:19:32.254185: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-19 20:19:32.372687: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-19 20:19:32.580198: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Some weights of BertForSequenceClassification were no

  0%|          | 0/1440 [00:00<?, ?it/s]

{'loss': 1.8081, 'learning_rate': 1.9861111111111114e-05, 'epoch': 0.02}
{'loss': 1.7888, 'learning_rate': 1.9722222222222224e-05, 'epoch': 0.04}
{'loss': 1.7968, 'learning_rate': 1.9583333333333333e-05, 'epoch': 0.06}
{'loss': 1.7979, 'learning_rate': 1.9444444444444445e-05, 'epoch': 0.08}
{'loss': 1.7324, 'learning_rate': 1.9305555555555558e-05, 'epoch': 0.1}
{'loss': 1.6102, 'learning_rate': 1.916666666666667e-05, 'epoch': 0.12}
{'loss': 1.6018, 'learning_rate': 1.902777777777778e-05, 'epoch': 0.15}
{'loss': 1.6732, 'learning_rate': 1.888888888888889e-05, 'epoch': 0.17}
{'loss': 1.6228, 'learning_rate': 1.8750000000000002e-05, 'epoch': 0.19}
{'loss': 1.5646, 'learning_rate': 1.8611111111111114e-05, 'epoch': 0.21}
{'loss': 1.5125, 'learning_rate': 1.8472222222222224e-05, 'epoch': 0.23}
{'loss': 1.5131, 'learning_rate': 1.8333333333333333e-05, 'epoch': 0.25}
{'loss': 1.4615, 'learning_rate': 1.8194444444444445e-05, 'epoch': 0.27}
{'loss': 1.4323, 'learning_rate': 1.8055555555555558e-0

  0%|          | 0/60 [00:00<?, ?it/s]

{'eval_loss': 1.1282073259353638, 'eval_runtime': 151.5092, 'eval_samples_per_second': 6.336, 'eval_steps_per_second': 0.396, 'epoch': 1.0}
{'loss': 1.0475, 'learning_rate': 1.3194444444444446e-05, 'epoch': 1.02}
{'loss': 1.015, 'learning_rate': 1.3055555555555557e-05, 'epoch': 1.04}
{'loss': 0.9803, 'learning_rate': 1.2916666666666668e-05, 'epoch': 1.06}
{'loss': 1.3657, 'learning_rate': 1.2777777777777777e-05, 'epoch': 1.08}
{'loss': 1.0308, 'learning_rate': 1.263888888888889e-05, 'epoch': 1.1}
{'loss': 1.102, 'learning_rate': 1.25e-05, 'epoch': 1.12}
{'loss': 1.1724, 'learning_rate': 1.2361111111111113e-05, 'epoch': 1.15}
{'loss': 0.9752, 'learning_rate': 1.2222222222222224e-05, 'epoch': 1.17}
{'loss': 1.1181, 'learning_rate': 1.2083333333333333e-05, 'epoch': 1.19}
{'loss': 1.1298, 'learning_rate': 1.1944444444444444e-05, 'epoch': 1.21}
{'loss': 0.9704, 'learning_rate': 1.1805555555555557e-05, 'epoch': 1.23}
{'loss': 0.9489, 'learning_rate': 1.1666666666666668e-05, 'epoch': 1.25}
{'

  0%|          | 0/60 [00:00<?, ?it/s]

{'eval_loss': 1.0847216844558716, 'eval_runtime': 149.3287, 'eval_samples_per_second': 6.429, 'eval_steps_per_second': 0.402, 'epoch': 2.0}
{'loss': 0.7691, 'learning_rate': 6.5277777777777784e-06, 'epoch': 2.02}
{'loss': 0.7937, 'learning_rate': 6.3888888888888885e-06, 'epoch': 2.04}
{'loss': 0.9421, 'learning_rate': 6.25e-06, 'epoch': 2.06}
{'loss': 0.8325, 'learning_rate': 6.111111111111112e-06, 'epoch': 2.08}
{'loss': 0.8629, 'learning_rate': 5.972222222222222e-06, 'epoch': 2.1}
{'loss': 0.7558, 'learning_rate': 5.833333333333334e-06, 'epoch': 2.12}
{'loss': 0.8594, 'learning_rate': 5.694444444444445e-06, 'epoch': 2.15}
{'loss': 0.804, 'learning_rate': 5.555555555555557e-06, 'epoch': 2.17}
{'loss': 0.7885, 'learning_rate': 5.416666666666667e-06, 'epoch': 2.19}
{'loss': 0.8046, 'learning_rate': 5.2777777777777785e-06, 'epoch': 2.21}
{'loss': 0.7287, 'learning_rate': 5.138888888888889e-06, 'epoch': 2.23}
{'loss': 0.7767, 'learning_rate': 5e-06, 'epoch': 2.25}
{'loss': 0.8117, 'learni

  0%|          | 0/60 [00:00<?, ?it/s]

{'eval_loss': 1.1365675926208496, 'eval_runtime': 146.1611, 'eval_samples_per_second': 6.568, 'eval_steps_per_second': 0.411, 'epoch': 3.0}
{'train_runtime': 7840.2775, 'train_samples_per_second': 1.469, 'train_steps_per_second': 0.184, 'train_loss': 1.0645485974020428, 'epoch': 3.0}


  0%|          | 0/60 [00:00<?, ?it/s]

In [14]:
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# Assuming val_dataset is a list of dictionaries or a PyTorch Dataset
# Extract labels from the val_dataset
true_labels = [label['labels'].item() for label in val_dataset]

# Predictions on the validation set
predictions = trainer.predict(val_dataset)

# Convert predictions to numpy for evaluation
preds = np.argmax(predictions.predictions, axis=-1)

# Calculate accuracy
accuracy = accuracy_score(true_labels, preds)

# Detailed classification report
# Adjust target_names as per your difficulty_mapping
report = classification_report(true_labels, preds, target_names=difficulty_mapping.keys())

print("Accuracy:", accuracy)
print("Classification Report:\n", report)


  0%|          | 0/120 [00:00<?, ?it/s]

Accuracy: 0.6875
Classification Report:
               precision    recall  f1-score   support

          A1       0.82      0.81      0.82       166
          A2       0.60      0.77      0.68       158
          B1       0.73      0.64      0.68       166
          B2       0.58      0.75      0.65       153
          C1       0.65      0.59      0.62       152
          C2       0.82      0.57      0.67       165

    accuracy                           0.69       960
   macro avg       0.70      0.69      0.69       960
weighted avg       0.70      0.69      0.69       960



In [15]:
import pandas as pd
import re
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer
import torch
from torch.utils.data import Dataset
import numpy as np

# Define the data cleaning function
def clean_text(text):
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'[^a-zA-ZéèêëîïôœùûüçàâÉÈÊËÎÏÔŒÙÛÜÇÀÂ\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Load the test dataset
test_file_path = '/home/nathan/OneDrive/GitHub/Nvidia/Data/unlabelled_test_data.csv'  # Replace with the correct path
test_data = pd.read_csv(test_file_path)

# Apply data cleaning
test_data['cleaned_sentence'] = test_data['sentence'].apply(clean_text)

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")

# Tokenize the test sentences
test_tokenized = tokenizer(list(test_data['cleaned_sentence']), padding=True, truncation=True, max_length=128)

# Prepare the test dataset for PyTorch
class TestDataset(Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        return item

    def __len__(self):
        return len(self.encodings['input_ids'])

test_dataset = TestDataset(test_tokenized)

# Load the saved model
saved_model_directory = './First_iteration_model'  # Replace with the correct path
model = AutoModelForSequenceClassification.from_pretrained(saved_model_directory)

# Initialize the Trainer with the loaded model
trainer = Trainer(model=model)

# Make predictions on the test dataset
test_predictions = trainer.predict(test_dataset)

# Convert predictions to numpy
test_preds = np.argmax(test_predictions.predictions, axis=-1)

# Assuming difficulty_mapping is defined and maps difficulties to numeric labels
# Convert numeric predictions back to difficulty labels
predicted_difficulties = [list(difficulty_mapping.keys())[list(difficulty_mapping.values()).index(pred)] for pred in test_preds]

# Add predictions to the test data DataFrame
test_data['difficulty'] = predicted_difficulties

# Display the first few rows of the test data with predictions
test_data[['id', 'difficulty']].head()


  0%|          | 0/150 [00:00<?, ?it/s]

Unnamed: 0,id,difficulty
0,0,C2
1,1,A2
2,2,B2
3,3,B1
4,4,C2


In [16]:
test_data[['id', 'difficulty']].to_csv('/home/nathan/OneDrive/GitHub/Nvidia/Data/Nvidia_CamemBERT_Enhanched_without_hyperparam.csv', index=False)

## Hyperparameter

In [17]:
import optuna
from transformers import TrainingArguments, Trainer

def model_init():
    return AutoModelForSequenceClassification.from_pretrained("bert-base-multilingual-cased", num_labels=len(difficulty_mapping))

def objective(trial):
    # Hyperparameters to be tuned by Optuna
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    num_train_epochs = trial.suggest_int("num_train_epochs", 2, 5)
    per_device_train_batch_size = trial.suggest_categorical("per_device_train_batch_size", [8, 16, 32])

    # Define training arguments
    training_args = TrainingArguments(
        output_dir='./results',
        evaluation_strategy="epoch",
        learning_rate=learning_rate,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        weight_decay=0.01,
        logging_dir='./logs',
        logging_steps=10,
    )

    # Initialize the Trainer
    trainer = Trainer(
        model_init=model_init,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
    )

    # Train the model
    trainer.train()

    # Evaluate the model
    results = trainer.evaluate()
    
    # Return the evaluation metric of interest, e.g., loss
    return results["eval_loss"]

# Create a study object and optimize the objective function
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=10)

# Best hyperparameters
best_trial = study.best_trial
print("Best trial:", best_trial.params)


[I 2023-11-19 23:04:29,536] A new study created in memory with name: no-name-f459bfe2-b331-40f7-9640-72dffde49587
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/360 [00:00<?, ?it/s]

[W 2023-11-19 23:04:58,182] Trial 0 failed with parameters: {'learning_rate': 1.4282768944114872e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 32} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/home/nathan/.local/lib/python3.10/site-packages/optuna/study/_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "/tmp/ipykernel_184386/3964014588.py", line 34, in objective
    trainer.train()
  File "/home/nathan/.local/lib/python3.10/site-packages/transformers/trainer.py", line 1591, in train
    return inner_training_loop(
  File "/home/nathan/.local/lib/python3.10/site-packages/transformers/trainer.py", line 1892, in _inner_training_loop
    tr_loss_step = self.training_step(model, inputs)
  File "/home/nathan/.local/lib/python3.10/site-packages/transformers/trainer.py", line 2776, in training_step
    loss = self.compute_loss(model, inputs)
  File "/home/nathan/.local/lib/python3.10/site-packages/tra

KeyboardInterrupt: 