In [1]:
import pandas as pd
from tqdm import tqdm

In [2]:
# Read the CSV file
training_data = pd.read_csv("./Dataset_upgrade/training_dataUP.csv", index_col=0)
training_data = training_data[['sentence', 'difficulty']]

training_data.head()

Unnamed: 0_level_0,sentence,difficulty
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Les coûts kilométriques réels peuvent diverger...,4
1,"Le bleu, c'est ma couleur préférée mais je n'a...",0
2,Le test de niveau en français est sur le site ...,0
3,Est-ce que ton mari est aussi de Boston?,0
4,"Dans les écoles de commerce, dans les couloirs...",2


In [3]:
import torch
from torch.utils.data import DataLoader, Dataset
import transformers
from transformers import CamembertForSequenceClassification, CamembertTokenizer, AutoModelForSequenceClassification, AutoTokenizer
from torch.optim import Adam
import torch.nn.functional as F

In [4]:
from sentence_transformers import SentenceTransformer

model_name = "dangvantuan/sentence-camembert-large"
#model = SentenceTransformer(model_name)

In [5]:
sentences = training_data['sentence'].tolist()
labels = training_data['difficulty'].tolist()

In [6]:
# Tokenizer and model initialization
tokenizer = CamembertTokenizer.from_pretrained(model_name)
model = CamembertForSequenceClassification.from_pretrained(model_name, num_labels=6)
tokens = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at dangvantuan/sentence-camembert-large and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [7]:
# Recovery of tensors required for BatchEncoding
input_ids = tokens['input_ids']
attention_mask = tokens['attention_mask']

# Create a training dataset
train_dataset = torch.utils.data.TensorDataset(input_ids, attention_mask, torch.tensor(labels))
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=16, shuffle=True)

# Define the loss function
criterion = torch.nn.CrossEntropyLoss()

# Define the optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

# Model training
for epoch in range(6):
    model.train()
    total_loss = 0

    for batch in tqdm(train_loader, desc=f"Époque {epoch+1}/{6}"):
        input_ids_batch, attention_mask_batch, labels_batch = batch

        inputs = {'input_ids': input_ids_batch, 'attention_mask': attention_mask_batch, 'labels': labels_batch}
           
        outputs = model(**inputs)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

print(f"Loss moyenne pour l'époque: {total_loss / len(train_loader)}")

Époque 1/6: 100%|██████████| 300/300 [3:13:39<00:00, 38.73s/it]  


Loss moyenne pour l'époque: 1.2588747866948446


Époque 2/6: 100%|██████████| 300/300 [3:11:17<00:00, 38.26s/it]  


Loss moyenne pour l'époque: 0.8929875550667444


Époque 3/6: 100%|██████████| 300/300 [3:08:05<00:00, 37.62s/it]  


Loss moyenne pour l'époque: 0.6508233609795571


Époque 4/6: 100%|██████████| 300/300 [3:26:02<00:00, 41.21s/it]   


Loss moyenne pour l'époque: 0.4309373359878858


Époque 5/6: 100%|██████████| 300/300 [3:09:42<00:00, 37.94s/it]  


Loss moyenne pour l'époque: 0.31548847556114196


Époque 6/6: 100%|██████████| 300/300 [3:09:30<00:00, 37.90s/it]  

Loss moyenne pour l'époque: 0.18432556837797165





torch.save(model.state_dict(), "save4/modele.pth")

# Après l'entraînement de chaque époque
torch.save({
    'epoch': epoch,
    'model_state_dict': model.state_dict(),
    'optimizer_state_dict': optimizer.state_dict(),
    'loss': loss,  # Vous pouvez sauvegarder la perte moyenne ou la dernière perte enregistrée
    # Ajoutez d'autres métriques si nécessaire
}, "save4/complet.pth")


UTILISER LE MODEL POST-TRAIN











In [8]:
from transformers import CamembertForSequenceClassification, CamembertTokenizer
import torch

# Charger le tokenizer et le modèle pré-entraîné
model_name = "dangvantuan/sentence-camembert-large"

tokenizer = CamembertTokenizer.from_pretrained("dangvantuan/sentence-camembert-large")
model = CamembertForSequenceClassification.from_pretrained("dangvantuan/sentence-camembert-large", num_labels=6)

# Charger l'état sauvegardé de votre modèle post-entraîné
model.load_state_dict(torch.load("model_only.pth"))
#model.load_state_dict(torch.load("complet.pth")['model_state_dict'])

# Mettre le modèle en mode évaluation
model.eval()

Some weights of CamembertForSequenceClassification were not initialized from the model checkpoint at dangvantuan/sentence-camembert-large and are newly initialized: ['classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight', 'classifier.out_proj.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


CamembertForSequenceClassification(
  (roberta): CamembertModel(
    (embeddings): CamembertEmbeddings(
      (word_embeddings): Embedding(32005, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): CamembertEncoder(
      (layer): ModuleList(
        (0-23): 24 x CamembertLayer(
          (attention): CamembertAttention(
            (self): CamembertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): CamembertSelfOutput(
              (dense): Linear(in_features=1024, out_features=10

In [9]:
from sklearn.metrics import accuracy_score

# Supposons que sentences_test et labels_test sont vos ensembles de test
sentences_test = training_data['sentence'].tolist()

labels_test = training_data['difficulty'].tolist()

# Prédiction et évaluation
predicted_labels = []

for sentence in sentences_test:
    tokens = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**tokens)
        predicted_label = torch.argmax(outputs.logits, dim=1).item()
        predicted_labels.append(predicted_label)

# Calcul de l'accuracy
accuracy = accuracy_score(labels_test, predicted_labels)
print("Accuracy du modèle :", accuracy)


Accuracy du modèle : 0.9477083333333334


In [10]:
test_data = pd.read_csv("../Dataset_upgrade/unlabelled_test_dataUP.csv", index_col=0)
test_data2 = pd.read_csv("../Dataset/unlabelled_test_data.csv", index_col=0)

In [11]:
import pandas as pd

sentences_test = test_data['sentence'].tolist()

# Prédictions
model.eval()  
predicted_labels = []

for sentence in sentences_test:
    tokens = tokenizer(sentence, return_tensors="pt", padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**tokens)
        predicted_label = torch.argmax(outputs.logits, dim=1).item()
        predicted_labels.append(predicted_label)

# prediction
test_data2['difficulty'] = predicted_labels

print(test_data2.head())


                                             sentence  difficulty
id                                                               
0   Nous dûmes nous excuser des propos que nous eû...           5
1   Vous ne pouvez pas savoir le plaisir que j'ai ...           2
2   Et, paradoxalement, boire froid n'est pas la b...           2
3   Ce n'est pas étonnant, car c'est une saison my...           1
4   Le corps de Golo lui-même, d'une essence aussi...           5


In [12]:
test_data2 = test_data2.drop(columns=['sentence'])
difficulty_mapping = {
    0: 'A1',
    1: 'A2',
    2: 'B1',
    3: 'B2',
    4: 'C1',
    5: 'C2'
}

test_data2['difficulty'] = test_data2['difficulty'].map(difficulty_mapping)

In [13]:
print(test_data2.head())


   difficulty
id           
0          C2
1          B1
2          B1
3          A2
4          C2


In [15]:
test_data2.shape

(1200, 1)

In [14]:
test_data2.to_csv('philippe.csv', index=True)
