In [2]:
import torch
from transformers import LongformerTokenizerFast, LongformerForSequenceClassification
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import AdamW
from sklearn.model_selection import train_test_split

In [3]:
# Load the raw dataset.
df = pd.read_csv('../dataset.csv')

# Keep the description, cat1, cat2 and cat3 columns.
# .copy() detaches the selection from the frame it was sliced from, so the
# column assignment below does not raise pandas' SettingWithCopyWarning.
df = df[['description', 'cat1', 'cat2', 'cat3']].copy()

# Replace line breaks inside the descriptions with spaces.
# regex=False makes this a literal replacement whatever the pandas version.
df['description'] = df['description'].str.replace('\n', ' ', regex=False)

# Drop the cat2 and cat3 columns.
drop_columns = ['cat2', 'cat3']
df = df.drop(drop_columns, axis=1)

# Drop rows with a missing description or a missing cat1 value: a row without
# text cannot be tokenized and a row without a label cannot be used as a
# training target. .copy() again keeps the assignment below warning-free.
df = df.dropna(subset=['description', 'cat1']).copy()

# Encode cat1 as integer class ids.
label_encoder = LabelEncoder()
df['cat1_encoded'] = label_encoder.fit_transform(df['cat1'])



In [4]:


# Split descriptions and encoded labels, holding out 20% of the rows;
# random_state is fixed so the split is the same on every run.
(train_description,
 test_description,
 train_label,
 test_label) = train_test_split(
    df['description'],
    df['cat1_encoded'],
    random_state=42,
    test_size=0.2,
)

 

In [None]:
# Seed torch so that weight initialisation and DataLoader shuffling are
# repeatable across runs (same seed value as the train/test split).
torch.manual_seed(42)

# Number of distinct target classes.
num_classes = df['cat1_encoded'].nunique()

# Human-readable names for the class ids, stored in the model config so the
# saved model can map predictions back to the original cat1 values.
id2label = {idx: str(name) for idx, name in enumerate(label_encoder.classes_)}
label2id = {name: idx for idx, name in id2label.items()}

# Initialise the tokenizer and the model from the same checkpoint.
checkpoint = "papluca/xlm-roberta-base-language-detection"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# The checkpoint ships a classification head trained for its own label set.
# ignore_mismatched_sizes=True lets transformers replace that head with a
# freshly initialised one when num_classes differs, instead of failing on a
# size mismatch while loading the weights.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=num_classes,
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True,
)

# Tokenize the data (truncated to the tokenizer's maximum length and padded to
# the longest sequence of each split).
train_encodings = tokenizer(list(train_description), truncation=True, padding=True)
test_encodings = tokenizer(list(test_description), truncation=True, padding=True)

# Convert the labels to tensors; the dtype is explicit because the
# classification loss expects integer (long) class ids.
train_labels = torch.tensor(list(train_label), dtype=torch.long)
test_labels = torch.tensor(list(test_label), dtype=torch.long)

# Convert the encodings to tensors.
train_input_ids = torch.tensor(train_encodings.input_ids)
train_attention_mask = torch.tensor(train_encodings.attention_mask)
test_input_ids = torch.tensor(test_encodings.input_ids)
test_attention_mask = torch.tensor(test_encodings.attention_mask)

# Build the datasets and dataloaders; only the training data is shuffled.
train_dataset = TensorDataset(train_input_ids, train_attention_mask, train_labels)
test_dataset = TensorDataset(test_input_ids, test_attention_mask, test_labels)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

# Initialise the optimizer.
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training function
def train(model, dataloader, optimizer, epochs=1, log_every=1):
    """Train ``model`` on ``dataloader`` and return the mean loss of each epoch.

    Args:
        model: Module called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes a ``loss`` attribute.
        dataloader: Iterable of ``(input_ids, attention_mask, labels)`` tensor batches.
        optimizer: Optimizer that updates the parameters of ``model``.
        epochs: Number of passes over ``dataloader``.
        log_every: Print the batch loss every ``log_every`` batches. The default
            of 1 prints every batch; 0 or ``None`` disables printing.

    Returns:
        A list with one float per epoch: the mean batch loss of that epoch
        (``nan`` for an epoch in which the dataloader produced no batch).
    """
    # Batches are moved to wherever the model's parameters live, so the same
    # function works for a model on CPU or on an accelerator.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device('cpu')

    model.train()
    epoch_losses = []
    for epoch in range(epochs):
        running_loss = 0.0
        num_batches = 0
        for step, batch in enumerate(dataloader, start=1):
            input_ids, attention_mask, labels = (t.to(device) for t in batch)

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()

            batch_loss = loss.item()
            running_loss += batch_loss
            num_batches += 1
            if log_every and step % log_every == 0:
                print(f'Epoch {epoch+1}, Loss: {batch_loss}')

        # Guard against an empty dataloader instead of dividing by zero.
        epoch_losses.append(running_loss / num_batches if num_batches else float('nan'))
    return epoch_losses

# Train the model for one epoch on the training dataloader.
train(model=model, dataloader=train_loader, optimizer=optimizer, epochs=1)

#tester le modèle
def test(model, dataloader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask, labels = batch
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            _, predicted = torch.max(logits, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    print(f'Accuracy: {correct/total}')

# Evaluate the model on the test dataloader.
test(model=model, dataloader=test_loader)

# Save the model and its tokenizer side by side in the same directory.
output_dir = 'model'
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_dir)


