In [7]:
import pandas as pd

df = pd.read_csv('../dataset.csv')

#garder colonne description, cat1

df = df[['description', 'cat1']]

#supprimer le manquant dans la colonne description

df = df.dropna(subset=['description'])

#supprimer les retours à la ligne dans la colonne description

df['description'] = df['description'].str.replace('\n', ' ')

#balacing classes

from imblearn.over_sampling import RandomOverSampler

oversampler = RandomOverSampler(random_state=0)

#fit random oversampler to data
X = df['description']
y = df['cat1']

X_resampled, y_resampled = oversampler.fit_resample(df, y)

#resampled data
df = pd.DataFrame(X_resampled, columns=['description', 'cat1'])

import matplotlib.pyplot as plt
import seaborn as sns

categories_count = df['cat1'].value_counts()

categories = categories_count.index

fig = plt.figure(figsize=(12, 8))

ax = fig.add_subplot(111)

sns.barplot(x=categories_count, y=categories, ax=ax)

for a,p in enumerate(ax.patches):
    ax.annotate(f'{categories_count[a]}', (p.get_width(), a), fontsize=12)
    
plt.xlabel('Nombre de descriptions', fontsize=16)
plt.ylabel('Catégorie', fontsize=16)

#train and test set

from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df, test_size=0.2, shuffle=True  ,random_state=0)

print("Train set shape: ", df_train.shape)
print("Test set shape: ", df_test.shape)

encoded_dict = {
    "Festival": 0,
    "Exposition": 1,
    "Théatre": 2,
    "Détente": 3,
    "Action": 4,
    "Marché": 5,
    "Environnement": 6,
    "Atelier": 7,
    "Fête": 8,
    "Sport": 9,
    "Balade": 10,
    "Visite": 11,
    "Brocante": 12,
    "Spectacle": 13,
    "Conférence": 14,
    "Danse": 15,
    "Jeu": 16,
    "Famille": 17,
    "Concert": 18,
    "Culture": 19
}

df_train['cat1'] = df_train['cat1'].map(encoded_dict)
df_test['cat1'] = df_test['cat1'].map(encoded_dict)

df_train.head()

#charger model et tokenizer

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from torch.utils.data import DataLoader

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-multilingual-cased")
model = AutoModelForSequenceClassification.from_pretrained("distilbert/distilbert-base-multilingual-cased")

# Tokeniser les données
train_encodings = tokenizer(df_train['description'].tolist(), truncation=True, padding=True)
test_encodings = tokenizer(df_test['description'].tolist(), truncation=True, padding=True)

# Convertir les données tokenisées en un dataset PyTorch
class TextDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels 

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings.input_ids)

train_dataset = TextDataset(train_encodings, df_train['cat1'].tolist())
test_dataset = TextDataset(test_encodings, df_test['cat1'].tolist())

# Charger le dataset dans un DataLoader
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=True)

from torch.optim import AdamW

# Initialiser l'optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Fonction d'entraînement
def train(model, dataloader, optimizer, epochs=1):
    model.train()
    for epoch in range(epochs):
        for batch in dataloader:
            optimizer.zero_grad()
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            labels = batch['labels']
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            print(f'Epoch {epoch+1}, Loss: {loss.item()}')

# Entraîner le modèle
train(model, train_loader, optimizer, epochs=1)

# Fonction de test
def test(model, dataloader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            labels = batch['labels']
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            logits = outputs.logits
            _, predicted = torch.max(logits, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    print(f'Accuracy: {correct/total}')

# Tester le modèle
test(model, test_loader)


  ax.annotate(f'{categories_count[a]}', (p.get_width(), a), fontsize=12)


Train set shape:  (1328, 2)
Test set shape:  (332, 2)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
