In [11]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from sklearn.preprocessing import MultiLabelBinarizer

# Importation de la bibliothèque pandas
import pandas as pd

# Lecture du fichier CSV et stockage des données dans un DataFrame
df = pd.read_csv('../dataset.csv')



# Remplacement des retours à la ligne dans la colonne 'description' par des espaces
df['description'] = df['description'].str.replace('\n', ' ')

# Sélection des colonnes 'description' et 'cat1' du DataFrame
df = df[['description', 'cat1', 'cat2', 'cat3']]

# df['cat1'].fillna('missing', inplace=True)
# df['cat2'].fillna('missing', inplace=True)
# df['cat3'].fillna('missing', inplace=True)

LABELS = [
          "Balade",
          "Spectacle",
          "Culture",
          "Détente",
          "Fête",
          "Gastronomie",
          "Famille",
          "Sport",
          "Festival",
          "Atelier",
          "Environnement",
          "Danse",
          "Marché",
          "Théatre",
          "Concert",
          "Exposition",
          "Jeu",
          "Visite",
          "Histoire",
          "Art",
          "Brocante",
          "Action",
          "Santé",
          "Conférence"
          ]

df_new = df.copy()
for label in LABELS:
  df_new.insert(df_new.shape[1], label, pd.Series(0, dtype=int, index=range(df_new.shape[0])))

for row in df_new.iterrows():
  for cat in ['cat1', 'cat2', 'cat3']:
    for cat_col in LABELS:
      if row[1].loc[cat] == cat_col:
        df_new.at[row[0], cat_col] = 1
        break

df_new.drop(columns=['cat1', 'cat2', 'cat3'], inplace=True)
df_new = df_new.iloc[:, :]


# Charger le tokenizer et le modèle
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-multilingual-cased")
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-multilingual-cased")




#tokenisation des descriptions

from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(df_new, test_size=0.2, shuffle=True  ,random_state=0)

# Ensure all descriptions are strings
df_train['description'] = df_train['description'].astype(str)
df_test['description'] = df_test['description'].astype(str)

# Replace NaN values with an empty string
df_train['description'].fillna('', inplace=True)
df_test['description'].fillna('', inplace=True)


#tokernisation des descriptions
train_encodings = tokenizer(df_train['description'].tolist(), truncation=True, padding=True)
test_encodings = tokenizer(df_test['description'].tolist(), truncation=True, padding=True)

# conversion des données tokénisées en tenseurs

class TextDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.float32 )
        return item

    def __len__(self):
        return len(self.labels)
      
train_dataset = TextDataset(train_encodings, df_train[LABELS].values)
test_dataset = TextDataset(test_encodings, df_test[LABELS].values)

# Création du DataLoader
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=True)

# Création du modèle
class TextClassifier(nn.Module):
    def __init__(self, model_name, num_labels):
        super(TextClassifier, self).__init__()
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

    def forward(self, input_ids, attention_mask):
        return self.model(input_ids, attention_mask=attention_mask)
      
model = TextClassifier("distilbert-base-multilingual-cased", len(LABELS))

# Fonction d'entraînement

def train(model, train_loader, test_loader, num_epochs=3, lr=1e-5):

    # Définition de la fonction de coût
    criterion = nn.BCEWithLogitsLoss()

    # Définition de l'optimiseur
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch['input_ids']
            attention_mask = batch['attention_mask']
            labels = batch['labels']
            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs.logits, labels.float())
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
        train_loss /= len(train_loader)

        model.eval()
        test_loss = 0
        for batch in test_loader:
            with torch.no_grad():
                input_ids = batch['input_ids']
                attention_mask = batch['attention_mask']
                labels = batch['labels']
                outputs = model(input_ids, attention_mask)
                loss = criterion(outputs.logits, labels.float())
                test_loss += loss.item()
        test_loss /= len(test_loader)

        print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}")
        
train(model, train_loader, test_loader) # Entraînement du modèle  (3 epochs)      


      




Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_train['description'].fillna('', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].meth

In [38]:
df_new.head(2)

Unnamed: 0,description,Balade,Spectacle,Culture,Détente,Fête,Gastronomie,Famille,Sport,Festival,...,Concert,Exposition,Jeu,Visite,Histoire,Art,Brocante,Action,Santé,Conférence
0,Le jeu aventure « Les templiers du coffre d’or...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,"L'apparition des tanks, arme de guerre incarna...",0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
