In [45]:
import warnings
warnings.simplefilter('ignore')
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import DistilBertTokenizer, DistilBertModel
import logging
logging.basicConfig(level=logging.ERROR)

# Lecture du fichier CSV et stockage des données dans un DataFrame
df = pd.read_csv('../dataset.csv')

df.head()

# Remplacement des retours à la ligne dans la colonne 'description' par des espaces
df['description'] = df['description'].str.replace('\n', ' ')

# Sélection des colonnes 'description' et 'cat1' du DataFrame
df = df[['description', 'cat1', 'cat2', 'cat3']]

# Remplacement des retours à la ligne dans la colonne 'description' par des espaces
df['description'] = df['description'].str.replace('\n', ' ')

df['cat1'].fillna('', inplace=True)
df['cat2'].fillna('', inplace=True)
df['cat3'].fillna('', inplace=True)


new_df = pd.DataFrame()
new_df['text'] = df['description']
new_df['labels'] = df.iloc[:, 1:].values.tolist()
new_df.head()



#one hot encoding

from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer()
labels = mlb.fit_transform(new_df['labels'])
labels = pd.DataFrame(labels, columns=mlb.classes_)

new_df = pd.concat([new_df, labels], axis=1)
new_df.drop(columns='labels', inplace=True)


#drop la colonne " "
new_df.drop(columns=[''], inplace=True)


# Concaténer toutes les colonnes sauf 'description'
new_df['labels'] = new_df.drop(columns=['text']).apply(lambda row: row.tolist(), axis=1)


#drop toute les colonnes sauf 'text' et 'labels'

new_df = new_df[['text', 'labels']]

df.head()








Unnamed: 0,description,cat1,cat2,cat3
0,Le jeu aventure « Les templiers du coffre d’or...,Jeu,Famille,Détente
1,"L'apparition des tanks, arme de guerre incarna...",Exposition,Culture,
2,L’exposition vous invite à une plongée au cœur...,Exposition,Culture,
3,"L’abbaye de Vaucelles, monument culturel du Dé...",Exposition,Culture,Histoire
4,"Exposition ""Les chasseurs cyclistes au combat ...",Exposition,Histoire,Culture


In [106]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [107]:
def hamming_score(y_true, y_pred, normalize=True, sample_weight=None):
    acc_list = []
    for i in range(y_true.shape[0]):
        set_true = set( np.where(y_true[i])[0] )
        set_pred = set( np.where(y_pred[i])[0] )
        tmp_a = None
        if len(set_true) == 0 and len(set_pred) == 0:
            tmp_a = 1
        else:
            tmp_a = len(set_true.intersection(set_pred))/\
                    float( len(set_true.union(set_pred)) )
        acc_list.append(tmp_a)
    return np.mean(acc_list)

In [108]:
# Sections of config

# Defining some key variables that will be used later on in the training
MAX_LEN = 128
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 4
EPOCHS = 1
LEARNING_RATE = 1e-05
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased', truncation=True, do_lower_case=True)

In [109]:
class MultiLabelDataset(Dataset):

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.text
        self.targets = self.data.labels
        self.max_len = max_len

    def __len__(self):
        return len(self.text)

    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]


        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

In [110]:
# Creating the dataset and dataloader for the neural network

train_size = 0.8
train_data=new_df.sample(frac=train_size,random_state=200)
test_data=new_df.drop(train_data.index).reset_index(drop=True)
train_data = train_data.reset_index(drop=True)


print("FULL Dataset: {}".format(new_df.shape))
print("TRAIN Dataset: {}".format(train_data.shape))
print("TEST Dataset: {}".format(test_data.shape))

training_set = MultiLabelDataset(train_data, tokenizer, MAX_LEN)
testing_set = MultiLabelDataset(test_data, tokenizer, MAX_LEN)

FULL Dataset: (391, 2)
TRAIN Dataset: (313, 2)
TEST Dataset: (78, 2)


In [111]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [112]:
# Creating the customized model, by adding a drop out and a dense layer on top of distil bert to get the final output for the model. 

class DistilBERTClass(torch.nn.Module):
    def __init__(self):
        super(DistilBERTClass, self).__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.1)
        self.classifier = torch.nn.Linear(768, 24)

    def forward(self, input_ids, attention_mask, token_type_ids):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.Tanh()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

model = DistilBERTClass()
model.to(device)

DistilBERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(in

In [113]:
def loss_fn(outputs, targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [114]:
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

from pytorch_lightning.callbacks import EarlyStopping
from pytorch_lightning import Trainer

# Initialisation de l'early stopping
early_stopping = EarlyStopping(monitor='val_loss', min_delta=0.01, patience=5)

In [115]:
def train(epoch):
    model.train()
    trainer = Trainer(max_epochs=epoch, callbacks=[early_stopping])

    for _,data in tqdm(enumerate(training_loader, 0)):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float)

        outputs = model(ids, mask, token_type_ids)

        optimizer.zero_grad()
        loss = loss_fn(outputs, targets)
        if _%5000==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        
        loss.backward()
        optimizer.step()
    

In [116]:
for epoch in range(EPOCHS):
    train(epoch)

0it [00:00, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch: 0, Loss:  0.7088172435760498


1it [00:03,  3.90s/it]


KeyboardInterrupt: 

In [None]:
def validation(testing_loader):
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in tqdm(enumerate(testing_loader, 0)):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [None]:
outputs, targets = validation(testing_loader)

final_outputs = np.array(outputs) >=0.5

In [None]:
val_hamming_loss = metrics.hamming_loss(targets, final_outputs)
val_hamming_score = hamming_score(np.array(targets), np.array(final_outputs))

print(f"Hamming Score = {val_hamming_score}")
print(f"Hamming Loss = {val_hamming_loss}")

In [None]:
LABELS = [
    "Festival",
    "Exposition",
    "Théatre",
    "Détente",
    "Action",
    "Marché",
    "Environnement",
    "Atelier",
    "Fête",
    "Sport",
    "Balade",
    "Visite",
    "Brocante",
    "Spectacle",
    "Conférence",
    "Danse",
    "Jeu",
    "Famille",
    "Concert",
    "Culture"
]

category_position_map = {category: position for position, category in enumerate(LABELS)}

#tester mon modèle avec du texte

def predict(text):
    model.eval()
    inputs = tokenizer.encode_plus(
        text,
        None,
        add_special_tokens=True,
        max_length=MAX_LEN,
        pad_to_max_length=True,
        return_token_type_ids=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    ids = inputs['input_ids'].to(device, dtype = torch.long)
    mask = inputs['attention_mask'].to(device, dtype = torch.long)
    token_type_ids = inputs["token_type_ids"].to(device, dtype = torch.long)

    outputs = model(ids, mask, token_type_ids)
    outputs = torch.sigmoid(outputs).cpu().detach().numpy().tolist()[0]

    categories = [LABELS[position] for position, value in enumerate(outputs) if value >= 0.5]
    return categories

text = "Le jeu aventure « Les templiers du coffre d’or » créé par la ville de Caudry en partenariat avec l’association Caudry Ma Passion débarque !"

print(predict(text))


