## Chargement du dataset et création d'un dataset d'entraînement composé d'un dico avec 2 clés : text et label

In [14]:
import json

file_path_train = "./data/train.json"
file_path_test = "./data/test.json"

with open(file_path_train, "r", encoding="utf-8") as file:
    data_train = json.load(file)

with open(file_path_test,"r", encoding="utf-8") as file:
    data_test = json.load(file)

# Afficher le contenu
print(data_train)
print(data_test)

{'Politics': ['The mayor announced a new initiative to improve public transportation.', 'The senator is facing criticism for her stance on the recent bill.', 'The upcoming election has sparked intense debates among the candidates.'], 'Health': ['Regular exercise and a balanced diet are key to maintaining good health.', 'The World Health Organization has issued new guidelines on COVID-19.', 'A new study reveals the benefits of meditation for mental health.'], 'Finance': ['The stock market saw a significant drop following the announcement.', 'Investing in real estate can be a profitable venture if done correctly.', "The company's profits have doubled since the launch of their new product."], 'Travel': ['Visiting the Grand Canyon is a breathtaking experience.', 'The tourism industry has been severely impacted by the pandemic.', 'Backpacking through Europe is a popular choice for young travelers.'], 'Food': ['The new restaurant in town offers a fusion of Italian and Japanese cuisine.', 'Dr

In [15]:
indix_label_dico = {}
for indix, key in enumerate(data_train):
    indix_label_dico[key] = indix

In [16]:
indix_label_dico

{'Politics': 0,
 'Health': 1,
 'Finance': 2,
 'Travel': 3,
 'Food': 4,
 'Education': 5,
 'Environment': 6,
 'Fashion': 7,
 'Science': 8,
 'Sports': 9,
 'Technology': 10,
 'Entertainment': 11}

In [84]:
import numpy as np

data = {
    'train':{
        'text': [],
        'label': [],
    },
    'test': {
        'text': [],
        'label': [],
    },
    'eval': {
        'text': [],
        'label': [],
    },

}

for key, value in data_train.items(): 
    for text in value:
        data['eval']['text'].append(text)
        label = indix_label_dico[key]
        data['eval']['label'].append(label)

for key, value in data_test.items(): 

    for text in value:
        if np.random.random() > 0.2:
            data['train']['text'].append(text)
            label = indix_label_dico[key]
            data['train']['label'].append(label)
        else: 
            data['test']['text'].append(text)
            label = indix_label_dico[key]
            data['test']['label'].append(label)

        

print(data)

{'train': {'text': ['The team scored a last-minute goal to win the championship.', 'The tennis player served an ace to win the match.', 'The basketball game went into overtime, but the home team emerged victorious.', 'The marathon runner broke the world record with an incredible time.', 'The soccer match ended in a draw after a thrilling 90 minutes.', 'The coach implemented a new strategy that led to a series of victories.', 'The athlete trained tirelessly to achieve peak performance.', 'The sports commentator provided insightful analysis during the game.', 'The fans cheered enthusiastically as their team scored a touchdown.', 'The gymnast executed a flawless routine to clinch the gold medal.', 'The cycling race traversed challenging terrain through picturesque landscapes.', 'The swimmer set a new personal best in the 100-meter freestyle.', 'The referee made a controversial call that sparked debate among fans.', 'The sports network broadcasted the match to millions of viewers worldwide

In [85]:
import torch
from datasets import load_dataset
from tqdm import tqdm
import nltk
from gensim.models.phrases import Phrases, Phraser
from nltk.tokenize import TreebankWordTokenizer, TweetTokenizer
import pandas as pd
from termcolor import colored
from collections import Counter
import numpy as np


In [86]:
from datasets import Dataset, DatasetDict

# Convertir les données en Dataset
train_dataset = Dataset.from_dict(data['train'])
test_dataset = Dataset.from_dict(data['test'])
eval_dataset = Dataset.from_dict(data['eval'])

# Créer un DatasetDict
dataset_dict = DatasetDict({
    'train': train_dataset,
    'test': test_dataset,
    'eval': eval_dataset
})

In [87]:
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 1064
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 274
    })
    eval: Dataset({
        features: ['text', 'label'],
        num_rows: 36
    })
})

In [88]:
import torch

if torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

In [33]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased", use_fast=True)

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

tokenized_datasets = dataset_dict.map(tokenize_function, batched=True)

Map: 100%|██████████| 36/36 [00:00<00:00, 1059.37 examples/s]
Map: 100%|██████████| 1338/1338 [00:01<00:00, 1261.81 examples/s]


In [34]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 36
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1338
    })
})

In [35]:
tokenized_datasets = tokenized_datasets.remove_columns(["text"])
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")


DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 36
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1338
    })
})


In [43]:
print(tokenized_datasets["train"] )

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 36
})


In [53]:
from torch.utils.data import DataLoader
train_dataloader = DataLoader(tokenized_datasets['train'], shuffle=True, batch_size=1)
test_dataloader = DataLoader(tokenized_datasets['test'], batch_size=8)
eval_dataloader = DataLoader(tokenized_datasets['eval'], batch_size=8)

In [54]:
for batch in train_dataloader:
    print(batch)

{'labels': tensor([0]), 'input_ids': tensor([[ 101, 1109, 4398, 1717,  170, 1207, 7191, 1106, 4607, 1470, 6312,  119,
          102,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    

In [55]:
from transformers import AutoModelForSequenceClassification
# This model is equal to BERT + a linear layer for classification. In our custom model we designed a FastText + a hidden layer and linear layer for classification
model = AutoModelForSequenceClassification.from_pretrained("prajjwal1/bert-tiny", num_labels=12)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at prajjwal1/bert-tiny and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [72]:
from torch.optim import AdamW
optimizer = AdamW(model.parameters(), lr=5e-4)

In [73]:
from transformers import get_scheduler

num_epochs = 3
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

In [74]:
print(num_training_steps)

108


In [None]:
from tqdm.auto import tqdm
import evaluate

progress_bar = tqdm(range(num_training_steps))
model.to(device)
model.train()
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")
    model.train(False)
    loss_val, accuracy = evaluate(my_net, data_val, criterion)
    model.train(True)
    print("{} loss train: {:1.4f}\t val {:1.4f}\tAcc (val): {:.1%}".format(i, loss.item(), loss_val, accuracy   ))
    for batch_idx, batch in enumerate(train_dataloader):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()


        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

100%|██████████| 108/108 [00:55<00:00,  1.95it/s]


Epoch 1/3




Epoch 2/3




Epoch 3/3




In [76]:
print(f"Taille de eval_dataloader : {len(eval_dataloader)}")

Taille de eval_dataloader : 168


In [77]:
import evaluate

metric = evaluate.load("accuracy")
model.eval()
preds, trues = [], []
for i, batch in tqdm(enumerate(eval_dataloader), desc="evaluating", total=eval_dataloader.__len__()):
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

    _, tag_seq  = torch.max(logits, 1)
    preds.extend(tag_seq.cpu().detach().tolist())
    trues.extend(batch['labels'].cpu().detach().tolist())

metric.compute()

evaluating: 100%|██████████| 168/168 [00:17<00:00,  9.77it/s]


{'accuracy': 0.07922272047832586}

In [78]:
names = ["Politics",
 'Health',
 'Finance',
 'Travel',
 'Food',
 'Education',
 'Environment',
 'Fashion',
 'Science',
 'Sports',
 'Technology',
 'Entertainment']

In [79]:
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
print(classification_report(np.array(trues).flatten(), np.array(preds).flatten(), target_names=names))

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


               precision    recall  f1-score   support

     Politics       0.00      0.00      0.00       105
       Health       0.38      0.15      0.22       120
      Finance       0.00      0.00      0.00        97
       Travel       0.00      0.00      0.00       111
         Food       0.00      0.00      0.00       122
    Education       0.00      0.00      0.00       126
  Environment       0.00      0.00      0.00       136
      Fashion       0.00      0.00      0.00       118
      Science       0.00      0.00      0.00       109
       Sports       0.00      0.00      0.00        97
   Technology       0.07      0.88      0.13       100
Entertainment       0.00      0.00      0.00        97

     accuracy                           0.08      1338
    macro avg       0.04      0.09      0.03      1338
 weighted avg       0.04      0.08      0.03      1338

