<a href="https://colab.research.google.com/github/mustaphamerakech/multilingual-text-theme-classification/blob/main/BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import time
import datetime
import gc
import random
from nltk.corpus import stopwords
import re

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler,random_split
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import transformers
from transformers import BertForSequenceClassification, AdamW, BertConfig,BertTokenizer,get_linear_schedule_with_warmup

In [2]:
df = pd.read_csv('/content/train_clean.csv', encoding='latin-1')
df.head(5)

Unnamed: 0,STORY,SECTION,Category,Text_length,words_counts,punctuation_count,Text_cleaning,len_text_clean
0,But the most painful was the huge reversal in ...,3,Business,843,148,12,painful huge reversal fee income unheard among...,573
1,How formidable is the opposition alliance amon...,0,Politics,129,17,6,formidable opposition alliance among congress ...,108
2,Most Asian currencies were trading lower today...,3,Business,386,58,31,asian currencies trading lower today south kor...,230
3,"If you want to answer any question, click on Â...",1,Technology,587,103,12,want answer question click answer clicking ans...,360
4,"In global markets, gold prices edged up today ...",3,Business,299,46,10,global markets gold prices edged today disappo...,247


In [20]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [3]:
# charger les données dans Nympay tableau
texts = df.Text_cleaning.values
labels = df.SECTION.values

In [4]:
# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]



In [5]:
print(' Original: ', texts[0])

# Print the sentence split into tokens.
print('Tokenized: ', tokenizer.tokenize(texts[0]))

# Print the sentence mapped to token ids.
print('Token IDs: ', tokenizer.convert_tokens_to_ids(tokenizer.tokenize(texts[0])))

 Original:  painful huge reversal fee income unheard among private sector lenders essentially means yes bank took granted fees structured loan deals paid accounted upfront books borrowers turned defaulters fees tied loan deals fell cracks gill vowed shift safer accounting practice amortizing fee income rather booking upfront gill move mend past ways means nasty surprises future good news considering investors love clean image loathe uncertainties gain without pain promise strong stable balance sheet comes sacrifices well investors give hopes phenomenal growth promise made kapoor
Tokenized:  ['painful', 'huge', 'reversal', 'fee', 'income', 'un', '##heard', 'among', 'private', 'sector', 'lend', '##ers', 'essentially', 'means', 'yes', 'bank', 'took', 'granted', 'fees', 'structured', 'loan', 'deals', 'paid', 'accounted', 'up', '##front', 'books', 'borrow', '##ers', 'turned', 'default', '##ers', 'fees', 'tied', 'loan', 'deals', 'fell', 'cracks', 'gill', 'vowed', 'shift', 'safer', 'accountin

In [6]:
max_len = 0

# For every text...
for text in texts:

    # Tokenize the text and add `[CLS]` and `[SEP]` tokens.
    input_ids = tokenizer.encode(text, add_special_tokens=True)

    # Update the maximum text length.
    max_len = max(max_len, len(input_ids))

print('Max text length: ', max_len)

Token indices sequence length is longer than the specified maximum sequence length for this model (535 > 512). Running this sequence through the model will result in indexing errors


Max text length:  676


In [7]:
# Initialisez un compteur
long_texts_count = 0

# Parcourez chaque texte pour compter ceux qui dépassent 512 tokens
for text in texts:
    # Tokenisez et ajoutez les tokens spéciaux
    input_ids = tokenizer.encode(text, add_special_tokens=True)

    # Vérifiez si la longueur dépasse 512 tokens
    if len(input_ids) > 512:
        long_texts_count += 1

print(f"Nombre de textes dépassant 512 tokens : {long_texts_count}")

Nombre de textes dépassant 512 tokens : 9


In [8]:
# Filtrez les textes et étiquettes directement dans texts et labels
filtered_texts = []
filtered_labels = []

for text, label in zip(texts, labels):
    input_ids = tokenizer.encode(text, add_special_tokens=True)

    # Conservez seulement les textes avec une longueur de 512 tokens ou moins
    if len(input_ids) <= 512:
        filtered_texts.append(text)
        filtered_labels.append(label)

# Remplacez texts et labels par les versions filtrées
texts = np.array(filtered_texts)
labels = np.array(filtered_labels)

print(f"Nombre de textes après suppression : {len(texts)}")


Nombre de textes après suppression : 7619


In [9]:
max_len = 0

# For every text...
for text in texts:

    # Tokenize the text and add `[CLS]` and `[SEP]` tokens.
    input_ids = tokenizer.encode(text, add_special_tokens=True)

    # Update the maximum text length.
    max_len = max(max_len, len(input_ids))

print('Max text length: ', max_len)

Max text length:  512


In [10]:
input_ids = []
attention_masks = []

# For every text...
for text in texts:
    # `encode_plus` will:
    #   (1) Tokenize the text.
    #   (2) Prepend the `[CLS]` token to the start.
    #   (3) Append the `[SEP]` token to the end.
    #   (4) Map tokens to their IDs.
    #   (5) Pad or truncate the sentence to `max_length`
    #   (6) Create attention masks for [PAD] tokens.
    encoded_dict = tokenizer.encode_plus(
                        text,                      # text to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = max_len,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.est un vecteur qui contient des 1 pour les tokens réels et des 0 pour les tokens de padding
                        return_tensors = 'pt',     # Return pytorch tensors.
                   )

    # Add the encoded text to the list.
    input_ids.append(encoded_dict['input_ids'])

    # And its attention mask (simply differentiates padding from non-padding).
    attention_masks.append(encoded_dict['attention_mask'])

# Convert the lists into tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
labels = torch.tensor(labels)

# Print sentence 0, now as a list of IDs.
print('Original: ', texts[0])
print('Token IDs:', input_ids[0])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Original:  painful huge reversal fee income unheard among private sector lenders essentially means yes bank took granted fees structured loan deals paid accounted upfront books borrowers turned defaulters fees tied loan deals fell cracks gill vowed shift safer accounting practice amortizing fee income rather booking upfront gill move mend past ways means nasty surprises future good news considering investors love clean image loathe uncertainties gain without pain promise strong stable balance sheet comes sacrifices well investors give hopes phenomenal growth promise made kapoor
Token IDs: tensor([  101,  9145,  4121, 23163,  7408,  3318,  4895, 26362,  2426,  2797,
         4753, 18496,  2545,  7687,  2965,  2748,  2924,  2165,  4379,  9883,
        14336,  5414,  9144,  3825, 14729,  2039, 12792,  2808, 17781,  2545,
         2357, 12398,  2545,  9883,  5079,  5414,  9144,  3062, 15288, 12267,
        18152,  5670, 13726,  9529,  3218, 16095,  3775,  6774,  7408,  3318,
         2738,

In [11]:
# Combine the training inputs into a TensorDataset.
dataset = TensorDataset(input_ids, attention_masks, labels)

# Create a 90-10 train-validation split.

# Calculate the number of samples to include in each set.
train_size = int(0.8 * len(dataset))
#val_size = int(0.2 * len(dataset))
val_size = len(dataset)  - train_size

# Divide the dataset by randomly selecting samples.
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

print('{:>5,} training samples'.format(train_size))
print('{:>5,} validation samples'.format(val_size))

6,095 training samples
1,524 validation samples


In [12]:
# The DataLoader needs to know our batch size for training, so we specify it
# here. For fine-tuning BERT on a specific task, the authors recommend a batch
# size of 16 or 32.
batch_size = 32

# Create the DataLoaders for our training and validation sets.
# We'll take training samples in random order.
train_dataloader = DataLoader(
            train_dataset,  # The training samples.
            sampler = RandomSampler(train_dataset), # Select batches randomly
            batch_size = batch_size # Trains with this batch size.
        )

# For validation the order doesn't matter, so we'll just read them sequentially.
validation_dataloader = DataLoader(
            val_dataset, # The validation samples.
            sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )

In [21]:
# Load BertForSequenceClassification, the pretrained BERT model with a single
# linear classification layer on top.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 4, # The number of output labels--2 for binary classification.
                    # You can increase this for multi-class tasks.
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)

if device == "cuda:0":
# Tell pytorch to run this model on the GPU.
    model = model.cuda()
# Explicitly set device to "cpu"
# device = torch.device("cpu")
model = model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )



In [23]:
# Number of training epochs. The BERT authors recommend between 2 and 4.
# We chose to run for 4, but we'll see later that this may be over-fitting the
# training data.
epochs = 4

# Total number of training steps is [number of batches] x [number of epochs].
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

In [24]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [25]:
def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [26]:
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
training_stats = []

# Measure the total training time for the whole run.
total_t0 = time.time()

# For each epoch...
for epoch_i in range(0, epochs):

    # ========================================
    #               Training
    # ========================================
    # Perform one full pass over the training set.
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')
    # Measure how long the training epoch takes.
    t0 = time.time()
    total_train_loss = 0
    model.train()
    for step, batch in enumerate(train_dataloader):
        # Unpack this training batch from our dataloader.
        #
        # As we unpack the batch, we'll also copy each tensor to the device using the
        # `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids
        #   [1]: attention masks
        #   [2]: labels
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        optimizer.zero_grad()
        output = model(b_input_ids,
                             token_type_ids=None,
                             attention_mask=b_input_mask,
                             labels=b_labels)
        loss = output.loss
        total_train_loss += loss.item()
        # Perform a backward pass to calculate the gradients.
        loss.backward()
        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer.step()
        # Update the learning rate.
        scheduler.step()

    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)

    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)
    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(training_time))
    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.
    print("")
    print("Running Validation...")
    t0 = time.time()
    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()
    # Tracking variables
    total_eval_accuracy = 0
    best_eval_accuracy = 0
    total_eval_loss = 0
    nb_eval_steps = 0
    # Evaluate data for one epoch
    for batch in validation_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)
        # Tell pytorch not to bother with constructing the compute graph during
        # the forward pass, since this is only needed for backprop (training).
        with torch.no_grad():
            output= model(b_input_ids,
                                   token_type_ids=None,
                                   attention_mask=b_input_mask,
                                   labels=b_labels)
        loss = output.loss
        total_eval_loss += loss.item()
        # Move logits and labels to CPU if we are using GPU
        logits = output.logits
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        # Calculate the accuracy for this batch of test sentences, and
        # accumulate it over all batches.
        total_eval_accuracy += flat_accuracy(logits, label_ids)
    # Report the final accuracy for this validation run.
    avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
    print("  Accuracy: {0:.2f}".format(avg_val_accuracy))
    # Calculate the average loss over all of the batches.
    avg_val_loss = total_eval_loss / len(validation_dataloader)
    # Measure how long the validation run took.
    validation_time = format_time(time.time() - t0)
    if avg_val_accuracy > best_eval_accuracy:
        torch.save(model, 'bert_model')
        best_eval_accuracy = avg_val_accuracy
    #print("  Validation Loss: {0:.2f}".format(avg_val_loss))
    #print("  Validation took: {:}".format(validation_time))
    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )
print("")
print("Training complete!")

print("Total training took {:} (h:mm:ss)".format(format_time(time.time()-total_t0)))



Training...

  Average training loss: 0.29
  Training epcoh took: 0:08:46

Running Validation...
  Accuracy: 0.97

Training...

  Average training loss: 0.07
  Training epcoh took: 0:08:52

Running Validation...
  Accuracy: 0.98

Training...

  Average training loss: 0.03
  Training epcoh took: 0:08:52

Running Validation...
  Accuracy: 0.98

Training...

  Average training loss: 0.02
  Training epcoh took: 0:08:52

Running Validation...
  Accuracy: 0.98

Training complete!
Total training took 0:38:30 (h:mm:ss)


In [27]:
model = torch.load('bert_model')

  model = torch.load('bert_model')


In [29]:

df_test = pd.read_csv('/content/test_clean.csv', encoding='latin-1')

text_test = df_test['STORY'].values

In [30]:
test_input_ids = []
test_attention_masks = []
for text in text_test:
    encoded_dict = tokenizer.encode_plus(
                        text,
                        add_special_tokens = True,
                        max_length = max_len,
                        pad_to_max_length = True,
                        return_attention_mask = True,
                        return_tensors = 'pt',
                   )
    test_input_ids.append(encoded_dict['input_ids'])
    test_attention_masks.append(encoded_dict['attention_mask'])
test_input_ids = torch.cat(test_input_ids, dim=0)
test_attention_masks = torch.cat(test_attention_masks, dim=0)



In [31]:
test_dataset = TensorDataset(test_input_ids, test_attention_masks)
test_dataloader = DataLoader(
            test_dataset, # The validation samples.
            sampler = SequentialSampler(test_dataset), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )

In [32]:
predictions = []
for batch in test_dataloader:
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        with torch.no_grad():
            output= model(b_input_ids,
                                   token_type_ids=None,
                                   attention_mask=b_input_mask)
            logits = output.logits
            logits = logits.detach().cpu().numpy()
            pred_flat = np.argmax(logits, axis=1).flatten()

            predictions.extend(list(pred_flat))

In [38]:
df_output = pd.DataFrame()
df_output['STORY'] = df_test['STORY']
df_output['target'] =predictions
df_output["Category"] = df_output["target"]
rem = {"Category": {0: "Politics", 1: "Technology", 2:"Entertainment", 3:"Business"}}
df_output = df_output.replace(rem)
df_output.to_csv('submission.csv',index=False)

In [39]:
df_submission = pd.read_csv('/content/submission.csv', encoding='latin-1')

In [40]:
df_submission.head(5)


Unnamed: 0,STORY,target,Category
0,2019 will see gadgets like gaming smartphones ...,1,Technology
1,It has also unleashed a wave of changes in the...,2,Entertainment
2,It can be confusing to pick the right smartpho...,1,Technology
3,The mobile application is integrated with a da...,0,Politics
4,We have rounded up some of the gadgets that sh...,1,Technology
