<a href="https://colab.research.google.com/github/maddran/headlineclassification/blob/main/Headline_classification_BERTML_AG_News.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -Uq datasets
!pip install -Uq transformers
!pip install -Uq sentencepiece

[K     |████████████████████████████████| 163kB 17.1MB/s 
[K     |████████████████████████████████| 245kB 43.4MB/s 
[K     |████████████████████████████████| 20.7MB 1.2MB/s 
[K     |████████████████████████████████| 1.8MB 16.0MB/s 
[K     |████████████████████████████████| 890kB 45.6MB/s 
[K     |████████████████████████████████| 2.9MB 56.0MB/s 
[?25h  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
[K     |████████████████████████████████| 1.2MB 17.9MB/s 
[?25h

In [2]:
from datasets import load_dataset
import pandas as pd

dataset = load_dataset("ag_news")
train_df = pd.DataFrame(dataset['train'][:])
test_df = pd.DataFrame(dataset['test'][:])
del dataset

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1817.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1227.0, style=ProgressStyle(description…

Using custom data configuration default



Downloading and preparing dataset ag_news/default (download: 29.88 MiB, generated: 30.23 MiB, post-processed: Unknown size, total: 60.10 MiB) to /root/.cache/huggingface/datasets/ag_news/default/0.0.0/fb5c5e74a110037311ef5e904583ce9f8b9fbc1354290f97b4929f01b3f48b1a...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=11045148.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=751209.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Dataset ag_news downloaded and prepared to /root/.cache/huggingface/datasets/ag_news/default/0.0.0/fb5c5e74a110037311ef5e904583ce9f8b9fbc1354290f97b4929f01b3f48b1a. Subsequent calls will reuse this data.


In [3]:
for data_df in [train_df, test_df]:
  data_df.columns = ['category', 'headline']
  print(data_df.head())

   category                                           headline
0         2  Wall St. Bears Claw Back Into the Black (Reute...
1         2  Carlyle Looks Toward Commercial Aerospace (Reu...
2         2  Oil and Economy Cloud Stocks' Outlook (Reuters...
3         2  Iraq Halts Oil Exports from Main Southern Pipe...
4         2  Oil prices soar to all-time record, posing new...
   category                                           headline
0         2  Fears for T N pension after talks Unions repre...
1         3  The Race is On: Second Private Team Sets Launc...
2         3  Ky. Company Wins Grant to Study Peptides (AP) ...
3         3  Prediction Unit Helps Forecast Wildfires (AP) ...
4         3  Calif. Aims to Limit Farm-Related Smog (AP) AP...


In [4]:
from transformers import BertTokenizer
# Load the BERT tokenizer.
print('Loading BERT tokenizer...')
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased', do_lower_case=True)

Loading BERT tokenizer...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=995526.0, style=ProgressStyle(descripti…




In [5]:
def encode_headlines(data):
  # Tokenize all of the sentences and map the tokens to thier word IDs.
  input_ids = []
  # For every sentence...
  for sent in data.headline:
      # `encode` will:
      #   (1) Tokenize the sentence.
      #   (2) Prepend the `[CLS]` token to the start.
      #   (3) Append the `[SEP]` token to the end.
      #   (4) Map tokens to their IDs.
      encoded_sent = tokenizer.encode(
                          sent,                      # Sentence to encode.
                          add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                          # This function also supports truncation and conversion
                          # to pytorch tensors, but we need to do padding, so we
                          # can't use these features :( .
                          truncation = True,
                          max_length = 128,          # Truncate all sentences.
                          #return_tensors = 'pt',     # Return pytorch tensors.
                    )
      
      # Add the encoded sentence to the list.
      input_ids.append(encoded_sent)

  return input_ids

train_input_ids = encode_headlines(train_df)


In [6]:
# Print sentence 0, now as a list of IDs.
print('Original: ', train_df['headline'].values[0])
print('Token IDs:', train_input_ids[0])
print('Max sentence length: ', max([len(sen) for sen in train_input_ids]))

Original:  Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.
Token IDs: [101, 26699, 28780, 119, 77911, 171, 37811, 12014, 10708, 10105, 15045, 113, 11639, 54092, 10107, 114, 11639, 54092, 10107, 118, 13716, 118, 22154, 10943, 117, 26699, 23840, 112, 187, 172, 47673, 11945, 165, 11570, 10108, 71560, 118, 90362, 16981, 117, 10301, 57039, 23320, 13123, 119, 102]
Max sentence length:  128


In [7]:
def pad_sentences(input_ids):
  from keras.preprocessing.sequence import pad_sequences

  MAX_LEN = 128
  print('\nPadding/truncating all sentences to %d values...' % MAX_LEN)
  print('\nPadding token: "{:}", ID: {:}'.format(tokenizer.pad_token, tokenizer.pad_token_id))

  input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", 
                            value=0, truncating="post", padding="post")
  print('\Done.')

  return input_ids

train_input_ids = pad_sentences(train_input_ids)


Padding/truncating all sentences to 128 values...

Padding token: "[PAD]", ID: 0
\Done.


In [8]:
def get_attention(input_ids):
  # Create attention masks
  #     - If a token ID is 0, then it's padding, set the mask to 0.
  #     - If a token ID is > 0, then it's a real token, set the mask to 1.
  attention_masks = [[int(token_id > 0) for token_id in sent] for sent in input_ids]
  return attention_masks

train_attention_masks =  get_attention(train_input_ids)

In [9]:
def train_test_val_split(input_ids, labels, attention_masks, TEST_ = False):

  from sklearn.model_selection import train_test_split

  if TEST_:
    train_inputs, test_inputs, train_labels, test_labels = train_test_split(input_ids, labels, 
                                                                random_state=2332, test_size=0.1)

    train_masks, test_masks, _, _ = train_test_split(attention_masks, labels,
                                                random_state=2332, test_size=0.1)

  else:
    train_labels = labels
    train_inputs = input_ids
    train_masks = attention_masks

  temp_train_labels = train_labels

  train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(train_inputs, train_labels, 
                                                              random_state=2332, test_size=0.1)

  train_masks, validation_masks, _, _ = train_test_split(train_masks, temp_train_labels,
                                              random_state=2332, test_size=0.1)

  if TEST_:
    return (train_inputs, train_labels, train_masks,
            validation_inputs, validation_labels, validation_masks,
            test_inputs, test_labels, test_masks)
  else:
    return (train_inputs, train_labels, train_masks,
            validation_inputs, validation_labels, validation_masks)

labels = train_df['category'].tolist()
(train_inputs, train_labels, train_masks,
  validation_inputs, validation_labels, validation_masks) = train_test_val_split(train_input_ids, labels, train_attention_masks, TEST_ = False)

In [10]:
set(train_labels)

{0, 1, 2, 3}

In [11]:
label_dict = dict(data=train_labels, train=train_labels, val=validation_labels)
nl = "\n"
summary = [pd.Series(l).rename(f"{name}: count = {len(l)}").value_counts()/len(l) for name, l in label_dict.items()]
summary = pd.concat(summary, axis=1)
summary

Unnamed: 0,data: count = 108000,train: count = 108000,val: count = 12000
0,0.249398,0.249398,0.255417
1,0.250463,0.250463,0.245833
2,0.250278,0.250278,0.2475
3,0.249861,0.249861,0.25125


In [12]:
import torch
# If there's a GPU available...
if torch.cuda.is_available():    
    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")
    print('There are %d GPU(s) available.' % torch.cuda.device_count())
    print('We will use the GPU:', torch.cuda.get_device_name(0))
# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

There are 1 GPU(s) available.
We will use the GPU: Tesla T4


In [13]:

train_inputs = torch.tensor(train_inputs)
train_labels = torch.tensor(train_labels)
train_masks = torch.tensor(train_masks)

validation_inputs = torch.tensor(validation_inputs)
validation_labels = torch.tensor(validation_labels)
validation_masks = torch.tensor(validation_masks)

try:
  test_inputs = torch.tensor(test_inputs)
  test_labels = torch.tensor(test_labels)
  test_masks = torch.tensor(test_masks)
except:
  print("No test set defined")

No test set defined


In [14]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
# The DataLoader needs to know our batch size for training, so we specify it 
# here.
# For fine-tuning BERT on a specific task, the authors recommend a batch size of
# 16 or 32.
batch_size = 64
# Create the DataLoader for our training set.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
# Create the DataLoader for our validation set.
validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [15]:
from transformers import BertForSequenceClassification, AdamW, BertConfig
# Load BertForSequenceClassification, the pretrained BERT model with a single 
# linear classification layer on top. 
model = BertForSequenceClassification.from_pretrained(
    "bert-base-multilingual-cased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = len(set(labels)), # The number of output labels--2 for binary classification.
                    # You can increase this for multi-class tasks.   
    output_attentions = False, # Whether the model returns attentions weights.
    output_hidden_states = False, # Whether the model returns all hidden-states.
)
# Tell pytorch to run this model on the GPU.
model.cuda();

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=625.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=714314041.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

In [16]:
# Get all of the model's parameters as a list of tuples.
params = list(model.named_parameters())
print('The BERT model has {:} different named parameters.\n'.format(len(params)))
print('==== Embedding Layer ====\n')
for p in params[0:5]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
print('\n==== First Transformer ====\n')
for p in params[5:21]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))
print('\n==== Output Layer ====\n')
for p in params[-6:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

The BERT model has 201 different named parameters.

==== Embedding Layer ====

bert.embeddings.word_embeddings.weight                  (119547, 768)
bert.embeddings.position_embeddings.weight                (512, 768)
bert.embeddings.token_type_embeddings.weight                (2, 768)
bert.embeddings.LayerNorm.weight                              (768,)
bert.embeddings.LayerNorm.bias                                (768,)

==== First Transformer ====

bert.encoder.layer.0.attention.self.query.weight          (768, 768)
bert.encoder.layer.0.attention.self.query.bias                (768,)
bert.encoder.layer.0.attention.self.key.weight            (768, 768)
bert.encoder.layer.0.attention.self.key.bias                  (768,)
bert.encoder.layer.0.attention.self.value.weight          (768, 768)
bert.encoder.layer.0.attention.self.value.bias                (768,)
bert.encoder.layer.0.attention.output.dense.weight        (768, 768)
bert.encoder.layer.0.attention.output.dense.bias              

In [17]:
# Note: AdamW is a class from the huggingface library (as opposed to pytorch) 
# I believe the 'W' stands for 'Weight Decay fix"
optimizer = AdamW(model.parameters(),
                  lr = 2e-5, # args.learning_rate - default is 5e-5, our notebook had 2e-5
                  eps = 1e-8 # args.adam_epsilon  - default is 1e-8.
                )
from transformers import get_linear_schedule_with_warmup
# Number of training epochs (authors recommend between 2 and 4)
epochs = 3
# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs
# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(optimizer, 
                                            num_warmup_steps = 0, # Default value in run_glue.py
                                            num_training_steps = total_steps)

In [18]:
import numpy as np
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

import time
import datetime
def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [19]:
import random
# This training code is based on the `run_glue.py` script here:
# https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128
# Set the seed value all over the place to make this reproducible.
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
# Store the average loss after each epoch so we can plot them.
loss_values = []
# For each epoch...
for epoch_i in range(0, epochs):
    
    # ========================================
    #               Training
    # ========================================
    
    # Perform one full pass over the training set.
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')
    # Measure how long the training epoch takes.
    t0 = time.time()
    # Reset the total loss for this epoch.
    total_loss = 0
    # Put the model into training mode. Don't be mislead--the call to 
    # `train` just changes the *mode*, it doesn't *perform* the training.
    # `dropout` and `batchnorm` layers behave differently during training
    # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
    model.train()
    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):
        # Progress update every 40 batches.
        if step % 50 == 0 and not step == 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)
            
            # Report progress.
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))
        # Unpack this training batch from our dataloader. 
        #
        # As we unpack the batch, we'll also copy each tensor to the GPU using the 
        # `to` method.
        #
        # `batch` contains three pytorch tensors:
        #   [0]: input ids 
        #   [1]: attention masks
        #   [2]: labels 
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Always clear any previously calculated gradients before performing a
        # backward pass. PyTorch doesn't do this automatically because 
        # accumulating the gradients is "convenient while training RNNs". 
        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
        model.zero_grad()        
        # Perform a forward pass (evaluate the model on this training batch).
        # This will return the loss (rather than the model output) because we
        # have provided the `labels`.
        # The documentation for this `model` function is here: 
        # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
        outputs = model(b_input_ids, 
                    token_type_ids=None, 
                    attention_mask=b_input_mask, 
                    labels=b_labels)
        
        # The call to `model` always returns a tuple, so we need to pull the 
        # loss value out of the tuple.
        loss = outputs[0]
        # Accumulate the training loss over all of the batches so that we can
        # calculate the average loss at the end. `loss` is a Tensor containing a
        # single value; the `.item()` function just returns the Python value 
        # from the tensor.
        total_loss += loss.item()
        # Perform a backward pass to calculate the gradients.
        loss.backward()
        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        # Update parameters and take a step using the computed gradient.
        # The optimizer dictates the "update rule"--how the parameters are
        # modified based on their gradients, the learning rate, etc.
        optimizer.step()
        # Update the learning rate.
        scheduler.step()
    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)            
    
    # Store the loss value for plotting the learning curve.
    loss_values.append(avg_train_loss)
    print("")
    print("  Average training loss: {0:.2f}".format(avg_train_loss))
    print("  Training epcoh took: {:}".format(format_time(time.time() - t0)))
        
    # ========================================
    #               Validation
    # ========================================
    # After the completion of each training epoch, measure our performance on
    # our validation set.
    print("")
    print("Running Validation...")
    t0 = time.time()
    # Put the model in evaluation mode--the dropout layers behave differently
    # during evaluation.
    model.eval()
    # Tracking variables 
    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps, nb_eval_examples = 0, 0
    # Evaluate data for one epoch
    for batch in validation_dataloader:
        
        # Add batch to GPU
        batch = tuple(t.to(device) for t in batch)
        
        # Unpack the inputs from our dataloader
        b_input_ids, b_input_mask, b_labels = batch
        
        # Telling the model not to compute or store gradients, saving memory and
        # speeding up validation
        with torch.no_grad():        
            # Forward pass, calculate logit predictions.
            # This will return the logits rather than the loss because we have
            # not provided labels.
            # token_type_ids is the same as the "segment ids", which 
            # differentiates sentence 1 and 2 in 2-sentence tasks.
            # The documentation for this `model` function is here: 
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            outputs = model(b_input_ids, 
                            token_type_ids=None, 
                            attention_mask=b_input_mask)
        
        # Get the "logits" output by the model. The "logits" are the output
        # values prior to applying an activation function like the softmax.
        logits = outputs[0]
        # Move logits and labels to CPU
        logits = logits.detach().cpu().numpy()
        label_ids = b_labels.to('cpu').numpy()
        
        # Calculate the accuracy for this batch of test sentences.
        tmp_eval_accuracy = flat_accuracy(logits, label_ids)
        
        # Accumulate the total accuracy.
        eval_accuracy += tmp_eval_accuracy
        # Track the number of batches
        nb_eval_steps += 1
    # Report the final accuracy for this validation run.
    print("  Accuracy: {0:.2f}".format(eval_accuracy/nb_eval_steps))
    print("  Validation took: {:}".format(format_time(time.time() - t0)))
print("")
print("Training complete!")


Training...
  Batch    50  of  1,688.    Elapsed: 0:01:03.
  Batch   100  of  1,688.    Elapsed: 0:02:10.
  Batch   150  of  1,688.    Elapsed: 0:03:20.
  Batch   200  of  1,688.    Elapsed: 0:04:32.
  Batch   250  of  1,688.    Elapsed: 0:05:44.
  Batch   300  of  1,688.    Elapsed: 0:06:56.
  Batch   350  of  1,688.    Elapsed: 0:08:08.
  Batch   400  of  1,688.    Elapsed: 0:09:21.
  Batch   450  of  1,688.    Elapsed: 0:10:33.
  Batch   500  of  1,688.    Elapsed: 0:11:46.
  Batch   550  of  1,688.    Elapsed: 0:12:58.
  Batch   600  of  1,688.    Elapsed: 0:14:10.
  Batch   650  of  1,688.    Elapsed: 0:15:22.
  Batch   700  of  1,688.    Elapsed: 0:16:35.
  Batch   750  of  1,688.    Elapsed: 0:17:47.
  Batch   800  of  1,688.    Elapsed: 0:18:59.
  Batch   850  of  1,688.    Elapsed: 0:20:12.
  Batch   900  of  1,688.    Elapsed: 0:21:24.
  Batch   950  of  1,688.    Elapsed: 0:22:36.
  Batch 1,000  of  1,688.    Elapsed: 0:23:48.
  Batch 1,050  of  1,688.    Elapsed: 0:25:00.


In [20]:
import plotly.express as px
f = pd.DataFrame(loss_values)
f.columns=['Loss']
fig = px.line(f, x=f.index, y=f.Loss)
fig.update_layout(title='Training loss of the Model',
                   xaxis_title='Epoch',
                   yaxis_title='Loss')
fig.show()

In [22]:
test_input_ids = encode_headlines(test_df)
test_inputs = pad_sentences(test_input_ids)
test_masks = get_attention(test_inputs)
test_labels = test_df['category'].tolist()

test_inputs = torch.tensor(test_inputs)
test_labels = torch.tensor(test_labels)
test_masks = torch.tensor(test_masks)

# Create the DataLoader.
prediction_data = TensorDataset(test_inputs, test_masks, test_labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)


Padding/truncating all sentences to 128 values...

Padding token: "[PAD]", ID: 0
\Done.


In [23]:
# Prediction on test set
print('Predicting labels for {:,} test sentences...'.format(len(test_inputs)))
# Put model in evaluation mode
model.eval()
# Tracking variables 
predictions , true_labels = [], []
# Predict 
for batch in prediction_dataloader:
  # Add batch to GPU
  batch = tuple(t.to(device) for t in batch)
  
  # Unpack the inputs from our dataloader
  b_input_ids, b_input_mask, b_labels = batch
  
  # Telling the model not to compute or store gradients, saving memory and 
  # speeding up prediction
  with torch.no_grad():
      # Forward pass, calculate logit predictions
      outputs = model(b_input_ids, token_type_ids=None, 
                      attention_mask=b_input_mask)
  logits = outputs[0]
  # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()
  
  # Store predictions and true labels
  predictions.append(logits)
  true_labels.append(label_ids)
print('DONE.')

Predicting labels for 7,600 test sentences...
DONE.


In [24]:
from sklearn.metrics import matthews_corrcoef
matthews_set = []
# Evaluate each test batch using Matthew's correlation coefficient
print('Calculating Matthews Corr. Coef. for each batch...')
# For each input batch...
for i in range(len(true_labels)):
  
  # The predictions for this batch are a 2-column ndarray (one column for "0" 
  # and one column for "1"). Pick the label with the highest value and turn this
  # in to a list of 0s and 1s.
  pred_labels_i = np.argmax(predictions[i], axis=1).flatten()
  
  # Calculate and store the coef for this batch.  
  matthews = matthews_corrcoef(true_labels[i], pred_labels_i)                
  matthews_set.append(matthews)

Calculating Matthews Corr. Coef. for each batch...


In [25]:
# Combine the predictions for each batch into a single list of 0s and 1s.
flat_predictions = [item for sublist in predictions for item in sublist]
flat_predictions = np.argmax(flat_predictions, axis=1).flatten()
# Combine the correct labels for each batch into a single list.
flat_true_labels = [item for sublist in true_labels for item in sublist]
# Calculate the MCC
mcc = matthews_corrcoef(flat_true_labels, flat_predictions)
print('MCC: %.3f' % mcc)

MCC: 0.920


In [26]:
from sklearn.metrics import confusion_matrix,classification_report

preds = [np.argmax(predictions[i], axis=1).flatten() for i in range(len(true_labels))]
preds = np.concatenate(preds).ravel()

true_labels = np.concatenate(true_labels).ravel()

confusion_matrix(true_labels, preds)

array([[1803,   12,   47,   38],
       [  15, 1865,   11,    9],
       [  46,   11, 1710,  133],
       [  32,    6,   97, 1765]])

In [27]:
print(classification_report(true_labels, preds))

              precision    recall  f1-score   support

           0       0.95      0.95      0.95      1900
           1       0.98      0.98      0.98      1900
           2       0.92      0.90      0.91      1900
           3       0.91      0.93      0.92      1900

    accuracy                           0.94      7600
   macro avg       0.94      0.94      0.94      7600
weighted avg       0.94      0.94      0.94      7600



In [31]:
torch.save(model, "bert_multilingual_newscategorization_ag_news_datastet_3_epochs.pt")

In [None]:
#from https://huggingface.co/datasets/ag_news
ag_topics = {0:'world', 1:'sport', 2:'business', 3:'tech'}


In [28]:
from glob import glob

if "bbc-fulltext.zip" not in glob("bbc*.zip"):
  !wget "http://mlg.ucd.ie/files/datasets/bbc-fulltext.zip"
  !unzip -q bbc*.zip

def get_headline(filepath):
  with open(filepath, 'rb') as f:
    first_line = f.readline()
  return first_line.decode('utf-8')


--2021-02-05 22:32:19--  http://mlg.ucd.ie/files/datasets/bbc-fulltext.zip
Resolving mlg.ucd.ie (mlg.ucd.ie)... 137.43.93.132
Connecting to mlg.ucd.ie (mlg.ucd.ie)|137.43.93.132|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2874078 (2.7M) [application/zip]
Saving to: ‘bbc-fulltext.zip’


2021-02-05 22:32:20 (4.53 MB/s) - ‘bbc-fulltext.zip’ saved [2874078/2874078]



In [29]:
import os
import pandas as pd

path = "bbc"
subfolders = [f.path for f in os.scandir(path) if f.is_dir()]
res = []
for sf in subfolders:
  category = sf.split('/')[-1]
  glob_pattern = os.path.join(f'bbc/{category}', '*')
  filepaths = sorted(glob(glob_pattern), key=os.path.getctime)
  res.append([{"category":category, "headline":get_headline(fp)} 
              for fp in filepaths])
    
res = [item for sublist in res for item in sublist]
bbc_data = pd.DataFrame(res)
bbc_data['headline'] = raw_data['headline'].replace(r'\n','', regex=True) 
bbc_data.head()

Unnamed: 0,category,headline
0,entertainment,Poppins musical gets flying start
1,entertainment,Levy takes Whitbread novel prize
2,entertainment,Musical treatment for Capra film
3,entertainment,Uganda bans Vagina Monologues
4,entertainment,Artists' secret postcards on sale


In [30]:
def categorization_pipeline(model)

['business' 'entertainment' 'politics' 'sport' 'tech']


Unnamed: 0,category,headline
0,1,Poppins musical gets flying start
1,1,Levy takes Whitbread novel prize
2,1,Musical treatment for Capra film
3,1,Uganda bans Vagina Monologues
4,1,Artists' secret postcards on sale
