*Copyright (c) Microsoft Corporation. All rights reserved.*

*Licensed under the MIT License.*

# Classification of Hindi BBC News Data using BERT

In [1]:
import os
import sys

import json
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import scrapbook as sb
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

sys.path.append("../../")
from utils_nlp.common.timer import Timer
from utils_nlp.dataset.multinli import load_pandas_df
from utils_nlp.models.bert.common import Language, Tokenizer
from utils_nlp.models.bert.sequence_classification import BERTSequenceClassifier

## Introduction
In this notebook, we fine-tune and evaluate a pretrained [BERT](https://arxiv.org/abs/1810.04805) model on a subset of the [BBC Hindi News](https://github.com/NirantK/hindi2vec/releases/tag/bbc-hindi-v0.1) dataset.

We use a [sequence classifier](../../utils_nlp/models/bert/sequence_classification.py) that wraps [Hugging Face's PyTorch implementation](https://github.com/huggingface/pytorch-pretrained-BERT) of Google's [BERT](https://github.com/google-research/bert).

In [2]:
# --- Filesystem locations -------------------------------------------------
DATA_FOLDER = "./temp"
# Passed as the cache directory to both the tokenizer and the classifier.
BERT_CACHE_DIR = "./temp"

# --- Dataset columns ------------------------------------------------------
LABEL_COL = "news_category"
TEXT_COL = "news_content"

# --- Tokenizer / model selection ------------------------------------------
LANGUAGE = Language.MULTILINGUAL
TO_LOWER = False
# Length that every token list is padded or truncated to.
MAX_LEN = 128

# --- Training settings ----------------------------------------------------
# Lower BATCH_SIZE and/or MAX_LEN if you run out of GPU memory.
BATCH_SIZE = 8
NUM_EPOCHS = 2
NUM_GPUS = 2
WARMUP_PROPORTION = 0.1

## Read Dataset
We start by downloading the dataset by using the following command.



In [3]:
# Download the BBC Hindi release archive, move it into ./bbc-hindiv01 and extract it there.
!wget https://github.com/NirantK/hindi2vec/releases/download/bbc-hindi-v0.1/bbc-hindiv01.tar.gz &&\
    mkdir -p bbc-hindiv01 &&\
    mv bbc-hindiv01.tar.gz ./bbc-hindiv01 && cd ./bbc-hindiv01 &&\
    tar -xvf bbc-hindiv01.tar.gz 

--2019-09-12 16:01:58--  https://github.com/NirantK/hindi2vec/releases/download/bbc-hindi-v0.1/bbc-hindiv01.tar.gz
Resolving github.com (github.com)... 140.82.113.3
Connecting to github.com (github.com)|140.82.113.3|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://github-production-release-asset-2e65be.s3.amazonaws.com/123591003/701307f8-3cb5-11e8-9472-df990c204ce8?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIAIWNJYAX4CSVEH53A%2F20190912%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20190912T160158Z&X-Amz-Expires=300&X-Amz-Signature=f1da6919e49dba6ebcc3f040ff6a9ffa2c7235a60b9797ba37b86a798214def9&X-Amz-SignedHeaders=host&actor_id=0&response-content-disposition=attachment%3B%20filename%3Dbbc-hindiv01.tar.gz&response-content-type=application%2Foctet-stream [following]
--2019-09-12 16:01:58--  https://github-production-release-asset-2e65be.s3.amazonaws.com/123591003/701307f8-3cb5-11e8-9472-df990c204ce8?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Crede

Once the dataset is downloaded, we use pandas to load the training and testing data into dataframes and inspect them.

For our classification task, we are limited by the memory of the machine we use. We need to set an appropriate maximum sequence length (`MAX_LEN`) and batch size (`BATCH_SIZE`) so that training fits into memory. This notebook was run on a machine with two Tesla K80 GPUs. If you run into out-of-memory errors, consider decreasing `MAX_LEN` and/or `BATCH_SIZE`, but note that the model's accuracy may then differ from the results shown here.

In [4]:
# The training split is a tab-separated file without a header row, so the
# columns come back labelled 0 (category) and 1 (article text).
df_train = pd.read_csv(
    './bbc-hindiv01/hindi-train.csv',
    sep="\t",
    encoding='utf-8',
    header=None,
)
df_train.head()

Unnamed: 0,0,1
0,india,मेट्रो की इस लाइन के चलने से दक्षिणी दिल्ली से...
1,pakistan,नेटिजन यानि इंटरनेट पर सक्रिय नागरिक अब ट्विटर...
2,news,इसमें एक फ़्लाइट एटेनडेंट की मदद की गुहार है औ...
3,india,"प्रतीक खुलेपन का, आज़ाद ख्याली का और भीड़ से अ..."
4,india,ख़ासकर पिछले 10 साल तक प्रधानमंत्री रहे मनमोहन...


In [5]:
# The testing split uses the same layout as the training split:
# tab-separated, no header row, category in column 0 and text in column 1.
df_test = pd.read_csv(
    './bbc-hindiv01/hindi-test.csv',
    sep="\t",
    encoding='utf-8',
    header=None,
)
df_test.head()

Unnamed: 0,0,1
0,india,बुधवार को राज्य सभा में विपक्ष के सवालों के जव...
1,india,लखनऊ स्थित पत्रकार समीरात्मज मिश्र को बुलंदशहर...
2,india,लगभग 1300 हेक्टेयर ज़मीन का अधिग्रहण किया जा च...
3,international,हालांकि उनके अंगरक्षकों को बमों को जाम करने वा...
4,india,आयोग का कहना है कि इस तरह के परीक्षण से महिलाओ...


In [6]:
# Summarise the raw training frame (count / unique / top / freq per column).
df_train.describe()

Unnamed: 0,0,1
count,3468,3467
unique,14,3458
top,india,"हम प्रायः पशु, पक्षियों और कीड़ों-मकोड़ों के ह..."
freq,1390,2


In [7]:
# Summarise the raw testing frame (count / unique / top / freq per column).
df_test.describe()

Unnamed: 0,0,1
count,867,866
unique,14,865
top,india,यहां घर-घर में साड़ी बुनने के हैंडलूम लगे हैं....
freq,357,2


In [8]:
# Replace the positional column labels with descriptive names
# (label first, text second) on both splits.
for frame in (df_train, df_test):
    frame.columns = [LABEL_COL, TEXT_COL]

In [9]:
# Replace missing values with empty strings so that every row carries a
# string in the text column before it is handed to the tokenizer.
df_train, df_test = df_train.fillna(""), df_test.fillna("")

The examples in the dataset are grouped into 14 categories:

In [10]:
# Number of training examples per news category, most frequent first.
df_train[LABEL_COL].value_counts()

india              1390
international       904
entertainment       285
sport               258
news                230
science             194
business             54
pakistan             43
southasia            42
institutional        19
social               18
china                14
multimedia           12
learningenglish       5
Name: news_category, dtype: int64

In [11]:
# Report how many rows each split contains.
n_train_examples = len(df_train)
n_test_examples = len(df_test)
print("Number of training examples: {}".format(n_train_examples))
print("Number of testing examples: {}".format(n_test_examples))

Number of training examples: 3468
Number of testing examples: 867


## Tokenize and Preprocess 
Before training, we tokenize the text documents and convert them to lists of tokens. The following steps instantiate a BERT tokenizer given the language, and tokenize the text of the training and testing sets. 
The cell after that then applies the following preprocessing steps:
- Convert the tokens into token indices corresponding to the BERT tokenizer's vocabulary
- Add the special tokens [CLS] and [SEP] to mark the beginning and end of a sentence
- Pad or truncate the token lists to the specified max length
- Return mask lists that indicate the positions of the padding
- Return token type id lists that indicate which sentence the tokens belong to (not needed for one-sequence classification)

*See the original [implementation](https://github.com/google-research/bert/blob/master/run_classifier.py) for more information on BERT's input format.*

In [12]:
# Build the BERT tokenizer for the configured language and tokenize the
# article text of both splits.
tokenizer = Tokenizer(LANGUAGE, TO_LOWER, BERT_CACHE_DIR)
tokens_train = tokenizer.tokenize(df_train[TEXT_COL].tolist())
tokens_test = tokenizer.tokenize(df_test[TEXT_COL].tolist())

# Map category names to integer ids. The encoder is fitted on the training
# labels only and then reused, so both splits share one name -> id mapping.
label_encoder = LabelEncoder().fit(df_train[LABEL_COL])
labels_train = label_encoder.transform(df_train[LABEL_COL])
labels_test = label_encoder.transform(df_test[LABEL_COL])
num_labels = np.unique(labels_train).size

100%|██████████| 3468/3468 [00:27<00:00, 123.97it/s]
100%|██████████| 867/867 [00:06<00:00, 125.47it/s]


In [13]:
# Turn the token lists into fixed-length model inputs. The third return value
# is discarded because it is not used for this single-text task.
# NOTE: this rebinds tokens_train / tokens_test, so re-run the tokenization
# cell before re-running this one.
preprocess = tokenizer.preprocess_classification_tokens
tokens_train, mask_train, _ = preprocess(tokens_train, MAX_LEN)
tokens_test, mask_test, _ = preprocess(tokens_test, MAX_LEN)

## Create Model
Next, we create a sequence classifier that loads a pre-trained BERT model.

In [14]:
# Instantiate the sequence classifier for the configured language, with one
# output per news category found in the training labels.
classifier = BERTSequenceClassifier(
    LANGUAGE,
    num_labels=num_labels,
    cache_dir=BERT_CACHE_DIR,
)

## Train
We train the classifier using the training examples. This involves fine-tuning the BERT Transformer and learning a linear classification layer on top of that:

In [15]:
# Collect the training arguments first so the timed section contains only
# the call to fit().
fit_kwargs = dict(
    token_ids=tokens_train,
    input_mask=mask_train,
    labels=labels_train,
    num_gpus=NUM_GPUS,
    num_epochs=NUM_EPOCHS,
    batch_size=BATCH_SIZE,
    warmup_proportion=WARMUP_PROPORTION,
    verbose=True,
)

with Timer() as t:
    classifier.fit(**fit_kwargs)

# t.interval is divided by 3600 to report the elapsed time in hours.
training_hours = t.interval / 3600
print("[Training time: {:.3f} hrs]".format(training_hours))

Iteration:   0%|          | 1/434 [00:02<18:06,  2.51s/it]

epoch:1/2; batch:1->44/434; average training loss:2.665879


Iteration:  10%|█         | 45/434 [00:32<04:27,  1.46it/s]

epoch:1/2; batch:45->88/434; average training loss:2.100084


Iteration:  21%|██        | 89/434 [01:02<03:57,  1.45it/s]

epoch:1/2; batch:89->132/434; average training loss:1.840270


Iteration:  31%|███       | 133/434 [01:33<03:27,  1.45it/s]

epoch:1/2; batch:133->176/434; average training loss:1.703301


Iteration:  41%|████      | 177/434 [02:03<02:57,  1.44it/s]

epoch:1/2; batch:177->220/434; average training loss:1.611534


Iteration:  51%|█████     | 221/434 [02:34<02:27,  1.44it/s]

epoch:1/2; batch:221->264/434; average training loss:1.581564


Iteration:  61%|██████    | 265/434 [03:04<01:56,  1.45it/s]

epoch:1/2; batch:265->308/434; average training loss:1.549611


Iteration:  71%|███████   | 309/434 [03:35<01:30,  1.39it/s]

epoch:1/2; batch:309->352/434; average training loss:1.507914


Iteration:  81%|████████▏ | 353/434 [04:07<00:59,  1.37it/s]

epoch:1/2; batch:353->396/434; average training loss:1.474626


Iteration:  91%|█████████▏| 397/434 [04:39<00:26,  1.40it/s]

epoch:1/2; batch:397->434/434; average training loss:1.453205


Iteration: 100%|██████████| 434/434 [05:06<00:00,  1.38it/s]
Iteration:   0%|          | 1/434 [00:00<05:57,  1.21it/s]

epoch:2/2; batch:1->44/434; average training loss:0.690934


Iteration:  10%|█         | 45/434 [00:34<05:07,  1.27it/s]

epoch:2/2; batch:45->88/434; average training loss:1.146616


Iteration:  21%|██        | 89/434 [01:08<04:27,  1.29it/s]

epoch:2/2; batch:89->132/434; average training loss:1.077667


Iteration:  31%|███       | 133/434 [01:43<03:54,  1.29it/s]

epoch:2/2; batch:133->176/434; average training loss:1.033159


Iteration:  41%|████      | 177/434 [02:18<03:29,  1.23it/s]

epoch:2/2; batch:177->220/434; average training loss:1.023701


Iteration:  51%|█████     | 221/434 [02:52<02:51,  1.24it/s]

epoch:2/2; batch:221->264/434; average training loss:1.049415


Iteration:  61%|██████    | 265/434 [03:23<01:57,  1.44it/s]

epoch:2/2; batch:265->308/434; average training loss:1.049472


Iteration:  71%|███████   | 309/434 [03:54<01:26,  1.44it/s]

epoch:2/2; batch:309->352/434; average training loss:1.027788


Iteration:  81%|████████▏ | 353/434 [04:24<00:55,  1.45it/s]

epoch:2/2; batch:353->396/434; average training loss:1.000812


Iteration:  91%|█████████▏| 397/434 [04:55<00:25,  1.44it/s]

epoch:2/2; batch:397->434/434; average training loss:0.998862


Iteration: 100%|██████████| 434/434 [05:20<00:00,  1.49it/s]

[Training time: 0.175 hrs]





## Score
We score the test set using the trained classifier:

In [16]:
# Predict a label for every test example, using the same GPU count and
# batch size as training.
preds = classifier.predict(
    token_ids=tokens_test,
    input_mask=mask_test,
    num_gpus=NUM_GPUS,
    batch_size=BATCH_SIZE,
)

Iteration: 100%|██████████| 109/109 [00:21<00:00,  5.31it/s]


## Evaluate Results
Finally, we compute the accuracy, precision, recall, and F1 metrics of the evaluation on the test set.

In [17]:
# Report on every class the encoder knows about, not only the ones that happen
# to occur in labels_test / preds. Without an explicit `labels` argument,
# classification_report infers the label set from the data and raises a
# ValueError when its size differs from len(target_names), which happens as
# soon as a rare category is missing from both the test labels and predictions.
class_ids = np.arange(len(label_encoder.classes_))
report = classification_report(
    labels_test,
    preds,
    labels=class_ids,
    target_names=label_encoder.classes_,
    output_dict=True,
)

# Overall fraction of test examples whose predicted label matches the truth.
accuracy = accuracy_score(labels_test, preds)
print("accuracy: {}".format(accuracy))

# Pretty-print the per-class and averaged precision / recall / F1 / support.
print(json.dumps(report, indent=4, sort_keys=True))

accuracy: 0.7104959630911188
{
    "business": {
        "f1-score": 0.0,
        "precision": 0.0,
        "recall": 0.0,
        "support": 7
    },
    "china": {
        "f1-score": 0.0,
        "precision": 0.0,
        "recall": 0.0,
        "support": 5
    },
    "entertainment": {
        "f1-score": 0.7133757961783439,
        "precision": 0.6511627906976745,
        "recall": 0.7887323943661971,
        "support": 71
    },
    "india": {
        "f1-score": 0.8192090395480226,
        "precision": 0.8262108262108262,
        "recall": 0.8123249299719888,
        "support": 357
    },
    "institutional": {
        "f1-score": 0.0,
        "precision": 0.0,
        "recall": 0.0,
        "support": 4
    },
    "international": {
        "f1-score": 0.6787878787878788,
        "precision": 0.5936395759717314,
        "recall": 0.7924528301886793,
        "support": 212
    },
    "learningenglish": {
        "f1-score": 0.0,
        "precision": 0.0,
        "recall": 0.0,
 

  'precision', 'predicted', average, warn_for)


In [18]:
# for testing: record the headline metrics in the notebook via scrapbook.
# Precision, recall and F1 are the macro averages from the report.
macro_avg = report["macro avg"]
scraps = [
    ("accuracy", accuracy),
    ("precision", macro_avg["precision"]),
    ("recall", macro_avg["recall"]),
    ("f1", macro_avg["f1-score"]),
]
for scrap_name, scrap_value in scraps:
    sb.glue(scrap_name, scrap_value)