In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
pip install tensorflow_text

Collecting tensorflow_text
  Downloading tensorflow_text-2.6.0-cp37-cp37m-manylinux1_x86_64.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 5.4 MB/s 
Installing collected packages: tensorflow-text
Successfully installed tensorflow-text-2.6.0


In [3]:
!pip install pyyaml h5py 



In [4]:
import os
import numpy as np
import pandas as pd
import re
import random
import sys
import time
import gc
from datetime import datetime
from sklearn.model_selection import train_test_split

In [5]:
import tensorflow as tf
import tensorflow_hub as hub
import logging
logging.basicConfig(level=logging.INFO)

In [8]:
#Loading the JSON file from the dataframe
#The whole path will have to be specified for colab
basePath = '/content/drive/My Drive/BERT model/'
news_df = pd.read_json(basePath + "IndianNews_Dataset_for_testtrainsplit.json", orient ='split', compression = 'infer')
news_df.shape

INFO:numexpr.utils:NumExpr defaulting to 2 threads.


(25524, 4)

In [9]:
#Creating a dictionary to change the values of the Ideology column
#Note that the number have to start from 0 to use the keras to_categorical() function
id_dict = {'centre': 0, 'left': 1, 'right': 2}

In [10]:
#Checking the values in this column before mapping
news_df['Ideology'].unique()

array(['centre', 'left', 'right'], dtype=object)

In [11]:
#Using the dictionary to change the values in the column
news_df['Ideology'] = news_df['Ideology'].map(id_dict)
news_df['Ideology'].unique()

array([0, 1, 2])

Splitting the data into test and train sets.

In [12]:
X_train, X_test, y_train, y_test = train_test_split(news_df.drop(['Ideology'], axis=1), news_df['Ideology'], 
                                                    test_size=0.25, random_state=100, stratify = news_df['Ideology'])

Next, splitting the train set into train and validation sets

In [13]:
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=100, stratify = y_train)

In [14]:
print("Train set size: ", X_train.shape[0])
print("Test set size: ", X_test.shape[0])
print("Validation set size: ", X_val.shape[0])

Train set size:  15314
Test set size:  6381
Validation set size:  3829


In [15]:
type(y_train)

pandas.core.series.Series

In [16]:
#Converting the y_x from Series to Dataframes
y_train = pd.DataFrame({'Ideology': y_train})
y_test = pd.DataFrame({'Ideology': y_test})
y_val = pd.DataFrame({'Ideology': y_val})

In [17]:
del news_df
gc.collect()

74

### Loading BERT model

We will need a BERT Tokenization class.

In [18]:
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py

In [19]:
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 5.2 MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.96


In [20]:
import tokenization

module_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2'
bert_layer = hub.KerasLayer(module_url, trainable=True)

INFO:absl:Using /tmp/tfhub_modules to cache modules.
INFO:absl:Downloading TF-Hub Module 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2'.
INFO:absl:Downloaded https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2, Total size: 421.50MB
INFO:absl:Downloaded TF-Hub Module 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2'.


Try this other BERT model:
module_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2'

### Tokenization and Encoding

In [21]:
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

def bert_encode(texts, tokenizer, max_len=512):
    all_tokens = []
    all_masks = []
    all_segments = []
    
    for text in texts:
        text = tokenizer.tokenize(text)
            
        text = text[:max_len-2]
        input_sequence = ["[CLS]"] + text + ["[SEP]"]
        pad_len = max_len - len(input_sequence)
        
        tokens = tokenizer.convert_tokens_to_ids(input_sequence) + [0] * pad_len
        pad_masks = [1] * len(input_sequence) + [0] * pad_len
        segment_ids = [0] * max_len
        
        all_tokens.append(tokens)
        all_masks.append(pad_masks)
        all_segments.append(segment_ids)
    
    return np.array(all_tokens), np.array(all_masks), np.array(all_segments)


### Building the Model

In [22]:
def build_model(bert_layer, max_len=512):
    input_word_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")
    input_mask = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="input_mask")
    segment_ids = tf.keras.Input(shape=(max_len,), dtype=tf.int32, name="segment_ids")

    pooled_output, sequence_output = bert_layer([input_word_ids, input_mask, segment_ids])
    clf_output = sequence_output[:, 0, :]
    net = tf.keras.layers.Dense(64, activation='relu')(clf_output)
    net = tf.keras.layers.Dropout(0.2)(net)
    net = tf.keras.layers.Dense(32, activation='relu')(net)
    net = tf.keras.layers.Dropout(0.2)(net)
    out = tf.keras.layers.Dense(3, activation='softmax')(net)

    model = tf.keras.models.Model(inputs=[input_word_ids, input_mask, segment_ids], outputs=out)
    model.compile(tf.keras.optimizers.Adam(learning_rate=1e-5), loss='categorical_crossentropy', metrics=['accuracy'])
    
    return model

In [23]:
max_len = 512

start_time = time.time()

train_input = bert_encode(X_train['Title+Article'].values, tokenizer, max_len=max_len)
val_input = bert_encode(X_val['Title+Article'].values, tokenizer, max_len=max_len)

print("Time taken for encoding: ", time.time()-start_time)

Time taken for encoding:  213.89260411262512


In [24]:
train_labels = tf.keras.utils.to_categorical(y_train['Ideology'].values, num_classes=3)

In [25]:
train_input

(array([[  101,  7148,  9453, ...,  2097,  5466,   102],
        [  101, 24954,  8011, ...,  1997,  2119,   102],
        [  101,  7842, 25990, ...,  2212,  4105,   102],
        ...,
        [  101, 10195,  9054, ...,     0,     0,     0],
        [  101,  3720, 16444, ...,     0,     0,     0],
        [  101,  2339,  2053, ...,  1037,  2112,   102]]),
 array([[1, 1, 1, ..., 1, 1, 1],
        [1, 1, 1, ..., 1, 1, 1],
        [1, 1, 1, ..., 1, 1, 1],
        ...,
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 0, 0, 0],
        [1, 1, 1, ..., 1, 1, 1]]),
 array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]]))

In [26]:
model = build_model(bert_layer, max_len=max_len)
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 512)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 512)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 512)]        0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 768), (None, 109482241   input_word_ids[0][0]             
                                                                 input_mask[0][0]             

Getting the path where the models are stored

In [28]:
checkpoint_path = basePath + "model_1_v2/saved models/cp-model.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

Looking at the checkpoints on the drive:

In [29]:
os.listdir(checkpoint_dir)

['cp-0002.ckpt.index', 'cp-0002.ckpt.data-00000-of-00001']

In [38]:
# Choosing the latest model
latest = tf.train.latest_checkpoint(checkpoint_dir)
latest

'/content/drive/My Drive/BERT model/model_1_v2/saved models/cp-0002.ckpt'

In [39]:
# Create a new model instance
model = build_model(bert_layer, max_len=max_len)

# Loading the saved weights to the model
model.load_weights(latest)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7fa6014923d0>

Let's resume training

In [40]:
#Setting the path where the model weights should be saved
checkpoint_path = basePath + "model_1_v2/saved models/cp-model.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

#Defining a few variables to determine the number of times the model should be saved
batch_size = 8        #This is the batch size that is possible on colabs's GPU
validation_split=0.2

#Creating callbacks for saving the model and for early stopping
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path, monitor='val_accuracy', save_weights_only=True, 
                                                 save_freq='epoch', verbose=1)
earlystopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=5, verbose=1)

train_history = model.fit(
    train_input, train_labels, 
    validation_split=validation_split,
    epochs=3,
    callbacks=[cp_callback, earlystopping],
    batch_size=batch_size,
    verbose=1, shuffle=True)

Epoch 1/3

Epoch 00001: saving model to /content/drive/My Drive/BERT model/model_1_v2/saved models/cp-model.ckpt
Epoch 2/3

Epoch 00002: saving model to /content/drive/My Drive/BERT model/model_1_v2/saved models/cp-model.ckpt
Epoch 3/3

Epoch 00003: saving model to /content/drive/My Drive/BERT model/model_1_v2/saved models/cp-model.ckpt


Since the difference between train and validation accuracy is already quite high, let's not train the model for another epoch.

Predictions for the validation data:

In [41]:
val_pred = model.predict(val_input)

In [42]:
val_pred

array([[6.7592603e-01, 9.0939283e-02, 2.3313466e-01],
       [1.4551640e-02, 2.2299024e-03, 9.8321837e-01],
       [3.7182182e-01, 5.1419479e-01, 1.1398335e-01],
       ...,
       [9.6483696e-01, 3.1228198e-02, 3.9348998e-03],
       [6.5030134e-01, 5.7233557e-02, 2.9246512e-01],
       [6.4957429e-05, 9.9992120e-01, 1.3799337e-05]], dtype=float32)

In [55]:
from sklearn.metrics import classification_report

In [58]:
print(classification_report(list(y_val['Ideology']), [list(i).index(max(i)) for i in val_pred]))

              precision    recall  f1-score   support

           0       0.73      0.64      0.68      1196
           1       0.85      0.84      0.84      1440
           2       0.70      0.80      0.74      1193

    accuracy                           0.76      3829
   macro avg       0.76      0.76      0.76      3829
weighted avg       0.77      0.76      0.76      3829



The macro f1-score is 0.76, and the accuracy is 0.76.

In [59]:
model.save('/content/drive/My Drive/BERT model/model_1_v2/saved models/final/final_saved_model') 



INFO:tensorflow:Assets written to: /content/drive/My Drive/BERT model/model_1_v2/saved models/final/final_saved_model/assets


INFO:tensorflow:Assets written to: /content/drive/My Drive/BERT model/model_1_v2/saved models/final/final_saved_model/assets


Loading the saved model

In [60]:
new_model = tf.keras.models.load_model('/content/drive/My Drive/BERT model/model_1_v2/saved models/final/final_saved_model')

# Check its architecture
new_model.summary()

Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 512)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 512)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 512)]        0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 768), (None, 109482241   input_word_ids[0][0]             
                                                                 input_mask[0][0]           

In [61]:
val_pred_new = new_model.predict(val_input)

In [62]:
print(classification_report(list(y_val['Ideology']), [list(i).index(max(i)) for i in val_pred_new]))

              precision    recall  f1-score   support

           0       0.73      0.64      0.68      1196
           1       0.85      0.84      0.84      1440
           2       0.70      0.80      0.74      1193

    accuracy                           0.76      3829
   macro avg       0.76      0.76      0.76      3829
weighted avg       0.77      0.76      0.76      3829



The results are the same, as expected.