This notebook loads the saved fine-tuned BERT model and evaluates its predictions on the validation and test sets, including a per-ideology breakdown of the test set.

In [None]:
# Mount Google Drive at /content/drive (Colab only); the dataset and saved model paths used below live under it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
pip install tensorflow_text

Collecting tensorflow_text
  Downloading tensorflow_text-2.7.3-cp37-cp37m-manylinux2010_x86_64.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 5.5 MB/s 
Installing collected packages: tensorflow-text
Successfully installed tensorflow-text-2.7.3


In [None]:
!pip install pyyaml h5py 



In [None]:
import os
import numpy as np
import pandas as pd
import re
import random
import sys
import time
import gc
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn import metrics

In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import logging
# Show INFO-level log records (e.g. the TF-Hub download messages) in the cell outputs.
logging.basicConfig(level=logging.INFO)

In [None]:
#Loading the dataframe from the JSON file (stored in pandas' 'split' orientation)
#The whole path will have to be specified for colab
basePath = '/content/drive/My Drive/Dissertation/Classification Models/BERT/'
dataset_file = os.path.join(basePath, "IndianNews_Dataset_for_testtrainsplit.json")
news_df = pd.read_json(dataset_file, orient='split', compression='infer')
news_df.shape

INFO:numexpr.utils:NumExpr defaulting to 2 threads.


(25524, 7)

In [None]:
#Integer codes for the values of the Ideology column
#Note that the codes have to start from 0 to use the keras to_categorical() function
id_dict = dict(centre=0, left=1, right=2)

In [None]:
#Checking the distinct labels in this column before mapping them to integer codes
news_df['Ideology'].unique()

array(['centre', 'left', 'right'], dtype=object)

In [None]:
#Using the dictionary to change the values in the column.
#Guarded so that re-running this cell does not map the already-encoded integers to NaN,
#and so that a label missing from id_dict fails loudly instead of silently becoming NaN.
already_encoded = news_df['Ideology'].isin(list(id_dict.values())).all()
if not already_encoded:
    ideology_codes = news_df['Ideology'].map(id_dict)
    if ideology_codes.isna().any():
        unknown_labels = sorted(news_df.loc[ideology_codes.isna(), 'Ideology'].astype(str).unique())
        raise ValueError(f"Unexpected Ideology labels (not in id_dict): {unknown_labels}")
    news_df['Ideology'] = ideology_codes.astype(int)
news_df['Ideology'].unique()

array([0, 1, 2])

Splitting the data into test and train sets.

In [None]:
# Hold out 25% of the articles as the test set, stratified on the ideology label.
X_train, X_test, y_train, y_test = train_test_split(
    news_df.drop(columns=['Ideology']),
    news_df['Ideology'],
    test_size=0.25,
    random_state=100,
    stratify=news_df['Ideology'],
)

Next, splitting the train set into train and validation sets

In [None]:
# 20% of the remaining training data becomes the validation set (stratified on the label).
# NOTE: this cell overwrites X_train / y_train with a subset of themselves, so it is not idempotent —
# re-run the previous train/test split cell before re-running this one.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=100, stratify = y_train)

In [None]:
# Report how many rows ended up in each split.
for caption, split_frame in (("Train set size: ", X_train),
                             ("Test set size: ", X_test),
                             ("Validation set size: ", X_val)):
    print(caption, len(split_frame))

Train set size:  15314
Test set size:  6381
Validation set size:  3829


In [None]:
# Check what type train_test_split returned for the labels (they are converted to DataFrames next).
type(y_train)

pandas.core.series.Series

In [None]:
#Wrapping each label Series in a single-column DataFrame named 'Ideology'
y_train = y_train.to_frame(name='Ideology')
y_test = y_test.to_frame(name='Ideology')
y_val = y_val.to_frame(name='Ideology')

In [None]:
# The full dataframe has already been split above; drop the name and run a collection pass to release memory.
del news_df
gc.collect()

188

### Loading BERT model

We will need a BERT Tokenization class.

In [None]:
# Download the BERT tokenization helper module into the working directory so it can be imported below.
# NOTE(review): this pulls from the `master` branch, so the file's contents (or even its location) can
# change between runs — consider pinning the URL to a specific tag or commit.
!wget --quiet https://raw.githubusercontent.com/tensorflow/models/master/official/nlp/bert/tokenization.py

In [None]:
!pip install sentencepiece

Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 5.2 MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.96


In [None]:
# `tokenization` is the helper module downloaded by the wget cell above.
import tokenization

# TF-Hub handle of the BERT encoder (uncased English, L-12 / H-768 / A-12, version 2).
module_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2'
# Wrap the hub module as a Keras layer; trainable=True marks its weights as trainable.
bert_layer = hub.KerasLayer(module_url, trainable=True)

INFO:absl:Using /tmp/tfhub_modules to cache modules.
INFO:absl:Downloading TF-Hub Module 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2'.
INFO:absl:Downloaded https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2, Total size: 421.50MB
INFO:absl:Downloaded TF-Hub Module 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2'.


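Try this other BERT model (TODO: the URL below is identical to the one already loaded above — replace it with the intended alternative):
module_url = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2'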

### Tokenization and Encoding

In [None]:
# Build the tokenizer from the vocabulary file and the lower-casing flag exposed by the loaded hub module.
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
do_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
tokenizer = tokenization.FullTokenizer(vocab_file, do_lower_case)

def bert_encode(texts, tokenizer, max_len=512):
    """Turn raw texts into fixed-length BERT inputs.

    Each text is tokenized, cut to at most ``max_len - 2`` tokens, wrapped in
    ``[CLS]`` ... ``[SEP]`` and right-padded with zeros up to ``max_len``.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Object providing ``tokenize(text)`` (returning a list of
            tokens) and ``convert_tokens_to_ids(tokens)`` (returning a list).
        max_len: Length of every encoded row, special tokens included.

    Returns:
        Tuple ``(token_ids, masks, segment_ids)`` of numpy arrays with one row
        per text. ``masks`` is 1 on real tokens and 0 on padding;
        ``segment_ids`` is all zeros.
    """
    id_rows, mask_rows, segment_rows = [], [], []

    for raw_text in texts:
        # Two positions are reserved for the [CLS] and [SEP] markers.
        pieces = tokenizer.tokenize(raw_text)[:max_len - 2]
        sequence = ["[CLS]"] + pieces + ["[SEP]"]
        used = len(sequence)
        padding = [0] * (max_len - used)

        id_rows.append(tokenizer.convert_tokens_to_ids(sequence) + padding)
        mask_rows.append([1] * used + padding)
        segment_rows.append([0] * max_len)

    return np.array(id_rows), np.array(mask_rows), np.array(segment_rows)


### Building the Model

In [None]:
def build_model(bert_layer, max_len=512):
    """Assemble and compile the 3-class classifier on top of a BERT layer.

    The three integer inputs (word ids, mask, segment ids) are fed to
    ``bert_layer``; the first position of its sequence output is passed through
    two tanh Dense layers (64 and 32 units, each followed by 20% dropout) and a
    3-way softmax.

    Args:
        bert_layer: Callable Keras layer that takes
            ``[input_word_ids, input_mask, segment_ids]`` and returns a
            ``(pooled_output, sequence_output)`` pair.
        max_len: Length of each input sequence.

    Returns:
        A ``tf.keras`` model compiled with Adam (learning rate 1e-5),
        categorical cross-entropy loss and an accuracy metric.
    """
    def _token_input(name):
        # All three inputs share the same shape and dtype; only the name differs.
        return tf.keras.Input(shape=(max_len,), dtype=tf.int32, name=name)

    input_word_ids = _token_input("input_word_ids")
    input_mask = _token_input("input_mask")
    segment_ids = _token_input("segment_ids")
    model_inputs = [input_word_ids, input_mask, segment_ids]

    _pooled, per_token = bert_layer(model_inputs)

    # Position 0 of the sequence output is used as the document representation.
    hidden = per_token[:, 0, :]
    for units in (64, 32):
        hidden = tf.keras.layers.Dense(units, activation='tanh')(hidden)
        hidden = tf.keras.layers.Dropout(0.2)(hidden)
    class_probabilities = tf.keras.layers.Dense(3, activation='softmax')(hidden)

    model = tf.keras.models.Model(inputs=model_inputs, outputs=class_probabilities)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )

    return model

In [None]:
max_len = 512  # encoded sequence length passed to bert_encode / build_model

start_time = time.time()

# NOTE(review): train_input is not passed to any later cell visible in this notebook (only val_input and
# test_input are used for prediction) — presumably a leftover from training; confirm before removing.
train_input = bert_encode(X_train['Title+Article'].values, tokenizer, max_len=max_len)
val_input = bert_encode(X_val['Title+Article'].values, tokenizer, max_len=max_len)

print("Time taken for encoding (in seconds): ", time.time()-start_time)

Time taken for encoding (in seconds):  230.08318066596985


In [None]:
#One-hot encoding the integer ideology codes (0/1/2) of the training labels into 3 columns
train_labels = tf.keras.utils.to_categorical(y_train['Ideology'].values, num_classes=3)

In [None]:
# Build the classifier architecture and print its layer summary.
# NOTE(review): the prediction cells in this notebook use the separately loaded `bert_model_final`,
# not this `model` — confirm it is only needed here for the summary.
model = build_model(bert_layer, max_len=max_len)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_word_ids (InputLayer)    [(None, 512)]        0           []                               
                                                                                                  
 input_mask (InputLayer)        [(None, 512)]        0           []                               
                                                                                                  
 segment_ids (InputLayer)       [(None, 512)]        0           []                               
                                                                                                  
 keras_layer (KerasLayer)       [(None, 768),        109482241   ['input_word_ids[0][0]',         
                                 (None, 512, 768)]                'input_mask[0][0]',         

Loading the saved model

In [None]:
# Location of the saved fine-tuned model, relative to the Drive base path defined earlier.
path = basePath + "final_saved_model/"

In [None]:
#Loading the saved fine-tuned model from `path`; this is the model used for all predictions below
bert_model_final = tf.keras.models.load_model(path)

### Making predictions on the validation set

In [None]:
#Making predictions on the validation set (the encoded inputs come from bert_encode above)
val_pred_new = bert_model_final.predict(val_input)

In [None]:
#Viewing the classification report (predicted class = index of the highest score in each row)
val_true = y_val['Ideology'].tolist()
val_predicted = np.argmax(val_pred_new, axis=1)
print(classification_report(val_true, val_predicted))

              precision    recall  f1-score   support

           0       0.75      0.73      0.74      1196
           1       0.81      0.88      0.84      1440
           2       0.79      0.72      0.75      1193

    accuracy                           0.78      3829
   macro avg       0.78      0.78      0.78      3829
weighted avg       0.78      0.78      0.78      3829



In [None]:
#Printing metrics for predictions (true labels and predicted classes are computed once and reused)
val_true = y_val['Ideology'].tolist()
val_predicted = np.argmax(val_pred_new, axis=1)
print("Confusion matrix: \n", metrics.confusion_matrix(val_true, val_predicted))
print("\nAccuracy: ", metrics.accuracy_score(val_true, val_predicted))
print("\nWeighted f1-score: ", metrics.f1_score(val_true, val_predicted, average='weighted'))

Confusion matrix: 
 [[ 871  158  167]
 [ 101 1270   69]
 [ 188  143  862]]

Accuracy:  0.7842778793418648

Weighted f1-score:  0.7826584346789616


### Making predictions on the test set

In [None]:
#Tokenizing and encoding the test set with the same tokenizer and max_len as the validation set
test_input = bert_encode(X_test['Title+Article'].values, tokenizer, max_len=max_len)

In [None]:
#Making predictions on the test set with the loaded fine-tuned model
test_pred_new = bert_model_final.predict(test_input)

In [None]:
#Viewing the classification report (predicted class = index of the highest score in each row)
test_true = y_test['Ideology'].tolist()
test_predicted = np.argmax(test_pred_new, axis=1)
print(classification_report(test_true, test_predicted))

              precision    recall  f1-score   support

           0       0.75      0.69      0.72      1993
           1       0.79      0.88      0.84      2399
           2       0.78      0.73      0.75      1989

    accuracy                           0.78      6381
   macro avg       0.77      0.77      0.77      6381
weighted avg       0.78      0.78      0.78      6381



In [None]:
#Printing metrics for predictions (true labels and predicted classes are computed once and reused)
test_true = y_test['Ideology'].tolist()
test_predicted = np.argmax(test_pred_new, axis=1)
print("Confusion matrix: \n", metrics.confusion_matrix(test_true, test_predicted))
print("\nAccuracy: ", metrics.accuracy_score(test_true, test_predicted))
print("\nWeighted f1-score: ", metrics.f1_score(test_true, test_predicted, average='weighted'))

Confusion matrix: 
 [[1385  320  288]
 [ 156 2118  125]
 [ 306  227 1456]]

Accuracy:  0.7771509167842031

Weighted f1-score:  0.7750655090903723


Next, let's get predictions for each ideology label in the test set.

In [None]:
#Creating a method to get subsets of a dataframe for validation
def subset_creator(df, labels_series, ideology_label):
  """Select the articles of ``df`` published by outlets of one ideology and encode them.

  Relies on the notebook-level ``bert_encode``, ``tokenizer`` and ``max_len``.

  Args:
    df: DataFrame with 'News_Outlet' and 'Title+Article' columns.
    labels_series: Labels (Series or DataFrame) that share ``df``'s index.
    ideology_label: One of 'left', 'centre' or 'right'.

  Returns:
    Tuple ``(subset_tokenized, subset_labels)``: the ``bert_encode`` output for
    the matching articles, and the labels of those same rows in the same order.

  Raises:
    ValueError: If ``ideology_label`` is not a known ideology.
    KeyError: If a selected article has no entry in ``labels_series``.
  """

  #Defining a dictionary that maps news outlets to their political ideology
  outlet_ideology_dict = {"left": "ndtv|thewire|scroll|telegraph|thestatesman",
                        "centre": "TOI|theprint|IndiaToday|asiangage|freepressjournal",
                        "right": "opindia|NIE|oneindia|dailypioneer|indiatv"}

  #Failing early with a clear message instead of passing None to str.contains
  if ideology_label not in outlet_ideology_dict:
    raise ValueError("Unknown ideology_label %r; expected one of %s"
                     % (ideology_label, sorted(outlet_ideology_dict)))

  #Computing the outlet mask once; na=False treats a missing outlet name as "no match"
  outlet_mask = df['News_Outlet'].str.contains(outlet_ideology_dict[ideology_label], na=False)

  #Converting text to sequences
  subset_tokenized = bert_encode(df.loc[outlet_mask, 'Title+Article'], tokenizer, max_len=max_len)

  #Getting the indices for this subset of data
  subset_indices = list(df.index[outlet_mask])

  #Getting the labels by index lookup so that they stay in the same row order as the encoded articles
  subset_labels = labels_series.loc[subset_indices]

  return subset_tokenized, subset_labels

#### Centrist

In [None]:
#Getting the tokenized data and the matching labels for test articles from centrist outlets
subset_tokenized, subset_ground = subset_creator(X_test, y_test, 'centre')

#Triggering garbage collection
gc.collect()

3631

In [None]:
#Making predictions for the centrist-outlet subset
centre_test_pred = bert_model_final.predict(subset_tokenized)

In [None]:
#Calculating and storing metrics (true labels and predicted classes are computed once and reused)
centre_true = list(subset_ground['Ideology'])
centre_predicted = np.argmax(centre_test_pred, axis=1)
centre_scores = [
    metrics.accuracy_score(centre_true, centre_predicted),
    metrics.f1_score(centre_true, centre_predicted, average='weighted'),
]

#Printing metrics for predictions
print("Confusion matrix for centrist outlets: \n", metrics.confusion_matrix(centre_true, centre_predicted))
print("\nAccuracy for centrist outlets: ", centre_scores[0])
print("\nWeighted f1-score for centrist outlets: ", centre_scores[1])

Confusion matrix for centrist outlets: 
 [[1385  320  288]
 [   0    0    0]
 [   0    0    0]]

Accuracy for centrist outlets:  0.6949322629202208

Weighted f1-score for centrist outlets:  0.8200118413262286


#### Left wing

In [None]:
#Getting the tokenized data and the matching labels for test articles from left-wing outlets
subset_tokenized, subset_ground = subset_creator(X_test, y_test, 'left')

#Triggering garbage collection
gc.collect()

0

In [None]:
#Making predictions for the left-wing-outlet subset
left_test_pred = bert_model_final.predict(subset_tokenized)

In [None]:
#Calculating and storing metrics (true labels and predicted classes are computed once and reused)
left_true = list(subset_ground['Ideology'])
left_predicted = np.argmax(left_test_pred, axis=1)
left_scores = [
    metrics.accuracy_score(left_true, left_predicted),
    metrics.f1_score(left_true, left_predicted, average='weighted'),
]

#Printing metrics for predictions
#(the printed labels previously said "centrist" — a copy-paste slip from the section above)
print("Confusion matrix for left-wing outlets: \n", metrics.confusion_matrix(left_true, left_predicted))
print("\nAccuracy for left-wing outlets: ", left_scores[0])
print("\nWeighted f1-score for left-wing outlets: ", left_scores[1])

Confusion matrix for centrist outlets: 
 [[   0    0    0]
 [ 156 2118  125]
 [   0    0    0]]

Accuracy for centrist outlets:  0.8828678616090038

Weighted f1-score for centrist outlets:  0.9377905689617002


#### Right wing

In [None]:
#Getting the tokenized data and the matching labels for test articles from right-wing outlets
subset_tokenized, subset_ground = subset_creator(X_test, y_test, 'right')

#Triggering garbage collection
gc.collect()

880

In [None]:
#Making predictions for the right-wing-outlet subset
right_test_pred = bert_model_final.predict(subset_tokenized)

In [None]:
#Calculating and storing metrics (true labels and predicted classes are computed once and reused)
right_true = list(subset_ground['Ideology'])
right_predicted = np.argmax(right_test_pred, axis=1)
right_scores = [
    metrics.accuracy_score(right_true, right_predicted),
    metrics.f1_score(right_true, right_predicted, average='weighted'),
]

#Printing metrics for predictions
#(the printed labels previously said "centrist" — a copy-paste slip from the centrist section)
print("Confusion matrix for right-wing outlets: \n", metrics.confusion_matrix(right_true, right_predicted))
print("\nAccuracy for right-wing outlets: ", right_scores[0])
print("\nWeighted f1-score for right-wing outlets: ", right_scores[1])

Confusion matrix for centrist outlets: 
 [[   0    0    0]
 [   0    0    0]
 [ 306  227 1456]]

Accuracy for centrist outlets:  0.7320261437908496

Weighted f1-score for centrist outlets:  0.8452830188679245
