## BBC News Classification

https://www.kaggle.com/competitions/learn-ai-bbc/data

In [1]:
import string
import re
import pandas as pd
import numpy as np
import math
import random
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from wordcloud import WordCloud

# nltk imports
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.tokenize import word_tokenize  # tokenize the text == the text is splitted into words in list
from nltk.corpus import stopwords  # this contain common stop words that has no effect in analysis
from nltk.stem import WordNetLemmatizer  # Lemmatization is the process of grouping together the different inflected forms of a word so they can be analyzed as a single item

# sklearn imports
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer  # bags of words and TF IDF
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report, confusion_matrix, make_scorer  # classification Metrics

from sklearn.model_selection import train_test_split  # splitting dataset

from sklearn import pipeline
from sklearn import linear_model
from sklearn.model_selection import StratifiedKFold

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
import keras
from keras import backend as K
from tensorflow.keras.layers import Embedding
from keras.models import Model
from keras.preprocessing.text import Tokenizer
from keras_preprocessing.sequence import pad_sequences
from keras.preprocessing import sequence, text
from keras.models import Sequential
from tensorflow.keras.layers import LSTM, GRU
from keras.layers.core import Dense, Activation, Dropout
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.utils import np_utils
from keras.callbacks import EarlyStopping , ReduceLROnPlateau
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

In [3]:
cd E:\BBC News

E:\BBC News


In [4]:
# Load the training split. The path is relative, so it is resolved against the
# current working directory (changed by the `cd` cell earlier in the notebook).
BBC_Data = pd.read_csv('BBC News Train.csv')

In [5]:
BBC_Data.head(10)

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business
5,1582,howard truanted to play snooker conservative...,politics
6,651,wales silent on grand slam talk rhys williams ...,sport
7,1797,french honour for director parker british film...,entertainment
8,2034,car giant hit by mercedes slump a slump in pro...,business
9,1866,fockers fuel festive film chart comedy meet th...,entertainment


In [6]:
BBC_Data.Category.value_counts()

sport            346
business         336
politics         274
entertainment    273
tech             261
Name: Category, dtype: int64

In [7]:
# Integer id assigned to each news category. Ids start at 1, not 0.
dicto = {'sport': 1,'business': 2, 'politics': 3, 'entertainment': 4, 'tech': 5}

# Encode only while the column still holds category names. Series.map turns every
# value that is missing from the mapping into NaN, so pushing the already-encoded
# integers through `dicto` again (re-running this cell) would wipe out all labels.
if not pd.api.types.is_numeric_dtype(BBC_Data.Category):
    unknown_categories = set(BBC_Data.Category.unique()) - set(dicto)
    if unknown_categories:
        # Fail loudly instead of silently producing NaN labels.
        raise ValueError('Categories missing from dicto: {}'.format(unknown_categories))
    BBC_Data.Category = BBC_Data.Category.map(dicto)

In [8]:
BBC_Data.head(10)

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,2
1,154,german business confidence slides german busin...,2
2,1101,bbc poll indicates economic gloom citizens in ...,2
3,1976,lifestyle governs mobile choice faster bett...,5
4,917,enron bosses in $168m payout eighteen former e...,2
5,1582,howard truanted to play snooker conservative...,3
6,651,wales silent on grand slam talk rhys williams ...,1
7,1797,french honour for director parker british film...,4
8,2034,car giant hit by mercedes slump a slump in pro...,2
9,1866,fockers fuel festive film chart comedy meet th...,4


In [11]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [33]:
# English stop words to remove. 'not', 'no' and every entry containing "n't" are
# left out of the list, so the stop-word filter never removes them.
stop_words = [i for i in stopwords.words('english') if "n't" not in i and i not in ('not','no')]

def process_text(text):
    '''
    Clean one document and return it as a single space-separated string.

    Steps, applied token by token:
      1. tokenize with NLTK's word_tokenize
      2. delete every character that is not an ASCII letter or digit
      3. keep only tokens that are purely alphabetic (drops empty and numeric tokens)
      4. lower-case
      5. drop tokens found in `stop_words`
      6. lemmatize with WordNetLemmatizer
    '''
    lemmatizer = WordNetLemmatizer()  # built once per call instead of once per word
    stop_set = set(stop_words)        # constant-time membership test instead of a list scan

    cleaned = []
    for token in word_tokenize(text):
        # The regex already removes all punctuation, so no separate
        # str.translate(string.punctuation) pass is needed.
        token = re.sub('[^A-Za-z0-9]+', '', token)
        if not token.isalpha():
            continue
        token = token.lower()
        if token in stop_set:
            continue
        # e.g. "persons" and "person" are both mapped to "person"
        cleaned.append(lemmatizer.lemmatize(token))
    return ' '.join(cleaned)

In [34]:
# Preview the cleaning on the first article. The sample gets its own name so the
# `text` alias bound by the import cells at the top of the notebook is not overwritten.
sample_text = BBC_Data.Text[0]
process_text(sample_text)



In [35]:
# Overwrite every article with its cleaned form produced by process_text,
# then display ten randomly drawn rows as a spot check.
BBC_Data['Text'] = BBC_Data['Text'].map(process_text)
BBC_Data['Text'].sample(n=10)

363     ronaldo considering new contract manchester un...
1243    jarvis sell tube stake spain share engineering...
326     ore cost hit global steel firm share steel fir...
807     uk firm face venezuelan land row venezuelan au...
150     collins call chamber return world champion kim...
1453    rock group korn guitarist quits guitarist u ro...
1357    hague sixfigure earnings shown reward leaving ...
366     lib dems bold election policy charles kennedy ...
1213    benitez deflects blame dudek liverpool manager...
819     new rule tackle sham wedding new rule marriage...
Name: Text, dtype: object

In [41]:
def cal_num_of_words(text):
    '''
    Return the number of whitespace-separated words in `text`.

    The string is split on runs of whitespace rather than on single spaces, so an
    empty string counts as 0 words and leading, trailing or repeated spaces do not
    add phantom empty "words" to the total.
    '''
    return len(text.split())

In [43]:
BBC_Data['Text'].apply(cal_num_of_words).max()

1653

In [106]:
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_words = 5000  # num_words passed to the Keras Tokenizer
max_len=100       # every sequence is padded / truncated to this many tokens

def tokenize_pad_sequences(text, tokenizer=None):
    '''
    Turn documents into fixed-length integer sequences.

    Parameters
    ----------
    text : iterable of str
        The documents to encode.
    tokenizer : fitted Keras Tokenizer, optional
        When given, it is used as-is (nothing is refitted), which lets new data be
        encoded with the vocabulary learned earlier. When omitted, a new Tokenizer
        limited to `max_words` is created and fitted on `text`.

    Returns
    -------
    (X, tokenizer)
        X is the result of pad_sequences with maxlen=`max_len` and zero padding
        appended at the end of short sequences; tokenizer is the one that was used.
    '''
    if tokenizer is None:
        # Text tokenization: learn the vocabulary from the given documents
        tokenizer = Tokenizer(num_words=max_words, lower=True, split=' ')
        tokenizer.fit_on_texts(text)
    # Transform each document into a sequence of integers
    X = tokenizer.texts_to_sequences(text)
    # Pad (at the end) / truncate every sequence to the same length
    X = pad_sequences(X, padding='post', maxlen=max_len)
    return X, tokenizer

print('Before Tokenization & Padding \n', BBC_Data.loc[10, 'Text'])
X, tokenizer = tokenize_pad_sequences(BBC_Data['Text'])
print('After Tokenization & Padding \n', X[10])

Before Tokenization & Padding 
 blair reject iraq advice call tony blair rejected call publication advice legality iraq war amid growing call investigation prime minister told monthly press conference matter dealt attorney general earlier conservative mp michael mate joined call probe claim lord goldsmith statement parliament drawn number mr blair said statement fair summary lord goldsmith opinion lord goldsmith said say dealt time time time mr blair told monthly news conference downing street refused answer question issue saying dealt literally score time position not changed lord goldsmith denied leaned say word written government refuse publish advice legality war saying paper always kept confidential mr mate member common intelligence security committee part butler inquiry prewar intelligence told bbc friday general rule right not absolute rule said occasion advice published recently regarding prince charles marriage plan government could not pick choose use convention said mr mate

In [51]:
import pickle

# Write the fitted tokenizer to 'tokenizer.pickle' in the working directory ...
with open('tokenizer.pickle', 'wb') as out_file:
    pickle.dump(tokenizer, out_file, protocol=pickle.HIGHEST_PROTOCOL)

# ... and read it straight back, rebinding `tokenizer` to the unpickled object.
# NOTE: pickle.load executes code from the file; only load pickles you created yourself.
with open('tokenizer.pickle', 'rb') as in_file:
    tokenizer = pickle.load(in_file)

In [107]:
# One indicator column per distinct category id.
y = pd.get_dummies(BBC_Data.Category)

# Keep 25% of the rows aside for validation; the fixed seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    random_state=1,
    test_size=0.25,
)

print('Train Set ->', X_train.shape, y_train.shape)
print('Validation Set ->', X_val.shape, y_val.shape)

Train Set -> (1117, 100) (1117, 5)
Validation Set -> (373, 100) (373, 5)


In [53]:
import keras.backend as K

def f1_score(precision, recall):
    '''
    Combine a precision and a recall value into an F1 score.

    Computes 2 * precision * recall / (precision + recall + epsilon), where epsilon
    is the Keras backend fuzz factor that keeps the denominator away from zero.

    NOTE: this name is also imported from sklearn.metrics elsewhere in the notebook;
    whichever binding ran last is the one in effect.
    '''
    numerator = 2*(precision*recall)
    denominator = precision+recall+K.epsilon()
    return numerator/denominator

In [108]:
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, MaxPooling1D, Bidirectional, LSTM, Dense, Dropout
from keras.metrics import Precision, Recall
from keras.optimizers import SGD
from keras import datasets

from keras.callbacks import LearningRateScheduler
from keras.callbacks import History

from keras import losses

# --- hyper-parameters -------------------------------------------------------
vocab_size = 5000        # rows of the embedding table
embedding_size = 64      # width of each embedding vector
epochs = 100
learning_rate = 0.1
momentum = 0.8
decay_rate = learning_rate / epochs   # learning-rate decay handed to SGD

# --- optimizer --------------------------------------------------------------
sgd = SGD(
    learning_rate=learning_rate,
    momentum=momentum,
    decay=decay_rate,
    nesterov=False,
)

# --- network: embedding -> 1D convolution -> pooling -> BiLSTM -> softmax ---
LSTM_model = Sequential([
    Embedding(vocab_size, embedding_size, input_length=max_len),
    Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'),
    MaxPooling1D(pool_size=2),
    Bidirectional(LSTM(64)),
    # (a Dropout(0.4) layer was tried at this position and is left disabled)
    Dense(5, activation='softmax'),   # one output unit per news category
])

LSTM_model.compile(
    loss='categorical_crossentropy',
    optimizer=sgd,
    metrics=['accuracy', Precision(), Recall()],
)

In [56]:
import tensorflow as tf
#tf.keras.utils.plot_model(LSTM_model, show_shapes=True)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model to work.


In [109]:
# summary() prints the layer table itself and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line to the output.
LSTM_model.summary()

# The model is already compiled (same loss, optimizer and metrics) in the cell that
# builds it, so it is not compiled a second time here.

# Train model
batch_size = 64
history = LSTM_model.fit(X_train, y_train, validation_data=(X_val, y_val),
                         batch_size=batch_size, epochs=epochs, verbose=1)

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, 100, 64)           320000    
                                                                 
 conv1d_7 (Conv1D)           (None, 100, 32)           6176      
                                                                 
 max_pooling1d_7 (MaxPooling  (None, 50, 32)           0         
 1D)                                                             
                                                                 
 bidirectional_7 (Bidirectio  (None, 128)              49664     
 nal)                                                            
                                                                 
 dense_13 (Dense)            (None, 5)                 645       
                                                                 
Total params: 376,485
Trainable params: 376,485
Non-tr

Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100


Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100


Epoch 99/100
Epoch 100/100


In [112]:
# Call fit once more on the same LSTM_model object for another 50 epochs.
# `history` is rebound, so it now refers to the log of this run only.
history = LSTM_model.fit(
    X_train,
    y_train,
    epochs=50,
    batch_size=batch_size,
    validation_data=(X_val, y_val),
    verbose=1,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50


Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [113]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Score the model on the held-out validation split (X_val / y_val).
y_pred_probs = LSTM_model.predict(X_val)
y_pred = y_pred_probs.argmax(axis=1)

# y_val is one-hot encoded: the position of the 1 in each row is the class index.
y_val_idx = y_val.values.argmax(axis=1)

accuracy = accuracy_score(y_val_idx, y_pred)
# 'weighted' averages the per-class scores using each class's support as weight.
precision = precision_score(y_val_idx, y_pred, average='weighted')
recall = recall_score(y_val_idx, y_pred, average='weighted')
f1 = f1_score(y_val_idx, y_pred, average='weighted')

# Report all four metrics with four decimals
print('Accuracy : {:.4f}'.format(accuracy))
print('Precision: {:.4f}'.format(precision))
print('Recall   : {:.4f}'.format(recall))
print('F1 Score : {:.4f}'.format(f1))

Accuracy : 0.8874
Precision: 0.8905
Recall   : 0.8874
F1 Score : 0.8881


In [114]:
# train-validation split on the cleaned text and the integer category ids.
# Stratifying on the label keeps the class proportions equal in both parts, and the
# fixed random_state (same seed as the earlier split in this notebook) makes the
# split, and every score computed from it, repeatable from run to run.
X_train, X_val, y_train, y_val = train_test_split(BBC_Data['Text'],BBC_Data['Category'],
                                                  stratify=BBC_Data['Category'],test_size=0.20,
                                                  random_state=1)

In [115]:
# Rebuild each split as a two-column frame ('Text', 'Category') with a fresh 0..n-1
# index, pairing every document with its label in the order the split returned them.
df_train, df_val = (
    pd.DataFrame({'Text': list(texts), 'Category': list(labels)})
    for texts, labels in ((X_train, y_train), (X_val, y_val))
)

In [116]:
# One-hot encode the integer category ids of both splits.
# The ids run from 1 to 5, so the encoded arrays have 6 columns (index 0 is never set).
y_train, y_val = (
    tf.keras.utils.to_categorical(frame['Category'])
    for frame in (df_train, df_val)
)

In [117]:
y_val.shape

(298, 6)

In [118]:
df_train['Text']

0       tory urge change top tory delegate gathering e...
1       kennedy call iraq exit plan tony blair set pro...
2       xbox may unveiled summer detail next generatio...
3       mp tout lord replacement plan group mp tried r...
4       indecency fine viacom medium giant viacom paid...
                              ...                        
1187    jowell reject la vega jibe secretary state cul...
1188    highdefinition dvd first humble home video dvd...
1189    labour eu propaganda taxpayer subsidised propa...
1190    baghdad blogger big screen film based internet...
1191    capriati miss melbourne jennifer capriati beco...
Name: Text, Length: 1192, dtype: object

In [119]:
# load pre-trained model BERT
from transformers import AutoTokenizer, TFBertModel
# NOTE: this rebinds `tokenizer`, which until now referred to the Keras Tokenizer.
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
bert = TFBertModel.from_pretrained('bert-base-uncased')

# tokenizing
max_len = 100 # from histogram length words


def _bert_encode(texts):
    '''
    Encode a pandas Series of documents with the BERT tokenizer.

    Every document is truncated / padded to `max_len` tokens (special tokens
    included) and returned as TensorFlow tensors holding the input ids and the
    attention mask; token type ids are not requested.
    '''
    return tokenizer(
        text=texts.tolist(),
        add_special_tokens=True,
        max_length=max_len,
        truncation=True,
        padding='max_length',
        return_tensors='tf',
        return_token_type_ids=False,
        return_attention_mask=True,
        verbose=True,
    )


x_train = _bert_encode(df_train['Text'])
# Despite its name, x_test holds the encoded *validation* split (df_val).
x_test = _bert_encode(df_val['Text'])

Some layers from the model checkpoint at bert-base-uncased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [120]:
# build model fine tuning bert
# Two integer inputs of length max_len: the token ids and the attention mask.
input_ids = tf.keras.layers.Input(
    shape=(max_len,), dtype=tf.int32, name="input_ids")
input_mask = tf.keras.layers.Input(
    shape=(max_len,), dtype=tf.int32, name="attention_mask")

# Element 0 of the BERT output is the per-token hidden state (last_hidden_state).
embeddings = bert(input_ids, attention_mask=input_mask)[0]

# Classification head: max-pool over the token axis, then two dense layers.
out = tf.keras.layers.GlobalMaxPool1D()(embeddings)
out = tf.keras.layers.Dense(128, activation='relu')(out)
out = tf.keras.layers.Dropout(0.1)(out)
out = tf.keras.layers.Dense(32, activation='relu')(out)

# Size the softmax layer from the one-hot targets instead of hard-coding it, so it
# always matches the label encoding (6 columns here, because the ids start at 1).
num_classes = y_train.shape[1]
outputs = tf.keras.layers.Dense(num_classes, activation='softmax')(out)

model = tf.keras.Model(inputs=[input_ids, input_mask], outputs=outputs)

# Fine-tune the BERT weights together with the head. The layer is addressed through
# `bert` directly rather than through its position in model.layers.
bert.trainable = True

# set the decay schedule
decay_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=5e-05,
    decay_steps=10000,
    decay_rate=0.96)

# Set optimizer
optimizer = tf.keras.optimizers.Adam(
    learning_rate=decay_schedule,
    epsilon=1e-08,
    clipnorm=1.0)

# Set loss and metrics
loss = tf.keras.losses.CategoricalCrossentropy(from_logits=False)
# CategoricalAccuracy is plain accuracy on one-hot targets (it is not a
# class-balanced accuracy), so it is labelled accordingly in the training logs.
metric = tf.keras.metrics.CategoricalAccuracy('accuracy')

# Compile the model
model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=[metric])

model.summary()

Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 100)]        0           []                               
                                                                                                  
 attention_mask (InputLayer)    [(None, 100)]        0           []                               
                                                                                                  
 tf_bert_model_2 (TFBertModel)  TFBaseModelOutputWi  109482240   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'attention_mask[0][0]']         
                                tentions(last_hidde                                               
                                n_state=(None, 100,                                         

In [121]:
# Fine-tune on the encoded training split and monitor the encoded validation split
# (held in x_test / y_val) after every epoch.
history = model.fit(
    x={
        'input_ids': x_train['input_ids'],
        'attention_mask': x_train['attention_mask'],
    },
    y=y_train,
    validation_data=(
        {
            'input_ids': x_test['input_ids'],
            'attention_mask': x_test['attention_mask'],
        },
        y_val,
    ),
    batch_size=32,
    epochs=3,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [124]:
# prediction on the encoded validation split
predicted_raw = model.predict(
    {'input_ids': x_test['input_ids'], 'attention_mask': x_test['attention_mask']})

# Index of the most probable output unit; the integer category ids are used as indices.
y_predicted = np.argmax(predicted_raw, axis=1)
y_true = df_val['Category']

# Take the class names and ids from `dicto` (ordered by id) and pass the ids
# explicitly as `labels`. Without `labels`, classification_report pairs target_names
# with whatever distinct values occur in y_true / y_predicted, so a single
# prediction of the unused output index 0 would no longer match the five names.
report_names = sorted(dicto, key=dicto.get)
report_labels = [dicto[name] for name in report_names]

print(classification_report(y_true, y_predicted,
                            labels=report_labels, target_names=report_names))

               precision    recall  f1-score   support

        sport       0.97      1.00      0.99        69
     business       0.97      1.00      0.99        67
     politics       1.00      0.98      0.99        55
entertainment       1.00      1.00      1.00        55
         tech       1.00      0.94      0.97        52

     accuracy                           0.99       298
    macro avg       0.99      0.98      0.99       298
 weighted avg       0.99      0.99      0.99       298



In [125]:
# Load the test split (relative path, resolved against the current working directory).
BBC_Data_Test = pd.read_csv('BBC News Test.csv')

In [128]:
BBC_Data_Test

Unnamed: 0,ArticleId,Text
0,1018,qpr keeper day heads for preston queens park r...
1,1319,software watching while you work software that...
2,1138,d arcy injury adds to ireland woe gordon d arc...
3,459,india s reliance family feud heats up the ongo...
4,1020,boro suffer morrison injury blow middlesbrough...
...,...,...
730,1923,eu to probe alitalia state aid the european ...
731,373,u2 to play at grammy awards show irish rock ba...
732,1704,sport betting rules in spotlight a group of mp...
733,206,alfa romeos to get gm engines fiat is to sto...


In [130]:
# The model was fine-tuned on text cleaned by process_text, so the test articles are
# passed through the same function before tokenization; otherwise the model would be
# scored on raw text it never saw during training. The cleaning is applied to a
# temporary copy, so the 'Text' column of BBC_Data_Test keeps the original articles.
X_test = tokenizer(text=BBC_Data_Test['Text'].apply(process_text).tolist(),
                   add_special_tokens=True,
                   max_length=max_len,
                   truncation=True,
                   padding='max_length',
                   return_tensors='tf',
                   return_token_type_ids=False,
                   return_attention_mask=True,
                   verbose=True)

In [131]:
# Class probabilities for every test article ...
predicted_raw = model.predict({
    'input_ids': X_test['input_ids'],
    'attention_mask': X_test['attention_mask'],
})

# ... reduced to the index of the most probable class per row.
Y_predicted = predicted_raw.argmax(axis=1)



In [137]:
# Invert the name -> id mapping used to encode the training labels, so the two
# mappings cannot drift apart.
dicto_inv = {category_id: name for name, category_id in dicto.items()}

# Translate the predicted ids back to category names. An id without a name (the
# unused output index 0) becomes NaN instead of the string 'None'. to_numpy()
# assigns by position, independent of the index of BBC_Data_Test.
BBC_Data_Test['Predicted Category'] = pd.Series(Y_predicted).map(dicto_inv).to_numpy()

In [138]:
BBC_Data_Test

Unnamed: 0,ArticleId,Text,Predicted Category
0,1018,qpr keeper day heads for preston queens park r...,sport
1,1319,software watching while you work software that...,tech
2,1138,d arcy injury adds to ireland woe gordon d arc...,sport
3,459,india s reliance family feud heats up the ongo...,business
4,1020,boro suffer morrison injury blow middlesbrough...,sport
...,...,...,...
730,1923,eu to probe alitalia state aid the european ...,business
731,373,u2 to play at grammy awards show irish rock ba...,entertainment
732,1704,sport betting rules in spotlight a group of mp...,politics
733,206,alfa romeos to get gm engines fiat is to sto...,business
