# -*- coding: utf-8 -*-
"""NewsClassification_NLP_LSTM.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1JYwhR7yoSfAef5Tek_SI6nrZxaQauE-9
# **First Project: Developing NLP Model Using TensorFlow**
**Name: Nicko Arya Dharma**
**Email: nicko.arya.dharma@gmail.com**
**DicodingID: nickoaryad**
## **1 <font color='yellow'>**|**</font> About the Dataset**
Context:
A news article dataset originating from BBC News, provided as a benchmark for machine learning research. The original data was processed into a single CSV file for ease of use; the news title and the related text file name are preserved along with the news content and its category. The dataset is made available for non-commercial and research purposes only.
All rights, including copyright, in the content of the original articles are owned by the BBC.
Content:
2225 documents from the BBC news website, corresponding to stories in five topical areas from 2004-2005.
Class Labels:
5 (business, entertainment, politics, sport, tech)
Acknowledgements:
The original source of the data may be accessed via the Kaggle link below, and the associated research paper is worth reading.
Associated Official Research Paper:
D. Greene and P. Cunningham. "Practical Solutions to the Problem of Diagonal Dominance in Kernel Document Clustering", Proc. ICML 2006.
Source:
https://www.kaggle.com/datasets/hgultekin/bbcnewsarchive/data
## **2 <font color='yellow'>**|**</font> Importing Libraries**
"""
!pip install wordcloud

import re, string
import zipfile

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.layers import Embedding, Dense, Dropout, LSTM
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.utils import plot_model

from wordcloud import WordCloud

import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
"""## **3 <font color='yellow'>**|**</font> Preparing the Dataset**
#### **3.1 <font color='yellow'>**|**</font> Extracting the Dataset**
"""
local_zip = '/bbc-news-data.csv.zip'
with zipfile.ZipFile(local_zip, 'r') as zip_ref:
    zip_ref.extractall('/tmp')
"""#### **3.2 <font color='yellow'>**|**</font> Reading the Dataset**"""
data = pd.read_csv('/tmp/bbc-news-data.csv', on_bad_lines='skip', sep='\t')
data
data.info()
data.category.value_counts()
data["category"].hist()
heading_sport = data[data["category"]=="sport"]["title"]
collapsed_heading_sport = heading_sport.str.cat(sep=' ')
heading_business = data[data["category"]=="business"]["title"]
collapsed_heading_business = heading_business.str.cat(sep=' ')
heading_politics = data[data["category"]=="politics"]["title"]
collapsed_heading_politics = heading_politics.str.cat(sep=' ')
heading_tech = data[data["category"]=="tech"]["title"]
collapsed_heading_tech = heading_tech.str.cat(sep=' ')
heading_entertainment = data[data["category"]=="entertainment"]["title"]
collapsed_heading_entertainment = heading_entertainment.str.cat(sep=' ')
print("Word Cloud for SPORT")
wordcloud = WordCloud(background_color = "black",max_words = 50).generate(collapsed_heading_sport)
plt.imshow(wordcloud,interpolation='bilinear')
plt.axis('off')
plt.show()
print("Word Cloud for BUSINESS")
wordcloud = WordCloud(background_color = "black",max_words = 50).generate(collapsed_heading_business)
plt.imshow(wordcloud,interpolation='bilinear')
plt.axis('off')
plt.show()
print("Word Cloud for POLITICS")
wordcloud = WordCloud(background_color = "black",max_words = 50).generate(collapsed_heading_politics)
plt.imshow(wordcloud,interpolation='bilinear')
plt.axis('off')
plt.show()
print("Word Cloud for TECH")
wordcloud = WordCloud(background_color = "black",max_words = 50).generate(collapsed_heading_tech)
plt.imshow(wordcloud,interpolation='bilinear')
plt.axis('off')
plt.show()
print("Word Cloud for ENTERTAINMENT")
wordcloud = WordCloud(background_color = "black",max_words = 50).generate(collapsed_heading_entertainment)
plt.imshow(wordcloud,interpolation='bilinear')
plt.axis('off')
plt.show()
"""#### **3.3 <font color='yellow'>**|**</font> Wrangling the Dataset**"""
# Removing unnecessary column
data = data.drop(columns=['filename'])
# Combining title and content columns
data['text'] = data['title'] + " " + data['content']
# Renaming columns
data.columns = ['Category', 'Title', 'Content', 'Text']
# Removing title and content columns
data = data.drop(columns=['Title', 'Content'])
# Reindexing columns
data = data.reindex(columns=['Text', 'Category'])
data
# Inspecting for missing values (isnull is an alias of isna, so one check suffices)
data.isna().sum()
"""## **4 <font color='yellow'>**|**</font> Preprocessing Text**"""
# Removing unnecessary whitespace, punctuation, and characters
def preprocess(text):
    text = text.lower()                                                # convert to lowercase
    text = text.strip()                                                # remove leading/trailing whitespace
    text = re.sub(r'<.*?>', '', text)                                  # remove HTML tags
    text = re.sub(r'\[[0-9]*\]', ' ', text)                            # remove bracketed numbers such as [12]
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)   # replace punctuation with spaces
    text = re.sub(r'[^\w\s]', '', text)                                # drop any remaining non-alphanumeric characters
    text = re.sub(r'\d', ' ', text)                                    # remove digits
    text = re.sub(r'\s+', ' ', text)                                   # collapse runs of whitespace into single spaces
    return text
# Removing stopwords (building the set once instead of per word)
stop_words = set(stopwords.words('english'))
def stopword(text):
    return ' '.join(word for word in text.split() if word not in stop_words)
# Initializing the stemmer (kept for comparison; the final pipeline below uses lemmatization instead)
snow = SnowballStemmer('english')
def stemming(text):
    return " ".join(snow.stem(word) for word in word_tokenize(text))
# Initializing the lemmatizer
wl = WordNetLemmatizer()

# Mapping Penn Treebank POS tags to WordNet POS constants
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
# Tokenizing the sentence and lemmatizing each token according to its POS tag
def lemmatizer(text):
    word_pos_tags = nltk.pos_tag(word_tokenize(text))
    return " ".join(wl.lemmatize(word, get_wordnet_pos(tag)) for word, tag in word_pos_tags)
# Executing the preprocessing steps in sequence: clean, remove stopwords, lemmatize
def finalpreprocess(text):
    return lemmatizer(stopword(preprocess(text)))
data['Text'] = data['Text'].apply(finalpreprocess)
data.head()
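# A quick sanity check of the pipeline on a single string (a minimal sketch;
# the sentence below is a made-up example, not taken from the dataset)
sample = "The BBC reported 3 new <b>technology</b> stories, and markets reacted quickly!"
print(finalpreprocess(sample))
# Expected style of output: lowercase, no digits/punctuation/stopwords, lemmatized
# tokens, e.g. roughly "bbc report new technology story market react quickly"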
"""## **5 <font color='yellow'>**|**</font> Encoding Dummy Variable**"""
category = pd.get_dummies(data.Category)
data = pd.concat([data, category], axis=1)
data = data.drop(columns=['Category'])
data
data.info()
"""## **6 <font color='yellow'>**|**</font> Splitting the Dataset**"""
X = data['Text'].values
y = data[['sport', 'business', 'politics', 'tech', 'entertainment']].values
X
y
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=43)
print('Train Set: ', X_train.shape, y_train.shape)
print('Test Set: ', X_test.shape, y_test.shape)
"""## **7 <font color='yellow'>**|**</font> Developing Model**
#### **7.1 <font color='yellow'>**|**</font> Tokenizing**
"""
tokenizer = Tokenizer(num_words=10000, oov_token="<oov>")
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index
seq_train = tokenizer.texts_to_sequences(X_train)
seq_test = tokenizer.texts_to_sequences(X_test)
pad_train = pad_sequences(seq_train, maxlen=200, truncating="post")
pad_test = pad_sequences(seq_test, maxlen=200, truncating="post")
print("Padded Train = ")
print(pad_train.shape)
print("Padded Test = ")
print(pad_test.shape)
pad_train
pad_test
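# Sanity-checking the tokenization (a minimal sketch): decode the first padded
# training row back into words via the tokenizer's index_word lookup; index 0 is
# the padding value and has no dictionary entry, so it is skipped
decoded = ' '.join(tokenizer.index_word.get(idx, '?') for idx in pad_train[0] if idx != 0)
print(decoded)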
"""#### **7.2 <font color='yellow'>**|**</font> Sequential Modelling using Embedding and LSTM**"""
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=10000, output_dim=32, input_length=200, mask_zero=True),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(5, activation='softmax')
])
plot_model(model, show_shapes = True)
model.compile(optimizer="adam",
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()
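# Worked arithmetic for the parameter counts that model.summary() should report,
# given the layer sizes above (an LSTM has 4 gates, each with input weights,
# recurrent weights, and a bias)
embedding_params = 10000 * 32              # 320,000
lstm_params = 4 * ((32 + 64 + 1) * 64)     # 24,832
dense1_params = (64 + 1) * 128             # 8,320
dense2_params = (128 + 1) * 5              # 645
print(embedding_params + lstm_params + dense1_params + dense2_params)  # 353,797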
"""#### **7.3 <font color='yellow'>**|**</font> Defining Callbacks to Control Epochs**"""
# Defining a callback to stop training once both accuracies exceed 90%
class myCallback(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        if logs.get('accuracy', 0) > 0.90 and logs.get('val_accuracy', 0) > 0.90:
            self.model.stop_training = True
            print("\nAccuracy of the training set and the validation set have reached > 90%!")
callbacks = myCallback()
# Improving validation accuracy by decreasing the learning rate when it plateaus
auto_reduction_LR = ReduceLROnPlateau(
    monitor='val_accuracy',
    patience=2,        # epochs to wait before decreasing the learning rate
    verbose=1,
    factor=0.1,        # factor for decreasing the learning rate
    min_lr=1.5e-5      # minimum learning rate
)
early_stop = EarlyStopping(
    monitor="val_loss",
    min_delta=0,
    patience=12,
    verbose=0,
    mode="auto",
    baseline=None,
    restore_best_weights=True
)
"""#### **7.4 <font color='yellow'>**|**</font> Training the Model Using Fit Function**"""
num_epochs = 70
history = model.fit(pad_train, y_train,
                    epochs=num_epochs,
                    validation_data=(pad_test, y_test),
                    verbose=2,
                    callbacks=[callbacks, auto_reduction_LR, early_stop])
"""#### **7.5 <font color='yellow'>**|**</font> Saving the Model for Deployment**"""
model.save_weights('model_weights.h5')
model.save('model.h5')
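# Minimal inference sketch: classifying a new, hypothetical headline (the label
# order must match the column order used to build y in section 6 above)
labels = ['sport', 'business', 'politics', 'tech', 'entertainment']
new_text = "The champions won the final match after a dramatic penalty shootout"
seq = tokenizer.texts_to_sequences([finalpreprocess(new_text)])
pad = pad_sequences(seq, maxlen=200, truncating="post")
pred = model.predict(pad)
print(labels[np.argmax(pred)])  # expected to lean towards 'sport' for this example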
"""## **8 <font color='yellow'>**|**</font> Plotting**
#### **8.1 <font color='yellow'>**|**</font> Loss of Training and Validation**
"""
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epochs')
plt.legend(['Training Loss', 'Validation Loss'], loc = 'upper right')
plt.show()
"""#### **8.2 <font color='yellow'>**|**</font> Accuracy of Training and Validation**"""
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epochs')
plt.legend(['Training Accuracy', 'Validation Accuracy'], loc='lower right')
plt.show()
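# Held-out evaluation beyond the training curves (a minimal sketch;
# classification_report and confusion_matrix are standard sklearn.metrics
# utilities added here, not part of the original pipeline)
from sklearn.metrics import classification_report, confusion_matrix

y_pred = np.argmax(model.predict(pad_test), axis=1)
y_true = np.argmax(y_test, axis=1)
print(classification_report(y_true, y_pred, target_names=labels))  # labels as defined in the inference sketch above
print(confusion_matrix(y_true, y_pred))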