# Imports

## If running on Google Colab

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from sklearn.datasets import fetch_20newsgroups

In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import unicodedata
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras import layers, Input
from tensorflow.keras.layers import LSTM, GRU

In [3]:
import re
import string
import json
import os

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [4]:
re_url = re.compile(r'(?:http|ftp|https)://(?:[\w_-]+(?:(?:\.[\w_-]+)+))(?:[\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?')
re_email = re.compile('(?:[a-z0-9!#$%&\'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&\'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9]))\.){3}(?:(2(5[0-5]|[0-4][0-9])|1[0-9][0-9]|[1-9]?[0-9])|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])')

# Getting the data

In [93]:
categories = ['soc.religion.christian', 'sci.space', 'sci.electronics', 'talk.religion.misc', 'rec.motorcycles']

In [None]:
newsgroups_train_data = fetch_20newsgroups(data_home='20_Newsgroup_Data/',
                                           subset='train', categories=categories)
newsgroups_test_data = fetch_20newsgroups(data_home='20_Newsgroup_Data/',
                                          subset='test', categories=categories)

**Train Dataset DataFrame**

In [None]:
train_data_newsgroup_df = pd.DataFrame({"Data": newsgroups_train_data['data'], "Target": newsgroups_train_data['target']})

In [None]:
train_data_newsgroup_df['Target'].value_counts()

**Test Dataset DataFrame**

In [None]:
test_data_newsgroup_df = pd.DataFrame({"Data": newsgroups_test_data['data'], "Target": newsgroups_test_data['target']})

In [None]:
test_data_newsgroup_df

In [None]:
# Codes to save the train and test dataframes.

train_data_newsgroup_df.to_csv('train_data_newsgroup_df.csv', sep = '\t')
test_data_newsgroup_df.to_csv('test_data_newsgroup_df.csv', sep = '\t')

## Load the train and test dataframes which are already saved to avoid interacting with the sklearn.dataset API

In [None]:
# Codes for loading train and test dataframes.

train_data_newsgroup_df_loaded = pd.read_csv('train_data_newsgroup_df.csv', sep = '\t')
train_data_newsgroup_df = train_data_newsgroup_df_loaded[['Data', 'Target']]

test_data_newsgroup_df_loaded = pd.read_csv('test_data_newsgroup_df.csv', sep = '\t')
test_data_newsgroup_df = test_data_newsgroup_df_loaded[['Data', 'Target']]

# Utility Functions for Preprocessing the 20 Newsgroups Dataset

In [None]:
def clean_header(text):
    text = re.sub(r'(From:\s+[^\n]+\n)', '', text)
    text = re.sub(r'(Subject:[^\n]+\n)', '', text)
    text = re.sub(r'(([\sA-Za-z0-9\-]+)?[A|a]rchive-name:[^\n]+\n)', '', text)
    text = re.sub(r'(Last-modified:[^\n]+\n)', '', text)
    text = re.sub(r'(Version:[^\n]+\n)', '', text)

    return text

def clean_text(text):        
    text = text.lower()
    text = text.strip()
    text = re.sub(re_url, '', text)
    text = re.sub(re_email, '', text)
    text = re.sub(f'[{re.escape(string.punctuation)}]', '', text)
    text = re.sub(r'(\d+)', ' ', text)
    text = re.sub(r'(\s+)', ' ', text)
    
    return text

def preprocess(text):
    text = clean_header(text)
    text = clean_text(text)
    return text

def add_start_end_tokens(string):
    return '<start> ' + string + ' <end>'

stop_words = stopwords.words('english')

# Preprocess the train and test data

In [None]:
train_data_newsgroup_df['Data Cleaned'] = train_data_newsgroup_df['Data'].apply(preprocess)
train_data_newsgroup_df['Data Cleaned'] = train_data_newsgroup_df['Data Cleaned'].str.split().apply(lambda x: ' '.join([word for word in x if word not in stop_words]))
train_data_newsgroup_df['Data Cleaned'] = train_data_newsgroup_df['Data Cleaned'].apply(add_start_end_tokens)

In [None]:
train_data_newsgroup_df

In [None]:
test_data_newsgroup_df['Data Cleaned'] = test_data_newsgroup_df['Data'].apply(preprocess)
test_data_newsgroup_df['Data Cleaned'] = test_data_newsgroup_df['Data Cleaned'].str.split().apply(lambda x: ' '.join([word for word in x if word not in stop_words]))
test_data_newsgroup_df['Data Cleaned'] = test_data_newsgroup_df['Data Cleaned'].apply(add_start_end_tokens)

In [None]:
test_data_newsgroup_df

## Codes for loading the cleaned train and test dataframes to save time IF running on Google Colab

In [None]:
# Codes for loading the CLEANED train and test dataframes.

train_data_newsgroup_df_loaded = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/DE-P05 NLP MODEL/train_data_newsgroup_df.csv', sep = '\t')
train_data_newsgroup_df = train_data_newsgroup_df_loaded[['Data', 'Target', 'Data Cleaned']]

test_data_newsgroup_df_loaded = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/DE-P05 NLP MODEL/test_data_newsgroup_df.csv', sep = '\t')
test_data_newsgroup_df = test_data_newsgroup_df_loaded[['Data', 'Target', 'Data Cleaned']]

## Codes for loading the cleaned train and test dataframes to save time IF running on Jupyter Notebook

In [5]:
train_data_newsgroup_df_loaded = pd.read_csv('train_data_newsgroup_df.csv', sep = '\t')
train_data_newsgroup_df = train_data_newsgroup_df_loaded[['Data', 'Target', 'Data Cleaned']]

test_data_newsgroup_df_loaded = pd.read_csv('test_data_newsgroup_df.csv', sep = '\t')
test_data_newsgroup_df = test_data_newsgroup_df_loaded[['Data', 'Target', 'Data Cleaned']]

**Start setting up the datasets, `X_train`, `y_train`, `X_test` and `y_test`.**

In [6]:
X_train, y_train = train_data_newsgroup_df['Data Cleaned'].to_numpy(), train_data_newsgroup_df['Target'].to_numpy()

In [7]:
X_test, y_test = test_data_newsgroup_df['Data Cleaned'].to_numpy(), test_data_newsgroup_df['Target'].to_numpy()

In [8]:
tokenizer = Tokenizer(lower = False,filters= '', oov_token = '<UKW>')

In [9]:
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

In [81]:
X_train

array([[   0,    0,    0, ...,  673, 1723,    4],
       [   0,    0,    0, ..., 1027,  121,    4],
       [   0,    0,    0, ...,   43,  178,    4],
       ...,
       [   0,    0,    0, ..., 5319,   64,    4],
       [   0,    0,    0, ..., 2227, 5655,    4],
       [   0,    0,    0, ..., 3281, 9259,    4]])

## Store the tokenizer's configuration as a .json file, so as to reload it when preprocessing incoming tweets.

In [39]:
tokenizer_config_string = tokenizer.to_json()

In [40]:
tokenizer_config_filename = os.path.join('.','tokenizer_config.json')

with open(tokenizer_config_filename, 'w') as file:
    json.dump(tokenizer_config_string, file)

## Get the size of the vocabulary in the 20 Newsgroup dataset and the maximum length of all sentences.

In [55]:
vocab_size = len(tokenizer.word_index)
vocab_size

34073

In [56]:
max_sentence_length = -1

for idxx, i in enumerate(X_train):
    if len(i) > max_sentence_length:
        max_sentence_length = len(i)
        idx = idxx
        
max_sentence_length, idx

(6279, 2649)

## Store `vocab_size` and `max_sentence_length` in a .json file

In [57]:
rnn_model_params = {
    "vocab_size": vocab_size,
    "max_sentence_length": max_sentence_length
}

In [58]:
model_params_filename = os.path.join('.','model_params.json')

with open(model_params_filename, 'w') as file:
    json.dump(rnn_model_params, file)

## Pad the sentences to its maximum length with zeros

In [67]:
X_train = tf.keras.preprocessing.sequence.pad_sequences(X_train, maxlen=max_sentence_length, padding='pre', truncating = 'pre')

In [17]:
X_test = tf.keras.preprocessing.sequence.pad_sequences(X_test, maxlen=max_sentence_length, padding='pre', truncating = 'pre')

In [73]:
X_train[1,-100:]

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     3,
           5,   937,   172,   484,     2,     9,  1026,  5486,     8,
        1964,   472,  1756,  4322,    42,   183,  4088,   101,  3400,
        1048,  4573,  3561,  3144, 14279,  9733,  1048, 14280,  5905,
       11551,   440,  3259,  2454,    16,  3260,  3562,  3145, 11552,
        3145,    32,  1049,  1347,  1280,  4574,   937,  1027,   121,
           4])

## Make a tensorflow.data.Dataset object out of the numpy arrays of data

In [18]:
train_dataset = tf.data.Dataset.from_tensor_slices((X_train,y_train))
test_dataset = tf.data.Dataset.from_tensor_slices((X_test,y_test))

In [19]:
batch_size = 16

train_dataset = train_dataset.batch(16)
test_dataset = test_dataset.batch(16)

## Make the RNN model

In [20]:
def get_model(vocab_size):
    """
    This function takes a vocabulary size and batch size, and builds and returns a 
    Sequential model according to the above specification.
    """
    
    model = tf.keras.Sequential([
        tf.keras.layers.Embedding(input_dim = vocab_size, output_dim = 256, mask_zero = True),
        tf.keras.layers.GRU(units = 256),
        tf.keras.layers.Dense(5)
    ])
    
    return model

rnn_model = get_model(vocab_size = vocab_size+1)

rnn_model.compile(optimizer='adam', loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['sparse_categorical_accuracy'])

rnn_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 256)         8722944   
_________________________________________________________________
gru (GRU)                    (None, 256)               394752    
_________________________________________________________________
dense (Dense)                (None, 5)                 1285      
Total params: 9,118,981
Trainable params: 9,118,981
Non-trainable params: 0
_________________________________________________________________


## Load the RNN Model if running on Google Colab

In [None]:
load_model_flag = False # A flag indicating if one should proceed with training, False => DO NOT PROCEED with training

In [18]:
if load_model_flag: # This prevents the training from running accidentally.
    rnn_model = tf.keras.models.load_model('/content/drive/MyDrive/Colab Notebooks/DE-P05 NLP MODEL/rnn_model_weights.h5')

## Load the RNN Model if running on Jupyter Notebook

In [49]:
load_model_flag = True

In [50]:
if load_model_flag:
    rnn_model = tf.keras.models.load_model('models/rnn_model_weights.h5')

## Train or re-train the RNN model if need be

In [None]:
train_model_flag = False # A flag indicating if one should proceed with training, False => DO NOT PROCEED with training

In [19]:
if train_model_flag: # This prevents the training from running accidentally.
    history = rnn_model.fit(train_dataset, epochs = 2, use_multiprocessing=True)

Epoch 1/2
Epoch 2/2


Last training done on 17 April 2021 achieved such a training accuracy.

```
Epoch 1/2
173/173 [==============================] - 2441s 14s/step - loss: 0.0741 - sparse_categorical_accuracy: 0.9775
Epoch 2/2
173/173 [==============================] - 2417s 14s/step - loss: 0.0238 - sparse_categorical_accuracy: 0.9949
```

## SAVE the RNN Model if running on Google Colab

In [None]:
save_flag = False

In [21]:
if save_flag:
    rnn_model.save('/content/drive/MyDrive/Colab Notebooks/DE-P05 NLP MODEL/rnn_model_weights.h5')

## SAVE the RNN Model if running on Jupyter Notebook

In [None]:
save_flag = False

In [None]:
if save_flag:
    rnn_model.save('models/rnn_model_weights.h5')

## Evaluate the RNN Model on test dataset

In [8]:
rnn_model.evaluate(test_dataset)

NameError: name 'test_dataset' is not defined

On 17 April 2021, model was evaluated, results as follow:

```
115/115 [==============================] - 109s 939ms/step - loss: 0.6161 - sparse_categorical_accuracy: 0.8244
Loss, Accuracy = [0.6161214709281921, 0.8244274854660034]
```

# Apply model on Tweets

## Preprocess the tweets

Load the tweets dataframe first.

In [21]:
train_tweets_df=pd.read_csv('train_tweets.csv')
train_tweets_df.head()

Unnamed: 0,id,label,tweet
0,1,0,@user when a father is dysfunctional and is s...
1,2,0,@user @user thanks for #lyft credit i can't us...
2,3,0,bihday your majesty
3,4,0,#model i love u take with u all the time in ...
4,5,0,factsguide: society now #motivation


Make a generator to yield a random tweet.

In [100]:
def generator_random_tweet(train_tweets_df):
    random_int = np.random.randint(len(train_tweets_df))
    random_single_tweet = train_tweets_df['tweet'][random_int]
    yield random_single_tweet

In [22]:
random_int = np.random.randint(len(train_tweets_df))

random_single_tweet = train_tweets_df['tweet'][random_int]

In [29]:
random_single_tweet, random_int

('@user is this for real? #waspi 50sborn in abject povey but we are all in it together! @user  ',
 21960)

## Utility functions for preprocessing the tweet

In [26]:
wnl = WordNetLemmatizer()

### strip_linkes and strip_all_entities from
### https://stackoverflow.com/questions/8376691/how-to-remove-hashtag-user-link-of-a-tweet-using-regular-expression
def strip_links(text):
    link_regex    = re.compile('((https?):((//)|(\\\\))+([\w\d:#@%/;$()~_?\+-=\\\.&](#!)?)*)', re.DOTALL)
    links         = re.findall(link_regex, text)
    for link in links:
        text = text.replace(link[0], ', ')    
    return text

def remove_stopwords_and_lemmatize(text):
    stopword = stopwords.words('english')
    text_splitted = text.split()    
    text = " ".join([wnl.lemmatize(word) for word in text_splitted if word not in stopword])
    return text

def preprocess_single_tweet(single_tweet):
    """single_tweet is a string, a tweet, output is another string which is processed."""
    
    single_tweet = remove_stopwords_and_lemmatize(strip_links(single_tweet))
    single_tweet = (lambda single_twt: re.sub(r'[^a-zA-Z]', ' ', single_twt))(single_tweet)
    single_tweet = (lambda x: re.sub('  ', ' ', x))(single_tweet)
    
    return single_tweet

In [74]:
example_tweet = preprocess_single_tweet(random_single_tweet)
print(f"Before processed: {random_single_tweet}\nAfter processed: {example_tweet}")

Before processed: @user is this for real? #waspi 50sborn in abject povey but we are all in it together! @user  
After processed:  user real  waspi  sborn abject povey together  user


## Load the tokenizer's configuration and tokenize the tweet

In [30]:
tokenizer_config_filename = os.path.join('.','tokenizer_config.json')

In [41]:
with open(tokenizer_config_filename) as file:
    # Load its content and make a new json string
    tokenizer_config_string = json.load(file)

In [43]:
tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(tokenizer_config_string)

In [87]:
tokenized_tweet = tokenizer.texts_to_sequences([example_tweet])

In [88]:
tokenized_tweet

[[1499, 120, 1, 1, 1, 1, 490, 1499]]

## Get the model params and the model, then pad the sequence

In [None]:
model_params_filename = os.path.join('.','model_params.json')

In [59]:
with open(model_params_filename) as file:
    # Load its content and make a new json string
    model_params = json.load(file)

In [61]:
vocab_size = model_params['vocab_size']
max_sentence_length = model_params['max_sentence_length']

In [89]:
tokenized_tweet = tf.keras.preprocessing.sequence.pad_sequences(tokenized_tweet, maxlen=max_sentence_length, padding='pre', truncating = 'pre')
tokenized_tweet

array([[   0,    0,    0, ...,    1,  490, 1499]])

## Predict the category of the tweet

In [90]:
prediction = rnn_model.predict(tokenized_tweet)

In [98]:
category = categories[np.squeeze(np.argmax(prediction, axis = -1))]

In [99]:
category

'soc.religion.christian'

## Wrap the entire process of prediction on a tweet emitted from Kakfa's Consumer

In [4]:
load_model_flag = True

### Get the tokenizer's and model's parameters ###
tokenizer_config_filename = os.path.join('.','tokenizer_config.json')
model_params_filename = os.path.join('.','model_params.json')

with open(tokenizer_config_filename) as file:
    # Load its content and make a new json string
    tokenizer_config_string = json.load(file)

### Get the tokenizer ###
tokenizer = tf.keras.preprocessing.text.tokenizer_from_json(tokenizer_config_string)

with open(model_params_filename) as file:
    # Load its content and make a new json string
    model_params = json.load(file)
    
vocab_size = model_params['vocab_size']
max_sentence_length = model_params['max_sentence_length']

if load_model_flag:
    rnn_model = tf.keras.models.load_model('models/rnn_model_weights.h5')
    
# Get to be predicted tweets from Kafka, then predict them one by one
for _ in range(50):
    random_single_tweet = next(generator_random_tweet(train_tweets_df)) # Simulating Kafka emitting the tweets
    example_tweet = preprocess_single_tweet(random_single_tweet)
    tokenized_tweet = tokenizer.texts_to_sequences([example_tweet])
    tokenized_tweet = tf.keras.preprocessing.sequence.pad_sequences(tokenized_tweet, maxlen=max_sentence_length, padding='pre', truncating = 'pre')
    prediction = rnn_model.predict(tokenized_tweet)
    category = categories[np.squeeze(np.argmax(prediction, axis = -1))]
    
    print(f"""
    The original tweet is: {random_single_tweet}.\n
    Predicted Category is: {category}.
    """)

NameError: name 'generator_random_tweet' is not defined