# Introduction


This notebook is about predicting the next word of a sentence. Mathematically, this can be expressed as

$P(w_{i+1} \mid w_i, w_{i-1}, w_{i-2}, \dots)$ => predicting word $i+1$ based on the preceding words $i, i-1, i-2, \dots$

Real-life examples:
- In the Google search bar, typing a few words brings up suggested completions.
- In WhatsApp and other messaging apps, the keyboard suggests the next word while you are typing.
- Gmail's word completion while writing an email.

# Pipeline


# TODO

In [63]:
# basic packages.
from typing import Final
import re
import numpy as np
import pandas as pd 
import kaggle_utils as utils 

# visualization packages.
import plotly.graph_objects as go

# NLP framework.
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from tensorflow import keras
import gensim

# model preprocessing and metrics.
from sklearn.model_selection import train_test_split

from tensorflow.keras.callbacks import EarlyStopping

In [64]:

# Get the data
file_path: str = "data/medium_data.csv"
data = pd.read_csv(file_path)

# We are just gonna consider the title of the articles.
# .copy() gives an independent frame: the preprocessing step assigns back into
# this column, and doing that on a selection of another frame makes pandas
# emit SettingWithCopyWarning.
data = data[["title"]].copy()

# Basic data common analysis (prints a summary of the column types and missing values).
column, categorical_data, numerical_data, missing_data = utils.Common_data_analysis(data)

# Last expression of the cell -- this is what the notebook displays.
data


Numerical data list [] ---> total 0 numerical values
Categorical data list ['title'] ---> total 1 categorical values



Unnamed: 0,data type,Missing Value(NA),?[]na null ' ',% of Missing value(NA),% of Missing value(?[]na null ' ')
title,object,0,0,0.0,0.0


Unnamed: 0,title
0,A Beginner’s Guide to Word Embedding with Gens...
1,Hands-on Graph Neural Networks with PyTorch & ...
2,How to Use ggplot2 in Python
3,Databricks: How to Save Files in CSV on Your L...
4,A Step-by-Step Implementation of Gradient Desc...
...,...
6503,“We” vs “I” — How Should You Talk About Yourse...
6504,How Donald Trump Markets Himself
6505,Content and Marketing Beyond Mass Consumption
6506,5 Questions All Copywriters Should Ask Clients...


# Preprocessing

In [65]:
# preprocessing 


# Can I remove punctuation? Yes, I have to.
# Can I remove links? Yes, they are not going to give any useful information here.
# Can I remove numbers? Yes, because "2 people died through the disaster" and "people died through the disaster" both give the same meaning.
# Can I remove stop words? Yes, we can.


class PreprocessData:
    """
    Clean and tokenize one text column of a dataframe.

    The pipeline (see :meth:`run`) removes links, strips digits / punctuation /
    symbols, lower-cases the text, tokenizes it with NLTK and optionally
    removes stop words and lemmatizes the tokens.

    NOTE: the column is rewritten in place on the dataframe that was passed to
    the constructor; ``run`` returns that same dataframe.
    """

    # Digits, ASCII punctuation and the typographic quotes / em dash that show
    # up in the article titles. Every match is replaced by a single space.
    _SYMBOLS_PATTERN = re.compile(r"[0123456789!\"\'\’\“\”\—#$%&()*+,-./:;<=>?@[\]^_`{|}~]")

    # Anything of the form "<scheme>://<non-whitespace>".
    _LINK_PATTERN = re.compile(r"\w+://\S+")

    def __init__(self, data: pd.DataFrame, column_name: str) -> None:
        """ 
        Initialize the dataframe.

        :param data: The data dataframe to preprocess.
        :param column_name: Which column we are doing the preprocessing on.
        """
        self.data: pd.DataFrame = data
        self.column_name: str = column_name
        # Where ``run`` pickles the preprocessed dataframe.
        self.file_name: str = "pre-processed-data.pkl"

    def _remove_symbols(self) -> None:
        """ 
        Replace every digit, punctuation mark and listed symbol with a space.
        """
        self.data[self.column_name] = self.data[self.column_name].apply(
            lambda sentence: self._SYMBOLS_PATTERN.sub(" ", sentence)
        )

    def _remove_links(self) -> None:
        """ 
        Replace every link (``scheme://...``) with a space.

        Must run before :meth:`_remove_symbols`: that step strips ``:``, ``/``
        and ``.``, after which a link can no longer be recognised.
        """
        self.data[self.column_name] = self.data[self.column_name].apply(
            lambda sentence: self._LINK_PATTERN.sub(" ", sentence)
        )

    def _case_fold_to_lower_case(self) -> None:
        """ 
        Change every sentence to lower case.
        """
        self.data[self.column_name] = self.data[self.column_name].str.lower()

    def _fix_typo(self) -> None:
        """ 
        Placeholder for typo correction -- currently a no-op.

        Real-world text (tweets, mails, articles, ...) is written by humans and
        contains typos; fixing them should give better results.
        """
        # TODO: not implemented yet, the data is left untouched.
        pass

    def _tokenization(self) -> None:
        """ 
        Tokenization -- split each sentence into a list of words.
        """
        self.data[self.column_name] = self.data[self.column_name].apply(nltk.word_tokenize)

    def _remove_stop_words(self) -> None:
        """ 
        Remove the English stop words from each token list.
        """
        # Load the stop words once (not once per row) and keep them in a set so
        # each membership test is O(1).
        stop_words = frozenset(stopwords.words("english"))

        self.data[self.column_name] = self.data[self.column_name].apply(
            lambda words: [word for word in words if word not in stop_words]
        )

    def _lemmatization(self) -> None:
        """ 
        Lemmatization -- convert every token into its root form.
        """
        # One lemmatizer is shared by all rows instead of building one per row.
        lemmatizer = WordNetLemmatizer()

        self.data[self.column_name] = self.data[self.column_name].apply(
            lambda words: [lemmatizer.lemmatize(token) for token in words]
        )

    def run(self, save_preprocessed_dataframe: bool = True,
            verbose: bool = True,
            fix_typo: bool = True,
            lemmatize: bool = True,
            remove_stop_words: bool = True,
            ) -> pd.DataFrame:
        """ 
        Actually run the preprocessing pipeline.
        Steps which are involved in it.
        * 1 - Remove links (done first, while "://" is still intact).
        * 2 - Remove symbols (including numbers, symbols, special characters)
        * 3 - Case fold -- change all to lower case.
        * 4 - Fix typos.
        * 5 - tokenization.
        * 6 - Remove stop words.
        * 7 - lemmatization.

        :param save_preprocessed_dataframe: Pickle the result to ``self.file_name``.
        :param verbose: Print a progress line for every step.
        :param fix_typo: Run the typo-fixing step (currently a no-op).
        :param lemmatize: Run the lemmatization step.
        :param remove_stop_words: Run the stop-word removal step.
        :returns: The preprocessed dataframe; the column holds lists of tokens.
        """
        star_print_length: int = 70

        def log(message: str = "") -> None:
            # Single switch for all progress output.
            if verbose:
                print(message)

        log("*" * star_print_length)
        log("Preprocessing data...")
        log("*" * star_print_length)
        log("")

        # 1 | Remove links.
        log("\tRemoving links...")
        self._remove_links()

        # 2 | Remove symbols (including numbers, symbols, special characters).
        log("\tRemoving numbers, punctuations and special characters...")
        self._remove_symbols()

        # 3 | Case fold -- change all to lower case.
        self._case_fold_to_lower_case()

        # 4 | Fix typos.
        if fix_typo:
            log("\tFixing typos...")
            self._fix_typo()

        # 5 | tokenization.
        log("\tTokenization -- Splitting the sentence into tokens...")
        self._tokenization()

        # 6 | Remove stop words.
        if remove_stop_words:
            log("\tRemoving stop words...")
            self._remove_stop_words()

        # 7 | lemmatization.
        if lemmatize:
            log("\tLemmatization...")
            self._lemmatization()

        # save the dataframe, so that we can skip the preprocessing next time.
        if save_preprocessed_dataframe:
            log("\tSaving the dataframe for future use...")
            # Pickle the instance's own dataframe, not a module-level global.
            self.data.to_pickle(self.file_name)
            # to load -- pd.read_pickle(file_name)

        log()
        log("*" * star_print_length)
        log("Preprocessing is done successfully.")
        log("*" * star_print_length)

        return self.data


# can be done easily using spacy -- but less control over it
# def spacy_tokenizer(doc):
#   return [t.lemma_.lower() for t in nlp(doc) if \
#           len(t) > 2 and \
#           not t.is_punct and \
#           not t.is_space and \
#           not t.is_stop and \
#           t.is_alpha]
# tokenizer = keras.preprocessing.text.Tokenizer(filters="", lower=False, split='|', oov_token='OOV')

# # Import the NLP module which contains Spark NLP and NLU libraries
# from johnsnowlabs import nlp
# spark = nlp.start(nlp=False)

# # Use Norvig model
# nlp.load("en.spell.norvig").predict("Plaese alliow me tao introdduce myhelf, I am a man of wealth und tiaste")



# 2 | word to vectors

In [66]:
# From the source code:

# - **fit_on_texts** Updates internal vocabulary based on a list of texts. This method creates the vocabulary index based on word frequency.
#  So if you give it something like, "The cat sat on the mat." It will create a dictionary s.t. word_index["the"] = 1;
# word_index["cat"] = 2 it is word -> index dictionary so every word gets a unique integer value. 0 is reserved for padding.
# So lower integer means more frequent word (often the first few are stop words because they appear a lot).


# - **texts_to_sequences** Transforms each text in texts to a sequence of integers. So it basically takes each word in the text and 
# replaces it with its corresponding integer value from the word_index dictionary. Nothing more, nothing less, certainly no magic involved.


# Why not combine them? Because you almost always fit once and convert to sequences many times.
# You will fit on your training corpus once and use that exact same word_index dictionary at train / eval / testing / prediction
# time to convert actual text into sequences to feed them to the network. So it makes sense to keep those methods separate.


class WordToVectors:
    """
    Turn tokenized sentences into model inputs for next-word prediction.

    The pipeline (see :meth:`run`) maps tokens to integer ids with a Keras
    ``Tokenizer``, expands every sentence into (prefix, next word) pairs,
    pads the prefixes to a fixed length and builds an embedding matrix from a
    pretrained word2vec model.
    """

    # Default location of the pretrained Google News word2vec binary.
    DEFAULT_WORD2VEC_PATH = '/home/pavithra/projects/NLP/GoogleNews-vectors-negative300.bin'

    def __init__(self, data: pd.DataFrame, column_name: str,
                 word2vec_path: str = DEFAULT_WORD2VEC_PATH) -> None:
        """ 
        Initialize the variables.

        :param data: The dataframe holding the token lists.
        :param column_name: Which column holds the tokens.
        :param word2vec_path: Path of the binary word2vec file used to build
            the embedding matrix.
        """
        self.data = data
        self.column_name = column_name
        self.word2vec_path: str = word2vec_path
        self.tokenizer = None
        # Kept for backward compatibility; not used by the pipeline itself.
        self.df = pd.DataFrame()
        # Integer sequences: one per sentence at first, one per n-gram prefix
        # after `_generate_n_grams`, a padded 2-D array after `_sequence_padding`.
        self.token_matrix = None

        # Store the word-index for future reference.
        self.word_index = None
        self.embedding_matrix: np.ndarray = None

        # Information about the embedding matrix (filled by `_create_embedding_matrix`).
        self.num_tokens: int = 0
        self.embedding_matrix_num_features = 0

    def _tokens_to_vectors(self) -> None:
        """ 
        Fit the tokenizer on the column and convert every sentence to integer ids.
        """
        # oov_token='<OOV>' replaces all out-of-vocabulary words with <OOV>.
        if self.tokenizer is None:
            self.tokenizer = keras.preprocessing.text.Tokenizer(lower=True, oov_token="<OOV>")

        sentences = self.data[self.column_name]

        # Build the vocabulary, then replace each word by its index in it.
        self.tokenizer.fit_on_texts(sentences)
        self.token_matrix = self.tokenizer.texts_to_sequences(sentences)

        self.word_index = self.tokenizer.word_index

    def _sequence_padding(self, max_padding_length: int) -> None:
        """ 
        Pad the sequences, because the sentences do not all have the same length.
        Every sequence is brought to ``max_padding_length``:
        if the sequence length < max_padding_length --> fill the remaining places with 0.
        if the sequence length > max_padding_length --> trim the sequence.

        ``pad_sequences`` is called with its default padding / truncating
        modes (per the Keras documentation both act on the front of the sequence).

        :param max_padding_length: Length of every sequence after padding.
        """
        self.token_matrix = keras.preprocessing.sequence.pad_sequences(self.token_matrix, 
                                                                       maxlen=max_padding_length)
    
    def _generate_n_grams(self) -> pd.DataFrame:
        """ 
        Expand every sequence into (prefix, next token) training pairs.

        For the sequence [1, 2, 3, 4] -- ["My", "name", "is", "pavithra"] --
        the generated pairs are:

                prefix [1]        --> target 2
                prefix [1, 2]     --> target 3
                prefix [1, 2, 3]  --> target 4

        ``self.token_matrix`` is replaced by the list of prefixes. Sequences
        with fewer than two tokens produce no pair.

        :returns: A dataframe with one ``target`` column, aligned row by row
            with the prefixes now stored in ``self.token_matrix``.
        """
        prefixes: list = []
        targets: list = []

        for row in self.token_matrix:
            # `end` is the position of the token to predict; everything before it is the input.
            for end in range(1, len(row)):
                prefixes.append(row[:end])
                targets.append(row[end])

        self.token_matrix = prefixes

        print(f"\tNumber of total sequences --> {len(self.token_matrix)}")
        
        return pd.DataFrame({"target": targets})
    
    def _create_embedding_matrix(self) -> None:
        """ 
        Build the embedding matrix from the pretrained word2vec model.

        Row ``i`` holds the pretrained vector of the word whose tokenizer index
        is ``i``; words missing from the pretrained model (and row 0) stay all-zero.
        """
        # load the google pre-build model.
        word_vectors = gensim.models.KeyedVectors.load_word2vec_format(self.word2vec_path, binary=True)

        # + 1 to account for padding token. 0 is always reserved for padding in embedding layer.
        self.num_tokens = len(self.word_index) + 1

        # Take the dimension from the loaded model instead of assuming 300, so
        # any word2vec file works.
        self.embedding_matrix_num_features = word_vectors.vector_size

        # Initialize a matrix of zeroes of size: vocabulary * embedding dimension.
        self.embedding_matrix = np.zeros((self.num_tokens, self.embedding_matrix_num_features))

        for word, i in self.word_index.items():
            if word_vectors.has_index_for(word):
                self.embedding_matrix[i] = word_vectors[word]
    
    def run(self, max_padding_length: int):
        """ 
        Convert the tokens into vectors.

        Also saves the embedding matrix to ``embedding_matrix.npy``.

        :param max_padding_length: Length every input sequence is padded / trimmed to.

        :returns token_matrix: The padded n-gram prefixes (the model inputs).
        :returns target: The target dataframe (next token of every prefix).
        :returns num_tokens: Number of tokens in the vocabulary, + 1 for padding.
        :returns embedding_matrix_num_features: the dimension of the embedding matrix.
        :returns embedding_matrix: The embedding matrix.
        """
        star_print_length: int = 70
        print()
        print("*"*star_print_length)
        print("Word to vectorization process in progress...")
        print("*"*star_print_length)

        print("\tToken to numbers ...")
        self._tokens_to_vectors()  

        # The n-grams are generated before padding, so the padding zeros never
        # end up as prediction targets.
        print("\tGenerating the n-grams ...")
        target = self._generate_n_grams()

        print("\tpadding the sequence tokens...")
        self._sequence_padding(max_padding_length)

        print("\tCreating embedding matrix...")
        self._create_embedding_matrix()
        
        print("\tSaving the embedding matrix...")
        file_name: str = "embedding_matrix" # the extension will be npy
        np.save(file_name, self.embedding_matrix, allow_pickle=True)
        # to load -- np.load(file_name, allow_pickle=True)

        print(f"Number of tokens in the vocabulary --> {self.num_tokens - 1}")

        print()
        print("*"*star_print_length)
        print("Converted words to vectors.")
        print("*"*star_print_length)

        return self.token_matrix, target, self.num_tokens, self.embedding_matrix_num_features, self.embedding_matrix



# 5 | Generate n-grams and labels

Now that every word in a sentence is represented by a number, we have to create the n-gram data.
A sentence like "My name is Pavithra" --> this will get converted into a sequence of the words' respective tokens.

word_index dictionary for it:
    {
        "My": 1,
        "name": 2,
        "is": 3,
        "Pavithra": 4
    }

so the sentence now will be [ '1' ,'2' ,'3', '4' ]

Then, we have to build the n-gram sequences for good prediction.
This sequence will be split into multiple inputs as follows:

 [ '1' ,'2' ,'3', '4' ] -- ["My" ,"name", "is", "pavithra"]

 [ '1' ,'2' ,'3' ] -- ["My" ,"name", "is"]
 
 [ '1' ,'2' ] -- ["My" ,"name"]



In [67]:
def one_hot_encode(data: pd.DataFrame, column_name: str, total_unique_words: int) -> pd.DataFrame:
    """ 
    One hot encode an integer label column.

    :param data: Dataframe holding the integer class labels.
    :param column_name: Name of the label column in ``data``.
    :param total_unique_words: Number of classes, i.e. the number of columns of
        the result. Every label must lie in ``[0, total_unique_words)``.
    :returns: one-hot encoded dataframe with one row per label and
        ``total_unique_words`` float32 columns.
    :raises ValueError: If a label is negative or >= ``total_unique_words``.
    """
    # NOTE: total_unique_words -- the max value you will be having in the labels + 1,
    # since we need to create that many columns.
    # ex:
    # [1,3,5] -- output will be having 6 columns --> [0,1,2,3,4,5] -- total classes here is 6.
    labels = np.asarray(data[column_name], dtype="int64").reshape(-1)

    # Reject out-of-range labels explicitly: a negative label would otherwise
    # wrap around and silently mark the wrong column.
    if labels.size and (labels.min() < 0 or labels.max() >= total_unique_words):
        raise ValueError(
            f"labels must be in [0, {total_unique_words}), "
            f"got values from {labels.min()} to {labels.max()}"
        )

    # The matrix is dense: len(labels) * total_unique_words float32 values.
    one_hot_matrix = np.zeros((labels.size, total_unique_words), dtype="float32")
    one_hot_matrix[np.arange(labels.size), labels] = 1.0

    return pd.DataFrame(one_hot_matrix)

# 4 | Model creation

In [77]:
def train_model(features, num_tokens, embedding_dim, embedding_matrix, padding_length,
                weight_decay=1e-4, dropout_rate=0.2, lr=0.001,
                num_epoches=20, batch_size=256):
    """ 
    Build the LSTM model, train it, save it and return it with its training history.

    Architecture: Embedding (initialized from ``embedding_matrix``, trainable)
    -> LSTM(50) -> Dropout -> Dense(num_tokens, softmax).

    :param features: tuple of 4 values -- X_train, y_train, X_val, y_val.
        The targets must be one-hot encoded (the loss is categorical cross-entropy).
    :param num_tokens: The number of tokens in the vocabulary (rows of the
        embedding matrix and size of the output layer).
    :param embedding_dim: The dimension of the embedding matrix.
    :param embedding_matrix: Initial weights of the embedding layer.
    :param padding_length: the length used for padding, this is the input feature length.
    :param weight_decay: Factor of the l2 regularization on the LSTM kernel. Default is:1e-4.
    :param dropout_rate: Dropout regularization rate.
    :param lr: learning rate.
    :param num_epoches: The maximum number of epochs (early stopping may end training sooner).
    :param batch_size: batch size to consider for each gradient descent step.

    :returns: tuple ``(model, history)`` -- the trained model and the object
        returned by ``model.fit``.
    """
    import os

    # Unpack first so a malformed `features` tuple fails before any model is built.
    X_train, y_train, X_val, y_val = features

    model = keras.Sequential()

    # add the embedding layer as the first layer.
    model.add(keras.layers.Embedding(
        num_tokens,
        embedding_dim,
        embeddings_initializer=keras.initializers.Constant(embedding_matrix),
        input_length=padding_length,
        trainable=True,
    ))

    # add a lstm layer and dropout layer to prevent overfitting.
    model.add(keras.layers.LSTM(units=50,
                                kernel_regularizer=keras.regularizers.l2(weight_decay)))
    model.add(keras.layers.Dropout(dropout_rate))

    # one output unit per vocabulary entry; softmax turns them into probabilities.
    model.add(keras.layers.Dense(num_tokens, activation='softmax'))

    # patience is the number of epochs to wait before stopping, if the model is not improving.
    early_stopping_cb = EarlyStopping(monitor='val_accuracy', verbose=0, patience=3, restore_best_weights=True)

    # compile the model.
    model.compile(loss="categorical_crossentropy",
                  optimizer=keras.optimizers.Adam(learning_rate=lr),
                  metrics=['accuracy'])

    # summary() prints by itself and returns None, so it must not be wrapped in print().
    model.summary()

    # Make sure the target directory exists *before* training, so a problem
    # with the save location shows up before the training time is spent.
    model_name: str = f"saved_model/lstm_lr{lr}_batch_size_{batch_size}.h5"
    os.makedirs(os.path.dirname(model_name), exist_ok=True)

    # fit the model.
    history = model.fit(X_train, y_train,
                        validation_data=(X_val, y_val),
                        shuffle=True,
                        epochs=num_epoches,
                        batch_size=batch_size,
                        callbacks=[early_stopping_cb])

    # save the model.
    model.save(model_name)

    return model, history


# putting it all together

In [69]:
# finding the max sentence length

# df_length = pd.DataFrame()
# df_length["length"] = data['title'].apply(lambda x: len(x))
# max_length = df_length["length"].max()

# print("max length of the tweets --->", max_length)
# del df_length


#-----------------------------------
# output
#-----------------------------------
# max length of the tweets ---> 38

In [71]:
# Run the whole data pipeline: preprocess the titles, convert them to padded
# n-gram sequences plus targets, one-hot encode the targets and split the
# result into training and validation sets.
COLUMN_NAME: Final[str] = "title"
# Length every input sequence is padded / trimmed to.
MAX_PADDING_LENGTH: Final[int] = 40
# Placeholder; set from `num_tokens` once the vocabulary has been built.
TOTAL_WORDS_IN_VOC: int = 0

data: pd.DataFrame = PreprocessData(data, COLUMN_NAME).run(remove_stop_words=False) # stop words are kept because words like "we, i, ..." are needed for prediction.
token_matrix, target_df , num_tokens, embedding_matrix_num_features, embedding_matrix = WordToVectors(data, COLUMN_NAME).run(MAX_PADDING_LENGTH)

# convert the target into one-hot encoded value.
TOTAL_WORDS_IN_VOC = num_tokens  # already includes the extra slot for the padding index 0.
target_df = one_hot_encode(target_df, "target", total_unique_words=TOTAL_WORDS_IN_VOC)

# split the input into training and validation set.
# NOTE: `data` is rebound to the token matrix here and no longer has the
# "title" column, so re-running this cell requires reloading the data first.
data = pd.DataFrame(token_matrix)
X_train, X_val, y_train, y_val = train_test_split(data, target_df, train_size=0.85, shuffle=True, random_state=1)

# report the size of each split.
print(f"\nNumber of values in train X -----------> {len(X_train)}")
print(f"Number of values in train y -----------> {len(y_train)}")
print(f"Number of values in Val X -----------> {len(X_val)}")
print(f"Number of values in val y -----------> {len(y_val)}")

# model, history = train_model((X_train, y_train, X_val, y_val), num_tokens=TOTAL_WORDS_IN_VOC,
#                               embedding_dim=embedding_matrix_num_features,
#                               embedding_matrix=embedding_matrix, 
#                               padding_length=MAX_PADDING_LENGTH)



Number of values in train X -----------> 40684
Number of values in train y -----------> 40684
Number of values in Val X -----------> 7180
Number of values in val y -----------> 7180


In [78]:
# Train the LSTM on the split produced by the pipeline cell; the remaining
# hyper-parameters (learning rate, epochs, batch size, ...) use train_model's defaults.
model, history = train_model((X_train, y_train, X_val, y_val), num_tokens=TOTAL_WORDS_IN_VOC,
                              embedding_dim=embedding_matrix_num_features,
                              embedding_matrix=embedding_matrix, 
                              padding_length=MAX_PADDING_LENGTH)

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 40, 300)           2077500   
                                                                 
 lstm_6 (LSTM)               (None, 50)                70200     
                                                                 
 dropout_5 (Dropout)         (None, 50)                0         
                                                                 
 dense_5 (Dense)             (None, 6925)              353175    
                                                                 
Total params: 2500875 (9.54 MB)
Trainable params: 2500875 (9.54 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
Epoch 1/20


2024-05-04 23:54:02.445975: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 1126946800 exceeds 10% of free system memory.


Epoch 2/20
Epoch 3/20
Epoch 4/20