# Introduction


This notebook is about predicting the next word of a sentence. Mathematically, this can be expressed as

$P(w_{i+1} \mid w_i, w_{i-1}, w_{i-2}, \dots)$ => predicting word $i+1$ based on the words $i, i-1, i-2, \dots$

Real life examples:
- In the Google search bar, when you type some words it shows possible completions.
- In WhatsApp and other messaging apps, a next-word suggestion is shown while you are typing.
- Gmail offers word completion while you are writing a mail.

# Pipeline


# TODO

In [2]:
# basic packages.
import re
import numpy as np
import pandas as pd 
import kaggle_utils as utils 

# visualization packages.
import plotly.graph_objects as go

# NLP framework.
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from tensorflow import keras
import gensim

# model preprocessing and metrics.
from sklearn.model_selection import train_test_split

from tensorflow.keras.callbacks import EarlyStopping

2024-05-03 19:36:23.747421: W tensorflow/stream_executor/platform/default/dso_loader.cc:59] Could not load dynamic library 'libcudart.so.10.1'; dlerror: libcudart.so.10.1: cannot open shared object file: No such file or directory
2024-05-03 19:36:23.747443: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [7]:

# Get the data
file_path: str = "data/medium_data.csv"
data = pd.read_csv(file_path)
data.head(5)

# We are just gonna consider the title of the articles.
# .copy() gives an independent frame: PreprocessData later assigns to this
# column in place, which on a derived selection can raise pandas'
# SettingWithCopyWarning.
data = data[["title"]].copy()

# Basic data common analysis.
column, categorical_data, numerical_data, missing_data = utils.Common_data_analysis(data)


Numerical data list [] ---> total 0 numerical values
Categorical data list ['title'] ---> total 1 categorical values



Unnamed: 0,data type,Missing Value(NA),?[]na null ' ',% of Missing value(NA),% of Missing value(?[]na null ' ')
title,object,0,0,0.0,0.0


In [3]:
# preprocessing 


# Can I remove punctuation? Yes, I have to.
# Can I remove links? Yes, they are not going to give any useful information here.
# Can I remove numbers? Yes, because "2 people died through the disaster" and "people died through the disaster" carry essentially the same meaning.
# Can I remove stop words? Yes, we can.


class PreprocessData:
    """
    Text preprocessing pipeline for one column of a dataframe.

    Every step overwrites ``data[column_name]`` in place, so the dataframe
    passed to the constructor is modified. After tokenization the column
    holds a list of tokens per row instead of a string.
    """

    # Characters replaced by a space in ``_remove_symbols``: ASCII digits and
    # punctuation. The apostrophe and the backslash are not part of the set.
    _SYMBOLS_PATTERN = re.compile(r"[0-9!\"#$%&()*+,\-./:;<=>?@\[\]^_`{|}~]")

    # A link is either "<scheme>://..." or "www. ..." up to the next whitespace.
    _LINK_PATTERN = re.compile(r"(?:\w+://|www\.)\S+", re.IGNORECASE)

    def __init__(self, data: pd.DataFrame, column_name: str) -> None:
        """
        Initialize the dataframe.

        :param data: The data dataframe to preprocess (modified in place).
        :param column_name: Which column we are doing the preprocessing on.
        """
        self.data: pd.DataFrame = data
        self.column_name: str = column_name
        # Where ``run`` pickles the preprocessed dataframe.
        self.file_name: str = "pre-processed-data.pkl"

    def _remove_symbols(self) -> None:
        """
        This function does the following things.
        * Removes the numbers.
        * Removes punctuations.
        * Removes any symbols.

        Each removed character is replaced by a single space, so words that
        were separated only by punctuation do not get glued together.
        """
        self.data[self.column_name] = self.data[self.column_name].apply(
            lambda sentence: self._SYMBOLS_PATTERN.sub(" ", sentence)
        )

    def _remove_links(self) -> None:
        """
        This function does the following things.
        * Removes any links (each link is replaced by a single space).

        Must run before ``_remove_symbols``: that step strips ":", "/" and
        ".", after which a link can no longer be recognised.
        """
        self.data[self.column_name] = self.data[self.column_name].apply(
            lambda sentence: self._LINK_PATTERN.sub(" ", sentence)
        )

    def _case_fold_to_lower_case(self) -> None:
        """
        This function does the following things.
        * Change the sentence into lower case.
        """
        self.data[self.column_name] = self.data[self.column_name].str.lower()

    def _fix_typo(self) -> None:
        """
        This function does the following things.
        * Fix the typo of the each word -- in real word when you get data from tweets, mails, articles etc
        * there is a chance of typo since they are written by humans. We need to fix it to get better results.

        Not implemented yet: this is currently a no-op placeholder.
        """
        pass

    def _tokenization(self) -> None:
        """
        This function does the following things.
        * Tokenization -- which is splitting the sentence into words.
        """
        # Local import: the file only does ``from nltk.<submodule> import ...``,
        # which never binds the bare name ``nltk`` used here.
        import nltk

        # NOTE(review): word_tokenize presumably needs the NLTK tokenizer
        # data to be downloaded beforehand -- confirm on the target machine.
        self.data[self.column_name] = self.data[self.column_name].apply(nltk.word_tokenize)

    def _remove_stop_words(self) -> None:
        """
        This function does the following things.
        * Removes the stop words from the tokens.
        """
        # Build the lookup once for the whole column; a set gives O(1)
        # membership tests instead of scanning a list for every token.
        stop_words = frozenset(stopwords.words('english'))

        def remove_stop_word(words: list) -> list:
            """
            Remove the stop words from the list of words.

            :param words: The list of words in a sentence.
            :returns: List of words which are not stop words.
            """
            return [word for word in words if word not in stop_words]

        self.data[self.column_name] = self.data[self.column_name].apply(remove_stop_word)

    def _lemmatization(self) -> None:
        """
        This function does the following things.
        * Lemmatization -- which is converting every word into it's root form.
        """
        # One lemmatizer instance is shared by all rows.
        lemmatizer = WordNetLemmatizer()

        def create_lemma(words: list) -> list:
            """
            Create the lemma for the list of string.

            :param words: The list of tokens of one sentence.
            :returns: The lemmatized tokens, in the same order.
            """
            return [lemmatizer.lemmatize(token) for token in words]

        self.data[self.column_name] = self.data[self.column_name].apply(create_lemma)

    def run(self, save_preprocessed_dataframe: bool = True,
            verbose: bool = True,
            fix_typo: bool = True,
            lemmatize: bool = True,
            remove_stop_words: bool = True,
            ) -> pd.DataFrame:
        """
        Actually run the preprocessing pipeline.
        Steps which are involved in it.
        * 1 - Remove links (done first, while "://" and "." are still present).
        * 2 - Remove symbols (including numbers, symbols, special characters)
        * 3 - Case fold -- change all to lower case.
        * 4 - Fix typos.
        * 5 - tokenization.
        * 6 - Remove stop words.
        * 7 - lemmatization.

        :param save_preprocessed_dataframe: Pickle the result to ``self.file_name``.
        :param verbose: Print a progress message for the steps.
        :param fix_typo: Run the typo-fixing step (currently a no-op).
        :param lemmatize: Run the lemmatization step.
        :param remove_stop_words: Run the stop-word removal step.
        :returns: The preprocessed dataframe (the same object as ``self.data``).
        """
        def log(message: str) -> None:
            """Print ``message`` only when verbose output was requested."""
            if verbose:
                print(message)

        log("*"*50)
        log("\nPreprocessing data...")
        log("*"*50)
        log("")

        # 1 | Remove links.
        log("\tRemoving links...")
        self._remove_links()

        # 2 | Remove symbols (including numbers, symbols, special characters).
        log("\tRemoving numbers, punctuations and special characters...")
        self._remove_symbols()

        # 3 | Case fold -- change all to lower case.
        self._case_fold_to_lower_case()

        # 4 | Fix typos.
        if fix_typo:
            log("\tFixing typos...")
            self._fix_typo()

        # 5 | tokenization.
        log("\tTokenization -- Splitting the sentence into tokens...")
        self._tokenization()

        # 6 | Remove stop words.
        if remove_stop_words:
            log("\tRemoving stop words...")
            self._remove_stop_words()

        # 7 | lemmatization.
        if lemmatize:
            log("\tLemmatization...")
            self._lemmatization()

        # save the dataframe, so that we can skip the preprocessing next time
        # (it can be loaded back with pd.read_pickle(self.file_name)).
        if save_preprocessed_dataframe:
            log("\tSaving the dataframe for future use...")
            # Pickle the instance's own dataframe, not a module-level global.
            self.data.to_pickle(self.file_name)

        log("Preprocessing is done successfully.")
        log("*"*50)

        return self.data


# can be done easily using spacy -- but less control over it
# def spacy_tokenizer(doc):
#   return [t.lemma_.lower() for t in nlp(doc) if \
#           len(t) > 2 and \
#           not t.is_punct and \
#           not t.is_space and \
#           not t.is_stop and \
#           t.is_alpha]
# tokenizer = keras.preprocessing.text.Tokenizer(filters="", lower=False, split='|', oov_token='OOV')

# # Import the NLP module which contains Spark NLP and NLU libraries
# from johnsnowlabs import nlp
# spark = nlp.start(nlp=False)

# # Use Norvig model
# nlp.load("en.spell.norvig").predict("Plaese alliow me tao introdduce myhelf, I am a man of wealth und tiaste")



In [None]:
# From the source code:

# - **fit_on_texts** Updates internal vocabulary based on a list of texts. This method creates the vocabulary index based on word frequency.
#  So if you give it something like, "The cat sat on the mat." It will create a dictionary s.t. word_index["the"] = 1;
# word_index["cat"] = 2 it is word -> index dictionary so every word gets a unique integer value. 0 is reserved for padding.
# So lower integer means more frequent word (often the first few are stop words because they appear a lot).


# - **texts_to_sequences** Transforms each text in texts to a sequence of integers. So it basically takes each word in the text and 
# replaces it with its corresponding integer value from the word_index dictionary. Nothing more, nothing less, certainly no magic involved.


# Why not combine them? Because you almost always fit once and convert to sequences many times.
# You will fit on your training corpus once and use that exact same word_index dictionary at train / eval / testing / prediction
# time to convert actual text into sequences to feed them to the network. So it makes sense to keep those methods separate.


class WordToVectors:
    """
    Converting the word to vectors(numbers). Using pretrained "word2vec" model.

    The pipeline (see ``run``) maps every token to its integer index, pads
    the index sequences to a common length and builds an embedding matrix
    whose row ``i`` is the pretrained vector of the word with index ``i``.
    """

    # Location of the pretrained word2vec binary used when none is given.
    DEFAULT_WORD2VEC_PATH: str = "/home/pavithra/projects/NLP/GoogleNews-vectors-negative300.bin"

    def __init__(self, data: pd.DataFrame, column_name: str,
                 word2vec_path: str = DEFAULT_WORD2VEC_PATH) -> None:
        """
        Initialize the variables.

        :param data: The data dataframe to preprocess.
        :param column_name: Which column we are doing the preprocessing on.
        :param word2vec_path: Path of the binary word2vec file to load.
        """
        self.data = data
        self.column_name = column_name
        self.word2vec_path: str = word2vec_path
        self.tokenizer = None
        # To store the generated vectors.
        self.df = pd.DataFrame()

        # Store the word-index for future reference.
        self.word_index = None
        self.embedding_matrix = None

        # Information about the embedding matrix (rows, columns).
        self.num_tokens: int = 0
        self.embedding_matrix_num_features: int = 0

    def _tokens_to_vectors(self) -> None:
        """
        Generate the vectors from the tokens.

        Fits the tokenizer on the column, stores the per-row index sequences
        in ``self.df["token_index"]`` and keeps the word -> index mapping in
        ``self.word_index``.
        """
        # First converting the tokens into corresponding numbers.
        # oov_token='<OOV>' replaces all out-of-vocabulary words with <OOV>.
        if self.tokenizer is None:
            self.tokenizer = keras.preprocessing.text.Tokenizer(lower=True, oov_token="<OOV>")

        texts = self.data[self.column_name]

        # give the texts to the tokenizer and fit on them.
        self.tokenizer.fit_on_texts(texts)

        # why token_index -- because the number is actually the index of the word which is stored in the word_index dictionary.
        self.df["token_index"] = self.tokenizer.texts_to_sequences(texts)

        self.word_index = self.tokenizer.word_index

    def _sequence_padding(self, max_padding_length: int) -> None:
        """
        Pad the sequence, this is because all the sentence won't be in the same length.
        We will decide the max padding length and convert all the sentence to that length.
        if the sentence length < max_padding_length --> fill the remaining place with 0.
        if the sentence length > max_padding_length --> Trim the sentence.

        :param max_padding_length: Length every index sequence is brought to.
        """
        # Pad the index sequences produced by ``_tokens_to_vectors`` -- not
        # the raw dataframe.
        padded = keras.preprocessing.sequence.pad_sequences(self.df["token_index"].tolist(),
                                                            maxlen=max_padding_length)
        # pad_sequences returns a 2-D array; store one list per row so the
        # result fits into a single dataframe column.
        self.df["token_index"] = padded.tolist()

    def _create_embedding_matrix(self) -> None:
        """
        Build the embedding matrix from the pretrained word2vec model.

        Row ``i`` holds the pretrained vector of the word whose index is
        ``i``; words missing from the pretrained model (and the padding
        row 0) stay all-zero.
        """
        # load the pre-built model.
        word_vectors = gensim.models.KeyedVectors.load_word2vec_format(self.word2vec_path, binary=True)

        # + 1 to account for padding token. 0 is always reserved for padding in embedding layer.
        self.num_tokens = len(self.word_index) + 1

        # Initialize a matrix of zeroes of size: vocabulary * embedding dimension.
        # The dimension is read from the loaded model instead of being hard-coded.
        self.embedding_matrix_num_features = word_vectors.vector_size
        self.embedding_matrix = np.zeros((self.num_tokens, self.embedding_matrix_num_features))

        for word, index in self.word_index.items():
            if word in word_vectors:
                # Assigning into the matrix row already copies the values.
                self.embedding_matrix[index] = word_vectors[word]

    def run(self, max_padding_length: int) -> tuple:
        """
        Convert the tokens into vectors.

        :param max_padding_length: Length every index sequence is padded/trimmed to.
        :returns: ``(num_tokens, embedding_matrix_num_features, embedding_matrix)``.
        """
        print("Word to vectorization process in progress...")
        self._tokens_to_vectors()
        self._sequence_padding(max_padding_length)
        self._create_embedding_matrix()

        return self.num_tokens, self.embedding_matrix_num_features, self.embedding_matrix

