In [320]:
import nltk
from nltk.util import ngrams
from nltk.probability import FreqDist, LidstoneProbDist
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
import random
from autocorrect import Speller

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/manish/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [321]:
# Sample data loading from a text file and remove line number
with open('./eng_news_2020_10K-sentences-1.txt', 'r', encoding='utf-8') as file:
    data = file.readlines()

processed_data = []

for line in data:
    sentances = sent_tokenize(line)
    for sentence in sentances:
        words = sentence.split()[1:]
        sentence = ' '.join(words)
        processed_data.append(sentence)

print(processed_data[0])

“18 months ago, we expelled a boy at Nations for selling drugs in six schools.


In [333]:

class Model:
    def __init__(self, ngram:int, th:int = 0.060, seed:int = 42) -> None:
        self.model          =   {}
        self.n              =   ngram
        self.spell          =   Speller(lang='en')
        self.stop_words     =   set(stopwords.words('english'))
        self.max_next       =   100
        self.dictionary     =   {}
        self.thresold       =   th
        self.seeds          =   seed
        self.random         =   random
        self.start_tag      =   '<s>'
        self.stop_tag       =   '</s>'

        self.random.seed(self.seeds)

    def nextWord(self, context:str, random:bool = True)->str:
        context = self.getTokens(context)[:-1]
        return self._nextWord(tokens=context,random=random)
    
    def _nextWord(self, tokens:list, random=True)->str:
        words = self._getTopWords(tokens)
        if len(words) > 1 and random:
            return self.random.choice(words)
        elif len(words) > 0:
            return words[0]
        else:
            return None
        
    def _getTopWords(self,tokens:list, n:int= None)->list:
        req = self.n-1
        if len(tokens) > req:
            tokens = tokens[-req:]
        try:
            next_words = sorted(self.model[tuple(tokens)].items(), key= lambda x: x[1], reverse=True)
            words = [word[0] for word in next_words if word[1] > self.thresold]
            print("words: ",words)
            if n is not None:
                return words[:n]
            return words
        except:
            return [self.stop_tag]
        
    def getTokens(self, sentence:str)->list:
        tokens = []
        token_list = list(word_tokenize(sentence.lower()))
        token_list = [word for word in token_list if word.isalnum()]
        token_list = list(filter(lambda token: nltk.tokenize.punkt.PunktToken(token).is_non_punct, token_list))
        prefix = [self.start_tag]*(self.n-1)
        sufix = [self.stop_tag]
        tokens.extend(prefix)
        tokens.extend(token_list)
        tokens.extend(sufix)
        return tokens
    
    def addToDictionary(self, word:str):
        entry = self.dictionary.get(word)
        if entry is None:
            self.dictionary[word] = 0
        self.dictionary[word] += 1
        
    def fit(self, train_data:list):
        for sentence in train_data:
            words = self.getTokens(sentence)
            n_grams_list = list(ngrams(words, self.n))
            for n_grams in n_grams_list:
                key, value = n_grams[:-1], n_grams[-1]
                
                if key not in self.model:
                    self.model[key] = {}
                if value not in self.model[key]:
                    self.model[key][value] = 1
                else:
                    self.model[key][value] += 1
        for key in self.model:
            total_count = float(sum(self.model[key].values()))
            for w3 in self.model[key]:
                self.model[key][w3] /= total_count

    def complete_sentence(self, context:str, random:bool = True)->str:
        words = self.getTokens(context)[:-1]
        next_word = words[-1]
        max = self.max_next
        while  next_word !=  self.stop_tag:
            next_word = self._nextWord(words,random)
            words.append(next_word)
            max -= 1
            if max==0:
                break
        sentence = " ".join(words)
        sentence = sentence.replace(self.start_tag, "")
        sentence = sentence.replace(self.stop_tag, ".")
        sentence = sentence.strip()
        return sentence


            

In [334]:
model = Model(3)
model.fit(processed_data)

print(model.nextWord("I am"))

words:  ['not', 'going']
not


In [335]:
print(model.nextWord("I am"))
print(model.nextWord("I am"))
print(model.nextWord("I am"))

words:  ['not', 'going']
not
words:  ['not', 'going']
going
words:  ['not', 'going']
not


In [336]:
sentence = model.complete_sentence("I")
print(sentence)

words:  ['have', 'm', 'think']
words:  ['also', 'to', 'no', 'a', 'been']
words:  ['be']
words:  ['a']
words:  []


KeyError: ('a', None)