# Language modelling using N-grams

## Aim: To build bigram and trigram language models on a corpus and compare the next-word probabilities they produce, using Python and NLTK



In [None]:
!pip install advertools -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.1/310.1 KB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m272.9/272.9 KB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m54.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m82.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m93.9/93.9 KB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 KB[0m [31m22.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.3/57.3 KB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m74.6/74.6 KB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
!pip install contractions -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m287.5/287.5 KB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.5/104.5 KB[0m [31m11.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import re
from collections import Counter

import advertools as adv
import contractions
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [None]:
# Fetch the NLTK resources this notebook relies on: the stop-word corpus and
# the Punkt tokenizer data used by word_tokenize. Recent NLTK releases load
# the tokenizer from 'punkt_tab' rather than the pickled 'punkt' package, so
# both are downloaded to keep the notebook working across versions.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
class ngram:
    """N-gram next-word predictor built from a single text corpus.

    The corpus is lower-cased and its contractions are expanded on
    construction. For a context of ``n`` words, each candidate word ``w`` is
    scored with the maximum-likelihood estimate

        P(w | context) = count(context + w) / count(context)

    where both counts are taken over the same token sequence. A two-word
    context therefore uses bigram counts in the denominator, a three-word
    context uses trigram counts, and so on.
    """

    # Compiled once and shared by all instances.
    _QUOTES_BRACKETS = re.compile(r"""["'()]""")  # characters deleted outright
    _PUNCTUATION = re.compile(r"[^\w\s]")         # characters replaced by a space

    def __init__(self, text):
        """Store the normalised corpus and the English stop-word set.

        Args:
            text: Raw corpus as a single string.
        """
        self.ltext = text.lower()
        self.text = contractions.fix(self.ltext)
        self.stop_words = set(adv.stopwords['english'])

    def remove_quotes_brackets(self):
        """Return the corpus with quotes and round brackets deleted.

        The result is also stored on ``self.text1``.
        """
        self.text1 = self._QUOTES_BRACKETS.sub("", self.text)
        return self.text1

    def remove_punctuations(self, text):
        """Return ``text`` with every non-word, non-space character replaced by a space."""
        return self._PUNCTUATION.sub(" ", text)

    def tokenize(self, text):
        """Split ``text`` into tokens with NLTK's ``word_tokenize``.

        The token list is also stored on ``self.t_text``.
        """
        self.t_text = word_tokenize(text)
        return self.t_text

    def stop_word(self, text):
        """Return the tokens of ``text`` that are not stop words.

        Args:
            text: Iterable of tokens.

        The filtered list is also stored on ``self.sw``.
        """
        self.sw = [token for token in text if token not in self.stop_words]
        return self.sw

    def count(self, lst):
        """Return a dict mapping each distinct item of ``lst`` to its frequency.

        Items must be hashable. Keys appear in order of first occurrence.
        """
        # Counter makes a single pass; calling lst.count() per item is quadratic.
        return dict(Counter(lst))

    @staticmethod
    def _ngrams(tokens, n):
        """Return every run of ``n`` consecutive tokens as a tuple (like ``nltk.ngrams``)."""
        tokens = list(tokens)
        return list(zip(*(tokens[i:] for i in range(n))))

    def preprocess(self):
        """Clean and tokenize the corpus.

        Returns:
            A tuple ``(p_text, t_text, ctext)``: the punctuation-free text,
            its token list, and a dict of token frequencies.

        The stop-word-filtered tokens are computed as well and left on
        ``self.sw``; the returned tokens are not filtered.
        """
        q_text = self.remove_quotes_brackets()
        p_text = self.remove_punctuations(q_text)
        t_text = self.tokenize(p_text)
        self.stop_word(t_text)
        ctext = self.count(t_text)
        return p_text, t_text, ctext

    def n_grams(self, txt, sent, pred, words, n):
        """Estimate P(pred | sent) from the token sequence ``words``.

        Args:
            txt: Cleaned corpus text. No longer consulted; kept so existing
                callers keep working.
            sent: Context words separated by whitespace (case-insensitive).
            pred: Candidate next word (case-insensitive).
            words: Corpus tokens, in order.
            n: Number of words in the context (2 for a bigram context,
                3 for a trigram context, ...).

        Returns:
            ``count(context + pred) / count(context)`` rounded to two
            decimals, or ``0.0`` when the context never occurs as an n-gram
            of ``words``.

        Raises:
            ValueError: If ``n`` is smaller than 1.
        """
        if n < 1:
            raise ValueError("n must be at least 1")
        context = tuple(sent.lower().split())
        context_total = self._ngrams(words, n).count(context)
        if context_total == 0:
            return 0.0
        # Count the continuation on tokens, not on the raw string: substring
        # matching would find "he ran away" inside "she ran away" and would
        # miss matches wherever removed punctuation left a double space.
        continuation = context + (pred.lower(),)
        hits = self._ngrams(words, n + 1).count(continuation)
        return round(hits / context_total, 2)

    def predict(self, sentence):
        """Return the most probable next word after ``sentence``.

        Args:
            sentence: Context words separated by whitespace.

        Returns:
            ``(word, probability)`` for the best continuation; ties go to the
            word that appears first in the corpus. ``(None, 0.0)`` when no
            continuation of the context exists in the corpus.

        Raises:
            ValueError: If ``sentence`` has no words or the corpus has no tokens.
        """
        p_text, t_text, _ = self.preprocess()
        context = sentence.lower().split()
        if not context:
            raise ValueError("sentence must contain at least one word")
        if not t_text:
            raise ValueError("corpus contains no tokens")
        query = " ".join(context)
        n = len(context)
        # dict.fromkeys de-duplicates while keeping first-occurrence order, so
        # each distinct word is scored once.
        proba = {
            word: self.n_grams(p_text, query, word, t_text, n)
            for word in dict.fromkeys(t_text)
        }
        best = max(proba, key=proba.get)
        if proba[best] == 0:
            return None, 0.0
        return best, proba[best]

    def model(self, sentence=None):
        """Print the predicted next word for a context sentence.

        Args:
            sentence: Context to complete. When omitted, the user is
                prompted for it on standard input.
        """
        q = input('The sentence : ') if sentence is None else sentence
        max_k, max_v = self.predict(q)
        if max_k is None:
            print(f"No continuation of the sentence '{q}' was found in the corpus")
            return
        print(f"The predicted word after the sentence '{q}' is '{max_k}' with probability '{max_v}'")

In [None]:
# Toy corpus: four short sentences with overlapping vocabulary.
txt = (
    'The girl bought a chocolate. '
    'The boy ate the chocolate. '
    'The girl bought a toy. '
    'The girl played with the toy'
)

In [None]:
# Build the model from the corpus before querying it; without this line `t`
# is undefined when the notebook is run from top to bottom.
t=ngram(txt)
t.model()

The sentence : bought a
The predicted word after the sentence 'bought a' is 'chocolate' with probability '0.5'


In [None]:
# Rebuild the model on the same corpus and prompt for a context to complete.
t = ngram(txt)
t.model()

The sentence : girl bought a
The predicted word after the sentence 'girl bought a' is 'chocolate' with probability '0.5'


## Conclusion: The model correctly predicts the next word from both bigram and trigram contexts.