In [None]:
import numpy as np
import pandas as pd
import os, sys
import gensim, nltk, re, string
from collections import Counter
import unicodedata as ud

In [None]:
from sklearn.model_selection import train_test_split

## Preprocess corpus

In [None]:
def tokenize(row):
    return nltk.tokenize.wordpunct_tokenize(row)

In [None]:
def remove_punct(token):
    return ''.join(c for c in token if not ud.category(c).startswith('P'))

In [None]:
def reduce_vocab(df):
    filtered_df = [[token if counter[token]>5 else unk_token for token in row] for row in df]
    return filtered_df

In [8]:
corpus_path = '../data/book corpus/book_corpus.txt'
corpus = open(corpus_path).read().split('\n')
corpus = [tokenize(row) for row in corpus]
corpus_no_punct = [[remove_punct(token) for token in row if remove_punct(token) != ''] for row in corpus]
corpus_no_punct = [row for row in corpus_no_punct if len(row) > 0]

In [9]:
print(' '.join(corpus[0]),'\n')
print(' '.join(corpus_no_punct[0]))

مونا لیزا کی مسکراہٹ میں کیا بھید ہے ؟ اس کے ہونٹوں پر یہ شفق کا سونا ، سورج کا جشن طلوع ہے یا غروب ہوتے ہوئے آفتاب کا گہرا ملال ؟ ان نیم وا متبسم ہونٹوں کے درمیان یہ باریک سی کالی لکیر کیا ہے ؟ یہ طلوع و غروب کے عین بیچ میں اندھیرے کی آبشار کہاں سے گر رہی ہے ؟ 

مونا لیزا کی مسکراہٹ میں کیا بھید ہے اس کے ہونٹوں پر یہ شفق کا سونا سورج کا جشن طلوع ہے یا غروب ہوتے ہوئے آفتاب کا گہرا ملال ان نیم وا متبسم ہونٹوں کے درمیان یہ باریک سی کالی لکیر کیا ہے یہ طلوع و غروب کے عین بیچ میں اندھیرے کی آبشار کہاں سے گر رہی ہے


In [8]:
tokens = [token for row in corpus_no_punct for token in row]
unique_tokens = list(set(tokens))
print('Total number of tokens:', len(tokens))
print('Total number of unique tokens:', len(unique_tokens))    

Total number of tokens: 7949711
Total number of unique tokens: 115202


In [9]:
start_token = '"س"'
end_token = '"ش"'
unk_token = '"انک"'

In [10]:
corpus_no_punct = [[start_token]+row+[end_token] for row in corpus_no_punct]
tokens = [token for row in corpus_no_punct for token in row]
counter = Counter(tokens)
filtered_corpus = reduce_vocab(corpus_no_punct)
print(' '.join(filtered_corpus[0]))

"س" مونا لیزا کی مسکراہٹ میں کیا بھید ہے اس کے ہونٹوں پر یہ شفق کا سونا سورج کا جشن طلوع ہے یا غروب ہوتے ہوئے آفتاب کا گہرا ملال ان نیم وا متبسم ہونٹوں کے درمیان یہ باریک سی کالی لکیر کیا ہے یہ طلوع و غروب کے عین بیچ میں اندھیرے کی آبشار کہاں سے گر رہی ہے "ش"


In [11]:
print('Size of original vocabulary: ',len(counter))
tokens = [token for row in filtered_corpus for token in row]
counter = Counter(tokens)
print('Size of reduced vocabulary: ',len(counter))

Size of original vocabulary:  115204
Size of reduced vocabulary:  29109


In [12]:
train_corpus, test_corpus = train_test_split(filtered_corpus,test_size=0.1,random_state=42) 
len(train_corpus), len(test_corpus)

(142515, 15836)

In [13]:
del corpus, corpus_no_punct, filtered_corpus, counter, tokens, unique_tokens

In [32]:
[29109**i for i in range(1,4)]

[29109, 847333881, 24665041942029]

# N-Gram Langauge Model

### Unigram

In [28]:
class Unigram():
    def __init__(self, corpus=None):
        if corpus is None:
            self.model = None
        else:
            self.fit(corpus)
    
    def fit(self, corpus):
        tokens = [token for row in corpus for token in row]
        self.num_tokens = len(tokens)
        self.model = Counter(tokens)
        for token,count in self.model.items():
            self.model[token] = count/self.num_tokens
        self.V = len(self.model)
    
    def predict(self, test_corpus):
        M = sum([len(row) for row in test_corpus])
        l = -np.sum([np.sum(np.log([self.prob(token, unk='"انک"') for token in row])) for row in test_corpus]) / M
        return 2**l
    
    def prob(self, token, unk=None):
        if unk is None:
            return self.model.get(token, 0)            
        else:
            return self.model.get(token, self.model.get(unk, 0))
        
    def count(self, token):
        return self.model[token]*self.num_tokens

In [19]:
unigram = Unigram()
unigram.fit(train_corpus)

In [20]:
unigram.predict(test_corpus)

120.51595575442532

### Bigram

In [45]:
class Bigram():
    def __init__(self, corpus = None, smoothing=False, smoothing_val=1, backoff=False, discount_val=0.5):
        if corpus is None:
            self.model = None
        else:
            self.fit(corpus)
            
        self.smoothing = smoothing
        self.smoothing_val = smoothing_val
        self.backoff = backoff
        self.discount_val = discount_val
        
        assert not ((self.smoothing is True) and (self.backoff is True))
    
    def fit(self, corpus):
        self.unigram = Unigram(corpus=corpus)
        
        corpus = [[tuple(row[i:i+2]) for i in range(len(row)-1)] for row in corpus]
        bigram_tokens = [bigram_token for row in corpus for bigram_token in row]  
        
        self.num_tokens = len(bigram_tokens)
        self.V = self.unigram.V**2

        self.model = Counter(bigram_tokens)
        
        # no model smoothing/discounting
        if (self.smoothing is False) and (self.backoff is False):
            for bigram_token,count in self.model.items():
                self.model[bigram_token] = count/self.unigram.count(bigram_token[0])
        
        # model smoothing
        elif (self.smoothing is True) and (self.backoff is False):
            for bigram_token,count in self.model.items():
                self.model[bigram_token] = (count+self.smoothing_val)/(self.unigram.count(bigram_token[0])+(self.smoothing_val*self.V))
                
            for token, p in self.unigram.model.items():
                self.unigram.model[token] = 1/(self.unigram.count(token)+(self.smoothing_val*self.V))
        
        # model discounting
        elif (self.smoothing is False) and (self.backoff is True):
            for bigram_token,count in self.model.items():
                self.model[bigram_token] = (count-self.discount_val)/self.unigram.count(bigram_token[0])

            alpha = (self.discount_val*len(self.model))/self.num_tokens
            M = sum([self.unigram.count(token) for token in self.unigram.model.keys()])
            
            for token, p in self.unigram.model.items():
                self.unigram.model[token] = alpha * self.unigram.count(token)/self.unigram.num_tokens
                
    def predict(self, test_corpus):
        test_corpus = [[tuple(row[i:i+2]) for i in range(len(row)-1)] for row in test_corpus]
        M = sum([len(row) for row in test_corpus])
        
        if (self.smoothing is False) and (self.backoff is False):
            l = -np.sum([np.sum(np.log([self.prob(token) for token in row])) for row in test_corpus]) / M
        else:
            l = -np.sum([np.sum(np.log([self.prob(token, unk='"انک"') for token in row])) for row in test_corpus]) / M
            
        return 2**lz
    
    def prob(self, token, unk=None):
        if unk is None:
            return self.model.get(token, 0)            
        else:
            return self.model.get(token, self.unigram.prob(token[0], unk=unk))
        
    def count(self, token):
        if (self.smoothing is False) and (self.backoff is False):
            return self.model[token]*self.unigram.count(token[0])
        elif (self.smoothing is True) and (self.backoff is False):
            return (self.model[token]*(self.unigram.count(token[0])+(self.smoothing_val*self.V)))-self.smoothing_val
        elif (self.smoothing is False) and (self.backoff is True):
            return (self.model[token]*self.unigram.count(token[0]))+self.discount_val

#### Perplexity

In [18]:
bigram = Bigram()
bigram.fit(train_corpus)
print('Perplexity: ', bigram.predict(test_corpus))



Perplexity:  inf


In [19]:
bigram = Bigram(smoothing=True)
bigram.fit(train_corpus)
print('Perplexity: ', bigram.predict(test_corpus))

Perplexity:  96119.63410999677


In [20]:
bigram = Bigram(smoothing=True, smoothing_val=0.001)
bigram.fit(train_corpus)
print('Perplexity: ', bigram.predict(test_corpus))

Perplexity:  876.6444731553388


In [22]:
bigram = Bigram(smoothing=True, smoothing_val=1e-5)
bigram.fit(train_corpus)
print('Perplexity: ', bigram.predict(test_corpus))

Perplexity:  80.15561471602935


In [24]:
bigram = Bigram(backoff=True)
bigram.fit(train_corpus)
print('Perplexity: ', bigram.predict(test_corpus))

Perplexity:  49.855559635033465


## Trigram

In [41]:
class Trigram():
    def __init__(self, corpus=None, smoothing=False, smoothing_val=1, backoff=False, discount_val=0.5):
        if corpus is None:
            self.model = None
        else:
            self.fit(corpus)
            
        self.smoothing = smoothing
        self.smoothing_val = smoothing_val
        self.backoff = backoff
        self.discount_val = discount_val
        
        assert not ((self.smoothing is True) and (self.backoff is True))
    
    def fit(self, corpus):
        self.bigram = Bigram(smoothing=self.smoothing, smoothing_val=self.smoothing_val, backoff=self.backoff, discount_val=self.discount_val)
        self.bigram.fit(corpus)
        
        corpus = [[tuple(row[i:i+3]) for i in range(len(row)-2)] for row in corpus]
        trigram_tokens = [trigram_token for row in corpus for trigram_token in row]  
        
        self.num_tokens = len(trigram_tokens)
        self.V = self.bigram.unigram.V**3

        self.model = Counter(trigram_tokens)
        
        if (self.smoothing is False) and (self.backoff is False):
            for trigram_token,count in self.model.items():
                self.model[trigram_token] = count/self.bigram.count(trigram_token[:2])
        
        elif (self.smoothing is True) and (self.backoff is False):
            for trigram_token,count in self.model.items():
                self.model[trigram_token] = (count+self.smoothing_val)/(self.bigram.count(trigram_token[:2])+(self.smoothing_val*self.V))
                
            for token, p in self.bigram.model.items():
                self.bigram.model[token] = 1/(self.bigram.count(token)+(self.smoothing_val*self.V))
                
        elif (self.smoothing is False) and (self.backoff is True):
            for trigram_token,count in self.model.items():
                self.model[trigram_token] = (count-self.discount_val)/self.bigram.count(trigram_token[:2])

            alpha = (self.discount_val*len(self.model))/self.num_tokens
            M = sum([self.bigram.count(token) for token in self.bigram.model.keys()])
            
            for token, p in self.bigram.model.items():
                self.bigram.model[token] = alpha * self.bigram.count(token)/self.bigram.num_tokens
                
    def predict(self, test_corpus):
        test_corpus = [[tuple(row[i:i+3]) for i in range(len(row)-2)] for row in test_corpus]
        M = sum([len(row) for row in test_corpus])
        
        if (self.smoothing is False) and (self.backoff is False):
            l = -np.sum([np.sum(np.log([self.prob(token) for token in row])) for row in test_corpus]) / M
        else:
            l = -np.sum([np.sum(np.log([self.prob(token, unk='"انک"') for token in row])) for row in test_corpus]) / M
            
        return 2**l
    
    def prob(self, token, unk=None):
        if unk is None:
            return self.model.get(token, 0)            
        else:
            return self.model.get(token, self.bigram.prob(token[:2], unk=unk))

#### Perplexity

In [31]:
trigram = Trigram()
trigram.fit(train_corpus)
print('Perplexity: ', trigram.predict(test_corpus))

V  29108 847275664 24662500027712




Perplexity:  inf


In [42]:
trigram = Trigram(smoothing=True)
trigram.fit(train_corpus)
print('Perplexity: ', trigram.predict(test_corpus))

Perplexity:  331407611.39817506


In [44]:
del trigram

In [43]:
trigram = Trigram(smoothing=True, smoothing_val=0.001)
trigram.fit(train_corpus)
print('Perplexity: ', trigram.predict(test_corpus))

Perplexity:  3015852.8945301343


In [44]:
del trigram

In [46]:
trigram = Trigram(smoothing=True, smoothing_val=1e-5)
trigram.fit(train_corpus)
print('Perplexity: ', trigram.predict(test_corpus))

Perplexity:  128855.96118349521


In [47]:
del trigram

In [50]:
trigram = Trigram(smoothing=True, smoothing_val=1e-9)
trigram.fit(train_corpus)
print('Perplexity: ', trigram.predict(test_corpus))

Perplexity:  386.15454132013974


In [51]:
del trigram

In [48]:
trigram = Trigram(backoff=True)
trigram.fit(train_corpus)
print('Perplexity: ', trigram.predict(test_corpus))

Perplexity:  200.6903463255154


In [49]:
del trigram

In [54]:
trigram = Trigram(backoff=True, discount_val=0.7)
trigram.fit(train_corpus)
print('Perplexity: ', trigram.predict(test_corpus))

Perplexity:  194.58949638277534


In [55]:
del trigram

# Language Model

In [79]:
class NGram_LM():
    
    def __init__(self, n, corpus=None, train=True, smoothing=False, smoothing_val=1, backoff=False, discount_val=0.5, reduced_vocab=True, unk_token='"انک"'):
        self.n = n
        self.is_unigram = self.n == 1
            
        self.train = train
        self.smoothing = smoothing
        self.smoothing_val = smoothing_val
        self.backoff = backoff
        self.discount_val = discount_val
        self.reduced_vocab = reduced_vocab
        self.unk_token = unk_token
        
        if corpus is None:
            self.model = None
        else:
            self.fit(corpus)
            
        assert not ((self.smoothing is True) and (self.backoff is True))
        
    def fit(self, corpus):
        
        if self.is_unigram is True:
            
            ngrams = [ngram for row in corpus for ngram in row]
            self.num_ngrams = len(ngrams)
            self.model = Counter(ngrams)
            self.V = len(self.model)
            
            if self.train is True:
                self.compute_probabilities()
                
        else:
            
            self._backoff_model = NGram_LM(self.n-1, corpus=corpus, train=False, smoothing=self.smoothing, smoothing_val=self.smoothing_val, backoff=self.backoff, discount_val=self.discount_val)
            
            corpus_vocab = len([ngram for row in corpus for ngram in row])
            ngram_corpus = [[tuple(row[i:i+self.n]) for i in range(len(row)-self.n-1)] for row in corpus]
            ngrams = [ngram for row in ngram_corpus for ngram in row]
            del ngram_corpus

            self.num_ngrams = len(ngrams)
            self.V = corpus_vocab**self.n

            self.model = Counter(ngrams)

            if self.train is True:
                self.compute_probabilities()
                
            
    def compute_probabilities(self):
        if self.is_unigram is True:
            
            for ngrams, count in self.model.items():
                self.model[ngrams] = count/self.num_ngrams
                    
        else:
            if (self.smoothing is False) and (self.backoff is False):
                if self._backoff_model.is_unigram:
                    for ngram, count in self.model.items():
                        self.model[ngram] = count/self._backoff_model.model[ngram[0]]
                else:
                    for ngram, count in self.model.items():
                        self.model[ngram] = count/self._backoff_model.model[ngram[:self.n-1]]
                        
            elif (self.smoothing is True) and (self.backoff is False):
                
                if self._backoff_model.is_unigram:
                    for ngram, count in self.model.items():
                        self.model[ngram] = (count+self.smoothing_val)/(self._backoff_model.model[ngram[0]]+(self.smoothing_val*self.V))
                else:
                    for ngram, count in self.model.items():
                        self.model[ngram] = (count+self.smoothing_val)/(self._backoff_model.model[ngram[:self.n-1]]+(self.smoothing_val*self.V))

                for ngram, count in self._backoff_model.model.items():
                    self._backoff_model.model[ngram] = self.smoothing_val/(count+(self.smoothing_val*self.V))

                
            elif (self.smoothing is False) and (self.backoff is True):
                
                if self._backoff_model.is_unigram:
                    for ngram, count in self.model.items():
                        self.model[ngram] = (count-self.discount_val)/self._backoff_model.model[ngram[0]]
                else:
                    for ngram, count in self.model.items():
                        self.model[ngram] = (count-self.discount_val)/self._backoff_model.model[ngram[:self.n-1]]

                self.alpha = (self.discount_val*len(self.model))/self.num_ngrams
                M = sum(self._backoff_model.model.values())

                for ngram, count in self._backoff_model.model.items():
                    self._backoff_model.model[ngram] = self.alpha * count/self._backoff_model.num_ngrams
      
    def prob(self, ngram):
        
        if self.is_unigram:
            return self.unigram_prob(ngram)
            
        else:
            if (self.smoothing is False) and (self.backoff is False):
                return self.model.get(ngram, 0)
            else:                
                if self._backoff_model.is_unigram:
                    return self.model.get(ngram, self._backoff_model.unigram_prob(ngram[0]))
                else:
                    return self.model.get(ngram, self._backoff_model.prob(ngram[:self.n-1]))        
         
    def unigram_prob(self, ngram):
        if self.reduced_vocab:
            return self.model.get(ngram, self.model[self.unk_token])            
        else:
            return self.model.get(ngram, 0)    
            
    def perplexity(self, test_corpus):
        if self.is_unigram is False:   
            test_corpus = [[tuple(row[i:i+self.n]) for i in range(len(row)-self.n-1)] for row in test_corpus]
            
        M = sum([len(row) for row in test_corpus])
        l = -np.sum([np.sum(np.log([self.prob(ngram) for ngram in row])) for row in test_corpus]) / M
            
        return 2**l

In [80]:
# Unigram
model = NGram_LM(n=1)
model.fit(train_corpus)
print('Perplexity: ', model.perplexity(test_corpus))

Perplexity:  119.83675041373233


In [81]:
# Bigram
model = NGram_LM(n=2)
model.fit(train_corpus)
print('Perplexity: ', model.perplexity(test_corpus))



Perplexity:  inf


In [82]:
# Bigram with smoothing
for val in [1, 1e-3, 1e-6, 1e-10]:
    model = NGram_LM(n=2, smoothing=True, smoothing_val=val)
    model.fit(train_corpus)
    print(val, '\tPerplexity: ', model.perplexity(test_corpus))

1 	Perplexity:  220209624.11635014
0.001 	Perplexity:  3482075.176018939
1e-06 	Perplexity:  52003.43720722636


In [83]:
# Bigram with backoff
model = NGram_LM(n=2, backoff=True)
model.fit(train_corpus)
print('Perplexity: ', model.perplexity(test_corpus))

Perplexity:  51.01568008195498


In [84]:
# Trigram
model = NGram_LM(n=3)
model.fit(train_corpus)
print('Perplexity: ', model.perplexity(test_corpus))



Perplexity:  inf


In [85]:
# Trigram with smoothing
for val in [1, 1e-3, 1e-6, 1e-10]:
    model = NGram_LM(n=3, smoothing=True, smoothing_val=val)
    model.fit(train_corpus)
    print(val, '\tPerplexity: ', model.perplexity(test_corpus))

1 	Perplexity:  853816356246.01
0.001 	Perplexity:  82956831813.16142
1e-06 	Perplexity:  7381474896.397572


In [86]:
# Trigram with backoff
model = NGram_LM(n=3, backoff=True)
model.fit(train_corpus)
print('Perplexity: ', model.perplexity(test_corpus))

Perplexity:  61.33946116892834
