In [1]:
pip install nltk pandas

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Import the required packages
import zipfile
import nltk
import re
import pandas as pd
import math
from nltk.tokenize import sent_tokenize, word_tokenize
from collections import defaultdict

In [3]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to C:\Users\Sujit Kumar
[nltk_data]     Killi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Unzip the dataset
dataset_path = 'A1_DATASET.zip'
extract_path = '/content/'

with zipfile.ZipFile(dataset_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

In [5]:
# Open file function
def open_file(file_path):
    file = open(file_path,'r')
    lines = file.readlines()
    file.close()
    return lines

In [6]:
# pre_process the reviews to clean the strings
def pre_process(lines):
    result = []
    for line in lines:
        line = line.rstrip('\n')
        sentences = sent_tokenize(line)
        for sentence in sentences:
            sentence = sentence.rstrip('.')
            # Substitue non string or white space character
            sentence = re.sub(r'[^\w\s]', '', sentence)
            # Substitute numerical digits
            sentence = re.sub(r'\d', '', sentence).lower()
            tokens = word_tokenize(sentence)
            if len(tokens) > 0:
                result.append(['<s>'] + tokens + ['</s>'])
    return result

In [7]:
# Pre-process the reviews
train_file_reviews = open_file('/content/A1_DATASET/train.txt')
train_sentences = pre_process(train_file_reviews)

In [8]:
# Generate the unigram frequencies and bigram frequencies
def get_uni_and_bigram_frequencies(sentences):
    unigram_frequencies,bigram_frequencies = defaultdict(int),defaultdict(int)
    for sentence in sentences:
        for ind in range(len(sentence)-1):
            unigram_frequencies[sentence[ind]]+=1
            if ind < len(sentence)-2:
                bigram_frequencies[" ".join(sentence[ind:ind+2])]+=1
    return unigram_frequencies,bigram_frequencies

In [9]:
unigram_frequencies,bigram_frequencies = get_uni_and_bigram_frequencies(train_sentences)

In [10]:
# Generate the unigram_table based on frequencies
def get_unigram_table(unigram_frequency,add_numerator = 0,add_denominator = 0):
    total_sum = sum(unigram_frequency.values())
    unigram_table = pd.DataFrame(unigram_frequency.items(),columns=['UniGram', 'Counts'])
    unigram_table['Probability'] = (unigram_table['Counts'] + add_numerator)/(total_sum + add_denominator)
    return unigram_table

In [11]:
# Generate the bigram_table based on frequencies
def get_bigram_table(unigram_frequency,bigram_frequency,add_numerator = 0,add_denominator = 0):
    bigram_table = pd.DataFrame(bigram_frequency.items(),columns=['BiGram','Counts'])
    def create_probability(row):
        return (row['Counts'] + add_numerator)/(unigram_frequency[row['BiGram'].split(" ")[0]] + add_denominator)
    bigram_table['Probability'] = bigram_table.apply(create_probability,axis = 1)
    return bigram_table

In [12]:
unigram_table = get_unigram_table(unigram_frequencies)
unigram_table

Unnamed: 0,UniGram,Counts,Probability
0,<s>,5193,0.061968
1,i,1712,0.020429
2,booked,86,0.001026
3,two,128,0.001527
4,rooms,202,0.002410
...,...,...,...
5908,stirrers,1,0.000012
5909,yo,1,0.000012
5910,yahoo,1,0.000012
5911,guarantee,1,0.000012


In [13]:
bigram_table = get_bigram_table(unigram_frequencies,bigram_frequencies)
bigram_table

Unnamed: 0,BiGram,Counts,Probability
0,<s> i,710,0.136723
1,i booked,21,0.012266
2,booked two,1,0.011628
3,two rooms,3,0.023438
4,rooms four,1,0.004950
...,...,...,...
33968,stand by,1,0.142857
33969,their promise,1,0.009091
33970,promise they,1,0.250000
33971,they advertise,1,0.002041


**Unknown words handling**

In [14]:
# Replace n<=1 words with unknown word
with_Unk_train_sentences = []
for sentence in train_sentences:
    updated_sentence = []
    updated_sentence.append('<s>')
    for ind in range(1,len(sentence)-1):
        if unigram_frequencies[sentence[ind]] <= 1:
            updated_sentence.append('<unk>')
        else:
            updated_sentence.append(sentence[ind])
    updated_sentence.append('</s>')
    with_Unk_train_sentences.append(updated_sentence)

In [15]:
with_UNK_unigram_frequencies,with_UNK_bigram_frequencies = get_uni_and_bigram_frequencies(with_Unk_train_sentences)

In [16]:
with_UNK_unigram_table = get_unigram_table(with_UNK_unigram_frequencies)
with_UNK_unigram_table

Unnamed: 0,UniGram,Counts,Probability
0,<s>,5193,0.061968
1,i,1712,0.020429
2,booked,86,0.001026
3,two,128,0.001527
4,rooms,202,0.002410
...,...,...,...
2967,dried,2,0.000024
2968,disgusted,2,0.000024
2969,palm,2,0.000024
2970,doll,2,0.000024


In [17]:
with_UNK_bigram_table = get_bigram_table(with_UNK_unigram_frequencies,with_UNK_bigram_frequencies)
with_UNK_bigram_table

Unnamed: 0,BiGram,Counts,Probability
0,<s> i,710,0.136723
1,i booked,21,0.012266
2,booked two,1,0.011628
3,two rooms,3,0.023438
4,rooms four,1,0.004950
...,...,...,...
30089,you paid,1,0.002242
30090,not stand,1,0.001629
30091,stand by,1,0.142857
30092,their promise,1,0.009091


**Smoothing with Katz backoff**

In [18]:
def katz_backoff_unigram(term,unigram_dict,alpha=0.1):
    prob = unigram_dict[term]
    if prob == 0:
        prob = alpha*unigram_dict['<unk>']
    return prob
def katz_backoff_bigram(term,unigram_dict,bigram_dict,alpha=0.1):
    prob = bigram_dict[term]
    if prob == 0:
        prob = alpha*unigram_dict[term.split(" ")[0]]
    if prob == 0:
        prob = alpha*unigram_dict['<unk>']
    return prob

**Smoothing with linear interpolation**

In [19]:
def linear_interpolation_unigram(term,unigram_dict):
    prob = unigram_dict[term]
    if prob == 0:
        prob = unigram_dict['<unk>']
    return prob
def linear_interpolation_bigram(term,unigram_dict,bigram_dict,lambda1=0.9,lambda2=0.1):
    prob1 = bigram_dict[term]
    prob2 = unigram_dict[term.split(" ")[0]]
    if prob2 == 0:
        prob2 = unigram_dict['<unk>']
    prob = lambda1*prob1 + lambda2*prob2
    return prob

**Perplexity**

In [20]:
test_file_reviews = open_file('/content/A1_DATASET/val.txt')
test_sentences = pre_process(test_file_reviews)

In [21]:
# Create dictionary from the n-gram tables
def get_probab_dictionaries(unigram_table,bigram_table):
    unigram_dict,bigram_dict = defaultdict(int), defaultdict(int)
    for index,row in unigram_table.iterrows():
        unigram_dict[row['UniGram']] = row['Probability']
    for index,row in bigram_table.iterrows():
        bigram_dict[row['BiGram']] = row['Probability']
    return unigram_dict, bigram_dict

In [34]:
def calculate_perplexities(unigram_table,bigram_table,test_sentences,alpha=0.1,lambda1=0.9,lambda2=0.1):
    unigram_dict,bigram_dict = get_probab_dictionaries(unigram_table,bigram_table)
    unigram_total_perplexity_kb,bigram_total_perplexity_kb = 0,0
    unigram_total_perplexity_li,bigram_total_perplexity_li = 0,0
    for testSet in test_sentences:
        unigram_log_probabilities_kb,bigram_log_probabilities_kb = 0,0
        unigram_log_probabilities_li,bigram_log_probabilities_li = 0,0
        for ind in range(len(testSet)-1):
            unigram_log_probabilities_kb+= math.log(katz_backoff_unigram(testSet[ind],unigram_dict,alpha))
            unigram_log_probabilities_li+= math.log(linear_interpolation_unigram(testSet[ind],unigram_dict))
            if ind < len(testSet)-2:
                bigram_log_probabilities_kb+=math.log(katz_backoff_bigram(" ".join(testSet[ind:ind+2]),unigram_dict,bigram_dict,alpha))
                bigram_log_probabilities_li+=math.log(linear_interpolation_bigram(" ".join(testSet[ind:ind+2]),unigram_dict,bigram_dict,lambda1,lambda2))
        unigram_total_perplexity_kb+= math.exp(-unigram_log_probabilities_kb/(len(testSet)-1))
        bigram_total_perplexity_kb+= math.exp(-bigram_log_probabilities_kb/(len(testSet)-1))
        unigram_total_perplexity_li+= math.exp(-unigram_log_probabilities_li/(len(testSet)-1))
        bigram_total_perplexity_li+= math.exp(-bigram_log_probabilities_li/(len(testSet)-1))
    print("Katz backoff")
    print("Unigram perplexity Katz backoff", round(unigram_total_perplexity_kb/len(test_sentences),2))
    print("Bigram perplexity Katz backoff", round(bigram_total_perplexity_kb/len(test_sentences),2))
    print("\n")
    print("Linear interpolation")
    print("Unigram perplexity Linear interpolation", round(unigram_total_perplexity_li/len(test_sentences),2))
    print("Bigram perplexity Linear interpolation", round(bigram_total_perplexity_li/len(test_sentences),2))
    print("\n")

**Training perplexities**

In [35]:
print("Training set \n")
calculate_perplexities(with_UNK_unigram_table,with_UNK_bigram_table,train_sentences)

Training set 

Katz backoff
Unigram perplexity Katz backoff 374.22
Bigram perplexity Katz backoff 38.34


Linear interpolation
Unigram perplexity Linear interpolation 349.65
Bigram perplexity Linear interpolation 31.53




**Test perplexities**

**alpha=0.1, lambda1=0.9, lambda2=0.1**

In [36]:
print("Test set \n")
calculate_perplexities(with_UNK_unigram_table,with_UNK_bigram_table,test_sentences)

Test set 

Katz backoff
Unigram perplexity Katz backoff 341.41
Bigram perplexity Katz backoff 198.11


Linear interpolation
Unigram perplexity Linear interpolation 310.11
Bigram perplexity Linear interpolation 170.94




**alpha=0.2, lambda1=0.8, lambda2=0.2**

In [37]:
print("Test set \n")
calculate_perplexities(with_UNK_unigram_table,with_UNK_bigram_table,test_sentences,0.2,0.8,0.2)

Test set 

Katz backoff
Unigram perplexity Katz backoff 330.93
Bigram perplexity Katz backoff 146.51


Linear interpolation
Unigram perplexity Linear interpolation 310.11
Bigram perplexity Linear interpolation 121.71




**alpha=0.3, lambda1=0.7, lambda2=0.3**

In [38]:
print("Test set \n")
calculate_perplexities(with_UNK_unigram_table,with_UNK_bigram_table,test_sentences,0.3,0.7,0.3)

Test set 

Katz backoff
Unigram perplexity Katz backoff 325.25
Bigram perplexity Katz backoff 123.63


Linear interpolation
Unigram perplexity Linear interpolation 310.11
Bigram perplexity Linear interpolation 101.57




**alpha=0.4, lambda1=0.6, lambda2=0.4**

In [39]:
print("Test set \n")
calculate_perplexities(with_UNK_unigram_table,with_UNK_bigram_table,test_sentences,0.4,0.6,0.4)

Test set 

Katz backoff
Unigram perplexity Katz backoff 321.41
Bigram perplexity Katz backoff 109.93


Linear interpolation
Unigram perplexity Linear interpolation 310.11
Bigram perplexity Linear interpolation 90.8


