<a href="https://colab.research.google.com/github/masonreznov/CS-332-NLP-LAB/blob/main/LAB-6/Lab_6_language_modelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from nltk.corpus import reuters
from nltk import bigrams, trigrams
from collections import Counter, defaultdict
import nltk
nltk.download('reuters')
nltk.download('punkt')

### Build a trigram language model using a basic MLE estimator

In [None]:
## code referred from https://nlpforhackers.io/language-models/ and https://www.analyticsvidhya.com/blog/2019/08/comprehensive-guide-language-model-nlp-python-code/

## The language model is a two-level mapping: trigram_mle_model[(w1, w2)][w3]
## outer key  -> the two-word context (w1, w2)
## inner key  -> the word w3 that follows the context
## An inner key that has never been seen reads as 0.

trigram_mle_model = defaultdict(lambda: defaultdict(lambda: 0))



## Pass 1: count how often each word follows each (left/right padded) two-word context
for sent in reuters.sents():
    for first, second, third in trigrams(sent, pad_right=True, pad_left=True):
        trigram_mle_model[(first, second)][third] += 1

## Pass 2: divide every count by its context total, in place, so that the
## values stored for one context become relative frequencies
for followers in trigram_mle_model.values():
    context_total = float(sum(followers.values()))
    for word in followers:
        followers[word] /= context_total

In [None]:
## NOTE(review): this cell repeats the `import nltk` / `nltk.download('reuters')`
## setup already done in the first code cell, and it sits after the cell that
## reads `reuters.sents()`.
import nltk
nltk.download('reuters')

In [None]:
## Show the stored probability of every word seen after the two-word context ('he', 'will')
trigram_mle_model[("he", "will")]

In [None]:
## Show the stored probability of the word 'ask' following the two-word context ('he', 'will')
trigram_mle_model[("he", "will")]["ask"]

### Todo #1: Build a bigram language model using a basic MLE estimator

In [None]:
## Bigram MLE model, built the same way as the trigram model:
## bigram_mle_model[w1][w2] ends up holding count(w1, w2) / count(w1, *).
## An inner key that has never been seen reads as 0.

bigram_mle_model = defaultdict(lambda: defaultdict(lambda: 0))

## Count how often w2 is preceded by w1 (sentences are padded on both sides)
for sentence in reuters.sents():
    for w1, w2 in bigrams(sentence, pad_right=True, pad_left=True):
        bigram_mle_model[w1][w2] += 1

## Transform the counts to probabilities: P(w2 | w1)
for w1 in bigram_mle_model:
    total_count = float(sum(bigram_mle_model[w1].values()))
    for w2 in bigram_mle_model[w1]:
        bigram_mle_model[w1][w2] /= total_count

In [None]:
## Print one value from each hand-built model, separated by a dashed line:
## the trigram entry for 'ask' after ('he', 'will') and the bigram entry for 'will' after 'he'.
## Both models are nested defaultdicts, so a key that was never stored prints as 0
## (and is inserted by the lookup).
print(f"trigram mle score: {trigram_mle_model['he','will']['ask']}")
print("------------------------------------------------------------")
print(f"bigram mle score: {bigram_mle_model['he']['will']}")



### Language model estimation using nltk library

In [None]:
from nltk.lm import MLE, Laplace, KneserNeyInterpolated
from nltk.util import everygrams

import nltk
nltk.download("punkt")

In [None]:
## MLE estimator
## CODE referred from https://github.com/murthyrudra/IIITL_NLP_Lab/blob/main/Lab05/Lab05.ipynb
# https://www.nltk.org/api/nltk.lm.html
def build_mle_estimator(n):
    """Fit an order-``n`` ``MLE`` language model on the lower-cased Reuters words.

    All corpus words are lower-cased into one flat token list; every n-gram
    of length 1..n over that list is used as the single training "sentence",
    and the same token list is used to build the vocabulary.

    Args:
        n: highest n-gram order of the model.

    Returns:
        The fitted ``MLE`` model (its vocabulary is also printed).
    """
    corpus_tokens = [token.lower() for token in reuters.words()]

    # all 1..n-grams over the flat token stream
    training_grams = list(everygrams(corpus_tokens, max_len=n))

    model = MLE(n)
    model.fit([training_grams], vocabulary_text=corpus_tokens)
    print(model.vocab)

    return model

In [None]:
## Fit the order-2 (bigram) MLE language model
lm_mle_bigram = build_mle_estimator(n=2)

In [None]:
## sample texts for testing the conditional probability of the next word given the context word/words
def print_lm_scores(lm):
    """Print the model's score for four fixed (context word, next word) probes.

    Args:
        lm: a language model exposing ``score(word=..., context=[...])``.

    Each probe is scored with a one-word context and printed on its own
    line with five decimal places. Nothing is returned.
    """
    # (label printed before the number, next word, context word)
    probes = (
        ("Probability of 'he' followed by 'will': ", "will", "he"),
        ("Probability of 'america' followed by 'first': ", "first", "america"),
        ("Probability of 'you' followed by 'thank': ", "thank", "you"),
        ("Probability of 'thank' followed by 'you': ", "you", "thank"),
    )
    for label, next_word, context_word in probes:
        probability = lm.score(word=next_word, context=[context_word])
        print(f"{label}{probability:.5f}")


In [None]:
## Print the four probe probabilities for the bigram MLE model
print_lm_scores(lm=lm_mle_bigram)

#### TODO-2: Build a Laplace estimator

Refer [this](https://www.nltk.org/api/nltk.lm.html)

In [None]:

def build_laplace_estimator(n):
    """Fit an order-``n`` ``Laplace`` language model on the lower-cased Reuters words.

    Mirrors ``build_mle_estimator``: the corpus is lower-cased into one flat
    token list, every n-gram of length 1..n over that list is the training
    data, and the same token list builds the vocabulary.

    Args:
        n: highest n-gram order of the model (2 = bigram, 3 = trigram, ...).

    Returns:
        The fitted ``nltk.lm.Laplace`` model (its vocabulary is also printed).
    """
    texts = [w.lower() for w in reuters.words()]

    # generate ngrams
    ngrams = list(everygrams(texts, max_len=n))

    # build ngram language models
    lm = Laplace(n)
    lm.fit([ngrams], vocabulary_text=texts)
    print(lm.vocab)
    return lm

In [None]:
## Fit the order-2 (bigram) Laplace language model
lm_laplace_bigram = build_laplace_estimator(n=2)

In [None]:
## Print the probe probabilities twice: first from the bigram MLE model,
## then from the bigram Laplace model, so the two can be compared
print_lm_scores(lm=lm_mle_bigram)
print_lm_scores(lm=lm_laplace_bigram)

Probability of 'he' followed by 'will': 0.00154
Probability of 'america' followed by 'first': 0.00010
Probability of 'you' followed by 'thank': 0.00003
Probability of 'thank' followed by 'you': 0.00003


In [None]:
## Fit an order-3 (trigram) Laplace model and print the same four probes.
## NOTE(review): print_lm_scores only ever passes a one-word context, so this
## order-3 model is presumably being queried at the bigram level - confirm
## against the nltk.lm scoring documentation.
lm_lap_tri = build_laplace_estimator(n=3)
print_lm_scores(lm=lm_lap_tri)

#### TODO-3: Build an interpolated Kneser-Ney estimator

Refer [this](https://www.nltk.org/api/nltk.lm.html)

In [None]:
def build_kneyser_estimator(n):
    """Fit an order-``n`` interpolated Kneser-Ney language model on the lower-cased Reuters words.

    Follows the same recipe as the MLE and Laplace builders in this
    notebook: the corpus is lower-cased into one flat token list, every
    n-gram of length 1..n over that list is the training data, and the same
    token list builds the vocabulary.

    Args:
        n: highest n-gram order of the model (2 = bigram, 3 = trigram, ...).

    Returns:
        The fitted ``nltk.lm.KneserNeyInterpolated`` model (its vocabulary
        is also printed). The model's default discount is used.
    """
    texts = [w.lower() for w in reuters.words()]

    # generate ngrams
    ngrams = list(everygrams(texts, max_len=n))

    # build ngram language models
    lm = KneserNeyInterpolated(n)
    lm.fit([ngrams], vocabulary_text=texts)
    print(lm.vocab)

    return lm

In [None]:
## Fit the Kneser-Ney models: one of order 2 (bigram) and one of order 3 (trigram)
lm_kn_bi = build_kneyser_estimator(n=2)
lm_kn_tri = build_kneyser_estimator(n=3)

In [None]:
## Print the four probe probabilities for the bigram, then the trigram, Kneser-Ney model.
## NOTE(review): the probes use one-word contexts only, so the order-3 model is
## presumably scored at the bigram level here - confirm against the nltk.lm docs.
print_lm_scores(lm=lm_kn_bi)
print_lm_scores(lm=lm_kn_tri)


In [None]:
##### You can generate words from the trained Language models using generate() in nltk
##### Refer https://www.nltk.org/api/nltk.lm.html and generate words using lm.generate() for all the language models

#### Try to generate the next 10 words given the context/seed word ['he','will'] for all the models

**Submit the colab notebook link in this [form](https://forms.gle/1f5zLKen8s3PaivK6) on or before 19/04/2022**