# Natural Language Processing
# Assignment 1
Ashutosh Maurya (MDS202110)

## Corpus Creation

In [None]:
import copy
import json
import math
import os
import pickle
import string
from collections import Counter
from collections import OrderedDict
from itertools import tee, islice

import nltk
import regex as re


In [None]:
import zipfile
zip_folder = zipfile.ZipFile('pdf_json.zip', 'r')
# imgdata = archive.read('img_01.png')

In [None]:
zip_path = 'pdf_json.zip'
zip_folder = zipfile.ZipFile(zip_path, 'r')

In [None]:
len(zip_folder.namelist())-1

56529

In [None]:
### extract_body_text(file):
### input: open file-like object holding one paper as JSON
### output: lowercased text of every 'body_text' paragraph, joined by a single space
###         (empty string when the paper has no 'body_text' key)


def extract_body_text(file):
    """Return the lowercased body text of the JSON paper read from *file*."""
    paper_content = json.load(file)
    if 'body_text' not in paper_content:
        return ""

    # Join once and lowercase once: growing a string paragraph by paragraph and
    # re-lowercasing the whole of it each time is quadratic in the document size.
    # The space separator keeps the last word of one paragraph from being fused
    # with the first word of the next one.
    paragraphs = (bt['text'] for bt in paper_content['body_text'])
    return " ".join(paragraphs).lower()

In [None]:
### make_corpus(n_docs, zip_path):
### input: maximum number of .json documents to read, path of the zip archive
### output: the extract_body_text() output of those documents, joined by a single space
### only archive members ending in .json are read, as some members are not json


def make_corpus(n_docs=56529, zip_path='pdf_json.zip'):
    """Build one corpus string from the first *n_docs* JSON papers in the archive."""
    texts = []
    with zipfile.ZipFile(zip_path) as z:
        # infolist() yields ZipInfo objects, which carry .filename and .is_dir();
        # namelist() yields plain strings and os.path.isdir() would look at the
        # local filesystem instead of the archive.
        for info in z.infolist():
            if len(texts) >= n_docs:
                break
            if info.is_dir() or not info.filename.endswith('.json'):
                continue
            with z.open(info) as f:
                texts.append(extract_body_text(f))
    # Separate documents with a space so words at document boundaries stay distinct.
    return " ".join(texts)

In [None]:
%%time
corpus = make_corpus(50000)

CPU times: total: 12min 20s
Wall time: 12min 21s


In [None]:
# The corpus is read back as plain text in the next cell, so write it as text.
# (pickle.dump needs a binary-mode file and fails on a handle opened with 'w'.)
with open('corpus.txt', 'w', encoding='utf8') as file:
    file.write(corpus)

In [None]:
with open('corpus.txt', "r", encoding = "utf8") as file:
    full_corpus = file.readlines()

In [None]:
# readlines() returned a list of lines; join them back into one string.
# str() on the list would give its repr instead (brackets, quotes, escaped newlines).
full_corpus = "".join(full_corpus)

In [None]:
len(full_corpus)

1264601399

The variable ```full_corpus``` contains the required corpus.

## Preprocessing

- We shall first tokenize the sentences using ```nltk.tokenize.sent_tokenize```, because we need to tag the sentences with a start and end tag.
- Since our task is to predict missing text, we cannot remove stop words. Similarly, we cannot lemmatize the corpus.
- We shall remove all punctuation, and any extra spaces
- We shall remove all URLs.
- We shall remove all digits.
- We shall remove all special characters

In [None]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [None]:
### padding(n, corpus):
### input: n for n-gram; optional corpus (a string or an iterable of document strings),
###        defaults to the global full_corpus
### output: list of documents, each a list of lowercased sentences wrapped in
###         n-1 begin tags and n-1 end tags


def padding(n, corpus=None):
    """Split every document into sentences and wrap each in begin/end tags."""
    docs = full_corpus if corpus is None else corpus
    # A bare string is one document; iterating it directly would hand
    # sent_tokenize one character at a time.
    if isinstance(docs, str):
        docs = [docs]

    # 1 and 2 of either tag for bigrams and trigrams respectively
    begin_tags = "bgnsntnc " * (n - 1)
    end_tags = " endsntnc" * (n - 1)

    return [
        [begin_tags + sentence.lower() + end_tags for sentence in sent_tokenize(doc)]
        for doc in docs
    ]

In [None]:
%%time
corpus_bi = padding(2)

In [None]:
%%time
corpus_tri = padding(3)

CPU times: total: 7min 9s
Wall time: 7min 9s


In [None]:
# pickle.dump(corpus_bi,open('corpus_bi','wb'))

In [None]:
# pickle.dump(corpus_tri,open('corpus_tri','wb'))

In [None]:
# NOTE: only unpickle files this notebook wrote itself; pickle can execute code on load.
with open('corpus_bi', 'rb') as file:
    corpus_bi = pickle.load(file)

In [None]:
# NOTE: only unpickle files this notebook wrote itself; pickle can execute code on load.
with open('corpus_tri', 'rb') as file:
    corpus_tri = pickle.load(file)

In [None]:
### preprocessing(corpus):
### input: text corpus (list of lists of sentences)
### output: preprocessed corpus (str)
### removes URLs, drops every character that is neither a letter nor whitespace,
### and collapses runs of whitespace into a single space

def preprocessing(corpus):
    """Flatten *corpus* into one cleaned, space-separated string."""
    # join() builds a brand-new string and never touches the input lists,
    # so no defensive copy of the (potentially huge) corpus is needed.
    text = ' '.join(str(sentence) for doc in corpus for sentence in doc)

    # URLs go first, while their punctuation is still there to delimit them.
    text = re.sub(r'https?:\/\/.\S*', " ", text)

    # Keeping only letters and whitespace removes digits, punctuation and
    # special characters in a single pass.
    text = "".join(char for char in text if char.isalpha() or char.isspace())

    return re.sub(r'\s+', ' ', text).strip()

In [None]:
%%time
prep_corpus_bi = preprocessing(corpus_bi)

CPU times: total: 6min 52s
Wall time: 8min 49s


In [None]:
%%time
prep_corpus_tri = preprocessing(corpus_tri)

CPU times: total: 7min 49s
Wall time: 20min


The variables ```prep_corpus_bi``` and ```prep_corpus_tri``` contain the bigram and trigram preprocessed corpus respectively.

In [None]:
# pickle.dump(prep_corpus_bi,open('prep_corpus_bi','wb'))

In [None]:
# pickle.dump(prep_corpus_tri,open('prep_corpus_tri','wb'))

## Vocabulary Count

Counting the number of unique words in ```prep_corpus_bi``` will give us the vocabulary count.

In [None]:
%%time
# NOTE: only unpickle files this notebook wrote itself; pickle can execute code on load.
with open('prep_corpus_bi', 'rb') as file:
    prep_corpus_bi = pickle.load(file)

CPU times: total: 1.39 s
Wall time: 3.81 s


In [None]:
%%time
# NOTE: only unpickle files this notebook wrote itself; pickle can execute code on load.
with open('prep_corpus_tri', 'rb') as file:
    prep_corpus_tri = pickle.load(file)

CPU times: total: 1.5 s
Wall time: 4.15 s


In [None]:
%%time
all_words_bi = prep_corpus_bi.split(" ")

CPU times: total: 23.5 s
Wall time: 24.2 s


In [None]:
# %%time
# pickle.dump(all_words_bi,open('all_words_bi','wb'))

CPU times: total: 4min 25s
Wall time: 14min 8s


In [None]:
%%time
# NOTE: only unpickle files this notebook wrote itself; pickle can execute code on load.
with open('all_words_bi', 'rb') as file:
    all_words_bi = pickle.load(file)

CPU times: total: 35.3 s
Wall time: 50.9 s


In [None]:
%%time
all_words_tri = prep_corpus_tri.split(" ")

CPU times: total: 39.1 s
Wall time: 47.6 s


In [None]:
%%time
vocabulary = Counter(all_words_bi)

CPU times: total: 42.6 s
Wall time: 53.1 s


In [None]:
# pickle.dump(vocabulary,open('vocabulary','wb'))

In [None]:
V = len(vocabulary)

In [None]:
V

1337035

The vocabulary count is 1337035.

## Bigram and Trigram Language Models

In [None]:
from itertools import tee, islice

### ngrams(word_list, n):
### input: iterable of words and n for n-gram
### output: generator yielding every run of n consecutive words as a tuple
### lazy: only n words are buffered at a time, so the full list of n-grams is never built

def ngrams(word_list, n):
    """Yield each window of *n* consecutive items of *word_list* as a tuple."""
    # n independent iterators over the same input ...
    streams = tee(word_list, n)
    for offset, stream in enumerate(streams):
        # ... the k-th one is advanced k items: islice(stream, k, k) yields
        # nothing but consumes k items on the way.
        next(islice(stream, offset, offset), None)
    # Zipping the staggered iterators produces the sliding windows and stops as
    # soon as the most advanced iterator runs out (no n-gram if fewer than n words).
    yield from zip(*streams)

In [None]:
### make_model(word_list, n_gram):
### input: list of words and n for n-gram
### output: Counter mapping each n-gram (tuple of words) to the number of times it occurs
### the n-grams are streamed from ngrams() straight into the Counter,
### so they are counted without first being collected in a list

def make_model(word_list,n_gram):
    """Count how often every *n_gram*-sized word window occurs in *word_list*."""
    return Counter(ngrams(word_list, n_gram))

In [None]:
%%time
unigram_model = OrderedDict(vocabulary.most_common())

CPU times: total: 3.53 s
Wall time: 6.19 s


In [None]:
%%time
# NOTE: only unpickle files this notebook wrote itself; pickle can execute code on load.
with open('unigram_model', 'rb') as file:
    unigram_model = pickle.load(file)

CPU times: total: 1.31 s
Wall time: 2.4 s


In [None]:
# pickle.dump(unigram_model,open('unigram_model','wb'))

In [None]:
%%time
bigram_model = make_model(all_words_bi, 2)

CPU times: total: 4min 18s
Wall time: 5min 34s


In [None]:
%%time
# NOTE: only unpickle files this notebook wrote itself; pickle can execute code on load.
with open('bigram_model', 'rb') as file:
    bigram_model = pickle.load(file)

CPU times: total: 40.9 s
Wall time: 1min 3s


In [None]:
# pickle.dump(bigram_model,open('bigram_model','wb'))

In [None]:
unigram_model["the"]

10403597

In [None]:
bigram_model[("and","the")]

360116

In [None]:
%%time
trigram_model = make_model(all_words_tri, 3)

CPU times: total: 13min 40s
Wall time: 48min 52s


In [None]:
trigram_model[("and","the","people")]

119

In [None]:
### a memory-efficient way to dump and load trigram using itertools and generators

%%time
def chunks(data, SIZE):
    """Yield successive sub-dictionaries of *data* holding at most *SIZE* items each."""
    remaining_keys = iter(data)
    # One pass per expected batch; every pass pulls the next SIZE keys
    # from the single shared key iterator.
    for _ in range(0, len(data), SIZE):
        batch = {}
        for key in islice(remaining_keys, SIZE):
            batch[key] = data[key]
        yield batch

def dump_trigram():
    """Pickle the global trigram_model in batches to 'trigram_model0', 'trigram_model1', ...

    Each batch holds a fifth of the model (rounded down), so a model whose size
    is not a multiple of five produces a sixth, smaller file. The index of every
    batch is printed as it is written.
    """
    # A fifth rounded down is 0 for fewer than five trigrams, and chunks()
    # cannot step by 0 - fall back to one trigram per file in that case.
    batch_size = max(1, len(trigram_model) // 5)
    for batch, item in enumerate(chunks(trigram_model, batch_size)):
        print(batch)
        # the with-block closes (and flushes) each file before the next is opened
        with open('trigram_model' + str(batch), 'wb') as file:
            pickle.dump(item, file, protocol=pickle.HIGHEST_PROTOCOL)

dump_trigram()

0
1
2
3
4
5
CPU times: total: 2min 22s
Wall time: 3min 58s


In [None]:
def load_trigram(n_batches=6, prefix='trigram_model'):
    """Load and merge the pickled trigram batches '<prefix>0' ... '<prefix>{n_batches-1}'.

    Args:
        n_batches: number of batch files to read.
        prefix: path prefix of the batch files; the batch index is appended to it.

    Returns:
        A single plain dict holding the entries of every batch.

    Raises:
        FileNotFoundError: if one of the expected batch files does not exist.
    """
    trigram = {}
    for i in range(n_batches):
        # NOTE: only unpickle files this notebook wrote itself;
        # pickle can execute code on load.
        with open(prefix + str(i), 'rb') as file:
            # update() merges in place; rebuilding {**trigram, **batch} would
            # copy everything loaded so far once per batch.
            trigram.update(pickle.load(file))

    return trigram

trigram_model = load_trigram()

In [None]:
len(bigram_model),len(trigram_model)

(19646034, 69904872)

In [None]:
### predict_topten_bigram(target, bigrams, unigrams):
### input: the word preceding the missing word (str); optional bigram and unigram
###        frequency tables, defaulting to the global bigram_model / unigram_model
### output: dict with the (up to) ten most probable bigrams starting with target as keys
###         and their probability as values, most probable first
### probability is calculated using Laplacian smoothing
### bigrams ending in the end-of-sentence tag are never proposed

def predict_topten_bigram(target, bigrams=None, unigrams=None):
    """Return the ten most probable bigrams whose first word is *target*."""
    bigrams = bigram_model if bigrams is None else bigrams
    unigrams = unigram_model if unigrams is None else unigrams

    # The denominator is the same for every candidate, so compute it once.
    # .get() keeps an unknown target from raising; it simply has no candidates.
    denominator = unigrams.get(target, 0) + len(unigrams)

    candidates = {
        ngram: (count + 1) / denominator
        for ngram, count in bigrams.items()
        if ngram[0] == target and ngram[1] != 'endsntnc'
    }

    # sort only the candidates for this target, then keep the first ten
    ranked = sorted(candidates.items(), key=lambda item: item[1], reverse=True)
    return dict(ranked[:10])

In [None]:
### predict_topten_trigram(target1, target2, trigrams, bigrams):
### input: the two words preceding the missing word, in sentence order (str);
###        optional trigram and bigram frequency tables, defaulting to the
###        global trigram_model / bigram_model
### output: dict with the (up to) ten most probable trigrams starting with
###         (target1, target2) as keys and their probability as values, most probable first
### probability is calculated using Laplacian smoothing
### trigrams ending in the end-of-sentence tag are never proposed

def predict_topten_trigram(target1, target2, trigrams=None, bigrams=None):
    """Return the ten most probable trigrams that start with (target1, target2)."""
    trigrams = trigram_model if trigrams is None else trigrams
    bigrams = bigram_model if bigrams is None else bigrams

    # The denominator is the same for every candidate, so compute it once.
    # .get() also works when the bigram table is a plain dict rather than a Counter.
    # NOTE(review): add-one smoothing normally adds the vocabulary size here,
    # not the number of distinct bigrams; kept as is so the ranking and the
    # previously reported probabilities do not change - confirm the intent.
    denominator = bigrams.get((target1, target2), 0) + len(bigrams)

    candidates = {
        ngram: (count + 1) / denominator
        for ngram, count in trigrams.items()
        if ngram[0] == target1 and ngram[1] == target2 and ngram[2] != 'endsntnc'
    }

    # sort only the candidates for this context, then keep the first ten
    ranked = sorted(candidates.items(), key=lambda item: item[1], reverse=True)
    return dict(ranked[:10])

The ```bigram_model``` and ```trigram_model``` variables contain the bigram and trigram models respectively.

We save them to disk using ```pickle.dump(model,file)```.

## Predicting the missing text

To predict the missing word with an n-gram model, we take the n-1 words preceding it, extract from the dictionary of all n-grams those that start with these words, and then select the one with the highest probability.

In [None]:
### pred_miss_word_bigram(sent):
### input: sentence (str) in which every missing word is written as ___
### output: prints the top-ten bigrams with probability for every blank and the sentence
###         with each blank filled by the most probable bigram; returns nothing
### a blank whose preceding word starts no known bigram is left unfilled


def pred_miss_word_bigram(sent):
    """Print bigram-model predictions for every ___ in *sent* and the filled sentence."""
    # Underscores would be stripped by preprocessing, so swap the blank for a
    # purely alphabetic placeholder first.
    sent = sent.replace("___", "xxxmissingwordxxx")
    prep_sent = preprocessing([[sent]])
    sent_pad = "bgnsntnc " + prep_sent + " endsntnc"

    print("Preprocessed sentence: ",sent_pad)
    words = sent_pad.split()

    # index 0 is the begin tag, so every blank has a preceding word
    for i in range(1,len(words)):
        if words[i] != "xxxmissingwordxxx":
            continue

        target = words[i-1]
        topten_bigram = predict_topten_bigram(target)

        if not topten_bigram:
            # nothing to choose from: keep the placeholder instead of crashing
            print("No bigram starts with", repr(target), "- blank left unfilled")
            continue

        print(*topten_bigram.items(), sep='\n')

        # max() returns the first of equally probable candidates
        best_bigram, _ = max(topten_bigram.items(), key = lambda x : x[1])
        words[i] = best_bigram[1]

    print("-----------------------------------***-----------------------------------")
    print("Filled sentence: ", " ".join(words))
    print("-----------------------------------***-----------------------------------")

In [None]:
### pred_miss_word_trigram(sent):
### input: sentence (str) in which every missing word is written as ___
### output: prints the top-ten trigrams with probability for every blank and the sentence
###         with each blank filled by the most probable trigram; returns nothing
### a blank whose two preceding words start no known trigram is left unfilled

def pred_miss_word_trigram(sent):
    """Print trigram-model predictions for every ___ in *sent* and the filled sentence."""
    # Underscores would be stripped by preprocessing, so swap the blank for a
    # purely alphabetic placeholder first.
    sent = sent.replace("___", "xxxmissingwordxxx")
    prep_sent = preprocessing([[sent]])
    sent_pad = "bgnsntnc bgnsntnc " + prep_sent + " endsntnc endsntnc"

    print("Preprocessed sentence: ",sent_pad)
    words = sent_pad.split()

    # indices 0 and 1 are begin tags, so every blank has two preceding words
    for i in range(2,len(words)):
        if words[i] != "xxxmissingwordxxx":
            continue

        target1 = words[i-2]
        target2 = words[i-1]
        topten_trigram = predict_topten_trigram(target1,target2)

        if not topten_trigram:
            # nothing to choose from: keep the placeholder instead of crashing
            print("No trigram starts with", repr((target1, target2)), "- blank left unfilled")
            continue

        print(*topten_trigram.items(), sep='\n')

        # max() returns the first of equally probable candidates
        best_trigram, _ = max(topten_trigram.items(), key = lambda x : x[1])
        words[i] = best_trigram[2]

    print("-----------------------------------***-----------------------------------")
    print("Filled sentence: ", " ".join(words))
    print("-----------------------------------***-----------------------------------")

In [None]:
test_sentences_pred = ["all houses were ___ ventilated",
                  "it aims to develop an integrated ___ to reach mmps exposed to malaria with prevention diagnosis and treatment ___ by involving non-health ___ stakeholders from provincial to community level",
                  "this is because engineers do not work in ___ but rather as a team"]


In [None]:
for sentence in test_sentences_pred:
    # The function prints its own report and returns None,
    # so wrapping it in print() only adds a stray "None" line.
    pred_miss_word_bigram(sentence)

Preprocessed sentence:  bgnsntnc all houses were xxxmissingwordxxx ventilated endsntnc
(('were', 'not'), 0.01245136281178955)
(('were', 'used'), 0.012113702847909056)
(('were', 'also'), 0.009276146001742083)
(('were', 'performed'), 0.008972049842654501)
(('were', 'found'), 0.007881832761670296)
(('were', 'collected'), 0.007563987574113329)
(('were', 'obtained'), 0.006384806191268397)
(('were', 'observed'), 0.005918552372880122)
(('were', 'identified'), 0.005815030276169456)
(('were', 'detected'), 0.005102507094903073)
-----------------------------------***-----------------------------------
Filled sentence:  bgnsntnc all houses were not ventilated endsntnc
-----------------------------------***-----------------------------------
None
Preprocessed sentence:  bgnsntnc it aims to develop an integrated xxxmissingwordxxx to reach mmps exposed to malaria with prevention diagnosis and treatment xxxmissingwordxxx by involving nonhealth xxxmissingwordxxx stakeholders from provincial to communit

In [None]:
%%time
for sentence in test_sentences_pred:
    # The function prints its own report and returns None,
    # so wrapping it in print() only adds a stray "None" line.
    pred_miss_word_trigram(sentence)

Preprocessed sentence:  bgnsntnc bgnsntnc all houses were xxxmissingwordxxx ventilated endsntnc endsntnc
(('houses', 'were', 'made'), 1.5270227286644003e-07)
(('houses', 'were', 'built'), 1.5270227286644003e-07)
(('houses', 'were', 'investigated'), 1.0180151524429336e-07)
(('houses', 'were', 'malaria'), 1.0180151524429336e-07)
(('houses', 'were', 'contacted'), 1.0180151524429336e-07)
(('houses', 'were', 'tested'), 1.0180151524429336e-07)
(('houses', 'were', 'then'), 1.0180151524429336e-07)
(('houses', 'were', 'no'), 1.0180151524429336e-07)
(('houses', 'were', 'not'), 1.0180151524429336e-07)
(('houses', 'were', 'temperature'), 1.0180151524429336e-07)
-----------------------------------***-----------------------------------
Filled sentence:  bgnsntnc bgnsntnc all houses were made ventilated endsntnc endsntnc
-----------------------------------***-----------------------------------
None
Preprocessed sentence:  bgnsntnc bgnsntnc it aims to develop an integrated xxxmissingwordxxx to reach m

## Perplexity

Calculated using the formula $$\text{Perplexity}(W) =  \left( \frac{1}{\prod_{i=1}^N P(w_i|w_1,w_2,\ldots,w_{i-1})}  \right)^{1/N}$$
where $W$ is the sentence and $w_i$ are the words. We calculate the probabilities using chain rule.

In [None]:
### perplexity_score_bigram(sent, bigrams, unigrams):
### input: sentence (str); optional bigram and unigram frequency tables,
###        defaulting to the global bigram_model / unigram_model
### output: perplexity score (float)
### tags are added because it is a full sentence, so first bigram is ("tag","firstword"), and so on.
### unseen words and bigrams count as 0 and are handled by the Laplacian smoothing


def perplexity_score_bigram(sent, bigrams=None, unigrams=None):
    """Return the perplexity of *sent* under the Laplace-smoothed bigram model."""
    bigrams = bigram_model if bigrams is None else bigrams
    unigrams = unigram_model if unigrams is None else unigrams

    words = ("bgnsntnc " + sent + " endsntnc").split()
    vocab_size = len(unigrams)

    # Sum log-probabilities instead of multiplying probabilities: the raw
    # product underflows to 0.0 on long sentences and the division then fails.
    log_prob = 0.0
    for first, second in zip(words, words[1:]):
        # .get() so that unseen entries count as 0 instead of raising KeyError
        pair_count = bigrams.get((first, second), 0)
        context_count = unigrams.get(first, 0)
        log_prob += math.log((pair_count + 1) / (context_count + vocab_size))

    # NOTE(review): the root is taken over len(words), which includes both
    # tags and is one more than the number of bigram terms; kept unchanged so
    # scores stay comparable with earlier runs - confirm the intended N.
    return math.exp(-log_prob / len(words))

In [None]:
perplexity_score_bigram("it appears that the overall code stroke volume has decreased since the covid pandemic")

540.3838731126516

In [None]:
perplexity_score_bigram("half a century ago hypertension was not treatable")

3215.0028205315984

In [None]:
perplexity_score_bigram("sarahs tv is broadcasting an advert for private healthcare")

31731.14663008426

In [None]:
### perplexity_score_trigram(sent, trigrams, bigrams):
### input: sentence (str); optional trigram and bigram frequency tables,
###        defaulting to the global trigram_model / bigram_model
### output: perplexity score (float)
### tags are added because it is a full sentence, so first trigram is ("tag","tag","firstword"), and so on.
### unseen bigrams and trigrams count as 0 and are handled by the Laplacian smoothing

def perplexity_score_trigram(sent, trigrams=None, bigrams=None):
    """Return the perplexity of *sent* under the Laplace-smoothed trigram model."""
    trigrams = trigram_model if trigrams is None else trigrams
    bigrams = bigram_model if bigrams is None else bigrams

    words = ("bgnsntnc bgnsntnc " + sent + " endsntnc endsntnc").split()
    # NOTE(review): add-one smoothing normally adds the vocabulary size here,
    # not the number of distinct bigrams; kept as is so scores stay comparable
    # with earlier runs - confirm the intent.
    smoothing_size = len(bigrams)

    # Sum log-probabilities instead of multiplying probabilities: the raw
    # product underflows to 0.0 on long sentences and the division then fails.
    log_prob = 0.0
    for first, second, third in zip(words, words[1:], words[2:]):
        # .get() so that unseen entries count as 0 instead of raising KeyError
        # (the trigram table is a plain dict once reloaded from disk)
        triple_count = trigrams.get((first, second, third), 0)
        context_count = bigrams.get((first, second), 0)
        log_prob += math.log((triple_count + 1) / (context_count + smoothing_size))

    # NOTE(review): the root is taken over len(words), which includes all four
    # tags and is two more than the number of trigram terms; kept unchanged -
    # confirm the intended N.
    return math.exp(-log_prob / len(words))

In [None]:
perplexity_score_trigram("it appears that the overall code stroke volume has decreased since the covid pandemic")

25399.76565179547

In [None]:
perplexity_score_trigram("half a century ago hypertension was not treatable")

105619.44405951035

In [None]:
perplexity_score_trigram("sarahs tv is broadcasting an advert for private healthcare")

456819.137728019