# **N-Gram Language Models Implementation**

For the 20N and BAC datasets, perform the processing required to build two N-Gram Language Models:



In [1]:
#I. Read the files and build two large consolidated files that are the union of all the documents in 20N and BAC.

import xml.etree.ElementTree as ET
import re

In [2]:
def normalize(text) -> str:
    """
    Stem the words of ``text`` with gensim's Porter stemmer.

    Returns whatever ``PorterStemmer.stem_sentence`` produces for ``text``.
    """
    from gensim.parsing.porter import PorterStemmer

    stemmer = PorterStemmer()
    return stemmer.stem_sentence(text)

In [3]:
# Replace every run of digits with a placeholder token

def replace_number(text) -> str:
    """
    Substitute each run of consecutive digits in ``text`` with the token NUM.
    """
    digit_run = re.compile(r'\d+')
    return digit_run.sub('NUM', text)



In [4]:
def revome_punctuation(text) -> str:
    """
    Removes punctuation from a text.

    Every character that is neither a word character nor whitespace, and
    every underscore, is turned into a space. All whitespace (tabs, newlines,
    non-breaking spaces, ...) is then mapped to a plain space *before* the
    characters outside printable ASCII are dropped, so words separated by a
    tab or newline are kept apart instead of being glued together. Finally,
    runs of spaces are collapsed into one.

    The result is not stripped: a single leading or trailing space may remain.
    """
    # Quotes and hyphens are already matched by this class, so they need no
    # dedicated substitution passes.
    sentences = re.sub(r'[^\w\s]', ' ', text)
    sentences = re.sub(r'_+', ' ', sentences)
    # Must happen before the printable-ASCII filter, which would otherwise
    # delete \t, \n, etc. outright.
    sentences = re.sub(r'\s', ' ', sentences)
    sentences = re.sub(r'[^\x20-\x7e]', '', sentences)
    sentences = re.sub(r'\s+', ' ', sentences)
    return sentences

In [5]:

def split_sentences(text) -> list:
    """
    Break ``text`` into sentences on '.', '!' and '?'.

    Each non-blank fragment is stripped and passed through
    ``revome_punctuation``; fragments that end up blank are discarded.
    """
    cleaned = []
    for fragment in re.split(r'[.!?]', text):
        fragment = fragment.strip()
        if fragment == "":
            continue
        fragment = revome_punctuation(fragment)
        if fragment.strip() != "":
            cleaned.append(fragment)
    return cleaned

In [6]:
def calculate_frequency(text:list) -> dict:
    """
    Calculates the frequency of each word in a text.

    Args:
        text: list of sentences (strings).
    Returns:
        dict mapping each word to the number of times it occurs.

    Sentences are tokenized on any run of whitespace, so empty sentences and
    repeated spaces do not add an empty-string "word" to the result.
    """
    frequency = {}
    for sentence in text:
        for word in sentence.split():
            frequency[word] = frequency.get(word, 0) + 1
    return frequency

In [7]:
def get_sentences_N20(path) -> list:
    """
    Reads the N20 corpus and returns a list of sentences.

    Each line of the file at ``path`` is stemmed (``normalize``), has its
    digit runs replaced (``replace_number``) and is split into cleaned
    sentences (``split_sentences``).

    Best-effort: any error is printed together with the path, and the
    sentences collected so far (possibly none) are returned instead of
    raising.
    """
    sentences = []
    try:
        # errors='ignore' drops undecodable bytes instead of raising.
        with open(path, encoding="utf8", errors='ignore') as f:
            lines = f.readlines()
        for line in lines:
            # extend() appends in place; `sentences = sentences + ...` copied
            # the whole accumulated list for every line.
            sentences.extend(split_sentences(replace_number(normalize(line))))
    except Exception as e:
        print(f"{path} {str(e)}")
    return sentences

In [8]:
# Generate the corpus for 20N.
# Root folder of the 20N dataset; its sub-directories are scanned for documents below.
path = 'Datasets/20news-18828/'

def get_senteces_from_path_20N(path)->list:
    """
    Collects the sentences of every document under the 20N corpus root.

    ``path`` is expected to contain one sub-directory per group, each holding
    document files. Entries whose name starts with '.' and entries that are
    not directories are skipped. Directories and files are visited in sorted
    order, so the result is deterministic. ``path`` may be given with or
    without a trailing separator.

    Returns the concatenation of ``get_sentences_N20`` over all documents.
    """
    import os
    sentences = []
    for dirs in sorted(os.listdir(path)):
        tmpdir = os.path.join(path, dirs)
        # Skip hidden entries and stray top-level files: os.listdir() below
        # would raise on anything that is not a directory.
        if dirs.startswith('.') or not os.path.isdir(tmpdir):
            continue
        for filename in sorted(os.listdir(tmpdir)):
            # extend() avoids re-copying the whole accumulated list per file.
            sentences.extend(get_sentences_N20(os.path.join(tmpdir, filename)))
    return sentences
    


# Build the full 20N sentence list; the cell's displayed value is the sentence count.
s_20N = get_senteces_from_path_20N(path)   
len(s_20N)

989441

In [9]:
# Word counts over the whole 20N sentence list; passed to replace_UNK below to find rare words.
dic = calculate_frequency(s_20N)
# Quick check of the ten most frequent words (left disabled):
#sorted(dic.items(), key=lambda x: x[1], reverse=True)[:10]

In [10]:
def replace_UNK(text:list, dic:dict, threshold:int = 3) -> list:
    """
    Replaces rare words by the token UNK.

    Args:
        text: list of sentences (strings); tokens are separated by single
            spaces.
        dic: word -> frequency mapping (e.g. from ``calculate_frequency``).
        threshold: words whose frequency in ``dic`` is <= this value are
            replaced. Defaults to 3, the value that used to be hard-coded.
    Returns:
        A new list of sentences. Words absent from ``dic`` are left untouched.
    """
    sentences = []
    for sentence in text:
        words = sentence.strip().split(' ')
        # Build the replaced sentence in one pass instead of mutating `words`
        # through list.index() while iterating over it (quadratic per sentence).
        replaced = ['UNK' if word in dic and dic[word] <= threshold else word
                    for word in words]
        sentences.append(' '.join(replaced))
    return sentences

In [11]:
# NOTE: rebinds s_20N to the UNK-replaced sentences; `dic` still holds the
# pre-replacement counts until it is recomputed further down.
s_20N=replace_UNK(s_20N, dic)

In [12]:
# Persist the corpus, one non-empty sentence per line, wrapped in <s> ... </s> markers.
with open('salida/corpus_20N.txt', 'w') as corpus_file:
    corpus_file.writelines(
        "<s> %s </s>\n" % sentence for sentence in s_20N if sentence != ''
    )

In [13]:
# Load the saved corpus back as a list of stripped lines.
with open('salida/corpus_20N.txt','r') as corpus_file:
    sentences = [line.strip() for line in corpus_file]

In [14]:
# Recompute word frequencies now that rare words are UNK. The reloaded lines
# include the <s> and </s> markers, so those are counted as words too.
dic = calculate_frequency(sentences)

In [15]:
group = 'G02'

count = len(sentences)
# Random 80/20 train/test split with scikit-learn, seeded for reproducibility.
from sklearn.model_selection import train_test_split
train_N20, test_N20 = train_test_split(sentences, test_size=0.2, random_state=120)

# Write the training split, one sentence per line.
with open(f'salida/20N_{group}_training.txt', 'w') as train_file:
    train_file.writelines("%s\n" % sentence for sentence in train_N20)
# Write the test split, one sentence per line.
with open(f'salida/20N_{group}_test.txt', 'w') as test_file:
    test_file.writelines("%s\n" % sentence for sentence in test_N20)

In [16]:
def calculate_probabilities_unigram(dict_work: dict) -> dict:
    """
    Turn a word -> count mapping into a word -> relative-frequency mapping.

    Each count is divided by the sum of all counts.
    """
    total = sum(dict_work.values())
    return {word: occurrences / total for word, occurrences in dict_work.items()}

In [17]:
import json

# Unigram model: relative frequency of every token in the training split.
dic_train = calculate_frequency(train_N20)
dic_train_prob = calculate_probabilities_unigram(dic_train)

# Persist the unigram probabilities as JSON.
with open(f'salida/20N_{group}_unigrams.json', 'w') as unigram_file:
    json.dump(dic_train_prob, unigram_file)

In [18]:
import json
group = 'G02'

# Reload the unigram probabilities and keep the vocabulary as a list.
with open(f'salida/20N_{group}_unigrams.json', 'r') as unigram_file:
    dic_train_prob = json.load(unigram_file)
list_words = list(dic_train_prob)

In [19]:
import numpy as np
# Vocabulary size.
data_len = len(list_words)
# NOTE(review): allocates a dense data_len x data_len float32 matrix, so memory
# grows with the square of the vocabulary size. `arr` is not referenced by any
# other cell visible in this notebook — confirm it is still needed.
arr = np.ones((data_len, data_len), dtype=np.float32)

In [20]:
ls = []

# Count every pair of adjacent tokens in the training sentences.
# Each sentence is tokenized once (it used to be re-split for every token),
# and a Counter replaces the NumPy string array + np.unique, so no large
# fixed-width string array is built and the counts are plain Python ints.
# Keys have the form "word1 word2" and are sorted, as np.unique returned them.
from collections import Counter

bigram_counter = Counter()
for sentence in train_N20:
    tokens = sentence.split()
    bigram_counter.update(
        f'{first} {second}' for first, second in zip(tokens, tokens[1:])
    )
bigram_exist = dict(sorted(bigram_counter.items()))


In [22]:
def get_bigram_prob(word1:str, word2:str, bigram_exist:dict, dic_train_prob:dict, unigram_counts:dict = None) -> float:
    """
    Calculates the add-one (Laplace) smoothed probability of a bigram.

    Args:
        word1: first (conditioning) word.
        word2: second word.
        bigram_exist: mapping "word1 word2" -> number of occurrences.
        dic_train_prob: unigram mapping; its keys define the vocabulary.
        unigram_counts: optional word -> raw count mapping. When given, the
            denominator is count(word1) + V, as Laplace smoothing requires.
            When omitted, the value stored in ``dic_train_prob`` for word1 is
            used instead, exactly as before.
    Returns:
        (count(word1 word2) + 1) / denominator, or 0 if either word is not in
        the vocabulary.

    NOTE(review): if ``dic_train_prob`` holds relative frequencies rather than
    counts and ``unigram_counts`` is not supplied, the denominator is roughly
    V and the values for a given word1 do not sum to 1.
    """
    # Test membership on the dict itself: copying all keys into a list on
    # every call made each lookup linear in the vocabulary size.
    if word1 not in dic_train_prob or word2 not in dic_train_prob:
        return 0

    vocabulary_size = len(dic_train_prob)
    if unigram_counts is not None:
        history_count = unigram_counts.get(word1, 0)
    else:
        history_count = dic_train_prob[word1]
    return (bigram_exist.get(word1+' '+word2, 0) + 1) / (history_count + vocabulary_size)
    

In [27]:
# Example: value returned by get_bigram_prob for the bigram "i want".
get_bigram_prob('i', 'want', bigram_exist, dic_train_prob)

0.018984186520537457

In [32]:
# save bigram as json file
import json
from numpyencoder import NumpyEncoder


# Persist the bigram counts; NumpyEncoder is passed so NumPy scalar counts can be serialized.
with open(f'salida/20N_{group}_bigrams.json', 'w') as bigram_file:
    json.dump(bigram_exist, bigram_file, cls=NumpyEncoder)

In [34]:
# load bigram from json
import json

# Reload the bigram counts from disk.
with open(f'salida/20N_{group}_bigrams.json', 'r') as bigram_file:
    bigram_exist = json.load(bigram_file)

In [None]:
## Trigrams

In [41]:
# Count every run of three adjacent tokens in the training sentences.
# Each sentence is tokenized once (it used to be re-split for every token),
# and a Counter replaces the NumPy string array + np.unique, so no large
# fixed-width string array is built and the counts are plain Python ints.
# Keys have the form "word1 word2 word3" and are sorted, as np.unique returned them.
from collections import Counter

trigram_counter = Counter()
for sentence in train_N20:
    tokens = sentence.split()
    trigram_counter.update(
        f'{first} {second} {third}'
        for first, second, third in zip(tokens, tokens[1:], tokens[2:])
    )
trigram_exist = dict(sorted(trigram_counter.items()))

In [42]:
def get_trigram_prob(word1:str, word2:str, word3:str, trigram_exist:dict, dic_train_prob:dict, bigram_counts:dict = None) -> float:
    """
    Calculates the add-one (Laplace) smoothed probability of a trigram.

    Args:
        word1, word2: the two conditioning words.
        word3: the predicted word.
        trigram_exist: mapping "word1 word2 word3" -> number of occurrences.
        dic_train_prob: unigram mapping; its keys define the vocabulary.
        bigram_counts: optional "word1 word2" -> raw count mapping. When
            given, the denominator is count(word1 word2) + V, as Laplace
            smoothing requires. When omitted, the value stored in
            ``dic_train_prob`` for word1 is used instead, exactly as before.
    Returns:
        (count(word1 word2 word3) + 1) / denominator, or 0 if any of the three
        words is not in the vocabulary.

    NOTE(review): without ``bigram_counts`` the denominator is not the count
    of the conditioning bigram, so the values for a given (word1, word2) do
    not sum to 1.
    """
    # Test membership on the dict itself: copying all keys into a list on
    # every call made each lookup linear in the vocabulary size.
    if any(word not in dic_train_prob for word in (word1, word2, word3)):
        return 0

    vocabulary_size = len(dic_train_prob)
    if bigram_counts is not None:
        history_count = bigram_counts.get(word1+' '+word2, 0)
    else:
        history_count = dic_train_prob[word1]
    trigram_count = trigram_exist.get(word1+' '+word2+' '+word3, 0)
    return (trigram_count + 1) / (history_count + vocabulary_size)

In [43]:
# Example: value returned by get_trigram_prob for the trigram "i want to".
get_trigram_prob('i', 'want', 'to', trigram_exist, dic_train_prob)

0.012039908554847725

In [44]:
# save trigram as json file
import json
from numpyencoder import NumpyEncoder

# Persist the trigram counts; NumpyEncoder is passed so NumPy scalar counts can be serialized.
with open(f'salida/20N_{group}_trigrams.json', 'w') as trigram_file:
    json.dump(trigram_exist, trigram_file, cls=NumpyEncoder)

In [45]:
# load trigram from json
import json

# Reload the trigram counts from disk.
with open(f'salida/20N_{group}_trigrams.json', 'r') as trigram_file:
    trigram_exist = json.load(trigram_file)

# **Perplexity**

In [52]:
# Score a single example: the first sentence of the test split.
sentence_test = test_N20[0]

In [53]:
def get_sentence_probability_unigrams(sentence, unigram_probabilities):
    """
    Probability of ``sentence`` under the unigram model.

    Args:
        sentence: string of space-separated tokens.
        unigram_probabilities: word -> probability mapping.
    Returns:
        The product of the token probabilities (1 if there are no tokens).

    Empty tokens (produced by repeated spaces) are skipped, as the bigram and
    trigram scorers do. A token missing from the mapping contributes 0 — the
    value ``get_bigram_prob`` returns for out-of-vocabulary words — instead of
    raising KeyError.
    """
    sentence_probability = 1
    for word in sentence.split(' '):
        if word == '':
            continue
        sentence_probability *= unigram_probabilities.get(word, 0)
    return sentence_probability

In [66]:
# Unigram-model probability of the example sentence.
ppu=get_sentence_probability_unigrams(sentence_test, dic_train_prob)

In [67]:
def get_sentence_probability_bigrams(sentence, bigram_probabilities,dic_train_prob):
    """
    Probability of ``sentence`` as the product of its bigram probabilities.

    Adjacent token pairs are scored with ``get_bigram_prob``; a pair that
    contains an empty token is ignored. Returns 1 when no pair is scored.
    """
    tokens = sentence.split(' ')
    probability = 1
    for previous, current in zip(tokens, tokens[1:]):
        if previous == '' or current == '':
            continue
        probability *= get_bigram_prob(previous, current, bigram_probabilities, dic_train_prob)
    return probability

In [68]:
# Bigram-model probability of the example sentence.
ppb=get_sentence_probability_bigrams(sentence_test, bigram_exist, dic_train_prob)

In [69]:
def get_sentence_probability_trigrams(sentence, trigram_exist, dic_train_prob):
    """
    Probability of ``sentence`` as the product of its trigram probabilities.

    Every window of three consecutive tokens is scored with
    ``get_trigram_prob``; a window that contains an empty token is ignored.
    Returns 1 when no window is scored.
    """
    tokens = sentence.split(' ')
    probability = 1
    for first, second, third in zip(tokens, tokens[1:], tokens[2:]):
        if first == '' or second == '' or third == '':
            continue
        probability *= get_trigram_prob(first, second, third, trigram_exist, dic_train_prob)
    return probability

In [70]:
# Trigram-model probability of the example sentence.
ppt=get_sentence_probability_trigrams(sentence_test, trigram_exist, dic_train_prob)

In [72]:
# Show the raw probability each model assigned to the example sentence.
# NOTE(review): these are sentence probabilities, not perplexity values.
print(f'Probability of unigrams: {ppu}')
print(f'Probability of bigrams: {ppb}')
print(f'Probability of trigrams: {ppt}')

Probability of unigrams: 9.651900697901276e-32
Probability of bigrams: 2.393847391761494e-32
Probability of trigrams: 4.200642259945541e-32


In [74]:
# Print the largest probability together with the model that produced it.
# The model name is derived from the values (it used to be hard-coded to
# "unigrams"); on a tie the first model in the tuple wins.
best_model, best_probability = max(
    (('unigrams', ppu), ('bigrams', ppb), ('trigrams', ppt)),
    key=lambda scored: scored[1],
)
print(f'Max probability of N-grams: {best_probability} is {best_model}')

Max probability of N-grams: 9.651900697901276e-32 is unigrams


## Prediction

In [79]:
def predict_next_word(word:str, bigram_exist, dic_train_prob:dict) -> str:
    """
    Predicts the next word in a sentence.

    Args:
        word: the preceding word.
        bigram_exist: bigram counts, forwarded to ``get_bigram_prob``.
        dic_train_prob: unigram mapping; its keys are the candidate words.
    Returns:
        The key of ``dic_train_prob`` with the highest ``get_bigram_prob``
        after ``word`` (the first such key on ties), or a single space if
        ``word`` is not in the vocabulary.
    """
    # Dict membership replaces a per-call list copy of the whole vocabulary.
    if word not in dic_train_prob:
        return ' '
    return max(dic_train_prob, key=lambda x: get_bigram_prob(word, x, bigram_exist, dic_train_prob))

In [83]:
# Greedily extend the seed word with ten predicted words, printing every step.
sentence_test = 'i'
for step in range(10):
    previous_word = sentence_test.split()[-1]
    sentence_test = sentence_test + ' ' + predict_next_word(previous_word, bigram_exist, dic_train_prob)
    print(sentence_test)

i m
i m </s>
i m </s> <s>
i m </s> <s> NUM
i m </s> <s> NUM </s>
i m </s> <s> NUM </s> <s>
i m </s> <s> NUM </s> <s> NUM
i m </s> <s> NUM </s> <s> NUM </s>
i m </s> <s> NUM </s> <s> NUM </s> <s>
i m </s> <s> NUM </s> <s> NUM </s> <s> NUM
