# **N-Gram Language Models Implementation**

For the 20N and BAC datasets, perform the processing required to build two N-Gram Language Models:



In [73]:
# I. Read the files and build two large consolidated files that are the union of all the documents in 20N and BAC.

import xml.etree.ElementTree as ET
import re

In [74]:
def normalize(text) -> str:
    """
    Stem every whitespace-separated token of ``text`` with gensim's Porter stemmer.

    :param text: raw text (a line of a document).
    :return: the string produced by ``PorterStemmer.stem_sentence``.
    """
    # NOTE(review): the Porter stemmer presumably also lower-cases the
    # tokens -- confirm against the gensim documentation.
    from gensim.parsing.porter import PorterStemmer

    return PorterStemmer().stem_sentence(text)

In [75]:
# Replace every run of digits with the NUM placeholder

def replace_number(text) -> str:
    """
    Substitute the token NUM for each maximal run of digits in ``text``.

    A run of several digits becomes a single NUM; digits embedded in a
    word are replaced in place (``"b4"`` -> ``"bNUM"``).

    :param text: input string.
    :return: the string with every digit run replaced.
    """
    digit_run = re.compile(r'\d+')
    return digit_run.sub('NUM', text)



In [76]:
def revome_punctuation(text) -> str:
    """
    Strip punctuation from ``text`` and normalise its whitespace.

    Steps, in order:
      1. every character that is neither a word character nor whitespace
         (this already covers quotes and hyphens) becomes a space;
      2. runs of underscores become a space;
      3. every whitespace run (tabs, newlines, ...) becomes one space;
      4. characters outside printable ASCII are deleted;
      5. runs of spaces are collapsed and the ends are trimmed.

    Whitespace is normalised *before* the non-printable filter: tabs and
    newlines are outside ``\\x20-\\x7e``, so filtering first would delete
    them and glue the neighbouring words together.

    :param text: input string.
    :return: words separated by single spaces, with no leading or trailing
        space (so ``split(' ')`` on the result never yields empty tokens).
    """
    sentences = re.sub(r'[^\w\s]', ' ', text)
    sentences = re.sub(r'_+', ' ', sentences)
    sentences = re.sub(r'\s+', ' ', sentences)
    sentences = re.sub(r'[^\x20-\x7e]', '', sentences)
    # Deleting non-ASCII characters can leave two spaces side by side.
    sentences = re.sub(r' +', ' ', sentences)
    return sentences.strip()

In [77]:

def split_sentences(text) -> list:
    """
    Break ``text`` into sentences at '.', '!' and '?'.

    Each non-blank fragment is trimmed and passed through
    ``revome_punctuation``; fragments that are blank before or after that
    cleaning are dropped.

    :param text: input string.
    :return: list of cleaned sentences, in their original order.
    """
    cleaned = []
    for fragment in re.split(r'[.!?]', text):
        trimmed = fragment.strip()
        if trimmed == "":
            continue
        candidate = revome_punctuation(trimmed)
        if candidate.strip() != "":
            cleaned.append(candidate)
    return cleaned

In [78]:
def calculate_frequency(text:list) -> dict:
    """
    Count how many times each token occurs across a list of sentences.

    Each sentence is trimmed and split on single spaces, so two
    consecutive spaces inside a sentence produce an empty-string token
    that is counted like any other.

    :param text: list of sentence strings.
    :return: dict mapping token -> number of occurrences, keyed in order
        of first appearance.
    """
    counts = {}
    for line in text:
        tokens = line.strip().split(' ')
        for token in tokens:
            counts[token] = counts.get(token, 0) + 1
    return counts

In [79]:
def get_sentences_N20(path) -> list:
    """
    Load one 20N document and return its processed sentences.

    The file is read as UTF-8 with undecodable bytes ignored. Each line is
    passed through ``normalize``, ``replace_number`` and ``split_sentences``.

    Any exception raised while reading or processing is printed together
    with the path; the sentences collected up to that point are returned.

    :param path: path of the document to read.
    :return: list of sentence strings (possibly empty).
    """
    collected = []
    try:
        with open(path, encoding="utf8", errors='ignore') as handle:
            raw_lines = handle.readlines()
        for raw in raw_lines:
            collected += split_sentences(replace_number(normalize(raw)))
    except Exception as e:
        # Best effort: report the failing file and keep what was parsed.
        print(f"{path} {str(e)}")
    return collected

In [80]:
# generate corpus for 20N
# Root directory of the 20N dump: it is listed one level deep for the
# per-group sub-directories and then for the files inside each of them.
path = 'Datasets/20news-18828/'

def get_senteces_from_path_20N(path)->list:
    """
    Collect the processed sentences of every document under ``path``.

    ``path`` is expected to contain one sub-directory per group; each file
    inside a sub-directory is parsed with ``get_sentences_N20``. Groups and
    files are visited in sorted order so the result is reproducible.

    Entries whose name starts with '.' and entries that are not
    directories (stray files in the corpus root) are skipped.

    :param path: corpus root, with or without a trailing separator.
    :return: flat list of sentences from all documents.
    """
    import os
    sentences = []
    for group_name in sorted(os.listdir(path)):
        group_dir = os.path.join(path, group_name)
        if group_name.startswith('.') or not os.path.isdir(group_dir):
            continue
        for filename in sorted(os.listdir(group_dir)):
            # extend() appends in place; rebuilding the list with "+" for
            # every file copies the whole corpus each time.
            sentences.extend(get_sentences_N20(os.path.join(group_dir, filename)))
    return sentences
    


# Parse the whole corpus into one flat list of sentences and display how
# many sentences were produced.
s_20N = get_senteces_from_path_20N(path)   
len(s_20N)

989441

In [98]:
# Token counts over the full corpus (before the train/test split), then
# display the ten most frequent tokens, highest count first.
dic = calculate_frequency(s_20N)
sorted(dic.items(), key=lambda x: x[1], reverse=True)[:10]

[('NUM', 253724),
 ('the', 238663),
 ('to', 120377),
 ('of', 107317),
 ('a', 106198),
 ('and', 95075),
 ('i', 87623),
 ('in', 80825),
 ('is', 69169),
 ('that', 65149)]

In [82]:
def replace_UNK(text:list, dic:dict) -> list:
    """
    Replace every token whose count in ``dic`` is exactly 1 with UNK.

    Tokens that are missing from ``dic`` are left unchanged. Sentences are
    trimmed and split on single spaces before the lookup and re-joined
    with single spaces afterwards.

    :param text: list of sentence strings.
    :param dic: token -> frequency mapping.
    :return: new list of sentences with the rare tokens replaced.
    """
    return [
        ' '.join(
            'UNK' if dic.get(token) == 1 else token
            for token in line.strip().split(' ')
        )
        for line in text
    ]

In [83]:
# Map every token that occurs exactly once in the whole corpus (per dic) to UNK.
s_20N=replace_UNK(s_20N, dic)

In [84]:
# save list in a file
# One sentence per line, wrapped in the <s> ... </s> boundary markers;
# empty sentences are not written.
with open('salida/corpus_20N.txt', 'w') as f:
    for item in s_20N:
        if item != '':
            f.write("<s> %s </s>\n" % item)

In [85]:
# read file to list of sentences
# Each line is trimmed, so the stored sentences start with <s> and end
# with </s> without the trailing newline.
sentences = []
with open('salida/corpus_20N.txt','r') as f:
    for line in f:
        sentences.append(line.strip())

In [86]:
# Group identifier embedded in every output file name.
group = 'G02'

count = len(sentences)
# train/test split with scikit-learn: random, without replacement
# 80 % training / 20 % test; the fixed random_state makes the split repeatable.
from sklearn.model_selection import train_test_split
train_N20, test_N20 = train_test_split(sentences, test_size=0.2, random_state=120)

# save train list in a file
with open(f'salida/20N_{group}_training.txt', 'w') as f:
    for item in train_N20:
        f.write("%s\n" % item)
# save test list in a file
with open(f'salida/20N_{group}_test.txt', 'w') as f:
    for item in test_N20:
        f.write("%s\n" % item)

In [87]:
def calculate_probabilities_unigram(dict_work: dict) -> dict:
    """
    Convert token counts into relative frequencies.

    :param dict_work: token -> count mapping.
    :return: token -> count / (sum of all counts); an empty input gives an
        empty dict.
    """
    total = sum(dict_work.values())
    return {token: occurrences/total for token, occurrences in dict_work.items()}

In [88]:
import json

# Unigram model from the training split only: raw counts in dic_train,
# relative frequencies in dic_train_prob.
dic_train = calculate_frequency(train_N20)
dic_train_prob = calculate_probabilities_unigram(dic_train)

# save dic as json file
with open(f'salida/20N_{group}_unigrams.json', 'w') as f:
    json.dump(dic_train_prob, f)

In [89]:
import json
group = 'G02'

# read unigram from json
with open(f'salida/20N_{group}_unigrams.json', 'r') as f:
    dic_train_prob = json.load(f)
# Vocabulary in the key order of the JSON object; a word's position in
# this list is its row/column index in the n-gram arrays built below.
list_words = list(dic_train_prob.keys())

In [96]:
import numpy as np
# Dense vocabulary-by-vocabulary table, indexed [first word, second word].
# Every cell starts at 1, which serves as the add-one pseudo-count for the
# bigram counts accumulated into it later.
# NOTE(review): the table holds data_len * data_len float32 cells, so its
# size grows with the square of the vocabulary.
data_len = len(list_words)
arr = np.ones((data_len, data_len), dtype=np.float32)


[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [102]:
# Accumulate the observed bigram counts of the training split into arr.
# The word -> position table is built once: calling list_words.index()
# inside the loop rescans the whole vocabulary for every single token.
# setdefault keeps the first position of a word, exactly like list.index().
word_index = {}
for position, word in enumerate(list_words):
    word_index.setdefault(word, position)
for sentence in train_N20:
    words = sentence.strip().split(' ')
    for i in range(len(words)-1):
        arr[word_index[words[i]], word_index[words[i+1]]] += 1

In [107]:
# Vocabulary size: number of distinct tokens in list_words.
voc = len(list_words)

In [108]:
# Add-one (Laplace) normalisation: P(w2 | w1) = (c(w1, w2) + 1) / (c(w1) + V).
# arr already holds c(w1, w2) + 1 in every cell, so the sum of row w1 is
# exactly c(w1) + V and dividing each row by its own sum makes it a proper
# distribution. (dic_train_prob holds relative frequencies, not counts, so
# it cannot supply c(w1).) The row sums are accumulated in float64 to
# limit rounding error; the division itself is done in place.
arr /= arr.sum(axis=1, keepdims=True, dtype=np.float64)

In [111]:
# Persist the bigram table in NumPy's binary .npy format (not text).
# np.save appends ".npy" when the name does not already end with it, so
# the file written is salida/20N_<group>_bigrams_np.txt.npy.
np.save(f'salida/20N_{group}_bigrams_np.txt',arr)

In [None]:
# create bigrams from train_N20
from itertools import permutations

def create_bigrams(text:list) -> dict:
    """
    Build add-one (Laplace) smoothed bigram counts from a list of sentences.

    Sentences are trimmed and split on single spaces. Observed bigrams are
    counted first; then 1 is added to *every* ordered pair of vocabulary
    words, including a word followed by itself, so that each row of the
    result covers the whole vocabulary.

    The vocabulary size is printed as a progress indication.

    :param text: list of sentence strings.
    :return: nested dict ``{first_word: {second_word: count + 1}}`` with one
        row per vocabulary word and one entry per vocabulary word in each
        row (V * V entries in total).
    """
    bigrams = {}
    # A dict is used as an insertion-ordered set: O(1) membership instead
    # of scanning a list for every token.
    vocabulary = {}
    for sentence in text:
        words = sentence.strip().split(' ')
        for word in words:
            vocabulary.setdefault(word, None)
        for first, second in zip(words, words[1:]):
            row = bigrams.setdefault(first, {})
            row[second] = row.get(second, 0) + 1
    print(len(vocabulary))
    # Laplace smoothing over the full V x V grid. Nested loops avoid
    # materialising every pair in memory and, unlike permutations(), also
    # visit the (w, w) pairs.
    for w1 in vocabulary:
        row = bigrams.setdefault(w1, {})
        for w2 in vocabulary:
            row[w2] = row.get(w2, 0) + 1
    return bigrams

In [None]:
def probabilidades_bigram(bigram:dict, unigram:dict) -> dict:
    """
    Divide every bigram entry by ``unigram[first_word] + len(unigram)``.

    The nested ``bigram`` dict is modified in place and also returned.

    :param bigram: nested dict ``{first_word: {second_word: value}}``.
    :param unigram: dict keyed by word; its length is used as the
        vocabulary size.
    :return: the same ``bigram`` object, with its values replaced.
    """
    vocabulario = len(unigram)
    for first, row in bigram.items():
        # Only values are reassigned, so iterating the row is safe.
        for second, value in row.items():
            row[second] = value/(unigram[first] + vocabulario)
    return bigram

In [None]:
# build the smoothed bigram counts from the training split
# (the JSON file is written in a later cell)
import json
bg = create_bigrams(train_N20)


In [None]:
# Turn the add-one bigram counts into conditional probabilities.
# probabilidades_bigram divides by unigram[w1] + V, so it must receive the
# raw training counts (dic_train); the relative frequencies in
# dic_train_prob are all below 1 and would make every denominator ~V.
bg_prob = probabilidades_bigram(bg, dic_train)

# save dic as json file
with open(f'salida/20N_{group}_bigrams.json', 'w') as f:
    json.dump(bg_prob, f)

In [112]:
def calculate_trigram(sentences, unigram, list_word):
    """
    Build a dense add-one (Laplace) smoothed trigram probability table.

    ``result[i, j, k]`` is P(word_k | word_i, word_j) =
    (c(i, j, k) + 1) / (c(i, j) + V), where V is ``len(list_word)``. The
    table starts at 1 in every cell, observed trigrams are added, and each
    ``[i, j, :]`` slice is divided by its own sum, which equals
    c(i, j) + V, so every conditional distribution sums to 1.

    :param sentences: iterable of sentence strings; tokens are obtained
        with ``str.split()``.
    :param unigram: unused; kept so existing callers keep working. The
        correct denominator is the count of the two-word context, which is
        derived from the table itself.
    :param list_word: vocabulary; a word's position is its index on every
        axis of the result.
    :return: float32 array of shape (V, V, V).
    :raises ValueError: if a sentence contains a word not in ``list_word``.
    """
    voc = len(list_word)
    # Word -> position table built once (first occurrence wins, like
    # list.index) instead of scanning the vocabulary for every token.
    index_of = {}
    for position, word in enumerate(list_word):
        index_of.setdefault(word, position)

    # NOTE(review): the table needs V**3 float32 cells.
    trigram = np.ones((voc, voc, voc), dtype=np.float32)
    for sentence in sentences:
        try:
            ids = [index_of[word] for word in sentence.split()]
        except KeyError as err:
            # Same exception type list.index() raised for unknown words.
            raise ValueError(f"{err.args[0]!r} is not in list") from None
        for first, second, third in zip(ids, ids[1:], ids[2:]):
            trigram[first, second, third] += 1

    # Per-context normalisation; sums accumulated in float64 for accuracy.
    trigram /= trigram.sum(axis=2, keepdims=True, dtype=np.float64)
    return trigram



In [None]:
# Fit the trigram table on the training split only. `sentences` is the
# full corpus (training + test): using it would leak the test set into the
# model and can contain words that are missing from list_words, which is
# built from the training vocabulary.
trigram = calculate_trigram(train_N20, dic_train_prob,list_words)


In [113]:
def get_trigram_value(trigram,list_word, word1, word2, word3):
    """
    Look up the entry of ``trigram`` for the word triple (word1, word2, word3).

    Each word is translated to its position in ``list_word``; the three
    positions index the table as ``trigram[i, j, k]``.

    :raises ValueError: if any of the words is not in ``list_word``.
    """
    first = list_word.index(word1)
    second = list_word.index(word2)
    third = list_word.index(word3)
    return trigram[first, second, third]


In [None]:
# Example lookup: table entry for the triple ('c', 'd', '</s>').
get_trigram_value(trigram,list_words,'c','d','</s>')

# **Perplexity**

In [114]:
def get_sentence_probability_unigrams(sentence, unigram_probabilities):
    """
    Multiply the unigram probabilities of every token in ``sentence``.

    The sentence is split on single spaces, so consecutive spaces yield an
    empty-string token that is looked up like any other.

    :param sentence: sentence string.
    :param unigram_probabilities: token -> probability mapping.
    :return: product of the token probabilities.
    :raises KeyError: if a token is missing from the mapping.
    """
    product = 1
    tokens = sentence.split(' ')
    for token in tokens:
        product = product * unigram_probabilities[token]
    return product

In [117]:
# Example: unigram probability of the first corpus sentence.
# NOTE(review): sentences[0] comes from the full corpus, so this lookup
# presumably fails if it contains a token absent from the training
# vocabulary -- confirm.
get_sentence_probability_unigrams(sentences[0],dic_train_prob)

1.913427259649652e-18

In [None]:
def get_sentence_probability_bigrams(sentence, bigram_probabilities):
    """
    Multiply the probabilities of every adjacent word pair in ``sentence``.

    The sentence is tokenised with ``str.split()``, so empty tokens caused
    by repeated spaces are dropped before pairing, and no pair is ever
    counted twice or looked up before it has been built.

    Two layouts of ``bigram_probabilities`` are accepted:
      * nested: ``{first_word: {second_word: probability}}``;
      * flat: ``{"first_word second_word": probability}``.

    :param sentence: sentence string.
    :param bigram_probabilities: bigram model in one of the layouts above.
    :return: product of the bigram probabilities; 1 for a sentence with
        fewer than two tokens.
    :raises KeyError: if a bigram is missing from the model.
    """
    sentence_probability = 1
    word_list = sentence.split()
    for previous, current in zip(word_list, word_list[1:]):
        row = bigram_probabilities.get(previous)
        if isinstance(row, dict):
            sentence_probability *= row[current]
        else:
            # Flat layout: flat keys contain a space, so a single word
            # never matches one by accident.
            sentence_probability *= bigram_probabilities[previous + ' ' + current]
    return sentence_probability

In [None]:
# Example: bigram probability of the first corpus sentence.
get_sentence_probability_bigrams(sentences[0],bg_prob)

In [None]:
def get_sentence_probability_trigrams(sentence, trigram_probabilities):
    """
    Multiply the probabilities of every consecutive word triple in ``sentence``.

    The sentence is tokenised with ``str.split()``, so empty tokens caused
    by repeated spaces are dropped before the triples are formed, and no
    triple is ever counted twice or looked up before it has been built.

    :param sentence: sentence string.
    :param trigram_probabilities: mapping keyed by the three words joined
        with single spaces (``"w1 w2 w3"``) -> probability.
    :return: product of the trigram probabilities; 1 for a sentence with
        fewer than three tokens.
    :raises KeyError: if a trigram is missing from the mapping.
    """
    sentence_probability = 1
    word_list = sentence.split()
    for first, second, third in zip(word_list, word_list[1:], word_list[2:]):
        trigram = first + ' ' + second + ' ' + third
        sentence_probability *= trigram_probabilities[trigram]
    return sentence_probability