In [1]:
import xml.etree.ElementTree as ET
import xml.etree
import pandas as pd
import numpy as np
import os, re, json
from gensim.parsing.porter import PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from nltk import FreqDist

from sklearn.model_selection import train_test_split

In [2]:
path_BAC = 'Datasets/BAC/blogs/'

In [3]:
def expand(text):
    text_proc = re.sub(r"won\'t", "will not", text)
    text_proc = re.sub(r"can\'t", "can not", text_proc)
    text_proc = re.sub(r"n\'t", " not", text_proc)
    text_proc = re.sub(r"\'re", " are", text_proc)
    text_proc = re.sub(r"\'s", " is", text_proc)
    text_proc = re.sub(r"\'d", " would", text_proc)
    text_proc = re.sub(r"\'ll", " will", text_proc)
    text_proc = re.sub(r"\'t", " not", text_proc)
    text_proc = re.sub(r"\'ve", " have", text_proc)
    text_proc = re.sub(r"\'m", " am", text_proc)
    return text_proc

In [4]:
def replace_numbers(text):
    #Combinaciones de números y letras
    text_proc = re.sub('[a-zA-Z]+[0-9]+[a-zA-Z]+', 'NUM', text)
    text_proc = re.sub('[0-9]+[a-zA-Z]+|[a-zA-Z]+[0-9]+', 'NUM', text_proc)
    #Números con puntos
    text_proc = re.sub('[0-9]+\.+[0-9]+', 'NUM', text_proc)
    #Números con espacios
    text_proc = re.sub('([0-9]+\s)*[0-9]+', 'NUM', text_proc)
    #Unión de tokens seguidos (dos o más casos anteriores seguidos)
    text_proc = re.sub('((NUM)+\s)*(NUM)+', 'NUM', text_proc)
    return text_proc

In [5]:
def preprocess_BAC(text):
    text_proc = re.sub('<date>[a-zA-Z0-9,]*<\/date>',' ', text)
    text_proc = re.sub('<Blog>',' ', text_proc)
    text_proc = re.sub('<\/Blog>',' ', text_proc)
    text_proc = re.sub('<post>',' ', text_proc)
    text_proc = re.sub('<\/post>',' ', text_proc)
    text_proc = text.lower()
    text_proc = expand(text_proc)
    text_proc = re.sub('[^a-z0-9 .!?]', ' ', text_proc)
    text_proc = re.sub('\.{2,}', ' ', text_proc)
    text_proc = re.sub('\s+', ' ', text_proc)
    text_proc = replace_numbers(text_proc)
    return text_proc

In [6]:
def remove_punctuation(text):
    text_proc = re.sub('[^a-z\s]',' ',text)
    text_proc = re.sub('\s+', ' ', text_proc)
    return text_proc

In [7]:
def tokenize_sentence_BAC(path):
    out_list = []
    list_BAC = os.listdir(path)
    for i, file in enumerate(list_BAC):
        with open(path+file, 'r',encoding="mbcs") as f:
            xml_string = f.read()
            xml_string = preprocess_BAC(xml_string)
            out_list += sent_tokenize(xml_string)
    return out_list

In [8]:
def get_inverted_index_BAC(sentence_list):
    out_idx = {}
    for i, sentence in enumerate(sentence_list):
        for word in sentence.split(' '):
            if word != '':
                if word not in out_idx:
                    out_idx[word] = [i]
                elif i not in out_idx[word] and len(out_idx[word]) == 1:
                    out_idx[word].append(i)
    return out_idx

In [9]:
def add_UNK_tags_BAC(sentences_list, least_frequent_words):
    for (index, word) in least_frequent_words_BAC:
        sentences_list[index] = sentences_list[index].replace(word, '<UNK>')
    
    for i, sentence in enumerate(sentences_list):
        sentences_list[i] = '<s> ' + sentence.strip() + ' </s>'
    return sentences_list

In [10]:
out_list_sentences_raw_BAC = tokenize_sentence_BAC(path_BAC)
out_list_sentences_BAC = [remove_punctuation(sentence) for sentence in out_list_sentences_raw_BAC]
out_list_sentences_BAC

[' blog date may date post well everyone got up and going this morning ',
 'it is still raining but that is okay with me ',
 'sort of suits my mood ',
 'i could easily have stayed home in bed with my book and the cats ',
 'this has been a lot of rain though ',
 'people have wet basements there are lakes where there should be golf courses and fields everything is green green green ',
 'but it is supposed to be degrees by friday so we will be dealing with mosquitos next week ',
 'i heard winnipeg described as an old testament city on urllink cbc radio one last week and it sort of rings true ',
 'floods infestations etc ',
 'etc post date may date post my four year old never stops talking ',
 'she will say mom ',
 'and when i say yes ',
 'she will say ummm ummm oh yeah ',
 'where do lady bugs hide in the rain ',
 'anything to hear her own voice ',
 'very very exhausting ',
 'now i remember ',
 'this is why i go to work ',
 'sigh post date may date post actually it is not raining yet but i

In [11]:
inv_BAC = get_inverted_index_BAC(out_list_sentences_BAC)
least_frequent_words_BAC = [(item[1][0], item[0]) for item in inv_BAC.items() if len(item[1]) == 1]
least_frequent_words_BAC

[(82, 'ineveitable'),
 (123, 'poopeyhead'),
 (177, 'momly'),
 (446, 'jusat'),
 (497, 'mehgan'),
 (520, 'vnted'),
 (598, 'weieners'),
 (611, 'gianopolis'),
 (644, 'opur'),
 (730, 'kordalewski'),
 (763, 'malevalent'),
 (805, 'zimnie'),
 (820, 'albinak'),
 (853, 'evereryone'),
 (988, 'snippity'),
 (1019, 'lookis'),
 (1078, 'chemisry'),
 (1089, 'xocatiexo'),
 (1109, 'hahahahahahahahahahahahaahahahahahahahahaha'),
 (1182, 'lunh'),
 (1273, 'metranomes'),
 (1363, 'tyhough'),
 (1372, 'jerkyest'),
 (1584, 'drudgeries'),
 (1662, 'cleanng'),
 (1750, 'loweered'),
 (1997, 'highter'),
 (2027, 'overweis'),
 (2086, 'hwlp'),
 (2174,
  'ahhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh'),
 (2380, 'trustworhy'),
 (2547, 'creeeptacular'),
 (2548, 'creeptacular'),
 (2648, 'cloass'),
 (2669, 'loahte'),
 (2699,
  'ahhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh'),
 (2747, 'togetther'),
 (2885, 'distanceing'),
 (2949, 'ooowwwwwwwwwwwww

In [12]:
tagged_sentences_BAC = add_UNK_tags_BAC(out_list_sentences_BAC, least_frequent_words_BAC)
tagged_sentences_BAC

['<s> blog date may date post well everyone got up and going this morning </s>',
 '<s> it is still raining but that is okay with me </s>',
 '<s> sort of suits my mood </s>',
 '<s> i could easily have stayed home in bed with my book and the cats </s>',
 '<s> this has been a lot of rain though </s>',
 '<s> people have wet basements there are lakes where there should be golf courses and fields everything is green green green </s>',
 '<s> but it is supposed to be degrees by friday so we will be dealing with mosquitos next week </s>',
 '<s> i heard winnipeg described as an old testament city on urllink cbc radio one last week and it sort of rings true </s>',
 '<s> floods infestations etc </s>',
 '<s> etc post date may date post my four year old never stops talking </s>',
 '<s> she will say mom </s>',
 '<s> and when i say yes </s>',
 '<s> she will say ummm ummm oh yeah </s>',
 '<s> where do lady bugs hide in the rain </s>',
 '<s> anything to hear her own voice </s>',
 '<s> very very exhausti

In [13]:
train_BAC, test_BAC = train_test_split(tagged_sentences_BAC, test_size=0.2, random_state=0)
print("Train: {} - Test: {}".format(len(train_BAC),len(test_BAC)))

Train: 7098164 - Test: 1774541


In [14]:
with open("BAC_G02_training", "w") as out_file_training:
    for sentence in train_BAC:
        out_file_training.write(sentence+"\n")

In [15]:
with open("BAC_G02_testing", "w") as out_file_testing:
    for sentence in test_BAC:
        out_file_testing.write(sentence+"\n")

# N-gram models

In [16]:
training_file = "BAC_G02_training"

In [17]:
def read_sentences(file_path):
    file = open(file_path, 'r')
    return file.read().splitlines()

## Unigrams

In [18]:
def get_unigram_frequencies(sentence_list):
    word_count = 0
    unigram_frequencies = {}
    for sentence in sentence_list:
        for word in sentence.split(' '):
            if word != '':
                word_count += 1
                if word not in unigram_frequencies:
                    unigram_frequencies[word] = 0
                unigram_frequencies[word] += 1
    return word_count, unigram_frequencies

In [19]:
sentences = read_sentences(training_file)
sentences

['<s> this cop show is interesting to me because to me it is one of the first few shows that i am aware of that focuses mainly on the cop work not the personal life of said cops </s>',
 '<s> if i think of this as a one week effort then my brain will make a little contest of it get psyched into trying to see how well i can eat how much water i can drink how many hours i can exercise </s>',
 '<s> i guess they wanted to come at five and he said they had to be here by four </s>',
 '<s> rajiv gandhi s government was found involved in many cases of alleged corruptions which actually led to his defeat in the election </s>',
 '<s> i am addicted </s>',
 '<s> wad so boring besides the fact tt some ppl also came down we did the items thingy soo boring k not bad at least i passed all haa good enough for me i have better things to focus on anyway den <UNK> cried cos she was too scared of the standing broad jump her phobia she says haa but at least she should juz try rite she cant possibly cry on th

In [20]:
total_words, unigram_frequencies = get_unigram_frequencies(sentences)
unigram_frequencies

{'<s>': 7098164,
 'this': 698395,
 'cop': 2146,
 'show': 51229,
 'is': 2069442,
 'interesting': 31683,
 'to': 3168543,
 'me': 758434,
 'because': 197000,
 'it': 1619512,
 'one': 370494,
 'of': 1940160,
 'the': 4254428,
 'first': 129116,
 'few': 76462,
 'shows': 11739,
 'that': 1494355,
 'i': 4392191,
 'am': 626105,
 'aware': 4729,
 'focuses': 470,
 'mainly': 3891,
 'on': 788009,
 'work': 135156,
 'not': 1266990,
 'personal': 14026,
 'life': 139517,
 'said': 123894,
 'cops': 2218,
 '</s>': 7098164,
 'if': 354815,
 'think': 238898,
 'as': 472755,
 'a': 2337890,
 'week': 75564,
 'effort': 7239,
 'then': 274082,
 'my': 1161417,
 'brain': 10251,
 'will': 493262,
 'make': 132904,
 'little': 122750,
 'contest': 2687,
 'get': 314924,
 'psyched': 425,
 'into': 145222,
 'trying': 50173,
 'see': 185149,
 'how': 216477,
 'well': 238309,
 'can': 380479,
 'eat': 28656,
 'much': 177220,
 'water': 25921,
 'drink': 15961,
 'many': 80117,
 'hours': 43523,
 'exercise': 4661,
 'gave': 27046,
 'him': 20867

In [21]:
unigram_probabilities = [(item[0], item[1]/total_words) for item in unigram_frequencies.items()]
unigram_probabilities

[('<s>', 0.055008963432966074),
 ('this', 0.005412383401787609),
 ('cop', 1.6630953515182967e-05),
 ('show', 0.00039701170439389945),
 ('is', 0.016037648510888756),
 ('interesting', 0.0002455351818366924),
 ('to', 0.024555401371788625),
 ('me', 0.0058776703626907175),
 ('because', 0.001526699833406824),
 ('it', 0.01255080558680382),
 ('one', 0.00287123415268136),
 ('of', 0.015035745932906517),
 ('the', 0.03297073359817933),
 ('first', 0.0010006161202545964),
 ('few', 0.0005925610287408761),
 ('shows', 9.097426063128279e-05),
 ('that', 0.011580870708378958),
 ('i', 0.03403836176645154),
 ('am', 0.00485215431063543),
 ('aware', 3.664854574711102e-05),
 ('focuses', 3.642380313204098e-06),
 ('mainly', 3.0154259146121587e-05),
 ('on', 0.006106869081335422),
 ('work', 0.0010474245821519427),
 ('not', 0.009818849857503108),
 ('personal', 0.00010869792824042698),
 ('life', 0.0010812212216112684),
 ('said', 0.0009601469500512948),
 ('cops', 1.7188935180184447e-05),
 ('</s>', 0.05500896343296607

In [22]:
with open("BAC_G02_unigrams", "w") as out_file_unigrams:
    for (unigram, probability) in unigram_probabilities:
        out_file_unigrams.write(unigram+":"+str(probability)+"\n")

## Bigrams

In [23]:
def get_bigram_frequencies(sentence_list):
    bigram_frequencies = {}
    for sentence in sentence_list:
        word_list = sentence.split(' ')
        for i in range(1,len(word_list)):
            if word_list[i-1] != '' and word_list[i] != '':
                bigram = word_list[i-1] + ' ' + word_list[i]
                if bigram not in bigram_frequencies:
                    bigram_frequencies[bigram] = 0
                bigram_frequencies[bigram] += 1
    return bigram_frequencies

In [24]:
bigram_frequencies = get_bigram_frequencies(sentences)
bigram_frequencies

{'<s> this': 86326,
 'this cop': 36,
 'cop show': 21,
 'show is': 1570,
 'is interesting': 1402,
 'interesting to': 2398,
 'to me': 60282,
 'me because': 4008,
 'because to': 246,
 'me it': 5240,
 'it is': 380602,
 'is one': 15297,
 'one of': 76021,
 'of the': 363253,
 'the first': 53136,
 'first few': 964,
 'few shows': 41,
 'shows that': 987,
 'that i': 190525,
 'i am': 582819,
 'am aware': 487,
 'aware of': 2633,
 'of that': 16507,
 'that focuses': 37,
 'focuses mainly': 5,
 'mainly on': 88,
 'on the': 194909,
 'the cop': 438,
 'cop work': 1,
 'work not': 185,
 'not the': 22194,
 'the personal': 499,
 'personal life': 491,
 'life of': 3435,
 'of said': 428,
 'said cops': 1,
 'cops </s>': 311,
 '<s> if': 73623,
 'if i': 74970,
 'i think': 124325,
 'think of': 17065,
 'of this': 30598,
 'this as': 2025,
 'as a': 47465,
 'a one': 2081,
 'one week': 1394,
 'week effort': 3,
 'effort then': 9,
 'then my': 3256,
 'my brain': 3569,
 'brain will': 71,
 'will make': 6340,
 'make a': 14702,
 

In [25]:
bigram_probabilities = [(item[0], item[1]/unigram_frequencies[item[0].split(' ')[0]]) for item in bigram_frequencies.items()]
bigram_probabilities

[('<s> this', 0.012161736471572085),
 ('this cop', 5.154676078723358e-05),
 ('cop show', 0.0097856477166822),
 ('show is', 0.03064670401530383),
 ('is interesting', 0.0006774773103087692),
 ('interesting to', 0.07568727708865954),
 ('to me', 0.01902514815168991),
 ('me because', 0.005284573212698798),
 ('because to', 0.0012487309644670052),
 ('me it', 0.006908972962710005),
 ('it is', 0.23501029939883125),
 ('is one', 0.007391847657484481),
 ('one of', 0.2051882081761108),
 ('of the', 0.1872283729176975),
 ('the first', 0.012489575566915224),
 ('first few', 0.007466154465751727),
 ('few shows', 0.0005362140671183071),
 ('shows that', 0.08407871198568873),
 ('that i', 0.1274964784137638),
 ('i am', 0.13269436597816442),
 ('am aware', 0.0007778248057434456),
 ('aware of', 0.5567773313596955),
 ('of that', 0.008508061190829622),
 ('that focuses', 2.4759846221279415e-05),
 ('focuses mainly', 0.010638297872340425),
 ('mainly on', 0.022616294011822153),
 ('on the', 0.24734362170990432),
 ('t

In [26]:
with open("BAC_G02_bigrams", "w") as out_file_bigrams:
    for (bigram, probability) in bigram_probabilities:
        out_file_bigrams.write(bigram+":"+str(probability)+"\n")

## Trigrams

In [27]:
def get_trigram_frequencies(sentence_list):
    trigram_frequencies = {}
    for sentence in sentence_list:
        word_list = sentence.split(' ')
        for i in range(2,len(word_list)):
            if word_list[i-2] != '' and word_list[i-1] != '' and word_list[i] != '':
                trigram = word_list[i-2] + ' ' + word_list[i-1] + ' ' + word_list[i]
                if trigram not in trigram_frequencies:
                    trigram_frequencies[trigram] = 0
                trigram_frequencies[trigram] += 1
    return trigram_frequencies

In [28]:
trigram_frequencies = get_trigram_frequencies(sentences)
trigram_frequencies

{'<s> this cop': 3,
 'this cop show': 1,
 'cop show is': 2,
 'show is interesting': 3,
 'is interesting to': 275,
 'interesting to me': 199,
 'to me because': 530,
 'me because to': 2,
 'because to me': 46,
 'to me it': 973,
 'me it is': 2142,
 'it is one': 2270,
 'is one of': 8249,
 'one of the': 31253,
 'of the first': 1927,
 'the first few': 778,
 'first few shows': 2,
 'few shows that': 8,
 'shows that i': 90,
 'that i am': 27692,
 'i am aware': 481,
 'am aware of': 212,
 'aware of that': 65,
 'of that focuses': 1,
 'that focuses mainly': 1,
 'focuses mainly on': 5,
 'mainly on the': 26,
 'on the cop': 12,
 'the cop work': 1,
 'cop work not': 1,
 'work not the': 5,
 'not the personal': 4,
 'the personal life': 9,
 'personal life of': 7,
 'life of said': 1,
 'of said cops': 1,
 'said cops </s>': 1,
 '<s> if i': 11832,
 'if i think': 312,
 'i think of': 2279,
 'think of this': 378,
 'of this as': 120,
 'this as a': 556,
 'as a one': 37,
 'a one week': 43,
 'one week effort': 1,
 'wee

In [29]:
trigram_probabilities = [(item[0], item[1]/bigram_frequencies[' '.join(item[0].split(' ')[:-1])]) 
                        for item in trigram_frequencies.items()]
trigram_probabilities

[('<s> this cop', 3.4751986655237126e-05),
 ('this cop show', 0.027777777777777776),
 ('cop show is', 0.09523809523809523),
 ('show is interesting', 0.001910828025477707),
 ('is interesting to', 0.19614835948644793),
 ('interesting to me', 0.08298582151793162),
 ('to me because', 0.008792010882187054),
 ('me because to', 0.000499001996007984),
 ('because to me', 0.18699186991869918),
 ('to me it', 0.016140804883713215),
 ('me it is', 0.40877862595419845),
 ('it is one', 0.005964235605698341),
 ('is one of', 0.5392560632803818),
 ('one of the', 0.41111008800199944),
 ('of the first', 0.005304842630343039),
 ('the first few', 0.014641674194519724),
 ('first few shows', 0.002074688796680498),
 ('few shows that', 0.1951219512195122),
 ('shows that i', 0.0911854103343465),
 ('that i am', 0.14534575515024276),
 ('i am aware', 0.0008252991065836907),
 ('am aware of', 0.4353182751540041),
 ('aware of that', 0.024686669198632737),
 ('of that focuses', 6.0580359847337494e-05),
 ('that focuses ma

In [30]:
with open("BAC_G02_trigrams", "w") as out_file_trigrams:
    for (trigram, probability) in trigram_probabilities:
        out_file_trigrams.write(trigram+":"+str(probability)+"\n")

# Perplexity

In [31]:
def get_sentence_probability_unigrams(sentence, unigram_probabilities):
    sentence_probability = 1
    for word in sentence.split(' '):
        sentence_probability *= unigram_probabilities[word]
    return sentence_probability

In [33]:
def get_sentence_probability_bigrams(sentence, bigram_probabilities):
    sentence_probability = 1
    word_list = sentence.split(' ')
    for i in range(1,len(word_list)):
        if word_list[i-1] != '' and word_list[i] != '':
                bigram = word_list[i-1] + ' ' + word_list[i]
        sentence_probability *= bigram_probabilities[bigram]
    return sentence_probability

In [35]:
def get_sentence_probability_trigrams(sentence, trigram_probabilities):
    sentence_probability = 1
    word_list = sentence.split(' ')
    for i in range(2,len(word_list)):
        if word_list[i-2] != '' and word_list[i-1] != '' and word_list[i] != '':
                trigram = word_list[i-2] + ' ' + word_list[i-1] + ' ' + word_list[i]
        sentence_probability *= trigram_probabilities[trigram]
    return sentence_probability