# Model usage
## IMPORTANTE: Para ejecutar los notebooks, en la carpeta data añada los archivos que puede encontrar en el siguiete drive:
https://uniandes-my.sharepoint.com/:f:/g/personal/j_arboleda_uniandes_edu_co/EgEasT6fqDxFmBCiYZYRw0MBG87E7s4hFZuHCzTJ3DAXow?e=ybBMgw
## Integrantes
* Juan Esteban Arboleda
* Luccas Rojas

## Calculo de la perplejidad de cada modelo

In [1]:
import math
from nltk.tokenize import WhitespaceTokenizer

PATH_NEWS_UNIGRAM="../data/20N_l.rojasb_j.arboleda_unigrams.txt"
PATH_BAC_UNIGRAM="../data/BAC_l.rojasb_j.arboleda_unigrams.txt"
PATH_NEWS_BIGRAM="../data/20N_l.rojasb_j.arboleda_bigrams.txt"
PATH_BAC_BIGRAM="../data/BAC_l.rojasb_j.arboleda_bigrams.txt"
PATH_NEWS_TRIGRAM="../data/20N_l.rojasb_j.arboleda_trigrams.txt"
PATH_BAC_TRIGRAM="../data/BAC_l.rojasb_j.arboleda_trigrams.txt"

PATH_BAC_TEST = "../data/BAC_l.rojasb_j.arboleda_test.txt" 
PATH_NEWS_TEST = "../data/20N_l.rojasb_j.arboleda_test.txt"  

tokenizer = WhitespaceTokenizer()

In [2]:
def load_test(path):
    """
    Load test data from a file.
    Params
    ------
    path: str
        Path to the file.
    Returns
    -------
    test: list
        List of strings.
    """
    test = []
    with open(path, 'r') as f:
        for line in f:
            test.append(line.strip())
    return test
bac_test = load_test(PATH_BAC_TEST)
news_test = load_test(PATH_NEWS_TEST)

In [2]:
def load_ngram(path: str) -> dict:
    ngram_dict = {}
    f = open(path, "r")

    ngram = f.readline()
    while(len(ngram) != 0):
        ngram = ngram.split(",")
        ngram_dict[ngram[0]] = {
            "count": int(ngram[1]),
            "probability": float(ngram[2])
        }
        ngram = f.readline()

    return ngram_dict

In [4]:
def replace_unk(words:list,monogram:dict)->list:
    """
    Replace unknown words with <UNK>.
    Params
    ------
    sentence: str
        Sentence to be replaced.
    Returns
    -------
    sentence: str
        Sentence with <unk>.
    """
    words2 = []
    for word in words:
        if word not in monogram:
            words2.append("<UNK>")
        else:
            words2.append(word)
    return words2

In [5]:
def calculate_unigram_perplexity(model, test_data)->float:
    """    
    Calculate the perplexity of a unigram model.
    Params
    ------
    model: dict
        Dictionary with the model.
    test_data: list
        List of strings.
    Returns
    -------
    perplexity: float
        Perplexity of the model.
    """
    test_size = len(test_data)
    log_sum = 0
    for sentence in test_data:
        words = tokenizer.tokenize(sentence)
        words = replace_unk(words, model)
        for word in words:
            if word in model:
                probability = model[word]["probability"]
            else:
                probability = model["<UNK>"]["probability"]
            log_sum += math.log2(probability)
    return -(1)*(log_sum / test_size)

In [6]:
def calculate_bigram_perplexity(bigram_model, monogram_model, test_data)->float:
    """
    Calculate the perplexity of a bigram model.
    Params
    ------
    model: dict
        Dictionary with the model.
    test_data: list
        List of strings.
    Returns
    -------
    perplexity: float  
        Perplexity of the model.
    """
    test_size = len(test_data)
    log_sum = 0
    vocabulary = len(monogram_model)
    for sentence in test_data:
        words = tokenizer.tokenize(sentence)
        words = replace_unk(words, monogram_model)
        for i in range(len(words)-1):
            bi_gram = words[i]+" "+words[i+1]
            if bi_gram in bigram_model:
                probability = bigram_model[bi_gram]["probability"]
            else:
                probability = 1/monogram_model[words[i]]["count"]+vocabulary

            log_sum += math.log2(probability)
    return -1*(log_sum / test_size)

In [7]:
def calculate_trigram_perplexity(trigram, bigram_model, monogram_model, test_data)->float:
    """
    Calculate the perplexity of a trigram model.
    Params
    ------
    model: dict
        Dictionary with the model.
    test_data: list
        List of strings.
    Returns
    -------
    perplexity: float  
        Perplexity of the model.
    """
    test_size = len(test_data)
    log_sum = 0
    bigram_size = len(bigram_model)
    for sentence in test_data:
        words = tokenizer.tokenize(sentence)
        words = replace_unk(words, monogram_model)
        for i in range(len(words)-2):
            tri_gram = words[i]+" "+words[i+1]+" "+words[i+2]
            bi_gram = words[i]+" "+words[i+1]
            if tri_gram in trigram:
                probability = trigram[tri_gram]["probability"]
            elif bi_gram in bigram_model:
                probability = 1/(bigram_model[bi_gram]["count"]+bigram_size)
            else:
                probability = 1/(2*bigram_size)
            log_sum += math.log2(probability)
    return -1*(log_sum / test_size)

In [8]:
NEWS_UNIGRAMS_PATH  = "../data/20N_l.rojasb_j.arboleda_unigrams.txt"
NEWS_BIGRAM_PATH    = "../data/20N_l.rojasb_j.arboleda_bigrams.txt"
NEWS_TRIGRAM_PATH   = "../data/20N_l.rojasb_j.arboleda_trigrams.txt"

unigrams_model = load_ngram(NEWS_UNIGRAMS_PATH)
calculate_unigram_perplexity(unigrams_model, news_test)

226.18783455162796

In [9]:
bigram_model = load_ngram(NEWS_BIGRAM_PATH)
calculate_bigram_perplexity(bigram_model, unigrams_model, news_test)

157.1078879905547

In [10]:
trigram_model = load_ngram(NEWS_TRIGRAM_PATH)
calculate_trigram_perplexity(trigram_model, bigram_model, unigrams_model, news_test)

296.2412333496441

In [11]:
unigrams_model = load_ngram(PATH_BAC_UNIGRAM)
calculate_unigram_perplexity(unigrams_model, bac_test)

164.02407797343312

In [12]:
bigram_model = load_ngram(PATH_BAC_BIGRAM)
calculate_bigram_perplexity(bigram_model, unigrams_model, bac_test)

130.6983538060847

In [13]:
trigram_model = load_ngram(PATH_BAC_TRIGRAM)
calculate_trigram_perplexity(trigram_model, bigram_model, unigrams_model, bac_test)

: 

In [23]:
import random
def bigram_predict(first_word: str, model: dict, pprint = False, word_limit = 100) -> str:
    first_word = first_word.lower()
    fw_in_vocab = False
    for words_str in model:
        words = words_str.split(" ")
        if first_word == words[0]:
            fw_in_vocab = True
            break

    if fw_in_vocab:
        w = first_word
    else:
        w = "<UNK>"
    
    sentence = ""
    wcount = 0
    while (w != "</s>") and (wcount <= word_limit):
        sentence += w + " "
        if pprint:
            print(w + " ", end="")
        
        # get next word
        population = []
        weights = []
        for words_str in model:
            words = words_str.split(" ")
            if words[0] == w:
                population.append(words[1])
                weights.append(model[words_str]["count"])

            
        w = random.choices(population, weights)[0]
        wcount += 1

    return sentence

In [5]:
model = load_ngram(PATH_BAC_BIGRAM)

In [33]:
bigram_predict("dont", model, True)

dont know how hard because we copy of prom 

'dont know how hard because we copy of prom '