# Model usage
## Integrantes
* Juan Esteban Arboleda
* Luccas Rojas

## Calculo de la perplejidad de cada modelo

In [66]:
import math
from nltk.tokenize import WhitespaceTokenizer

PATH_NEWS_UNIGRAM="../data/20N_l.rojasb_j.arboleda_unigrams.txt"
PATH_BAC_UNIGRAM="../data/BAC_l.rojasb_j.arboleda_unigrams.txt"
PATH_NEWS_BIGRAM="../data/20N_l.rojasb_j.arboleda_bigrams.txt"
PATH_BAC_BIGRAM="../data/BAC_l.rojasb_j.arboleda_bigrams.txt"
PATH_NEWS_TRIGRAM="../data/20N_l.rojasb_j.arboleda_trigrams.txt"
PATH_BAC_TRIGRAM="../data/BAC_l.rojasb_j.arboleda_trigrams.txt"

PATH_BAC_TEST = "../data/BAC_l.rojasb_j.arboleda_test.txt" 
PATH_NEWS_TEST = "../data/20N_l.rojasb_j.arboleda_test.txt"  

tokenizer = WhitespaceTokenizer()

In [67]:
def load_test(path):
    """
    Load test data from a file.
    Params
    ------
    path: str
        Path to the file.
    Returns
    -------
    test: list
        List of strings.
    """
    test = []
    with open(path, 'r') as f:
        for line in f:
            test.append(line.strip())
    return test
bac_test = load_test(PATH_BAC_TEST)
news_test = load_test(PATH_NEWS_TEST)

In [68]:
def load_ngram(path: str) -> dict:
    ngram_dict = {}
    f = open(path, "r")

    ngram = f.readline()
    while(len(ngram) != 0):
        ngram = ngram.split(",")
        ngram_dict[ngram[0]] = {
            "count": int(ngram[1]),
            "probability": float(ngram[2])
        }
        ngram = f.readline()

    return ngram_dict

In [69]:
def replace_unk(words:list,monogram:dict)->list:
    """
    Replace unknown words with <UNK>.
    Params
    ------
    sentence: str
        Sentence to be replaced.
    Returns
    -------
    sentence: str
        Sentence with <unk>.
    """
    words2 = []
    for word in words:
        if word not in monogram:
            words2.append("<UNK>")
        else:
            words2.append(word)
    return words2

In [70]:
def calculate_unigram_perplexity(model, test_data)->float:
    """    
    Calculate the perplexity of a unigram model.
    Params
    ------
    model: dict
        Dictionary with the model.
    test_data: list
        List of strings.
    Returns
    -------
    perplexity: float
        Perplexity of the model.
    """
    test_size = len(test_data)
    log_sum = 0
    for sentence in test_data:
        words = tokenizer.tokenize(sentence)
        words = replace_unk(words, model)
        for word in words:
            if word in model:
                probability = model[word]["probability"]
            else:
                probability = model["<UNK>"]["probability"]
            log_sum += math.log2(probability)
    return -(1)*(log_sum / test_size)

In [71]:
def calculate_bigram_perplexity(bigram_model, monogram_model, test_data)->float:
    """
    Calculate the perplexity of a bigram model.
    Params
    ------
    model: dict
        Dictionary with the model.
    test_data: list
        List of strings.
    Returns
    -------
    perplexity: float  
        Perplexity of the model.
    """
    test_size = len(test_data)
    log_sum = 0
    vocabulary = len(monogram_model)
    for sentence in test_data:
        words = tokenizer.tokenize(sentence)
        words = replace_unk(words, monogram_model)
        for i in range(len(words)-1):
            bi_gram = words[i]+" "+words[i+1]
            if bi_gram in bigram_model:
                probability = bigram_model[bi_gram]["probability"]
            else:
                probability = 1/monogram_model[words[i]]["count"]+vocabulary

            log_sum += math.log2(probability)
    return -1*(log_sum / test_size)

In [72]:
def calculate_trigram_perplexity(trigram, bigram_model, monogram_model, test_data)->float:
    """
    Calculate the perplexity of a trigram model.
    Params
    ------
    model: dict
        Dictionary with the model.
    test_data: list
        List of strings.
    Returns
    -------
    perplexity: float  
        Perplexity of the model.
    """
    test_size = len(test_data)
    log_sum = 0
    vocabulary = len(monogram_model)
    for sentence in test_data:
        words = tokenizer.tokenize(sentence)
        words = replace_unk(words, monogram_model)
        for i in range(len(words)-2):
            tri_gram = words[i]+" "+words[i+1]+" "+words[i+2]
            bi_gram = words[i]+" "+words[i+1]
            if tri_gram in trigram:
                probability = trigram[tri_gram]["probability"]
            elif bi_gram in bigram_model:
                probability = 1/(bigram_model[bi_gram]["count"]+vocabulary^2)
            else:
                probability = 1/(1+vocabulary^2)
            log_sum += math.log2(probability)
    return -1*(log_sum / test_size)

In [73]:
NEWS_UNIGRAMS_PATH  = "../data/20N_l.rojasb_j.arboleda_unigrams.txt"
NEWS_BIGRAM_PATH    = "../data/20N_l.rojasb_j.arboleda_bigrams.txt"
NEWS_TRIGRAM_PATH   = "../data/20N_l.rojasb_j.arboleda_trigrams.txt"

unigrams_model = load_ngram(NEWS_UNIGRAMS_PATH)
calculate_unigram_perplexity(unigrams_model, news_test)

224.87758482318299

In [74]:
bigram_model = load_ngram(NEWS_BIGRAM_PATH)
calculate_bigram_perplexity(bigram_model, unigrams_model, news_test)

156.84539064251038

In [75]:
trigram_model = load_ngram(NEWS_TRIGRAM_PATH)
calculate_trigram_perplexity(trigram_model, bigram_model, unigrams_model, news_test)

294.29830390896524