# Punto 3 Recuperación ranqueada y vectorización de documentos (RRDV)
## Integrantes
* Juan Esteban Arboleda
* Luccas Rojas

## Cálculo de la perplejidad de cada modelo

In [None]:
import math
from nltk.tokenize import WhitespaceTokenizer

# N-gram model files for the "20N" corpus (unigrams, bigrams, trigrams).
PATH_NEWS_UNIGRAM = "../data/20N_l.rojasb_j.arboleda_unigrams.txt"
PATH_NEWS_BIGRAM = "../data/20N_l.rojasb_j.arboleda_bigrams.txt"
PATH_NEWS_TRIGRAM = "../data/20N_l.rojasb_j.arboleda_trigrams.txt"

# N-gram model files for the "BAC" corpus (unigrams, bigrams, trigrams).
PATH_BAC_UNIGRAM = "../data/BAC_l.rojasb_j.arboleda_unigrams.txt"
PATH_BAC_BIGRAM = "../data/BAC_l.rojasb_j.arboleda_bigrams.txt"
PATH_BAC_TRIGRAM = "../data/BAC_l.rojasb_j.arboleda_trigrams.txt"

# Held-out test files, one per corpus.
PATH_NEWS_TEST = "../data/20N_l.rojasb_j.arboleda_test.txt"
PATH_BAC_TEST = "../data/BAC_l.rojasb_j.arboleda_test.txt"

# Shared module-level tokenizer; the perplexity functions in this file call
# tokenizer.tokenize(sentence) to split each test sentence into tokens.
tokenizer = WhitespaceTokenizer()

In [None]:
def load_test(path):
    """
    Read a text file and return its lines, stripped of surrounding whitespace.

    Params
    ------
    path: str
        Location of the file to read.
    Returns
    -------
    test: list
        One string per line of the file, in file order. Leading and trailing
        whitespace (including the newline) is removed from every line, so a
        blank line is returned as an empty string.
    """
    with open(path, 'r') as handle:
        return [raw_line.strip() for raw_line in handle]
# Read both held-out test sets into memory: one list of stripped lines per corpus.
bac_test = load_test(path=PATH_BAC_TEST)
news_test = load_test(path=PATH_NEWS_TEST)

In [None]:
def load_model(path: str) -> dict:
    """
    Load a model from a tab-separated file.

    Every non-blank line must hold at least two TAB-separated fields: the
    first is used as the dictionary key and the second is parsed as an
    integer value. Any further fields on the line are ignored. Blank lines
    (for example a trailing empty line at the end of the file) are skipped
    instead of aborting the load.

    Params
    ------
    path: str
        Path to the file.
    Returns
    -------
    model: dict
        Dictionary mapping each key (str) to its integer value. If a key
        appears more than once, the last occurrence wins.
    Raises
    ------
    IndexError
        If a non-blank line has no TAB-separated second field.
    ValueError
        If the second field of a line is not a valid integer.
    """
    model = {}
    with open(path, 'r') as f:
        for line in f:
            stripped = line.strip()
            if not stripped:
                # Blank line: nothing to parse, and indexing the second
                # field would fail.
                continue
            fields = stripped.split('\t')
            model[fields[0]] = int(fields[1])
    return model

In [None]:
def calculate_unigram_perplexity(model, test_data)->float:
    """
    Calculate the perplexity of a unigram model on a test set.

    The result is 2 ** (-(1 / N) * sum(log2(P(w)))), where the sum runs over
    every token of every test sentence and N is the total number of tokens
    (not the number of sentences).

    Params
    ------
    model: dict
        Dictionary with the model: maps a token to the value used as its
        probability. Tokens absent from the dictionary fall back to the
        "<UNK>" entry.
        NOTE(review): values are assumed to be probabilities in (0, 1]. If
        the dictionary holds raw counts the result is not a perplexity;
        confirm how the model passed in here is built.
    test_data: list
        List of strings. Each string is split into tokens with the
        module-level ``tokenizer``.
    Returns
    -------
    perplexity: float
        Perplexity of the model on ``test_data``.
    Raises
    ------
    ValueError
        If ``test_data`` yields no tokens at all (perplexity is undefined),
        or if a looked-up value is not positive (raised by ``math.log2``).
    KeyError
        If a token is missing from ``model`` and there is no "<UNK>" entry.
    """
    token_count = 0
    log_sum = 0.0
    for sentence in test_data:
        for word in tokenizer.tokenize(sentence):
            # Out-of-vocabulary tokens share the probability mass of "<UNK>".
            probability = model[word] if word in model else model["<UNK>"]
            log_sum += math.log2(probability)
            token_count += 1
    if token_count == 0:
        raise ValueError("test_data contains no tokens; perplexity is undefined")
    # Average negative log2-probability per token is the cross-entropy;
    # perplexity is 2 raised to it.
    return 2 ** (-log_sum / token_count)

In [None]:
def calculate_bigram_perplexity(model, test_data)->float:
    """
    Calculate the perplexity of a bigram model on a test set.

    The result is 2 ** (-(1 / N) * sum(log2(P))), where one value P is looked
    up per token of every test sentence and N is the total number of tokens
    (not the number of sentences).

    NOTE(review): the lookup key is a single token, exactly as in the unigram
    version. A bigram model is normally keyed by token pairs, but the key
    format of the bigram files is not visible here; confirm it and adapt the
    lookup (and the conditioning on the previous token) accordingly.

    Params
    ------
    model: dict
        Dictionary with the model: maps a key to the value used as its
        probability. Keys absent from the dictionary fall back to the
        "<UNK>" entry.
        NOTE(review): values are assumed to be probabilities in (0, 1]. If
        the dictionary holds raw counts the result is not a perplexity;
        confirm how the model passed in here is built.
    test_data: list
        List of strings. Each string is split into tokens with the
        module-level ``tokenizer``.
    Returns
    -------
    perplexity: float
        Perplexity of the model on ``test_data``.
    Raises
    ------
    ValueError
        If ``test_data`` yields no tokens at all (perplexity is undefined),
        or if a looked-up value is not positive (raised by ``math.log2``).
    KeyError
        If a key is missing from ``model`` and there is no "<UNK>" entry.
    """
    token_count = 0
    log_sum = 0.0
    for sentence in test_data:
        for word in tokenizer.tokenize(sentence):
            # Unknown keys share the probability mass of "<UNK>".
            probability = model[word] if word in model else model["<UNK>"]
            log_sum += math.log2(probability)
            token_count += 1
    if token_count == 0:
        raise ValueError("test_data contains no tokens; perplexity is undefined")
    # Average negative log2-probability per token is the cross-entropy;
    # perplexity is 2 raised to it.
    return 2 ** (-log_sum / token_count)