# Punto 3 Recuperación ranqueada y vectorización de documentos (RRDV)
## Integrantes
* Juan Esteban Arboleda
* Luccas Rojas

## Cálculo de la perplejidad de cada modelo

In [None]:
import math
import re

from nltk.tokenize import WhitespaceTokenizer

# Unigram model files, one per corpus.
PATH_NEWS_UNIGRAM = "../data/20N_l.rojasb_j.arboleda_unigrams.txt"
PATH_BAC_UNIGRAM = "../data/BAC_l.rojasb_j.arboleda_unigrams.txt"

# Bigram model files, one per corpus.
PATH_NEWS_BIGRAM = "../data/20N_l.rojasb_j.arboleda_bigrams.txt"
PATH_BAC_BIGRAM = "../data/BAC_l.rojasb_j.arboleda_bigrams.txt"

# Trigram model files, one per corpus.
PATH_NEWS_TRIGRAM = "../data/20N_l.rojasb_j.arboleda_trigrams.txt"
PATH_BAC_TRIGRAM = "../data/BAC_l.rojasb_j.arboleda_trigrams.txt"

# Test-set files, one sentence per line.
PATH_BAC_TEST = "../data/BAC_l.rojasb_j.arboleda_test.txt"
PATH_NEWS_TEST = "../data/20N_l.rojasb_j.arboleda_test.txt"

# Shared NLTK tokenizer that splits text on whitespace; the functions in this
# file use it to turn each sentence into a list of tokens.
tokenizer = WhitespaceTokenizer()

In [None]:
def load_test(path):
    """
    Read a text file into a list of lines.

    Every line of the file becomes one entry, with leading and trailing
    whitespace (including the newline) removed. Blank lines are kept as
    empty strings.

    Params
    ------
    path: str
        Path to the file.
    Returns
    -------
    test: list
        List of stripped lines, in file order.
    """
    with open(path, 'r') as handle:
        return [raw_line.strip() for raw_line in handle]
# Read the BAC and 20N (news) test files into memory, one stripped line per
# list entry.
bac_test = load_test(PATH_BAC_TEST)
news_test = load_test(PATH_NEWS_TEST)

In [None]:
def load_model(path: str) -> dict:
    """
    Load n-gram counts from a tab-separated file.

    Each non-blank line must hold the n-gram in the first column and its
    integer count in the second column; any further columns are ignored.
    Blank lines are skipped instead of aborting the load.

    NOTE(review): the perplexity functions in this file index model entries
    as ``model[ngram]["probability"]`` / ``model[ngram]["count"]``, which does
    not match the plain ``int`` stored here -- confirm which shape is intended.

    Params
    ------
    path: str
        Path to the file.
    Returns
    -------
    model: dict
        Dictionary mapping each n-gram string to its integer count.
    Raises
    ------
    IndexError
        If a non-blank line has no tab-separated second column.
    ValueError
        If the second column is not an integer.
    """
    model = {}
    with open(path, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # A stray empty line (e.g. at the end of the file) carries no
                # entry and would otherwise fail with IndexError below.
                continue
            fields = line.split('\t')
            model[fields[0]] = int(fields[1])
    return model

In [None]:
def replace_unk(sentence: str, monogram: dict) -> str:
    """
    Replace every out-of-vocabulary token of a sentence with <UNK>.

    The sentence is processed token by token, where a token is a maximal run
    of non-whitespace characters. Only whole tokens are replaced, so an
    unknown word that happens to be a substring of a known word (or of an
    already inserted "<UNK>") never corrupts its neighbours. The original
    whitespace between tokens is preserved.

    Params
    ------
    sentence: str
        Sentence to be processed.
    monogram: dict
        Unigram model (or any container supporting ``in``) whose keys are the
        known vocabulary.
    Returns
    -------
    sentence: str
        Sentence in which each token missing from ``monogram`` is "<UNK>".
    """
    def _known_or_unk(match):
        token = match.group(0)
        return token if token in monogram else "<UNK>"

    # A callable replacement is inserted literally, so tokens containing
    # backslashes are not interpreted as regex escapes.
    return re.sub(r"\S+", _known_or_unk, sentence)

In [None]:
def calculate_unigram_perplexity(model: dict, test_data: list) -> float:
    """
    Calculate the perplexity of a unigram model.

    The perplexity is ``2 ** (-(1/N) * sum(log2 P(w)))`` where the sum runs
    over every token of the test data and N is the number of tokens scored.
    Unknown words are mapped to "<UNK>" before scoring.

    Params
    ------
    model: dict
        Unigram model mapping each word to an entry that exposes its
        probability under the "probability" key. It must contain a "<UNK>"
        entry if the test data has out-of-vocabulary words.
    test_data: list
        List of strings (one sentence per entry).
    Returns
    -------
    perplexity: float
        Perplexity of the model on the test data.
    Raises
    ------
    ValueError
        If the test data contains no tokens, or a probability is not positive.
    KeyError
        If an unknown token is found and the model has no "<UNK>" entry.
    """
    log_sum = 0.0
    token_count = 0
    for sentence in test_data:
        sentence = replace_unk(sentence, model)
        for word in tokenizer.tokenize(sentence):
            # Fall back to the <UNK> entry, then read the probability from
            # whichever entry was selected.
            entry = model[word] if word in model else model["<UNK>"]
            log_sum += math.log2(entry["probability"])
            token_count += 1
    if token_count == 0:
        raise ValueError("test_data contains no tokens to evaluate")
    # Normalise by the number of scored tokens (not sentences) and
    # exponentiate the cross-entropy to obtain the perplexity.
    return 2 ** (-log_sum / token_count)

In [None]:
def calculate_bigram_perplexity(bigram_model: dict, monogram_model: dict, test_data: list) -> float:
    """
    Calculate the perplexity of a bigram model.

    The perplexity is ``2 ** (-(1/N) * sum(log2 P(w2 | w1)))`` where the sum
    runs over every adjacent word pair of every test sentence and N is the
    number of pairs scored. Unknown words are mapped to "<UNK>" first.
    A bigram missing from ``bigram_model`` receives the add-one estimate
    ``1 / (count(w1) + V)``, with V the size of the unigram vocabulary.

    Params
    ------
    bigram_model: dict
        Maps "w1 w2" strings to entries with a "probability" key.
    monogram_model: dict
        Maps words to entries with a "count" key; its size is used as the
        vocabulary size V.
    test_data: list
        List of strings (one sentence per entry).
    Returns
    -------
    perplexity: float
        Perplexity of the model on the test data.
    Raises
    ------
    ValueError
        If the test data contains no bigrams (no sentence has two tokens),
        or a probability is not positive.
    """
    vocabulary = len(monogram_model)
    log_sum = 0.0
    bigram_count = 0
    for sentence in test_data:
        sentence = replace_unk(sentence, monogram_model)
        words = tokenizer.tokenize(sentence)
        for first, second in zip(words, words[1:]):
            bi_gram = first + " " + second
            if bi_gram in bigram_model:
                probability = bigram_model[bi_gram]["probability"]
            else:
                # A context word without a unigram entry has been seen zero times.
                if first in monogram_model:
                    context_count = monogram_model[first]["count"]
                else:
                    context_count = 0
                # Parenthesised on purpose: `1/count + V` would evaluate to
                # (1/count) + V, which is not a probability.
                probability = 1 / (context_count + vocabulary)
            log_sum += math.log2(probability)
            bigram_count += 1
    if bigram_count == 0:
        raise ValueError("test_data contains no bigrams to evaluate")
    # Normalise by the number of scored bigrams (not sentences) and
    # exponentiate the cross-entropy to obtain the perplexity.
    return 2 ** (-log_sum / bigram_count)

In [None]:
def calculate_trigram_perplexity(trigram: dict, bigram_model: dict, monogram_model: dict, test_data: list) -> float:
    """
    Calculate the perplexity of a trigram model.

    The perplexity is ``2 ** (-(1/N) * sum(log2 P(w3 | w1 w2)))`` where the
    sum runs over every run of three consecutive words of every test sentence
    and N is the number of trigrams scored. Unknown words are mapped to
    "<UNK>" first. For a trigram missing from ``trigram`` the probability is
    ``1 / (count(w1 w2) + V**2)`` when the bigram context is known and
    ``1 / (1 + V**2)`` otherwise, with V the size of the unigram vocabulary.

    NOTE(review): standard add-one smoothing uses V rather than V**2 in the
    denominator -- confirm that V**2 matches how the stored trigram
    probabilities were estimated.

    Params
    ------
    trigram: dict
        Maps "w1 w2 w3" strings to entries with a "probability" key.
    bigram_model: dict
        Maps "w1 w2" strings to entries with a "count" key.
    monogram_model: dict
        Unigram model; used for <UNK> replacement and its size is V.
    test_data: list
        List of strings (one sentence per entry).
    Returns
    -------
    perplexity: float
        Perplexity of the model on the test data.
    Raises
    ------
    ValueError
        If the test data contains no trigrams (no sentence has three tokens),
        or a probability is not positive.
    """
    # `**` is exponentiation; `^` is bitwise XOR in Python and binds looser
    # than `+`, so `count + V ^ 2` would compute (count + V) XOR 2.
    vocabulary_squared = len(monogram_model) ** 2
    log_sum = 0.0
    trigram_count = 0
    for sentence in test_data:
        sentence = replace_unk(sentence, monogram_model)
        words = tokenizer.tokenize(sentence)
        for first, second, third in zip(words, words[1:], words[2:]):
            bi_gram = first + " " + second
            tri_gram = bi_gram + " " + third
            if tri_gram in trigram:
                probability = trigram[tri_gram]["probability"]
            elif bi_gram in bigram_model:
                probability = 1 / (bigram_model[bi_gram]["count"] + vocabulary_squared)
            else:
                probability = 1 / (1 + vocabulary_squared)
            log_sum += math.log2(probability)
            trigram_count += 1
    if trigram_count == 0:
        raise ValueError("test_data contains no trigrams to evaluate")
    # Normalise by the number of scored trigrams (not sentences) and
    # exponentiate the cross-entropy to obtain the perplexity.
    return 2 ** (-log_sum / trigram_count)