# Retrieval models
This notebook implements classical retrieval models (binary, bag-of-words and TF-IDF) that encode tokenized texts as vectors over a predefined word corpus.

In [36]:
import math
import pickle
from abc import ABC, abstractmethod
from collections import Counter

import numpy as np
import pandas as pd

## Dataset import

In [7]:
# Word corpus table; the models below read its `word` column.
word_corpus = pd.read_csv("../datasets/20-news-word-corpus.csv")
# NOTE(review): unpickling can execute arbitrary code — only load this file
# when it comes from a trusted source.
with open("../datasets/20-news-processed-no-singles.pickle", "rb") as f:
    # presumably a list of token lists, one per text — TODO confirm against
    # the preprocessing step that wrote this file
    dataset = pickle.load(f)

In [8]:
# Number of rows in the word corpus table.
len(word_corpus)

1289

## The Retrieval model abstract class

To make it easier to add new models, we first define an abstract base class that specifies the behavior every model is expected to implement.

In [6]:
class RetrievalModel(ABC):
    """Common interface for models that encode tokenized texts.

    Concrete models must implement ``convert_item`` (encode one text) and
    ``convert_dataset`` (encode a collection of texts).
    """

    def __init__(self, word_corpus: pd.DataFrame):
        """Keep a reference to the word corpus the model is built over."""
        self.word_corpus = word_corpus
        super().__init__()

    @abstractmethod
    def convert_dataset(self, dataset: list):
        """Encode every item of ``dataset``; the base implementation returns None."""

    @abstractmethod
    def convert_item(self, item: list):
        """Encode a single ``item``; the base implementation returns None."""

## Binary model
A vector with one value per word of the word corpus: 1 if the word occurs in the encoded text's tokens, 0 otherwise.

In [13]:
class BinaryModel(RetrievalModel):
    """Encode a tokenized text as a 0/1 presence vector over the word corpus.

    Position ``i`` of an encoded item is 1 when the ``i``-th word of the
    sorted corpus occurs in the item and 0 otherwise.
    """

    def __init__(self, word_corpus: pd.DataFrame):
        """Build the model from a DataFrame that has a ``word`` column.

        After construction ``self.word_corpus`` holds the sorted list of
        words; it replaces the DataFrame stored by the base class.
        """
        super().__init__(word_corpus)
        self.word_corpus = sorted(word_corpus.word.to_list())

    def convert_dataset(self, dataset: list) -> np.ndarray:
        """Encode every item of ``dataset``.

        Returns:
            Array built from one encoded row per item (see ``convert_item``).
        """
        return np.asarray([self.convert_item(item) for item in dataset])

    def convert_item(self, item: list) -> list:
        """Return the 0/1 presence list of ``item`` over the sorted corpus.

        Args:
            item: tokens of one text; tokens must be hashable.
        """
        # Build the token set once so each corpus word is an O(1) lookup
        # instead of a scan over the whole token list.
        tokens = set(item)
        return [1 if word in tokens else 0 for word in self.word_corpus]

In [None]:
def convert_to_binary(word_corpus: pd.DataFrame, dataset: list):
    """Encode each item of ``dataset`` as a 0/1 presence vector.

    Function form of the binary model. Columns follow the row order of
    ``word_corpus.word``; no sorting is applied here, whereas ``BinaryModel``
    sorts the words first, so the two may order columns differently.

    Args:
        word_corpus: table with a ``word`` column.
        dataset: iterable of token lists; tokens must be hashable.

    Returns:
        ``np.ndarray`` built from one 0/1 row per item.
    """
    words = word_corpus.word.to_list()
    binary_dataset = []
    for item in dataset:
        # One set per item turns every corpus-word check into an O(1) lookup
        # instead of a scan over the item's token list.
        tokens = set(item)
        binary_dataset.append([1 if word in tokens else 0 for word in words])
    return np.asarray(binary_dataset)

In [15]:
# Encode the whole dataset with the binary model and display the result's shape.
binary_model = BinaryModel(word_corpus)
binary_dataset = binary_model.convert_dataset(dataset)
binary_dataset.shape

(19997, 1289)

In [26]:
# Persist the binary encoding (np.save appends ".npy" to the given path).
np.save("../datasets/20-news-binary-model", binary_dataset)

## Bag-of-Words

The bag of words model is an expansion of the binary model, where the text is represented by a vector containing the count of each word from the word corpus in the converted text.

In [48]:
class BagOfWordsModel(RetrievalModel):
    """Encode a tokenized text as a vector of word counts over the corpus.

    Position ``i`` of an encoded item is the number of times the ``i``-th
    word of the sorted corpus occurs in the item.
    """

    def __init__(self, word_corpus: pd.DataFrame):
        """Build the model from a DataFrame that has a ``word`` column.

        After construction ``self.word_corpus`` holds the sorted list of
        words; it replaces the DataFrame stored by the base class.
        """
        super().__init__(word_corpus)
        self.word_corpus = sorted(word_corpus.word.to_list())

    def convert_dataset(self, dataset: list) -> np.ndarray:
        """Encode every item of ``dataset``.

        Returns:
            Array built from one count row per item (see ``convert_item``).
        """
        return np.asarray([self.convert_item(item) for item in dataset])

    def convert_item(self, item: list) -> list:
        """Return the per-word occurrence counts of ``item``.

        Args:
            item: tokens of one text; tokens must be hashable.
        """
        # Count all tokens in one pass; a Counter yields 0 for absent words,
        # so the item is not rescanned for every corpus word.
        occurrences = Counter(item)
        return [occurrences[word] for word in self.word_corpus]

In [28]:
# Encode the whole dataset with the bag-of-words model and display the result's shape.
bow_model = BagOfWordsModel(word_corpus)
bow_dataset = bow_model.convert_dataset(dataset)
bow_dataset.shape

(19997, 1289)

In [29]:
# Persist the bag-of-words encoding (np.save appends ".npy" to the given path).
np.save("../datasets/20-news-bow-model", bow_dataset)

## TF-IDF model
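The TF-IDF model weights each word by how often it occurs in a given text (term frequency) and by how rare it is across the texts of the dataset (inverse document frequency). Here, a word that occurs $c$ times in a text and appears in $df$ of the $N$ texts gets the weight $(1 + \log_2 c)\cdot\log_2(N/df)$, and $0$ when it does not occur in the text.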

In [54]:
class TfIdfModel(RetrievalModel):
    """Encode tokenized texts as TF-IDF vectors over the sorted word corpus.

    Each component is ``tf * idf`` where, for a word that occurs ``c`` times
    in the text and appears in ``df`` of the ``N`` texts the IDF was computed
    on::

        tf  = 1 + log2(c)    (0 when c == 0)
        idf = log2(N / df)   (0 when df == 0)
    """

    def __init__(self, word_corpus: pd.DataFrame):
        """Build the model from a DataFrame that has a ``word`` column.

        After construction ``self.word_corpus`` holds the sorted list of
        words; it replaces the DataFrame stored by the base class.
        """
        super().__init__(word_corpus)
        self.word_corpus = sorted(word_corpus.word.to_list())

    @staticmethod
    def _log_scaled_tf(word_count: int):
        """Return ``1 + log2(word_count)``, or 0 when the word does not occur."""
        if word_count == 0:
            return 0
        return 1 + math.log(word_count, 2)

    def compute_idf(self, dataset: list):
        """Compute the IDF of every corpus word and store it in ``self.word_idf``.

        Args:
            dataset: sized collection of token lists; tokens must be hashable.

        A word that appears in no item keeps an IDF of 0.
        """
        dataset_size = len(dataset)
        vocabulary = set(self.word_corpus)

        # Single pass over the dataset: each item contributes at most once to
        # the document frequency of every corpus word it contains.
        document_frequency = Counter()
        for item in dataset:
            document_frequency.update(vocabulary.intersection(item))

        word_idf = []
        for word in self.word_corpus:
            containing = document_frequency[word]
            word_idf.append(math.log(dataset_size / containing, 2) if containing else 0)
        self.word_idf = np.asarray(word_idf)

    def convert_dataset(self, dataset: list) -> np.ndarray:
        """Fit the IDF on ``dataset`` and encode every item of it.

        Note that the IDF is recomputed from ``dataset`` on every call, so any
        previously computed ``self.word_idf`` is overwritten.
        """
        # Compute the idf of each word.
        self.compute_idf(dataset)
        return np.asarray([self.convert_item(item) for item in dataset])

    def compute_tf(self, word: str, item: list):
        """Return the log-scaled term frequency of ``word`` in ``item``."""
        return self._log_scaled_tf(item.count(word))

    def convert_item(self, item: list) -> list:
        """Return the TF-IDF weights of ``item`` over the sorted corpus.

        Reads ``self.word_idf``, which is only set by ``compute_idf`` (called
        from ``convert_dataset``); one of them has to run first.
        """
        # Count tokens once instead of rescanning the item for every word; the
        # weighting is the same one ``compute_tf`` applies.
        occurrences = Counter(item)
        return [
            self._log_scaled_tf(occurrences[word]) * self.word_idf[index]
            for index, word in enumerate(self.word_corpus)
        ]

In [55]:
# Encode the whole dataset with the TF-IDF model (this also computes the IDF
# from the same dataset) and display the result's shape.
tf_idf_model = TfIdfModel(word_corpus)
tf_idf_dataset = tf_idf_model.convert_dataset(dataset)
tf_idf_dataset.shape

(19997, 1289)

In [74]:
# Sanity check: count how many items of the dataset contain the token
# 'atheist' (its document frequency).
count = 0
for item in dataset:
    if 'atheist' in item:
        count += 1
print(count)

308


In [75]:
# Dataset size (19997 items) divided by the number of items containing
# 'atheist' (308, counted above): the ratio whose log2 is that word's IDF.
19997/308

64.92532467532467

In [60]:
# Manual IDF for a word found in 1416 of the 19997 items, to compare with the
# model's stored value.
# NOTE(review): 1416 is presumably the document count of 'abl' — confirm.
math.log(19997/1416, 2)

3.819890408978376

In [58]:
# Inspect the corpus word at index 12 and the IDF the model stored for it.
model = tf_idf_model
print(model.word_corpus[12])
print(model.word_idf[12])

abl
3.819890408978376


In [62]:
# TF weight of a word that occurs exactly once in a text.
1 + math.log(1, 2)

1.0

In [68]:
# Raw count, in the item at index 1, of the corpus word at column 105.
bow_dataset[1][105]

90

In [70]:
# Which corpus word column 105 corresponds to.
model.word_corpus[105]

'atheist'

In [67]:
# TF-IDF weight of the same (item 1, column 105) cell; by the model's formula
# it should equal (1 + log2(count)) * idf of that word.
tf_idf_dataset[1][105]

45.10627049809009

In [76]:
# Persist the TF-IDF encoding (np.save appends ".npy" to the given path).
np.save("../datasets/20-news-tf-idf-model", tf_idf_dataset)