# Retrieval models
This notebook implements the classical retrieval models (binary, bag-of-words and TF-IDF) on tokenized texts, using a predefined word corpus.

In [1]:
import pickle
from collections import Counter

import numpy as np
import pandas as pd

## Dataset import

In [2]:
# Word-corpus table; its `word` column supplies the vocabulary used by the encoders below.
word_corpus = pd.read_csv("../datasets/20-news-word-corpus.csv")
# Pre-processed documents (presumably one token list per document -- TODO confirm
# against the preprocessing step that wrote this file).
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
with open("../datasets/20-news-processed-no-singles.pickle", "rb") as f:
    dataset = pickle.load(f)

In [9]:
# Row count of the word-corpus table; each row contributes one entry to every encoded vector.
len(word_corpus)

1289

## Binary model
A vector with a value of 1 or 0 for each word of the word corpus, indicating whether that word appears among the tokens of the encoded text.

In [7]:
def convert_to_binary(word_corpus: pd.DataFrame, dataset: list) -> np.ndarray:
    """Encode every text as a 0/1 presence vector over the word corpus.

    Parameters
    ----------
    word_corpus : pd.DataFrame
        Table with a ``word`` column; the order of that column defines the
        order of the entries in each encoded vector.
    dataset : list
        One item per text. Each item is a collection of hashable tokens
        (e.g. a list of strings). A plain ``str`` item is also accepted and
        is tested with substring containment.

    Returns
    -------
    np.ndarray
        Array built from one row per item and one column per corpus word,
        holding 1 where the word is contained in the item and 0 otherwise.
    """
    words = word_corpus.word.to_list()
    binary_dataset = []
    for item in dataset:
        # Build the lookup once per text: set membership is O(1), whereas
        # `word in <list>` rescans the whole token list for every corpus word.
        # Strings are left untouched so `in` keeps its substring meaning.
        lookup = item if isinstance(item, str) else set(item)
        binary_dataset.append([1 if word in lookup else 0 for word in words])
    return np.asarray(binary_dataset)

In [8]:
# Encode every item of `dataset`; the shape is (number of items, number of corpus words).
binary_dataset = convert_to_binary(word_corpus, dataset)
binary_dataset.shape

(19997, 1289)

In [12]:
# Persist the binary encoding; np.save appends ".npy" because the path does not already end with it.
np.save("../datasets/20-news-binary-model", binary_dataset)

## Bag-of-Words

The bag of words model is an expansion of the binary model, where the text is represented by a vector containing the count of each word from the word corpus in the converted text.

In [15]:
def convert_to_bag_of_words(word_corpus: pd.DataFrame, dataset: list) -> np.ndarray:
    """Encode every text as a vector of word counts over the word corpus.

    Parameters
    ----------
    word_corpus : pd.DataFrame
        Table with a ``word`` column; the order of that column defines the
        order of the entries in each encoded vector.
    dataset : list
        One item per text. Each item is a collection of hashable tokens
        (e.g. a list of strings). A plain ``str`` item is also accepted and
        is counted with ``str.count`` (non-overlapping substring matches).

    Returns
    -------
    np.ndarray
        Array built from one row per item and one column per corpus word,
        holding how many times the word occurs in the item.
    """
    words = word_corpus.word.to_list()
    bow_dataset = []
    for item in dataset:
        if isinstance(item, str):
            # Keep substring counting for raw strings.
            bow_item = [item.count(word) for word in words]
        else:
            # Tally the tokens in a single pass instead of rescanning the
            # whole text once per corpus word; missing words count as 0.
            token_counts = Counter(item)
            bow_item = [token_counts[word] for word in words]
        bow_dataset.append(bow_item)
    return np.asarray(bow_dataset)

In [16]:
# Encode every item of `dataset`; the shape is (number of items, number of corpus words).
bow_dataset = convert_to_bag_of_words(word_corpus, dataset)
bow_dataset.shape

(19997, 1289)

In [17]:
# Persist the bag-of-words encoding; np.save appends ".npy" because the path does not already end with it.
np.save("../datasets/20-news-bow-model", bow_dataset)

## TF-IDF model
The TF-IDF model computes a value for each word based on how often it occurs in each text and in the full corpus, and on how many texts it appears in.