## Probabilistic BM25

In [1]:
import pandas as pd
import pickle
import numpy as np
import math
from sklearn import metrics

In [19]:
word_corpus = pd.read_csv("../datasets/20news-word-corpus-2k.csv")
test_filter = np.load("../datasets/test_filter.npy")
classes = np.load("../datasets/20-news-classes.npy")
with open("../datasets/20-news-processed-no-singles.pickle", "rb") as f:
    dataset = pickle.load(f)
len(word_corpus)

2423

In [17]:
class RetrievalEvaluation:
    
    def __init__(self, ranked_lists:np.ndarray, classes:np.ndarray):
        self.ranked_lists = ranked_lists
        self.classes = classes
        self.class_size = 1000
        self.n = len(ranked_lists)
        
    def p_at_n(self, n:int)->float:
        p_total = 0
        for rank in self.ranked_lists:
            p = 0
            target_class = self.classes[rank[0]]
            for i in range(n):
                if self.classes[rank[i]] == target_class:
                    p += 1
            p = p / n
            p_total += p
        return p_total / self.n
    
    def p_at_10(self)->float:
        return self.p_at_n(n=10)
    
    def p_at_20(self)->float:
        return self.p_at_n(n=20)
    
    def p_at_50(self)->float:
        return self.p_at_n(n=50)
    
    def p_at_100(self)->float:
        return self.p_at_n(n=100)
    
    def computeAveragePrecision(self, rk, d=1000):
        sumrj = 0
        curPrecision = 0
        sumPrecision = 0
        qClass = self.classes[rk[0]]
        for i in range(d):
            imgi = rk[i]
            imgiClass = self.classes[imgi]
            if (qClass == imgiClass):
                sumrj = sumrj + 1
                posi = i + 1
                curPrecision = sumrj / posi
                sumPrecision += curPrecision
        nRel = self.class_size
        l = len(rk)
        avgPrecision = sumPrecision / min(l, nRel)
        return avgPrecision

    def compute_map(self):
        acumAP = 0
        for rk in self.ranked_lists:
            acumAP += self.computeAveragePrecision(rk)
        return acumAP / self.n
    
    def evaluate_all(self) -> None:
        print("=========== Evaluation Procedure ===========")
        print("Evaluation dataset size:", self.n)
        print("Precision at 10:", self.p_at_10())
        print("Precision at 20:", self.p_at_20())
        print("Precision at 50:", self.p_at_50())
        print("Precision at 100:", self.p_at_100())
        print("Map:", self.compute_map())

In [29]:
class BM25Probabilistic:
    
    def __init__(self, word_corpus:pd.DataFrame, k = 1.2, b = 0.8):
        self.metric = "cosine"
        self.k = k
        self.b = b
        self.word_corpus = word_corpus.word.to_list()
        self.dataset = None
        self.metric = None
    
    def compute_ranked_lists(self, dataset:list, queries:list) -> np.ndarray:
        self.compute_idf(dataset)
        self.compute_average_size(dataset)

        ranked_lists = []
        total = len(queries)
        for x, query in enumerate(queries):
            print("processing", x, "of", total)
            similarities = []
            clean_query = [value for value in query if value in self.word_corpus]
            for item in dataset:
                similarity = 0
                target_words = [value for value in clean_query if value in item]
                for word in target_words:
                    similarity += self.compute_tf_idf(word, item)
                similarities.append(similarity)
            similarities = np.asarray(similarities) * -1
            rank_map = np.argsort(similarities)
            ranked_lists.append(rank_map)
        self.ranked_lists = np.asarray(ranked_lists)
        
    def get_dataset(self) -> np.ndarray:
        return self.dataset
    
    def compute_tf_idf(self, word:str, item:list) -> float:
        item_size = len(item)
        tf = self.compute_tf(word, item, item_size)
        tf_idf = tf * self.word_idf[word] if item.count(word) > 0 else 0
        return tf_idf
    
    def get_ranked_lists(self) -> np.ndarray:
        return self.ranked_lists
    
    def compute_idf(self, dataset:list):
        word_idf = {}
        dataset_size = len(dataset)
        for index, word in enumerate(self.word_corpus):
            word_idf[word] = 0
            for item in dataset:
                if word in item:
                    word_idf[word] += 1
            word_idf[word] = math.log((dataset_size - word_idf[word] + 0.5) / (word_idf[word] + 0.5), 2)
        self.word_idf = word_idf
        
    def compute_average_size(self, dataset:list):
        total_size = 0
        for item in dataset:
            total_size += len(item)
        self.average_size = total_size / len(dataset)
    
    def compute_tf(self, word:str, item:list, item_size:int):
        word_count = item.count(word)
        if word_count == 0:
            return 0
        else:
            upper = (self.k + 1) * word_count
            bottom = (self.k * (1 - self.b)) + (self.k * self.b * self.average_size / item_size) + word_count
            return upper / bottom 

In [26]:
filter_dataset = []
for index, val in enumerate(test_filter):
    if val:
        filter_dataset.append(dataset[index])
print(len(filter_dataset))

2000


In [30]:
bm25_model = BM25Probabilistic(word_corpus)
bm25_model.compute_ranked_lists(dataset, filter_dataset)
ranked_lists = bm25_model.get_ranked_lists()

processing 0 of 2000
processing 1 of 2000
processing 2 of 2000
processing 3 of 2000
processing 4 of 2000
processing 5 of 2000
processing 6 of 2000
processing 7 of 2000
processing 8 of 2000
processing 9 of 2000
processing 10 of 2000
processing 11 of 2000
processing 12 of 2000
processing 13 of 2000
processing 14 of 2000
processing 15 of 2000
processing 16 of 2000
processing 17 of 2000
processing 18 of 2000
processing 19 of 2000
processing 20 of 2000
processing 21 of 2000
processing 22 of 2000
processing 23 of 2000
processing 24 of 2000
processing 25 of 2000
processing 26 of 2000
processing 27 of 2000
processing 28 of 2000
processing 29 of 2000
processing 30 of 2000
processing 31 of 2000
processing 32 of 2000
processing 33 of 2000
processing 34 of 2000
processing 35 of 2000
processing 36 of 2000
processing 37 of 2000
processing 38 of 2000
processing 39 of 2000
processing 40 of 2000
processing 41 of 2000
processing 42 of 2000
processing 43 of 2000
processing 44 of 2000
processing 45 of 200

In [31]:
evaluation = RetrievalEvaluation(ranked_lists, classes)
evaluation.evaluate_all()

Evaluation dataset size: 2000
Precision at 10: 0.3429000000000022
Precision at 20: 0.25515000000000065
Precision at 50: 0.17752999999999997
Precision at 100: 0.1464400000000002
Map: 0.022750116115334854
