In [1]:
import pandas as pd
import numpy as np
import nltk as nl
from collections import defaultdict
import math

In [2]:
DATABASE_PATH = '../database/estadao_noticias_eleicao.csv'
TEMPLATE_PATH = '../database/gabarito.csv'

In [3]:
noticias_estadao = pd.read_csv(DATABASE_PATH)
noticias_estadao = noticias_estadao.replace(np.nan, '', regex=True)
gabarito = pd.read_csv(TEMPLATE_PATH)

In [4]:
conj_teste = noticias_estadao.head()
gabarito.head()

Unnamed: 0,str_busca,google,busca_binaria,tf,tfidf,bm25
0,segundo turno,"[1062, 1942, 2161, 2078, 2073]","[2048, 1, 2049, 2050, 4096]","[2744, 7, 2112, 7672, 2388]","[2744, 2112, 7672, 1235, 2388]","[2744, 2112, 7672, 2388, 2178]"
1,lava jato,"[616, 164, 1734, 163, 6716]","[3, 13, 15, 27, 6177]","[163, 353, 2807, 127, 359]","[163, 353, 2807, 127, 359]","[163, 353, 2807, 127, 359]"
2,projeto de lei,"[2853, 275, 978, 7092, 3171]","[3584, 6145, 8194, 8706, 6660]","[7, 3942, 7017, 1250, 6942]","[2232, 6461, 2853, 3171, 3942]","[2232, 6461, 3171, 2853, 3170]"
3,compra de voto,"[2200, 8615, 2265, 7746, 82]","[7424, 2178, 6531, 5122, 2311]","[3942, 7017, 5129, 2047, 748]","[7343, 7293, 6791, 3942, 2047]","[7343, 7293, 6791, 7329, 8615]"
4,ministério público,"[64, 6652, 164, 6550, 8615]","[8194, 7, 4104, 8201, 4109]","[6798, 8018, 6244, 6965, 6550]","[6798, 8018, 6244, 6965, 6550]","[6798, 8018, 6244, 6965, 6550]"


In [5]:
def freq_termo(documento, termo):
    return len(list(filter((lambda x: x == termo), documento)))

In [6]:
def produz_tokens(df):
    inverted_index = {}
    for i, row in df.iterrows():
        
        titulo = (word.lower() for word in (nl.word_tokenize(row['titulo'])))
        subtitulo = (word.lower() for word in (nl.word_tokenize(row['subTitulo'])))
        conteudo = (word.lower() for word in (nl.word_tokenize(row['conteudo'])))
        
        documento = list(titulo) + list(subtitulo) + list(conteudo)
        
        for termo in documento:
            if termo in inverted_index:
                inverted_index[termo][termo][row['idNoticia']] = freq_termo(documento, termo)
            else:
                inverted_index[termo] = {termo: {row['idNoticia']: freq_termo(documento, termo)}}
        
    return inverted_index

In [7]:
general_dict = produz_tokens(noticias_estadao)
#dict_teste = produz_tokens(conj_teste)

In [8]:
def representacao_binaria(query):
    binary_dict = {}
    for termo in query:
        for docId in general_dict[termo][termo].keys():
            if docId in binary_dict:
                binary_dict[docId] += 1
            else:
                binary_dict[docId] = 1
    return binary_dict

In [9]:
def representacao_tf(query):
    tf_dict = {}
    for termo in query:
        for docId in general_dict[termo][termo].keys():
            if docId in tf_dict:
                tf_dict[docId] += general_dict[termo][termo][docId]
            else:
                tf_dict[docId] = general_dict[termo][termo][docId]
    return tf_dict

In [10]:
def representacao_tf_idf(query):
    M = noticias_estadao.size # Total de documentos
    tf_idf_dict = {}
    for termo in query:
        for docId in general_dict[termo][termo].keys():
            k = len(general_dict[termo][termo])
            if docId in tf_idf_dict:               
                tf_idf_dict[docId] += math.log10((M+1)/k)
            else:
                tf_idf_dict[docId] = math.log10((M+1)/k)
    
    tf_dict = representacao_tf(query)
    
    for docId in tf_dict.keys():
        tf_idf_dict[docId] *= tf_dict[docId]
    
    return tf_idf_dict

In [11]:
def bm25(query):
    M = noticias_estadao.size
    bm25_dict = {}
    for termo in query:
        for docId in general_dict[termo][termo].keys():
            k = len(general_dict[termo][termo])
            c = general_dict[termo][termo][docId]
            if docId in bm25_dict:               
                bm25_dict[docId] += (((k+1)*c)/(c+k))*math.log10((M+1)/k)
            else:
                bm25_dict[docId] = (((k+1)*c)/(c+k))*math.log10((M+1)/k)
    return bm25_dict

In [21]:
import operator
x = bm25(["segundo", "turno"])
sorted_x = sorted(x.items(), key=operator.itemgetter(1))
sorted_x[len(sorted_x)-1]

(2744, 58.06345477174415)

In [13]:
representacao_tf(["segundo", "turno"])

{1: 3,
 2: 1,
 3: 4,
 7: 28,
 8: 2,
 11: 2,
 13: 2,
 14: 1,
 16: 1,
 18: 1,
 19: 2,
 23: 3,
 25: 1,
 26: 3,
 28: 1,
 29: 1,
 31: 4,
 33: 1,
 35: 6,
 36: 2,
 38: 2,
 41: 2,
 43: 3,
 45: 1,
 46: 3,
 47: 2,
 51: 2,
 52: 4,
 54: 1,
 56: 2,
 57: 1,
 58: 2,
 59: 2,
 60: 1,
 62: 3,
 64: 1,
 69: 8,
 71: 3,
 72: 2,
 74: 7,
 75: 2,
 76: 2,
 77: 1,
 78: 5,
 79: 1,
 80: 3,
 81: 4,
 82: 2,
 83: 1,
 85: 1,
 86: 2,
 87: 4,
 88: 3,
 89: 2,
 91: 2,
 92: 4,
 95: 4,
 98: 2,
 99: 1,
 102: 2,
 104: 2,
 105: 1,
 106: 2,
 107: 3,
 108: 4,
 111: 2,
 113: 2,
 114: 2,
 116: 1,
 117: 5,
 118: 3,
 119: 4,
 121: 1,
 123: 3,
 124: 2,
 125: 1,
 126: 3,
 127: 6,
 128: 1,
 129: 3,
 130: 1,
 133: 1,
 136: 2,
 137: 2,
 142: 1,
 146: 2,
 147: 2,
 148: 1,
 149: 3,
 150: 1,
 152: 1,
 154: 1,
 155: 7,
 157: 4,
 158: 1,
 159: 2,
 163: 7,
 164: 3,
 165: 1,
 169: 1,
 170: 1,
 172: 2,
 173: 8,
 175: 1,
 177: 3,
 178: 1,
 179: 2,
 180: 3,
 182: 1,
 184: 2,
 185: 1,
 191: 1,
 192: 1,
 196: 3,
 197: 4,
 198: 1,
 200: 1,
 201: 1,
 

In [14]:
list(representacao_tf_idf([".","agora","ainda"])

{1: 68.94343691503529,
 2: 73.87248141769548,
 3: 110.72890932341116,
 4: 0.812573010206781,
 5: 0.812573010206781,
 6: 0.812573010206781,
 7: 993.2047624160516,
 8: 48.60031672216808,
 10: 53.68674391438117,
 11: 2.437719030620343,
 12: 4.062865051033905,
 13: 573.7770755849488,
 15: 26.00233632661699,
 16: 114.08433081805998,
 17: 8.12573010206781,
 18: 29.160190033300847,
 19: 8.93830311227459,
 20: 21.939471275583085,
 21: 13.001168163308495,
 22: 120.09501914231954,
 23: 107.37348782876234,
 25: 127.50601679665527,
 26: 14.626314183722057,
 27: 138.02489949095735,
 28: 8.93830311227459,
 29: 37.37835846951192,
 30: 11.376022142894934,
 31: 18.68917923475596,
 32: 6.500584081654248,
 33: 110.72890932341116,
 34: 14.626314183722057,
 35: 69.98445607992204,
 36: 2.437719030620343,
 37: 42.76827871550791,
 38: 211.89738090865282,
 39: 31.10420270218757,
 40: 68.04044341103531,
 41: 104.01806633411351,
 42: 194.61444668963173,
 43: 52.48834205994152,
 44: 86.73529160278635,
 45: 31.135

In [15]:
bm25([".","agora","ainda"])

{1: 25.701128392188256,
 2: 31.062836208837275,
 3: 27.639120164146533,
 4: 0.812573010206781,
 5: 0.812573010206781,
 6: 0.812573010206781,
 7: 236.59024861887406,
 8: 20.577651995069857,
 10: 13.900535853241063,
 11: 2.437113762083876,
 12: 4.060847990067772,
 13: 139.43878946101177,
 15: 25.902624250154208,
 16: 30.380724621431256,
 17: 8.116658957076664,
 18: 12.489127027478963,
 19: 8.92721753232634,
 20: 21.868865228650897,
 21: 12.976996432712358,
 22: 44.2014734665204,
 23: 27.156670651034332,
 25: 32.31870332961043,
 26: 14.59550286511942,
 27: 59.180665343688915,
 28: 8.92721753232634,
 29: 37.17064963632743,
 30: 11.35768736879902,
 31: 18.638261347057554,
 32: 6.494938413096979,
 33: 28.56668892622918,
 34: 14.59550286511942,
 35: 29.777025047730163,
 36: 2.437113762083876,
 37: 18.153198543245697,
 38: 87.73857798332239,
 39: 13.29888192568479,
 40: 29.29518769530392,
 41: 26.02581632068273,
 42: 48.06753219875028,
 43: 22.516257252604678,
 44: 32.752823651114284,
 45: 11.