In [86]:
import pandas as pd
import numpy as np
import nltk
import re
import collections
import bisect
nltk.download('punkt')
nltk.download('rslp')
nltk.download('stopwords')


nltk.download('stopwords')
data = pd.read_csv('./results.csv', sep=',')
json = pd.read_json('./results_final.json')
gabarito = {json['query'][i]:json['docs'][i] for i in range(10)}

textData = pd.DataFrame(data['text'], columns=['text'])
textData['tokenizedText'] = data.apply(lambda row: nltk.word_tokenize(row['text'].lower(), language='portuguese'), axis=1)



stopwords = nltk.corpus.stopwords.words('portuguese')

index = {}
document = 0
M = textData.text.count()

for wordList in textData.tokenizedText:
  document += 1
  for word in wordList:
    if word not in stopwords and len(word) >= 3:      
      if word not in index.keys():
        index[word] = []
      index[word].append(document)
      
for elem in index.items():
  d = dict(collections.Counter(elem[1]))
  index[elem[0]] = list(d.items())
  
for word in index:
  k = len(index[word])
  IDF = round(np.log((M+1)/k),2)
  index[word].append(IDF)
  

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package rslp to /root/nltk_data...
[nltk_data]   Package rslp is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#Questão 1

Escolha um documento dentre aqueles da base do aluno Bernardi e crie uma consulta que você acha que tem boas chances de recuperar este documento. Em seguida, avalie os resultados de tal consulta usando a métrica de avaliação Reciprocal Rank

Para essa questão iremos escolher o documento 14 que trata do falecimento do neto do ex-presidente Lula e algumas reações de ódio expressadas.

In [87]:
ndoc = 14
document = data.loc[ndoc]
query = 'neto lula'
row = data.loc[data.url == document.url]

document.title

'A morte do inocente neto de Lula soltou os monstros do ódio'

Abaixo os modelos vetoriais definidos no laboratório anterior.

In [0]:
def binary_vsm(query, document):
  score = 0
  query_tokens = query.split()
  doc_tokens = document.split()
  
  for token in query_tokens:
    score += (token in doc_tokens)
    
  return score


def tf_vsm(query, document):
  score = 0
  doc_tokens = document.split()
  query_tokens = query.split()
  
  for word in query_tokens:
    score += doc_tokens.count(word)
  
  return score

def tf_idf_vsm(query, document):
  score = 0
  doc_tokens = document.split()
  query_tokens = query.split()
  
  for word in query_tokens:
    cwd = doc_tokens.count(word)
    if word in index:
      score += cwd * index[word][-1]
  
  return round(score,2)


def bm25_vsm(query, document, k):
  score = 0
  doc_tokens = document.split()
  query_tokens = query.split()
  
  words = [word for word in query_tokens if word in doc_tokens]
    
  for word in words:
    cwd = doc_tokens.count(word)
    dfw = 0
    if word in index:
      dfw = len(index[word][:-1])
    score += (((k+1) * cwd) / (cwd + k)) * np.log10(((M+1) / dfw)) if dfw != 0 else 0
  
  return round(score,2)


def create_topk_models(query,k):
  db = []
  dtf = []
  dtfidf = []
  dbm25 = []
  for i in range(len(data)):
    doc = data.text[i].lower()
    bisect.insort(db, (binary_vsm(query, doc), i))
    bisect.insort(dtf, (tf_vsm(query,doc), i))
    bisect.insort(dtfidf, (tf_idf_vsm(query,doc), i))
    bisect.insort(dbm25, (bm25_vsm(query,doc,20), i))
  
  db.reverse()
  dtf.reverse()
  dtfidf.reverse()
  dbm25.reverse()
  
  return db[:k], dtf[:k], dtfidf[:k], dbm25[:k]

In [0]:
top_binary, top_tf, top_tfidf, top_bm25 = create_topk_models(query,10)

##Resultados


In [90]:
query_df = pd.DataFrame()

query_df['Binary'] = top_binary
query_df['TF'] = top_tf
query_df['TF-IDF'] = top_tfidf
query_df['BM25'] = top_bm25

query_df.index+=1
query_df

Unnamed: 0,Binary,TF,TF-IDF,BM25
1,"(2, 225)","(14, 14)","(44.56, 14)","(14.87, 14)"
2,"(2, 14)","(3, 233)","(9.52, 225)","(4.01, 225)"
3,"(1, 235)","(3, 225)","(9.12, 233)","(3.61, 233)"
4,"(1, 234)","(2, 203)","(6.08, 203)","(2.52, 203)"
5,"(1, 233)","(1, 235)","(3.44, 234)","(1.49, 234)"
6,"(1, 215)","(1, 234)","(3.44, 148)","(1.49, 148)"
7,"(1, 203)","(1, 215)","(3.44, 113)","(1.49, 113)"
8,"(1, 171)","(1, 171)","(3.44, 64)","(1.49, 64)"
9,"(1, 167)","(1, 167)","(3.44, 33)","(1.49, 33)"
10,"(1, 148)","(1, 148)","(3.04, 235)","(1.32, 235)"


Para a análise dos resultados utilizaremos o reciprocal rank como indicado no enunciado da questão.


In [91]:
def reciprocal_rank(tuples, docId):
  n = 1.0;
  for r,doc in tuples:
    if doc == docId:
      return  [round(1 / n, 2)]
    else:
      n += 1

rank_df = pd.DataFrame()
rank_df['Binary'] = reciprocal_rank(query_df['Binary'], ndoc)
rank_df['TF'] = reciprocal_rank(query_df['TF'], ndoc)
rank_df['TF-IDF'] = reciprocal_rank(query_df['TF-IDF'], ndoc)
rank_df['BM25'] = reciprocal_rank(query_df['BM25'], ndoc)
rank_df.index+=1
rank_df

Unnamed: 0,Binary,TF,TF-IDF,BM25
1,0.5,1.0,1.0,1.0


#Questão 2

A partir do gabarito fornecido em OBS1, calcule o MAP para cada algoritmo abaixo e aponte qual obteve o melhor resultado. Para os cálculos do MAP, considere que um documento é relevante para uma dada consulta se este documento estiver entre os documentos do gabarito para essa consulta, senão ele deve ser considerado irrelevante. 

In [0]:
def doc_indexes(model):
  return [doc for score,doc in model]

def intersection(a,b):
  return [elem for elem in a if elem in b]

def calc_AP(query):
  relevant_docs = []

  for doc_info in gabarito[query]:
    row = data.loc[data.url == doc_info['URL']]
    relevant_docs.append(row.index[0])
  
  binary, tf, tfidf, bm25 = create_topk_models(query, 5)
  binary = doc_indexes(binary)
  tf = doc_indexes(tf)
  tfidf = doc_indexes(tfidf)
  bm25 = doc_indexes(bm25)
  
  ap_binary = len(intersection(binary, relevant_docs)) / len(binary)
  ap_tf = len(intersection(tf, relevant_docs)) / len(tf)
  ap_tfidf = len(intersection(tfidf, relevant_docs)) / len(tfidf)
  ap_bm25 = len(intersection(bm25, relevant_docs)) / len(bm25)
  
  return ap_binary, ap_tf, ap_tfidf, ap_bm25
  
def calc_MAP(queries):
  sum_binary = 0
  sum_tf = 0
  sum_tfidf = 0
  sum_bm25 = 0
  
  for query in queries:
    ap_binary, ap_tf, ap_tfidf, ap_bm25 = calc_AP(query)
    sum_binary += ap_binary
    sum_tf += ap_tf
    sum_tfidf += ap_tfidf
    sum_bm25 += ap_bm25
  
  map_binary = round(sum_binary / len(queries),2)
  map_tf = round(sum_tf / len(queries),2)
  map_tfidf = round(sum_tfidf / len(queries),2)
  map_bm25 = round(sum_bm25 / len(queries),2)
  
  return map_binary, map_tf, map_tfidf, map_bm25


map_binary, map_tf, map_tfidf, map_bm25 = calc_MAP(gabarito.keys())

##Resultados

Abaixo podemos observar o que o **MAP** de todos os algoritmos é muito baixo, isso nos mostra uma precisão extremamente baixa quando procuramos documentos específicos.

In [93]:
rank_df = pd.DataFrame()
rank_df['Binary'] = [map_binary]
rank_df['TF'] = [map_tf]
rank_df['TF-IDF'] = [map_tfidf]
rank_df['BM25'] = [map_bm25]
rank_df.index+=1
rank_df

Unnamed: 0,Binary,TF,TF-IDF,BM25
1,0.1,0.02,0.18,0.18


#Questão 3
Repita Q2 usando a avaliação multi-nível DCG. Utilize o campo "level" do gabarito para o cálculo do DCG e do idealDCG.

In [0]:
def set_levels(m, d):
  model = [(0, doc) for doc in m if doc not in d]
  dic = [(v, k) for k, v in d.items()]
  
  res = model + dic
  res.sort(reverse=True)
  
  return res

def get_level(d, l):
  for level,doc in l:
    if doc == d:
      return level

def all_docs(bi,tf,tfidf,bm):
  return doc_indexes(bi), doc_indexes(tf), doc_indexes(tfidf), doc_indexes(bm)
    
def all_levels(bi,tf,tfidf,bm, rd):
  return set_levels(bi,rd), set_levels(tf,rd), set_levels(tfidf,rd), set_levels(bm,rd)

def extract_docs(bi,tf,tfidf,bm):
  return [doc for level,doc in bi], [doc for level,doc in tf], [doc for level,doc in tfidf], [doc for level,doc in bm]

In [0]:
def calc_dcg(model, levels):
  dcg = 0.0
  for i in range(1,len(model)+1):
    doc = model[i-1]
    level = get_level(doc, levels)
    dcg += (2^level) / np.log2(i + 1.0)
    
  return dcg

def dcg_models(query):
  relevant_docs = {}

  for doc_info in gabarito[query]:
    row = data.loc[data.url == doc_info['URL']]
    relevant_docs[row.index[0]] = doc_info['level']
    
  binary, tf, tfidf, bm25 = create_topk_models(query, 5)
  binary, tf, tfidf, bm25 = all_docs(binary,tf,tfidf,bm25)
  
  dcg_binary = round(calc_dcg(binary, set_levels(binary, relevant_docs)),2)
  dcg_tf = round(calc_dcg(tf, set_levels(tf, relevant_docs)),2)
  dcg_tfidf = round(calc_dcg(tfidf, set_levels(tfidf, relevant_docs)),2)
  dcg_bm25 = round(calc_dcg(bm25, set_levels(bm25, relevant_docs)),2)
  
  return dcg_binary, dcg_tf, dcg_tfidf, dcg_bm25

def idcg_models(query):
  relevant_docs = {}

  for doc_info in gabarito[query]:
    row = data.loc[data.url == doc_info['URL']]
    relevant_docs[row.index[0]] = doc_info['level']
    
  binary, tf, tfidf, bm25 = create_topk_models(query, 5)
  binary = doc_indexes(binary)
  tf = doc_indexes(tf)
  tfidf = doc_indexes(tfidf)
  bm25 = doc_indexes(bm25)
  
  levels_binary, levels_tf, levels_tfidf, levels_bm25 = all_levels(binary,tf,tfidf,bm25,relevant_docs)
  
  binary, tf, tfidf, bm25 = extract_docs(levels_binary, levels_tf, levels_tfidf, levels_bm25)
  
  idcg_binary = round(calc_dcg(binary, levels_binary),2)
  idcg_tf = round(calc_dcg(tf, levels_tf),2)
  idcg_tfidf = round(calc_dcg(tfidf, levels_tfidf),2)
  idcg_bm25 = round(calc_dcg(bm25, levels_bm25),2)
  
  return idcg_binary, idcg_tf, idcg_tfidf, idcg_bm25

In [0]:

queries_results = {}
for query in gabarito.keys():
  dcg_binary, dcg_tf, dcg_tfidf, dcg_bm25 = dcg_models(query)
  idcg_binary, idcg_tf, idcg_tfidf, idcg_bm25 = idcg_models(query)
  
  binary = (dcg_binary, idcg_binary)
  tf = (dcg_tf, idcg_tf)
  tfidf = (dcg_tfidf, idcg_tfidf)
  bm25 = (dcg_bm25, idcg_bm25)
  
  results = [binary, tf, tfidf, bm25]
  
  queries_results[query] = results

##Resultados

Abaixo apresentaremos um data frame em que cada coluna representa um modelo vetorial e seus respectivos resultados para as métricas DCG e IDCG, respectivamente.

In [97]:
results_df = pd.DataFrame()
results_df['Query'] = gabarito.keys()
results_df['Binary'] = [queries_results[query][0] for query in gabarito.keys()]
results_df['TF'] = [queries_results[query][1] for query in gabarito.keys()]
results_df['TF-IDF'] = [queries_results[query][2] for query in gabarito.keys()]
results_df['BM25'] = [queries_results[query][3] for query in gabarito.keys()]
results_df.index+=1

results_df

Unnamed: 0,Query,Binary,TF,TF-IDF,BM25
1,território palestino,"(5.9, 15.65)","(5.9, 15.65)","(5.51, 14.98)","(5.51, 14.98)"
2,recessão mundial,"(11.58, 14.98)","(9.77, 14.98)","(9.77, 14.98)","(9.77, 14.98)"
3,ditadura militar,"(5.9, 17.17)","(5.9, 17.17)","(5.9, 17.17)","(5.9, 17.17)"
4,muro das lamentações,"(18.08, 19.29)","(5.9, 21.3)","(19.29, 19.29)","(19.29, 19.29)"
5,brasil e argentina,"(8.9, 17.5)","(5.9, 18.17)","(7.4, 17.5)","(7.79, 17.5)"
6,golpe militar,"(5.9, 20.67)","(5.9, 20.67)","(8.4, 20.04)","(8.4, 20.04)"
7,governo bolsonaro,"(5.9, 16.54)","(5.9, 16.54)","(5.9, 16.54)","(5.9, 16.54)"
8,ministro da economia,"(5.9, 17.17)","(5.9, 17.17)","(5.9, 17.17)","(5.9, 17.17)"
9,prisão de Temer,"(5.9, 13.43)","(5.9, 13.43)","(10.29, 12.05)","(10.29, 12.05)"
10,Congresso Nacional,"(5.9, 9.65)","(5.9, 9.65)","(5.9, 9.65)","(5.9, 9.65)"
