<a href="https://colab.research.google.com/github/ntshrocha/rec-info-2019-1/blob/master/Recupera%C3%A7%C3%A3o_de_Informa%C3%A7%C3%A3o_Natasha.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Natasha Rocha - DRE: 112079422

In [0]:
import re # expressões regulares
import functools # programação funcional
import numpy as np
import pandas as pd

In [0]:
# Sample data used by every example below.

# Document collection: one string per document.
M = [
    'O peã e o caval são pec de xadrez. O caval é o melhor do jog.',
    'A jog envolv a torr, o peã e o rei.',
    'O peã lac o boi',
    'Caval de rodei!',
    'Polic o jog no xadrez.',
]

# Words dropped during normalization.
stopwords = [
    'a',
    'o',
    'e',
    'é',
    'de',
    'do',
    'no',
    'são',
]

# Query used by the retrieval examples.
q = 'xadrez peã caval torr'

# Characters that delimit tokens.
separadores = [
    ' ',
    ',',
    '.',
    '!',
    '?',
]

## Exercício 4: Modelo Booleano

In [0]:
# TOKENIZAÇÃO DOS DOCUMENTOS ##############################################################################################################
def tokenize(string, seps):
  """Split ``string`` into tokens on any run of the given separators.

  Args:
    string: Text to split.
    seps: Iterable of separator strings. Every entry is matched literally,
      so regex metacharacters and multi-character separators are safe.
      Empty entries are ignored.

  Returns:
    List with the non-empty tokens, in their original order. When there is
    no usable separator the whole string is returned as a single token.
  """
  # Drop empty separators and duplicates, then try longer separators first so
  # that a multi-character separator is not shadowed by one of its prefixes.
  unique_seps = list(dict.fromkeys(sep for sep in seps if sep))
  unique_seps.sort(key=len, reverse=True)

  if not unique_seps:
    # Nothing to split on.
    return [string] if string else []

  # Escape each separator: the original character class broke on '-', '^', ']' and similar.
  alternatives = '|'.join(re.escape(sep) for sep in unique_seps)
  regex = re.compile(f'(?:{alternatives})+') # one or more consecutive separators
  return [token for token in regex.split(string) if token] # drop empty tokens

# EXAMPLE #################################################################################################################################
tokenize(M[0], separadores)

## NOTE: separators that are multi-character strings, or characters that are special inside a
## regex character class (e.g. ']', '^', '-', '\'), need special handling. None of them occur in the sample separators.

['O',
 'peã',
 'e',
 'o',
 'caval',
 'são',
 'pec',
 'de',
 'xadrez',
 'O',
 'caval',
 'é',
 'o',
 'melhor',
 'do',
 'jog']

In [0]:
# NORMALIZAÇÃO DOS TERMOS E ELIMINAÇÃO DE STOPWORDS #######################################################################################
def remove_stopwords(words, stopwords):
  """Lowercase every word and drop the ones that are stopwords.

  Args:
    words: Iterable of token strings.
    stopwords: Iterable of stopword strings. The comparison is
      case-insensitive: stopwords are lowercased as well, so an entry such
      as ``'De'`` still filters the token ``'de'``.

  Returns:
    New list with the lowercased, non-stopword tokens in their original
    order. ``words`` is not modified.
  """
  # A set gives O(1) membership tests; lowercasing mirrors what is done to the words.
  blocked = {stopword.lower() for stopword in stopwords}
  lowered = (word.lower() for word in words)
  return [word for word in lowered if word not in blocked]

# EXAMPLE #################################################################################################################################
# Tokenize the first sample document, then normalize it and drop its stopwords.
tokens_0 = tokenize(M[0], separadores)
remove_stopwords(tokens_0, stopwords)

['peã', 'caval', 'pec', 'xadrez', 'caval', 'melhor', 'jog']

In [0]:
# CRIAÇÃO DA MATRIZ DE INCIDÊNCIAS COM FREQUÊNCIA #########################################################################################
def incidence_matrix(tokenized_docs, alphabet = None):
  """Build the term-frequency incidence matrix of a document collection.

  Args:
    tokenized_docs: Iterable of documents, each one an iterable of tokens.
    alphabet: Optional sequence of terms to use as columns. When ``None``
      the columns are the distinct tokens of the collection, in order of
      first appearance.

  Returns:
    ``pandas.DataFrame`` of integer counts with one row per document
    (labelled ``doc0``, ``doc1``, ...) and one column per term. Tokens that
    are not in ``alphabet`` are ignored instead of raising ``KeyError``.
  """
  # Materialize once so generators survive both the vocabulary pass and the counting pass.
  tokenized_docs = [list(doc) for doc in tokenized_docs]

  if alphabet is None:
    # dict.fromkeys keeps first-seen order and removes duplicates in linear time.
    alphabet = list(dict.fromkeys(word for doc in tokenized_docs for word in doc))
  else:
    alphabet = list(alphabet)

  column_of = {term: position for position, term in enumerate(alphabet)}

  # Count into a single array and build the frame once (DataFrame.append no longer exists in pandas 2).
  counts = np.zeros((len(tokenized_docs), len(alphabet)), dtype=int)
  for i, doc in enumerate(tokenized_docs):
    for word in doc:
      j = column_of.get(word)
      if j is not None: # out-of-vocabulary tokens are skipped
        counts[i, j] += 1

  doc_names = [f'doc{i}' for i in range(len(tokenized_docs))]
  return pd.DataFrame(counts, index=doc_names, columns=alphabet)

# EXAMPLE #################################################################################################################################
# Pre-process every sample document (tokenize, then lowercase and drop stopwords) and count its terms.
tokenized_docs = [tokenize(doc, separadores) for doc in M]
clean_tokens = [remove_stopwords(tokens, stopwords) for tokens in tokenized_docs]
incidence_matrix(clean_tokens)

Unnamed: 0,peã,caval,pec,xadrez,melhor,jog,envolv,torr,rei,lac,boi,rodei,polic
doc0,1,2,1,1,1,1,0,0,0,0,0,0,0
doc1,1,0,0,0,0,1,1,1,1,0,0,0,0
doc2,1,0,0,0,0,0,0,0,0,1,1,0,0
doc3,0,1,0,0,0,0,0,0,0,0,0,1,0
doc4,0,0,0,1,0,1,0,0,0,0,0,0,1


In [0]:
# MODELO BOOLEANO #########################################################################################################################
def boolean_model(docs, stopwords, query, seps):
  """Evaluate a query against the documents with the boolean model.

  Documents and query go through the same pre-processing (tokenization,
  lowercasing, stopword removal) before being compared.

  Args:
    docs: List of document strings.
    stopwords: Stopwords to drop from documents and query.
    query: Query string; its terms are combined with AND and with OR.
    seps: Separators handed to ``tokenize``.

  Returns:
    Dict with two lists of document names (the incidence-matrix row labels):
    ``'AND'`` holds the documents containing every query term and ``'OR'``
    the documents containing at least one. A query term that occurs in no
    document makes ``'AND'`` empty and is ignored by ``'OR'``. An empty
    query matches every document under AND and none under OR.
  """
  # Pre-processing of the documents and of the query:
  tokens_query = tokenize(query, seps)
  tokens_docs = [tokenize(doc, seps) for doc in docs]

  clean_query = remove_stopwords(tokens_query, stopwords)
  clean_docs = [remove_stopwords(tokens, stopwords) for tokens in tokens_docs]

  # Term-frequency incidence matrix of the collection:
  matrix = incidence_matrix(clean_docs)

  # Only terms in the vocabulary can be looked up; indexing any other term would raise KeyError.
  vocabulary = set(matrix.columns)
  query_terms = list(dict.fromkeys(clean_query))
  known_terms = [term for term in query_terms if term in vocabulary]
  every_term_known = len(known_terms) == len(query_terms)

  boolean = {'AND': [], 'OR': []}

  # Single pass over the rows (one row per document) answers both operators.
  for name, row in matrix.iterrows():
    hits = [row[term] != 0 for term in known_terms]
    if every_term_known and all(hits):
      boolean['AND'].append(name)
    if any(hits):
      boolean['OR'].append(name)

  return boolean

# EXAMPLE #################################################################################################################################
# Run the boolean model for the sample query over the sample documents.
boolean_model(M, stopwords, q, separadores)

{'AND': [], 'OR': ['doc0', 'doc1', 'doc2', 'doc3', 'doc4']}

## Exercício 5: Modelo Vetorial (com ponderação TF-IDF)

In [0]:
# TERM FREQUENCY ##########################################################################################################################
def get_tf(matrix):
  """Compute the log-scaled term frequency of every cell.

  Each raw count ``f`` becomes ``1 + log2(f)``; cells whose count is 0 stay 0.

  Args:
    matrix: ``pandas.DataFrame`` of raw term counts (documents x terms).

  Returns:
    New float ``DataFrame`` with the same labels. ``matrix`` is not modified.
  """
  counts = matrix.astype(float)
  present = counts != 0
  # Swap the zeros for 1 before taking the log so that log2(0) = -inf is never computed.
  log_tf = 1 + np.log2(counts.where(present, 1.0))
  # Put the zeros back where the term does not occur.
  return log_tf.where(present, 0.0)

# EXAMPLE #################################################################################################################################
# IM holds the raw term counts of the sample documents; later example cells call get_idf(IM) and get_tf(IM).
IM = incidence_matrix(clean_tokens)
get_tf(IM)

Unnamed: 0,peã,caval,pec,xadrez,melhor,jog,envolv,torr,rei,lac,boi,rodei,polic
doc0,1.0,2.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
doc1,1.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0
doc2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
doc3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
doc4,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [0]:
# INVERSE DOCUMENT FREQUENCY ##############################################################################################################
def get_idf(matrix):
  """Compute the inverse document frequency of every term.

  For a term that occurs in ``n`` of the ``N`` documents the value is
  ``log2(N / n)``.

  Args:
    matrix: ``pandas.DataFrame`` of term counts (documents x terms).

  Returns:
    ``pandas.Series`` indexed by term. Terms that occur in no document get
    an IDF of 0 instead of triggering a division by zero.
  """
  N = matrix.shape[0] # total number of documents
  doc_freq = matrix.astype(bool).sum().astype(float) # documents with a non-zero count, per column
  # Hide zero frequencies behind NaN so the division and the log never see a 0.
  safe_freq = doc_freq.where(doc_freq > 0)
  return np.log2(N / safe_freq).fillna(0.0)

# EXAMPLE #################################################################################################################################
# Inverse document frequency of each term of the sample collection.
get_idf(IM)

peã       0.736966
caval     1.321928
pec       2.321928
xadrez    1.321928
melhor    2.321928
jog       0.736966
envolv    2.321928
torr      2.321928
rei       2.321928
lac       2.321928
boi       2.321928
rodei     2.321928
polic     2.321928
dtype: float64

In [0]:
def get_tf_idf(TF, IDF):
  """Weight a term-frequency matrix by the inverse document frequencies.

  Args:
    TF: ``pandas.DataFrame`` of term frequencies (documents x terms).
    IDF: ``pandas.Series`` of inverse document frequencies, indexed by term.

  Returns:
    ``DataFrame`` with the product of each TF cell and the IDF of its
    column; cells whose TF is 0 (or whose product is undefined) are 0.
  """
  # Hide the zero frequencies behind NaN so they take no part in the product...
  masked_tf = TF.mask(TF == 0)
  weights = masked_tf.multiply(IDF)
  # ...and turn every NaN back into 0 afterwards.
  return weights.fillna(0)

# EXAMPLE #################################################################################################################################
# TF-IDF weights of the sample collection.
get_tf_idf(get_tf(IM), get_idf(IM))

Unnamed: 0,peã,caval,pec,xadrez,melhor,jog,envolv,torr,rei,lac,boi,rodei,polic
doc0,0.736966,2.643856,2.321928,1.321928,2.321928,0.736966,0.0,0.0,0.0,0.0,0.0,0.0,0.0
doc1,0.736966,0.0,0.0,0.0,0.0,0.736966,2.321928,2.321928,2.321928,0.0,0.0,0.0,0.0
doc2,0.736966,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.321928,2.321928,0.0,0.0
doc3,0.0,1.321928,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.321928,0.0
doc4,0.0,0.0,0.0,1.321928,0.0,0.736966,0.0,0.0,0.0,0.0,0.0,0.0,2.321928


In [0]:
def vector_space_model(docs, stopwords, query, seps):
  """Score every document against the query with the TF-IDF vector space model.

  Documents and query go through the same pre-processing (tokenization,
  lowercasing, stopword removal). Both are then weighted with TF-IDF, using
  the IDF of the document collection, and compared by cosine similarity.

  Args:
    docs: List of document strings.
    stopwords: Stopwords to drop from documents and query.
    query: Query string.
    seps: Separators handed to ``tokenize``.

  Returns:
    Dict mapping each document name (the incidence-matrix row label) to its
    cosine similarity with the query, as a ``float``. Query terms that occur
    in no document are ignored, and the similarity is 0.0 whenever the
    document or the query vector has zero norm.
  """
  # Pre-processing of the documents and of the query:
  tokens_query = tokenize(query, seps)
  tokens_docs = [tokenize(doc, seps) for doc in docs]

  clean_query = remove_stopwords(tokens_query, stopwords)
  clean_docs = [remove_stopwords(tokens, stopwords) for tokens in tokens_docs]

  # Term-frequency incidence matrices; the query reuses the vocabulary of the documents.
  IM_docs = incidence_matrix(clean_docs)
  vocabulary = set(IM_docs.columns)
  # Out-of-vocabulary terms have no column (and no IDF), so they are dropped up front.
  known_query = [term for term in clean_query if term in vocabulary]
  IM_query = incidence_matrix([known_query], alphabet = IM_docs.columns)

  # Log-scaled term frequencies:
  TF = get_tf(IM_docs)
  TF_query = get_tf(IM_query)

  # Inverse document frequency, always taken from the document collection:
  IDF = get_idf(IM_docs)

  # TF-IDF weighting; the query matrix has a single row, taken by position as a vector.
  w = get_tf_idf(TF, IDF).astype(float)
  w_query = get_tf_idf(TF_query, IDF).astype(float).iloc[0]

  def sim(w1, w2):
    # Cosine similarity; a zero-norm vector has no direction, so it scores 0 instead of NaN.
    denominator = np.linalg.norm(w1) * np.linalg.norm(w2)
    if denominator == 0:
      return 0.0
    return float(w1.dot(w2) / denominator)

  rank = {}
  for name, vector in w.iterrows():
    rank[name] = sim(vector, w_query)

  return rank

# EXAMPLE: cosine similarity between each sample document and the sample query.
vector_space_model(M, stopwords, q, separadores)


{'doc0': 0.415053375730601,
 'doc1': 0.4651729931620071,
 'doc2': 0.052555274134206874,
 'doc3': 0.21298960013595078,
 'doc4': 0.20532236528436032}