<a href="https://colab.research.google.com/github/pedrohcosta/deeplearning/blob/main/Exerc%C3%ADcio_Aluno_Especial.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### 1) Leitura do arquivo CISI e transformação em um DataFrame

In [193]:
import pandas as pd


def readFileToDataFrame(file, encoding=None):
    """Parse a tagged text file (CISI layout) into a DataFrame.

    A line whose first character is "." opens a new field; the lines that
    follow it, up to the next "." line, are continuation text and are joined
    to that field with single spaces. Four tags are kept:

    * ``.I`` -> column "ID"
    * ``.T`` -> column "Title"
    * ``.A`` -> column "Author"
    * ``.W`` -> column "Abstract" (also closes the record)

    Every other tag is ignored. Values are kept as strings, with the tag and
    the character after it removed.

    Parameters
    ----------
    file : path of the file to read (handed to ``open``).
    encoding : optional text encoding handed to ``open``. ``None`` keeps the
        platform default.

    Returns
    -------
    pandas.DataFrame
        One row per record that contains a ``.W`` field.
    """
    with open(file, encoding=encoding) as f:
        # Collect the pieces in a list and join once instead of growing a
        # string inside the loop.
        pieces = []
        for raw_line in f:
            separator = "\n" if raw_line.startswith(".") else " "
            pieces.append(separator + raw_line.strip())

    fields = "".join(pieces).lstrip("\n").split("\n")

    tag_to_column = {".I": "ID", ".T": "Title", ".A": "Author", ".W": "Abstract"}
    docs_list = []
    record = {}
    for field in fields:
        tag = field[:2]
        if tag == ".I":
            # A new record starts here: discard anything left over from a
            # previous record that never reached its ".W" field, so its
            # title/author cannot leak into this one.
            record = {}
        column = tag_to_column.get(tag)
        if column is None:
            continue
        record[column] = field[3:]
        if tag == ".W":
            docs_list.append(record)
            record = {}

    return pd.DataFrame(docs_list)

# Load the CISI collection; the relative path is resolved against the
# notebook's working directory.
file_name = "CISI.ALL"
df = readFileToDataFrame(file_name)
df

Unnamed: 0,ID,Title,Author,Abstract
0,1,18 Editions of the Dewey Decimal Classifications,"Comaromi, J.P.",The present study is a history of the DEWEY De...
1,2,Use Made of Technical Libraries,"Slater, M.",This report is an analysis of 6300 acts of use...
2,3,Two Kinds of Power An Essay on Bibliographic C...,"Wilson, P.",The relationships between the organization and...
3,4,Systems Analysis of a University Library; fina...,"Buckland, M.K.",The establishment of nine new universities in ...
4,5,A Library Management Game: a report on a resea...,"Brophy, P.",Although the use of games in professional educ...
...,...,...,...,...
1455,1456,World Dynamics,"Forrester, J.W.",Over the last several decades interest in econ...
1456,1457,World Trends in Library Education,"Bramley, G.",One of the most significant aspects of the evo...
1457,1458,Legal Restrictions on Exploitation of the Pate...,"Baxter, W.A.",The patent laws confer on a patentee power to ...
1458,1459,Language and Thought,"Poluskin, V.A.",This book considers the basic aspects of this ...


### 2) Implementação do BM25 com ajuda do ChatGPT

In [194]:
import math

class BM25:
    """BM25 ranking over a tokenized corpus.

    Parameters
    ----------
    corpus : sized sequence of documents, each document being a sized
        sequence of tokens.
    k1 : term-frequency saturation parameter (default 1.5).
    b : document-length normalization parameter (default 0.75).
    epsilon : minimum score a document must exceed to be returned by
        ``search`` (default 0.25).

    Attributes
    ----------
    corpus_size : number of documents.
    avgdl : average document length in tokens (0.0 for an empty corpus).
    f : list with one ``{token: count}`` dict per document.
    df : ``{token: number of documents containing it}``.
    idf : ``{token: inverse document frequency}``.
    """

    def __init__(self, corpus, k1=1.5, b=0.75, epsilon=0.25):
        self.corpus_size = len(corpus)
        total_length = sum(len(doc) for doc in corpus)
        # An empty corpus has no meaningful average length; use 0.0 instead
        # of dividing by zero.
        self.avgdl = total_length / self.corpus_size if self.corpus_size else 0.0
        self.corpus = corpus
        self.f = []
        self.df = {}
        self.idf = {}
        self.k1 = k1
        self.b = b
        self.epsilon = epsilon
        self.build()

    def build(self):
        """(Re)compute term, document and inverse document frequencies."""
        # Start from a clean slate so that calling build() again does not
        # double-count the corpus.
        self.f = []
        self.df = {}
        self.idf = {}

        # Term frequency per document and document frequency per token.
        for document in self.corpus:
            frequencies = {}
            for word in document:
                frequencies[word] = frequencies.get(word, 0) + 1
            self.f.append(frequencies)
            for word in frequencies:
                self.df[word] = self.df.get(word, 0) + 1

        # Inverse document frequency. Note: this form is negative for a token
        # that occurs in more than half of the documents.
        for word, freq in self.df.items():
            self.idf[word] = math.log((self.corpus_size - freq + 0.5) / (freq + 0.5))

    def get_score(self, query, index):
        """Return the BM25 score of document ``index`` for ``query``.

        ``query`` is an iterable of tokens; tokens absent from the document
        contribute nothing. Returns 0 when no query token occurs in it.
        """
        score = 0
        doc_freq = self.f[index]
        if not doc_freq:
            # Empty document: nothing can match (and avgdl may be 0).
            return score
        doc_len = len(self.corpus[index])

        # The length-normalization term depends only on the document, so it
        # is computed once rather than once per query token.
        length_norm = self.k1 * (1 - self.b +
                                 self.b * (doc_len / self.avgdl))

        for word in query:
            if word not in doc_freq:
                continue
            term_freq = doc_freq[word]
            idf = self.idf[word]
            numerator = (term_freq * (self.k1 + 1))
            denominator = (term_freq + length_norm)
            score += (idf * (numerator / denominator))

        return score

    def search(self, query):
        """Rank all documents against ``query``.

        ``query`` is either a string (split on whitespace) or an iterable of
        tokens. Returns a list of ``(document_index, score)`` tuples sorted by
        descending score, keeping only scores strictly greater than
        ``self.epsilon``.
        """
        terms = query.split() if isinstance(query, str) else list(query)

        scored = [(index, self.get_score(terms, index))
                  for index in range(self.corpus_size)]

        # Filtering before the (stable) sort yields the same order as sorting
        # everything first, while sorting fewer items.
        results = [(index, score) for index, score in scored
                   if score > self.epsilon]
        results.sort(key=lambda hit: hit[1], reverse=True)
        return results


### 3) Busca de documentos relevantes

In [196]:
def search_docs(query, quantity=10):
    """Return the best-scoring documents of the global ``df`` for ``query``.

    The "Abstract" column of ``df`` is tokenized on single spaces, indexed
    with ``BM25`` and searched; the index is rebuilt on every call.

    Parameters
    ----------
    query : query text passed to ``BM25.search``.
    quantity : maximum number of rows to return (default 10).

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``df`` in descending score order, with an extra
        "Score" column. ``df`` itself is not modified.
    """
    tokenized_corpus = [doc.split(" ") for doc in df["Abstract"]]

    hits = BM25(tokenized_corpus).search(query)[:quantity]
    positions = [position for position, _ in hits]
    scores = [score for _, score in hits]

    # BM25 reports positions within the corpus, not index labels, so rows are
    # picked positionally; this stays correct when df has a non-default index
    # and guarantees rows and scores line up one-to-one.
    return df.iloc[positions].assign(Score=scores)

# Usage example: rank the collection for a two-word query (default top 10).
query = "information retrieval"
search_docs(query)

Unnamed: 0,ID,Title,Author,Abstract,Score
538,539,Information Retrieval Languages,"Moskovich, V.A.",This book gives classification and detailed de...,4.931489
508,509,The Use of Hierarchic Clustering in Informatio...,"Jardine, N. Van Rijsbergen, C. J.",We introduce information retrieval strategies ...,4.465591
1135,1136,Data Retrieval Systems: Specifics and Problems,"Shtein, V. S.",The essential differences between data retriev...,4.40096
894,895,Design Equations for Retrieval System Based on...,"Heine, M. H.",Swets's theory of information retrieval allows...,4.291098
460,461,Information Retrieval and Processing,"Doyle, L.B.",The present book embodies a change in structur...,4.229659
1080,1081,Generalization of Epidemic Theory An Applicati...,"Goffman, W. Newill, V.A.",One of the most fundamental problems in the fi...,4.086881
850,851,Bibliographic Retrieval from Bibliographic Inp...,"Ruecking, Frederick H. Jr.",A study of problems associated with bibliograp...,4.054105
444,445,A Definition of Relevance for Information Retr...,"Cooper, W.S.","The concept of ""relevance"", sometimes also cal...",3.931236
1124,1125,A Contribution to the Theory of the Systems of...,"Kozachkov, L. S.",Certain structural properties of information d...,3.926227
174,175,"Automatic Information, Organization and Retrieval","Salton, G.",Information retrieval is a field concerned wit...,3.922867
