In [67]:
import torch
# Seed PyTorch's random number generator so randomly initialised tensors
# (e.g. freshly constructed layers) are the same on every run.
torch.manual_seed(0)
# Restrict cuDNN to deterministic algorithms and turn off its benchmark
# (auto-tuning) mode, which may otherwise pick different kernels between runs.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False


In [72]:

import pandas as pd

import numpy as np
import torch.nn as nn

from my_nlp_library import MyTokenizer
from sklearn.preprocessing import normalize


# Load the GloVe embeddings
def load_glove_vectors(glove_file):
    """Read a GloVe text file into a ``{word: vector}`` dictionary.

    Each non-blank line is split on whitespace: the first field is the word
    and the remaining fields are parsed as the float components of its vector.

    Args:
        glove_file: Path to a UTF-8 encoded GloVe text file.

    Returns:
        dict mapping each word to a 1-D ``torch.float32`` tensor, in file order.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a vector component is not a valid float.
    """
    glove_vectors = {}
    with open(glove_file, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            # A blank or whitespace-only line (e.g. a trailing newline at the
            # end of the file) has no word field; skip it instead of crashing
            # with an IndexError on values[0].
            if not values:
                continue
            word = values[0]
            vector = torch.tensor([float(val) for val in values[1:]], dtype=torch.float32)
            glove_vectors[word] = vector
    return glove_vectors


def get_vocabulary_from_glove(glove_vectors):
    """Derive word/index lookup tables from a GloVe dictionary.

    Index 0 is reserved for ``"<PAD>"`` and index 1 for ``"<UNK>"``; the GloVe
    words follow from index 2 onwards, in the iteration order of
    ``glove_vectors``.

    Args:
        glove_vectors: Mapping whose keys are the vocabulary words.

    Returns:
        A ``(vocab, inverse_vocab)`` tuple where ``vocab`` maps word -> index
        and ``inverse_vocab`` is the list mapping index -> word.
    """
    # Special tokens first, then every GloVe word in its original order.
    inverse_vocab = ["<PAD>", "<UNK>"]
    inverse_vocab.extend(token for token, _ in glove_vectors.items())
    # The position in the list is the token's index.
    vocab = {token: position for position, token in enumerate(inverse_vocab)}
    return vocab, inverse_vocab



class Glove():
    """GloVe word vectors with their vocabulary and a matching embedding layer.

    Attributes:
        glove_vectors: ``{word: 1-D float tensor}`` as read from the GloVe file.
        vocab: word -> index mapping (0 is ``<PAD>``, 1 is ``<UNK>``).
        inverse_vocab: index -> word list.
        embedding_dim: Length of each word vector.
        embeddings_matrix: ``nn.Embedding`` whose rows 2.. hold the GloVe vectors.
    """

    def __init__(self, glove_file="data/glove.6B.300d.txt"):
        """Load the vectors from ``glove_file`` and build the embedding layer.

        Args:
            glove_file: Path to the GloVe text file. Defaults to the 300-d
                6B file used by this notebook.
        """
        self.glove_vectors = load_glove_vectors(glove_file)
        vocab, inverse_vocab = get_vocabulary_from_glove(self.glove_vectors)
        self.vocab = vocab
        self.inverse_vocab = inverse_vocab
        # Take the dimensionality from the data so other GloVe files work too;
        # fall back to 300 (the default file's size) when nothing was loaded.
        first_vector = next(iter(self.glove_vectors.values()), None)
        self.embedding_dim = 300 if first_vector is None else len(first_vector)
        self.embeddings_matrix = self.create_embedding_matrix()
        print("Glove embeddings loaded.")

    def create_embedding_matrix(self):
        """Build an ``nn.Embedding`` initialised with the GloVe vectors.

        Rows 0 (``<PAD>``) and 1 (``<UNK>``) keep ``nn.Embedding``'s default
        random initialisation; row ``i >= 2`` receives the vector of
        ``inverse_vocab[i]``.

        Returns:
            The populated ``nn.Embedding`` layer.
        """
        vocab_size = len(self.inverse_vocab)
        embedding_glove = nn.Embedding(vocab_size, self.embedding_dim)

        # Write into the parameter's own storage. Assigning `.data` on the
        # temporary returned by `weight[idx]` does not modify the parameter,
        # and the row index must include the offset of the two special tokens.
        # NOTE(review): artifacts produced while the rows were still random
        # (e.g. a model trained on them) may need regenerating - confirm.
        with torch.no_grad():
            for row, word in enumerate(self.inverse_vocab[2:], start=2):
                embedding_glove.weight[row].copy_(self.glove_vectors[word])
        return embedding_glove

    def get_embedding_matrix(self):
        """Return the embedding layer built by ``create_embedding_matrix``."""
        return self.embeddings_matrix

        


class PreparaTexto(nn.Module):
    """Turn raw text into a single vector by averaging its token embeddings.

    Args:
        tokenizer: Callable that converts the input into token ids.
            Presumably ids 0 and 1 are the ``<PAD>``/``<UNK>`` entries of the
            vocabulary built in this file - confirm against the tokenizer.
        embedding_layer: Module mapping token ids to embedding vectors.
    """

    def __init__(self, tokenizer, embedding_layer):
        super().__init__()
        self.tokenizer = tokenizer
        self.embedding_layer = embedding_layer

    def forward(self, x):
        """Return the mean embedding of every token whose id is greater than 1.

        If no token id is greater than 1 the mean is taken over an empty
        selection, so the result contains NaN.
        """
        token_ids = np.array(self.tokenizer(x))
        # Ids 0 and 1 are left out of the average.
        keep = token_ids > 1
        embedded = self.embedding_layer(torch.tensor(token_ids))
        return embedded[keep].mean(dim=0)
    

class Transformador(nn.Module):
    """Two stacked linear layers mapping ``n_inputs -> n_hidden -> n_inputs``.

    Args:
        n_inputs: Size of the input (and of the final output).
        n_hidden: Size of the intermediate representation.
    """

    def __init__(self, n_inputs, n_hidden):
        super().__init__()
        self.layer1 = nn.Linear(in_features=n_inputs, out_features=n_hidden)
        self.layer2 = nn.Linear(in_features=n_hidden, out_features=n_inputs)

    def forward(self, x):
        """Return ``(output, hidden)``: the second layer's output and the first layer's output."""
        hidden = self.layer1(x)
        return self.layer2(hidden), hidden
    
class Database:
    """Holds the document table and the stored embeddings loaded from ``data/``.

    Attributes:
        df: DataFrame read from ``data/arxiv3.csv``.
        embeddings_matrix: Object loaded from ``data/embeddings_matrix.pt``
            (presumably one embedding row per row of ``df`` - TODO confirm).
    """

    def __init__(self):
        csv_path = "data/arxiv3.csv"
        matrix_path = "data/embeddings_matrix.pt"
        self.df = pd.read_csv(csv_path)
        # weights_only=True restricts unpickling to tensors and plain containers.
        self.embeddings_matrix = torch.load(matrix_path, weights_only=True)


class FinderModel:
    """Ranks the documents in ``Database`` by cosine similarity to a text query.

    A query is tokenized, averaged into a GloVe vector by ``PreparaTexto``,
    projected by the first layer of ``Transformador`` and compared with the
    stored document embeddings.
    """

    def __init__(self):
        """Load the documents, GloVe vectors and trained weights from ``data/``."""
        # Document table plus the stored document embeddings.
        self.data = Database()
        self.glove = Glove()
        self.tokenizer = MyTokenizer(
            sentence_length=800,
            case_sensitive=False,
            vocab=self.glove.vocab,
            inverse_vocab=self.glove.inverse_vocab,
        )
        self.preparador = PreparaTexto(self.tokenizer, self.glove.embeddings_matrix)
        self.model = Transformador(300, 200)
        self.model.load_state_dict(torch.load("data/model.pth", weights_only=True))
        # The model is only used for inference here.
        self.model.eval()
        # The document matrix is the same for every query, so L2-normalise it
        # once here instead of on each predict() call.
        self._doc_embeddings_normalized = normalize(self.data.embeddings_matrix)
        print('Model loaded')

    def predict(self, query, top_k=10, verbose=True):
        """Return the documents most similar to ``query``.

        Args:
            query: Text to search for.
            top_k: Maximum number of rows to return (non-negative). Defaults to 10.
            verbose: When True (default), print the abstract of every
                returned row followed by a separator line.

        Returns:
            DataFrame with columns ``title``, ``abstract`` and ``relevance``,
            restricted to rows with relevance > 0 and sorted by descending
            relevance. It is empty when the query embedding is NaN, which
            happens when ``PreparaTexto`` finds no token id greater than 1.
        """
        result_columns = ['title', 'abstract', 'relevance']

        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            query_embedding = self.preparador(query)
            if torch.isnan(query_embedding).any():
                # Nothing to compare against; a NaN vector would make
                # normalize() raise, so return an empty result instead.
                return self.data.df.iloc[0:0].assign(relevance=0.0)[result_columns]
            # Element [1] of the model output is the hidden representation.
            query_hidden = self.model(query_embedding)[1]

        query_vector = normalize(query_hidden.detach().numpy().reshape(1, -1))

        # Rows and query are unit-length, so the dot product is the cosine similarity.
        relevance = (self._doc_embeddings_normalized @ query_vector.T).flatten()

        # Keep positively related documents and only the columns of interest.
        scored = self.data.df.assign(relevance=relevance)
        top_hits = (
            scored.loc[relevance > 0.00, result_columns]
            .sort_values("relevance", ascending=False)
            .head(top_k)
        )

        if verbose:
            for abstract in top_hits['abstract']:
                print(abstract)
                print('-----------------------------------')

        return top_hits



In [None]:
# Build the search engine once; construction loads the document table, the
# GloVe vectors and the trained model weights from data/, so reuse `finder`
# for every query below.
finder = FinderModel()

In [None]:
# Example query. predict() prints the top abstracts and returns the result
# DataFrame, which the notebook displays as the cell's last expression.
# NOTE(review): `json` is imported here but not used in this cell.
import json

finder.predict("neural network")

In [None]:

# Example query on a database topic.
finder.predict("relational database")

In [None]:

# Example query with a three-word phrase.
finder.predict("natural language processing")