<a href="https://colab.research.google.com/github/pierfrancescomartinello/NLP-Project/blob/main/rag.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! pip install farm-haystack[colab,faiss]==1.17.2



In [2]:
import logging

logging.basicConfig(format="%(levelname)s - %(name)s -  %(message)s", level=logging.WARNING)
logging.getLogger("haystack").setLevel(logging.INFO)

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import pandas as pd
import numpy as np

filepath = "/content/drive/MyDrive/output240615145009.json"
df = pd.read_json(filepath)
df.columns = ["title", "addr", "text"]

In [5]:
df["text"] = df["text"].apply(lambda x: x.strip())
df["text"] = df["text"].replace("", np.nan)
df.dropna(subset=["text"], inplace=True)

df.reset_index(inplace=True, drop=True)

df

Unnamed: 0,title,addr,text
0,Biblioteca Biomedica di Medicina Interna e Spe...,https://www.unipa.it/biblioteche/Biblioteca-Bi...,10-giu-2024 Si comunica la CHIUSURA ANTICIPATA...
1,Supporto alla ricerca | Università degli Studi...,https://www.unipa.it/biblioteche/scopri-i-serv...,Le biblioteche universitarie sono sempre più c...
2,Fatturazione elettronica | Università degli St...,https://www.unipa.it/target/imprese/informazio...,Il D.M 55 del 3 aprile 2013 prevede l'obbligo ...
3,"Accordo tra Unipa e Bi-Rex, concessi i locali ...",https://www.unipa.it/servizi/resocontisedutesa...,"29-mag-2024 Il Consorzio Bi-Rex, Big Data Inno..."
4,Metodologia delle ricerca e prevenzione del pl...,https://www.unipa.it/strutture/cimdu/Metodolog...,30-mag-2024 Nell'ambito del progetto pilota pe...
...,...,...,...
2273,Graduatorie di accesso - Dottorati XXXV Ciclo ...,https://www.unipa.it/didattica/dottorati/dotto...,L'immatricolazione ai corsi di dottorato si po...
2274,Academic Writing and oral skills for PhD stude...,https://www.unipa.it/didattica/dottorati/Acade...,10-mar-2016 Academic Writing and oral skills -...
2275,Bando di accesso ciclo 37 | Università degli S...,https://www.unipa.it/didattica/dottorati/dotto...,Pubblicazione: 14 giugno 2021 ore 23:59:59 (or...
2276,Cotutela | Università degli Studi di Palermo,https://www.unipa.it/didattica/dottorati/dotto...,La co-tutela di tesi di dottorato rappresenta ...


In [6]:
from haystack import Document

# Use data to initialize Document objects
titles = list(df["title"].values)
texts = list(df["text"].values)
documents = []
for title, text in zip(titles, texts):
    documents.append(Document(content=text, meta={"name": title or ""}))


In [7]:
from haystack.document_stores import FAISSDocumentStore

document_store = FAISSDocumentStore(faiss_index_factory_str="Flat", return_embedding=True)


In [8]:
from haystack.nodes import RAGenerator, DensePassageRetriever

retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    use_gpu=True,
    embed_title=True,
)


INFO:haystack.modeling.utils:Using devices: CUDA:0 - Number of GPUs: 1
  return self.fget.__get__(instance, owner)()
INFO:haystack.modeling.model.language_model:Auto-detected model language: english
INFO:haystack.modeling.model.language_model:Auto-detected model language: english


In [9]:
generator = RAGenerator(
    model_name_or_path="facebook/rag-token-nq",
    use_gpu=True,
    top_k=1,
    max_length=200,
    min_length=2,
    embed_title=True,
    num_beams=3,
)


INFO:haystack.modeling.utils:Using devices: CUDA:0 - Number of GPUs: 1
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizer'.
The tokenizer class you load from this 

In [10]:
# Delete existing documents in documents store
document_store.delete_documents()

# Write documents to document store
document_store.write_documents(documents)

# Add documents embeddings to index
document_store.update_embeddings(retriever=retriever)


Writing Documents:   0%|          | 0/2278 [00:00<?, ?it/s]

INFO:haystack.document_stores.faiss:Updating embeddings for 2019 docs...


Updating Embedding:   0%|          | 0/2019 [00:00<?, ? docs/s]

Create embeddings:   0%|          | 0/2032 [00:00<?, ? Docs/s]

In [11]:
from haystack.pipelines import GenerativeQAPipeline

pipe = GenerativeQAPipeline(generator=generator, retriever=retriever)


In [29]:
# Use a pipeline as a high-level helper
from transformers import pipeline

mt_pipe = pipeline("translation", model="Helsinki-NLP/opus-mt-it-en", framework="pt", max_length=70000,)

In [30]:
mt_pipe(df.iloc[0]["text"])[0]["translation_text"]

'10-Jun-2024 The EARLY CLOSURE is announced at 13:00 on 18/06/2024 of the Library for institutional commitments.'

In [31]:
for i, row in df.iterrows():
  print(i, mt_pipe(df.iloc[i]["text"])[0]["translation_text"])

Token indices sequence length is longer than the specified maximum sequence length for this model (1044 > 512). Running this sequence through the model will result in indexing errors


0 10-Jun-2024 The EARLY CLOSURE is announced at 13:00 on 18/06/2024 of the Library for institutional commitments.


IndexError: index out of range in self

In [24]:
df["text"].apply(lambda x: len(x)).sort_values()

2083        10
741         10
1818        10
1826        10
1016        10
         ...  
187      53528
2201     64470
956      65905
439      73664
974     175758
Name: text, Length: 2278, dtype: int64

In [None]:
from haystack.utils import print_answers

QUESTIONS = [
    # "chi ha fondato Unipa?",
    # "who founded Unipa?",
    # "quando è stata fondata Unipa?",
    # "when was Unipa founded?",
    # "cos'è Data Algorithms and Machine Intelligence",
    # "what is è Data Algorithms and Machine Intelligence",
    # "cos'è il phishing?",
    # "cosa raccoglie la pagina?",
    "a cosa serve la carta europea?",
    "stazione",
    "biblioteche universitarie",
    "corsi di laurea di unipa"

]

for question in QUESTIONS:
    res = pipe.run(query=question, params={"Generator": {"top_k": 3}, "Retriever": {"top_k": 10}})
    print_answers(res, details="medium")
    print()
