<a href="https://colab.research.google.com/github/pierfrancescomartinello/NLP-Project/blob/main/rag.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
! pip install farm-haystack[colab,faiss]==1.17.2

Collecting farm-haystack[colab,faiss]==1.17.2
  Downloading farm_haystack-1.17.2-py3-none-any.whl (719 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m719.5/719.5 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting azure-ai-formrecognizer>=3.2.0b2 (from farm-haystack[colab,faiss]==1.17.2)
  Downloading azure_ai_formrecognizer-3.3.3-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m301.4/301.4 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting boilerpy3 (from farm-haystack[colab,faiss]==1.17.2)
  Downloading boilerpy3-1.0.7-py3-none-any.whl (22 kB)
Collecting canals==0.2.2 (from farm-haystack[colab,faiss]==1.17.2)
  Downloading canals-0.2.2-py3-none-any.whl (31 kB)
Collecting dill (from farm-haystack[colab,faiss]==1.17.2)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollec

In [None]:
import logging

# Root logger: WARNING and above, formatted as "<level> - <logger name> -  <message>".
logging.basicConfig(
    level=logging.WARNING,
    format="%(levelname)s - %(name)s -  %(message)s",
)
# The "haystack" logger is lowered to INFO so its messages are shown.
logging.getLogger("haystack").setLevel(logging.INFO)

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; the data file is read from there.
drive.mount("/content/drive")

In [None]:
import pandas as pd
import numpy as np

# Read the JSON file from the mounted Drive and label its three columns
# positionally as title / addr / text.
filepath = "/content/drive/MyDrive/output240614160947.json"
df = pd.read_json(filepath).set_axis(["title", "addr", "text"], axis="columns")

In [None]:
# Normalise the "text" column and discard rows without usable text.
# `.str.strip()` replaces `.apply(lambda x: x.strip())`: a missing (None/NaN)
# entry now propagates as NaN instead of raising AttributeError, and is dropped
# together with the rows whose text is empty after stripping.
# Built as one chained expression (no in-place mutation), so re-running the
# cell is safe.
df = (
    df.assign(text=df["text"].str.strip().replace("", np.nan))
    .dropna(subset=["text"])
    .reset_index(drop=True)  # contiguous 0..n-1 index after dropping rows
)

df

In [None]:
from haystack import Document

# Build one haystack Document per row: the text becomes the content and the
# title is stored under meta["name"].
# A missing title may be None or NaN. NaN is a truthy float, so `title or ""`
# alone would pass it through; both cases are normalised to "" here.
# (Titles are assumed to be scalars, as pd.isna is applied element by element.)
titles = list(df["title"].values)
texts = list(df["text"].values)
documents = [
    Document(content=text, meta={"name": "" if pd.isna(title) else (title or "")})
    for title, text in zip(titles, texts)
]


In [None]:
from haystack.document_stores import FAISSDocumentStore

# FAISS-backed document store built with the "Flat" index factory string;
# return_embedding=True asks the store to hand embeddings back with documents.
document_store = FAISSDocumentStore(
    return_embedding=True,
    faiss_index_factory_str="Flat",
)


In [None]:
from haystack.nodes import RAGenerator, DensePassageRetriever

# Dense Passage Retriever over the document store, with separate pretrained
# encoders for passages and for queries.
retriever = DensePassageRetriever(
    document_store=document_store,
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    embed_title=True,
    use_gpu=True,
)


In [None]:
# RAG generator based on the "facebook/rag-token-nq" checkpoint.
generator = RAGenerator(
    model_name_or_path="facebook/rag-token-nq",
    embed_title=True,
    use_gpu=True,
    # Generation settings: one answer, 2-beam search, length bounded to [2, 200].
    top_k=1,
    num_beams=2,
    min_length=2,
    max_length=200,
)


In [None]:
# Indexing, in three order-dependent steps.

# 1. Delete any documents already present in the document store.
document_store.delete_documents()

# 2. Write the Document objects built earlier into the document store.
document_store.write_documents(documents)

# 3. Compute document embeddings with the retriever and add them to the index.
document_store.update_embeddings(retriever=retriever)


In [None]:
from haystack.pipelines import GenerativeQAPipeline

# Combine the retriever and the generator into a single generative QA pipeline.
pipe = GenerativeQAPipeline(
    retriever=retriever,
    generator=generator,
)


In [None]:
from haystack.utils import print_answers

# Queries sent through the pipeline. Only the first one is enabled; the others
# are alternative queries kept disabled.
QUESTIONS = [
    "chi ha fondato Unipa?",
    # "who founded Unipa?",
    # "quando è stata fondata Unipa?",
    # "when was Unipa founded?",
    # "cos'è Data Algorithms and Machine Intelligence",
    # "what is Data Algorithms and Machine Intelligence",
    # "cos'è il phishing?",
    # "cosa raccoglie la pagina?",
    # "carta europea?",
    # "stazione",
]

# For each query: retriever top_k=10, generator top_k=1, then print the result
# at "medium" detail.
for query in QUESTIONS:
    result = pipe.run(
        query=query,
        params={"Generator": {"top_k": 1}, "Retriever": {"top_k": 10}},
    )
    print_answers(result, details="medium")
