In [137]:
# !pip install pymongo
# !pip install langchain
# !pip install langchain-community
# !pip install pypdf
# !pip install langchain-ollama
# !pip install openai
# !pip install tiktoken
# !pip install sentence-transformers
# !pip install faiss-cpu
# !pip install pandas
# !pip install openpyxl
# !pip install langchain-openai

Collecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-1.1.0-py3-none-any.whl.metadata (1.8 kB)
Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Downloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.1.5


In [164]:
# mongo-related
from pymongo import MongoClient
import pprint

# pdf loading/splitting
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.documents.base import Document

# ollama API
from langchain_ollama import OllamaEmbeddings

# operate on LAB LLM
# from langchain_community.chat_models import ChatOpenAI # DEPRECATED
from langchain_openai.chat_models import ChatOpenAI
from langchain_community.llms import OpenAI
from langchain_community.embeddings.openai import OpenAIEmbeddings
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

# vector store
from langchain_community.vectorstores import FAISS

# langchain stuff
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables.base import RunnableParallel

# general
import datetime
import numpy as np
import pandas as pd
import textwrap

# Extra
from operator import itemgetter
from typing import List

from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.messages import BaseMessage, AIMessage
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.runnables import (
    RunnableLambda,
    ConfigurableFieldSpec,
    RunnablePassthrough,
)
from langchain_core.runnables.history import RunnableWithMessageHistory

# Classes definition test (not implemented)

In [2]:
class MongoHandler(MongoClient):
    def __init__(self, host='localhost', port=27017):
        super().__init__(host, port)
        self.dbs = {self[key] for key in self.list_database_names()}

    def get_collections_from_database(self, db_name):
        db = self[db_name]
        return db.list_collection_names()
    
    def get_collection(self, db_name, collection_name):
        db = self[db_name]
        if collection_name in db.list_collection_names():
            return db[collection_name]  
        else: 
            raise ValueError(f"Collection {collection_name} not found in database {db_name}")
    

In [3]:
handler = MongoHandler()

In [6]:
handler.papers.papers_collection_test

Collection(Database(MongoHandler(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'papers'), 'papers_collection_test')

In [127]:
handler.get_collections_from_database('papers')

['papers_collection_test']

In [128]:
handler.get_collection('papers', 'papers_collection_test')

Collection(Database(MongoHandler(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'papers'), 'papers_collection_test')

# Load PDFs

In [4]:
loader = PyPDFDirectoryLoader("./files_pdf/")

docs_before_split = loader.load()
text_splitter = RecursiveCharacterTextSplitter(
    separators=[".", "\n\n"],
    chunk_size = 2000,
    chunk_overlap  = 200,
)
docs_after_split = text_splitter.split_documents(docs_before_split)

docs_after_split[0]

Document(metadata={'source': 'files_pdf\\1810.04805v2.pdf', 'page': 0}, page_content='BERT: Pre-training of Deep Bidirectional Transformers for\nLanguage Understanding\nJacob Devlin Ming-Wei Chang Kenton Lee Kristina Toutanova\nGoogle AI Language\n{jacobdevlin,mingweichang,kentonl,kristout }@google.com\nAbstract\nWe introduce a new language representa-\ntion model called BERT , which stands for\nBidirectional Encoder Representations from\nTransformers. Unlike recent language repre-\nsentation models (Peters et al., 2018a; Rad-\nford et al., 2018), BERT is designed to pre-\ntrain deep bidirectional representations from\nunlabeled text by jointly conditioning on both\nleft and right context in all layers. As a re-\nsult, the pre-trained BERT model can be ﬁne-\ntuned with just one additional output layer\nto create state-of-the-art models for a wide\nrange of tasks, such as question answering and\nlanguage inference, without substantial task-\nspeciﬁc architecture modiﬁcations.\nBERT is c

In [5]:
avg_doc_length = lambda docs: sum([len(doc.page_content) for doc in docs])//len(docs)
avg_char_before_split = avg_doc_length(docs_before_split)
avg_char_after_split = avg_doc_length(docs_after_split)

print(f'Before split, there were {len(docs_before_split)} documents loaded, with average characters equal to {avg_char_before_split}.')
print(f'After split, there were {len(docs_after_split)} documents (chunks), with average characters equal to {avg_char_after_split} (average chunk length).')

Before split, there were 16 documents loaded, with average characters equal to 4003.
After split, there were 41 documents (chunks), with average characters equal to 1638 (average chunk length).


# Push to Mongo

In [6]:
client = MongoClient('localhost', 27017)

In [7]:
client.list_database_names()

['admin', 'config', 'local', 'papers', 'test_db']

In [8]:
db = client.papers
collection = db.papers_collection_test

In [15]:
dicts_after_split = [chunk.model_dump() for chunk in docs_after_split]

!! per completezza si poteva inserire una funzione di "insert" nell'handler di Mongo definito sopra

In [75]:
collection.insert_many(dicts_after_split)

InsertManyResult([ObjectId('6717b904ee55f62c083a22d8'), ObjectId('6717b904ee55f62c083a22d9'), ObjectId('6717b904ee55f62c083a22da'), ObjectId('6717b904ee55f62c083a22db'), ObjectId('6717b904ee55f62c083a22dc'), ObjectId('6717b904ee55f62c083a22dd'), ObjectId('6717b904ee55f62c083a22de'), ObjectId('6717b904ee55f62c083a22df'), ObjectId('6717b904ee55f62c083a22e0'), ObjectId('6717b904ee55f62c083a22e1'), ObjectId('6717b904ee55f62c083a22e2'), ObjectId('6717b904ee55f62c083a22e3'), ObjectId('6717b904ee55f62c083a22e4'), ObjectId('6717b904ee55f62c083a22e5'), ObjectId('6717b904ee55f62c083a22e6'), ObjectId('6717b904ee55f62c083a22e7'), ObjectId('6717b904ee55f62c083a22e8'), ObjectId('6717b904ee55f62c083a22e9'), ObjectId('6717b904ee55f62c083a22ea'), ObjectId('6717b904ee55f62c083a22eb'), ObjectId('6717b904ee55f62c083a22ec'), ObjectId('6717b904ee55f62c083a22ed'), ObjectId('6717b904ee55f62c083a22ee'), ObjectId('6717b904ee55f62c083a22ef'), ObjectId('6717b904ee55f62c083a22f0'), ObjectId('6717b904ee55f62c083a22

# Read from Mongo

Create client and access DB/collection

In [7]:
if 'client' not in globals():
    client = MongoClient('localhost', 27017)
    db = client.papers
    collection = db.papers_collection_test

!! si poteva inserire in una funzione dell'handler di Mongo

In [48]:
# TEST
for chunk in collection.find():
    pprint.pprint(chunk.get("type"))
    break

'Document'


Documents' chunks reconstruction

In [8]:
retrieved_docs = []
for chunk in collection.find():
    doc = Document(page_content=chunk.get("page_content"), 
                   metadata=chunk.get("metadata"),
                   type='Document',
                   id=chunk.get("_id"))
    retrieved_docs.append(doc)

In [9]:
retrieved_docs[0].page_content

'BERT: Pre-training of Deep Bidirectional Transformers for\nLanguage Understanding\nJacob Devlin Ming-Wei Chang Kenton Lee Kristina Toutanova\nGoogle AI Language\n{jacobdevlin,mingweichang,kentonl,kristout }@google.com\nAbstract\nWe introduce a new language representa-\ntion model called BERT , which stands for\nBidirectional Encoder Representations from\nTransformers. Unlike recent language repre-\nsentation models (Peters et al., 2018a; Rad-\nford et al., 2018), BERT is designed to pre-\ntrain deep bidirectional representations from\nunlabeled text by jointly conditioning on both\nleft and right context in all layers. As a re-\nsult, the pre-trained BERT model can be ﬁne-\ntuned with just one additional output layer\nto create state-of-the-art models for a wide\nrange of tasks, such as question answering and\nlanguage inference, without substantial task-\nspeciﬁc architecture modiﬁcations.\nBERT is conceptually simple and empirically\npowerful. It obtains new state-of-the-art re-\nsu

## Define Embedding Model

In [10]:
huggingface_embeddings = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",  # alternatively use "sentence-transformers/all-MiniLM-l6-v2" (faster)
    model_kwargs={'device':'cpu'}, #CPU run or 'device': 'cuda' for GPU use
    encode_kwargs={'normalize_embeddings': True} #Normalization is active, which means that the resulting vectors will have unit length. Normalization can be useful when you want to compare the similarity of sentences using methods like dot product or cosine similarity, as it makes the embeddings comparable on a common scale.
)

  from tqdm.autonotebook import tqdm, trange


In [78]:
# TEST
sample_embedding = np.array(huggingface_embeddings.embed_query(retrieved_docs[0].page_content))
print(sample_embedding.shape)

(384,)


## Create vectorstore

In [11]:
vectorstore = FAISS.from_documents(retrieved_docs, huggingface_embeddings)

!! si poteva creare una funzione invece di replicare il codice più volte

In [12]:
# Test the similarity search
query = """What does 'BERT' stand for?"""
relevant_documents = vectorstore.similarity_search(query, k=8)

print(f'There are {len(relevant_documents)} documents retrieved which are relevant to the query. Display the first one:\n')
print(relevant_documents[0].page_content)
print("\nDisplay the second one:")
print(relevant_documents[1].page_content)
print("\nDisplay the third one:")
print(relevant_documents[2].page_content)
print("\nDisplay the fourth one:")
print(relevant_documents[3].page_content)
print(type(relevant_documents))

There are 8 documents retrieved which are relevant to the query. Display the first one:

BERT: Pre-training of Deep Bidirectional Transformers for
Language Understanding
Jacob Devlin Ming-Wei Chang Kenton Lee Kristina Toutanova
Google AI Language
{jacobdevlin,mingweichang,kentonl,kristout }@google.com
Abstract
We introduce a new language representa-
tion model called BERT , which stands for
Bidirectional Encoder Representations from
Transformers. Unlike recent language repre-
sentation models (Peters et al., 2018a; Rad-
ford et al., 2018), BERT is designed to pre-
train deep bidirectional representations from
unlabeled text by jointly conditioning on both
left and right context in all layers. As a re-
sult, the pre-trained BERT model can be ﬁne-
tuned with just one additional output layer
to create state-of-the-art models for a wide
range of tasks, such as question answering and
language inference, without substantial task-
speciﬁc architecture modiﬁcations.
BERT is conceptually simple

# Chain Q&A

In [13]:
def format_docs(docs):
    return "\n\n".join([d.page_content for d in docs])

In [130]:
retriever = vectorstore.as_retriever()

model = ChatOpenAI(
    openai_api_base="https://d0b9-195-230-200-203.ngrok-free.app/v1",
    api_key="EMPTY",
    temperature=0
)

template = """
    Answer the question based only on the following context:

    {context}

    Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

chain = (
    {"context": retriever | format_docs , "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)


In [131]:
question = """What is the pre-training procedure?"""
# Wrap text to 80 characters.
wrapper = textwrap.TextWrapper(width=75)

response = chain.invoke(question)

print(wrapper.fill(response))


The pre-training procedure for BERT involves two steps:   1. Pre-training:
The model is trained on unlabeled data over different pre-training tasks.
2. Fine-tuning: The BERT model is first initialized with the pre-trained
parameters, and all of the parameters are fine-tuned using labeled data
from the downstream tasks.


## Output Streaming

In [132]:
# %%javascript
# IPython.OutputArea.prototype._should_scroll = function(lines) {
#     return false;
# }
# from IPython.display import clear_output

# wrapper = textwrap.TextWrapper(width=75)

response = ''
question = """What is the pre-training procedure? Please wrap the text width to 80 characters."""
for chunk in chain.stream(question):
    response+=chunk
    print(chunk, end="", flush=True)

# clear_output(wait=True)

# print(wrapper.fill(response))

The pre-training procedure for BERT involves the following steps:

1.  **Data Preparation**: 
    -   The input data is tokenized into subwords using WordPiece tokenization.
    -   The input sequences are sampled to have a combined length of ≤512 tokens.
    -   The LM masking is applied with a uniform masking rate of 15%.

2.  **Training Setup**:
    -   The model is trained with a batch size of 256 sequences (256 sequences * 512 tokens = 128,000 tokens/batch).
    -   The training is done for 1,000,000 steps, which is approximately 40 epochs over the 3.3 billion word corpus.
    -   The Adam optimizer is used with a learning rate of 1e-4, β1= 0.9, β2= 0.999, L2 weight decay of 0.01, and learning rate warmup over the first 10,000 steps.

3.  **Training Procedure**:
    -   The training loss is the sum of the mean masked LM likelihood and the mean next sentence prediction likelihood.
    -   The model is trained on 4 Cloud TPUs in Pod configuration (16 TPU chips total) for BERT BASE a

## Answer evaluation

In [139]:
testset_df = pd.read_excel("TestSetBERT.xlsx")
testset_df.head()

Unnamed: 0,Domanda,Risposta
0,What does BERT stand for?,BERT stands for Bidirectional Encoder Represen...
1,What is the main innovation of BERT compared t...,BERT’s main innovation is its ability to pre-t...
2,How does BERT differ from traditional language...,Traditional language models are unidirectional...
3,What are some tasks BERT achieves state-of-the...,BERT achieves state-of-the-art results on task...
4,What is the significance of the [MASK] token i...,The [MASK] token is used during pre-training t...


In [143]:
model_responses = []
for i in range(testset_df.shape[0]):
    response = chain.invoke(testset_df.iloc[i, 0])
    model_responses.append(response)
testset_df["Model Response"] = model_responses
testset_df.head()

Unnamed: 0,Domanda,Risposta,Model Response
0,What does BERT stand for?,BERT stands for Bidirectional Encoder Represen...,BERT stands for Bidirectional Encoder Represen...
1,What is the main innovation of BERT compared t...,BERT’s main innovation is its ability to pre-t...,The main innovation of BERT compared to previo...
2,How does BERT differ from traditional language...,Traditional language models are unidirectional...,"BERT differs from traditional language models,..."
3,What are some tasks BERT achieves state-of-the...,BERT achieves state-of-the-art results on task...,BERT achieves state-of-the-art results on elev...
4,What is the significance of the [MASK] token i...,The [MASK] token is used during pre-training t...,The [MASK] token in BERT is used for the Maske...


In [144]:
testset_df.to_excel("TestSetBERT_with_model_responses.xlsx", index=False)

# Chatbot with history

idea: RunnableWithMessageHistory

[Discussion on GitHub](https://github.com/langchain-ai/langchain/discussions/16582)

In [136]:
class InMemoryHistory(BaseChatMessageHistory, BaseModel):
    """In memory implementation of chat message history."""

    messages: List[BaseMessage] = Field(default_factory=list)

    def add_message(self, message: BaseMessage) -> None:
        """Add a self-created message to the store"""
        self.messages.append(message)

    def clear(self) -> None:
        self.messages = []

    class Config:
        arbitrary_types_allowed = True

# Here we use a global variable to store the chat message history.
# This will make it easier to inspect it to see the underlying results.
store = {}

def get_session_history(user_id: str, conversation_id: str) -> BaseChatMessageHistory:
    if (user_id, conversation_id) not in store:
        store[(user_id, conversation_id)] = InMemoryHistory()
    return store[(user_id, conversation_id)]

## Test the history
# history = get_session_history("1", "1")
# history.add_message(AIMessage(content="hello"))
# print(store) 

In [152]:
template = """
    Answer the question based only on the following context:

    {context}
    
    If asked for previous interactions, you need to answer based on the following history:
    
    {history}
    
    Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

model = ChatOpenAI(
    openai_api_base="https://d0b9-195-230-200-203.ngrok-free.app/v1",
    api_key="EMPTY",
    temperature=0
)

# retrieve_context_chain = itemgetter("question") | retriever | format_docs
# retrieve_history_chain = itemgetter("history")

# first_step = RunnablePassthrough.assign(context=retrieve_context_chain, history=retrieve_history_chain)
# chain = first_step | prompt | model #| StrOutputParser()
# chain = {"response": first_step | prompt | model, "context": itemgetter("context")} #| StrOutputParser()

chain = (
    RunnableParallel({
                      "context": itemgetter("question") | retriever | format_docs,
                      "question": itemgetter("question"),
                      "history": itemgetter("history")
    })
    |{
        "question": itemgetter("question"),
        "output": prompt | model,
        "context": itemgetter("context"),
        "history": itemgetter("history")
    }
    # | StrOutputParser()
)

In [157]:
store = {}
chain_with_history = RunnableWithMessageHistory(
    chain,
    # Uses the get_by_session_id function defined before
    get_session_history=get_session_history,
    input_messages_key="question",
    history_messages_key="history",
    history_factory_config=[
        ConfigurableFieldSpec(
            id="user_id",
            annotation=str,
            name="User ID",
            description="Unique identifier for the user.",
            default="",
            is_shared=True,
        ),
        ConfigurableFieldSpec(
            id="conversation_id",
            annotation=str,
            name="Conversation ID",
            description="Unique identifier for the conversation.",
            default="",
            is_shared=True,
        ),
    ],
)

!! si può migliorare l'utilizzo dei parametri user_id e conversation_id passandoli come parametri della funzione o inserendo il tutto in una classe e definirli nell costruttore

!! manca salvataggio delle conversazioni su mongoDB

In [159]:
def ask_question(question):
    response = chain_with_history.invoke(
        {"question": question},
        config={"configurable": {"user_id": user_id, "conversation_id": conversation_id}}
    )
    print(response.get("output").content)
    print("\n")
    return response


user_id = "zappa"
conversation_id = "dev"


question = "What is the pre-training procedure?"
response = ask_question(question)

question = "And What is BERT?"
response = ask_question(question)

question = "Which questions did I ask during this conversation?"
response = ask_question(question)


The pre-training procedure for BERT involves two steps: pre-training and fine-tuning. 

During pre-training, the model is trained on unlabeled data over different pre-training tasks. 

For pre-training, the model uses a "masked language model" (MLM) objective, where some of the tokens from the input are randomly masked, and the objective is to predict the original vocabulary id of the masked tokens. 

Additionally, BERT also uses a "next sentence prediction" task that jointly pre-trains text-pair representations. 

The pre-training procedure is shown in Figure 1, which illustrates the overall pre-training and fine-tuning procedures for BERT.


BERT stands for Bidirectional Encoder Representations from Transformers. It is a new language representation model introduced in the paper "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding". BERT is designed to pre-train deep bidirectional representations from unlabeled text by jointly conditioning on both left and

Exampe of history:

- Code:
```python
print(store.get(('zappa', 'foo')))
```

- Output:

Human: What is the pre-training procedure?
AI: The pre-training procedure for BERT involves two steps: 

1. Pre-training: The model is trained on unlabeled data over different pre-training tasks. 
2. Fine-tuning: The BERT model is first initialized with the pre-trained parameters, and all of the parameters are fine-tuned using labeled data from the downstream tasks. 

During pre-training, the model uses a "masked language model" (MLM) pre-training objective, where some of the tokens from the input are randomly masked, and the objective is to predict the original vocabulary id of the masked tokens

In [67]:
print(store.get('foo'))

Human: What does cosine mean?
AI: The cosine is a fundamental concept in mathematics, particularly in trigonometry. It's a ratio of the adjacent side to the hypotenuse in a right-angled triangle.

Imagine a right-angled triangle with an angle θ (theta) between the two sides. The cosine of θ (cos θ) is defined as the ratio of the length of the adjacent side (the side next to the angle) to the length of the hypotenuse (the side opposite the right angle).

Mathematically, it's represented as:

cos θ = adjacent side / hypotenuse

For example, if the adjacent side is 3 units and the hypotenuse is 5 units, the cosine of θ would be:

cos θ = 3 / 5 = 0.6

The cosine function is used to describe the ratio of the adjacent side to the hypotenuse in a right-angled triangle, and it's a fundamental concept in mathematics, physics, engineering, and many other fields.

Would you like to know more about trigonometry or cosine?
Human: What's its inverse
AI: The inverse of the cosine function is called t

# Tests

In [18]:
o = OpenAI(
    openai_api_base="https://c12d-195-230-200-203.ngrok-free.app/v1",
    api_key="EMPTY", max_tokens=50, temperature=0
    )

response = o.invoke("I see a penguin with a rifle")
print(response)

, and I think of the absurdity of the situation. A penguin, a flightless bird, holding a rifle. It's a comical image, and I can almost hear the penguin's awkward attempts to hold the rifle steady.
The
