In [137]:
# One-time environment setup: uncomment the lines below to install the
# third-party packages (presumably those needed by this notebook and its
# helper modules -- not all of them are imported directly here).
# NOTE(review): prefer `%pip install ...` over `!pip install ...` so the
# packages land in the running kernel's environment, and consider pinning
# versions for reproducibility.
# !pip install pymongo
# !pip install langchain
# !pip install langchain-community
# !pip install pypdf
# !pip install langchain-ollama
# !pip install openai
# !pip install tiktoken
# !pip install sentence-transformers
# !pip install faiss-cpu
# !pip install pandas
# !pip install openpyxl
# !pip install langchain-openai
# !pip install langchain-openai

Collecting openpyxl
  Downloading openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Downloading et_xmlfile-1.1.0-py3-none-any.whl.metadata (1.8 kB)
Downloading openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Downloading et_xmlfile-1.1.0-py3-none-any.whl (4.7 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-1.1.0 openpyxl-3.1.5


In [None]:
# ollama API
from langchain_ollama import OllamaEmbeddings

# operate on LAB LLM
# from langchain_community.chat_models import ChatOpenAI # DEPRECATED
from langchain_openai.chat_models import ChatOpenAI
from langchain_community.llms import OpenAI
from langchain_community.embeddings.openai import OpenAIEmbeddings
from langchain_community.embeddings import HuggingFaceBgeEmbeddings # <<<<

# langchain stuff
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables.base import RunnableParallel

# general
import datetime
import numpy as np
import pandas as pd
import textwrap

# Extra
from operator import itemgetter
from typing import List

from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.messages import BaseMessage, AIMessage
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.runnables import (
    RunnableLambda,
    ConfigurableFieldSpec,
    RunnablePassthrough,
)
from langchain_core.runnables.history import RunnableWithMessageHistory


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  exec(code_obj, self.user_global_ns, self.user_ns)


# Import from project directories

In [1]:
from model import handlers
from model.chatbot.withhistory import ChatbotWithHistory


For example, replace imports like: `from langchain_core.pydantic_v1 import BaseModel`
with: `from pydantic import BaseModel`
or the v1 compatibility namespace if you are working in a code base that has not been fully upgraded to pydantic 2 yet. 	from pydantic.v1 import BaseModel

  from model.chatbot.memoryhistory import InMemoryHistory


# Load and Push to Mongo

In [4]:
# Create the project's Mongo handler and select the database / collection
# that the ingestion cells below write to.
handler = handlers.MongoHandler()
handler.set_main_db('papers')
handler.set_main_collection('papers_collection_test_bis')
# Debugging helpers (left disabled). Note that they reference
# 'papers_collection_test', not the '..._bis' collection selected above.
# handler
# print(handler.papers.papers_collection_test)
# print(handler.get_collections_from_database('papers'))
# print(handler.get_collection('papers', 'papers_collection_test'))



In [5]:
# load data, split and return dicts for storing them in mongo
processor = handlers.PDFHandler("./files_pdf/", verbose=True)
dicts_after_split = processor.load_and_split(return_dicts=True)

Before split, there were 16 documents loaded, with average characters equal to 4003.
After split, there were 41 documents (chunks), with average characters equal to 1638 (average chunk length).


In [6]:
# Store the chunk dicts in the handler's currently selected main collection.
handler.push_to_main_collection(dicts_after_split)

All documents pushed to collection 'papers_collection_test_bis' in database 'papers'


# Read from Mongo

Create client and access DB/collection

In [2]:
# Create the Mongo handler only when it is not already defined in the
# kernel's global namespace, so this section can run on a fresh kernel
# without executing the "Load and Push to Mongo" cells first.
if 'handler' not in globals():
    handler = handlers.MongoHandler()
    handler.set_main_db('papers')
    handler.set_main_collection('papers_collection_test_bis')
    print('handler created')

handler created


In [3]:
# Read the stored chunks back from the main collection. The cells below
# access `.page_content` on the items and feed them to the vector store.
retrieved_docs = handler.retrieve_documents_from_main_collection()

In [4]:
# Sanity check: display the text of the first retrieved chunk.
retrieved_docs[0].page_content

'BERT: Pre-training of Deep Bidirectional Transformers for\nLanguage Understanding\nJacob Devlin Ming-Wei Chang Kenton Lee Kristina Toutanova\nGoogle AI Language\n{jacobdevlin,mingweichang,kentonl,kristout }@google.com\nAbstract\nWe introduce a new language representa-\ntion model called BERT , which stands for\nBidirectional Encoder Representations from\nTransformers. Unlike recent language repre-\nsentation models (Peters et al., 2018a; Rad-\nford et al., 2018), BERT is designed to pre-\ntrain deep bidirectional representations from\nunlabeled text by jointly conditioning on both\nleft and right context in all layers. As a re-\nsult, the pre-trained BERT model can be ﬁne-\ntuned with just one additional output layer\nto create state-of-the-art models for a wide\nrange of tasks, such as question answering and\nlanguage inference, without substantial task-\nspeciﬁc architecture modiﬁcations.\nBERT is conceptually simple and empirically\npowerful. It obtains new state-of-the-art re-\nsu

## Define Embedding Model and VectorStore

In [5]:
from langchain_community.embeddings import HuggingFaceBgeEmbeddings

# BGE sentence-embedding model used to index the retrieved chunks.
# - model_name: "sentence-transformers/all-MiniLM-l6-v2" is a faster alternative.
# - device: 'cpu' runs on the CPU; use 'cuda' for a GPU run.
# - normalize_embeddings: the resulting vectors have unit length, which makes
#   dot-product / cosine similarity scores comparable on a common scale.
embeddings_model = HuggingFaceBgeEmbeddings(
    model_name="BAAI/bge-small-en-v1.5",
    model_kwargs=dict(device='cpu'),
    encode_kwargs=dict(normalize_embeddings=True),
)


  from tqdm.autonotebook import tqdm, trange


In [6]:
# Build the vector store from (at most) the first 20 retrieved chunks, then
# run a quick similarity query as a smoke test of the index.
vector_store = handlers.VectorStoreHandler(embeddings_model=embeddings_model, documents=retrieved_docs[:20])

vector_store.similarity_search("What is BERT?", k=3)

[Document(metadata={'source': 'files_pdf\\1810.04805v2.pdf', 'page': 0}, page_content='BERT: Pre-training of Deep Bidirectional Transformers for\nLanguage Understanding\nJacob Devlin Ming-Wei Chang Kenton Lee Kristina Toutanova\nGoogle AI Language\n{jacobdevlin,mingweichang,kentonl,kristout }@google.com\nAbstract\nWe introduce a new language representa-\ntion model called BERT , which stands for\nBidirectional Encoder Representations from\nTransformers. Unlike recent language repre-\nsentation models (Peters et al., 2018a; Rad-\nford et al., 2018), BERT is designed to pre-\ntrain deep bidirectional representations from\nunlabeled text by jointly conditioning on both\nleft and right context in all layers. As a re-\nsult, the pre-trained BERT model can be ﬁne-\ntuned with just one additional output layer\nto create state-of-the-art models for a wide\nrange of tasks, such as question answering and\nlanguage inference, without substantial task-\nspeciﬁc architecture modiﬁcations.\nBERT is 

In [7]:
# Show how many documents the vector store reports holding.
vector_store.total_documents()

20

# Chain Q&A

In [15]:
from utils.format_docs import format_docs
# def format_docs(docs):
#     return "\n\n".join([d.page_content for d in docs])

In [None]:
# Build the retrieval-augmented Q&A chain:
#   question -> retriever -> format_docs -> prompt -> lab LLM -> plain string
import os

# operate on LAB LLM
# from langchain_community.chat_models import ChatOpenAI # DEPRECATED
from langchain_openai.chat_models import ChatOpenAI
from langchain_community.llms import OpenAI

# langchain stuff
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# Base URL of the lab LLM server. The tunnel hostname is not stable (the
# notebook already contains two different ones), so it can be overridden with
# the LAB_LLM_API_BASE environment variable instead of editing the cell; the
# previously hard-coded URL remains the default.
LAB_LLM_API_BASE = os.environ.get(
    "LAB_LLM_API_BASE",
    "https://047c-195-230-200-203.ngrok-free.app/v1",
)

retriever = vector_store.as_retriever()

model = ChatOpenAI(
    openai_api_base=LAB_LLM_API_BASE,
    api_key="EMPTY",  # placeholder value; presumably the lab server ignores it
    temperature=0,
)

# The prompt restricts the model to the retrieved context.
template = """
    Answer the question based only on the following context:

    {context}

    Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

# The input string is sent both to the retriever (whose documents are merged
# by format_docs into {context}) and, unchanged, into {question}.
chain = (
    {"context": retriever | format_docs , "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)


In [131]:
question = """What is the pre-training procedure?"""
# Wrap the answer to 75 characters per line for readable output.
wrapper = textwrap.TextWrapper(width=75)

# Run the full retrieval + LLM chain on the question (blocking call).
response = chain.invoke(question)

print(wrapper.fill(response))


The pre-training procedure for BERT involves two steps:   1. Pre-training:
The model is trained on unlabeled data over different pre-training tasks.
2. Fine-tuning: The BERT model is first initialized with the pre-trained
parameters, and all of the parameters are fine-tuned using labeled data
from the downstream tasks.


## Output Streaming

In [132]:
# Stream the answer: echo every chunk as soon as it arrives and assemble the
# complete text in `response` once the stream is exhausted.
question = """What is the pre-training procedure? Please wrap the text width to 80 characters."""
streamed_parts = []
for chunk in chain.stream(question):
    print(chunk, end="", flush=True)
    streamed_parts.append(chunk)
response = ''.join(streamed_parts)

# Disabled alternative: once streaming is done, replace the raw streamed text
# with a wrapped version.
# from IPython.display import clear_output
# wrapper = textwrap.TextWrapper(width=75)
# clear_output(wait=True)
# print(wrapper.fill(response))

# Disabled notebook tweak (belongs in its own `%%javascript` cell): turn off
# output-area scrolling.
# IPython.OutputArea.prototype._should_scroll = function(lines) {
#     return false;
# }

The pre-training procedure for BERT involves the following steps:

1.  **Data Preparation**: 
    -   The input data is tokenized into subwords using WordPiece tokenization.
    -   The input sequences are sampled to have a combined length of ≤512 tokens.
    -   The LM masking is applied with a uniform masking rate of 15%.

2.  **Training Setup**:
    -   The model is trained with a batch size of 256 sequences (256 sequences * 512 tokens = 128,000 tokens/batch).
    -   The training is done for 1,000,000 steps, which is approximately 40 epochs over the 3.3 billion word corpus.
    -   The Adam optimizer is used with a learning rate of 1e-4, β1= 0.9, β2= 0.999, L2 weight decay of 0.01, and learning rate warmup over the first 10,000 steps.

3.  **Training Procedure**:
    -   The training loss is the sum of the mean masked LM likelihood and the mean next sentence prediction likelihood.
    -   The model is trained on 4 Cloud TPUs in Pod configuration (16 TPU chips total) for BERT BASE a

## Answer evaluation

In [139]:
# Load the evaluation set. The preview shows a question column ("Domanda")
# and a reference-answer column ("Risposta").
testset_df = pd.read_excel("TestSetBERT.xlsx")
testset_df.head()

Unnamed: 0,Domanda,Risposta
0,What does BERT stand for?,BERT stands for Bidirectional Encoder Represen...
1,What is the main innovation of BERT compared t...,BERT’s main innovation is its ability to pre-t...
2,How does BERT differ from traditional language...,Traditional language models are unidirectional...
3,What are some tasks BERT achieves state-of-the...,BERT achieves state-of-the-art results on task...
4,What is the significance of the [MASK] token i...,The [MASK] token is used during pre-training t...


In [143]:
# Ask the chain every test-set question and store the answers next to the
# reference answers.
# The questions are selected by column name ("Domanda") rather than by
# position, so the cell keeps working if the sheet's column order changes,
# and no loop index or temporary variable leaks into the notebook namespace.
model_responses = [chain.invoke(question) for question in testset_df["Domanda"]]
testset_df["Model Response"] = model_responses
testset_df.head()

Unnamed: 0,Domanda,Risposta,Model Response
0,What does BERT stand for?,BERT stands for Bidirectional Encoder Represen...,BERT stands for Bidirectional Encoder Represen...
1,What is the main innovation of BERT compared t...,BERT’s main innovation is its ability to pre-t...,The main innovation of BERT compared to previo...
2,How does BERT differ from traditional language...,Traditional language models are unidirectional...,"BERT differs from traditional language models,..."
3,What are some tasks BERT achieves state-of-the...,BERT achieves state-of-the-art results on task...,BERT achieves state-of-the-art results on elev...
4,What is the significance of the [MASK] token i...,The [MASK] token is used during pre-training t...,The [MASK] token in BERT is used for the Maske...


In [144]:
# Persist the test set together with the model answers; index=False keeps the
# DataFrame index out of the written sheet.
testset_df.to_excel("TestSetBERT_with_model_responses.xlsx", index=False)

# Chatbot with history

Idea: use LangChain's `RunnableWithMessageHistory` to keep a per-conversation message history.

[Discussion on GitHub](https://github.com/langchain-ai/langchain/discussions/16582)

In [8]:
# Wrap the vector-store retriever in the project's history-aware chatbot.
# The (user_id, conversation_id) pair is the key used further down to look up
# this conversation's history: chatbot.history.get(('zappa', 'dev')).
retriever = vector_store.as_retriever()
chatbot = ChatbotWithHistory(user_id="zappa", conversation_id="dev", retriever=retriever)

In [9]:
# Two-turn check of the conversation memory: the second question can only be
# answered from the stored history of the first one.
for question in (
    "Why is the sky blue?",
    "what did I asked you in the last question?",
):
    response = chatbot.ask_question(question)

There is no previous interaction history to draw from. The conversation has just started.


You asked me "Why is the sky blue?" in the last question.




In [10]:
# Ask a document question, then ask the chatbot to list the questions asked so
# far in this conversation.
for question in (
    "What is the pre-training procedure?",
    # "And What is BERT?",  # disabled follow-up question
    "Which questions did I ask during this conversation?",
):
    response = chatbot.ask_question(question)

The pre-training procedure for BERT involves two steps: pre-training and fine-tuning. 

During pre-training, the model is trained on unlabeled data over different pre-training tasks. The pre-training tasks used in BERT are:

1. Masked Language Model (MLM): This task involves randomly masking some of the tokens from the input, and the objective is to predict the original vocabulary id of the masked token.
2. Next Sentence Prediction (NSP): This task involves predicting whether two sentences are adjacent in the original text or not.

The pre-training procedure for BERT is as follows:

1. The model is initialized with random weights.
2. The model is trained on a large corpus of text data.
3. During training, some of the tokens in the input are randomly masked.
4. The model predicts the original vocabulary id of the masked token.
5. The model is also trained to predict whether two sentences are adjacent in the original text or not.
6. The model is trained to minimize the loss on both tasks

Example of history:

- Code:
```python
print(store.get(('zappa', 'foo')))
```

- Output:

Human: What is the pre-training procedure?
AI: The pre-training procedure for BERT involves two steps: 

1. Pre-training: The model is trained on unlabeled data over different pre-training tasks. 
2. Fine-tuning: The BERT model is first initialized with the pre-trained parameters, and all of the parameters are fine-tuned using labeled data from the downstream tasks. 

During pre-training, the model uses a "masked language model" (MLM) pre-training objective, where some of the tokens from the input are randomly masked, and the objective is to predict the original vocabulary id of the masked tokens

In [11]:
# Print the stored history for the (user_id, conversation_id) key that was
# passed when the chatbot was created.
print(chatbot.history.get(('zappa', 'dev')))

Human: Why is the sky blue?
AI: There is no previous interaction history to draw from. The conversation has just started.
Human: what did I asked you in the last question?
AI: You asked me "Why is the sky blue?" in the last question.
Human: What is the pre-training procedure?
AI: The pre-training procedure for BERT involves two steps: pre-training and fine-tuning. 

During pre-training, the model is trained on unlabeled data over different pre-training tasks. The pre-training tasks used in BERT are:

1. Masked Language Model (MLM): This task involves randomly masking some of the tokens from the input, and the objective is to predict the original vocabulary id of the masked token.
2. Next Sentence Prediction (NSP): This task involves predicting whether two sentences are adjacent in the original text or not.

The pre-training procedure for BERT is as follows:

1. The model is initialized with random weights.
2. The model is trained on a large corpus of text data.
3. During training, some

# Tests

In [18]:
import os

# Smoke test of the lab LLM through the plain (non-chat) completion client.
# The tunnel hostname is not stable (it differs from the one used for the chat
# model earlier in this notebook), so it can be overridden with the
# LAB_LLM_API_BASE environment variable; the previously hard-coded URL remains
# the default.
o = OpenAI(
    openai_api_base=os.environ.get(
        "LAB_LLM_API_BASE",
        "https://c12d-195-230-200-203.ngrok-free.app/v1",
    ),
    api_key="EMPTY",  # placeholder value; presumably the lab server ignores it
    max_tokens=50,    # keep the completion short
    temperature=0,
)

response = o.invoke("I see a penguin with a rifle")
print(response)

, and I think of the absurdity of the situation. A penguin, a flightless bird, holding a rifle. It's a comical image, and I can almost hear the penguin's awkward attempts to hold the rifle steady.
The
