<a href="https://colab.research.google.com/github/pattrickx/lang_chain_test/blob/main/langchain_load%2C_split_and_similarit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so files stored there can be reached by path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# -f/-n replace an existing `data` link instead of failing (or nesting a new link inside it) when this cell is re-run.
!ln -sfn "/content/drive/MyDrive/My_projects/LangChain/data" data

# Installation

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
# NOTE(review): the unpinned packages below may resolve to releases newer than
# the pinned langchain/openai versions expect — consider pinning them as well.
%pip install langchain==0.1.12
%pip install openai==1.14.1
%pip install --upgrade --quiet langchain-openai
%pip install sentence-transformers
%pip install jq
%pip install tiktoken

# Convert a string into the Document type

In [None]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema.document import Document


def get_text_chunks_langchain(text, chunk_size=500, chunk_overlap=100):
    """Split a raw string into LangChain ``Document`` chunks.

    Args:
        text: The string to split.
        chunk_size: ``chunk_size`` handed to ``CharacterTextSplitter``
            (defaults to the previously hard-coded 500).
        chunk_overlap: ``chunk_overlap`` handed to ``CharacterTextSplitter``
            (defaults to the previously hard-coded 100).

    Returns:
        A list with one ``Document`` per chunk produced by the splitter, each
        holding the chunk text in ``page_content``.
    """
    text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    return [Document(page_content=chunk) for chunk in text_splitter.split_text(text)]


def main():
    """Demo: chunk a sample sentence and print the resulting documents."""
    sample_text = "I want to use a langchain with a string instead of a txt file, is this possible?"
    print(get_text_chunks_langchain(sample_text))


# Run the demo when this cell executes as the main module (the printed list below is its output).
if __name__ == '__main__':
    main()

[Document(page_content='I want to use a langchain with a string instead of a txt file, is this possible?')]


# Load Document from json

In [None]:
from langchain_community.document_loaders import JSONLoader

In [None]:
# Define the metadata extraction function.
def metadata_func(record: dict, metadata: dict) -> dict:
    """Copy the record's ``page`` value into the metadata dict.

    Args:
        record: One parsed JSON record; a missing ``"page"`` key yields ``None``.
        metadata: Metadata dict to extend; it is modified in place.

    Returns:
        The same ``metadata`` object, now carrying a ``"page"`` entry.
    """
    page_value = record.get("page")
    metadata.update({"page": page_value})
    return metadata


# Build a loader for the pages file. The jq expression '.[]' yields each element
# of the top-level JSON value as one record; `content_key` names the record
# field used as the document text (per the JSONLoader docs), and
# `metadata_func` adds the record's `page` value to each document's metadata.
loader = JSONLoader(
    file_path='/content/data/pages.json',
    jq_schema='.[]',
    content_key="content",
    metadata_func=metadata_func
)

# Read the file and build the documents.
data = loader.load()

In [None]:
data


# Split documents

In [None]:
from langchain.text_splitter import CharacterTextSplitter

In [None]:
# Alternative kept for reference: split on newlines with a larger chunk_size of 1000.
# text_splitter = CharacterTextSplitter(separator="\n", chunk_size=1000)
# documents_splitted = text_splitter.split_documents(data)

# Split the loaded documents on newlines. With `from_tiktoken_encoder`, the
# chunk_size of 400 is measured with a tiktoken tokenizer rather than in
# characters (per the LangChain text-splitter docs).
text_splitter_tiktoken = CharacterTextSplitter.from_tiktoken_encoder(separator="\n",chunk_size=400)
documents_splitted = text_splitter_tiktoken.split_documents(data)

In [None]:
documents_splitted

# Embedding

In [None]:
# Download the spaCy pipeline `pt_core_news_sm`, which SpacyEmbeddings loads by name in the next cell.
!python -m spacy download pt_core_news_sm

In [None]:
# Import from langchain_community (as the JSONLoader import does) instead of
# the deprecated `langchain.embeddings` re-export.
from langchain_community.embeddings import SpacyEmbeddings

# Embedding model backed by the spaCy pipeline downloaded in the previous cell.
embeddings = SpacyEmbeddings(model_name="pt_core_news_sm")

In [None]:
# Embed the text of every split chunk with the spaCy embeddings.
doc_result = embeddings.embed_documents([part.page_content for part in documents_splitted])

In [None]:
doc_result

# Vector storage: "chromadb"

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
# NOTE(review): spaCy is upgraded here after its model was already downloaded
# and loaded in the Embedding section — confirm the upgrade is still wanted.
%pip install chromadb==0.4.24
%pip install --upgrade --quiet spacy

In [None]:
import chromadb

In [None]:
from langchain.vectorstores import Chroma

In [None]:
# Embed the chunks and store them in a Chroma collection kept under ./speech_new_db_spacy.
# NOTE(review): re-running this cell against the same directory presumably
# adds the same chunks again — confirm, or clear the directory first.
db = Chroma.from_documents(documents_splitted,embeddings,persist_directory="./speech_new_db_spacy")
db.persist()

In [None]:
# Connect to the db: reopen the persisted directory with the same embedding object used to build it.
db_new_connection = Chroma(persist_directory="./speech_new_db_spacy",embedding_function=embeddings)

In [None]:
prompt = " retorne texto relacionado a testes substantivs e abordagem dual"

In [None]:
similar_docs = db_new_connection.similarity_search(prompt) # Fetch the most similar documents

In [None]:
similar_docs

# Multi-query retrieval

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
# NOTE(review): accelerate appears to be needed only by the commented-out
# local-model cell (device_map="auto") — confirm before keeping this install.
%pip install --upgrade accelerate

## Usar GPT4

In [None]:
import os
# Name passed as `model_name` to AzureChatOpenAI in the next cell.
model_name = "gpt-unifor"
# Azure OpenAI settings, left blank here: fill them in locally before running
# and never commit real values.
# NOTE(review): these assignments overwrite any values already present in the
# environment — confirm that is intended.
os.environ["OPENAI_API_VERSION"] = ""
os.environ["AZURE_OPENAI_ENDPOINT"] = ""
os.environ["AZURE_OPENAI_API_KEY"] = ""

In [None]:
from langchain.schema import HumanMessage
from langchain_openai import AzureChatOpenAI
# Chat model shared by the retrievers and chains below, with temperature 0.0.
# Presumably the endpoint, API version and key come from the environment
# variables set in the previous cell — confirm.
llm = AzureChatOpenAI(
    model_name=model_name,
    temperature=0.0
)


## Usar modelo local

In [None]:
# from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
# from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# model_id = "GreenBitAI/LLaMA-2-1.1B-2bit-groupsize8"
# tokenizer = AutoTokenizer.from_pretrained(model_id)
# model = AutoModelForCausalLM.from_pretrained(model_id,temperature=0.5,device_map="auto")
# pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=1000)
# llm = HuggingFacePipeline(pipeline=pipe)

## Rodar Multi query

In [None]:
from langchain.retrievers.multi_query import MultiQueryRetriever

In [None]:
# Wrap the Chroma retriever so the LLM first generates several variants of the
# query (see the logged "Generated queries" output further down).
retriever_from_llm = MultiQueryRetriever.from_llm(retriever=db_new_connection.as_retriever(),llm=llm)

In [None]:
import logging
# Enable INFO-level logs for the multi-query retriever so the generated queries are shown.
logging.basicConfig()
logging.getLogger("langchain.retrievers.multi_query").setLevel(logging.INFO)

In [None]:
# Retrieve documents for the question "Who is in the meeting?" through the multi-query retriever.
unique_docs = retriever_from_llm.get_relevant_documents(query="Quem esta na reunião?")

INFO:langchain.retrievers.multi_query:Generated queries: ['1. Quais são os participantes presentes na reunião atual?', '2. Poderia me informar os nomes das pessoas na reunião?', '3. Quem são os indivíduos que estão participando da reunião?']


In [None]:
unique_docs

# Context Compression

In [None]:
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor


In [None]:
# LLM-backed document compressor built from the shared chat model.
compressor = LLMChainExtractor.from_llm(llm)

# Retriever that pairs the Chroma retriever (base) with the compressor above.
compression_retriver = ContextualCompressionRetriever(base_compressor=compressor,
                                                      base_retriever=db_new_connection.as_retriever())

In [None]:
# Same question as in the multi-query section, now through the compression retriever.
compressed_docs = compression_retriver.get_relevant_documents(query="Quem esta na reunião?")

In [None]:
compressed_docs

# Chains

In [None]:
from langchain.prompts.chat import ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate, AIMessagePromptTemplate

In [None]:
# System message (Portuguese): tells the model it is an AI that helps find information and answers concisely.
system_prompt_template = SystemMessagePromptTemplate.from_template("voce é uma IA que auxilia na buca de informação e responde de forma resumida")
# Human message (Portuguese): asks the model to answer {question} using {context}.
human_prompt_template = HumanMessagePromptTemplate.from_template("Use o contexto para responder a pergunta:\n{context}\nPergunta: {question}")
# Combine both messages, system first, into one chat prompt.
chat_prompt_template = ChatPromptTemplate.from_messages([system_prompt_template,human_prompt_template])

In [None]:
llm

In [None]:
from langchain.chains import LLMChain

In [None]:
# Chain that pairs the chat prompt template with the shared chat model.
chain = LLMChain(llm=llm, prompt=chat_prompt_template)

In [None]:
question="Quem esta na reunião?"
# Join the text of the compressed documents into one newline-separated context string.
context = "\n".join(document.page_content for document in compressed_docs)
# Fill the prompt's {question} and {context} slots and run the chain; as the
# last expression, the result is displayed as the cell output.
chain.run(question=question, context=context)

In [None]:
from langchain.chains import LLMChain, SimpleSequentialChain

# QA Chain

In [None]:
# Connect to the db. The imports and objects are created again here, presumably
# so this section can run on its own. Imports come from langchain_community
# rather than the deprecated `langchain.embeddings` / `langchain.vectorstores`
# re-exports.
from langchain_community.embeddings import SpacyEmbeddings
from langchain_community.vectorstores import Chroma

embeddings = SpacyEmbeddings(model_name="pt_core_news_sm")
db_new_connection = Chroma(persist_directory="./speech_new_db_spacy",embedding_function=embeddings)

In [None]:
from langchain.chains.question_answering import load_qa_chain
from langchain.chains.qa_with_sources import load_qa_with_sources_chain

In [None]:
# QA-with-sources chain; with chain_type="stuff" the input documents are
# presumably placed together into a single prompt — confirm against the LangChain docs.
qa_chain = load_qa_with_sources_chain(llm,chain_type="stuff")

In [None]:
question="Quem esta na reunião?"

In [None]:
# Fetch the stored chunks most similar to the question.
docs = db_new_connection.similarity_search(question)

In [None]:
# Answer the question from the retrieved chunks; the result is displayed as the cell output.
qa_chain.run(input_documents=docs,question=question)