In [1]:
!pip show langchain

Name: langchain
Version: 0.0.352
Summary: Building applications with LLMs through composability
Home-page: https://github.com/langchain-ai/langchain
Author: 
Author-email: 
License: MIT
Location: /Users/mmenendezg/Developer/Platzi/.venv/lib/python3.11/site-packages
Requires: aiohttp, dataclasses-json, jsonpatch, langchain-community, langchain-core, langsmith, numpy, pydantic, PyYAML, requests, SQLAlchemy, tenacity
Required-by: 


In [2]:
from langchain.schema import Document
from langchain.document_loaders import (
    UnstructuredFileLoader,
    PyPDFLoader,
    DataFrameLoader,
)
from langchain.text_splitter import RecursiveCharacterTextSplitter

import polars as pl
import jsonlines
from typing import List

In [3]:
# Hand-built example Document: free-form tags go in `metadata`, the text in `page_content`.
metadata = {
    "source": "Platzi",
    "class": "langchain",
}
page_content = "Textooo largoooo exampleeee"

doc = Document(metadata=metadata, page_content=page_content)

In [4]:
doc

Document(page_content='Textooo largoooo exampleeee', metadata={'source': 'Platzi', 'class': 'langchain'})

In [5]:
doc.page_content

'Textooo largoooo exampleeee'

## Document Loaders (PDF, CSV, JSON Lines)

In [6]:
# Parse the PDF with UnstructuredFileLoader; the result is kept in `data` and inspected in the next cells.
loader = UnstructuredFileLoader("../files/public_key_crytptography.pdf")
data = loader.load()

In [7]:
data[0].metadata

{'source': '../files/public_key_crytptography.pdf'}

In [8]:
data[0].page_content[:300]

'The First Ten Years of Public-Key Cryptography\n\nWH lTFl ELD DI FFlE\n\nInvited Paper\n\nPublic-key cryptosystems separate the capacities for encryption and decryption so that 7) many people can encrypt messages in such a way that only one person can read them, or 2) one person can encrypt messages in su'

In [9]:
# Load the same PDF with PyPDFLoader. In the recorded run this produced 18 documents
# whose metadata includes a 'page' number (see the outputs below).
loader = PyPDFLoader("../files/public_key_crytptography.pdf")
pdf_data = loader.load()

In [10]:
len(pdf_data)

18

In [11]:
pdf_data[17].page_content

'[I101 -, “Cryptology,” in Encyclopaedia Britannica, 76th Edi- \ntion. Chicago, IL: Encyclopaedia Britannica, 1986, pp. 913- \n9248. \n[Ill] Proceedings of Smart Card 2000, Vienna, Austria, Oct. 19- \n20, 1988. \n[I121 M. V. Wilkes, Time-sharing Computer Systems. New York, \nNY: American Elsevier, 1972. \n[I131 H. C. Williams, “A modification of the RSA public-keycryp- \ntosystem,” IEEE Trans. Informat. Theory, vol. IT-26, no. 6, pp. \n[I141 -, Eds., Advances in Cryptology-CRYPT0 ’85. Berlin, \nGermany: Springer-Verlag, 1986. \n[I151 S. Wolfram, “Cryptography with cellular automata,” in \n[I161 M. C. Wunderlich, “Recent advances in the design and \nimplementation of large integer factorization algorithms,” \nin 7983Symp. on Securityandfrivacy, (Oakland, CA, pp. 67- \n71, Apr. 25-27,1983. \n[I17 K. Yiu and K. Peterson, “A single-chip VLSl implementation \nof the discrete exponential public key distribution system,” \nin GOMAC (Government Microcircuit Applications Confer- \nence), (Orlan

In [12]:
pdf_data[17].metadata

{'source': '../files/public_key_crytptography.pdf', 'page': 17}

In [13]:
# Read the repositories CSV into a polars DataFrame and preview its first rows.
df = pl.read_csv("../files/repos_cairo.csv")
df.head()

repo_name,repo_owner,repo_updated_at,repo_created_at,repo_stargazers_count,repo_forks
str,str,str,str,i64,i64
"""kkrt-labs/kaka…","""kkrt-labs""","""2023-06-10T16:…","""2022-10-04T14:…",453,93
"""ZeroSync/ZeroS…","""ZeroSync""","""2023-06-09T15:…","""2022-07-08T14:…",290,29
"""starknet-edu/s…","""starknet-edu""","""2023-06-07T23:…","""2022-07-05T15:…",259,131
"""shramee/starkl…","""shramee""","""2023-06-09T13:…","""2023-01-05T10:…",249,101
"""keep-starknet-…","""keep-starknet-…","""2023-06-09T09:…","""2022-11-25T08:…",115,42


In [14]:
# DataFrameLoader is handed a pandas frame here, hence `to_pandas()`. "repo_name" becomes the
# page content; the printed documents below show the other columns landing in the metadata.
loader = DataFrameLoader(df.to_pandas(), page_content_column="repo_name")
data = loader.load()

In [15]:
print(f"The type of the file is {type(data)} and its length is {len(data)}")

The type of the file is <class 'list'> and its length is 25


In [16]:
from pprint import pprint

pprint(data[:5])

[Document(page_content='kkrt-labs/kakarot', metadata={'repo_owner': 'kkrt-labs', 'repo_updated_at': '2023-06-10T16:12:50Z', 'repo_created_at': '2022-10-04T14:33:18Z', 'repo_stargazers_count': 453, 'repo_forks': 93}),
 Document(page_content='ZeroSync/ZeroSync', metadata={'repo_owner': 'ZeroSync', 'repo_updated_at': '2023-06-09T15:19:11Z', 'repo_created_at': '2022-07-08T14:56:27Z', 'repo_stargazers_count': 290, 'repo_forks': 29}),
 Document(page_content='starknet-edu/starknet-cairo-101', metadata={'repo_owner': 'starknet-edu', 'repo_updated_at': '2023-06-07T23:08:37Z', 'repo_created_at': '2022-07-05T15:00:25Z', 'repo_stargazers_count': 259, 'repo_forks': 131}),
 Document(page_content='shramee/starklings-cairo1', metadata={'repo_owner': 'shramee', 'repo_updated_at': '2023-06-09T13:06:27Z', 'repo_created_at': '2023-01-05T10:04:40Z', 'repo_stargazers_count': 249, 'repo_forks': 101}),
 Document(page_content='keep-starknet-strange/alexandria', metadata={'repo_owner': 'keep-starknet-strange', 

In [17]:
class TransformerDocsJSONLoader:
    """Loader that turns a JSON Lines file into a list of `Document` objects.

    For every record, the "text" field becomes the page content and the
    "title", "repo_owner" and "repo_name" fields are copied into the metadata.
    Any of those keys missing from a record falls back to an empty string.
    """

    def __init__(self, filepath: str):
        """Store the path of the JSON Lines file; nothing is read until `load()`."""
        self.filepath = filepath

    def load(self) -> list[Document]:
        """Read the file and return one `Document` per record, in file order."""
        metadata_fields = ("title", "repo_owner", "repo_name")
        with jsonlines.open(self.filepath) as records:
            return [
                Document(
                    page_content=record.get("text", ""),
                    metadata={field: record.get(field, "") for field in metadata_fields},
                )
                for record in records
            ]

In [18]:
# Run the custom JSON Lines loader on the transformers docs file; `data` is rebound to its result.
loader = TransformerDocsJSONLoader("../files/transformers-docs.jsonl")
data = loader.load()

In [19]:
# Preview only the first few documents: printing the whole corpus floods the
# notebook output. A distinct loop name avoids rebinding the earlier `doc` example.
for transformer_doc in data[:3]:
    print(transformer_doc)

page_content='<!--Copyright 2022 The HuggingFace Team. All rights reserved.\n\nLicensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with\nthe License. You may obtain a copy of the License at\n\nhttp://www.apache.org/licenses/LICENSE-2.0\n\nUnless required by applicable law or agreed to in writing, software distributed under the License is distributed on\nan "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the\nspecific language governing permissions and limitations under the License.\n-->\n\n# Distributed training with 🤗 Accelerate\n\nAs models get bigger, parallelism has emerged as a strategy for training larger models on limited hardware and accelerating training speed by several orders of magnitude. At Hugging Face, we created the [🤗 Accelerate](https://huggingface.co/docs/accelerate) library to help users easily train a 🤗 Transformers model on any type of distributed

In [20]:
# Split the PDF documents into chunks: target size 1000 and overlap 100,
# both measured with `len` (i.e. in characters).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    length_function=len,
)
documents = text_splitter.split_documents(pdf_data)

In [21]:
len(pdf_data)

18

In [22]:
len(documents)

128

In [23]:
documents[50]

Document(page_content='one system were in use, only one certification study would \nbe required. As certification is the most fundamental and \nmost difficult problem in cryptography, this seemed to be \nwhere the real savings lay. \nIn time I saw the folly of this view. Theorems or not, it \nseemed silly to expect that adding a major new criterion to \nthe requirements for a cryptographic system could fail to \nslow it down. The designer would always have more lati- \ntudewith systems that did not have to satisfythe public key \nproperty and some of these would doubtless be faster. Even \nmore compelling was the realization that modes of oper- \nation incompatible with the public-key property are essen- \ntial in many communication channels. \n566 PROCEEDINGS OF THE IEEE, VOL. 76, NO. 5, MAY 1988', metadata={'source': '../files/public_key_crytptography.pdf', 'page': 6})

# Embeddings with OpenAI

In [24]:
import os
from langchain.embeddings import OpenAIEmbeddings
import tiktoken

In [25]:
# Read the key from the environment so it is never written in the notebook source;
# raises KeyError if OPENAI_API_KEY is not set.
openai_api_key = os.environ["OPENAI_API_KEY"]

In [26]:
# No key is passed explicitly here; presumably the client picks it up from the
# OPENAI_API_KEY environment variable (confirm against the library docs).
embedding_openai = OpenAIEmbeddings(model="text-embedding-ada-002")

  warn_deprecated(


In [27]:
# Show only a non-sensitive setting: the full repr of the embeddings object
# includes `openai_api_key` in plain text, which then gets saved with the notebook output.
embedding_openai.model

'text-embedding-ada-002'

In [28]:
# A handful of short Spanish sentences used as a toy corpus for the embedding calls.
documents_list = [
    "Hola Parce",
    "Uy, Hola",
    "Como te llamas?",
    "Mis parceros me llaman Marlon",
    "Hola de nuevo",
]

# Embed the whole list in one call; expected to return one vector per input string.
embeddings = embedding_openai.embed_documents(documents_list)

In [29]:
len(embeddings[0])

1536

In [30]:
embedding_query = embedding_openai.embed_query(documents_list[0])
# Peek at the first components only; the full vector is long and would flood the output.
embedding_query[:5]

[-0.01733691178168973,
 -0.0020069060069858096,
 0.0041171720519443036,
 -0.02144374801610127,
 -0.029051041968013627]

# Embeddings with Hugging Face

In [31]:
from langchain.embeddings import SentenceTransformerEmbeddings

In [32]:
# Local sentence-transformers model (multilingual MiniLM); the recorded run below
# yields 384-dimensional vectors.
embeddings_st = SentenceTransformerEmbeddings(
    model_name="sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
)

caused by: ["[Errno 2] The file to load file system plugin from does not exist.: '/Users/mmenendezg/Developer/Platzi/.venv/lib/python3.11/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so'"]
caused by: ["dlopen(/Users/mmenendezg/Developer/Platzi/.venv/lib/python3.11/site-packages/tensorflow_io/python/ops/libtensorflow_io.so, 0x0006): tried: '/Users/mmenendezg/Developer/Platzi/.venv/lib/python3.11/site-packages/tensorflow_io/python/ops/libtensorflow_io.so' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/Users/mmenendezg/Developer/Platzi/.venv/lib/python3.11/site-packages/tensorflow_io/python/ops/libtensorflow_io.so' (no such file), '/Users/mmenendezg/Developer/Platzi/.venv/lib/python3.11/site-packages/tensorflow_io/python/ops/libtensorflow_io.so' (no such file)"]


In [33]:
embeddings = embeddings_st.embed_documents(documents_list)

In [34]:
len(embeddings)

5

In [35]:
len(embeddings[0])

384

In [36]:
embedding = embeddings_st.embed_query(documents_list[0])

In [37]:
len(embedding)

384

In [38]:
from langchain.embeddings import HuggingFaceInstructEmbeddings

In [39]:
# Instructor embedding model; the recorded run below yields 768-dimensional vectors.
embedding_instruct = HuggingFaceInstructEmbeddings(
    model_name="hkunlp/instructor-large",
)

load INSTRUCTOR_Transformer
max_seq_length  512


In [40]:
embeddings = embedding_instruct.embed_documents(documents_list)
len(embeddings)

5

In [41]:
len(embeddings[0])

768

In [42]:
embeddings_st.client, embedding_instruct.client

(SentenceTransformer(
   (0): Transformer({'max_seq_length': 128, 'do_lower_case': False}) with Transformer model: BertModel 
   (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
 ),
 INSTRUCTOR(
   (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: T5EncoderModel 
   (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False})
   (2): Dense({'in_features': 1024, 'out_features': 768, 'bias': False, 'activation_function': 'torch.nn.modules.linear.Identity'})
   (3): Normalize()
 ))

# Chroma

In [43]:
from langchain.vectorstores import Chroma

In [47]:
type(documents_list[0])

str

In [48]:
# Directory where the Chroma collection is stored on disk.
name_chroma_index = "instruct-embeddings-public-crypto"

# Embed the PDF chunks with the Instructor model and index them in Chroma.
# NOTE(review): re-running this cell against an existing persist directory presumably
# adds the same chunks again; confirm, and guard or clear the directory if so.
vectorstore_chroma = Chroma.from_documents(
    documents=documents,
    embedding=embedding_instruct,
    persist_directory=name_chroma_index,
)

In [50]:
# Make the db persistent: write the collection to `persist_directory` so it can be reopened later.
vectorstore_chroma.persist()

In [52]:
# Reopen the collection from the persist directory, passing the same embedding
# object that was used to build it.
vectorstore_chroma = Chroma(
    persist_directory=name_chroma_index,
    embedding_function=embedding_instruct,
)

In [54]:
query = "What is the public key cryptography?"

# Ask for the 3 closest chunks; each result is a tuple whose first element is the
# matching Document (see `docs[2]` below).
docs = vectorstore_chroma.similarity_search_with_score(query, k=3)

In [58]:
docs[2]

(Document(page_content='the corresponding secret key, but this is of no help to \nhim in creating (forging) a message with this property. \nThe first aspect of public-key cryptography greatly sim- \nplifies the management of keys, especially in large com- \nmunication networks. In order for a pair of subscribers to \ncommunicate privately using conventional end-to-end \ncryptography, they must both have copies of the same cryp- \ntographic key and this key must be kept secret from anyone \nthey do not wish to take into their confidence. If a network \nhas only a few subscribers, each person simply stores one \nkey for every other subscriber against the day he will need \nit, but for a large network, this is impractical. \nIn a network with n subscribers there are n(n - 1)/2 pairs, \neach of which may require a key. This amounts to five thou- \nsand keys in a network with only a hundred subscribers, \nhalf a million in a network with one thousand, and twenty \nmillion billion in a netwo

## Create a Retriever

In [59]:
# Wrap the vector store as a retriever, passing k=2 as its search argument.
retriever_chroma = vectorstore_chroma.as_retriever(search_kwargs={"k": 2})

In [60]:
retriever_chroma.get_relevant_documents("What are the recent advances on public key cryptography?")

[Document(page_content='The First Ten Years of Public-Key \nCryptography \nWH lTFl ELD DI FFlE \nInvited Paper \nPublic-key cryptosystems separate the capacities for encryption \nand decryption so that 7) many people can encrypt messages in \nsuch a way that only one person can read them, or 2) one person \ncan encrypt messages in such a way that many people can read \nthem. This separation allows important improvements in the man- \nagement of cryptographic keys and makes it possible to ‘sign’ a \npurely digital message. \nPublic key cryptography was discovered in the Spring of 1975 \nand has followed a surprising course. Although diverse systems \nwere proposed early on, the ones that appear both practical and \nsecure today are all very closely related and the search for new and \ndifferent ones has met with little success. Despite this reliance on \na limited mathematical foundation public-key cryptography is rev- \nolutionizing communication security by making possible secure', me

In [61]:
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.chat_models import ChatOpenAI

# Chat model used to write the answers; temperature 0.0 is set explicitly.
llm = ChatOpenAI(
    model_name="gpt-3.5-turbo-1106",
    temperature=0.0,
    openai_api_key=openai_api_key,
)

# Question-answering chain built on the Chroma retriever with chain type "stuff";
# its responses carry a 'sources' entry next to the 'answer'.
qa_chain_with_resources = RetrievalQAWithSourcesChain.from_chain_type(
    llm=llm,
    retriever=retriever_chroma,
    chain_type="stuff",
)

  warn_deprecated(


In [62]:
# Ask a question about the indexed paper; the response is a dict with
# 'question', 'answer' and 'sources' keys (see the printed output).
query = "What is the relevance of public key crypto?"
response = qa_chain_with_resources(query)
print(response)

{'question': 'What is the relevance of public key crypto?', 'answer': 'Public key cryptography revolutionizes communication security by allowing secure communication networks with hundreds of thousands of subscribers and providing a systematic means of addressing a broad range of security objectives. It also allows the development of cryptographic protocols with proven security characteristics.\n', 'sources': '../files/public_key_crytptography.pdf'}


In [66]:
# A much vaguer question. NOTE(review): in the recorded output 'sources' is empty,
# so this answer may not be grounded in the retrieved chunks; worth checking.
query = "What is crypto?"
response = qa_chain_with_resources(query)
print(response)

{'question': 'What is crypto?', 'answer': 'Crypto is short for cryptography, which is the practice and study of techniques for secure communication in the presence of third parties.', 'sources': ''}
