# Concept 05: Embeddings and VectorStores

> Watch Video from 9:00 to 11:40

> https://www.youtube.com/watch?v=aywZrzNaKjs

In [1]:
# Sample text used throughout this notebook: a plain-language explanation of
# autoencoders. Later cells split it into chunks, embed the chunks, and run a
# similarity search over them.
explanation: str = """An autoencoder is like a machine that helps computers learn about the world. It is a type of artificial neural network, which is a type of computer program that can learn from experience.

An autoencoder works by taking some input, like a picture of a dog, and then trying to make the output look the same as the input. It does this by breaking the picture down into small pieces and then putting them back together in a slightly different way. This process is called “encoding”.

The autoencoder then takes the encoded image and tries to make it look as close as possible to the original input. This process is called “decoding”. The autoencoder is then able to learn how to make the picture look more like the original input.

This type of artificial neural network is called an unsupervised autoencoder because it does not need someone to tell it what the correct output should look like. It is able to figure it out by itself.

The autoencoder creates a “latent space”, which is a place where the data is stored in a way that the computer can understand.
"""

In [2]:
# Split the explanation text into small document chunks.

from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunks are capped at 100 characters and do not overlap one another.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=100, chunk_overlap=0)

# create_documents takes a list of raw strings; only one text is supplied here.
texts = text_splitter.create_documents([explanation])


In [3]:
# Each chunk is a document object whose raw text is exposed through the
# "page_content" attribute; display the text of the first chunk.

texts[0].page_content

'An autoencoder is like a machine that helps computers learn about the world. It is a type of'

In [4]:
# Set up the OpenAI embedding client used for the rest of the notebook.
# Error correction reference: https://stackoverflow.com/questions/76697130/open-ai-embeddings

from langchain.embeddings import OpenAIEmbeddings

# NOTE(review): embedding_ctx_length=1024 presumably follows the fix in the
# Stack Overflow thread linked above; confirm it is still required.
embeddings = OpenAIEmbeddings(
    model="text-embedding-ada-002",
    embedding_ctx_length=1024,
)

In [5]:
# Embed the text of the first chunk and show the resulting vector.

first_chunk_text = texts[0].page_content
query_result = embeddings.embed_query(first_chunk_text)
print(query_result)
     

[-0.04706957295545045, -0.00740398051868516, 0.011905600569737615, -0.01384708951486003, -0.004501620516713716, 0.011780555620059632, 0.00761458280878881, -0.016716542632516405, -0.009924624481024144, -0.03975115117417474, 0.012866473023570428, 0.050518182021645916, -0.005840918182049104, -0.018322384803518447, 0.0022672634676837618, 0.004139648048876784, -0.008904520846143486, 0.013991878501994803, 0.005199239758671021, 0.0011410357478881768, -0.03072158444703075, 0.02288981769187852, -0.0033367274290370236, -0.034933624661168616, -0.00471551316928599, 0.005788267725938506, 0.023863852526908348, -0.055598959883882505, -0.000698853535336642, -0.009141448364302434, 0.01700612060678595, -0.020441568223106922, 0.008062112617051408, -0.047806681203643855, -0.022521263567781814, -0.031563991744800304, 0.006696489257999461, -0.013287677603777727, 0.01626901422123759, -0.006015323225369061, 0.022297498430819886, 0.015413442679625971, -0.02058635721024169, -0.017888017842114128, -0.00237256437

In [6]:
# Number of components in the embedding vector. The recorded value (1536) is
# the same number passed as `dimension` when the Pinecone index is created
# later in this notebook.
len(query_result)

1536

In [15]:
import os
import pinecone
from langchain.vectorstores import Pinecone

# Initialise the Pinecone client from environment variables so that no
# credentials are stored in the notebook. os.environ.get returns None when
# a variable is not set.
pinecone.init(api_key=os.environ.get("PINECONE_API_KEY"), environment=os.environ.get("PINECONE_ENV"))

In [16]:

# Name of the Pinecone index; used below when checking for / creating the
# index and when uploading the document chunks.
index_name = "langchain-quickstart"

In [17]:
# Create the index only if it is not already present in this Pinecone project.
existing_indexes = pinecone.list_indexes()
if index_name not in existing_indexes:
    # dimension=1536 matches the length of the embedding vector computed above.
    pinecone.create_index(name=index_name, dimension=1536, metric="cosine")

In [18]:
# Embed the document chunks and upload the vectors to Pinecone.
# https://docs.pinecone.io/docs/indexes
# https://docs.pinecone.io/docs/choosing-index-type-and-size
# Alternatively, create an index named "langchain-quickstart" in the Pinecone
# web app with 1536 dimensions (the length of the embedding vectors used here).

search = Pinecone.from_documents(
    texts,
    embeddings,
    index_name=index_name,
)

In [19]:
# Run a basic vector similarity search against the uploaded chunks and show
# the matching documents.

query = "What is magical about an autoencoder?"

# The question is embedded and compared with the stored chunk vectors.
result = search.similarity_search(query)
print(result)
     

[Document(page_content='The autoencoder creates a “latent space”, which is a place where the data is stored in a way that'), Document(page_content='An autoencoder is like a machine that helps computers learn about the world. It is a type of'), Document(page_content='The autoencoder then takes the encoded image and tries to make it look as close as possible to the'), Document(page_content='original input. This process is called “decoding”. The autoencoder is then able to learn how to')]
