Commit

embeddings
manufy committed Jun 6, 2024
1 parent 40d5c92 commit 2871089
Showing 11 changed files with 290 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -16,5 +16,7 @@ old
*.jpg
*photo*
*.png
*.pdf



@@ -0,0 +1,32 @@

from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("The One Page Linux Manual.pdf")
pages = loader.load_and_split()

# By loading the document, we can ask more specific questions related to the
# subject, which helps minimize the likelihood of LLM hallucinations and
# ensures more accurate, context-driven responses.

from langchain.text_splitter import CharacterTextSplitter

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
texts = text_splitter.split_documents(pages)

print(texts[0])

print(f"You have {len(texts)} documents")
print("Preview:")
print(texts[0].page_content)


# No universal approach to chunking text fits all scenarios;
# what's effective for one case might not be suitable for another.
# Finding the best chunk size for your project means going through a few steps.
# First, clean up your data by getting rid of anything that's not needed,
# like HTML tags from websites. Then, pick a few different chunk sizes to test.
# The best size will depend on what kind of data you're working with and
# the model you're using. Finally, test out how well each size works
# by running some queries and comparing the results. You might need to try
# a few different sizes before finding the best one. This process might
# take some time, but getting the best results from your project is worth it.
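
# A minimal sketch of that evaluation loop; the candidate sizes below are
# hypothetical, so substitute ones that suit your data:
for size in (500, 1000, 2000):
    splitter = CharacterTextSplitter(chunk_size=size, chunk_overlap=20)
    print(f"chunk_size={size} -> {len(splitter.split_documents(pages))} chunks")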
@@ -0,0 +1,46 @@

# The Recursive Character Text Splitter is a text splitter designed
# to split the text into chunks based on a list of characters provided.
# It attempts to split text using the characters from a list in order
# until the resulting chunks are small enough. By default, the list
# of characters used for splitting is ["\n\n", "\n", " ", ""].

# To use the RecursiveCharacterTextSplitter, you can create an instance
# of it and provide the following parameters:
# chunk_size : The maximum size of the chunks,
# as measured by the length_function (default is 100).
# chunk_overlap: The maximum overlap between chunks to maintain continuity
# between them (default is 20).
# length_function: used to calculate the length of the chunks.
# By default, it is set to len, which counts the number of characters
# in a chunk. However, you can also pass a token counter or any other
# function that calculates the length of a chunk based on your requirements.
# Using a token counter instead of the default len function
# can benefit specific scenarios, such as when working with language models
# that have token limits. For example, gpt-3.5-turbo has a context window of
# 4,096 tokens, so you might want to count tokens instead of characters
# to better manage and optimize your requests.

from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

loader = PyPDFLoader("The One Page Linux Manual.pdf")
pages = loader.load_and_split()

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=50,
    chunk_overlap=10,
    length_function=len,
)

docs = text_splitter.split_documents(pages)
for doc in docs:
    print(doc)

# In this example, the text is loaded from a file,
# and the RecursiveCharacterTextSplitter is used to split it into chunks
# with a maximum size of 50 characters and an overlap of 10 characters.
# The output will be a list of documents containing the split text.

# To use a token counter, you can create a custom function that calculates
# the number of tokens in a given text and pass it as the length_function parameter.
# This will ensure that your text splitter calculates the length of chunks based
# on the number of tokens instead of the number of characters.
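
# A sketch of such a token counter, assuming the tiktoken package is installed:
import tiktoken

encoding = tiktoken.get_encoding("cl100k_base")

def count_tokens(text: str) -> int:
    # Chunk length measured in tokens rather than characters
    return len(encoding.encode(text))

token_splitter = RecursiveCharacterTextSplitter(
    chunk_size=50,
    chunk_overlap=10,
    length_function=count_tokens,
)
token_docs = token_splitter.split_documents(pages)
print(f"Token-based split produced {len(token_docs)} documents")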
@@ -0,0 +1,14 @@
# The NLTKTextSplitter in LangChain is an implementation
# of a text splitter that uses the Natural Language Toolkit (NLTK) library
# to split text based on tokenizers. The goal is to split long texts
# into smaller chunks without breaking the structure of sentences and paragraphs.

from langchain.text_splitter import NLTKTextSplitter
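
# NLTKTextSplitter relies on NLTK's "punkt" sentence tokenizer data,
# so fetch it up front (a no-op if it is already installed):
import nltk
nltk.download('punkt')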

# Load a long document
with open('/home/cloudsuperadmin/scrape-chain/langchain/LLM.txt', encoding='unicode_escape') as f:
    sample_text = f.read()

text_splitter = NLTKTextSplitter(chunk_size=500)
texts = text_splitter.split_text(sample_text)
print(texts)
@@ -0,0 +1,14 @@
from langchain.text_splitter import SpacyTextSplitter
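
# Note: SpacyTextSplitter loads spaCy's "en_core_web_sm" pipeline by default;
# if it is missing, install it with: python -m spacy download en_core_web_sm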

# Load a long document
with open('/home/cloudsuperadmin/scrape-chain/langchain/LLM.txt', encoding='unicode_escape') as f:
    sample_text = f.read()

# Instantiate the SpacyTextSplitter with the desired chunk size
text_splitter = SpacyTextSplitter(chunk_size=500, chunk_overlap=20)

# Split the text using SpacyTextSplitter
texts = text_splitter.split_text(sample_text)

# Print the first chunk
print(texts[0])
@@ -0,0 +1,41 @@
from langchain.text_splitter import MarkdownTextSplitter

markdown_text = """
# Welcome to My Blog!
## Introduction
Hello everyone! My name is **John Doe** and I am a _software developer_. I specialize in Python, Java, and JavaScript.
Here's a list of my favorite programming languages:
1. Python
2. JavaScript
3. Java
You can check out some of my projects on [GitHub](https://github.com).
## About this Blog
In this blog, I will share my journey as a software developer. I'll post tutorials, my thoughts on the latest technology trends, and occasional book reviews.
Here's a small piece of Python code to say hello:
``` python
def say_hello(name):
    print(f"Hello, {name}!")

say_hello("John")
```
Stay tuned for more updates!
## Contact Me
Feel free to reach out to me on [Twitter](https://twitter.com) or send me an email at johndoe@email.com.
"""

markdown_splitter = MarkdownTextSplitter(chunk_size=100, chunk_overlap=0)
docs = markdown_splitter.create_documents([markdown_text])

print(docs)
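
# Each returned Document holds a chunk that follows the Markdown structure
# (headings, lists, code blocks) where possible; inspect them one by one:
for doc in docs:
    print(doc.page_content)
    print("---")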
@@ -0,0 +1,12 @@
from langchain.text_splitter import TokenTextSplitter

# Load a long document
with open('/home/cloudsuperadmin/scrape-chain/langchain/LLM.txt', encoding='unicode_escape') as f:
    sample_text = f.read()

# Initialize the TokenTextSplitter with desired chunk size and overlap
text_splitter = TokenTextSplitter(chunk_size=100, chunk_overlap=50)

# Split into smaller chunks
texts = text_splitter.split_text(sample_text)
print(texts[0])
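
# Note: TokenTextSplitter measures chunk_size and chunk_overlap in tokens
# (tiktoken's "gpt2" encoding by default), not in characters.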
@@ -0,0 +1,34 @@
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
# from langchain.embeddings import OpenAIEmbeddings  # deprecated import path
from langchain_openai import OpenAIEmbeddings

# Requires the OPENAI_API_KEY environment variable to be set.

# Define the documents
documents = [
    "The cat is on the mat.",
    "There is a cat on the mat.",
    "The dog is in the yard.",
    "There is a dog in the yard.",
]

# Initialize the OpenAIEmbeddings instance
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

# Generate embeddings for the documents
document_embeddings = embeddings.embed_documents(documents)

# Perform a similarity search for a given query
query = "A cat is sitting on a mat."
query_embedding = embeddings.embed_query(query)

# Calculate similarity scores
similarity_scores = cosine_similarity([query_embedding], document_embeddings)[0]

# Find the most similar document
most_similar_index = np.argmax(similarity_scores)
most_similar_document = documents[most_similar_index]

print(f"Most similar document to the query '{query}':")
print(most_similar_document)
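
# Optionally print every score; the two "cat" sentences should rank highest:
for doc, score in zip(documents, similarity_scores):
    print(f"{score:.4f}  {doc}")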

@@ -0,0 +1,19 @@
#from langchain.llms import HuggingFacePipeline
#from langchain.embeddings import HuggingFaceEmbeddings
#from langchain_community.embeddings import HuggingFaceEmbeddings
#from langchain_community.llms import HuggingFacePipeline

from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2", cache_folder="./custom_cache")

model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {'device': 'cpu'}
# hf = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)
#hf = HuggingFacePipeline(model_name=model_name, model_kwargs=model_kwargs)

documents = ["Document 1", "Document 2", "Document 3"]
doc_embeddings = model.encode(documents)
#doc_embeddings = model.embed_documents(documents)
print(doc_embeddings)
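
# A usage sketch: sentence-transformers ships a cosine-similarity helper,
# so any two of the embeddings above can be compared directly:
from sentence_transformers import util
print(util.cos_sim(doc_embeddings[0], doc_embeddings[1]))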
@@ -0,0 +1,29 @@
# from langchain.embeddings import CohereEmbeddings  # deprecated import path
from langchain_community.embeddings import CohereEmbeddings

# Initialize the CohereEmbeddings object
# (named `embeddings` so it does not shadow the `cohere` module name)
embeddings = CohereEmbeddings(
    model="embed-multilingual-v2.0",
    cohere_api_key="your_cohere_api_key"
)

# Define a list of texts
texts = [
    "Hello from Cohere!",
    "مرحبًا من كوهير!",
    "Hallo von Cohere!",
    "Bonjour de Cohere!",
    "¡Hola desde Cohere!",
    "Olá do Cohere!",
    "Ciao da Cohere!",
    "您好,来自 Cohere!",
    "कोहेरे से नमस्ते!"
]

# Generate embeddings for the texts
document_embeddings = embeddings.embed_documents(texts)

# Print the embeddings
for text, embedding in zip(texts, document_embeddings):
    print(f"Text: {text}")
    print(f"Embedding: {embedding[:5]}")  # print first 5 dimensions of each embedding
@@ -0,0 +1,47 @@
#from langchain.embeddings.openai import OpenAIEmbeddings
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import DeepLake
from langchain.text_splitter import RecursiveCharacterTextSplitter
#from langchain_community.chat_models import ChatOpenAI
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA

# create our documents
texts = [
    "Napoleon Bonaparte was born on 15 August 1769",
    "Louis XIV was born on 5 September 1638",
    "Lady Gaga was born on 28 March 1986",
    "Michael Jeffrey Jordan was born on 17 February 1963"
]
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.create_documents(texts)

# initialize embeddings model
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

# create Deep Lake dataset
# TODO: use your organization id here. (by default, org id is your username)
import os
my_activeloop_org_id = os.environ["ACTIVELOOP_ORG_ID"]
my_activeloop_dataset_name = "langchain_course_embeddings"
dataset_path = f"hub://{my_activeloop_org_id}/{my_activeloop_dataset_name}"
#db = DeepLake(dataset_path=dataset_path, embedding_function=embeddings)
db = DeepLake(dataset_path=dataset_path, embedding=embeddings)

# add documents to our Deep Lake dataset
db.add_documents(docs)


# create retriever from db
retriever = db.as_retriever()

# instantiate the LLM wrapper
model = ChatOpenAI(model='gpt-3.5-turbo')

# create the question-answering chain
qa_chain = RetrievalQA.from_llm(model, retriever=retriever)

# ask a question to the chain
#qa_chain.run("When was Michael Jordan born?")
response = qa_chain.invoke("When was Michael Jordan born?")
print(response)
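
# You can also query the vector store directly, bypassing the QA chain;
# a quick sanity check against one of the documents above:
results = db.similarity_search("When was Napoleon born?", k=1)
print(results[0].page_content)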
