Commit

embeddings
manufy committed Jun 6, 2024
1 parent 40d5c92 commit 2871089
Showing 11 changed files with 290 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .gitignore
@@ -16,5 +16,7 @@ old
*.jpg
*photo*
*.png
*.pdf



@@ -0,0 +1,32 @@

from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("The One Page Linux Manual.pdf")
pages = loader.load_and_split()

# By loading the document, we can ask more specific questions related to the
# subject, which helps minimize the likelihood of LLM hallucinations and
# ensures more accurate, context-driven responses.

from langchain.text_splitter import CharacterTextSplitter

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
texts = text_splitter.split_documents(pages)

print(texts[0])

print(f"You have {len(texts)} documents")
print("Preview:")
print(texts[0].page_content)


# No universal approach to chunking text fits all scenarios;
# what's effective for one case might not be suitable for another.
# Finding the best chunk size for your project means going through a few steps.
# First, clean up your data by getting rid of anything that's not needed,
# like HTML tags from websites. Then, pick a few different chunk sizes to test.
# The best size will depend on what kind of data you're working with and
# the model you're using. Finally, test out how well each size works
# by running some queries and comparing the results. You might need to try
# a few different sizes before finding the best one. This process might
# take some time, but getting the best results from your project is worth it.
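
# A minimal sketch of that evaluation loop; the candidate sizes below are
# hypothetical, so substitute ones that suit your data:
for size in (500, 1000, 2000):
    splitter = CharacterTextSplitter(chunk_size=size, chunk_overlap=20)
    print(f"chunk_size={size} -> {len(splitter.split_documents(pages))} chunks")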
@@ -0,0 +1,46 @@

# The Recursive Character Text Splitter is a text splitter designed
# to split the text into chunks based on a list of characters provided.
# It attempts to split text using the characters from a list in order
# until the resulting chunks are small enough. By default, the list
# of characters used for splitting is ["\n\n", "\n", " ", ""].

# To use the RecursiveCharacterTextSplitter, you can create an instance
# of it and provide the following parameters:
# chunk_size : The maximum size of the chunks,
# as measured by the length_function (default is 100).
# chunk_overlap: The maximum overlap between chunks to maintain continuity
# between them (default is 20).
# length_function: used to calculate the length of the chunks.
# By default, it is set to len, which counts the number of characters
# in a chunk. However, you can also pass a token counter or any other
# function that calculates the length of a chunk based on your requirements.
# Using a token counter instead of the default len function
# can benefit specific scenarios, such as when working with language models
# that have token limits. For example, gpt-3.5-turbo has a context window of
# 4,096 tokens, so you might want to count tokens instead of characters
# to better manage and optimize your requests.

from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

loader = PyPDFLoader("The One Page Linux Manual.pdf")
pages = loader.load_and_split()

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=50,
    chunk_overlap=10,
    length_function=len,
)

docs = text_splitter.split_documents(pages)
for doc in docs:
    print(doc)

# In this example, the text is loaded from a file,
# and the RecursiveCharacterTextSplitter is used to split it into chunks
# with a maximum size of 50 characters and an overlap of 10 characters.
# The output will be a list of documents containing the split text.

# To use a token counter, you can create a custom function that calculates
# the number of tokens in a given text and pass it as the length_function parameter.
# This will ensure that your text splitter calculates the length of chunks based
# on the number of tokens instead of the number of characters.
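
# A sketch of such a token counter, assuming the tiktoken package is installed:
import tiktoken

encoding = tiktoken.get_encoding("cl100k_base")

def count_tokens(text: str) -> int:
    # Chunk length measured in tokens rather than characters
    return len(encoding.encode(text))

token_splitter = RecursiveCharacterTextSplitter(
    chunk_size=50,
    chunk_overlap=10,
    length_function=count_tokens,
)
token_docs = token_splitter.split_documents(pages)
print(f"Token-based split produced {len(token_docs)} documents")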
@@ -0,0 +1,14 @@
# The NLTKTextSplitter in LangChain is an implementation
# of a text splitter that uses the Natural Language Toolkit (NLTK) library
# to split text based on tokenizers. The goal is to split long texts
# into smaller chunks without breaking the structure of sentences and paragraphs.

from langchain.text_splitter import NLTKTextSplitter
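
# NLTKTextSplitter relies on NLTK's "punkt" sentence tokenizer data,
# so fetch it up front (a no-op if it is already installed):
import nltk
nltk.download('punkt')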

# Load a long document
with open('/home/cloudsuperadmin/scrape-chain/langchain/LLM.txt', encoding='unicode_escape') as f:
    sample_text = f.read()

text_splitter = NLTKTextSplitter(chunk_size=500)
texts = text_splitter.split_text(sample_text)
print(texts)
@@ -0,0 +1,14 @@
from langchain.text_splitter import SpacyTextSplitter
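
# Note: SpacyTextSplitter loads spaCy's "en_core_web_sm" pipeline by default;
# if it is missing, install it with: python -m spacy download en_core_web_sm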

# Load a long document
with open('/home/cloudsuperadmin/scrape-chain/langchain/LLM.txt', encoding='unicode_escape') as f:
    sample_text = f.read()

# Instantiate the SpacyTextSplitter with the desired chunk size
text_splitter = SpacyTextSplitter(chunk_size=500, chunk_overlap=20)

# Split the text using SpacyTextSplitter
texts = text_splitter.split_text(sample_text)

# Print the first chunk
print(texts[0])
@@ -0,0 +1,41 @@
from langchain.text_splitter import MarkdownTextSplitter

markdown_text = """
# Welcome to My Blog!
## Introduction
Hello everyone! My name is **John Doe** and I am a _software developer_. I specialize in Python, Java, and JavaScript.
Here's a list of my favorite programming languages:
1. Python
2. JavaScript
3. Java
You can check out some of my projects on [GitHub](https://github.com).
## About this Blog
In this blog, I will share my journey as a software developer. I'll post tutorials, my thoughts on the latest technology trends, and occasional book reviews.
Here's a small piece of Python code to say hello:
``` python
def say_hello(name):
    print(f"Hello, {name}!")

say_hello("John")
```
Stay tuned for more updates!
## Contact Me
Feel free to reach out to me on [Twitter](https://twitter.com) or send me an email at johndoe@email.com.
"""

markdown_splitter = MarkdownTextSplitter(chunk_size=100, chunk_overlap=0)
docs = markdown_splitter.create_documents([markdown_text])

print(docs)
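
# Each returned Document holds a chunk that follows the Markdown structure
# (headings, lists, code blocks) where possible; inspect them one by one:
for doc in docs:
    print(doc.page_content)
    print("---")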
@@ -0,0 +1,12 @@
from langchain.text_splitter import TokenTextSplitter

# Load a long document
with open('/home/cloudsuperadmin/scrape-chain/langchain/LLM.txt', encoding='unicode_escape') as f:
    sample_text = f.read()

# Initialize the TokenTextSplitter with desired chunk size and overlap
text_splitter = TokenTextSplitter(chunk_size=100, chunk_overlap=50)

# Split into smaller chunks
texts = text_splitter.split_text(sample_text)
print(texts[0])
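
# Note: TokenTextSplitter measures chunk_size and chunk_overlap in tokens
# (tiktoken's "gpt2" encoding by default), not in characters.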
@@ -0,0 +1,34 @@
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
# from langchain.embeddings import OpenAIEmbeddings  # deprecated import path
from langchain_openai import OpenAIEmbeddings

# Requires the OPENAI_API_KEY environment variable to be set.

# Define the documents
documents = [
    "The cat is on the mat.",
    "There is a cat on the mat.",
    "The dog is in the yard.",
    "There is a dog in the yard.",
]

# Initialize the OpenAIEmbeddings instance
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

# Generate embeddings for the documents
document_embeddings = embeddings.embed_documents(documents)

# Perform a similarity search for a given query
query = "A cat is sitting on a mat."
query_embedding = embeddings.embed_query(query)

# Calculate similarity scores
similarity_scores = cosine_similarity([query_embedding], document_embeddings)[0]

# Find the most similar document
most_similar_index = np.argmax(similarity_scores)
most_similar_document = documents[most_similar_index]

print(f"Most similar document to the query '{query}':")
print(most_similar_document)
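
# Optionally print every score; the two "cat" sentences should rank highest:
for doc, score in zip(documents, similarity_scores):
    print(f"{score:.4f}  {doc}")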

@@ -0,0 +1,19 @@
#from langchain.llms import HuggingFacePipeline
#from langchain.embeddings import HuggingFaceEmbeddings
#from langchain_community.embeddings import HuggingFaceEmbeddings
#from langchain_community.llms import HuggingFacePipeline

from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/all-mpnet-base-v2", cache_folder="./custom_cache")

model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {'device': 'cpu'}
# hf = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)
#hf = HuggingFacePipeline(model_name=model_name, model_kwargs=model_kwargs)

documents = ["Document 1", "Document 2", "Document 3"]
doc_embeddings = model.encode(documents)
#doc_embeddings = model.embed_documents(documents)
print(doc_embeddings)
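
# A usage sketch: sentence-transformers ships a cosine-similarity helper,
# so any two of the embeddings above can be compared directly:
from sentence_transformers import util
print(util.cos_sim(doc_embeddings[0], doc_embeddings[1]))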
@@ -0,0 +1,29 @@
# from langchain.embeddings import CohereEmbeddings  # deprecated import path
from langchain_community.embeddings import CohereEmbeddings

# Initialize the CohereEmbeddings object
# (named `embeddings` so it does not shadow the `cohere` module name)
embeddings = CohereEmbeddings(
    model="embed-multilingual-v2.0",
    cohere_api_key="your_cohere_api_key"
)

# Define a list of texts
texts = [
    "Hello from Cohere!",
    "مرحبًا من كوهير!",
    "Hallo von Cohere!",
    "Bonjour de Cohere!",
    "¡Hola desde Cohere!",
    "Olá do Cohere!",
    "Ciao da Cohere!",
    "您好,来自 Cohere!",
    "कोहेरे से नमस्ते!"
]

# Generate embeddings for the texts
document_embeddings = embeddings.embed_documents(texts)

# Print the embeddings
for text, embedding in zip(texts, document_embeddings):
    print(f"Text: {text}")
    print(f"Embedding: {embedding[:5]}")  # print first 5 dimensions of each embedding
@@ -0,0 +1,47 @@
#from langchain.embeddings.openai import OpenAIEmbeddings
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import DeepLake
from langchain.text_splitter import RecursiveCharacterTextSplitter
#from langchain_community.chat_models import ChatOpenAI
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA

# create our documents
texts = [
    "Napoleon Bonaparte was born on 15 August 1769",
    "Louis XIV was born on 5 September 1638",
    "Lady Gaga was born on 28 March 1986",
    "Michael Jeffrey Jordan was born on 17 February 1963"
]
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.create_documents(texts)

# initialize embeddings model
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

# create Deep Lake dataset
# TODO: use your organization id here. (by default, org id is your username)
import os
my_activeloop_org_id = os.environ["ACTIVELOOP_ORG_ID"]
my_activeloop_dataset_name = "langchain_course_embeddings"
dataset_path = f"hub://{my_activeloop_org_id}/{my_activeloop_dataset_name}"
#db = DeepLake(dataset_path=dataset_path, embedding_function=embeddings)
db = DeepLake(dataset_path=dataset_path, embedding=embeddings)

# add documents to our Deep Lake dataset
db.add_documents(docs)


# create retriever from db
retriever = db.as_retriever()

# instantiate the LLM wrapper
model = ChatOpenAI(model='gpt-3.5-turbo')

# create the question-answering chain
qa_chain = RetrievalQA.from_llm(model, retriever=retriever)

# ask a question to the chain
#qa_chain.run("When was Michael Jordan born?")
response = qa_chain.invoke("When was Michael Jordan born?")
print(response)
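
# You can also query the vector store directly, bypassing the QA chain;
# a quick sanity check against one of the documents above:
results = db.similarity_search("When was Napoleon born?", k=1)
print(results[0].page_content)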
