Commit
Showing 11 changed files with 290 additions and 0 deletions.
@@ -16,5 +16,7 @@ old
*.jpg
*photo*
*.png
32 changes: 32 additions & 0 deletions
...-keeping-information-organized-with-indexes/3-text-splitters/0_character_text_splitter.py
@@ -0,0 +1,32 @@
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter

loader = PyPDFLoader("The One Page Linux Manual.pdf")
pages = loader.load_and_split()

# By loading the document this way, we can ask more specific questions
# related to its subject, which helps minimize the likelihood of LLM
# hallucinations and ensures more accurate, context-driven responses.

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=20)
texts = text_splitter.split_documents(pages)

print(texts[0])

print(f"You have {len(texts)} documents")
print("Preview:")
print(texts[0].page_content)

# No universal approach to chunking text fits all scenarios:
# what's effective for one case might not be suitable for another.
# Finding the best chunk size for your project means going through a
# few steps. First, clean up your data by removing anything that's not
# needed, such as HTML tags from websites. Then pick a few different
# chunk sizes to test; the best size will depend on the kind of data
# you're working with and the model you're using. Finally, test how
# well each size works by running some queries and comparing the
# results. You might need to try a few different sizes before finding
# the best one. This process can take some time, but getting the best
# results from your project is worth it. A sketch of such a tuning
# loop appears below.
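A minimal sketch of that tuning loop, reusing the `pages` loaded above; the candidate sizes are illustrative, and the query-quality comparison is left out since it depends on your retriever and model:

for size in (500, 1000, 2000):
    # Illustrative sizes only; evaluate retrieval quality separately.
    splitter = CharacterTextSplitter(chunk_size=size, chunk_overlap=20)
    chunks = splitter.split_documents(pages)
    lengths = [len(c.page_content) for c in chunks]
    print(f"chunk_size={size}: {len(chunks)} chunks, "
          f"avg length {sum(lengths) / len(lengths):.0f} chars")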
46 changes: 46 additions & 0 deletions
...nformation-organized-with-indexes/3-text-splitters/1_recursive_character_text_splitter.py
@@ -0,0 +1,46 @@
# The Recursive Character Text Splitter is a text splitter designed
# to split text into chunks based on a provided list of characters.
# It attempts to split the text on the characters in the list, in
# order, until the resulting chunks are small enough. By default, the
# separator list is ["\n\n", "\n", " ", ""].

# To use the RecursiveCharacterTextSplitter, create an instance of it
# and provide the following parameters:
# chunk_size: the maximum size of each chunk, as measured by the
#   length_function.
# chunk_overlap: the maximum overlap between chunks, used to maintain
#   continuity between them.
# length_function: used to calculate the length of the chunks. By
#   default it is set to len, which counts the number of characters in
#   a chunk, but you can also pass a token counter or any other
#   function that measures chunk length according to your specific
#   requirements. Using a token counter instead of the default len can
#   benefit scenarios such as working with language models that have
#   token limits; for example, some OpenAI models accept at most 4,096
#   tokens per request, so you might want to count tokens instead of
#   characters to better manage and optimize your requests.

from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

loader = PyPDFLoader("The One Page Linux Manual.pdf")
pages = loader.load_and_split()

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=50,
    chunk_overlap=10,
    length_function=len,
)

docs = text_splitter.split_documents(pages)
for doc in docs:
    print(doc)

# In this example, the text is loaded from a file, and the
# RecursiveCharacterTextSplitter splits it into chunks with a maximum
# size of 50 characters and an overlap of 10 characters. The output is
# a list of documents containing the split text.

# To use a token counter, create a custom function that calculates the
# number of tokens in a given text and pass it as the length_function
# parameter (see the sketch below). The splitter will then measure
# chunk length by the number of tokens instead of the number of
# characters.
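A minimal sketch of such a token counter, assuming the tiktoken package is installed; the cl100k_base encoding is an illustrative choice, not one mandated by the splitter:

import tiktoken

# Assumed encoding; pick one that matches your target model.
encoding = tiktoken.get_encoding("cl100k_base")

def token_len(text: str) -> int:
    # Measure length in tokens rather than characters.
    return len(encoding.encode(text))

token_splitter = RecursiveCharacterTextSplitter(
    chunk_size=50,        # now interpreted as 50 tokens
    chunk_overlap=10,
    length_function=token_len,
)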
14 changes: 14 additions & 0 deletions
...ion/4-keeping-information-organized-with-indexes/3-text-splitters/2_ntlk_text_splitter.py
@@ -0,0 +1,14 @@
# The NLTKTextSplitter in LangChain is an implementation of a text
# splitter that uses the Natural Language Toolkit (NLTK) library to
# split text based on its tokenizers. The goal is to split long texts
# into smaller chunks without breaking the structure of sentences and
# paragraphs.

from langchain.text_splitter import NLTKTextSplitter

# NLTK's sentence tokenizer needs the "punkt" data; if it is missing,
# download it once with: import nltk; nltk.download("punkt")

# Load a long document
with open('/home/cloudsuperadmin/scrape-chain/langchain/LLM.txt', encoding='unicode_escape') as f:
    sample_text = f.read()

text_splitter = NLTKTextSplitter(chunk_size=500)
texts = text_splitter.split_text(sample_text)
print(texts)
14 changes: 14 additions & 0 deletions
...n/4-keeping-information-organized-with-indexes/3-text-splitters/3_spacey_test_splitter.py
@@ -0,0 +1,14 @@
from langchain.text_splitter import SpacyTextSplitter

# SpacyTextSplitter uses spaCy's sentence segmentation; by default it
# loads the "en_core_web_sm" pipeline, which can be installed with:
# python -m spacy download en_core_web_sm

# Load a long document
with open('/home/cloudsuperadmin/scrape-chain/langchain/LLM.txt', encoding='unicode_escape') as f:
    sample_text = f.read()

# Instantiate the SpacyTextSplitter with the desired chunk size
text_splitter = SpacyTextSplitter(chunk_size=500, chunk_overlap=20)

# Split the text using SpacyTextSplitter
texts = text_splitter.split_text(sample_text)

# Print the first chunk
print(texts[0])
41 changes: 41 additions & 0 deletions
...4-keeping-information-organized-with-indexes/3-text-splitters/4_markdown_text_splitter.py
@@ -0,0 +1,41 @@
from langchain.text_splitter import MarkdownTextSplitter

markdown_text = """
#

# Welcome to My Blog!

## Introduction
Hello everyone! My name is **John Doe** and I am a _software developer_. I specialize in Python, Java, and JavaScript.

Here's a list of my favorite programming languages:

1. Python
2. JavaScript
3. Java

You can check out some of my projects on [GitHub](https://github.com).

## About this Blog
In this blog, I will share my journey as a software developer. I'll post tutorials, my thoughts on the latest technology trends, and occasional book reviews.

Here's a small piece of Python code to say hello:

\``` python
def say_hello(name):
    print(f"Hello, {name}!")

say_hello("John")
\```

Stay tuned for more updates!

## Contact Me
Feel free to reach out to me on [Twitter](https://twitter.com) or send me an email at johndoe@email.com.
"""

markdown_splitter = MarkdownTextSplitter(chunk_size=100, chunk_overlap=0)
docs = markdown_splitter.create_documents([markdown_text])

print(docs)
12 changes: 12 additions & 0 deletions
...on/4-keeping-information-organized-with-indexes/3-text-splitters/5_token_text_splitter.py
@@ -0,0 +1,12 @@
from langchain.text_splitter import TokenTextSplitter

# TokenTextSplitter measures and splits chunks in tokens (via the
# tiktoken library) rather than characters.

# Load a long document
with open('/home/cloudsuperadmin/scrape-chain/langchain/LLM.txt', encoding='unicode_escape') as f:
    sample_text = f.read()

# Initialize the TokenTextSplitter with the desired chunk size and overlap
text_splitter = TokenTextSplitter(chunk_size=100, chunk_overlap=50)

# Split into smaller chunks
texts = text_splitter.split_text(sample_text)
print(texts[0])
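If the chunks should line up with a particular model's tokenizer, TokenTextSplitter also accepts an encoding_name parameter; a small sketch, where cl100k_base is an assumed choice rather than the library default:

# Use a specific tiktoken encoding (assumed choice) instead of the default.
text_splitter = TokenTextSplitter(
    encoding_name="cl100k_base",
    chunk_size=100,
    chunk_overlap=50,
)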
34 changes: 34 additions & 0 deletions
...-keeping-information-organized-with-indexes/4-exploring-embeddings/0_openai_embeddings.py
@@ -0,0 +1,34 @@
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from langchain_openai import OpenAIEmbeddings

# Requires the OPENAI_API_KEY environment variable to be set.

# Define the documents
documents = [
    "The cat is on the mat.",
    "There is a cat on the mat.",
    "The dog is in the yard.",
    "There is a dog in the yard.",
]

# Initialize the OpenAIEmbeddings instance
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

# Generate embeddings for the documents
document_embeddings = embeddings.embed_documents(documents)

# Embed a query for the similarity search
query = "A cat is sitting on a mat."
query_embedding = embeddings.embed_query(query)

# Calculate similarity scores
similarity_scores = cosine_similarity([query_embedding], document_embeddings)[0]

# Find the most similar document
most_similar_index = np.argmax(similarity_scores)
most_similar_document = documents[most_similar_index]

print(f"Most similar document to the query '{query}':")
print(most_similar_document)
19 changes: 19 additions & 0 deletions
...eeping-information-organized-with-indexes/4-exploring-embeddings/1_hf_embedding_models.py
@@ -0,0 +1,19 @@
from sentence_transformers import SentenceTransformer

# Download the model once and cache it under ./custom_cache.
model = SentenceTransformer(
    "sentence-transformers/all-mpnet-base-v2",
    cache_folder="./custom_cache",
)

documents = ["Document 1", "Document 2", "Document 3"]
doc_embeddings = model.encode(documents)
print(doc_embeddings)
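An alternative route is LangChain's embeddings wrapper around the same model; a minimal sketch, assuming the langchain_community package is installed:

from langchain_community.embeddings import HuggingFaceEmbeddings

# Same model as above, exposed through LangChain's embeddings interface.
hf = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
    model_kwargs={"device": "cpu"},
)
doc_embeddings = hf.embed_documents(["Document 1", "Document 2", "Document 3"])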
29 changes: 29 additions & 0 deletions
...-keeping-information-organized-with-indexes/4-exploring-embeddings/2-cohere_embeddings.py
@@ -0,0 +1,29 @@
from langchain_community.embeddings import CohereEmbeddings

# Initialize the CohereEmbeddings object (named so it does not shadow
# the cohere package)
cohere_embeddings = CohereEmbeddings(
    model="embed-multilingual-v2.0",
    cohere_api_key="your_cohere_api_key"
)

# Define a list of texts
texts = [
    "Hello from Cohere!",
    "مرحبًا من كوهير!",
    "Hallo von Cohere!",
    "Bonjour de Cohere!",
    "¡Hola desde Cohere!",
    "Olá do Cohere!",
    "Ciao da Cohere!",
    "您好,来自 Cohere!",
    "कोहेरे से नमस्ते!"
]

# Generate embeddings for the texts
document_embeddings = cohere_embeddings.embed_documents(texts)

# Print the embeddings
for text, embedding in zip(texts, document_embeddings):
    print(f"Text: {text}")
    print(f"Embedding: {embedding[:5]}")  # first 5 dimensions of each embedding
47 changes: 47 additions & 0 deletions
...ing-information-organized-with-indexes/4-exploring-embeddings/3_deepllake_vector_store.py
@@ -0,0 +1,47 @@
import os

from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import DeepLake
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains import RetrievalQA

# create our documents
texts = [
    "Napoleon Bonaparte was born on 15 August 1769",
    "Louis XIV was born on 5 September 1638",
    "Lady Gaga was born on 28 March 1986",
    "Michael Jeffrey Jordan was born on 17 February 1963"
]
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.create_documents(texts)

# initialize the embeddings model
embeddings = OpenAIEmbeddings(model="text-embedding-ada-002")

# create the Deep Lake dataset
# TODO: use your organization id here (by default, the org id is your username)
my_activeloop_org_id = os.environ["ACTIVELOOP_ORG_ID"]
my_activeloop_dataset_name = "langchain_course_embeddings"
dataset_path = f"hub://{my_activeloop_org_id}/{my_activeloop_dataset_name}"
db = DeepLake(dataset_path=dataset_path, embedding=embeddings)

# add documents to our Deep Lake dataset
db.add_documents(docs)

# create a retriever from the db
retriever = db.as_retriever()

# instantiate the LLM wrapper
model = ChatOpenAI(model='gpt-3.5-turbo')

# create the question-answering chain
qa_chain = RetrievalQA.from_llm(model, retriever=retriever)

# ask a question to the chain
response = qa_chain.invoke("When was Michael Jordan born?")
print(response)