In [1]:
from langchain_openai import OpenAIEmbeddings
import chromadb

def load_document(file): 
	import os 
	name, extension = os.path.splitext(file) 
	 
	if extension == '.pdf':
		from langchain.document_loaders import PyPDFLoader
		print(f'Loading {file}')
		loader = PyPDFLoader(file) 	
	elif extension == '.txt':
		from langchain.document_loaders import TextLoader
		loader = TextLoader(file) 
	else:
		print('Document format is not supported!')
		return None 
 
	data = loader.load() 
	return data

def chunk_data(data, chunk_size=512, chunk_overlap=0):
    from langchain.text_splitter import CharacterTextSplitter
    text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap,separator='**')
    chunks = text_splitter.split_documents(data)
    return chunks

# create embeddings using OpenAIEmbeddings() and save them in a Chroma vector store
def create_embeddings(chunks):
	embeddings = OpenAIEmbeddings()
	#vector_store = Chroma.from_documents(chunks, embeddings)

	# if you want to use a specific directory for chromadb
	vector_store = chromadb.from_documents(chunks, embeddings, persist_directory='./mychroma_db')
	vector_store.persist()
	return vector_store


In [2]:
from dotenv import load_dotenv, find_dotenv

load_dotenv ()

file = "Python_code_for_RAG.txt"

data = load_document(file)
print(data)


[Document(page_content="Question:Here are the columns of the dataframe Genre,TotalSales. Create a bar chart to show the total sales for each music genre\nResponse:\nimport matplotlib.pyplot as plt\nfrom matplotlib import rcParams\n\n# Set figure size and DPI\nplt.rcParams['figure.figsize'] = 10,6\nplt.rcParams['figure.dpi'] = 300\n\n# Group by Genre and sum TotalSales\ngenre_grouped = df.groupby('Genre')['TotalSales'].sum()\n\n# Create bar chart\nplt.bar(genre_grouped.index, genre_grouped.values)\n\nplt.xlabel('Genre')\nplt.ylabel('Total Sales')\nplt.title('Total Sales per Music Genre')\n\n# Save plot as png\nplt.savefig('plot_image.png')\n**\nQuestion:Here are the columns of the dataframe Artist_Name,Total_Sales. Create a pie chart\nResponse:\nimport matplotlib.pyplot as plt \nimport pandas as pd\n\nplt.figure(figsize=(10, 6), dpi=300) \nplt.pie(df['Total_Sales'], labels=df['Artist_Name'],  autopct='%1.1f%%') \nplt.axis('equal') \nplt.savefig('plot_image.png')\n**\nQuestion:Here are t

In [3]:
chunks = chunk_data(data)
chunks


Created a chunk of size 603, which is longer than the specified 512


[Document(page_content="Question:Here are the columns of the dataframe Genre,TotalSales. Create a bar chart to show the total sales for each music genre\nResponse:\nimport matplotlib.pyplot as plt\nfrom matplotlib import rcParams\n\n# Set figure size and DPI\nplt.rcParams['figure.figsize'] = 10,6\nplt.rcParams['figure.dpi'] = 300\n\n# Group by Genre and sum TotalSales\ngenre_grouped = df.groupby('Genre')['TotalSales'].sum()\n\n# Create bar chart\nplt.bar(genre_grouped.index, genre_grouped.values)\n\nplt.xlabel('Genre')\nplt.ylabel('Total Sales')\nplt.title('Total Sales per Music Genre')\n\n# Save plot as png\nplt.savefig('plot_image.png')", metadata={'source': 'Python_code_for_RAG.txt'}),
 Document(page_content="Question:Here are the columns of the dataframe Artist_Name,Total_Sales. Create a pie chart\nResponse:\nimport matplotlib.pyplot as plt \nimport pandas as pd\n\nplt.figure(figsize=(10, 6), dpi=300) \nplt.pie(df['Total_Sales'], labels=df['Artist_Name'],  autopct='%1.1f%%') \nplt.

In [4]:
vector_store = create_embeddings(chunks)
	
retriever = vector_store.as_retriever(search_type='similarity',search_kwargs={'k': 2})

AttributeError: module 'chromadb' has no attribute 'from_documents'

In [None]:
result = retriever.get_relevant_documents("Question:Here are the columns of the dataframe Genre,TotalSales.Create a bar chart")
print(result[0].page_content)


In [None]:
my_db  = Chroma(persist_directory='./mychroma_db', embedding_function=OpenAIEmbeddings())
my_retriever = my_db.as_retriever(search_type='similarity',search_kwargs={'k': 2})
result = my_retriever.get_relevant_documents("Question:Here are the columns of the dataframe Genre,TotalSales.Create a bar chart")
print(result[0].page_content)


### Chroma DB

In [2]:
import chromadb

In [3]:
client = chromadb.PersistentClient(path="./chromadb")

In [4]:

default_collection = client.get_or_create_collection(name="default")

In [5]:
with open("./Python_code_for_RAG.txt") as file:
    data = file.read()

docs = data.split('**')

In [16]:
default_collection.upsert(documents = docs,ids=['1','2','3'])

In [33]:
results = default_collection.query(query_texts=["Create"],n_results=2)
# print(results['documents'])
# print((results['documents'][0]))
# for item in results['documents'][0]:
#     print(item)

print('\n'.join(results['documents'][0]))

Question:Here are the columns of the dataframe Genre,TotalSales. Create a bar chart to show the total sales for each music genre
Response:
import matplotlib.pyplot as plt
from matplotlib import rcParams

# Set figure size and DPI
plt.rcParams['figure.figsize'] = 10,6
plt.rcParams['figure.dpi'] = 300

# Group by Genre and sum TotalSales
genre_grouped = df.groupby('Genre')['TotalSales'].sum()

# Create bar chart
plt.bar(genre_grouped.index, genre_grouped.values)

plt.xlabel('Genre')
plt.ylabel('Total Sales')
plt.title('Total Sales per Music Genre')

# Save plot as png
plt.savefig('plot_image.png')


Question:Here are the columns of the dataframe Artist_Name,Total_Sales. Create a pie chart
Response:
import matplotlib.pyplot as plt 
import pandas as pd

plt.figure(figsize=(10, 6), dpi=300) 
plt.pie(df['Total_Sales'], labels=df['Artist_Name'],  autopct='%1.1f%%') 
plt.axis('equal') 
plt.savefig('plot_image.png')



In [18]:
default_collection.count()

3

In [15]:

import chromadb

def load_document(file): 
	import os 
	name, extension = os.path.splitext(file) 
	 
	if extension == '.txt':
            
	
	else:
		print('Document format is not supported!')
		return None 
 
	data = loader.load() 
	return data

def chunk_data(data, chunk_size=256, chunk_overlap=20):
    from langchain.text_splitter import CharacterTextSplitter
    text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap,separator='**')
    chunks = text_splitter.split_documents(data)
    return chunks

# create embeddings using OpenAIEmbeddings() and save them in a Chroma vector store
def create_embeddings(chunks):
    embeddings = OpenAIEmbeddings()
    #vector_store = Chroma.from_documents(chunks, embeddings)

    # if you want to use a specific directory for chromadb
    vector_store = chromadb.from_documents(chunks, embeddings, persist_directory='./mychroma_db')
    return vector_store

def retrieve_from_chroma():
	client  = chromadb(persist_directory='./mychroma_db', embedding_function=OpenAIEmbeddings())
	my_retriever = client.as_retriever(search_type='similarity',search_kwargs={'k': 2})
	result = my_retriever.get_relevant_documents("Question:Here are the columns of the dataframe Genre,TotalSales.Create a bar chart")
	print(result[0].page_content)
	
def load_file():
	
	
if __name__ == "__main__":
    from dotenv import load_dotenv, find_dotenv

    load_dotenv ()
	
    file = "Python_code_for_RAG.txt"
	
    data = load_document(file)
    chunks = chunk_data(data)
    vector_store = create_embeddings(chunks)
	
    retriever = vector_store.as_retriever(search_type='similarity')
	

array([[0.00818355, 0.37249333, 0.5773323 , 0.70842125, 0.10713834]])