# Given a dataset description, return from chroma db all similar datasets 

In [2]:
from dotenv import load_dotenv
load_dotenv()
from langchain.chains import RetrievalQA
from langchain.indexes import VectorstoreIndexCreator
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.document_loaders import DataFrameLoader
from langchain.llms import OpenAI

import pandas as pd


In [3]:
# load data
df = pd.read_json('../huggingface-recommender/hfdatasets.json')
df.dropna(subset=['description'],inplace=True)
df['description'] = df['description'].astype('str')
df['tag_text'] = df['tags'].apply(lambda x: " ".join([tag for tag in x] if x else ''))
df.drop(columns=['tags'],inplace=True)


In [5]:
df_loaded = df[['author','id','description','downloads','tag_text']]
loader = DataFrameLoader(df_loaded, page_content_column="description")

documents = loader.load()

In [6]:

# split the documents into chunks
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(documents)
# select which embeddings we want to use
embeddings = OpenAIEmbeddings()
# create the vectorestore to use as the index


Created a chunk of size 1253, which is longer than the specified 1000
Created a chunk of size 1253, which is longer than the specified 1000
Created a chunk of size 1253, which is longer than the specified 1000
Created a chunk of size 1253, which is longer than the specified 1000
Created a chunk of size 1253, which is longer than the specified 1000
Created a chunk of size 1253, which is longer than the specified 1000
Created a chunk of size 1140, which is longer than the specified 1000
Created a chunk of size 1140, which is longer than the specified 1000
Created a chunk of size 1087, which is longer than the specified 1000
Created a chunk of size 1190, which is longer than the specified 1000
Created a chunk of size 1164, which is longer than the specified 1000
Created a chunk of size 1190, which is longer than the specified 1000


In [6]:
db = Chroma.from_documents(texts, embeddings)
# expose this index in a retriever interface

Using embedded DuckDB without persistence: data will be transient


In [8]:
sample_dataset = df.sample(n=1)

In [9]:
docs = db.similarity_search(sample_dataset['description'].values[0])[1:]

In [14]:
sample_metadata = sample_dataset['id'].values[0]

In [15]:
sample_metadata

'huggingartists/agata-christie'

In [13]:
docs[0].metadata['id']

'RollingMuffin/test_scripts'

In [16]:
def clean_docs(docs,query_data):
    cleaned_docs = []
    input_id = query_data['id'].values[0]
    for doc in docs:
        if doc.metadata['id'] != input_id:
            cleaned_docs.append(doc)

    return cleaned_docs


In [20]:
cleaned_docs = clean_docs(docs, sample_dataset)

In [23]:
cleaned_docs

# return the url which is hf/datasets / id 

[Document(page_content='This dataset is designed to generate lyrics with HuggingArtists.', metadata={'author': 'RollingMuffin', 'id': 'RollingMuffin/test_scripts', 'downloads': 273, 'tag_text': ''}),
 Document(page_content='This dataset is designed to generate lyrics with HuggingArtists.', metadata={'author': 'huggingartists', 'id': 'huggingartists/5nizza', 'downloads': 262, 'tag_text': 'language:en huggingartists lyrics'}),
 Document(page_content='This dataset is designed to generate lyrics with HuggingArtists.', metadata={'author': 'huggingartists', 'id': 'huggingartists/5opka', 'downloads': 262, 'tag_text': 'language:en huggingartists lyrics'})]