In [1]:
import os, tempfile
from pathlib import Path
from glob import glob

from langchain.chains import RetrievalQA, ConversationalRetrievalChain, LLMChain
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

# from langchain.llms import VertexAI
from langchain.llms import HuggingFaceHub
from langchain.document_loaders import (
    DirectoryLoader,
    PyPDFLoader,
    PyPDFDirectoryLoader,
    DirectoryLoader,
)
from langchain.text_splitter import (
    CharacterTextSplitter,
    TextSplitter,
    RecursiveCharacterTextSplitter,
)
from langchain.memory import ConversationBufferMemory
from langchain.memory.chat_message_histories import ChatMessageHistory
from langchain.prompts import PromptTemplate
from langchain.indexes import VectorstoreIndexCreator
from tqdm import tqdm
import pickle
from pqdm.threads import pqdm

In [4]:
HUGGINGFACEHUB_API_KEY = os.environ.get("HUGGINGFACEHUB_API_TOKEN")
# use export HUGGINGFACEHUB_API_TOKEN=your_token_here to set the token (in the shell)


In [6]:
import openml

In [7]:
# Gather all OpenML datasets for semantic tagging
all_datasets = openml.datasets.list_datasets(output_format="dataframe")

# List dataset 'did' to be used as an identifier 
data_id = []
for i in range(len(all_datasets)):
  data_id.append(all_datasets.iloc[i]['did'])

# dictonary to hold {'did': dataset_decription}
all_data_description = dict.fromkeys(data_id, "") 



In [8]:
len(all_datasets)

5477

In [None]:
# all_datasets = all_datasets[:50]
for i in tqdm(range(len(all_datasets)), total=len(all_datasets)):
  dataset_name = all_datasets.iloc[i]['name']
  try:
    data = openml.datasets.get_dataset(dataset_name, download_data = False, download_qualities = False)
    all_data_description[all_datasets.iloc[i]['did']] = data.description
  except Exception as e:
    print(e)

# Save the data to a file
with open('all_data_description.pkl', 'wb') as f:
  pickle.dump(all_data_description, f)

In [7]:
def get_dataset_description(dataset_name) -> openml.datasets.dataset.OpenMLDataset:
    try:
        data = openml.datasets.get_dataset(dataset_name, download_data = False, download_qualities = False, download_features_meta_data = False)
    except Exception as e:
        print(e)
    return data

In [8]:
dataset_names = all_datasets['name'].tolist() # get a list of all dataset names
# Run the function to get the dataset descriptions parallelly using pqdm
all_data_descriptions = pqdm(dataset_names, get_dataset_description, n_jobs=10)

QUEUEING TASKS | :   0%|          | 0/5477 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/5477 [00:00<?, ?it/s]

COLLECTING RESULTS | :   0%|          | 0/5477 [00:00<?, ?it/s]

In [20]:
with open('all_data_descriptions.pkl', 'wb') as f:
  pickle.dump(all_data_descriptions, f)

In [None]:
# all_data_description[all_datasets.iloc[i]['did']] 
descriptions = [all_data_descriptions[i].description for i in range(len(all_data_descriptions))]

all_data_description = dict(zip(data_id, descriptions))

In [7]:
# convert the dictionary to a dataframe
import pandas as pd
df = pd.DataFrame(list(all_data_description.items()),columns = ['did','description'])

In [8]:
print(df.loc[29]['description'])

1. Title: Dermatology Database

2. Source Information:
   (a) Original owners:
       -- 1. Nilsel Ilter, M.D., Ph.D., 
             Gazi University, 
             School of Medicine
             06510 Ankara, Turkey
             Phone: +90 (312) 214 1080

       -- 2. H. Altay Guvenir, PhD., 
             Bilkent University,
             Department of Computer Engineering and Information Science,
             06533 Ankara, Turkey
             Phone: +90 (312) 266 4133
             Email: guvenir@cs.bilkent.edu.tr

   (b) Donor: H. Altay Guvenir,
              Bilkent University,
              Department of Computer Engineering and Information Science,
              06533 Ankara, Turkey
              Phone: +90 (312) 266 4133
              Email: guvenir@cs.bilkent.edu.tr

   (c) Date:  January, 1998

3. Past Usage:
   1. G. Demiroz, H. A. Govenir, and N. Ilter, 
      "Learning Differential Diagnosis of Eryhemato-Squamous Diseases using
       Voting Feature Intervals", Aritificial In

In [9]:
# remove the rows with empty descriptions
df = df[df['description'] != '']

In [10]:
df.shape

(50, 2)

In [11]:
df.head()

Unnamed: 0,did,description
0,2,**Author**: Unknown. Donated by David Sterling...
1,3,Author: Alen Shapiro\nSource: [UCI](https://ar...
2,4,**Author**: Unknown\n**Source**: Collective Ba...
3,5,"**Author**: H. Altay Guvenir, Burak Acar, Hald..."
4,6,**Author**: David J. Slate \n**Source**: [UCI...


In [12]:
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": True}
embeddings = HuggingFaceEmbeddings(
    model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
)



In [13]:
from langchain.document_loaders import DataFrameLoader

In [14]:
loader = DataFrameLoader(df, page_content_column="description")
documents = loader.load()

In [15]:
vectordb = Chroma.from_documents(
        documents, embedding=embeddings, persist_directory="./testdir"
    )


In [16]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI


In [22]:
retriever = vectordb.as_retriever(search_type="similarity", search_kwargs={"k": 5})

# Initiate llm generator. Temperature = 0 (Not creative). Temperature = 1 (Creative). Using
llm = HuggingFaceHub(
    # repo_id="declare-lab/flan-alpaca-large",
    # repo_id = "google/flan-t5-large",
    repo_id = "HuggingFaceH4/zephyr-7b-beta",
    # repo_id="google/flan-t5-large",
    model_kwargs={"temperature": 0.1, "max_length": 512},
    huggingfacehub_api_token=HUGGINGFACEHUB_API_KEY,
)


In [23]:
rqa_prompt_template = "This database is a list of dataset metadata. Use the following pieces of context to find the relevant document. Answer only from the context given using the {question} given. If you do not know the answer, say you do not know. {context}"

In [24]:
RQA_PROMPT = PromptTemplate(
    template=rqa_prompt_template, input_variables=["context", "question"]
)
rqa_chain_type_kwargs = {"prompt": RQA_PROMPT}

In [25]:
qa = RetrievalQA.from_chain_type(
    llm,
    # chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs=rqa_chain_type_kwargs,
    return_source_documents=True,
    verbose=False,
)

In [42]:
# query = "Find me documents that talk about diseases"
query = "Which datasets are good for finance problems"
result = qa({"query": query})

In [43]:
result

{'query': 'Which datasets are good for finance problems',
 'result': 'This database is a list of dataset metadata. Use the following pieces of context to find the relevant document. Answer only from the context given using the Which datasets are good for finance problems given. If you do not know the answer, say you do not know. **Author**: Dr. Hans Hofmann  \n**Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/statlog+(german+credit+data)) - 1994    \n**Please cite**: [UCI](https://archive.ics.uci.edu/ml/citation_policy.html)\n\n**German Credit dataset**  \nThis dataset classifies people described by a set of attributes as good or bad credit risks.\n\nThis dataset comes with a cost matrix: \n``` \nGood  Bad (predicted)  \nGood   0    1   (actual)  \nBad    5    0  \n```\n\nIt is worse to class a customer as good when they are bad (5), than it is to class a customer as bad when they are good (1).  \n\n### Attribute description  \n\n1. Status of existing checking account, in Deuts

In [44]:
relevant_docs = [doc.metadata['did'] for doc in result["source_documents"]]
relevant_docs

[31, 31, 31, 31, 29]

In [45]:
# get description of the relevant documents using df
test_descrptions = df[df['did'].isin(relevant_docs)]["description"]
test_descrptions.head()

24    **Author**: Confidential - Donated by Ross Qui...
26    **Author**: Dr. Hans Hofmann  \n**Source**: [U...
Name: description, dtype: object

In [47]:
print(test_descrptions.to_list()[0])

**Author**: Confidential - Donated by Ross Quinlan   
**Source**: [UCI](http://archive.ics.uci.edu/ml/datasets/credit+approval) - 1987  
**Please cite**: [UCI](http://archive.ics.uci.edu/ml/citation_policy.html)  

**Credit Approval**
This file concerns credit card applications. All attribute names and values have been changed to meaningless symbols to protect the confidentiality of the data.  
   
This dataset is interesting because there is a good mix of attributes -- continuous, nominal with small numbers of values, and nominal with larger numbers of values.  There are also a few missing values.
