In [1]:
import os, tempfile
from pathlib import Path
from glob import glob

from langchain.chains import RetrievalQA, ConversationalRetrievalChain, LLMChain
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

# from langchain.llms import VertexAI
from langchain.llms import HuggingFaceHub
from langchain.document_loaders import (
    DirectoryLoader,
    PyPDFLoader,
    PyPDFDirectoryLoader,
    DirectoryLoader,
)
from langchain.text_splitter import (
    CharacterTextSplitter,
    TextSplitter,
    RecursiveCharacterTextSplitter,
)
from langchain.memory import ConversationBufferMemory
from langchain.memory.chat_message_histories import ChatMessageHistory
from langchain.prompts import PromptTemplate
from langchain.indexes import VectorstoreIndexCreator
from tqdm import tqdm
import pickle
from pqdm.threads import pqdm

import pandas as pd

from typing import Union, List

In [2]:
HUGGINGFACEHUB_API_KEY = os.environ.get("HUGGINGFACEHUB_API_TOKEN")
# use export HUGGINGFACEHUB_API_TOKEN=your_token_here to set the token (in the shell)


In [3]:
import openml

In [4]:
def get_dataset_description(dataset_name) -> openml.datasets.dataset.OpenMLDataset:
    try:
        data = openml.datasets.get_dataset(dataset_name, download_data = False, download_qualities = False, download_features_meta_data = False)
    except Exception as e:
        print(e)
    return data

In [5]:
def get_all_dataset_metadata_from_openml(save_filename = "all_dataset_metadata.pkl") -> Union[List, List]:
    # Gather all OpenML datasets
    all_datasets = openml.datasets.list_datasets(output_format="dataframe")

    # List dataset 'did' to be used as an identifier 
    data_id = [all_datasets.iloc[i]['did'] for i in range(len(all_datasets))]

    dataset_names = all_datasets['name'].tolist() # get a list of all dataset names

    # if the file already exists, load it else get the metadata from openml
    if os.path.exists(save_filename):
        with open(save_filename, 'rb') as f:
            all_data_descriptions = pickle.load(f)
        return all_data_descriptions, data_id
    else:
        # Get all dataset metadata using n_jobs parallel threads from openml
        all_data_descriptions = pqdm(dataset_names, get_dataset_description, n_jobs=10)

        # Save the metadata to a file
        with open(save_filename, 'wb') as f:
            pickle.dump(all_data_descriptions, f)
        
        return all_data_descriptions, data_id

In [6]:
def create_metadata_dataframe(all_data_descriptions, data_id) -> pd.DataFrame:
    descriptions = [all_data_descriptions[i].description for i in range(len(all_data_descriptions))]

    all_data_description = dict(zip(data_id, descriptions))

    return pd.DataFrame(list(all_data_description.items()),columns = ['did','description'])

In [7]:
def clean_metadata_dataframe(metadata_df) -> pd.DataFrame:
    # remove rows with empty descriptions
    metadata_df = metadata_df[metadata_df['description'].notna()]
    return metadata_df

In [8]:
metadata_df = create_metadata_dataframe(*get_all_dataset_metadata_from_openml())
metadata_df = clean_metadata_dataframe(metadata_df)

In [9]:
print(metadata_df.loc[20]['description'])

**Author**: Mary McLeish & Matt Cecile, University of Guelph  
Donor: Will Taylor (taylor@pluto.arc.nasa.gov)   
**Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/Horse+Colic) - 8/6/89   

**Horse Colic database**  
Database of surgeries on horses. Possible class attributes: 24 (whether lesion is surgical), others include: 23, 25, 26, and 27

Notes:
* Hospital_Number is an identifier and should be ignored when modelling

Attribute Information:
> 
   1:  surgery?
           1 = Yes, it had surgery
           2 = It was treated without surgery  
   2:  Age 
           1 = Adult horse
           2 = Young (< 6 months)  
   3:  Hospital Number 
           - numeric id
           - the case number assigned to the horse
             (may not be unique if the horse is treated > 1 time)  
   4:  rectal temperature
           - linear
           - in degrees celsius.
           - An elevated temp may occur due to infection.
           - temperature may be reduced when the animal is in l

## Rag part

In [10]:
from langchain.document_loaders import DataFrameLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import langchain_core

from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceEmbeddings

In [11]:
def create_embeddings_and_retriever(chunked_docs, model_name = "BAAI/bge-base-en-v1.5"):
    db = FAISS.from_documents(chunked_docs, HuggingFaceEmbeddings(model_name=model_name))
    retriever = db.as_retriever(search_type="similarity", search_kwargs={"k": 4})
    return db, retriever

In [20]:
def parallel_add_to_chroma_db(documents, embeddings, persist_directory = "./chroma_db/") -> Chroma:
    # https://github.com/zylon-ai/private-gpt/issues/257#issuecomment-1666622098
    if len(documents) > 100:
        batch_size = int(len(documents) / 100)
        batches = [documents[i:i + batch_size] for i in range(0, len(documents), batch_size)]
        for batch in tqdm(batches, desc="Processing batches"):
            db = Chroma.from_documents(
                batch, embedding=embeddings, persist_directory=persist_directory)
    else:
        db = Chroma.from_documents(
        documents, embedding=embeddings, persist_directory=persist_directory)
    return db

In [21]:
def load_document_and_create_vector_store(metadata_df, persist_directory = "./chroma_db/", model_name = "BAAI/bge-base-en-v1.5", device = "cpu", normalize_embeddings = True, recreate_chroma = False) -> Chroma:
    # if the directory already exists, load the vector store else create a new one
    if os.path.exists(persist_directory) and not recreate_chroma:
        db = Chroma.load(persist_directory)
        return db
    else:
        # load data
        # might need to chunk if the descriptions are too large, fine for now
        loader = DataFrameLoader(metadata_df, page_content_column="description")
        documents = loader.load() 

        # load model
        model_kwargs = {"device": device}
        encode_kwargs = {"normalize_embeddings": normalize_embeddings}
        embeddings = HuggingFaceEmbeddings(
        model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
        )

        # create vector store in batches
        return parallel_add_to_chroma_db(documents, embeddings, persist_directory=persist_directory)


In [22]:
vectordb = load_document_and_create_vector_store(metadata_df)

Processing batches:  48%|████▊     | 48/101 [06:20<07:40,  8.69s/it]

In [None]:
def create_retriever_and_llm(model_name = "HuggingFaceH4/zephyr-7b-beta", num_return_documents = 10,search_type = "similarity"):
    retriever = vectordb.as_retriever(search_type=search_type, search_kwargs={"k": num_return_documents})
    llm = HuggingFaceHub(model_name=model_name,model_kwargs={"temperature": 0.1, "max_length": 512}, api_key=HUGGINGFACEHUB_API_KEY)
    return retriever, llm

In [12]:
model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": True}
embeddings = HuggingFaceEmbeddings(
    model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
)



In [13]:
from langchain.document_loaders import DataFrameLoader

In [14]:
loader = DataFrameLoader(df, page_content_column="description")
documents = loader.load()

In [15]:
vectordb = Chroma.from_documents(
        documents, embedding=embeddings, persist_directory="./testdir"
    )


In [16]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.chat_models import ChatOpenAI


In [22]:
retriever = vectordb.as_retriever(search_type="similarity", search_kwargs={"k": 5})

# Initiate llm generator. Temperature = 0 (Not creative). Temperature = 1 (Creative). Using
llm = HuggingFaceHub(
    # repo_id="declare-lab/flan-alpaca-large",
    # repo_id = "google/flan-t5-large",
    repo_id = "HuggingFaceH4/zephyr-7b-beta",
    # repo_id="google/flan-t5-large",
    model_kwargs={"temperature": 0.1, "max_length": 512},
    huggingfacehub_api_token=HUGGINGFACEHUB_API_KEY,
)


In [23]:
rqa_prompt_template = "This database is a list of dataset metadata. Use the following pieces of context to find the relevant document. Answer only from the context given using the {question} given. If you do not know the answer, say you do not know. {context}"

In [24]:
RQA_PROMPT = PromptTemplate(
    template=rqa_prompt_template, input_variables=["context", "question"]
)
rqa_chain_type_kwargs = {"prompt": RQA_PROMPT}

In [25]:
qa = RetrievalQA.from_chain_type(
    llm,
    # chain_type="stuff",
    retriever=retriever,
    chain_type_kwargs=rqa_chain_type_kwargs,
    return_source_documents=True,
    verbose=False,
)

In [42]:
# query = "Find me documents that talk about diseases"
query = "Which datasets are good for finance problems"
result = qa({"query": query})

In [43]:
result

{'query': 'Which datasets are good for finance problems',
 'result': 'This database is a list of dataset metadata. Use the following pieces of context to find the relevant document. Answer only from the context given using the Which datasets are good for finance problems given. If you do not know the answer, say you do not know. **Author**: Dr. Hans Hofmann  \n**Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/statlog+(german+credit+data)) - 1994    \n**Please cite**: [UCI](https://archive.ics.uci.edu/ml/citation_policy.html)\n\n**German Credit dataset**  \nThis dataset classifies people described by a set of attributes as good or bad credit risks.\n\nThis dataset comes with a cost matrix: \n``` \nGood  Bad (predicted)  \nGood   0    1   (actual)  \nBad    5    0  \n```\n\nIt is worse to class a customer as good when they are bad (5), than it is to class a customer as bad when they are good (1).  \n\n### Attribute description  \n\n1. Status of existing checking account, in Deuts

In [44]:
relevant_docs = [doc.metadata['did'] for doc in result["source_documents"]]
relevant_docs

[31, 31, 31, 31, 29]

In [45]:
# get description of the relevant documents using df
test_descrptions = df[df['did'].isin(relevant_docs)]["description"]
test_descrptions.head()

24    **Author**: Confidential - Donated by Ross Qui...
26    **Author**: Dr. Hans Hofmann  \n**Source**: [U...
Name: description, dtype: object

In [47]:
print(test_descrptions.to_list()[0])

**Author**: Confidential - Donated by Ross Quinlan   
**Source**: [UCI](http://archive.ics.uci.edu/ml/datasets/credit+approval) - 1987  
**Please cite**: [UCI](http://archive.ics.uci.edu/ml/citation_policy.html)  

**Credit Approval**
This file concerns credit card applications. All attribute names and values have been changed to meaningless symbols to protect the confidentiality of the data.  
   
This dataset is interesting because there is a good mix of attributes -- continuous, nominal with small numbers of values, and nominal with larger numbers of values.  There are also a few missing values.
