In [46]:
import numpy as np
import pandas as pd
import re
import string
from bs4 import BeautifulSoup
import warnings
warnings.filterwarnings('ignore')

# Exploratory Data Analysis

In [47]:
df = pd.read_csv('Q_A_RAG.csv')

In [48]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   query          200 non-null    object
 1   answer         200 non-null    object
 2   context        200 non-null    object
 3   sample_number  200 non-null    int64 
 4   tokens         200 non-null    int64 
 5   category       200 non-null    object
dtypes: int64(2), object(4)
memory usage: 9.5+ KB


In [49]:
df.isna().sum()

Unnamed: 0,0
query,0
answer,0
context,0
sample_number,0
tokens,0
category,0


In [50]:
df.head()

Unnamed: 0,query,answer,context,sample_number,tokens,category
0,What is the total amount of the invoice?,"$22,500.00",Services Vendor Inc. \n100 Elm Street Pleasant...,0,138,core
1,What is the invoice number?,#0001,Services Vendor Inc. \n100 Elm Street Pleasant...,1,138,core
2,What is a list of the items being purchased?,•Front End Engineering Service;\n•Back End Eng...,Services Vendor Inc. \n100 Elm Street Pleasant...,2,138,core
3,What is the name of the contact for question?,Bia Hermes,Services Vendor Inc. \n100 Elm Street Pleasant...,3,138,core
4,What is the PO number?,#1000,Services Vendor Inc. \n100 Elm Street Pleasant...,4,138,core


In [51]:
df['sample_number'].value_counts()

Unnamed: 0_level_0,count
sample_number,Unnamed: 1_level_1
0,1
1,1
2,1
3,1
4,1
...,...
195,1
196,1
197,1
198,1


In [52]:
df = df.drop(['sample_number'], axis = 1)

In [53]:
df = df.drop(['tokens'], axis = 1)

# Text Cleaning

In [54]:
def clean_text(text):
  text = BeautifulSoup(text, 'html.parser').get_text() # Removing HTML Tags if exist
  text = text.lower() # Making text lowercase
  text = re.sub(r'[^a-zA-Z ]', '', text) # Removing all special characters except words and space

  return text.strip()

In [55]:
df['cleaned context'] = df['context'].apply(clean_text)

# Vector Database

In [56]:
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.schema import Document

- Model selection

It is a simple and fast model

In [57]:
model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

- Selecting embedding data and metadata

Here, we select our context as a source of data. And I select category column as metadata.

We benefit with metadata for filtering.

In [58]:
documents = [
    Document
    (
        page_content = row['cleaned context'],
        metadata =
        {
            'category': row['category']
        }

    )
    for _, row in df.iterrows()
]

- Creating Vector Database

In [59]:
vectordb = Chroma.from_documents(documents = documents, embedding = model, persist_directory = "./Chroma_Practice_DB")
vectordb.persist()

Here we query the data. In filter section we choose a filter whatever we want from metada.

In [60]:
query = vectordb.similarity_search(
    'How can I increase the invoice', # query sentence
    k = 1, # number of outputs
    filter = {
        "category": {"$eq": "core"} # selection one or more columns from metadata to filter
    }
)

for doc in query:
  print("- ", doc.page_content) # printing output

-  services vendor inc  elm street pleasantville ny to alpha inc  st street los angeles ca description front end engineering service   back end engineering service   quality assurance manager   total amount  make all checks payable to services vendor inc payment is due within  daysif you have any questions concerning this invoice contact bia hermes thank you for your business  invoice invoice   date  for alpha project po


Because there are just 200 rows data that is not enough, retrieval part is not clear. However the pipeline of Vector DB and RAG is this.