In [19]:
%pip install -r requirements.txt


Collecting psycopg2 (from -r requirements.txt (line 8))
  Downloading psycopg2-2.9.10-cp312-cp312-win_amd64.whl.metadata (5.0 kB)
Collecting pgvector (from -r requirements.txt (line 9))
  Downloading pgvector-0.4.0-py3-none-any.whl.metadata (17 kB)
Downloading psycopg2-2.9.10-cp312-cp312-win_amd64.whl (1.2 MB)
   ---------------------------------------- 0.0/1.2 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.2 MB ? eta -:--:--
   --------------------------- ------------ 0.8/1.2 MB 4.2 MB/s eta 0:00:01
   ---------------------------------------- 1.2/1.2 MB 2.9 MB/s eta 0:00:00
Downloading pgvector-0.4.0-py3-none-any.whl (27 kB)
Installing collected packages: psycopg2, pgvector
Successfully installed pgvector-0.4.0 psycopg2-2.9.10


In [1]:
from datasets import Dataset

qa_dataset = Dataset.from_csv('data/db.csv')
qa_dataset

  from .autonotebook import tqdm as notebook_tqdm


Dataset({
    features: ['section', 'subsection', 'question', 'answer'],
    num_rows: 525
})

In [2]:
def concatenate_text(examples):
    return {
        "text": examples["section"]
        + " \n "
        + examples["subsection"]
        + " \n "
        + examples["question"]
        + " \n "
        + examples["answer"]
    }

In [3]:
qa_dataset = qa_dataset.map(concatenate_text)

In [4]:
from transformers import AutoTokenizer, AutoModel

model_ckpt = "sentence-transformers/multi-qa-mpnet-base-dot-v1"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)

In [5]:
import torch

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [6]:
model.to(device)

MPNetModel(
  (embeddings): MPNetEmbeddings(
    (word_embeddings): Embedding(30527, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): MPNetEncoder(
    (layer): ModuleList(
      (0-11): 12 x MPNetLayer(
        (attention): MPNetAttention(
          (attn): MPNetSelfAttention(
            (q): Linear(in_features=768, out_features=768, bias=True)
            (k): Linear(in_features=768, out_features=768, bias=True)
            (v): Linear(in_features=768, out_features=768, bias=True)
            (o): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (intermediate): MPNetIntermediate(
          (dense): Linear(in_

In [7]:
def cls_pooling(model_output):
    return model_output.last_hidden_state[:, 0]

In [8]:
def get_embeddings(text_list):
    encoded_input = tokenizer(
        text_list, padding=True, truncation=True, return_tensors="pt"
    )
    encoded_input = {k: v.to(device) for k, v in encoded_input.items()}
    model_output = model(**encoded_input)
    return cls_pooling(model_output)

In [9]:
embedding = get_embeddings(qa_dataset["text"][0])
embedding.shape

torch.Size([1, 768])

In [32]:
embeddings_dataset = qa_dataset.map(
    lambda x: {"embedding": get_embeddings(x["text"]).detach().cpu().numpy()[0]}
)

Map: 100%|████████████████████████████████████████████████████████████████████| 525/525 [03:14<00:00,  2.69 examples/s]


In [33]:
embeddings_dataset.add_faiss_index(column="embedding")

100%|███████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00, 180.98it/s]


Dataset({
    features: ['section', 'subsection', 'question', 'answer', 'text', 'embedding'],
    num_rows: 525
})

In [34]:
question = "What is Logistic Regression?"
question_embedding = get_embeddings([question]).cpu().detach().numpy()
question_embedding.shape

(1, 768)

In [35]:
scores, samples = embeddings_dataset.get_nearest_examples(
    "embedding", question_embedding, k=5
)

In [36]:
import pandas as pd

samples_df = pd.DataFrame.from_dict(samples)
samples_df["scores"] = scores
samples_df.sort_values("scores", ascending=False, inplace=True)

In [37]:
samples_df

Unnamed: 0,section,subsection,question,answer,text,embedding,scores
4,Classical models,Logistic Regression,Logistic Regression Parameter Interpretation,"log p / 1−p = α + β1x1 + β2x2, where x1 is b...",Classical models \n Logistic Regression \n Log...,"[-0.0908917486667633, -0.5208433270454407, -0....",30.387844
3,Classical models,Logistic Regression,What is the difference between odds and probab...,The probability that an event will occur is th...,Classical models \n Logistic Regression \n Wha...,"[0.004504382610321045, -0.43677017092704773, -...",29.093803
2,Classical models,Logistic Regression,Assumptions of Logistic Regression,We will explore the assumptions of logistic re...,Classical models \n Logistic Regression \n Ass...,"[0.02633441612124443, -0.32643234729766846, -0...",28.364182
1,Classical models,Logistic Regression,What distinguishes Logistic Regression from Li...,While Linear Regression is used to predict con...,Classical models \n Logistic Regression \n Wha...,"[-0.07696059346199036, -0.3094007074832916, -0...",26.248022
0,Classical models,Logistic Regression,What is Logistic Regression?,Logistic regression is used for binary classif...,Classical models \n Logistic Regression \n Wha...,"[0.09677687287330627, -0.6925526857376099, -0....",23.141731


In [38]:
import pickle

In [39]:
with open('faiss_embeddings.pkl', 'wb') as f:
    pickle.dump(question_embedding, f)

In [99]:
import psycopg2

conn = psycopg2.connect(dbname='ds_db', user='user', password='password', host='localhost')

In [116]:
cursor = conn.cursor()

In [109]:
cursor.execute(
    """CREATE TABLE ds_qa (
    id SERIAL PRIMARY KEY,
    section TEXT,
    subsection TEXT,
    question TEXT,
    answer TEXT,
    text TEXT
);"""
)

In [110]:
import faiss
import numpy as np

dim = 768 
base_index = faiss.IndexFlatL2(dim)
index = faiss.IndexIDMap(base_index)

In [142]:
def add_document(record):
    # saving to DB
    cursor.execute("INSERT INTO ds_qa (section, subsection, question, answer, text) VALUES (%s, %s, %s, %s, %s) RETURNING id", 
                (record['section'], record['subsection'], record['question'], record['answer'], record['text']))
    doc_id = cursor.fetchone()[0]
    conn.commit()
    
    # adding into FAISS index
    index.add_with_ids(np.array([record['embedding']], dtype=np.float32), np.array([doc_id], dtype=np.int64))

In [121]:
for record in embeddings_dataset:
    add_document(record)

In [135]:
def db_search(query_embedding, k=5):
    D, I = index.search(query_embedding, k)
    cursor.execute("SELECT * FROM ds_qa WHERE id = ANY(%s)", (I[0].tolist(),))
    results = cursor.fetchall()
    return results

In [136]:
results = db_search(question_embedding, 5)

In [122]:
cursor.execute("SELECT * from ds_qa")

In [130]:
conn.rollback()

In [140]:
df_result = pd.DataFrame(results, columns=['id', 'Section', 'Subsection', 'Question', 'Answer', 'Full Text'])

In [141]:
df_result

Unnamed: 0,id,Section,Subsection,Question,Answer,Full Text
0,14,Classical models,Logistic Regression,What is Logistic Regression?,Logistic regression is used for binary classif...,Classical models \n Logistic Regression \n Wha...
1,15,Classical models,Logistic Regression,What is the difference between odds and probab...,The probability that an event will occur is th...,Classical models \n Logistic Regression \n Wha...
2,16,Classical models,Logistic Regression,Logistic Regression Parameter Interpretation,"log p / 1−p = α + β1x1 + β2x2, where x1 is b...",Classical models \n Logistic Regression \n Log...
3,17,Classical models,Logistic Regression,Assumptions of Logistic Regression,We will explore the assumptions of logistic re...,Classical models \n Logistic Regression \n Ass...
4,18,Classical models,Logistic Regression,What distinguishes Logistic Regression from Li...,While Linear Regression is used to predict con...,Classical models \n Logistic Regression \n Wha...
