In [196]:
import os
import matplotlib.pyplot as plt
import html2text
import requests
from pathlib import Path
import warnings
from tqdm import tqdm
import pandas as pd
from bs4 import BeautifulSoup, NavigableString
from functools import partial
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
import numpy as np
import json
from pinecone import Pinecone, PodSpec, ServerlessSpec
from concurrent.futures import ThreadPoolExecutor, as_completed
import pathlib
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import pipeline

warnings.filterwarnings("ignore")

In [2]:
DOCS_DIR = Path(os.getcwd(), 'data', 'scikit-learn.org', 'stable')

def extract_text_from_section(section):
    texts = []
    for elem in section.children:
        if isinstance(elem, NavigableString):
            if elem.strip():
                texts.append(elem.strip())
        elif elem.name == "section":
            continue
        else:
            texts.append(elem.get_text().strip())
    return "\n".join(texts)


def path_to_uri(path, scheme="https://", domain="scikit-learn.org/stable/"):
    return scheme + domain + str(path).split(domain)[-1]


def extract_sections(record):
    # print("RECORD: ", record)
    with open(record["path"], "r", encoding="utf-8") as html_file:
        soup = BeautifulSoup(html_file, "html.parser")
    sections = soup.find_all("section")
    section_list = []
    for section in sections:
        section_id = section.get("id")
        section_text = extract_text_from_section(section)
        if section_id:
            uri = path_to_uri(path=record["path"])
            section_list.append({"source": f"{uri}#{section_id}", "text": section_text})
    return section_list

In [3]:
data_chunk = []
for i in DOCS_DIR.rglob("*.html"):
    data_chunk+=extract_sections({"path": i})

In [4]:
# HYPER_PARAMETERS

CHUNK_SIZE = 1000
CHUNK_OVERLAP = 100

In [5]:
def chunk_section(section, chunk_size, chunk_overlap):
    text_splitter = RecursiveCharacterTextSplitter(
        separators=["\n\n", "\n", " ", ""],
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len)
    
    chunks = text_splitter.create_documents(
        texts=[section["text"]], 
        metadatas=[{"source": section["source"]}])
    return [{"text": chunk.page_content, "source": chunk.metadata["source"]} for chunk in chunks]

chunk_function = partial(chunk_section, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)

In [6]:
chunks = list()
for section in tqdm(data_chunk):
    for chunk in chunk_function(section):
        chunks.append(chunk)

100%|████████████████████████████████████| 3737/3737 [00:00<00:00, 20793.69it/s]


In [7]:
print(f"Length of all the data chunks: {len(chunks)}")

Length of all the data chunks: 11030


In [8]:
MODEL_NAME = 'paraphrase-MiniLM-L6-v2'

def get_embedding_model(embedding_model_name, model_kwargs, encode_kwargs):
    if embedding_model_name == "text-embedding-ada-002":
        embedding_model = OpenAIEmbeddings(
            model=embedding_model_name,
            openai_api_key=os.environ["OPENAI_API_KEY"])
    else:
        embedding_model = HuggingFaceEmbeddings(
            model_name=embedding_model_name,
            model_kwargs=model_kwargs,
            encode_kwargs=encode_kwargs)
    return embedding_model

class EmbedChunks:
    def __init__(self, model_name):
        self.embedding_model = get_embedding_model(
            embedding_model_name=model_name,
            model_kwargs={"device": "cpu"},
            encode_kwargs={"device": "cpu", "batch_size": 100})

    def __call__(self, batch):
        texts = [chunk["text"] for chunk in batch]
        embeddings = self.embedding_model.embed_documents(texts)
        for i, chunk in enumerate(batch):
            chunk["embeddings"] = embeddings[i]
        
        return batch

In [27]:
model = get_embedding_model(
            embedding_model_name=MODEL_NAME,
            model_kwargs={"device": "cpu"},
            encode_kwargs={"device": "cpu", "batch_size": 100})

In [9]:
embedder = EmbedChunks(model_name=MODEL_NAME)
batch_size = 200
embedded_chunks = []
for i in tqdm(range(0, len(chunks), batch_size)):
    batch = chunks[i:i+batch_size]
    embedded_batch = embedder(batch)
    embedded_chunks.extend(embedded_batch)

100%|███████████████████████████████████████████| 56/56 [01:12<00:00,  1.30s/it]


In [10]:
embedding_json = json.dumps(embedded_chunks, indent=4)

file_name = f'{MODEL_NAME}_{CHUNK_SIZE}_{CHUNK_OVERLAP}.json'
with open(file_name, 'w') as file:
    file.write(embedding_json)

In [11]:
pc = Pinecone(api_key="c054a566-8249-4d2b-9846-c1464fa2d0a5")
EMBEDDING_SIZE = len(embedded_chunks[0]['embeddings'])
INDEX_NAME = f'{MODEL_NAME}-{CHUNK_SIZE}-{CHUNK_OVERLAP}'.lower()

pc.create_index(
    name=INDEX_NAME,
    dimension=EMBEDDING_SIZE,
    metric="cosine",
    spec=PodSpec(
    environment="gcp-starter"
  )
) 
index = pc.Index(INDEX_NAME)
print(f"Created INDEX: {INDEX_NAME} ")

Created INDEX: paraphrase-minilm-l6-v2-1000-100 


In [12]:
upsert_data = [
    (str(i), chunk["embeddings"], {"text": chunk["text"], "source": chunk["source"]})
    for i, chunk in enumerate(embedded_chunks)
]
batch_size = 100 
for i in tqdm(range(0, len(upsert_data), batch_size)):
    batch = upsert_data[i:i+batch_size]
    index.upsert(vectors=batch)

100%|█████████████████████████████████████████| 111/111 [01:26<00:00,  1.29it/s]


In [14]:
file_path = r'./results.csv'
if not os.path.exists(file_path):
    columns = ['Model_name', 'chunk_size', 'chunk_overlap', 'quality_score', 'retrieval_score', 'top_k']

    # Create an empty DataFrame with these columns
    df = pd.DataFrame(columns=columns)
    
    # Check if the file exists. If not, create it and write the DataFrame to it
    df.to_csv(file_path, index=False)
    message = "File created successfully."
else:
    df = pd.read_csv(file_path)
    message = "File already exists."

message


'File already exists.'

In [19]:
file_path = '/Users/harshan/Desktop/Projects/DocuHelper/File_1.json'

# Open the file and load its content
with open(file_path, 'r') as f:
    entries = json.load(f)

In [20]:
df.head()

Unnamed: 0,Model_name,chunk_size,chunk_overlap,quality_score,retrieval_score,top_k


In [41]:


def calc_retrieval_score(model, top_k):
    retrieval_score = 0
    
    for i in tqdm(range(len(entries['question']))):
        ques_emd = model.embed_query(entries["question"][i])
    
        result = index.query(
                    vector=ques_emd,
                    top_k=top_k,
                    include_values=True,
                    include_metadata=True)
    
        for j,row in enumerate(result['matches']):
                metadata_src = row['metadata']['source']
                actual_src = entries["source"][i]
                
                if(metadata_src == actual_src):
                    retrieval_score += 1
                    break
    return retrieval_score/len(entries['question'])

In [57]:
#Hyper-parameter
TOP_K = 20

ret_score = calc_retrieval_score(model, TOP_K)
data_to_add = {
    'Model_name': MODEL_NAME,
    'chunk_size': CHUNK_SIZE,
    'chunk_overlap': CHUNK_OVERLAP,
    'quality_score': None,
    'retrieval_score': ret_score,
    'top_k': TOP_K
}
data_df = pd.DataFrame([data_to_add])

df = pd.concat([df, data_df], ignore_index=True)

100%|█████████████████████████████████████████| 405/405 [00:44<00:00,  9.19it/s]


In [58]:
df

Unnamed: 0,Model_name,chunk_size,chunk_overlap,quality_score,retrieval_score,top_k
0,paraphrase-MiniLM-L6-v2,1000,100,,0.607407,10
1,paraphrase-MiniLM-L6-v2,1000,100,,0.508642,5
2,paraphrase-MiniLM-L6-v2,1000,100,,0.671605,15
3,paraphrase-MiniLM-L6-v2,1000,100,,0.698765,20


In [59]:
# Chunk Size: 300, 500, 750, 1000
# Embedding Models: , , albert-base-v2, thenlper/gte-large, Salesforce/SFR-Embedding-Mistral
# Done: paraphrase-MiniLM-L6-v2, sentence-transformers/all-MiniLM-L6-v2
# Top K: 5, 10, 15, 20
# Generator Models:

In [130]:
MODEL_NAME= 'thenlper/gte-large'
model = get_embedding_model(
            embedding_model_name=MODEL_NAME,
            model_kwargs={"device": "cpu"},
            encode_kwargs={"device": "cpu", "batch_size": 100})
model.embed_query(entries["question"][0])

[0.0028103808872401714,
 0.023341529071331024,
 -0.02021278813481331,
 -0.020648889243602753,
 -0.01504473015666008,
 0.0024883304722607136,
 0.024049343541264534,
 0.003866199404001236,
 0.0036074293311685324,
 0.01650950312614441,
 0.02157127670943737,
 -0.009122013114392757,
 -0.009633460082113743,
 -0.02640335075557232,
 -0.011911495588719845,
 0.01928025111556053,
 -0.03376557677984238,
 -0.034735169261693954,
 -0.026135584339499474,
 -0.007414830848574638,
 0.039037346839904785,
 0.01487586461007595,
 -0.07369969040155411,
 -0.03379564732313156,
 -0.025941139087080956,
 0.03884253278374672,
 0.027977382764220238,
 0.0008827262790873647,
 0.052573125809431076,
 0.05167347192764282,
 -0.025151550769805908,
 -0.05276501923799515,
 0.015540889464318752,
 -0.049494534730911255,
 -0.01282536517828703,
 -0.022013356909155846,
 0.04358859360218048,
 0.0027594962157309055,
 -0.022103294730186462,
 -0.020906765013933182,
 -0.01684598997235298,
 0.00951335858553648,
 0.043720223009586334,
 

In [127]:
pc = Pinecone(api_key="c054a566-8249-4d2b-9846-c1464fa2d0a5")
upsert_batch_size = 100
batch_size_embedding = 100

In [170]:
CHUNK_SIZE = 300
CHUNK_OVERLAP = 50

MODEL_NAME= 'thenlper/gte-large'

In [171]:
chunk_function = partial(chunk_section, chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP)
chunks = list()
for section in tqdm(data_chunk):
    for chunk in chunk_function(section):
        chunks.append(chunk)

model = get_embedding_model(
            embedding_model_name=MODEL_NAME,
            model_kwargs={"device": "mps"},
            encode_kwargs={"device": "mps", "batch_size": 100})

embedder = EmbedChunks(model_name=MODEL_NAME)

embedded_chunks = []
for i in tqdm(range(0, len(chunks), batch_size_embedding)):
    batch = chunks[i:i+batch_size_embedding]
    embedded_batch = embedder(batch)
    embedded_chunks.extend(embedded_batch)


EMBEDDING_SIZE = len(embedded_chunks[0]['embeddings'])
INDEX_NAME = f'{MODEL_NAME.split("/")[-1]}-{CHUNK_SIZE}-{CHUNK_OVERLAP}'.lower()

INDEX_LIST = pc.list_indexes().names()
if INDEX_NAME not in INDEX_LIST:
    if(len(INDEX_LIST)>0):
        pc.delete_index(INDEX_LIST[0])
        print("DELETED INDEX: ", INDEX_LIST[0])
    pc.create_index(
    name=INDEX_NAME,
    dimension=EMBEDDING_SIZE,
    metric="cosine",
    spec=PodSpec(
    environment="gcp-starter"))

    print("Created Index: ", INDEX_NAME)
    


index = pc.Index(INDEX_NAME)

upsert_data = [
    (str(i), chunk["embeddings"], {"text": chunk["text"], "source": chunk["source"]})
    for i, chunk in enumerate(embedded_chunks)
]
 
for i in tqdm(range(0, len(upsert_data), upsert_batch_size)):
    batch = upsert_data[i:i+upsert_batch_size]
    index.upsert(vectors=batch)

100%|█████████████████████████████████████| 3737/3737 [00:00<00:00, 9365.65it/s]
100%|█████████████████████████████████████████| 353/353 [57:30<00:00,  9.77s/it]


DELETED INDEX:  gte-large-500-100
Created Index:  gte-large-300-50


100%|█████████████████████████████████████████| 353/353 [10:10<00:00,  1.73s/it]


In [172]:
TOP_K = [5, 10, 15, 20]
for k in TOP_K:
    ret_score = calc_retrieval_score(model, k)
    data_to_add = {
        'Model_name': MODEL_NAME,
        'chunk_size': CHUNK_SIZE,
        'chunk_overlap': CHUNK_OVERLAP,
        'quality_score': None,
        'retrieval_score': ret_score,
        'top_k': k
    }
    data_df = pd.DataFrame([data_to_add])
    
    df = pd.concat([df, data_df], ignore_index=True)

100%|█████████████████████████████████████████| 405/405 [01:20<00:00,  5.01it/s]
100%|█████████████████████████████████████████| 405/405 [01:10<00:00,  5.72it/s]
100%|█████████████████████████████████████████| 405/405 [01:15<00:00,  5.36it/s]
100%|█████████████████████████████████████████| 405/405 [01:19<00:00,  5.11it/s]


In [173]:
file_path = r'./results.csv'
df.to_csv(file_path, index=False)

In [174]:
df

Unnamed: 0,Model_name,chunk_size,chunk_overlap,quality_score,retrieval_score,top_k
0,paraphrase-MiniLM-L6-v2,1000,100,,0.607407,10
1,paraphrase-MiniLM-L6-v2,1000,100,,0.508642,5
2,paraphrase-MiniLM-L6-v2,1000,100,,0.671605,15
3,paraphrase-MiniLM-L6-v2,1000,100,,0.698765,20
4,paraphrase-MiniLM-L6-v2,500,100,,0.718519,20
5,paraphrase-MiniLM-L6-v2,500,100,,0.520988,5
6,paraphrase-MiniLM-L6-v2,500,100,,0.639506,10
7,paraphrase-MiniLM-L6-v2,500,100,,0.68642,15
8,paraphrase-MiniLM-L6-v2,300,50,,0.496296,5
9,paraphrase-MiniLM-L6-v2,300,50,,0.614815,10


In [175]:


# Writing the JSON string to a file
file_path = f'{INDEX_NAME}.json'  # Define the path to the file

with open(file_path, 'w') as file:
    json.dump(upsert_data, file)

In [178]:
df.sort_values(by='retrieval_score', ascending = False)

Unnamed: 0,Model_name,chunk_size,chunk_overlap,quality_score,retrieval_score,top_k
43,thenlper/gte-large,500,100,,0.871605,20
39,thenlper/gte-large,1000,100,,0.85679,20
47,thenlper/gte-large,300,50,,0.844444,20
42,thenlper/gte-large,500,100,,0.841975,15
38,thenlper/gte-large,1000,100,,0.834568,15
19,sentence-transformers/all-MiniLM-L6-v2,750,50,,0.822222,20
31,sentence-transformers/all-MiniLM-L6-v2,500,100,,0.819753,20
46,thenlper/gte-large,300,50,,0.819753,15
23,sentence-transformers/all-MiniLM-L6-v2,1000,100,,0.809877,20
41,thenlper/gte-large,500,100,,0.8,10


# Generation

In [211]:
MODEL_NAME= 'thenlper/gte-large'
ret_model = get_embedding_model(
            embedding_model_name=MODEL_NAME,
            model_kwargs={"device": "mps"},
            encode_kwargs={"device": "mps", "batch_size": 100})

In [214]:
question = 'How to do Linear regression?'

In [216]:
ques_emd = ret_model.embed_query(question)
    
result = index.query(
            vector=ques_emd,
            top_k=5,
            include_values=True,
            include_metadata=True)

In [219]:
context_results = result.matches
context = [item.metadata["text"] for item in context_results]

In [256]:
client = OpenAI(api_key = OPENAI_API_KEY)

In [257]:
def get_client(llm):
#     if llm.startswith("gpt"):
#         base_url = os.environ["OPENAI_API_BASE"]
#         api_key = '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'
#     else:
#         base_url = os.environ["ANYSCALE_API_BASE"]
#         api_key = os.environ["ANYSCALE_API_KEY"]
#     client = openai.OpenAI(base_url=base_url, api_key=api_key)
    return client


def response_stream(chat_completion):
    for chunk in chat_completion:
        content = chunk.choices[0].delta.content
        if content is not None:
            yield content


def prepare_response(chat_completion, stream):
    if stream:
        return response_stream(chat_completion)
    else:
        return chat_completion.choices[0].message.content

def generate_response(
    llm, temperature=0.0, stream=True,
    system_content="", assistant_content="", user_content="", 
    max_retries=1, retry_interval=60):
    """Generate response from an LLM."""
    retry_count = 0
    client = get_client(llm=llm)
    messages = [{"role": role, "content": content} for role, content in [
        ("system", system_content), 
        ("assistant", assistant_content), 
        ("user", user_content)] if content]
    while retry_count <= max_retries:
        try:
            chat_completion = client.chat.completions.create(
                model=llm,
                temperature=temperature,
                stream=stream,
                messages=messages,
            )
            return prepare_response(chat_completion, stream=stream)

        except Exception as e:
            print(f"Exception: {e}")
            time.sleep(retry_interval)  # default is per-minute rate limits
            retry_count += 1
    return ""

In [258]:
query = "How to do linear regression?"
response = generate_response(
    llm="gpt-3.5-turbo",
    temperature=0.0,
    stream=True,
    system_content="Answer the query using the context provided. Be succinct.",
    user_content=f"query: {query}, context: {context}")
# Stream response
for content in response:
    print(content, end='', flush=True)

To perform linear regression, you can follow these steps:
1. Create a linear regression object using `linear_model.LinearRegression()`.
2. Train the model using the training sets with `regr.fit(diabetes_X_train, diabetes_y_train)`.
3. Make predictions using the testing set with `diabetes_y_pred = regr.predict(diabetes_X_test)`.

In [287]:

import torch
from langchain import HuggingFacePipeline
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig, pipeline

MODEL_NAME = "TheBloke/Llama-2-13b-Chat-GPTQ"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME, torch_dtype=torch.float16, trust_remote_code=True, device_map="auto"
)

generation_config = GenerationConfig.from_pretrained(MODEL_NAME)
generation_config.max_new_tokens = 1024
generation_config.temperature = 0.0001
generation_config.top_p = 0.95
generation_config.do_sample = True
generation_config.repetition_penalty = 1.15

text_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    return_full_text=True,
    generation_config=generation_config,
)

llm = HuggingFacePipeline(pipeline=text_pipeline, model_kwargs={"temperature": 0})

PackageNotFoundError: auto-gptq