<a href="https://colab.research.google.com/github/nithin-k-mundrathi/LLM-practice/blob/main/RAG/RAG_C_6/2_Scaling_Pipeline_VectorStore.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
%%capture
!pip install pinecone-client==5.0.1

In [3]:
f = open("pinecone.txt", "r")
PINECONE_API_KEY=f.readline()
f.close()

## Processing bank churn dataset

In [35]:
import pandas as pd
# Load the CSV file
file_path = '/content/Customer-Churn-Records.csv'
data1 = pd.read_csv(file_path).loc[:300,:]

In [36]:
# Count the chunks
number_of_lines = len(data1)
print("Number of lines: ",number_of_lines)

Number of lines:  301


In [37]:
import pandas as pd
# Initialize an empty list to store the lines
output_lines = []

# Iterate over each row in the DataFrame
for index, row in data1.iterrows():
    # Create a list of "column_name: value" for each column in the row
    row_data = [f"{col}: {row[col]}" for col in data1.columns]
    # Join the list into a single string separated by spaces
    line = ' '.join(row_data)
    # Append the line to the output list
    output_lines.append(line)

# Display or further process `output_lines` as needed
for line in output_lines[:3]:  # Displaying first 5 lines for preview
    print(line)

RowNumber: 1 CustomerId: 15634602 Surname: Hargrave CreditScore: 619 Geography: France Gender: Female Age: 42 Tenure: 2 Balance: 0.0 NumOfProducts: 1 HasCrCard: 1 IsActiveMember: 1 EstimatedSalary: 101348.88 Exited: 1 Complain: 1 Satisfaction Score: 2 Card Type: DIAMOND Point Earned: 464
RowNumber: 2 CustomerId: 15647311 Surname: Hill CreditScore: 608 Geography: Spain Gender: Female Age: 41 Tenure: 1 Balance: 83807.86 NumOfProducts: 1 HasCrCard: 0 IsActiveMember: 1 EstimatedSalary: 112542.58 Exited: 0 Complain: 1 Satisfaction Score: 3 Card Type: DIAMOND Point Earned: 456
RowNumber: 3 CustomerId: 15619304 Surname: Onio CreditScore: 502 Geography: France Gender: Female Age: 42 Tenure: 8 Balance: 159660.8 NumOfProducts: 3 HasCrCard: 1 IsActiveMember: 0 EstimatedSalary: 113931.57 Exited: 1 Complain: 1 Satisfaction Score: 3 Card Type: DIAMOND Point Earned: 377


In [38]:
lines = output_lines.copy()
# Count the lines
number_of_lines = len(lines)
print("Number of lines: ",number_of_lines)

Number of lines:  301


## Chunking and embedding the dataset

In [39]:
import time
start_time = time.time()  # Start timing before the request
# Initialize an empty list for the chunks
chunks = []
# Add each line as a separate chunk to the chunks list
for line in lines:
    chunks.append(line)  # Each line becomes its own chunk

# Now, each line is treated as a separate chunk
print(f"Total number of chunks: {len(chunks)}")
response_time = time.time() - start_time  # Measure response time
print(f"Response Time: {response_time:.2f} seconds")  # Print response time

Total number of chunks: 301
Response Time: 0.00 seconds


In [40]:
for i in range(3):
    print(len(chunks[i]))
    print(chunks[i])

288
RowNumber: 1 CustomerId: 15634602 Surname: Hargrave CreditScore: 619 Geography: France Gender: Female Age: 42 Tenure: 2 Balance: 0.0 NumOfProducts: 1 HasCrCard: 1 IsActiveMember: 1 EstimatedSalary: 101348.88 Exited: 1 Complain: 1 Satisfaction Score: 2 Card Type: DIAMOND Point Earned: 464
288
RowNumber: 2 CustomerId: 15647311 Surname: Hill CreditScore: 608 Geography: Spain Gender: Female Age: 41 Tenure: 1 Balance: 83807.86 NumOfProducts: 1 HasCrCard: 0 IsActiveMember: 1 EstimatedSalary: 112542.58 Exited: 0 Complain: 1 Satisfaction Score: 3 Card Type: DIAMOND Point Earned: 456
289
RowNumber: 3 CustomerId: 15619304 Surname: Onio CreditScore: 502 Geography: France Gender: Female Age: 42 Tenure: 8 Balance: 159660.8 NumOfProducts: 3 HasCrCard: 1 IsActiveMember: 0 EstimatedSalary: 113931.57 Exited: 1 Complain: 1 Satisfaction Score: 3 Card Type: DIAMOND Point Earned: 377


## Embedding

In [41]:
from sentence_transformers import SentenceTransformer
model_sentence = SentenceTransformer('all-MiniLm-L6-v2')

In [11]:
def get_embedding(texts, model= model_sentence):
    texts = [text.replace("\n", " ") for text in texts]  # Clean input texts
    embeddings = [model.encode(data,ignore_errors=True) for data in texts]  # Extract embeddings
    return embeddings

## Embedding the Chunks

In [42]:
def embed_chunks(chunks, embedding_model=model_sentence, batch_size=1000, pause_time=3):
    start_time = time.time()  # Start timing the operation
    embeddings = []  # Initialize an empty list to store the embeddings
    counter = 1  # Batch counter

    # Process chunks in batches
    for i in range(0, len(chunks), batch_size):
        chunk_batch = chunks[i:i + batch_size]  # Select a batch of chunks

        # Get the embeddings for the current batch
        current_embeddings = get_embedding(chunk_batch, model=embedding_model)

        # Append the embeddings to the final list
        embeddings.extend(current_embeddings)

        # Print batch progress and pause
        print(f"Batch {counter} embedded.")
        counter += 1
        time.sleep(pause_time)  # Optional: adjust or remove this depending on rate limits

    # Print total response time
    response_time = time.time() - start_time
    print(f"Total Response Time: {response_time:.2f} seconds")

    return embeddings

embeddings = embed_chunks(chunks)

Batch 1 embedded.
Total Response Time: 5.03 seconds


In [43]:
len(embeddings[0])

384

In [13]:
print("First embedding:", embeddings[0])

First embedding: [-6.38262108e-02  1.68146752e-02 -8.85062441e-02 -1.22976536e-03
 -4.86669950e-02  9.61372033e-02 -7.88151380e-03  2.11041179e-02
 -3.00560556e-02 -8.31193030e-02 -1.93316564e-02 -1.00771211e-01
 -2.05566362e-02 -8.04295912e-02 -1.02015629e-01 -2.12674364e-02
 -8.09553452e-03 -1.72354765e-02  4.28637192e-02 -9.35124978e-03
 -1.28633305e-02  6.10680021e-02 -2.29817666e-02 -2.99872644e-02
  4.67298349e-04 -6.50635436e-02 -5.46946190e-02  3.07161584e-02
 -3.86336036e-02 -5.73025569e-02  4.89004552e-02  7.33829066e-02
  1.11306369e-01  8.14606845e-02  3.69490832e-02 -3.02295368e-02
 -5.19268103e-02 -1.96103621e-02 -3.66857499e-02  4.58646864e-02
  2.43729949e-02 -8.82684365e-02 -4.21835063e-03  1.46632707e-02
 -5.63134206e-03 -2.62033455e-02 -3.98395807e-02  9.34956409e-03
 -2.98012309e-02  8.95939618e-02 -1.02689721e-01  8.56672451e-02
  2.60421466e-02 -8.02441686e-02 -6.02043159e-02  4.40684007e-03
  6.94212876e-03 -2.52083782e-02  5.34914713e-03  4.13522385e-02
  5.5230

In [44]:
# Check the lengths of the chunks and embeddings
num_chunks = len(chunks)
print(f"Number of chunks: {num_chunks}")
print(f"Number of embeddings: {len(embeddings)}")

Number of chunks: 301
Number of embeddings: 301


In [45]:
# Define the duplication size
dsize = 1  # You can set this to any value between 1 and n as per your experimentation requirements
total=dsize * len(chunks)
print("Total size", total)
# Initialize new lists for duplicated chunks and embeddings
duplicated_chunks = []
duplicated_embeddings = []

# Loop through the original lists and duplicate each entry
for i in range(len(chunks)):
    for _ in range(dsize):
        duplicated_chunks.append(chunks[i])
        duplicated_embeddings.append(embeddings[i])

# Checking the lengths of the duplicated lists
print(f"Number of duplicated chunks: {len(duplicated_chunks)}")
print(f"Number of duplicated embeddings: {len(duplicated_embeddings)}")

Total size 301
Number of duplicated chunks: 301
Number of duplicated embeddings: 301


## The Pinecone index

In [46]:
import os
from pinecone import Pinecone, ServerlessSpec

# initialize connection to pinecone (get API key at app.pinecone.io)
api_key = os.environ.get('PINECONE_API_KEY') or 'PINECONE_API_KEY'

from pinecone import Pinecone, ServerlessSpec
pc = Pinecone(api_key=PINECONE_API_KEY)

In [47]:
from pinecone import ServerlessSpec

index_name = 'bank-index-50000'
cloud = os.environ.get('PINECONE_CLOUD') or 'aws'
region = os.environ.get('PINECONE_REGION') or 'us-east-1'

spec = ServerlessSpec(cloud=cloud, region=region)

In [48]:
import time
import pinecone
# check if index already exists (it shouldn't if this is first time)
if index_name not in pc.list_indexes().names():
    # if does not exist, create index
    pc.create_index(
        index_name,
        dimension=384,  # dimension of the embedding model
        metric='cosine',
        spec=spec
    )
    # wait for index to be initialized
    time.sleep(1)

# connect to index
index = pc.Index(index_name)
# view index stats
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

## Upserting

In [49]:
# upsert function
def upsert_to_pinecone(data, batch_size):
    for i in range(0, len(data), batch_size):
        batch = data[i:i+batch_size]
        index.upsert(vectors=batch)
        #time.sleep(1)  # Optional: add delay to avoid rate limits

In [50]:
import pinecone
import time
import sys

start_time = time.time()  # Start timing before the request

# Function to calculate the size of a batch
def get_batch_size(data, limit=600000):  # limit set to 4MB to be safe
    total_size = 0
    batch_size = 0
    for item in data:
        item_size = sum([sys.getsizeof(v) for v in item.values()])
        if total_size + item_size > limit:
            break
        total_size += item_size
        batch_size += 1
    return batch_size

def batch_upsert(data):
    total = len(data)
    i = 0
    while i < total:
        batch_size = get_batch_size(data[i:])
        batch = data[i:i + batch_size]
        if batch:
            upsert_to_pinecone(batch,batch_size)
            i += batch_size
            print(f"Upserted {i}/{total} items...")  # Display current progress
        else:
            break
    print("Upsert complete.")

# Generate IDs for each data item
ids = [str(i) for i in range(1, len(duplicated_chunks) + 1)]

# Prepare data for upsert
data_for_upsert = [
    {"id": str(id), "values": emb, "metadata": {"text": chunk}}
    for id, (chunk, emb) in zip(ids, zip(duplicated_chunks, duplicated_embeddings))
]

# Upsert data in batches
batch_upsert(data_for_upsert)

response_time = time.time() - start_time  # Measure response time
print(f"Upsertion response time: {response_time:.2f} seconds")  # Print response time

Upserted 301/301 items...
Upsert complete.
Upsertion response time: 3.10 seconds


In [51]:
print("Index stats")
print(index.describe_index_stats(include_metadata=True))

Index stats
{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}


## Querying the Index

In [52]:
# Print the query results along with metadata
def display_results(query_results):
  for match in query_results['matches']:
    print(f"ID: {match['id']}, Score: {match['score']}")
    if 'metadata' in match and 'text' in match['metadata']:
        print(f"Text: {match['metadata']['text']}")
    else:
        print("No metadata available.")

In [53]:
def get_embedding(text, model=model_sentence):
    text = text.replace("\n", " ")
    embedding = model.encode(text,ignore_errors=True)
    return embedding

In [62]:
print("Querying vector store")
start_time = time.time()  # Start timing before the request
query_text = "Customer Hill CreditScore 608Age 41 Tenure 1Balance 83807.86NumOfProducts 1HasCrCard 0IsActiveMember 1EstimatedSalary 112542.58 Exited 0Complain 1Satisfaction Score 3Card Type DIAMONDPoint Earned 456"
query_embedding = get_embedding(query_text,model=model_sentence).tolist()
query_results = index.query(vector=query_embedding, top_k=1, include_metadata=True)  # Request metadata
#print("raw query_results",query_results)
print("processed query results")
display_results(query_results) #display results
response_time = time.time() - start_time              # Measure response time
print(f"Querying response time: {response_time:.2f} seconds")  # Print response time

Querying vector store
processed query results
ID: 2, Score: 0.714985132
Text: RowNumber: 2 CustomerId: 15647311 Surname: Hill CreditScore: 608 Geography: Spain Gender: Female Age: 41 Tenure: 1 Balance: 83807.86 NumOfProducts: 1 HasCrCard: 0 IsActiveMember: 1 EstimatedSalary: 112542.58 Exited: 0 Complain: 1 Satisfaction Score: 3 Card Type: DIAMOND Point Earned: 456
Querying response time: 1.00 seconds


In [63]:
data1

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited,Complain,Satisfaction Score,Card Type,Point Earned
0,1,15634602,Hargrave,619,France,Female,42,2,0.00,1,1,1,101348.88,1,1,2,DIAMOND,464
1,2,15647311,Hill,608,Spain,Female,41,1,83807.86,1,0,1,112542.58,0,1,3,DIAMOND,456
2,3,15619304,Onio,502,France,Female,42,8,159660.80,3,1,0,113931.57,1,1,3,DIAMOND,377
3,4,15701354,Boni,699,France,Female,39,1,0.00,2,0,0,93826.63,0,0,5,GOLD,350
4,5,15737888,Mitchell,850,Spain,Female,43,2,125510.82,1,1,1,79084.10,0,0,5,GOLD,425
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
296,297,15774510,Tien,714,France,Female,31,4,125169.26,1,1,1,106636.89,0,0,3,PLATINUM,669
297,298,15684173,Chang,687,Spain,Female,44,7,0.00,3,1,0,155853.52,1,1,2,SILVER,336
298,299,15650068,Johnson,511,France,Male,58,0,149117.31,1,1,1,162599.51,0,0,1,GOLD,517
299,300,15811490,French,627,France,Male,33,5,0.00,2,1,1,103737.82,0,0,4,PLATINUM,499
