In [4]:
import pandas as pd
import pyarrow.parquet as pa
import spacy


In [5]:
#imports the load dataset from the Hugging Face DataSet library
from datasets import load_dataset
#loads the first 10,0000 rows from Parquet files in the variable data
data = load_dataset("parquet", data_files="train-00000-of-00001-090b52ccb189d47a.parquet", split='train[:10000]')
data

Dataset({
    features: ['text'],
    num_rows: 10000
})

In [6]:
import re
#checking if row is title 
def is_title(row):
    return len(row['text'].split()) <= 2
    

In [7]:
def separate_title_and_text(dataset):
    final_rows = []
    current_title = None
    accumulated_text = []

    for row in dataset:
        if is_title(row):  # Identify if the row is a title
            if current_title is not None:
                # Save the previous title and its accumulated text
                final_rows.append({
                    'title': current_title,
                    'text': ' '.join(accumulated_text)
                })
            # Update the current title and reset the accumulated text
            current_title = row['text']
            accumulated_text = []
        else:
            # If it's part of the text, accumulate it
            accumulated_text.append(row['text'])

    # Append the last title and its text if exists
    if current_title is not None:
        final_rows.append({
            'title': current_title,
            'text': ' '.join(accumulated_text)
        })

    return final_rows

wiki_data = separate_title_and_text(data)


In [8]:
# converts lists into Hugging Face Dataset
wiki_data=data.from_list(wiki_data)

In [9]:
# Convert wiki_data into an array of dictionaries and add in a unique id for each article
wiki_dictionaries = []
for i in range(len(wiki_data)):
    new_dict = {"id": i, "title": wiki_data['title'][i], "text": wiki_data['text'][i]}
    wiki_dictionaries.append(new_dict)

In [10]:
#converts the list of dictionaries into Hugging Face Dataset
wiki_data=wiki_data.from_list(wiki_dictionaries)

In [11]:
import tiktoken
tiktoken.encoding_for_model('gpt-3.5-turbo')

<Encoding 'cl100k_base'>

In [12]:
import tiktoken

tokenizer = tiktoken.get_encoding('cl100k_base')

# create the length function
def tiktoken_len(text):
    tokens = tokenizer.encode(
        text,
        disallowed_special=()
    )
    return len(tokens)

In [13]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [14]:
# Split text into 400-char chunks with 20-char overlap
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=20,
    length_function=tiktoken_len,
    separators=["\n\n", "\n", " ", ""], 
)

In [15]:
#data_files="train-00000-of-00001-090b52ccb189d47a.parquet"

In [16]:
data_files = wiki_data

In [17]:
from transformers import AutoTokenizer, AutoModel
import torch

model_name = "sentence-transformers/all-mpnet-base-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

data_files.set_format("torch", columns=["text"])

def generate_embedding(batch):
    inputs = tokenizer(batch['text'], return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        embeddings = model(**inputs).last_hidden_state.mean(dim=1)
    batch_embeddings=embeddings.numpy().tolist()
    return batch

In [18]:
embeddings = data_files.map(generate_embedding,batched=True,batch_size=16)

Map: 100%|█████████████████████████████| 988/988 [04:39<00:00,  3.53 examples/s]


In [39]:
import pinecone

In [43]:
from pinecone import Pinecone, ServerlessSpec
import torch
from transformers import AutoTokenizer, AutoModel


# Initialize Pinecone
pc = Pinecone(api_key="188cbb39-7aad-4697-8c0f-d9172a679d56")
index_name = "experiment2"


# Get existing index or create new one
try:
    index = pc.Index(index_name)
except:
    pc.create_index(
        name=index_name,
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )
    index = pc.Index(index_name)


# Prepare and upload vectors in batches
batch_size = 50
for i in range(0, len(embeddings), batch_size):
    batch_vectors = []
    for j, e in enumerate(embeddings[i:i + batch_size]):
        batch_vectors.append({
            "id": str(i + j),
            "values": e,
        })
    
    # Upsert batch
    index.upsert(
        vectors=batch_vectors,
        namespace="ns1"
    )
    print(f"Uploaded batch {i//batch_size + 1}")
    

Uploaded batch 1
Uploaded batch 2
Uploaded batch 3
Uploaded batch 4
Uploaded batch 5
Uploaded batch 6
Uploaded batch 7
Uploaded batch 8
Uploaded batch 9
Uploaded batch 10
Uploaded batch 11
Uploaded batch 12
Uploaded batch 13
Uploaded batch 14
Uploaded batch 15
Uploaded batch 16
Uploaded batch 17
Uploaded batch 18
Uploaded batch 19
Uploaded batch 20


In [41]:
# indexes = index.list_indexes()
# print("Indexes:", indexes)
index_name = "experiment" 
index = pc.describe_index(index_name)
print(index)
print(embeddings[0].shape)

{'deletion_protection': 'disabled',
 'dimension': 1536,
 'host': 'experiment-3x6unml.svc.aped-4627-b74a.pinecone.io',
 'metric': 'cosine',
 'name': 'experiment',
 'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
 'status': {'ready': True, 'state': 'Ready'}}
torch.Size([768])


In [49]:
# Define the query
query = "Tell me about the tech company known as Apple"

# Generate the query embedding
inputs = tokenizer(query, return_tensors="pt", padding=True, truncation=True)
with torch.no_grad():
    query_embedding = model(**inputs).last_hidden_state.mean(dim=1).numpy()[0]

# Query the Pinecone index
results = index.query(
    namespace="ns1",  # Ensure this matches your namespace
    vector=query_embedding.tolist(),
    top_k=3,
    include_values=False,
    include_metadata=True
)

# Print the results
print(results)

{'matches': [{'id': '296',
              'score': 0.570666552,
              'values': [0.0156990588,
                         -0.208821476,
                         -0.0209778342,
                         -0.00953460857,
                         -0.0218118504,
                         0.0796902478,
                         -0.0217291452,
                         -0.0261138286,
                         0.0788325667,
                         0.045102451,
                         -0.00973570533,
                         0.126339719,
                         0.0329154506,
                         0.0262293536,
                         -0.0586323,
                         0.00346624292,
                         0.252259374,
                         0.0105415098,
                         -0.0623872355,
                         0.0240055826,
                         -0.0134642515,
                         -0.0167109147,
                         -0.0619047098,
                         -0.0086

In [6]:
pc

<pinecone.control.pinecone.Pinecone at 0x11fe586d0>