In [1]:
import pandas as pd
import pyarrow.parquet as pa
import spacy


In [2]:
#imports the load dataset from the Hugging Face DataSet library
from datasets import load_dataset
#loads the first 10,0000 rows from Parquet files in the variable data
data = load_dataset("parquet", data_files="train-00000-of-00001-090b52ccb189d47a.parquet", split='train[:10000]')
data

Dataset({
    features: ['text'],
    num_rows: 10000
})

In [3]:
import re
#checking if row is title 
def is_title(row):
    return len(row['text'].split()) <= 2
    

In [4]:
def separate_title_and_text(dataset):
    final_rows = []
    current_title = None
    accumulated_text = []

    for row in dataset:
        if is_title(row):  # Identify if the row is a title
            if current_title is not None:
                # Save the previous title and its accumulated text
                final_rows.append({
                    'title': current_title,
                    'text': ' '.join(accumulated_text)
                })
            # Update the current title and reset the accumulated text
            current_title = row['text']
            accumulated_text = []
        else:
            # If it's part of the text, accumulate it
            accumulated_text.append(row['text'])

    # Append the last title and its text if exists
    if current_title is not None:
        final_rows.append({
            'title': current_title,
            'text': ' '.join(accumulated_text)
        })

    return final_rows

wiki_data = separate_title_and_text(data)


In [5]:
# converts lists into Hugging Face Dataset
wiki_data=data.from_list(wiki_data)

In [6]:
# Convert wiki_data into an array of dictionaries and add in a unique id for each article
wiki_dictionaries = []
for i in range(len(wiki_data)):
    new_dict = {"id": i, "title": wiki_data['title'][i], "text": wiki_data['text'][i]}
    wiki_dictionaries.append(new_dict)

In [7]:
#converts the list of dictionaries into Hugging Face Dataset
wiki_data=wiki_data.from_list(wiki_dictionaries)

In [8]:
import tiktoken
tiktoken.encoding_for_model('gpt-3.5-turbo')

<Encoding 'cl100k_base'>

In [9]:
import tiktoken

tokenizer = tiktoken.get_encoding('cl100k_base')

# create the length function
def tiktoken_len(text):
    tokens = tokenizer.encode(
        text,
        disallowed_special=()
    )
    return len(tokens)

In [10]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [11]:
# Split text into 400-char chunks with 20-char overlap
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=20,
    length_function=tiktoken_len,
    separators=["\n\n", "\n", " ", ""], 
)

In [24]:
#data_files="train-00000-of-00001-090b52ccb189d47a.parquet"

In [12]:
data_files = wiki_data

In [13]:
from transformers import AutoTokenizer, AutoModel
import torch

model_name = "sentence-transformers/all-mpnet-base-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

data_files.set_format("torch", columns=["text"])

def generate_embedding(batch):
    inputs = tokenizer(batch['text'], return_tensors="pt", padding=True, truncation=True)
    with torch.no_grad():
        embeddings = model(**inputs).last_hidden_state.mean(dim=1)
    batch_embeddings=embeddings.numpy().tolist()
    return batch

In [14]:
embeddings = data_files.map(generate_embedding,batched=True,batch_size=16)

Map:   0%|          | 0/988 [00:00<?, ? examples/s]

In [15]:
import pinecone

In [16]:
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key="188cbb39-7aad-4697-8c0f-d9172a679d56")

ids = []
for i in range(len(embeddings)):
    ids.append(i)

vectors = []
for i, e in zip(ids, embeddings):
    vectors.append({
        "id": i,
        "values": e,
    })

index_name = "quickstart"

pc.create_index(
    name=index_name,
    dimension=2, # Replace with your model dimensions
    metric="cosine", # Replace with your model metric
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    ) 
)

index.upsert(
    vectors=vectors,
    namespace="ns1"
)

PineconeApiException: (409)
Reason: Conflict
HTTP response headers: HTTPHeaderDict({'content-type': 'text/plain; charset=utf-8', 'access-control-allow-origin': '*', 'vary': 'origin,access-control-request-method,access-control-request-headers', 'access-control-expose-headers': '*', 'x-pinecone-api-version': '2024-07', 'X-Cloud-Trace-Context': 'd89886ba2da0dfed02df71fc0361a63f', 'Date': 'Fri, 15 Nov 2024 03:12:19 GMT', 'Server': 'Google Frontend', 'Content-Length': '85', 'Via': '1.1 google', 'Alt-Svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000'})
HTTP response body: {"error":{"code":"ALREADY_EXISTS","message":"Resource  already exists"},"status":409}


In [6]:
pc

<pinecone.control.pinecone.Pinecone at 0x11fe586d0>