In [13]:
# %pip install pinecone-client  # uncomment and run once if the Pinecone SDK is not installed in this kernel

In [15]:
from dotenv import load_dotenv
import os
import json
import pandas as pd
from pinecone import Pinecone, ServerlessSpec

## Setup

In [10]:
# Load variables from a .env file (if one is found) into the process environment.
load_dotenv()

# Either value is None when the corresponding variable is not set.
PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
DATA_DIRECTORY = os.environ.get('DATA_DIRECTORY')

# Pinecone client shared by the cells below.
pc = Pinecone(api_key=PINECONE_API_KEY)

## Import data

In [44]:
# Read the source CSV from the configured data directory.
csv_file_path = os.path.join(DATA_DIRECTORY, 'data.csv')
data = pd.read_csv(csv_file_path)

# Normalise the id column to strings.
data['id'] = data['id'].astype(str)

# Drop this textbook: it exceeds the storage limit.
# .copy() makes the filtered frame independent, so later column assignments
# on `data` do not raise pandas' SettingWithCopyWarning.
data = data[data['textbook'] != "An Introduction to Statistical Learning"].copy()

# Last expression of the cell, so the preview is actually rendered.
data.head()

## Create database

In [42]:
# Index configuration; cloud and region fall back to defaults when unset or empty.
INDEX_NAME = os.getenv('INDEX_NAME')
CLOUD = os.getenv('CLOUD') or 'aws'
REGION = os.getenv('REGION') or 'us-east-1'

# NOTE(review): presumably the length of each vector in data['embedding'] — confirm.
EMBEDDING_DIMENSION = 1024

if not INDEX_NAME:
    # Fail early with a readable message instead of an opaque client error.
    raise ValueError("INDEX_NAME environment variable is not set")

spec = ServerlessSpec(cloud=CLOUD, region=REGION)

# list_indexes() yields index descriptions rather than plain names, so a bare
# `INDEX_NAME not in pc.list_indexes()` is always true and would try to
# re-create an existing index. Compare against the list of names instead.
if INDEX_NAME not in pc.list_indexes().names():
    pc.create_index(
        INDEX_NAME,
        metric="dotproduct",
        dimension=EMBEDDING_DIMENSION,
        spec=spec,
    )

# Handle used for the data upload.
index = pc.Index(INDEX_NAME)

### Upload data

In [45]:
# Number of vectors sent per upsert request; keeps each request well under
# Pinecone's per-request size and record-count limits.
UPSERT_BATCH_SIZE = 100

# Decode the JSON-encoded embeddings. The isinstance guard makes the cell
# re-runnable: values already decoded by a previous run are left untouched.
data['embedding'] = data['embedding'].apply(
    lambda raw: json.loads(raw) if isinstance(raw, str) else raw
)

# One (id, values, metadata) tuple per row.
to_insert = data.apply(
    lambda row: (
        row['id'],
        row['embedding'],
        {"content": row['content'], "textbook": row['textbook'], "chapter": row['chapter']},
    ),
    axis=1,
).tolist()

# Upload in batches rather than in one oversized request.
for start in range(0, len(to_insert), UPSERT_BATCH_SIZE):
    index.upsert(vectors=to_insert[start:start + UPSERT_BATCH_SIZE])

print("Data uploaded successfully!")