# Installing Dependencies

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel.
# pinecone-client is held below 3.0 because this notebook calls pinecone.init(),
# which the 3.x client no longer provides.
%pip install -q langchain openai "pinecone-client<3" jq tiktoken

# Importing Dependencies

In [None]:
from langchain.vectorstores import Pinecone
from langchain.embeddings import OpenAIEmbeddings
import pinecone
from langchain.document_loaders import JSONLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# API keys and Env

In [4]:
import os

# Credentials are read from environment variables so that real keys never have
# to be typed into (and saved with) the notebook. Each value falls back to an
# empty string when its variable is not set.
openai_api_key = os.environ.get("OPENAI_API_KEY", "")

PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "")

PINECONE_API_ENV = os.environ.get("PINECONE_API_ENV", "")

# Reading the JSON file

In [7]:
import json
from pathlib import Path
from pprint import pprint


# Location of the customer dataset.
file_path='/kaggle/input/json-dataset-of-people/Customer data.json'
# Decode explicitly as UTF-8 instead of relying on the platform's default
# encoding, so non-ASCII values are read the same way on every machine.
# The parsed result is iterated record by record in the cells that follow.
data = json.loads(Path(file_path).read_text(encoding="utf-8"))

# The data insertion process requires a specific format: a list of (id, vector) tuples. See more at https://docs.pinecone.io/docs/insert-data

## For easy access, we use the customers' names as the ids, and the vectors are generated from the rest of the data.

# Obtaining the names for our 'ids' part.

In [None]:

# Build the ids: one "first_name last_name" string per record, in the same
# order as the records appear in `data`.
names_list = [
    customer["first_name"] + " " + customer["last_name"]
    for customer in data
]
names_list

# The list below holds, for each customer, a single text string built from all the remaining fields; these strings are what will be embedded

In [12]:
# Fields that are joined (space-separated, in this order) into the text that
# gets embedded for each record.
embedded_fields = ("Ethnicity", "Location", "Occupation", "email", "gender", "ip_address")
vector_list = [
    " ".join(record[field] for field in embedded_fields)
    for record in data
]
# Preview only the first three texts.
vector_list[:3]

['White Guararapes Nuclear Power Engineer pmacmaster0@bandcamp.com Male 208.24.50.172',
 'Tongan Fram Health Coach II csantori1@wisc.edu Female 151.160.135.93',
 'Delaware Bandera VP Product Management ccadle2@flavors.me Male 196.150.14.252']

# Embedding the documents using OpenAI

In [13]:
# Create the embedding client used to turn the customer texts into vectors,
# authenticated with the OpenAI key defined earlier in the notebook.
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)

In [14]:
# Embed every text in vector_list. The result is indexed position-by-position
# against names_list further down, so it is expected to contain one vector per
# input text, in the same order — TODO confirm against the langchain docs.
v_embeddings = embeddings.embed_documents(vector_list)

# The OpenAI embeddings used here have dimension 1536.

In [12]:
# Length (dimensionality) of the first embedding vector.
len(v_embeddings[0])

1536

# Combining them both gives the (id,vector) tuple that can now be upserted to pinecone.

In [15]:
# Pair each customer name (the id) with the embedding at the same position,
# giving the (id, vector) tuples that will be upserted.
combined_list = [
    (customer_name, v_embeddings[position])
    for position, customer_name in enumerate(names_list)
]

# The data needs to be split into batches of 100 to be upserted.

# Splitting the data and storing them in one list.

In [16]:
BATCH_SIZE = 100  # number of (id, vector) tuples per upsert request

# Slice combined_list into consecutive batches of at most BATCH_SIZE items.
# Unlike ten hand-written slices, this keeps every record when there are more
# than 1000 and produces no empty batches when there are fewer.
main_list = [
    combined_list[start:start + BATCH_SIZE]
    for start in range(0, len(combined_list), BATCH_SIZE)
]

# Kept under their original names because the batch-size check further down
# prints the lengths of the first and the tenth batch.
first_100 = combined_list[:BATCH_SIZE]
tenth_100 = combined_list[9 * BATCH_SIZE:10 * BATCH_SIZE]


# Checking that the lists are in batches of 100, as recommended

In [25]:
# Print the sizes of the first and the tenth batch, one per line.
print(len(first_100), len(tenth_100), sep="\n")

100

100


# The length of the list should be 10

In [17]:
# Number of batches held in main_list.
len(main_list)

10

# Initialize pinecone

In [18]:
# Configure the Pinecone client with the API key and environment defined
# earlier in the notebook.
pinecone.init(
    api_key=PINECONE_API_KEY,  # find at app.pinecone.io
    environment=PINECONE_API_ENV  # next to api key in console
)
# Name of the target index. Nothing in this notebook creates the index, so it
# presumably has to exist already — TODO confirm.
index_name = "insert-data" # put in the name of your pinecone index here

# Instantiating the index.

In [19]:
# Open a handle to the index named by index_name instead of repeating the
# literal, so the index name only has to be changed in one place.
index = pinecone.Index(index_name)

# Data should be upserted in batches of at most 100 vectors, as recommended. The loop below upserts the batches one at a time

In [20]:
# Send each batch of (id, vector) tuples to the index, one request per batch.
for batch in main_list:
    index.upsert(batch)

# Describing the index

In [23]:
# Show the configuration and status of the index named by index_name (uses the
# shared variable rather than repeating the literal).
pinecone.describe_index(index_name)

IndexDescription(name='insert-data', metric='cosine', replicas=1, dimension=1536.0, shards=1, pods=1, pod_type='p1', status={'ready': True, 'state': 'Ready'}, metadata_config=None, source_collection='')