# Installing Dependencies.

In [None]:
!pip install langchain
!pip install openai
!pip install pinecone-client
!pip install jq
!pip install tiktoken

# Importing Dependencies.

In [None]:
import json
import os
from getpass import getpass

import pinecone
from langchain.document_loaders import JSONLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Pinecone


# API keys and Env.

In [4]:
def _read_secret(env_var, prompt):
    """Return the value of environment variable `env_var`.

    When the variable is unset or empty, ask for the value interactively
    without echoing it, so the secret never has to be typed into the notebook.
    """
    return os.environ.get(env_var) or getpass(prompt)


# Credentials are taken from the environment (or prompted for) instead of being
# written into the notebook, where they could be committed or shared by accident.
openai_api_key = _read_secret("OPENAI_API_KEY", "OpenAI API key: ")

PINECONE_API_KEY = _read_secret("PINECONE_API_KEY", "Pinecone API key: ")

# The environment name is not a secret, so a normal echoed prompt is used as fallback.
PINECONE_API_ENV = os.environ.get("PINECONE_API_ENV") or input("Pinecone environment: ")

# Loading the JSON file into Python objects (a list of dictionaries).

In [None]:
# Location of the customer dataset.
CUSTOMER_DATA_PATH = '/kaggle/input/json-dataset-of-people/Customer data.json'

# Read the raw JSON text; the encoding is explicit so the result does not
# depend on the platform's default encoding.
with open(CUSTOMER_DATA_PATH, 'r', encoding='utf-8') as file:
    json_data = file.read()

# Parse the JSON text. Later cells index and iterate the result as a list of
# per-person dictionaries.
data_dict = json.loads(json_data)

# Show only the record count and the first record instead of dumping the whole
# dataset (the records hold names, e-mail addresses and IP addresses).
print(f"Loaded {len(data_dict)} records")
print(data_dict[:1])

# Removing the names from our metadata list.

In [6]:
# Drop the name fields from every record so they are not part of the metadata.
# pop() with a default (instead of del) keeps this cell safe to re-run and
# tolerates records that never had one of the keys.
for obj in data_dict:
    for name_key in ('first_name', 'last_name'):
        obj.pop(name_key, None)

# Initialize pinecone.

In [8]:
# Connect the Pinecone client with the credentials configured earlier
# (API key: app.pinecone.io; environment: shown next to the key in the console).
pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_API_ENV)

# Name of the Pinecone index this notebook works with; replace it with your own.
index_name = "metadata-insert"



# Getting embeddings ready.

In [11]:
# Build the OpenAI embedding client; it is used below to embed the query text.
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)

# Instantiating the index.

In [9]:
# Open a handle to the index, reusing index_name so the name is defined in one place only.
index = pinecone.Index(index_name)

# Querying using metadata.

In [12]:
# Natural-language query text that will be embedded and sent to the index.
text = "Return anyone with id given by the metadata"

# Turn the query text into a vector using the embedding client created above.
query_vector = embeddings.embed_query(text)


# Link to Pinecone's metadata filtering docs: https://docs.pinecone.io/docs/metadata-filtering

## We used all the keys in each record as metadata, excluding only `first_name` and `last_name`. So the metadata fields we can use when querying are: ['id', 'email', 'gender', 'ip_address', 'Location', 'Occupation', 'Ethnicity']

In [13]:
# List the fields of the first record after the name fields were removed.
print(data_dict[0].keys())

dict_keys(['id', 'email', 'gender', 'ip_address', 'Location', 'Occupation', 'Ethnicity'])


In [14]:
# Query the index with the embedded text, restricted by a metadata filter:
# only vectors whose "id" metadata equals 5 are considered, and the single
# best match is returned together with its metadata.
result = index.query(
    vector=query_vector,
    top_k=1,
    filter={"id": 5},
    include_metadata=True,
)

In [15]:
# Display the raw response returned by the query above.
result

{'matches': [{'id': 'Beverie Frandsen',
              'metadata': {'Ethnicity': 'Yakama',
                           'Location': 'Longwei',
                           'Occupation': 'Developer III',
                           'email': 'bfrandsen4@cargocollective.com',
                           'gender': 'Female',
                           'id': 5.0,
                           'ip_address': '235.124.253.241'},
              'score': 0.680275083,
              'values': []}],
 'namespace': ''}

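You can check this result against the original data to confirm that it is accurate. Note that `id` comes back as `5.0` because Pinecone stores numeric metadata values as floating-point numbers.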