# Installing Dependencies.

In [None]:
!pip install langchain
!pip install openai
!pip install pinecone-client
!pip install jq
!pip install tiktoken

# Importing Dependencies.

In [None]:
from langchain.vectorstores import Pinecone
from langchain.embeddings import OpenAIEmbeddings
import pinecone
from langchain.document_loaders import JSONLoader


# API keys and Env.

In [3]:
import os

# Read credentials from the environment so real keys are never saved inside
# the notebook. A variable that is not set falls back to '' (the previous
# placeholder value), so the names below always exist as strings.
openai_api_key = os.environ.get("OPENAI_API_KEY", "")

PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "")

PINECONE_API_ENV = os.environ.get("PINECONE_API_ENV", "")

# Loading the JSON file into a list of dictionaries.

In [None]:
import json

# Location of the customer dataset inside the Kaggle input directory
CUSTOMER_DATA_PATH = '/kaggle/input/json-dataset-of-people/Customer data.json'

# Parse straight from the file handle. The encoding is given explicitly
# because the records contain non-ASCII text (e.g. place names).
with open(CUSTOMER_DATA_PATH, 'r', encoding='utf-8') as file:
    data_dict = json.load(file)

# data_dict is a list of per-person dictionaries. Preview only a few records
# instead of dumping the whole dataset into the cell output.
print(f"Loaded {len(data_dict)} records")
print(data_dict[:3])

# Removing the names from our metadata list.

In [5]:
# Strip the name fields from every record in place.
# pop() with a default keeps this cell re-runnable: a second execution is a
# no-op, whereas `del` would raise KeyError once the keys are gone.
for obj in data_dict:
    obj.pop('first_name', None)
    obj.pop('last_name', None)

# Initialize pinecone.

In [6]:
# Name of the Pinecone index used by this notebook (replace with your own)
index_name = "metadata-insert"

# Connect the pinecone client. The API key is listed at app.pinecone.io and
# the environment name is shown next to it in the console.
pinecone.init(environment=PINECONE_API_ENV, api_key=PINECONE_API_KEY)



# Getting embeddings ready.

In [7]:
# Embedding client used to turn the query text into a vector
embeddings = OpenAIEmbeddings(
    openai_api_key=openai_api_key,
)

# Instantiating the index.

In [8]:
# Open a handle to the index named in `index_name` (set in the pinecone
# initialisation cell) instead of repeating the literal index name here.
index = pinecone.Index(index_name)

# Querying using metadata.

In [9]:
# Query text that is embedded once and reused by every filtered search below
text = "Return anyone described by the metadata"

# Convert the query text into its embedding vector
query_vector = embeddings.embed_query(text)


# link to Metadata Filtering https://docs.pinecone.io/docs/metadata-filtering

## We used all the keys in each record as metadata, excluding only first_name and last_name. So the metadata fields we can use when querying are: ['id', 'email', 'gender', 'ip_address', 'Location', 'Occupation', 'Ethnicity']

In [13]:
# List the fields of the first record; these are the keys available for filtering
sample_record = data_dict[0]
print(sample_record.keys())

dict_keys(['id', 'email', 'gender', 'ip_address', 'Location', 'Occupation', 'Ethnicity'])


# No. 1
## $eq - Equal to (number, string, boolean)
## The query returns anyone whose ethnicity is Filipino and whose gender is Female; increasing the top_k number returns as many of the matching entries as you need.

In [16]:
# $eq on two fields: a match has to satisfy both conditions
filipino_female_filter = {
    "Ethnicity": {"$eq": "Filipino"},
    "gender": {"$eq": "Female"},
}

result = index.query(
    vector=query_vector,
    filter=filipino_female_filter,
    top_k=4,
    include_metadata=True,
)

In [14]:
# Display the matches returned by the $eq query
result

{'matches': [{'id': 'Jerrilyn Crankshaw',
              'metadata': {'Ethnicity': 'Filipino',
                           'Location': 'São Bento',
                           'Occupation': 'Geological Engineer',
                           'email': 'jcrankshaw83@networksolutions.com',
                           'gender': 'Female',
                           'id': 292.0,
                           'ip_address': '130.220.32.157'},
              'score': 0.701002598,
              'values': []},
             {'id': 'Magdaia Amberger',
              'metadata': {'Ethnicity': 'Filipino',
                           'Location': 'Verkhniy Baskunchak',
                           'Occupation': 'Environmental Specialist',
                           'email': 'mambergercg@walmart.com',
                           'gender': 'Female',
                           'id': 449.0,
                           'ip_address': '85.51.110.216'},
              'score': 0.700918615,
              'values': []},
        

You can confirm from your original data if this is accurate. 

# No. 2
## $ne - Not equal to (number, string, boolean)
## Works just like in a traditional database: it returns all values except the one specified.
## Let's return anyone who is Filipino but isn't Female in the code below.

In [17]:
# $ne: keep Filipino entries whose gender is anything other than "Female"
not_female_query = {
    "vector": query_vector,
    "filter": {
        "Ethnicity": {"$eq": "Filipino"},
        "gender": {"$ne": "Female"},
    },
    "top_k": 4,
    "include_metadata": True,
}

result_2 = index.query(**not_female_query)

In [18]:
# Display the matches returned by the $ne query
result_2

{'matches': [{'id': 'Jacobo Whittuck',
              'metadata': {'Ethnicity': 'Filipino',
                           'Location': 'Frei Paulo',
                           'Occupation': 'Information Systems Manager',
                           'email': 'jwhittuckpz@engadget.com',
                           'gender': 'Male',
                           'id': 936.0,
                           'ip_address': '230.85.80.123'},
              'score': 0.69381088,
              'values': []},
             {'id': 'Brose Seemmonds',
              'metadata': {'Ethnicity': 'Filipino',
                           'Location': 'Salt Lake City',
                           'Occupation': 'Human Resources Manager',
                           'email': 'bseemmonds1r@google.com.br',
                           'gender': 'Genderfluid',
                           'id': 64.0,
                           'ip_address': '241.116.198.85'},
              'score': 0.691104174,
              'values': []},
             {

# No. 3
## $gt - Greater than (number).
## Use this when a metadata field is numeric and you want to filter by the size of its value.
## Let's return entries with data with id>100 in the query below.

In [19]:
# $gt: same ethnicity/gender conditions as before, plus id strictly above 100
id_above_100_filter = {
    "Ethnicity": {"$eq": "Filipino"},
    "gender": {"$ne": "Female"},
    "id": {"$gt": 100},
}

result_3 = index.query(
    vector=query_vector,
    filter=id_above_100_filter,
    top_k=4,
    include_metadata=True,
)

In [20]:
# Display the matches returned by the $gt query
result_3

{'matches': [{'id': 'Jacobo Whittuck',
              'metadata': {'Ethnicity': 'Filipino',
                           'Location': 'Frei Paulo',
                           'Occupation': 'Information Systems Manager',
                           'email': 'jwhittuckpz@engadget.com',
                           'gender': 'Male',
                           'id': 936.0,
                           'ip_address': '230.85.80.123'},
              'score': 0.69381088,
              'values': []},
             {'id': 'Jay Farrington',
              'metadata': {'Ethnicity': 'Filipino',
                           'Location': 'Llano Largo',
                           'Occupation': 'Account Coordinator',
                           'email': 'jfarringtonbu@cocolog-nifty.com',
                           'gender': 'Male',
                           'id': 427.0,
                           'ip_address': '192.2.79.199'},
              'score': 0.688752294,
              'values': []},
             {'id': 'Jech

Confirm the id values are above 100.

# No. 4
## $gte - Greater than or equal to (number)
## Works pretty much like No. 3, except that an id exactly equal to 100 is also included.

In [22]:
# $gte: like the $gt query, but an id equal to 100 also qualifies
id_at_least_100_query = {
    "vector": query_vector,
    "filter": {
        "Ethnicity": {"$eq": "Filipino"},
        "gender": {"$ne": "Female"},
        "id": {"$gte": 100},
    },
    "top_k": 4,
    "include_metadata": True,
}

result_4 = index.query(**id_at_least_100_query)

In [23]:
# Display the matches returned by the $gte query
result_4

{'matches': [{'id': 'Jacobo Whittuck',
              'metadata': {'Ethnicity': 'Filipino',
                           'Location': 'Frei Paulo',
                           'Occupation': 'Information Systems Manager',
                           'email': 'jwhittuckpz@engadget.com',
                           'gender': 'Male',
                           'id': 936.0,
                           'ip_address': '230.85.80.123'},
              'score': 0.69381088,
              'values': []},
             {'id': 'Jay Farrington',
              'metadata': {'Ethnicity': 'Filipino',
                           'Location': 'Llano Largo',
                           'Occupation': 'Account Coordinator',
                           'email': 'jfarringtonbu@cocolog-nifty.com',
                           'gender': 'Male',
                           'id': 427.0,
                           'ip_address': '192.2.79.199'},
              'score': 0.688752294,
              'values': []},
             {'id': 'Jech

# No. 5
## $lt - Less than (number)
## Returns only the entries whose numeric field is lower than the given value.

In [24]:
# $lt: restrict the Filipino / not-Female entries to ids strictly below 100
id_below_100_filter = {
    "Ethnicity": {"$eq": "Filipino"},
    "gender": {"$ne": "Female"},
    "id": {"$lt": 100},
}

result_5 = index.query(
    vector=query_vector,
    filter=id_below_100_filter,
    top_k=4,
    include_metadata=True,
)

In [25]:
# Display the matches returned by the $lt query
result_5

{'matches': [{'id': 'Brose Seemmonds',
              'metadata': {'Ethnicity': 'Filipino',
                           'Location': 'Salt Lake City',
                           'Occupation': 'Human Resources Manager',
                           'email': 'bseemmonds1r@google.com.br',
                           'gender': 'Genderfluid',
                           'id': 64.0,
                           'ip_address': '241.116.198.85'},
              'score': 0.691104174,
              'values': []},
             {'id': 'Aaron Schankel',
              'metadata': {'Ethnicity': 'Filipino',
                           'Location': 'Shahrisabz Shahri',
                           'Occupation': 'Account Executive',
                           'email': 'aschankel21@msn.com',
                           'gender': 'Male',
                           'id': 74.0,
                           'ip_address': '99.81.173.112'},
              'score': 0.679342866,
              'values': []},
             {'id': 'Pi

# No. 6
## $lte - Less than or equal to (number)
## Works pretty much like No. 5, except that an id exactly equal to 100 is also included.

In [27]:
# $lte: like the $lt query, but an id equal to 100 also qualifies
id_at_most_100_query = {
    "vector": query_vector,
    "filter": {
        "Ethnicity": {"$eq": "Filipino"},
        "gender": {"$ne": "Female"},
        "id": {"$lte": 100},
    },
    "top_k": 4,
    "include_metadata": True,
}

result_6 = index.query(**id_at_most_100_query)

In [28]:
# Display the matches returned by the $lte query
result_6

{'matches': [{'id': 'Brose Seemmonds',
              'metadata': {'Ethnicity': 'Filipino',
                           'Location': 'Salt Lake City',
                           'Occupation': 'Human Resources Manager',
                           'email': 'bseemmonds1r@google.com.br',
                           'gender': 'Genderfluid',
                           'id': 64.0,
                           'ip_address': '241.116.198.85'},
              'score': 0.691104174,
              'values': []},
             {'id': 'Aaron Schankel',
              'metadata': {'Ethnicity': 'Filipino',
                           'Location': 'Shahrisabz Shahri',
                           'Occupation': 'Account Executive',
                           'email': 'aschankel21@msn.com',
                           'gender': 'Male',
                           'id': 74.0,
                           'ip_address': '99.81.173.112'},
              'score': 0.679342866,
              'values': []},
             {'id': 'Pi

# No. 7
## $in - In array (string or number).
## Use this when you want to accept any one of several values for a particular metadata field.
## Let's say you want to return, together with the above filters, someone who is a 'Human Resources Manager' or a 'Software Test Engineer IV'.

In [29]:
# $in: on top of the earlier conditions, Occupation must be one of these values
wanted_occupations = ['Human Resources Manager', 'Software Test Engineer IV']
occupation_in_filter = {
    "Ethnicity": {"$eq": "Filipino"},
    "gender": {"$ne": "Female"},
    "id": {"$lte": 100},
    "Occupation": {"$in": wanted_occupations},
}

result_7 = index.query(
    vector=query_vector,
    filter=occupation_in_filter,
    top_k=4,
    include_metadata=True,
)

In [31]:
# Display the matches returned by the $in query
result_7

{'matches': [{'id': 'Brose Seemmonds',
              'metadata': {'Ethnicity': 'Filipino',
                           'Location': 'Salt Lake City',
                           'Occupation': 'Human Resources Manager',
                           'email': 'bseemmonds1r@google.com.br',
                           'gender': 'Genderfluid',
                           'id': 64.0,
                           'ip_address': '241.116.198.85'},
              'score': 0.691104174,
              'values': []},
             {'id': 'Pieter Banbrick',
              'metadata': {'Ethnicity': 'Filipino',
                           'Location': 'Jiangkou',
                           'Occupation': 'Software Test Engineer IV',
                           'email': 'pbanbrick2d@hugedomains.com',
                           'gender': 'Male',
                           'id': 86.0,
                           'ip_address': '124.26.63.148'},
              'score': 0.669276774,
              'values': []}],
 'namespace': 

# No. 8
## $nin - Not in array (string or number)
## Works just like No. 7 but excludes the named categories.
## Let's exclude 'Human Resources Manager' and 'Software Test Engineer IV' from our results.

In [32]:
# $nin: on top of the earlier conditions, Occupation must NOT be one of these values
excluded_occupations = ['Human Resources Manager', 'Software Test Engineer IV']
occupation_not_in_query = {
    "vector": query_vector,
    "filter": {
        "Ethnicity": {"$eq": "Filipino"},
        "gender": {"$ne": "Female"},
        "id": {"$lte": 100},
        "Occupation": {"$nin": excluded_occupations},
    },
    "top_k": 4,
    "include_metadata": True,
}

result_8 = index.query(**occupation_not_in_query)

In [33]:
# Display the matches returned by the $nin query
result_8

{'matches': [{'id': 'Aaron Schankel',
              'metadata': {'Ethnicity': 'Filipino',
                           'Location': 'Shahrisabz Shahri',
                           'Occupation': 'Account Executive',
                           'email': 'aschankel21@msn.com',
                           'gender': 'Male',
                           'id': 74.0,
                           'ip_address': '99.81.173.112'},
              'score': 0.679342866,
              'values': []}],
 'namespace': ''}

The only entry that survives all of the filters is the one above.