# Installing Dependencies

In [None]:
!pip install langchain
!pip install openai
!pip install pinecone-client
!pip install jq
!pip install tiktoken

# Importing Dependencies

In [2]:
from langchain.vectorstores import Pinecone
from langchain.embeddings import OpenAIEmbeddings
import pinecone
from langchain.document_loaders import JSONLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

  from tqdm.autonotebook import tqdm


# API keys and Env

In [3]:
import os

# Credentials are read from environment variables so that real keys never have to be
# typed into (and saved with) the notebook. Each value falls back to an empty string,
# which is the placeholder this cell used before, when its variable is not set.
openai_api_key = os.environ.get("OPENAI_API_KEY", "")

PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY", "")

PINECONE_API_ENV = os.environ.get("PINECONE_API_ENV", "")

# Reading the JSON file

In [4]:
import json
from pathlib import Path
from pprint import pprint


# Location of the dataset file (a Kaggle input path).
file_path='/kaggle/input/json-dataset-of-people/Customer data.json'

# Read the file as UTF-8 explicitly rather than relying on the platform's default
# locale encoding, then parse the text into Python objects.
data = json.loads(Path(file_path).read_text(encoding="utf-8"))

# Setting up OpenAI embeddings for use with Pinecone

In [5]:
# Create the OpenAI embedding client; the prompts further down are turned into
# query vectors through this object.
embeddings = OpenAIEmbeddings(
    openai_api_key=openai_api_key,
)

# Initialize pinecone

In [6]:
# Connect the Pinecone client. The API key is shown at app.pinecone.io and the
# environment name is listed next to it in the console.
pinecone.init(api_key=PINECONE_API_KEY, environment=PINECONE_API_ENV)

# Name of the Pinecone index to work with -- replace with the name of your own index.
index_name = "insert-data"

# Instantiating the index.

In [7]:
# Open a handle to the index named by `index_name` (assigned in the initialisation
# cell) instead of repeating the literal, so the handle follows the configured name.
index = pinecone.Index(index_name)

# Describing the index

In [8]:
# List the indexes visible to the initialised Pinecone client; as the last
# expression in the cell, the returned list is displayed as the cell output.
pinecone.list_indexes()

['insert-data']

In [23]:
# Show the configuration of the index named by `index_name` (assigned in the
# initialisation cell) rather than a repeated literal.
pinecone.describe_index(index_name)

IndexDescription(name='insert-data', metric='cosine', replicas=1, dimension=1536.0, shards=1, pods=1, pod_type='p1', status={'ready': True, 'state': 'Ready'}, metadata_config=None, source_collection='')

# Querying the data.

# Prompts

In [13]:
# Natural-language prompts to search the index with.
text = "Give a list of only asian women"
text_2 = " list only engineers"

# Embed both prompts, in order, with the OpenAI embedding client.
query_vector, query_vector2 = [
    embeddings.embed_query(prompt) for prompt in (text, text_2)
]

# Query formulation in Pinecone
## Note: metadata has not been added.

In [10]:
# Ask the index for the 5 closest matches to the first prompt's vector,
# requesting the stored vector values along with each match.
result = index.query(vector=query_vector, top_k=5, include_values=True)

In [16]:
# Ask the index for the 10 closest matches to the second prompt's vector,
# requesting the stored vector values along with each match.
result2 = index.query(vector=query_vector2, top_k=10, include_values=True)

# Finding only Asian women in the data.

In [12]:
# Pull the list of matches out of the first query's response.
# NOTE(review): the query cell passes no filter, so these are simply the top-ranked
# vectors for the prompt, not a guaranteed category match -- confirm this is intended.
matches = result['matches']

# Print one line per match: its id followed by its score.
for match in matches:
    print(f"ID: {match['id']}, Score: {match['score']}")

ID: Dom Craise, Score: 0.801425159
ID: Esme Le Estut, Score: 0.800467789
ID: Meaghan Chater, Score: 0.799819767
ID: Fredi Tilberry, Score: 0.797618032
ID: Gussi Le Grice, Score: 0.794192433


# Finding engineers.

In [17]:
# Pull the list of matches out of the second query's response; this rebinds the
# `matches` name used for the first query's results.
# NOTE(review): the query cell passes no filter, so these are simply the top-ranked
# vectors for the prompt, not a guaranteed category match -- confirm this is intended.
matches = result2['matches']

# Print one line per match: its id followed by its score.
for match in matches:
    print(f"ID: {match['id']}, Score: {match['score']}")

ID: Conn Cawte, Score: 0.773957968
ID: Errol Dutch, Score: 0.771544516
ID: Alano Densun, Score: 0.769783
ID: Judah Plues, Score: 0.769239306
ID: Harley Kelso, Score: 0.769052267
ID: Burgess Egdal, Score: 0.769000947
ID: Basilio Daskiewicz, Score: 0.768206775
ID: Petrina Brunsen, Score: 0.767332196
ID: Les Piers, Score: 0.767185032
ID: Corenda Anselmi, Score: 0.766613841
