In [13]:
import os 
from dotenv import find_dotenv, load_dotenv
from pinecone import Pinecone, ServerlessSpec
import pandas as pd
from langchain_openai.embeddings import OpenAIEmbeddings

In [32]:
# Find a .env file and load its variables into the process environment so the
# os.getenv lookups below can see them; the return value is discarded via `_`.
_ = load_dotenv(find_dotenv())

In [33]:
# Report only WHETHER the key is configured. Never echo the secret itself:
# cell output is saved inside the notebook and ends up wherever the file is shared.
# Any key that was previously displayed here should be treated as compromised and rotated.
pinecone_key_is_set = bool(os.getenv('PINECONE_API_KEY'))
pinecone_key_is_set


In [201]:
# Read the Spider question/query table (first CSV column is the row index) and
# keep only the three columns used in this notebook.
df = pd.read_csv(
    '../../data/spider_data_with_type.csv',
    index_col=[0],
)[['question', 'query', 'query_type']]
df.shape

(8034, 3)

In [202]:
import ast

# query_type is stored in the CSV as the string form of a Python list; parse it
# into a real list. Values that are already lists are left untouched so the cell
# can be re-run without ast.literal_eval raising on already-parsed data.
df = df.assign(
    query_type=df['query_type'].map(
        lambda value: value if isinstance(value, list) else ast.literal_eval(value)
    )
)
df

Unnamed: 0,question,query,query_type
0,How many heads of the departments are older th...,SELECT count(*) FROM head WHERE age > 56,"[aggregate, filter]"
1,"List the name, born state and age of the heads...","SELECT name , born_state , age FROM head ORD...",[aggregate]
2,"List the creation year, name and budget of eac...","SELECT creation , name , budget_in_billions ...",[standard]
3,What are the maximum and minimum budget of the...,"SELECT max(budget_in_billions) , min(budget_i...",[aggregate]
4,What is the average number of employees of the...,SELECT avg(num_employees) FROM department WHER...,"[aggregate, filter]"
...,...,...,...
8029,What are the citizenships that are shared by s...,SELECT Citizenship FROM singer WHERE Birth_Yea...,"[filter, combine]"
8030,How many available features are there in total?,SELECT count(*) FROM Other_Available_Features,[aggregate]
8031,What is the feature type name of feature AirCon?,SELECT T2.feature_type_name FROM Other_Availab...,"[filter, combine]"
8032,Show the property type descriptions of propert...,SELECT T2.property_type_description FROM Prope...,"[aggregate, combine]"


In [203]:
# Label-based split on the row index: .loc slices include both end labels, so
# labels 0..8000 go to train and labels from 8001 onward go to test.
df_train, df_test = df.loc[:8000], df.loc[8001:]

(df_train.shape, df_test.shape)

((8001, 3), (33, 3))

## Build Pinecone index

In [34]:
from pinecone import Pinecone, ServerlessSpec

# Pinecone client configured with the key read from the environment; os.getenv
# yields None when PINECONE_API_KEY is not set.
pc = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))



In [39]:
# Vector dimension of the index; it must equal the length of the vectors that
# `embedd` produces, because those vectors are what gets upserted.
dim_size = 1536

# Name of the Pinecone index created and queried in this notebook.
index_name = 'snlp-test'

# Embedding client used for both the stored questions and the test query.
embedd = OpenAIEmbeddings()

In [40]:
# Create the index only if it does not exist yet, so the cell can be re-run
# (e.g. Restart & Run All) without failing on an already-taken index name.
if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=dim_size,
        # Euclidean distance: the query results below are ordered by ascending
        # score. "cosine" is the alternative metric that was considered.
        metric="euclidean",
        spec=ServerlessSpec(
            cloud='aws',
            region='us-west-2'
        )
    )

In [165]:
# Handle to the index named by `index_name`; the cells below use it to upsert,
# query, inspect and delete vectors.
index = pc.Index(index_name)

In [204]:
# Keep a reproducible 200-row sample (fixed random_state) for the upload below.
# NOTE(review): this rebinds df_train, so re-running the cell draws from the
# previous sample instead of the original training split.
df_train = df_train.sample(n = 200, random_state=42)
df_train.head()

Unnamed: 0,question,query,query_type
2215,Find the id and location of circuits that belo...,"SELECT circuitid , LOCATION FROM circuits WHE...",[filter]
2582,Which room has the highest rate? List the room...,"SELECT T2.roomName , T1.Rate , T1.CheckIn , ...","[aggregate, combine]"
1662,Return the name of the artist who has the late...,SELECT name FROM artist ORDER BY year_join DES...,[aggregate]
3027,In how many different cities are banks located?,SELECT count(DISTINCT city) FROM bank,[aggregate]
6974,Show all book categories and the number of boo...,"SELECT category , count(*) FROM book_club GRO...",[aggregate]


In [198]:
import itertools

def chunks(iterable, batch_size = 100):
    """Lazily split ``iterable`` into consecutive tuples of at most ``batch_size`` items.

    Items keep their original order; the final tuple may be shorter than
    ``batch_size``. Nothing is yielded for an empty input.
    """
    source = iter(iterable)

    def take_batch():
        # Pull the next (up to) batch_size items from the shared iterator.
        return tuple(itertools.islice(source, batch_size))

    # iter(callable, sentinel) calls take_batch repeatedly and stops as soon as
    # it returns the empty tuple, i.e. when the source is exhausted.
    yield from iter(take_batch, ())


def extract_data(x):
    """Turn one ``DataFrame.itertuples`` row into a vector record for upserting.

    Parameters
    ----------
    x : row object exposing ``Index``, ``question``, ``query`` and ``query_type``.

    Returns
    -------
    dict with
      - ``id``: ``'question-<Index + 1>'`` (1-based id derived from the row index),
      - ``values``: embedding of the question text, computed with the
        module-level ``embedd`` object (one embedding call per row),
      - ``metadata``: the query types, the question text and the SQL query.
    """
    return {
        "id": f'question-{x.Index+1}',
        'values': embedd.embed_query(x.question),
        "metadata": {'type': x.query_type, 'question': x.question, 'query': x.query},
    }
    

In [207]:
count = df_train.shape
# Renumber the sampled rows 0..n-1 so the ids built from the row index in
# extract_data become question-1..question-n. Rebinding (instead of
# inplace=True) leaves the previously displayed frame object untouched.
df_train = df_train.reset_index(drop=True)
df_train



Unnamed: 0,question,query,query_type
0,Find the id and location of circuits that belo...,"SELECT circuitid , LOCATION FROM circuits WHE...",[filter]
1,Which room has the highest rate? List the room...,"SELECT T2.roomName , T1.Rate , T1.CheckIn , ...","[aggregate, combine]"
2,Return the name of the artist who has the late...,SELECT name FROM artist ORDER BY year_join DES...,[aggregate]
3,In how many different cities are banks located?,SELECT count(DISTINCT city) FROM bank,[aggregate]
4,Show all book categories and the number of boo...,"SELECT category , count(*) FROM book_club GRO...",[aggregate]
...,...,...,...
195,What is the receipt date of the document with ...,SELECT receipt_date FROM Documents WHERE docum...,[filter]
196,What are the distinct grant amount for the gra...,SELECT T1.grant_amount FROM Grants AS T1 JOIN ...,"[filter, combine]"
197,What is the name of the tallest building?,SELECT name FROM building ORDER BY height_feet...,[aggregate]
198,Which 3 players won the most player awards? Li...,"SELECT T1.name_first , T1.name_last , T1.pla...","[aggregate, combine]"


In [212]:
# Lazy, single-use iterator of upsert records: extract_data (and therefore the
# embedding call) runs only as items are consumed, and once exhausted the
# iterator must be rebuilt by re-running this cell.
data_generator = map(extract_data, df_train.itertuples(index=True))

In [213]:
# Upload the records in batches of 100; consuming the generator here is what
# triggers the embedding calls.
for vector_batch in chunks(data_generator, batch_size=100):
    index.upsert(vectors=vector_batch)

In [214]:
# Check how many vectors the index holds after the upsert.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 200}},
 'total_vector_count': 200}

## Querying 

In [224]:
def type_prediction(x):
    """Placeholder predictor: ignores ``x`` and always returns the same two types.

    A fresh list is returned on every call.
    """
    predicted_types = ['aggregate', 'combine']
    return predicted_types

def filter_dict(x):
    """Build an ``$and`` filter with one ``{"type": label}`` clause per label in ``x``.

    The result has the shape passed as ``filter=`` to ``index.query`` in this
    notebook. An empty ``x`` produces ``{"$and": []}``.
    """
    clauses = []
    for label in x:
        clauses.append({"type": label})
    return {"$and": clauses}

In [216]:
# Ad-hoc question used to exercise the index; it is embedded with the same
# `embedd` object that produced the stored vectors.
test_question = 'how many teams in the football league?'

test_embedding = embedd.embed_query(test_question)

In [227]:
# Metadata filter built from the predicted types (type_prediction currently
# returns a fixed list, so the filter is the same for every question).
filterTest = filter_dict(type_prediction(test_question))
filterTest

{'$and': [{'type': 'aggregate'}, {'type': 'combine'}]}

In [217]:
# Unfiltered baseline: the three stored questions closest to the test embedding,
# returned together with their metadata.
res = index.query(
    top_k=3,
    vector=test_embedding,
    include_metadata=True,
)

In [218]:
# Display the unfiltered matches.
res

{'matches': [{'id': 'question-80',
              'metadata': {'query': 'SELECT count(*) FROM club',
                           'question': 'How many clubs are there?',
                           'type': ['aggregate']},
              'score': 0.305142164,
              'values': []},
             {'id': 'question-32',
              'metadata': {'query': 'SELECT count(DISTINCT sportname) FROM '
                                    'Sportsinfo',
                           'question': 'How many different types of sports do '
                                       'we offer?',
                           'type': ['aggregate']},
              'score': 0.373051405,
              'values': []},
             {'id': 'question-182',
              'metadata': {'query': 'SELECT COUNT(*) FROM (SELECT T1.state '
                                    'FROM college AS T1 JOIN tryout AS T2 ON '
                                    'T1.cName  =  T2.cName WHERE T2.pPos  =  '
                                   

In [228]:
# Same query restricted by metadata. Use the filter computed from the predicted
# types (filterTest) rather than a hand-copied literal, so the query cannot
# drift from what filter_dict/type_prediction produce.
res_filter = index.query(
    vector=test_embedding,
    top_k=3,
    filter=filterTest,
    include_metadata=True
)

In [237]:
# Display only the list of matches from the filtered query.
res_filter['matches']

[{'id': 'question-182',
  'metadata': {'query': 'SELECT COUNT(*) FROM (SELECT T1.state FROM college AS '
                        'T1 JOIN tryout AS T2 ON T1.cName  =  T2.cName WHERE '
                        "T2.pPos  =  'mid' EXCEPT SELECT T1.state FROM college "
                        'AS T1 JOIN tryout AS T2 ON T1.cName  =  T2.cName WHERE '
                        "T2.pPos  =  'goalie')",
               'question': 'What is the count of states with college students '
                           'playing in the mid position but not as goalies?',
               'type': ['aggregate', 'filter', 'combine']},
  'score': 0.404613137,
  'values': []},
 {'id': 'question-147',
  'metadata': {'query': 'SELECT count(*) FROM employees AS T1 JOIN customers AS '
                        'T2 ON T2.support_rep_id = T1.id WHERE T1.first_name = '
                        '"Steve" AND T1.last_name = "Johnson";',
               'question': 'What is the count of customers that Steve Johnson '
             

## Delete all vectors

In [205]:
# Destructive: removes every vector from the index.
# NOTE(review): in top-to-bottom order this runs after the query cells, so a
# Run All leaves the index empty — confirm that is intended.
index.delete(delete_all=True)

{}

In [206]:
# Confirm the index is empty after the delete.
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

## Hybrid search


In [234]:
# NOTE(review): `Pinecone` here is the name bound by `from pinecone import Pinecone`
# (the client class), and this call raises the TypeError recorded below; a
# vector-store wrapper class was presumably intended — confirm which one.
# Also note the upserted metadata uses the keys 'type', 'question' and 'query';
# there is no 'text' key for text_key to point at.
vectordb = Pinecone(index, embedd.embed_query, text_key = 'text')

TypeError: __init__() missing 2 required positional arguments: 'embedding' and 'text_key'

In [235]:
# Inspect only the model name. Displaying `embedd` or one of its bound methods
# prints the object's full repr, which includes openai_api_key, and that output
# is saved with the notebook. Any key that was displayed here earlier should be
# treated as compromised and rotated.
embedd.model