In [1]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Silence pandas' chained-assignment (SettingWithCopy) check for this session.
pd.options.mode.chained_assignment = None

In [4]:
# Load the train / test / validation splits of the 'wikisql' dataset, in that order.
train_data, test_data, val_data = (
    load_dataset('wikisql', split=split_name)
    for split_name in ('train', 'test', 'validation')
)

In [5]:
# Rebind each split name to the pandas version of the loaded split.
# NOTE: this cell is not re-runnable on its own, because the names then
# no longer refer to the objects returned by load_dataset.
train_data, test_data, val_data = (
    split.to_pandas() for split in (train_data, test_data, val_data)
)

## Feature extraction

In [6]:
def fetch_SQL(dict_sql, col_to_fetch):
    """Return the entry of ``dict_sql`` stored under ``col_to_fetch``.

    Args:
        dict_sql: Mapping-like object supporting ``[]`` lookup.
        col_to_fetch: Key to look up.

    Returns:
        The value found under ``col_to_fetch``.

    Raises:
        Whatever ``dict_sql`` raises for a missing key (``KeyError`` for a dict).
    """
    value = dict_sql[col_to_fetch]
    return value

In [7]:
# For every split, add a 'query' column holding the 'human_readable' entry
# of each row's 'sql' record.
for split_frame in (train_data, test_data, val_data):
    split_frame['query'] = split_frame.sql.apply(
        lambda sql_record: fetch_SQL(sql_record, 'human_readable')
    )

In [8]:
# Keep only the natural-language question and its SQL query for each split.
# .copy() makes each result an independent frame: new columns are assigned to
# these frames further down, and assigning into a column-subset of another
# frame is exactly what pandas' chained-assignment check complains about.
final_train_data = train_data[['question', 'query']].copy()
final_test_data = test_data[['question', 'query']].copy()
final_val_data = val_data[['question', 'query']].copy()

In [9]:
# Show the training question/query pairs as this cell's output.
final_train_data

Unnamed: 0,question,query
0,Tell me what the notes are for South Australia,SELECT Notes FROM table WHERE Current slogan =...
1,What is the current series where the new serie...,SELECT Current series FROM table WHERE Notes =...
2,What is the format for South Australia?,SELECT Format FROM table WHERE State/territory...
3,Name the background colour for the Australian ...,SELECT Text/background colour FROM table WHERE...
4,how many times is the fuel propulsion is cng?,SELECT COUNT Fleet Series (Quantity) FROM tabl...
...,...,...
56350,What time was the match played with a score of...,SELECT Time FROM table WHERE Score = 3-2
56351,On which ground did the team play Aston Villa?,SELECT Ground FROM table WHERE Opponent = asto...
56352,What kind of competition was it at San Siro at...,SELECT Competition FROM table WHERE Ground = s...
56353,What is the total number of decile for the red...,SELECT COUNT Decile FROM table WHERE Name = re...


## BERT Embedding (tokenizing text into input IDs)

In [10]:
# Load the tokenizer and the model from the same 'bert-base-uncased'
# checkpoint (tokenizer first, then model).
tokenizer, model = (
    bert_cls.from_pretrained('bert-base-uncased')
    for bert_cls in (BertTokenizer, BertModel)
)

In [11]:
def get_bert_embed(data):
    """Tokenize a batch of texts with the notebook-level ``tokenizer``.

    Despite the name, this returns the tokenizer's encodings, not embedding
    vectors; no model forward pass happens here.

    Args:
        data: Iterable of strings (list, tuple, pandas Series, ...).

    Returns:
        The result of ``tokenizer.batch_encode_plus``; callers in this
        notebook read its ``'input_ids'`` entry. No ``return_tensors`` is
        passed, so the tokenizer's default return container is used.
    """
    # Materialize to a plain list: batch_encode_plus is documented to take a
    # list of texts, and some tokenizer backends reject other iterables such
    # as a pandas Series.
    texts = list(data)
    return tokenizer.batch_encode_plus(
        texts,
        padding=True,             # pad every sequence to the longest one in the batch
        truncation=True,          # truncate to the maximum sequence length if necessary
        add_special_tokens=True,  # add the special tokens CLS and SEP
    )

In [12]:
# Tokenize the question text and then the query text of every split, storing
# the token ids in a new column next to the text. Each call pads to the
# longest sequence of the column it is given, so padded lengths can differ
# between splits and between the question and query columns.
for text_col, ids_col in (
    ('question', 'Encoded_Question_input_ids'),
    ('query', 'Encoded_query_input_ids'),
):
    for split_frame in (final_train_data, final_test_data, final_val_data):
        token_ids = get_bert_embed(split_frame[text_col])['input_ids']
        # Build the Series on the frame's own index: the assignment aligns by
        # label, so a default 0..n-1 index would silently produce NaN rows if
        # the frame were ever filtered or re-indexed.
        split_frame.loc[:, ids_col] = pd.Series(token_ids, index=split_frame.index)

In [19]:
# Build the model input (question ids) and target (query ids) tensors.
# The columns hold one list of token ids per row, so convert them to plain
# nested lists first: torch.tensor then sees a regular list-of-lists instead
# of an object-dtype Series whose integer lookups are label-based.
training_tensor_X = torch.tensor(final_train_data.Encoded_Question_input_ids.tolist())
training_tensor_y = torch.tensor(final_train_data.Encoded_query_input_ids.tolist())