In [1]:
# Mount Google Drive at /content/drive; later cells save model weights and the
# ScaNN index under a 'drive/MyDrive/...' path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:

!pip install --quiet scann
!pip install --quiet datasets

import pandas as pd 
import numpy as np 
import sklearn
from sklearn.model_selection import train_test_split
import tensorflow as tf 
from tensorflow import keras 
from tensorflow.keras import layers, Input, Model
from tensorflow.keras.layers import *
import tensorflow_hub as hub 

from sklearn.metrics import confusion_matrix

import scann
# from data_loader import data_loader
# from negative_maker import negative_maker
# from model import model
import datasets
from platform import python_version
import os 

[K     |████████████████████████████████| 10.4 MB 19.6 MB/s 
[K     |████████████████████████████████| 578.0 MB 15 kB/s 
[K     |████████████████████████████████| 438 kB 59.2 MB/s 
[K     |████████████████████████████████| 5.9 MB 70.5 MB/s 
[K     |████████████████████████████████| 1.7 MB 62.7 MB/s 
[K     |████████████████████████████████| 431 kB 16.5 MB/s 
[K     |████████████████████████████████| 212 kB 37.5 MB/s 
[K     |████████████████████████████████| 120 kB 43.8 MB/s 
[K     |████████████████████████████████| 115 kB 76.9 MB/s 
[K     |████████████████████████████████| 127 kB 87.9 MB/s 
[?25h

In [3]:
# Report the interpreter and library versions this notebook runs against.
# The scaNN version is read from the installed distribution metadata instead of
# being a hard-coded literal, because `pip install scann` above is not pinned and
# a literal would silently go stale.
try:
    from importlib.metadata import version as _dist_version  # Python >= 3.8
except ImportError:  # older interpreters: fall back to setuptools' pkg_resources
    import pkg_resources

    def _dist_version(dist_name):
        """Return the installed version string of the distribution `dist_name`."""
        return pkg_resources.get_distribution(dist_name).version

print('python_version:', python_version())
print('numpy version:',np.__version__)
print('pandas version:',pd.__version__)
print('sklearn version:',sklearn.__version__)
print('tensorflow version:',tf.__version__)
print('keras version',keras.__version__)
print('tf_hub version:',hub.__version__)
print('datasets version:',datasets.__version__)
print('scaNN version:', _dist_version('scann'))

python_version: 3.7.14
numpy version: 1.21.6
pandas version: 1.3.5
sklearn version: 1.0.2
tensorflow version: 2.10.0
keras version 2.10.0
tf_hub version: 0.12.0
datasets version: 2.5.1
scaNN version: 1.2.8


In [4]:
class data_loader():
    """Load one split of the 'eli5' dataset into a pandas DataFrame.

    Parameters
    ----------
    partition : str
        Split name forwarded to ``datasets.load_dataset`` (this notebook uses
        'train_eli5' and 'test_eli5').
    """
    def __init__(self,partition):
        self.partition = partition

    def frame_maker(self):
        """Return a DataFrame with one row per question.

        Columns:
            title        -- the 'title' field of the split
            selftext     -- the 'selftext' field of the split
            answer       -- the raw 'answers' entry (indexed with ['text'] below)
            first_answer -- the first element of ``answer['text']``

        Raises
        ------
        IndexError
            If a row's ``answer['text']`` list is empty.
        """
        eli5 = datasets.load_dataset('eli5', split = self.partition)
        df = pd.DataFrame({'title':eli5['title'], 'selftext':eli5['selftext'], 'answer':eli5['answers']})

        # Only the first answer of each question is used as its positive example.
        # Iterating the column directly avoids per-row chained indexing
        # (df['answer'][i]), which is slow and label-based.
        df['first_answer'] = [answer['text'][0] for answer in df['answer']]

        return df


class negative_maker():
    """Attach a randomly drawn negative example to every row of a Q/A frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns 'title' and 'first_answer'. It is
        modified in place by ``neg_maker``.
    """
    def __init__(self,df):
        self.df = df

    def neg_maker(self, seed=None):
        """Add 'neg_title' and 'neg_answer' columns and return the same frame.

        For every row a *different* row is drawn uniformly at random and its
        title / first answer are stored as the negative pair. Rows are addressed
        by position, so the frame's index does not need to be a RangeIndex.

        Parameters
        ----------
        seed : int or None, optional
            When given, a dedicated ``np.random.RandomState(seed)`` is used so the
            sampling is reproducible. When None (default) the global ``np.random``
            state is used, as before.

        Returns
        -------
        pandas.DataFrame
            ``self.df`` (the object passed to the constructor) with the two new
            columns assigned.

        Raises
        ------
        ValueError
            If the frame has exactly one row, since no other row exists to serve
            as a negative.

        Notes
        -----
        Only the *row* is guaranteed to differ; if the frame contains duplicated
        answers the negative text can still equal the positive text.
        """
        n_rows = len(self.df)
        if n_rows == 1:
            raise ValueError('negative sampling needs at least two rows')

        rng = np.random if seed is None else np.random.RandomState(seed)

        if n_rows == 0:
            neg_pos = np.zeros(0, dtype=np.intp)
        else:
            # A shift in [1, n_rows - 1] added modulo n_rows can never map a row
            # onto itself, so the negative is never the row's own answer.
            shift = rng.randint(1, n_rows, size=n_rows)
            neg_pos = (np.arange(n_rows) + shift) % n_rows

        self.df['neg_title'] = self.df['title'].to_numpy()[neg_pos]
        self.df['neg_answer'] = self.df['first_answer'].to_numpy()[neg_pos]

        return self.df


In [5]:
# Build one DataFrame per ELI5 split; frame_maker() adds the 'first_answer'
# column that is used as the positive answer of each question.
train_df = data_loader('train_eli5').frame_maker()
test_df = data_loader('test_eli5').frame_maker()

Downloading builder script:   0%|          | 0.00/18.2k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/6.36k [00:00<?, ?B/s]

Downloading and preparing dataset eli5/LFQA_reddit (download: 6.03 MiB, generated: 1.26 GiB, post-processed: Unknown size, total: 1.26 GiB) to /root/.cache/huggingface/datasets/eli5/LFQA_reddit/1.0.0/17574e5502a10f41bbd17beba83e22475b499fa62caa1384a3d093fc856fe6fa...


Downloading:   0%|          | 0.00/3.50k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/576M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/21.1M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/286M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/9.65M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/17.7M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/330M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/18.7M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/36.2M [00:00<?, ?B/s]

Dataset eli5 downloaded and prepared to /root/.cache/huggingface/datasets/eli5/LFQA_reddit/1.0.0/17574e5502a10f41bbd17beba83e22475b499fa62caa1384a3d093fc856fe6fa. Subsequent calls will reuse this data.




In [6]:
# NOTE(review): model() calls hub.load on the same URL into its own local
# variable, and no other visible cell reads this module-level `use_hub` —
# confirm this extra load is still needed.
use_hub = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4") ## universal sentence encoder model

In [7]:
# Add the 'neg_title' / 'neg_answer' columns (a randomly drawn row per question).
# neg_maker() assigns the columns on the frame it was given and returns that
# same frame, so re-running this cell re-draws the negatives.
train_df = negative_maker(train_df).neg_maker()

In [8]:
### LOSS FUNCTION

def distance_calc(y_true, y_pred, margin=0.0):
    """Triplet hinge loss on squared Euclidean distances.

    Parameters
    ----------
    y_true
        Ignored; present only because Keras calls losses as ``loss(y_true, y_pred)``.
    y_pred
        Tensor that is split along axis 1 into three equal parts, taken in order
        as the anchor, positive and negative embeddings.
    margin : float, optional
        Amount by which the anchor-negative distance must exceed the
        anchor-positive distance before a triplet stops contributing to the loss.
        Defaults to 0.0, which reproduces the previous hard-coded behaviour.
        With a zero margin, identical embeddings for all inputs already give a
        loss of zero, so a positive margin is usually worth trying.

    Returns
    -------
    Tensor with one loss value per row:
    ``max(||a - p||^2 - ||a - n||^2 + margin, 0)``.
    """
    anchor, positive, negative = tf.split(y_pred, 3, axis=1)
    ap_distance = tf.reduce_sum(tf.square(anchor - positive), -1)
    an_distance = tf.reduce_sum(tf.square(anchor - negative), -1)
    return tf.maximum(ap_distance - an_distance + margin, 0.0)

In [9]:
def model():
    """Build the three-branch (anchor / positive / negative) encoder model.

    All three string inputs go through one shared, trainable Universal Sentence
    Encoder layer named 'sentence_encoder'. The three embeddings are concatenated
    along the last axis in the order anchor, positive, negative, and passed
    through Dropout(0.2); that tensor is the model output consumed by the
    triplet loss.

    Returns
    -------
    A Keras ``Model`` with inputs [anchor_input, positive_input, negative_input].
    """
    encoder_module = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")
    shared_encoder = hub.KerasLayer(encoder_module, trainable =True, name = 'sentence_encoder')

    # One scalar-string input per triplet member, in anchor / positive / negative order.
    branch_inputs = [
        Input(shape =(), dtype = tf.string, name = input_name)
        for input_name in ('anchor_input', 'positive_input', 'negative_input')
    ]

    # The same layer object is applied to every branch, so weights are shared.
    branch_embeddings = [shared_encoder(branch) for branch in branch_inputs]

    merged = tf.keras.layers.Concatenate(axis=-1)(branch_embeddings)
    merged = Dropout(0.2)(merged)

    return Model(inputs = branch_inputs, outputs = merged)

In [10]:
# Instantiate the three-input triplet model (this downloads/loads the sentence encoder).
triplet_model = model()



In [16]:

# The loss only looks at the model output (the concatenated triplet embeddings).
triplet_model.compile(optimizer = 'Adam', loss = distance_calc)

# Keras still requires a target array, so a column of ones is supplied as a
# placeholder; distance_calc ignores it.
y_dummy = np.ones((len(train_df), 1))

# Inputs are passed in the model's input order: anchor (question title),
# positive (its first answer), negative (a randomly drawn answer).
triplet_model.fit(
    [np.array(train_df[column]) for column in ('title', 'first_answer', 'neg_answer')],
    y_dummy,
    epochs = 4,
    batch_size = 64*64,
)


In [11]:
# Persist the model weights to Drive and read them back from the same file.
# NOTE(review): the path is relative ('drive/...') while Drive is mounted at
# '/content/drive', so this relies on the working directory being /content.
# NOTE(review): if this cell runs without the training cell having run first,
# save_weights overwrites the file with the current (untrained) weights before
# load_weights reads it back — confirm the intended run order.
triplet_model.save_weights('drive/MyDrive/Colab Notebooks/quick_response/triplet_model_weights.h5')
triplet_model.load_weights('drive/MyDrive/Colab Notebooks/quick_response/triplet_model_weights.h5')

In [12]:
# Pull the shared sentence-encoder layer out of the triplet model so it can be
# called directly on lists of strings in the cells below.
use_emb =  triplet_model.get_layer('sentence_encoder')

In [13]:
# Peek at the first test question.
test_df['title'][0]

'Why do you get chills/goosebumps from hearing large crowds sing along to songs?'

In [14]:
#### For testing purposes, every test answer is converted into an embedding
#### using the trained sentence-encoder layer (`use_emb`).

# Encode the answers in batches rather than one encoder call per row; the result
# is a 2-D array with one embedding row per test answer, in test_df order.
# The embedding width is taken from the encoder output instead of being
# hard-coded.
EMBED_BATCH_SIZE = 256
test_answers = test_df['first_answer'].tolist()
use_emb_test = np.concatenate(
    [
        np.asarray(use_emb(tf.constant(test_answers[start:start + EMBED_BATCH_SIZE])))
        for start in range(0, len(test_answers), EMBED_BATCH_SIZE)
    ],
    axis=0,
)

In [15]:
#### The test-answer embeddings are stored in a ScaNN (vector similarity
#### search) index.
# Builder arguments: the embedding matrix, 40 (the queries below return 40
# indices each), and "dot_product" as the similarity measure.
# NOTE(review): distance_calc trains on squared Euclidean distance, while this
# index ranks by dot product — confirm the mismatch is intended (the two only
# agree in ranking when the embeddings have equal norms).
# NOTE(review): num_leaves / training_sample_size look sized for a much larger
# collection than the test split — TODO confirm against the ScaNN tuning guidance.

searcher = scann.scann_ops_pybind.builder(use_emb_test, 40, "dot_product").tree(
    num_leaves=2000, num_leaves_to_search=100, training_sample_size=250000).score_ah(
    2, anisotropic_quantization_threshold=0.2).reorder(100).build()

In [16]:
## Create the target directory for the serialized index (no error if it exists).
## NOTE(review): relative path — relies on the working directory being /content,
## where Drive is mounted.
saving_path = 'drive/MyDrive/Colab Notebooks/quick_response/scann_save'
os.makedirs(saving_path, exist_ok=True)
## Write the searcher's files into that directory.
searcher.serialize(saving_path)

In [17]:
# Reload the index from disk, replacing the in-memory `searcher` built above.
searcher = scann.scann_ops_pybind.load_searcher(saving_path)

In [26]:
# Embed one test question and look up its nearest stored answers.
test_quest = test_df['title'][2]
test_quest_emb = np.array(use_emb(([test_quest]))).reshape(1,512)

# search() is given the flattened 1-D query vector; its two results are bound
# here as the neighbour row indices and their scores.
index, distance = searcher.search(test_quest_emb.ravel())
index

array([    2,  1621,  8615, 14549, 23917,  9816,  7135,  2823, 16688,
       15926,  4228,   824,   633, 14433, 18935,  7729,   843,  2532,
        2953, 11183, 16366, 13973, 16904, 12756, 16278, 23605,   334,
        3871, 21573, 12495, 16185, 17960, 10804, 18916, 20089, 13133,
        8927, 12362, 17914,  2470], dtype=uint32)

In [27]:
# The question that was used as the query above.
test_df['title'][2]

"What's the difference between a bush, a shrub, and a tree?"

In [28]:
# Answer text of the top-ranked neighbour for that query.
test_df['first_answer'][index[0]]

"Shrubs and trees are both specifically *woody* plants with stems that survive throughput the winter. A tree has a clear central trunk whereas a shrub has multiple stems rising from the ground.\n\n'Bush' is a more general term for any plant with multiple stems rising from the ground, and that can be either woody or what's called herbaceous, herbaceous plants are ones where the stems die back completely or substantially in the winter leaving the plant with just its roots and new stems grow next spring."

In [38]:
# Query with a paraphrase of test question 2 to see whether the same answer is retrieved.
alternative_question = 'Is there any difference between a bush, a shrub, and a tree?'
alternative_embeddings = np.array(use_emb(([alternative_question]))).reshape(1,512)
# [0] selects the first element of search()'s result (the neighbour indices),
# the second [0] the top-ranked neighbour.
alternative_index = searcher.search(alternative_embeddings.ravel())[0][0]

In [39]:
# Answer text retrieved for the paraphrased question.
test_df['first_answer'][alternative_index]

"Shrubs and trees are both specifically *woody* plants with stems that survive throughput the winter. A tree has a clear central trunk whereas a shrub has multiple stems rising from the ground.\n\n'Bush' is a more general term for any plant with multiple stems rising from the ground, and that can be either woody or what's called herbaceous, herbaceous plants are ones where the stems die back completely or substantially in the winter leaving the plant with just its roots and new stems grow next spring."

## Evaluation


In [22]:
# Retrieve the nearest stored answers for every test question.
# Titles are embedded in batches and looked up with a single batched search,
# instead of one encoder call and one search call per row.
QUERY_BATCH_SIZE = 256
test_titles = test_df['title'].tolist()
title_embeddings = np.concatenate(
    [
        np.asarray(use_emb(tf.constant(test_titles[start:start + QUERY_BATCH_SIZE])))
        for start in range(0, len(test_titles), QUERY_BATCH_SIZE)
    ],
    axis=0,
)
neighbour_indices, neighbour_distances = searcher.search_batched(title_embeddings)

# close_index[i] is the array of neighbour row indices for test question i.
close_index = list(neighbour_indices)

def _compute_precision_recall(targets, predictions, k):

    pred = predictions[:k]
    num_hit = len(set(pred).intersection(set(targets)))
    precision = float(num_hit) / len(pred)
    recall = float(num_hit) / len(targets)
    return precision, recall

# Cut-off ranks k at which precision@k and recall@k are reported.
N = [1,3,5,10,20]
for t in N:
    precisions = []
    recalls = []
    # close_index[i] holds the neighbours retrieved for test question i; the
    # only target passed for that question is row i itself (its own first answer).
    for i, _k in enumerate(close_index):
        precision, recall = _compute_precision_recall([i], _k,t)
        precisions.append(precision)
        recalls.append(recall)

    # Averages over all test questions for this cut-off.
    print('precision @',t,':', np.mean(precisions))
    print('recalls @',t,',:', np.mean(recalls))
    print(' ')

precision @ 1 : 0.23759791122715404
recalls @ 1 ,: 0.23759791122715404
 
precision @ 3 : 0.1299635552654482
recalls @ 3 ,: 0.3898906657963446
 
precision @ 5 : 0.09236292428198431
recalls @ 5 ,: 0.4618146214099217
 
precision @ 10 : 0.05571964751958224
recalls @ 10 ,: 0.5571964751958225
 
precision @ 20 : 0.032102643603133155
recalls @ 20 ,: 0.6420528720626631
 


In [37]:
# Keep only the question and its first answer, and write them to a CSV file in
# the current working directory.
test_df_short = test_df[['title','first_answer']]
test_df_short.to_csv('test_df.csv')