In [1]:
from datasets import load_dataset
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import random as rd
from nearpy import Engine
from nearpy.hashes import RandomBinaryProjections

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None)

## Preparing the dataset

In [12]:
# load the dataset
train = pd.DataFrame(load_dataset("tapaco", "en", split='train')) #[0:157000]
#test = load_dataset("tapaco", "en", split='train[157000:]')
n = len(train)

In [13]:
X_train_raw = train[['paraphrase_set_id', 'paraphrase']]

In [14]:
# List of materials
materials = pd.DataFrame({'materials': ["glass", "metal", "wood", "plastic", "paper", "fabric", "stone", "ceramic",
                                        "rubber", "leather", "concrete", "diamond", "silk", "aluminum", "copper",
                                        "bronze", "silver", "gold", "bamboo"]})
materials_sampled = materials.sample(n, replace=True, random_state=42)
materials_sampled = materials_sampled.sample(frac=1).reset_index(drop=True)

random_numbers = pd.DataFrame({"number": [rd.randint(1, 1000) for _ in range(n)]})

X_train_raw = pd.concat([X_train_raw, materials_sampled, random_numbers], axis=1)
print(X_train_raw.head(10))

  paraphrase_set_id               paraphrase materials  number
0                 1        I ate the cheese.   plastic     703
1                 1            I eat cheese.   leather     985
2                 1     I'm eating a yogurt.     metal      86
3                 1       I'm eating cheese.  aluminum     307
4                 1  I'm having some cheese.  aluminum     393
5                 1       I eat some cheese.    bamboo     740
6                 1       I ate some cheese.      wood      93
7                 5             It's Monday.  aluminum     776
8                 5      It is Monday today.     stone     841
9                 5       It's Monday today.     metal     135


In [15]:
# merge columns into one
X_train = pd.DataFrame({"paraphrase_set_id": X_train_raw.paraphrase_set_id, "paraphrase_all": X_train_raw.paraphrase})
X_train['paraphrase_all'] = X_train_raw[X_train_raw.columns[1:]].apply(
    lambda x: ','.join(x.dropna().astype(str)),
    axis=1
)

In [None]:
# write to xlsx
#with pd.ExcelWriter("C:/Users/LucijaTokic/OneDrive - Clear Peaks SL/Documentos/02 - Projects/06 - ADNOC/01-NLP"
#                    "/tapaco_expanded.xlsx") as writer:
#    X_train_raw.to_excel(writer, sheet_name="raw_dataset", float_format="%.2f")

# Embedding

In [16]:
# embedding
tv = TfidfVectorizer(binary=False, norm=None, use_idf=False, smooth_idf=False, lowercase=True, stop_words='english',
                     min_df=0.001, max_df=0.9, max_features=None, ngram_range=(1,1))
X_train_emb = pd.DataFrame(tv.fit_transform(X_train.paraphrase_all).toarray())
print(X_train_emb.shape)
#print(X_train_emb.head(10))

(158053, 934)


# LSH

In [17]:
##### LSH
# convert it to numpy array
embeddings_array = X_train_emb.to_numpy()

In [18]:
# Dimension of our vector space
dimension = X_train_emb.shape[1]

### First way: RandomBinaryProjections

In [88]:
# Create a random binary hash with 10 bits
rbp = RandomBinaryProjections('rbp', 20)

# Create engine with pipeline configuration
engine = Engine(dimension, lshashes=[rbp])

# Add the embeddings to the LSH engine
for idx, embedding in enumerate(embeddings_array):
    engine.store_vector(embedding, idx)


In [91]:
# Create random query vector
neighbors = engine.neighbours(embeddings_array[10])
#print(neighbors)

# Get the index of the nearest neighbor
nearest_neighbor_index = neighbors[0][0]
#print(nearest_neighbor_index)
#print(type(nearest_neighbor_index))

# Retrieve the corresponding text from X_train_emb
nearest_neighbor_text = X_train.iloc[nearest_neighbor_index]['paraphrase_all']

# Print the nearest neighbor's text
print(nearest_neighbor_text)
#print(nearest_neighbor_text.shape)

0    I ate the cheese.,plastic,703
0    I ate the cheese.,plastic,703
0    I ate the cheese.,plastic,703
0    I ate the cheese.,plastic,703
0    I ate the cheese.,plastic,703
                 ...              
0    I ate the cheese.,plastic,703
0    I ate the cheese.,plastic,703
0    I ate the cheese.,plastic,703
0    I ate the cheese.,plastic,703
0    I ate the cheese.,plastic,703
Name: paraphrase_all, Length: 934, dtype: object


### Second way: Faiss

In [20]:
import faiss

In [19]:
for nbits in [2, 4, 8, 16, 24, 32]:
    buckets = 1 << nbits
    print(f"nbits == {nbits}")
    print(f"{X_train_emb.shape[0]} / {buckets} = {X_train_emb.shape[0]/buckets}")

nbits == 2
158053 / 4 = 39513.25
nbits == 4
158053 / 16 = 9878.3125
nbits == 8
158053 / 256 = 617.39453125
nbits == 16
158053 / 65536 = 2.4116973876953125
nbits == 24
158053 / 16777216 = 0.009420692920684814
nbits == 32
158053 / 4294967296 = 3.6799581721425056e-05


In [84]:
# initialize the index using our vectors dimensionality (128) and nbits
nbits = 24
index = faiss.IndexLSH(dimension, nbits)
# then add the data
index.add(embeddings_array)

In [85]:
xq0 = embeddings_array[10].reshape(1, dimension)
# we use the search method to find the k nearest vectors
D, I = index.search(xq0, k=5)
# the indexes of these vectors are returned to I
I

array([[   10, 40687, 84792,   866, 57723]], dtype=int64)

In [86]:
# Retrieve the corresponding text from X_train_emb
nearest_neighbor_text = X_train.iloc[I[0]]['paraphrase_all']
# Print the nearest neighbor's text
print(nearest_neighbor_text)

10                             Today is Monday.,copper,803
40687    There's no need to go to school today.,copper,212
84792                 She hasn't got glasses.,concrete,132
866                        It is snowing today.,copper,645
57723                    Is today his birthday?,copper,986
Name: paraphrase_all, dtype: object


In [65]:
print(len(X_train_raw[X_train_raw['materials'] == 'plastic']))

8345


In [33]:
from sklearn.metrics.pairwise import cosine_similarity

In [82]:
cosine_similarity(embeddings_array[I[0]], xq0)

array([[1.        ],
       [0.33333333],
       [0.        ],
       [0.33333333],
       [0.33333333]])

In [67]:
D

array([[0., 0., 0., 1., 1.]], dtype=float32)

In [39]:
import numpy as np

In [83]:
k = 5

for nbits in [2, 4, 8, 16, 17, 18, 19, 20, 21, 22, 23, 24, 32]:
    index = faiss.IndexLSH(dimension, nbits)
    index.add(embeddings_array)
    D, I = index.search(xq0, k=k)
    cos = cosine_similarity(embeddings_array[I[0]], xq0)
    print("nbits = %d --> cos = %s" % (nbits, np.mean(cos)))

nbits = 2 --> cos = 0.414982991426106
nbits = 4 --> cos = 0.28164965809277265
nbits = 8 --> cos = 0.20000000000000004
nbits = 16 --> cos = 0.20000000000000004
nbits = 17 --> cos = 0.4877010097711745
nbits = 18 --> cos = 0.2666666666666667
nbits = 19 --> cos = 0.5398924451310624
nbits = 20 --> cos = 0.4000000000000001
nbits = 21 --> cos = 0.5116156409449846
nbits = 22 --> cos = 0.20000000000000004
nbits = 23 --> cos = 0.5699023252744027
nbits = 24 --> cos = 0.6420686862090157
nbits = 32 --> cos = 0.36710983178735745


In [76]:
# extract index binary codes (represented as int)
arr = faiss.vector_to_array(index.codes)
arr

array([186, 139,  15, ..., 234, 119,  12], dtype=uint8)

In [77]:
arr.shape

(474159,)

In [71]:
X_train_raw.shape

(158053, 4)

In [78]:
# now translate them into the binary vector format
arr_bites = (((arr[:, None] & (1 << np.arange(nbits)))) > 0).astype(int)
print(len(arr_bites[0]))

20
