In [35]:
import pandas as pd
import random
import tensorflow as tf 

from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.layers import Embedding
from sklearn.model_selection import train_test_split

### Get a text dataset

In [36]:
# Read the Siamese-network sentences into a one-column frame.
# The file has no header row, so the column name "text" is supplied explicitly.
siameseData = pd.read_csv(
    "data.txt",
    sep=";",
    header=None,
    names=["text"],
    index_col=None,
)
siameseData

Unnamed: 0,text
0,Similarity has always been a key aspect in com...
1,"Any time two element vectors are compared, man..."
2,But if the comparison has to be applied to mor...
3,"In these cases, a siamese neural network may b..."
4,The two neural networks are both feedforward p...
...,...
95,The network applies a ReLU activation function...
96,Thus the kth filter map in each layer takes th...
97,We have presented a strategy for performing on...
98,We outlined new results comparing the performa...


In [93]:
# Label every Siamese-network sentence as the positive class (target = 1),
# then write the labelled frame to disk without the index column.
siameseData = siameseData.assign(target=1)
siameseData.to_csv("siamese_nn.csv", index=False)

In [38]:
# Preview the first ten labelled rows.
siameseData.iloc[:10]

Unnamed: 0,text,target
0,Similarity has always been a key aspect in com...,1
1,"Any time two element vectors are compared, man...",1
2,But if the comparison has to be applied to mor...,1
3,"In these cases, a siamese neural network may b...",1
4,The two neural networks are both feedforward p...,1
5,they work parallelly in tandem and compare th...,1
6,The output generated by a siamese neural netwo...,1
7,In this overview we first describe the siamese...,1
8,"Additionally, we list the programming language...",1
9,This section reviews existing tracking method...,1


In [70]:
# Read the second sentence file the same way as the first:
# no header row, a single column named "text".
randomData = pd.read_csv(
    "random.txt",
    sep=";",
    header=None,
    names=["text"],
    index_col=None,
)

In [71]:
# Label every sentence from random.txt as the negative class (target = 0).
randomData = randomData.assign(target=0)

In [72]:
# Stack the negative rows on top of the positive rows and renumber the
# index from 0 so the two source frames do not share index labels.
train_df = pd.concat(
    objs=[randomData, siameseData],
    ignore_index=True,
)
train_df

Unnamed: 0,text,target
0,The Views section describes all you need to kn...,0
1,"Views consist of an app template, a layout, an...",0
2,vue file inside the layouts directory,0
3,This will be used for all pages that don't hav...,0
4,The only thing you need to include in the layo...,0
...,...,...
160,The network applies a ReLU activation function...,1
161,Thus the kth filter map in each layer takes th...,1
162,We have presented a strategy for performing on...,1
163,We outlined new results comparing the performa...,1


### Shuffle training dataframe
`frac=1` samples 100% of the rows, so the result is the whole DataFrame in random order.

In [73]:
# Shuffle by sampling every row (frac=1); the fixed random_state keeps the
# order repeatable across runs.
train_shuffle = train_df.sample(
    frac=1,
    random_state=9,
)
train_shuffle.iloc[:10]

Unnamed: 0,text,target
13,While traditional deep neural networks assume ...,0
80,"For instance, Wang et al",1
105,This is typically used for comparing similar i...,1
68,"In these cases, a siamese neural network may b...",1
44,“We may not understand the choice that they’re...,0
88,"Recently, the DSiamM tracker proposes to perfo...",1
17,"As a result, recurrent networks need to accoun...",0
69,The two neural networks are both feedforward p...,1
39,Advertisement The DogPhone considers both owne...,0
148,"In LeCun et al., the authors used a contrastiv...",1


In [74]:
# Class balance: number of rows per target label.
train_shuffle["target"].value_counts()

1    100
0     65
Name: target, dtype: int64

In [75]:
# Hold out 10% of the shuffled rows as a test set; the fixed random_state
# makes the split repeatable.
(
    train_sentences,
    test_sentences,
    train_labels,
    test_labels,
) = train_test_split(
    train_shuffle["text"].to_numpy(),
    train_shuffle["target"].to_numpy(),
    test_size=0.1,
    random_state=40,
)

# Sizes of the four resulting arrays, in the same order as above.
tuple(
    len(part)
    for part in (train_sentences, test_sentences, train_labels, test_labels)
)

(148, 17, 148, 17)

### Converting text into numbers 

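`TextVectorization` processes each sentence in the following steps:
1. Standardize the text (convert to lowercase and strip punctuation)
2. Split each sentence into substrings (words, split on whitespace)
3. Recombine the substrings into n-grams (optional; not used in this notebook)
4. Map each token to an integer index
5. Output a vector of these indices for each sentence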

In [78]:
# Despite its name, max_sq_len is the *average* number of whitespace-separated
# tokens per training sentence, rounded to an integer.
max_sq_len = round(
    sum(len(sentence.split()) for sentence in train_sentences)
    / len(train_sentences)
)
max_sq_len

20

In [79]:
# Turn raw sentences into fixed-length sequences of integer token ids.
text_vectorizer = TextVectorization(
    max_tokens=10000,  # upper bound on the vocabulary size
    standardize="lower_and_strip_punctuation",
    split="whitespace",
    output_mode="int",
    # Every output sequence is padded or truncated to exactly this length.
    output_sequence_length=max_sq_len,
    # NOTE: pad_to_max_tokens is intentionally not passed. It only applies to
    # the "multi_hot", "count" and "tf_idf" output modes and has no effect
    # with output_mode="int", so leaving it in was misleading.
)

In [80]:
# Build the vocabulary from the training sentences only (the test split is
# not passed here).
text_vectorizer.adapt(data=train_sentences)

In [81]:
# Fetch the vectorizer's vocabulary so it can be inspected below.
words = text_vectorizer.get_vocabulary()

In [82]:
# First ten vocabulary entries.
words[:10]

['', '[UNK]', 'the', 'to', 'a', 'of', 'and', 'in', 'is', 'networks']

In [83]:
# Last ten vocabulary entries.
words[-10:]

['accepted',
 'accept',
 'able',
 'ability',
 '2011',
 '1994',
 '1990s',
 '180',
 '16',
 '0']

In [84]:
# Short sample phrase for trying out the vectorizer.
test_string = "twin neural network"

In [85]:
# Vectorize a one-element batch containing the sample phrase and display the result.
sample_tokenized = text_vectorizer([test_string])
sample_tokenized

<tf.Tensor: shape=(1, 20), dtype=int64, numpy=
array([[51, 12, 13,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0]])>

In [89]:
### Embedding

# Embedding layer that maps each integer token id to a 128-dimensional vector.
embedding = Embedding(
    input_dim=10000,  # same value as the vectorizer's max_tokens
    output_dim=128,
    input_length=max_sq_len,
    name="embeding_1",  # spelling kept as-is: the layer name is a runtime identifier
)

In [90]:
# Pick one training sentence with a locally seeded RNG so the sampled sentence
# is the same on every run and the global `random` state is left untouched.
random_text = random.Random(42).choice(train_sentences)
print(random_text)

# Vectorize a one-element batch, then pass the token ids through the
# embedding layer and display the resulting tensor.
sample_embed = embedding(text_vectorizer([random_text]))
sample_embed

Hence, can learn semantic similarity.


<tf.Tensor: shape=(1, 20, 128), dtype=float32, numpy=
array([[[ 0.00339515,  0.04540007,  0.03598148, ..., -0.00110666,
          0.03714195,  0.04015541],
        [ 0.04304346,  0.02444938,  0.00724043, ...,  0.04288545,
         -0.00234977, -0.0440491 ],
        [-0.02650821,  0.00725113,  0.0156454 , ...,  0.02672983,
          0.04999033, -0.00710227],
        ...,
        [ 0.02050493, -0.01166279, -0.00415207, ...,  0.00956981,
          0.02461679, -0.02199488],
        [ 0.02050493, -0.01166279, -0.00415207, ...,  0.00956981,
          0.02461679, -0.02199488],
        [ 0.02050493, -0.01166279, -0.00415207, ...,  0.00956981,
          0.02461679, -0.02199488]]], dtype=float32)>