In [4]:
from numpy import array
from numpy import asarray
from numpy import zeros
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Dense, Input, Concatenate, Dropout, GlobalAveragePooling1D
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Embedding
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, Callback
import numpy as np
import pandas as pd
import random
from sklearn.metrics import accuracy_score, confusion_matrix


In [5]:
max_query_length = 15
max_category_length = 45
num_of_classes = 5
epochs = 1000
batch_size = 8192
checkpoint = '/data/shpx/data/mkabir/checkpoint/1000_train_sampling_zero_point_one_two_tower_model.h5'
n_tr_rows = None
n_ts_rows = None
n_vl_rows = None
result_path = '/data/shpx/data/mkabir/train_sample_output_0point1_1000.txt'

In [6]:
df_train = pd.read_csv("/data/shpx/data/mkabir/dataset/train.tsv", sep = "\t", header = None, nrows = n_tr_rows)
df_train = df_train.dropna(how='any')
df_train = df_train.sample(frac=0.1, replace=True, random_state=1)
df_val = pd.read_csv("/data/shpx/data/mkabir/dataset/dev.tsv", sep = "\t", header = None, nrows = n_ts_rows)
df_val = df_val.dropna(how='any')
df_test = pd.read_csv("/data/shpx/data/mkabir/dataset/test.tsv", sep = "\t", header = None, nrows = n_vl_rows)
df_test = df_test.dropna(how='any')

In [7]:
t = Tokenizer()
t.fit_on_texts(df_train.values[:,0])
t.fit_on_texts(df_train.values[:,1])
t.fit_on_texts(df_val.values[:,0])
t.fit_on_texts(df_val.values[:,1])
t.fit_on_texts(df_test.values[:,0])
t.fit_on_texts(df_test.values[:,1])
t.fit_on_texts(['all categories'])

vocab_size = len(t.word_index) + 1

print(len(t.word_index))


818428


In [8]:
with open("/data/shpx/data/mkabir/dataset/jose_w3_queries_w_singularity_cats.txt", encoding='utf-8') as f:
    content = f.readlines()
print(len(content))
embeddings_index = {}  
for c in content[1:]:
    data = c.split(" ")
    #print(len(data))
    if len(data) == 102:
        key = data[0].lower()
        key = key.replace('é', 'e')
        embeddings_index[key] = np.array([float(a) for a in data[1:101]])

print('Loaded %s word vectors.' % len(embeddings_index))
#print(list(embeddings_index.items())[:14])

176352
Loaded 176096 word vectors.


In [9]:
def calculate_labels(val_y):
    lab_y = []
    thresholds = [-1.1, 0.82, 0.9, 0.95, 0.97, 1]
    
    for v in val_y:
        l = 0
        start = thresholds[0]
        p = 1 - v
        for thr in thresholds[1:]:
            if start < p and p <= thr:
                lab_y.append(l)
                break
            if p > 1:
                lab_y.append(4)
                break
            l = l + 1
            start = thr
    return lab_y

In [11]:
q1 = []
c1 = []
q2 = []
c2 = []
for v in df_train.values:
    splitted = v[0].split('<::>')
    if len(splitted) >= 2:
        q1.append(splitted[0])
        c1.append(splitted[1])
    else:
        q1.append(splitted[0])
        c1.append('all categories')
        
    splitted = v[1].split('<::>')
    if len(splitted) >= 2:
        q2.append(splitted[0])
        c2.append(splitted[1])
    else:
        q2.append(splitted[0])
        c2.append('all categories')

    
q1 = t.texts_to_sequences(q1)
q1 = pad_sequences(q1, maxlen=max_query_length, padding='post')
c1 = t.texts_to_sequences(c1)
c1 = pad_sequences(c1, maxlen=max_category_length, padding='post')

print('First step completed')

First step completed


In [12]:
q2 = t.texts_to_sequences(q2)
q2 = pad_sequences(q2, maxlen=max_query_length, padding='post')
c2 = t.texts_to_sequences(c2)
c2 = pad_sequences(c2, maxlen=max_category_length, padding='post')
print('Second step completed')

Second step completed


In [13]:
train_x = [np.array(q1.copy()),np.array(c1.copy()), np.array(q2.copy()), np.array(c2.copy())]
train_y = np.array(calculate_labels(df_train.values[:,2]))
print(train_x[0].shape)
print(train_y.shape)

print(train_x[:5])
print(train_y[:5])

(49524536, 15)
(49524536,)
[array([[ 1030,  7141,  6218, ...,     0,     0,     0],
       [ 1877,  7547,    53, ...,     0,     0,     0],
       [32661,   406,  2595, ...,     0,     0,     0],
       ...,
       [  654, 23200,  7703, ...,     0,     0,     0],
       [  297,  5698,  5807, ...,     0,     0,     0],
       [ 9126,  1883,    82, ...,     0,     0,     0]], dtype=int32), array([[ 888, 2003,    1, ...,    0,    0,    0],
       [  14,   21,    1, ...,    0,    0,    0],
       [  97,   79,   75, ...,    0,    0,    0],
       ...,
       [  19,   12,    6, ...,    0,    0,    0],
       [ 111,  225, 1026, ...,    0,    0,    0],
       [   9,   11,   42, ...,    0,    0,    0]], dtype=int32), array([[ 6218,     0,     0, ...,     0,     0,     0],
       [ 7547,   930,     0, ...,     0,     0,     0],
       [  538,     0,     0, ...,     0,     0,     0],
       ...,
       [ 3252,  1264,     0, ...,     0,     0,     0],
       [  318,  5698,  5807, ...,     0,     0

In [14]:
q1 = []
c1 = []
q2 = []
c2 = []
for v in df_val.values:
    splitted = v[0].split('<::>')
    if len(splitted) >= 2:
        q1.append(splitted[0])
        c1.append(splitted[1])
    else:
        q1.append(splitted[0])
        c1.append('all categories')
        
    splitted = v[1].split('<::>')
    if len(splitted) >= 2:
        q2.append(splitted[0])
        c2.append(splitted[1])
    else:
        q2.append(splitted[0])
        c2.append('all categories')
        
    
q1 = t.texts_to_sequences(q1)
q1 = pad_sequences(q1, maxlen=max_query_length, padding='post')
c1 = t.texts_to_sequences(c1)
c1 = pad_sequences(c1, maxlen=max_category_length, padding='post')

q2 = t.texts_to_sequences(q2)
q2 = pad_sequences(q2, maxlen=max_query_length, padding='post')
c2 = t.texts_to_sequences(c2)
c2 = pad_sequences(c2, maxlen=max_category_length, padding='post')
val_x = [np.array(q1.copy()),np.array(c1.copy()), np.array(q2.copy()), np.array(c2.copy())]
val_y = np.array(calculate_labels(df_val.values[:,2]))
print(val_x[0].shape)
print(val_y.shape)



(5610865, 15)
(5610865,)


In [15]:
q1 = []
c1 = []
q2 = []
c2 = []
for v in df_test.values:
    splitted = v[0].split('<::>')
    if len(splitted) >= 2:
        q1.append(splitted[0])
        c1.append(splitted[1])
    else:
        q1.append(splitted[0])
        c1.append('all categories')
        splitted = v[0].split('<::>')
        
    splitted = v[1].split('<::>')
    if len(splitted) >= 2:
        q2.append(splitted[0])
        c2.append(splitted[1])
    else:
        q2.append(splitted[0])
        c2.append('all categories')
        

q1 = t.texts_to_sequences(q1)
q1 = pad_sequences(q1, maxlen=max_query_length, padding='post')
c1 = t.texts_to_sequences(c1)
c1 = pad_sequences(c1, maxlen=max_category_length, padding='post')

q2 = t.texts_to_sequences(q2)
q2 = pad_sequences(q2, maxlen=max_query_length, padding='post')
c2 = t.texts_to_sequences(c2)
c2 = pad_sequences(c2, maxlen=max_category_length, padding='post')


test_x = [np.array(q1.copy()),np.array(c1.copy()), np.array(q2.copy()), np.array(c2.copy())]
test_y = np.array(calculate_labels(df_test.values[:,2]))
print(test_x[0].shape)
print(test_y.shape)

(8159537, 15)
(8159537,)


In [16]:
embedding_matrix = zeros((vocab_size, 100))
exception = 0
words = []
for word, i in t.word_index.items():
    w = word.lower()
    w = w.replace('é', 'e')
    embedding_vector = embeddings_index.get(w)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
    else:
        embedding_matrix[i] = np.zeros(100)
        words.append(w)
        exception = exception + 1

In [17]:
q1_input = Input(shape=[max_query_length], name='query1')
c1_input = Input(shape=[max_category_length], name='cat1')
q2_input = Input(shape=[max_query_length], name='query2')
c2_input = Input(shape=[max_category_length], name='cat2')

q_emb = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=max_query_length, trainable=False, mask_zero = True)
c_emb = Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=max_category_length, trainable=False, mask_zero = True)

q1_embedding = q_emb(q1_input)
c1_embedding = c_emb(c1_input)
q2_embedding = q_emb(q2_input)
c2_embedding = c_emb(c2_input)

q1_avg = GlobalAveragePooling1D()(q1_embedding)
c1_avg = GlobalAveragePooling1D()(c1_embedding)
q2_avg = GlobalAveragePooling1D()(q2_embedding)
c2_avg = GlobalAveragePooling1D()(c2_embedding)

input_vecs = Concatenate()([q1_avg, c1_avg, q2_avg, c2_avg])

x = Dense(256, activation="relu")(input_vecs) 

x = Dropout(0.1)(x)

x = Dense(128, activation="relu")(x) 

x = Dropout(0.1)(x)

y = Dense(1, activation="linear")(x)

check = Model(inputs = c1_input, outputs=c1_avg)

model = Model(inputs=[q1_input, c1_input, q2_input, c2_input], outputs=y)
model.compile(loss= 'mse', optimizer = 'adam', metrics=['mse', 'mae'])


In [18]:
mc = ModelCheckpoint(checkpoint, monitor='val_loss', verbose=1, save_best_only = True)
es = EarlyStopping(monitor = 'val_loss', patience = 2000, verbose=1, restore_best_weights = True)

In [19]:
print(train_x[:10])

[array([[ 1030,  7141,  6218, ...,     0,     0,     0],
       [ 1877,  7547,    53, ...,     0,     0,     0],
       [32661,   406,  2595, ...,     0,     0,     0],
       ...,
       [  654, 23200,  7703, ...,     0,     0,     0],
       [  297,  5698,  5807, ...,     0,     0,     0],
       [ 9126,  1883,    82, ...,     0,     0,     0]], dtype=int32), array([[ 888, 2003,    1, ...,    0,    0,    0],
       [  14,   21,    1, ...,    0,    0,    0],
       [  97,   79,   75, ...,    0,    0,    0],
       ...,
       [  19,   12,    6, ...,    0,    0,    0],
       [ 111,  225, 1026, ...,    0,    0,    0],
       [   9,   11,   42, ...,    0,    0,    0]], dtype=int32), array([[ 6218,     0,     0, ...,     0,     0,     0],
       [ 7547,   930,     0, ...,     0,     0,     0],
       [  538,     0,     0, ...,     0,     0,     0],
       ...,
       [ 3252,  1264,     0, ...,     0,     0,     0],
       [  318,  5698,  5807, ...,     0,     0,     0],
       [12577,  1

In [20]:
#history = model.fit(train_x, train_y, validation_data = (val_x, val_y), epochs = epochs, batch_size = batch_size,callbacks = [es, mc])
history = model.fit(train_x, train_y, validation_data = (val_x, val_y), epochs = 1, batch_size = batch_size)



In [21]:
model.load_weights(checkpoint)

In [22]:
def calculate_labels_from_predictions(val_y, num_of_classes):
    lab_y = []
    
    for v in val_y:
        if np.isnan(v):
            lab_y.append(num_of_classes - 1)
            print('Still None')
            continue
        t = round(v[0])
        if t < 0:
            t = 0
        if t > num_of_classes - 1:
            t = num_of_classes - 1
        lab_y.append(t)
    return lab_y

In [24]:
pred_y = model.predict(test_x)
num_of_classes = 5

In [25]:
print(pred_y[:10])
print(np.isnan(pred_y[0]))

[[0.8588687 ]
 [1.1846895 ]
 [0.22063911]
 [3.7653232 ]
 [1.1964676 ]
 [2.9128995 ]
 [1.9520273 ]
 [0.4687314 ]
 [0.09857488]
 [0.31484908]]
[False]


In [26]:
pred_y = calculate_labels_from_predictions(pred_y, num_of_classes)

In [27]:
acc = accuracy_score(test_y, pred_y)
male = np.sum([abs(pred_y[i] - a) for i,a in enumerate(test_y)]) / len(test_y)
with open(result_path, 'w') as f:
    for item in [acc, male]:
        f.write("%s\n" % item)

In [28]:
print([acc,male])

[0.5881182964180444, 0.45956173738779543]


In [29]:
print([len(test_y), len(pred_y)])

[8159537, 8159537]
