In [2]:

import os
import numpy as np 
import pandas as pd 
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

In [24]:
# Fixed seed so the train/validation partition (and therefore every metric
# reported further down) is the same on each Restart & Run All.
RANDOM_SEED = 42

# Read the labelled data, then hold out 10% of the rows for validation.
# NOTE: `train_df` is rebound to the 90% training portion after the split.
train_df = pd.read_csv("../project/input/train.csv")
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=RANDOM_SEED)

In [7]:
# Build a word -> vector lookup from the pre-trained GloVe text file.
emb_index = {}
# `with` guarantees the handle is closed even if a line fails to parse, and the
# explicit encoding keeps decoding independent of the platform default.
with open('../project/input/embeddings/glove.840B.300d/glove.840B.300d.txt',
          encoding='utf-8') as emb_file:
    for line in tqdm(emb_file):
        # Fields are separated by a single space: the first is the token,
        # the remaining ones are parsed as float32 vector components.
        emb_values = line.split(" ")
        word = emb_values[0]
        vecs = np.asarray(emb_values[1:], dtype='float32')
        emb_index[word] = vecs

print('Found %s word vectors.' % len(emb_index))

2196017it [02:00, 18241.99it/s]

Found 2196016 word vectors.





In [8]:
TIME_STEP = 30       # number of token positions per text (longer texts are truncated)
EMB_VEC_SIZE = 300   # length of each embedding vector
BATCH_SIZE = 128     # number of rows per generated batch


def text_to_vec(text, embeddings=None):
    """Turn a text into a fixed-size ``(TIME_STEP, EMB_VEC_SIZE)`` float32 matrix.

    The last character of ``text`` is dropped (presumably a trailing ``?`` --
    confirm against the data), the remainder is split on whitespace and at most
    ``TIME_STEP`` tokens are kept. Each token is replaced by its vector from the
    lookup; tokens missing from the lookup, and the padding positions after the
    last token, are all-zero vectors.

    Args:
        text: The string to encode.
        embeddings: Optional mapping from token to vector. Defaults to the
            module-level ``emb_index``.

    Returns:
        ``numpy.ndarray`` of shape ``(TIME_STEP, EMB_VEC_SIZE)`` and dtype float32.
    """
    if embeddings is None:
        embeddings = emb_index
    # float32 zero vector: a float64 one would silently upcast the whole matrix
    # whenever a token is unknown or padding is needed.
    zero_vec = np.zeros(EMB_VEC_SIZE, dtype='float32')
    tokens = text[:-1].split()[:TIME_STEP]
    embeds = [embeddings.get(token, zero_vec) for token in tokens]
    embeds += [zero_vec] * (TIME_STEP - len(embeds))
    return np.array(embeds, dtype='float32')


In [9]:

def train_gen(train_df, shuffle=False):
    """Endlessly yield ``(features, labels)`` batches built from ``train_df``.

    Each pass over the frame produces ``ceil(len(train_df) / BATCH_SIZE)``
    batches; the final batch of a pass may be smaller than ``BATCH_SIZE``.

    Args:
        train_df: DataFrame whose column at position 1 holds the text
            (presumably the question text -- confirm against train.csv) and
            which has a ``"target"`` column with the labels.
        shuffle: When truthy, the rows are re-shuffled before every pass.

    Yields:
        Tuple ``(text_arr, labels)``: the stacked ``text_to_vec`` outputs for
        the batch and the matching ``"target"`` values as a NumPy array.

    Raises:
        ValueError: If ``train_df`` has no rows (raised on first iteration).
    """
    # Without this guard an empty frame makes the `while True` loop below spin
    # forever without ever yielding.
    if len(train_df) == 0:
        raise ValueError("train_gen received an empty DataFrame; nothing to yield.")

    n_batches = math.ceil(len(train_df) / BATCH_SIZE)
    while True:
        if shuffle:
            train_df = train_df.sample(frac=1.)  # Shuffle the data.
        for i in range(n_batches):
            start, stop = i * BATCH_SIZE, (i + 1) * BATCH_SIZE
            texts = train_df.iloc[start:stop, 1]
            text_arr = np.array([text_to_vec(text) for text in texts])
            # .iloc keeps the label slice explicitly positional, so it always
            # lines up with the positional text slice above.
            labels = train_df["target"].iloc[start:stop]
            yield text_arr, np.array(labels)

In [10]:
from keras.models import  Model
from keras.layers import LSTM, Dense, Bidirectional, Input, CuDNNLSTM
from tensorflow.python.client import device_lib

Using TensorFlow backend.


In [11]:
def get_available_gpus():
    """Return the names of all local devices whose ``device_type`` is ``'GPU'``."""
    gpu_names = []
    for device in device_lib.list_local_devices():
        if device.device_type == 'GPU':
            gpu_names.append(device.name)
    return gpu_names
    
def get_base_model(input):
    """Build a stacked bidirectional-LSTM model with a sigmoid output on `input`.

    Architecture: Bidirectional LSTM (64 units, returns the full sequence) ->
    Bidirectional LSTM (64 units) -> Dense(1, sigmoid). ``CuDNNLSTM`` cells are
    used when ``get_available_gpus()`` reports at least one device, plain
    ``LSTM`` cells otherwise.

    Args:
        input: The Keras input tensor the network is built on.

    Returns:
        An uncompiled ``Model`` mapping `input` to the sigmoid output.
    """
    # Pick the recurrent cell implementation once instead of duplicating the stack.
    gpu_names = get_available_gpus()
    lstm_cls = CuDNNLSTM if len(gpu_names) > 0 else LSTM

    hidden = Bidirectional(lstm_cls(64, return_sequences=True, name='lstm1'))(input)
    hidden = Bidirectional(lstm_cls(64, name='lstm2'))(hidden)
    outputs = Dense(1, activation="sigmoid", name='last')(hidden)

    return Model(input, outputs)

In [12]:
# Every sample fed to the network is a (TIME_STEP, EMB_VEC_SIZE) matrix.
features = Input(shape=(TIME_STEP, EMB_VEC_SIZE))
model = get_base_model(features)

# Single sigmoid output -> binary cross-entropy loss, optimised with Adam.
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

Instructions for updating:
Colocations handled automatically by placer.


In [17]:

# Training batches are re-shuffled on every pass; validation batches are not.
train_generator = train_gen(train_df, True)
validation_generator = train_gen(val_df, False)

# train_gen produces ceil(len / BATCH_SIZE) batches per pass over the data.
# Using the same count here means one Keras epoch is exactly one pass: the
# final partial batch is not dropped, the generators do not drift across
# epoch boundaries, and the step count is never 0 for a small frame.
train_steps = math.ceil(len(train_df) / BATCH_SIZE)
val_steps = math.ceil(len(val_df) / BATCH_SIZE)

model.fit_generator(train_generator,
                    steps_per_epoch=train_steps,
                    validation_data=validation_generator,
                    validation_steps=val_steps,
                    epochs=1,
                    verbose=True)

Instructions for updating:
Use tf.cast instead.
Epoch 1/1


<keras.callbacks.History at 0x1a77b56c88>

In [39]:
def test_gen(train_df):
    """Yield ``(features, labels)`` batches for a single pass over ``train_df``.

    Rows are taken in order, ``BATCH_SIZE`` at a time; the last batch may be
    smaller. The text is read from the column at position 1 and the labels
    from the ``"target"`` column.
    """
    for start in range(0, len(train_df), BATCH_SIZE):
        stop = start + BATCH_SIZE
        batch_texts = train_df.iloc[start:stop, 1]
        text_arr = np.array([text_to_vec(text) for text in batch_texts])
        batch_targets = train_df["target"][start:stop]
        yield text_arr, np.array(batch_targets)

In [77]:
# Collect model outputs and true labels batch by batch over the validation set.
y_test = []
y_preds = []
# test_gen is a generator with no length, so tqdm is told the batch count
# explicitly; this lets the bar show percentage and ETA.
n_val_batches = math.ceil(len(val_df) / BATCH_SIZE)
for x, y in tqdm(test_gen(val_df), total=n_val_batches):
    y_preds.extend(model.predict(x))
    y_test.extend(y)


0it [00:00, ?it/s][A
1it [00:00,  5.07it/s][A
2it [00:00,  4.96it/s][A
3it [00:00,  4.90it/s][A
4it [00:00,  5.08it/s][A
5it [00:01,  4.98it/s][A
6it [00:01,  4.94it/s][A
7it [00:01,  5.07it/s][A
8it [00:01,  4.98it/s][A
9it [00:01,  4.93it/s][A
10it [00:02,  5.04it/s][A
11it [00:02,  4.95it/s][A
12it [00:02,  4.88it/s][A
13it [00:02,  5.03it/s][A
14it [00:02,  4.95it/s][A
15it [00:03,  4.90it/s][A
16it [00:03,  5.07it/s][A
17it [00:03,  4.98it/s][A
18it [00:03,  4.91it/s][A
19it [00:03,  5.04it/s][A
20it [00:04,  4.94it/s][A
21it [00:04,  4.88it/s][A
22it [00:04,  5.03it/s][A
23it [00:04,  5.00it/s][A
24it [00:04,  4.91it/s][A
25it [00:05,  5.02it/s][A
26it [00:05,  4.98it/s][A
27it [00:05,  4.90it/s][A
28it [00:05,  4.71it/s][A
29it [00:05,  4.91it/s][A
30it [00:06,  4.86it/s][A
31it [00:06,  4.80it/s][A
32it [00:06,  4.92it/s][A
33it [00:06,  4.86it/s][A
34it [00:06,  4.79it/s][A
35it [00:07,  4.94it/s][A
36it [00:07,  4.94it/s][A
37it [00:07,  

296it [01:04,  4.23it/s][A
297it [01:05,  4.35it/s][A
298it [01:05,  4.42it/s][A
299it [01:05,  4.35it/s][A
300it [01:05,  4.30it/s][A
301it [01:05,  4.27it/s][A
302it [01:06,  4.39it/s][A
303it [01:06,  4.32it/s][A
304it [01:06,  4.29it/s][A
305it [01:06,  4.27it/s][A
306it [01:07,  4.41it/s][A
307it [01:07,  4.43it/s][A
308it [01:07,  4.37it/s][A
309it [01:07,  4.33it/s][A
310it [01:08,  4.30it/s][A
311it [01:08,  4.42it/s][A
312it [01:08,  4.33it/s][A
313it [01:08,  4.29it/s][A
314it [01:08,  4.26it/s][A
315it [01:09,  4.37it/s][A
316it [01:09,  4.39it/s][A
317it [01:09,  4.34it/s][A
318it [01:09,  4.28it/s][A
319it [01:10,  4.25it/s][A
320it [01:10,  4.37it/s][A
321it [01:10,  4.32it/s][A
322it [01:10,  4.28it/s][A
323it [01:11,  4.25it/s][A
324it [01:11,  4.22it/s][A
325it [01:11,  4.37it/s][A
326it [01:11,  4.30it/s][A
327it [01:12,  4.25it/s][A
328it [01:12,  4.25it/s][A
329it [01:12,  4.38it/s][A
330it [01:12,  4.44it/s][A
331it [01:12,  4.34i

588it [02:12,  4.55it/s][A
589it [02:12,  4.58it/s][A
590it [02:12,  4.48it/s][A
591it [02:12,  4.36it/s][A
592it [02:13,  4.53it/s][A
593it [02:13,  4.61it/s][A
594it [02:13,  4.53it/s][A
595it [02:13,  4.48it/s][A
596it [02:13,  4.60it/s][A
597it [02:14,  4.53it/s][A
598it [02:14,  4.46it/s][A
599it [02:14,  4.41it/s][A
600it [02:14,  4.52it/s][A
601it [02:15,  4.55it/s][A
602it [02:15,  4.47it/s][A
603it [02:15,  4.43it/s][A
604it [02:15,  4.53it/s][A
605it [02:15,  4.59it/s][A
606it [02:16,  4.49it/s][A
607it [02:16,  4.41it/s][A
608it [02:16,  4.57it/s][A
609it [02:16,  4.58it/s][A
610it [02:17,  4.48it/s][A
611it [02:17,  4.43it/s][A
612it [02:17,  4.55it/s][A
613it [02:17,  4.58it/s][A
614it [02:17,  4.51it/s][A
615it [02:18,  4.48it/s][A
616it [02:18,  4.61it/s][A
617it [02:18,  4.54it/s][A
618it [02:18,  4.45it/s][A
619it [02:19,  4.42it/s][A
620it [02:19,  4.55it/s][A
621it [02:19,  4.55it/s][A
622it [02:19,  4.45it/s][A
623it [02:19,  4.40i

880it [03:18,  4.42it/s][A
881it [03:18,  4.37it/s][A
882it [03:19,  4.34it/s][A
883it [03:19,  4.46it/s][A
884it [03:19,  4.40it/s][A
885it [03:19,  4.36it/s][A
886it [03:20,  4.32it/s][A
887it [03:20,  4.46it/s][A
888it [03:20,  4.49it/s][A
889it [03:20,  4.41it/s][A
890it [03:20,  4.37it/s][A
891it [03:21,  4.32it/s][A
892it [03:21,  4.48it/s][A
893it [03:21,  4.42it/s][A
894it [03:21,  4.35it/s][A
895it [03:22,  4.33it/s][A
896it [03:22,  4.43it/s][A
897it [03:22,  4.38it/s][A
898it [03:22,  4.32it/s][A
899it [03:22,  4.32it/s][A
900it [03:23,  4.28it/s][A
901it [03:23,  4.42it/s][A
902it [03:23,  4.38it/s][A
903it [03:23,  4.33it/s][A
904it [03:24,  4.29it/s][A
905it [03:24,  4.43it/s][A
906it [03:24,  4.39it/s][A
907it [03:24,  4.35it/s][A
908it [03:25,  4.34it/s][A
909it [03:25,  4.47it/s][A
910it [03:25,  4.48it/s][A
911it [03:25,  4.41it/s][A
912it [03:25,  4.37it/s][A
913it [03:26,  4.33it/s][A
914it [03:26,  4.45it/s][A
915it [03:26,  4.40i

In [100]:
# Turn the collected per-sample outputs into hard 0/1 predictions and score them.
y1 = np.array(y_preds)
y2 = np.array(y_test)
# reshape(-1) flattens to 1-D even when there is a single prediction, where
# squeeze() would collapse the array to 0-d.
y11 = y1.reshape(-1)
y1 = y11 > 0.5  # decision threshold on the sigmoid output
# The builtin `int` replaces the `np.int` alias, which was removed in NumPy 1.24.
y1 = y1.astype(int)
score = f1_score(y2, y1)
print(score)

0.6076005302695537
