In [1]:
import os
import time
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm
import math
from sklearn.model_selection import train_test_split
from sklearn import metrics

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers

Using TensorFlow backend.


In [2]:
train_df = pd.read_csv("../project/input/train.csv")
test_df = pd.read_csv("../project/input/test.csv")
print("Train shape : ",train_df.shape)
print("Test shape : ",test_df.shape)

Train shape :  (1306122, 3)
Test shape :  (375806, 2)


In [3]:
## Config values 
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=2019)
embed_size = 300 # how big is each word vector
max_features = 50000 # how many unique words to use (i.e num rows in embedding vector)
maxlen = 100 # max number of words in a question to use

## fill up the missing values
train_X = train_df["question_text"].fillna("_na_").values
val_X = val_df["question_text"].fillna("_na_").values
test_X = test_df["question_text"].fillna("_na_").values

## Tokenize the sentences
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(train_X))
train_X = tokenizer.texts_to_sequences(train_X)
val_X = tokenizer.texts_to_sequences(val_X)
test_X = tokenizer.texts_to_sequences(test_X)

## Pad the sentences 
train_X = pad_sequences(train_X, maxlen=maxlen)
val_X = pad_sequences(val_X, maxlen=maxlen)
test_X = pad_sequences(test_X, maxlen=maxlen)

## Get the target values
train_y = train_df['target'].values
val_y = val_df['target'].values

In [10]:
EMBEDDING_FILE = '../project/input/embeddings/glove.840B.300d/glove.840B.300d.txt'
def get_coefs(word,*arr): return word, np.asarray(arr, dtype='float32')
embeddings_index = dict(get_coefs(*o.split(" ")) for o in open(EMBEDDING_FILE))

all_embs = np.stack(embeddings_index.values())
emb_mean,emb_std = all_embs.mean(), all_embs.std()
embed_size = all_embs.shape[1]

word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector
        

  """


In [19]:
from tensorflow.python.client import device_lib
def get_available_gpus():
        local_device_protos = device_lib.list_local_devices()
        return [x.name for x in local_device_protos if x.device_type == 'GPU']
    
def get_base_model(input):
  num_gpus = get_available_gpus()
  #print(len(num_gpus))
  #Build LSTM network

  emb_out = Embedding(max_features, embed_size, weights=[embedding_matrix])(input)

  if(len(num_gpus)>0):
     from keras.layers import CuDNNLSTM
     lstm_out = Bidirectional(CuDNNLSTM(64, return_sequences=True))(emb_out)
  else:
     lstm_out = Bidirectional(LSTM(64, return_sequences=True))(emb_out)
        
  max_out = GlobalMaxPool1D()(lstm_out)
  dense_16 = Dense(16, activation="relu")(max_out)
  drop_out = Dropout(0.1)(dense_16)
  output = Dense(1, activation="sigmoid")(drop_out)
  l_model = Model(input, output) 
  
  return(l_model)

In [20]:
features = Input(shape=(maxlen,))

model = get_base_model(features)
model.compile(loss='binary_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy'])
print(model.summary())    

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_6 (Embedding)      (None, 100, 300)          15000000  
_________________________________________________________________
bidirectional_5 (Bidirection (None, 100, 128)          186880    
_________________________________________________________________
global_max_pooling1d_5 (Glob (None, 128)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 16)                2064      
_________________________________________________________________
dropout_5 (Dropout)          (None, 16)                0         
_________________________________________________________________
dense_10 (Dense)             (None, 1)                 17        
Total para

In [21]:
model.fit(train_X, train_y, batch_size=512, epochs=10, validation_data=(val_X, val_y))

Train on 1175509 samples, validate on 130613 samples
Epoch 1/10
   7168/1175509 [..............................] - ETA: 1:11:37 - loss: 0.3182 - acc: 0.8836

KeyboardInterrupt: 

In [22]:
pred_glove_val_y = model.predict([val_X], batch_size=1024, verbose=1)
for thresh in np.arange(0.1, 0.501, 0.01):
    thresh = np.round(thresh, 2)
    f1_score = metrics.f1_score(val_y, (pred_glove_val_y>thresh).astype(int))
    print("F1 score at threshold {0} is {1}".format(thresh, f1_ecore))



NameError: name 'f1_ecore' is not defined