In [63]:
import pandas as pd
import numpy as np
from pandasql import sqldf
import boto3
import json
import os
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from keras import models, layers
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
# python -m spacy download en
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

# %run '../extra_fns.ipynb'

In [64]:
with open('../config.json') as json_data:
    config = json.load(json_data)

In [65]:
s3 = boto3.client(
    's3',
    aws_access_key_id=config['boto']['aws_access_key_id'],
    aws_secret_access_key=config['boto']['aws_secret_access_key']
)

In [66]:
model_dir = 'trained_models'
try:
    os.makedirs(model_dir)
except Exception as e:
    print(e)

[Errno 17] File exists: 'trained_models'


In [67]:
data_dir = 'data'
if not os.path.exists(data_dir):
    os.makedirs(data_dir)
    
train_file = data_dir + '/train.csv'
test_file = data_dir + '/test.csv'

In [68]:
# s3.upload_file(train_file, config['boto']['buckets']['kaggle'], train_file)
# s3.upload_file(src_file_cleaned, boto_config['buckets']['kaggle'], src_file_cleaned)

# s3.download_file(boto_config['buckets']['kaggle'], src_file, src_file)
# s3.download_file(boto_config['buckets']['kaggle'], src_file_cleaned, src_file_cleaned)

In [69]:
train = pd.read_csv(train_file)
test = pd.read_csv(test_file)

In [70]:
print('% of positive class: {0}'.format(train.target.sum()/train.shape[0]*100))

% of positive class: 6.187017751787352


In [71]:
train['question_length'] = train.question_text.str.len()

# Preprocess data (spacy)

In [130]:
# # clean up text
# nlp = spacy.load('en')

# def token_filter(token):
#     return not (token.is_punct | token.is_space | token.is_stop)

# filtered_tokens = []
# for doc in nlp.pipe(train.question_text.tolist()):
#     tokens = [token.text for token in doc if token_filter(token) and token.text not in STOP_WORDS]
#     filtered_tokens.append(' '.join(tokens))
# # data['clean_text'] = filtered_tokens

# # data.to_csv(src_file_cleaned, index=False)

# Split data

In [131]:
train.question_length.describe()

count    456257.000000
mean         38.650471
std           8.102890
min           1.000000
25%          33.000000
50%          40.000000
75%          45.000000
max          50.000000
Name: question_length, dtype: float64

In [132]:
seq_maxlen = 50
train = train[train.question_length<=seq_maxlen]

In [145]:
X_train, X_test, y_train, y_test = train_test_split(train.question_text, 
                                                    train.target, 
                                                    test_size=0.1, 
                                                    random_state=1, 
                                                    stratify=train.target)

# Sequentialize words

In [146]:
vocab_maxlen = 15000
tk = Tokenizer(num_words=vocab_maxlen)

In [147]:
tk.fit_on_texts(X_train)

In [148]:
X_train = tk.texts_to_sequences(X_train)
X_test = tk.texts_to_sequences(X_test)

In [149]:
X_train = pad_sequences(X_train, maxlen=seq_maxlen)
X_test = pad_sequences(X_test, maxlen=seq_maxlen)

# Training

In [150]:
callbacks = [
    EarlyStopping(monitor='val_loss', patience=1),
    ModelCheckpoint(filepath=model_dir + '/basic_model.h5', monitor='val_loss', save_best_only=True)
]

In [151]:
embedding_dim = 50
model = models.Sequential()

model.add(layers.Embedding(vocab_maxlen, embedding_dim, input_length=seq_maxlen))
model.add(layers.Flatten())
model.add(layers.Dense(60, activation='relu'))
model.add(layers.Dense(30, activation='relu'))
model.add(layers.Dropout(0.2))
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 60, 50)            750000    
_________________________________________________________________
flatten_7 (Flatten)          (None, 3000)              0         
_________________________________________________________________
dense_16 (Dense)             (None, 60)                180060    
_________________________________________________________________
dense_17 (Dense)             (None, 30)                1830      
_________________________________________________________________
dropout_6 (Dropout)          (None, 30)                0         
_________________________________________________________________
dense_18 (Dense)             (None, 1)                 31        
Total params: 931,921
Trainable params: 931,921
Non-trainable params: 0
_________________________________________________________________


In [152]:
history = model.fit(X_train, 
                    y_train, 
                    epochs=10, 
                    batch_size=1024, 
                    validation_split=0.10, 
                    callbacks=callbacks)

Train on 369567 samples, validate on 41064 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10


In [153]:
model.evaluate(X_test, y_test)



[0.09000294391853417, 0.9700828475018456]

In [None]:
submission = pd.DataFrame()
submission['qid'] = test.qid
submission['prediction'] = model.predict_classes(X_test)

submission.to_csv(data_dir + '/submission.csv', index=False)