In [10]:
import pandas as pd
import numpy as np
from pandasql import sqldf
import boto3
import json
import os
from keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard
from keras import models, layers
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
# python -m spacy download en
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.metrics import confusion_matrix, f1_score, precision_score, recall_score

# %run '../extra_fns.ipynb'

In [11]:
with open('../config.json') as json_data:
    config = json.load(json_data)

In [12]:
s3 = boto3.client(
    's3',
    aws_access_key_id=config['boto']['aws_access_key_id'],
    aws_secret_access_key=config['boto']['aws_secret_access_key']
)

In [13]:
model_dir = 'trained_models'
try:
    os.makedirs(model_dir)
except Exception as e:
    print(e)

[Errno 17] File exists: 'trained_models'


In [14]:
data_dir = 'data'
if not os.path.exists(data_dir):
    os.makedirs(data_dir)
    
train_file = data_dir + '/train.csv'
test_file = data_dir + '/test.csv'

In [15]:
# s3.upload_file(train_file, config['boto']['buckets']['kaggle'], train_file)
# s3.upload_file(src_file_cleaned, boto_config['buckets']['kaggle'], src_file_cleaned)

# s3.download_file(boto_config['buckets']['kaggle'], src_file, src_file)
# s3.download_file(boto_config['buckets']['kaggle'], src_file_cleaned, src_file_cleaned)

In [16]:
train = pd.read_csv(train_file)
test = pd.read_csv(test_file)

In [17]:
print('% of positive class: {0}'.format(train.target.sum()/train.shape[0]*100))

% of positive class: 6.187017751787352


In [18]:
train['question_length'] = train.question_text.str.len()

# Preprocess data (spacy)

In [19]:
# # clean up text
# nlp = spacy.load('en')

# def token_filter(token):
#     return not (token.is_punct | token.is_space | token.is_stop)

# filtered_tokens = []
# for doc in nlp.pipe(train.question_text.tolist()):
#     tokens = [token.text for token in doc if token_filter(token) and token.text not in STOP_WORDS]
#     filtered_tokens.append(' '.join(tokens))
# # data['clean_text'] = filtered_tokens

# # data.to_csv(src_file_cleaned, index=False)

# Split data

In [20]:
train.question_length.describe()

count    1.306122e+06
mean     7.067884e+01
std      3.878428e+01
min      1.000000e+00
25%      4.500000e+01
50%      6.000000e+01
75%      8.500000e+01
max      1.017000e+03
Name: question_length, dtype: float64

In [21]:
seq_maxlen = 50
train = train[train.question_length<=seq_maxlen]

In [22]:
X_train, X_test, y_train, y_test = train_test_split(np.array(train.question_text), 
                                                    np.array(train.target), 
                                                    test_size=0.1, 
                                                    random_state=1, 
                                                    stratify=train.target)

# Undersample

In [23]:
# ix_pos = np.where(y_train==1)[0]
# ix_neg = np.where(y_train==0)[0]

In [24]:
# us_rate = 0.3
# np.random.shuffle(ix_neg)
# ix_neg = ix_neg[0:int((1-us_rate)*ix_neg.shape[0])]

In [25]:
# ix = np.append(ix_pos, ix_neg)
# np.random.shuffle(ix)

In [26]:
# y_train = y_train[ix]
# X_train = X_train[ix]

# Sample Weight

In [27]:
from sklearn.utils import class_weight
list_classes = [0,1]
sample_weights = class_weight.compute_sample_weight('balanced', y_train)

In [28]:
sample_weights[0:15]

array([ 0.51940605,  0.51940605,  0.51940605,  0.51940605,  0.51940605,
        0.51940605,  0.51940605,  0.51940605,  0.51940605,  0.51940605,
        0.51940605,  0.51940605,  0.51940605,  0.51940605, 13.38257724])

In [29]:
pos_perc = round(y_train.sum()/y_train.shape[0], 2)

In [30]:
weights = {0:(1-pos_perc), 1:pos_perc}

# Sequentialize words

In [31]:
vocab_maxlen = 15000
tk = Tokenizer(num_words=vocab_maxlen)

In [32]:
tk.fit_on_texts(X_train)

In [33]:
X_train = tk.texts_to_sequences(X_train)
X_test = tk.texts_to_sequences(X_test)

In [34]:
X_train = pad_sequences(X_train, maxlen=seq_maxlen)
X_test = pad_sequences(X_test, maxlen=seq_maxlen)

# Training

In [35]:
callbacks = [
    EarlyStopping(monitor='val_loss', patience=1),
    ModelCheckpoint(filepath=model_dir + '/basic_model.h5', monitor='val_loss', save_best_only=True)
]

In [36]:
embedding_dim = 50
model = models.Sequential()

model.add(layers.Embedding(vocab_maxlen, embedding_dim, input_length=seq_maxlen))
model.add(layers.Flatten())
model.add(layers.Dense(60, activation='relu'))
model.add(layers.Dense(30, activation='relu'))
model.add(layers.Dropout(0.2))
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 50)            750000    
_________________________________________________________________
flatten_1 (Flatten)          (None, 2500)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 60)                150060    
_________________________________________________________________
dense_2 (Dense)              (None, 30)                1830      
_________________________________________________________________
dropout_1 (Dropout)          (None, 30)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 31        
Total params: 901,921
Trainable params: 901,921
Non-trainable params: 0
_________________________________________________________________


In [37]:
history = model.fit(X_train, 
                    y_train, 
                    epochs=10, 
                    batch_size=512,
                    sample_weight=sample_weights,
                    validation_split=0.10, 
                    callbacks=callbacks)

Train on 369567 samples, validate on 41064 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10


In [38]:
model.evaluate(X_test, y_test)



[0.57563886898819, 0.7596984175741555]

In [40]:
embedding_dim = 50
model = models.Sequential()

model.add(layers.Embedding(vocab_maxlen, embedding_dim, input_length=seq_maxlen))
model.add(layers.LSTM(32))
model.add(layers.Dense(60, activation='relu'))
model.add(layers.Dense(30, activation='relu'))
model.add(layers.Dropout(0.2))
model.add(layers.Dense(1, activation='sigmoid'))

model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 50, 50)            750000    
_________________________________________________________________
lstm_2 (LSTM)                (None, 32)                10624     
_________________________________________________________________
dense_7 (Dense)              (None, 60)                1980      
_________________________________________________________________
dense_8 (Dense)              (None, 30)                1830      
_________________________________________________________________
dropout_3 (Dropout)          (None, 30)                0         
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 31        
Total params: 764,465
Trainable params: 764,465
Non-trainable params: 0
_________________________________________________________________


In [41]:
history = model.fit(X_train, 
                    y_train, 
                    epochs=10, 
                    batch_size=512,
                    sample_weight=sample_weights,
                    validation_split=0.10, 
                    callbacks=callbacks)

Train on 369567 samples, validate on 41064 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10


In [42]:
model.evaluate(X_test, y_test)



[0.2648771057874637, 0.8979310042545744]

In [51]:
confusion_matrix(y_test, model.predict_classes(X_test))

array([[36629,  7292],
       [  194,  1511]])

In [52]:
f1_score(y_test, model.predict_classes(X_test))

0.28759040730871716

In [53]:
precision_score(y_test, model.predict_classes(X_test))

0.17164602976258095

In [54]:
recall_score(y_test, model.predict_classes(X_test))

0.886217008797654

In [43]:
confusion_matrix(y_test, model.predict_classes(X_test))

array([[43662,   259],
       [ 1106,   599]])

In [45]:
f1_score(y_test, model.predict_classes(X_test))

0.46742099102614126

In [47]:
precision_score(y_test, model.predict_classes(X_test))

0.6981351981351981

In [46]:
recall_score(y_test, model.predict_classes(X_test))

0.35131964809384164

In [None]:
submission = pd.DataFrame()
submission['qid'] = test.qid
submission['prediction'] = model.predict_classes(X_test)

submission.to_csv(data_dir + '/submission.csv', index=False)