In [None]:
%%sh
## Dependencies: install the extra Python packages used by this notebook.
## NOTE(review): versions are unpinned, and `pip` here is whichever one the
## shell resolves -- confirm it targets the same environment as the kernel.
pip install pydotplus
pip install tqdm

In [1]:
# Download NLTK's 'stopwords' package into the local nltk_data directory.
# The call is the cell's last expression, so its return value is displayed.
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# autoreload mode 2 reloads imported modules before each cell executes, so
# edits to ml_pipeline.py are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Project-local helper module: every data-loading, embedding, model, and
# submission helper called in this notebook is reached through `pipeline`.
import ml_pipeline as pipeline
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold

Using TensorFlow backend.





In [3]:
# Read the train/test frames, then tokenize both with the helper module.
# `word_index` is handed to pipeline.prepare_embeddings in a later cell.
train_df, test_df = pipeline.read_input_data()
seq_train, seq_test, word_index = pipeline.tokenize_data(train_df, test_df)

  2%|▏         | 3415/159571 [00:00<00:04, 34149.95it/s]

num train:  159571
num test:  153164
pre-processing train data...


100%|██████████| 159571/159571 [00:04<00:00, 35122.78it/s]
100%|██████████| 153164/153164 [00:03<00:00, 38701.33it/s]


tokenizing input data...
dictionary size:  282101


In [4]:
# Target columns selected from train_df when building the label matrices.
label_names = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

In [5]:
# Build the embedding matrix for `word_index` from the named embedding source.
embedding_source = "Glove-Twitter"
embedding_matrix = pipeline.prepare_embeddings(embedding_source, word_index)

1193514it [01:14, 16096.20it/s]


preparing embedding matrix...
number of null word embeddings: 2316


In [13]:
# 5-fold cross-validation.
# For every fold: build and train a model on the training split, evaluate it
# on the held-out split, save the fitted model, and write the out-of-fold (OOF)
# predictions for the held-out rows to a per-fold CSV.

kf = KFold(n_splits=5)
cvscores = []  # scores[1] * 100 for each fold, in fold order

# training params
batch_size = 256
num_epochs = 10

# The label matrix is the same for every fold, so extract it once up front
# and slice it positionally inside the loop.
labels = train_df[label_names].values

# enumerate(start=1) yields 1-based fold numbers, which are used in the
# model and OOF file names below.
for nfold, (train_index, test_index) in enumerate(kf.split(seq_train), start=1):
    print(f' In Fold {nfold}... : \n')
    print("TRAIN:", len(train_index), "TEST:", len(test_index))

    seq_train_split = seq_train[train_index]
    y_train_split = labels[train_index]

    # build a new model for this fold and train it on the training split
    model = pipeline.init_cnn_model(embedding_matrix)
    hist = pipeline.train_model(model, num_epochs, batch_size, seq_train_split, y_train_split)

    # evaluate on the held-out split; scores[1] pairs with model.metrics_names[1]
    seq_test_split = seq_train[test_index]
    y_test_split = labels[test_index]
    scores = model.evaluate(seq_test_split, y_test_split, verbose=0)
    print(f"{model.metrics_names[1]}: {scores[1] * 100:.2f}%")

    # save the model
    pipeline.save_model(model, f'models/cnn-fold-{nfold}')
    cvscores.append(scores[1] * 100)

    # save OOF predictions; named `oof_pred` so they are not confused with
    # predictions on the real test set
    oof_pred = model.predict(seq_test_split)
    pipeline.create_submission_file(
        train_df.iloc[test_index],
        oof_pred,
        f'outputs/cnn_glove_twitter_cv_{nfold}_oof.csv',
    )

 In Fold 1... : 

TRAIN: 127656 TEST: 31915
Building CNN model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_12 (Embedding)     (None, 150, 200)          6000000   
_________________________________________________________________
conv1d_23 (Conv1D)           (None, 150, 64)           89664     
_________________________________________________________________
batch_normalization_12 (Batc (None, 150, 64)           256       
_________________________________________________________________
max_pooling1d_12 (MaxPooling (None, 75, 64)            0         
_________________________________________________________________
conv1d_24 (Conv1D)           (None, 75, 64)            28736     
_________________________________________________________________
global_max_pooling1d_12 (Glo (None, 64)                0         
_________________________________________________________________
dropout_12

Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
<keras.callbacks.History object at 0x7f8101713e80>
acc: 98.25%
Saved model to disk
 In Fold 4... : 

TRAIN: 127657 TEST: 31914
Building CNN model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_15 (Embedding)     (None, 150, 200)          6000000   
_________________________________________________________________
conv1d_29 (Conv1D)           (None, 150, 64)           89664     
_________________________________________________________________
batch_normalization_15 (Batc (None, 150, 64)           256       
_________________________________________________________________
max_pooling1d_15 (MaxPooling (None, 75, 64)            0         
_________________________________________________________________
conv1d_30 (Conv1D)           (None, 75, 64)            28736     
_______________________________________________

In [14]:
# Display the per-fold scores (scores[1] * 100) collected during cross-validation.
cvscores

[98.10642722630489,
 98.17739214568148,
 98.25259423341086,
 98.15598036432024,
 98.18888144130925]

In [15]:
# Pick the 1-based number of the fold with the highest score (the earliest
# fold wins ties) and load the model that was saved under that fold number.
fold_numbers = range(1, len(cvscores) + 1)
best_model_index = max(fold_numbers, key=lambda fold: cvscores[fold - 1])
print(f'best_model_index : {best_model_index}')

best_model_path = f'models/cnn-fold-{best_model_index}'
best_model = pipeline.load_model(best_model_path)

best_model_index : 3
Loaded model from disk


In [26]:
# Predict on the test sequences with the selected model and hand the
# predictions to the helper that writes the submission CSV.
submission_path = 'outputs/cnn_glove_twitter_cv_submission.csv'
y_test = best_model.predict(seq_test)
pipeline.create_submission_file(test_df, y_test, submission_path)

(153164, 7)


In [37]:
# Concatenate the per-fold OOF prediction files into a single CSV.
fold_ids = range(1, 6)  # fold numbers 1..5, as used in the per-fold file names
fold_frames = [
    pd.read_csv(f'outputs/cnn_glove_twitter_cv_{fold_id}_oof.csv')
    for fold_id in fold_ids
]

# Stack all folds in one pd.concat call. DataFrame.append was deprecated in
# pandas 1.4 and removed in 2.0, and appending in a loop re-copies the
# accumulated frame on every iteration.
oof = pd.concat(fold_frames)
print(oof.shape)
oof.to_csv('outputs/cnn_glove_twitter_cv_oof.csv', index=False)