In [2]:
import numpy as np
import pandas as pd
import os

from utils import ToxicCommentsDataset, RocAucEvaluation

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from xgboost import XGBClassifier

from keras.layers import Dense,Input,LSTM,Bidirectional,Activation,Conv1D,GRU
from keras.callbacks import Callback
from keras.layers import Dropout,Embedding,GlobalMaxPooling1D, MaxPooling1D, Add, Flatten
from keras.preprocessing import text, sequence
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, SpatialDropout1D
from keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from keras.callbacks import EarlyStopping,ModelCheckpoint
from keras.models import Model
from keras.optimizers import Adam

Using TensorFlow backend.


In [3]:
# Root folders that hold the competition CSV files, one per machine this notebook is run on.
# NOTE(review): absolute, user-specific paths; adjust before running elsewhere.
laptop_data_dir = '/mnt/d/kaggle-toxic-comments/data'
desktop_data_dir = '/home/nhan/Downloads/toxic_comments'

In [4]:
# Dataset wrapper (project-local `utils.ToxicCommentsDataset`) pointed at the desktop data
# folder and the train/test CSV file names.
toxic_comments_dataset = ToxicCommentsDataset(
    desktop_data_dir, 'train.csv', 'test.csv'
)

In [5]:
# max_words: vocabulary cap passed to the tokenizer and used to size the embedding matrices.
# maxlen:    sequence length passed to the tokenizer and used as the model's input length.
max_words, maxlen = 100000, 150

In [6]:
# Only the training labels are needed here; the first and third returned values
# (bound to `train_texts` / `test_texts` in a later cell) are discarded.
_unused_train_texts, y_train, _unused_test_texts = (
    toxic_comments_dataset.get_texts_and_train_labels()
)

In [7]:
# Tokenize train and test texts with the project helper, capped at `max_words` / `maxlen`.
# `word_index` is later iterated as a word -> integer index mapping.
x_train, x_test, word_index = toxic_comments_dataset.tokenize_by_keras(
    max_words=max_words,
    maxlen=maxlen,
)

Found 394787 unique tokens in corpus.


In [8]:
# Folder containing the pre-trained word-embedding text files.
# NOTE(review): absolute, user-specific path; adjust before running elsewhere.
embedding_dir = '/home/nhan/Downloads/word_embeddings/'

In [9]:
# Take the first two files in alphabetical order. The cells below assume index 0 holds
# 300-dimensional vectors and index 1 holds 200-dimensional vectors.
# NOTE(review): this relies on the directory's file names sorting that way; confirm.
_sorted_embedding_files = sorted(os.listdir(embedding_dir))
embedding_files = _sorted_embedding_files[:2]

In [10]:
# Parse each embedding file into a {word: float32 vector} dict, in the same order as
# `embedding_files`, and collect the dicts in `list_embeddings_index`.
list_embeddings_index = []
for file_name in embedding_files:
    embeddings_index = {}
    # The `with` block closes the file on exit, so no explicit close() is needed.
    with open(os.path.join(embedding_dir, file_name), encoding='utf8') as f:
        for line in f:
            values = line.rstrip().split(' ')
            if len(values) <= 2:
                # Blank line or a "<vocab_size> <dim>" header (word2vec / fastText text
                # format). Storing it would register a bogus 1-dimensional vector that
                # numpy would later broadcast across an entire embedding row.
                continue
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
    list_embeddings_index.append(embeddings_index)

In [11]:
# Prepare the embedding matrices and the two frozen embedding branches of the model.
def _build_embedding_matrix(word_index, embeddings_index, num_words, embedding_dim):
    """Return a (num_words, embedding_dim) matrix of pre-trained vectors.

    Row ``i`` holds the vector of the word whose index in ``word_index`` is ``i``.
    Words with an index >= ``num_words`` are skipped; words missing from
    ``embeddings_index`` keep an all-zero row.
    """
    embedding_matrix = np.zeros((num_words, embedding_dim))
    for word, i in word_index.items():
        if i >= num_words:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector
    return embedding_matrix


# Number of embedding rows: the vocabulary cap, or the real vocabulary size (+1 for index 0)
# when the corpus has fewer distinct tokens than the cap.
num_words = min(max_words, len(word_index) + 1)

sequence_input = Input(shape=(maxlen, ))

# The Embedding input_dim must equal the number of rows in the weight matrix, so use
# `num_words` rather than `max_words`; the two only coincide when the vocabulary is at
# least `max_words` large.
embedding_matrix0 = _build_embedding_matrix(word_index, list_embeddings_index[0], num_words, 300)
x_glove300 = Embedding(num_words, 300, weights=[embedding_matrix0], trainable = False)(sequence_input)

embedding_matrix1 = _build_embedding_matrix(word_index, list_embeddings_index[1], num_words, 200)
x_glove200 = Embedding(num_words, 200, weights=[embedding_matrix1], trainable = False)(sequence_input)

# The raw {word: vector} dicts are no longer needed once the matrices are built.
del list_embeddings_index

In [12]:
# Architecture: both frozen embeddings concatenated -> spatial dropout -> bidirectional GRU
# -> 1D convolution -> global average and max pooling (concatenated) -> 6 sigmoid outputs.
embedded = concatenate([x_glove300, x_glove200])
embedded = SpatialDropout1D(0.2)(embedded)
gru_out = Bidirectional(
    GRU(256, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)
)(embedded)
conv_out = Conv1D(128, kernel_size=3, padding="valid",
                  kernel_initializer="glorot_uniform")(gru_out)
avg_pool = GlobalAveragePooling1D()(conv_out)
max_pool = GlobalMaxPooling1D()(conv_out)
x = concatenate([avg_pool, max_pool])
preds = Dense(6, activation="sigmoid")(x)

model = Model(sequence_input, preds)
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(lr=1e-3),
    metrics=['accuracy'],
)

In [13]:
# Print the layer-by-layer summary (output shapes, parameter counts, connections).
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 150)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 150, 300)     30000000    input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 150, 200)     20000000    input_1[0][0]                    
__________________________________________________________________________________________________
concatenate_1 (Concatenate)     (None, 150, 500)     0           embedding_1[0][0]                
                                                                 embedding_2[0][0]                
__________

In [14]:
# Training hyper-parameters. `epochs` must be defined here because the fit cell reads it;
# the recorded training log ("Epoch 1/4") shows the run used 4 epochs.
batch_size = 256
epochs = 4

# Hold out 10% of the training data for validation; fixed seed for a reproducible split.
X_tra, X_val, y_tra, y_val = train_test_split(x_train, y_train, train_size=0.9, random_state=233)



In [15]:
# Training callbacks: write the best weights (by validation accuracy) to disk, stop after
# 5 epochs without improvement, and run the project-local RocAucEvaluation callback on the
# validation split with interval=1.
filepath = "weights_base.best.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_acc',
    verbose=1,
    save_best_only=True,
    mode='max',
)
early = EarlyStopping(
    monitor="val_acc",
    mode="max",
    patience=5,
)
ra_val = RocAucEvaluation(
    validation_data=(X_val, y_val),
    interval = 1,
)
callbacks_list = [ra_val, checkpoint, early]

In [18]:
# h5py is not referenced directly in this cell.
# NOTE(review): presumably imported to confirm HDF5 support for the ".hdf5" checkpoint is
# installed; confirm, and consider moving it to the top import cell.
import h5py

# Train on the 90% split and validate on the held-out 10%, with the callbacks defined above.
model.fit(
    X_tra,
    y_tra,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
    callbacks=callbacks_list,
    verbose=1,
)

  from ._conv import register_converters as _register_converters


Train on 143613 samples, validate on 15958 samples
Epoch 1/4
 16256/143613 [==>...........................] - ETA: 3:44 - loss: 0.0423 - acc: 0.9836

KeyboardInterrupt: 

In [None]:
# Candidate values sampled by the randomized hyper-parameter search, one entry per
# estimator parameter. Each list is converted to a numpy array.
_search_space = [
    ('n_estimators', [5, 10, 15, 20, 25]),
    ('max_depth', [5, 10, 15, 20, 25]),
    ('subsample', [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]),
    ('colsample_bytree', [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]),
    ('learning_rate', [0.01, 0.05, 0.10, 0.20, 0.30, 0.40]),
    ('gamma', [0.00, 0.05, 0.10, 0.15, 0.20]),
    ('scale_pos_weight', [30, 40, 50, 300, 400, 500, 600, 700]),
]
param_grid = {name: np.array(candidates) for name, candidates in _search_space}

In [None]:
# Base estimator for the search. `xgb` was referenced without ever being created in this
# notebook, so instantiate the imported XGBClassifier here; the hyper-parameters listed in
# `param_grid` are overridden by the search for every sampled candidate.
xgb = XGBClassifier()

# Randomized search: 5 sampled parameter settings, scored by ROC AUC.
randomized = RandomizedSearchCV(xgb, param_distributions=param_grid,
                                n_iter=5, scoring='roc_auc')

In [None]:
# Run the randomized search against the first label column only (y_train[:, 0]).
# NOTE(review): `tf_idf_x_train` is not defined in any visible cell, so this cell fails on a
# fresh kernel; the cell that builds the TF-IDF features needs to be restored before it.
randomized.fit(tf_idf_x_train, y_train[:, 0])

In [None]:
# Load previously saved predictions. The cells below treat the first len(train_texts) rows
# as predictions on the training set and the remaining rows as predictions on the test set.
outputs = pd.read_csv('outputs/xgb_outputs.csv')
# Preview the first rows.
outputs.head()

In [None]:
# Re-create the dataset wrapper for the post-processing cells. Reuses `desktop_data_dir`
# (same value as the previously duplicated literal) so the data location lives in one place.
toxic_comments_dataset = ToxicCommentsDataset(data_dir=desktop_data_dir,
                                              train_csv_file='train.csv',
                                              test_csv_file='test.csv')

In [None]:
# Fetch the train texts, train labels and test texts; the text collections are used below
# only for their lengths, to split `outputs` into train and test rows.
train_texts, y_train, test_texts = toxic_comments_dataset.get_texts_and_train_labels()

In [None]:
# Rows after the first len(train_texts) rows are taken as the predictions on the test set.
_n_train_rows = len(train_texts)
predictions_on_test = outputs.iloc[_n_train_rows:]

In [None]:
# Sanity check (displayed as the cell output): one prediction row per test text.
len(predictions_on_test) == len(test_texts)

In [None]:
# Identifiers of the test rows: the first column of test.csv. Only that column is parsed
# (usecols=[0]); the path reuses `desktop_data_dir` instead of repeating the literal.
test_idx = pd.read_csv(os.path.join(desktop_data_dir, 'test.csv'), usecols=[0]).iloc[:, 0]

In [None]:
# Rotate the column order so that the last column comes first; `columns` is reused later
# for the train predictions.
# NOTE(review): presumably the last column of `outputs` is 'id'; confirm.
_current_order = predictions_on_test.columns.tolist()
columns = _current_order[-1:] + _current_order[:-1]
predictions_on_test = predictions_on_test[columns]

In [None]:
# Set the 'id' column to the test identifiers. `assign` returns a new frame, so nothing is
# written into a frame derived from `outputs` (which triggers SettingWithCopyWarning);
# an existing 'id' column keeps its position, otherwise the column is appended.
predictions_on_test = predictions_on_test.assign(id=test_idx.values)
predictions_on_test.to_csv('outputs/xgb_outputs_test_with_idx.csv', index=False)

In [None]:
# The first len(train_texts) rows are taken as the predictions on the training set.
_n_train_rows = len(train_texts)
predictions_on_train = outputs.iloc[:_n_train_rows]
# Sanity check (displayed as the cell output): one prediction row per train text.
len(predictions_on_train) == _n_train_rows

In [None]:
# Identifiers of the train rows: the first column of train.csv. Only that column is parsed
# (usecols=[0]); the path reuses `desktop_data_dir` instead of repeating the literal.
train_idx = pd.read_csv(os.path.join(desktop_data_dir, 'train.csv'), usecols=[0]).iloc[:, 0]

In [None]:
# Set the 'id' column to the train identifiers. `assign` returns a new frame, so nothing is
# written into a slice of `outputs` (which triggers SettingWithCopyWarning).
predictions_on_train = predictions_on_train.assign(id=train_idx.values)
# Apply the same column order that was computed for the test predictions.
predictions_on_train = predictions_on_train[columns]
predictions_on_train.to_csv('outputs/xgb_outputs_train_with_idx.csv', index=False)