In [1]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd
# set path/seeds
sys.path.append('../')
os.environ['PYTHONHASHSEED'] = '0'
np.random.seed(42)
import tensorflow as tf
tf.set_random_seed(42)
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
# %matplotlib inline
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D, Flatten, BatchNormalization
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.callbacks import EarlyStopping, ModelCheckpoint, Callback
from keras.optimizers import Adam

# custom imports
from utils.custom_keras import toggle_train, all_layers_train, ROC_Eval
from utils.metrics import accuracy
from utils.metrics import columnwise_auc

# get data
train = pd.read_csv('../../data/train.csv')

# train cutdown (just for testing purposes, can cut down amount of data here)
train = train.sample(frac=0.1, random_state=42)

# seperate classes
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values


## CREATE MY SPLIT HERE
X_train, X_test, y_train, y_test= train_test_split(train, y, test_size = 0.1, 
                                                random_state=42)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.1,
                                                 random_state=42)

list_sentences_train = X_train["comment_text"]
list_sentences_val = X_val["comment_text"]
list_sentences_test = X_test["comment_text"]

# tokenize/index
max_features = 20000
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))
list_tokenized_train = tokenizer.texts_to_sequences(list_sentences_train)
list_tokenized_val = tokenizer.texts_to_sequences(list_sentences_val)
list_tokenized_test = tokenizer.texts_to_sequences(list_sentences_test)

# get single sequence length
maxlen = 250
X_t = pad_sequences(list_tokenized_train, maxlen=maxlen)
X_val = pad_sequences(list_tokenized_val, maxlen=maxlen)
X_te = pad_sequences(list_tokenized_test, maxlen=maxlen)


# get gembeddings and read into a dict (word: vector)
# then use the embeddings to make embedding matrix, random init for ones not in vocab
embedding_file = "../glove.6B.200d.txt"
embed_size = 200

embeddings_index = pd.read_table(embedding_file, sep=" ", index_col=0, header=None, quoting=csv.QUOTE_NONE)


all_embs = np.stack(embeddings_index.values)
emb_mean,emb_std = all_embs.mean(), all_embs.std()
word_index = tokenizer.word_index
nb_words = min(len(word_index), max_features)

# init with random ones for words not seen
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))

for word, i in word_index.items():
    if i >= max_features: continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector
        


# model building

# first iteration, embeddings train = False, train for 2 epochs
inp = Input(shape=(maxlen, )) #maxlen=200 as defined earlier
x = Embedding(max_features, embed_size, weights=[embedding_matrix],
              name = 'embedding', trainable=False)(inp)
x = LSTM(60, return_sequences=True, name='lstm_layer',
         dropout=0.1, recurrent_dropout=0.1,
          kernel_regularizer=regularizers.l2(0.001),
          bias_regularizer=regularizers.l2(0.001))(x)
x = GlobalMaxPool1D()(x)
x = BatchNormalization()(x)
x = Dense(50, activation="relu",
          kernel_regularizer=regularizers.l2(0.001),
          bias_regularizer=regularizers.l2(0.001))(x)
x = Dropout(0.1)(x)
x = Dense(6, activation="sigmoid")(x)

model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
model.summary()


  (fname, cnt))
  (fname, cnt))
  return f(*args, **kwds)
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 250)               0         
_________________________________________________________________
embedding (Embedding)        (None, 250, 200)          4000000   
_________________________________________________________________
lstm_layer (LSTM)            (None, 250, 60)           62640     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 60)                0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 60)                240       
_________________________________________________________________
dense_1 (Dense)              (None, 50)                3050      
_________________________________________________________________
dropout_1 (Dropout)          (None, 50)                0         
__________

In [2]:
# training
batch_size = 64

# call backs
file_path = "bidir_glov_reg001_embed6B200d_3e_multiTRAIN.hdf5"
es = EarlyStopping('val_loss', patience=5, mode="min")
msave = ModelCheckpoint(file_path, save_best_only=True)
roc = ROC_Eval(X_val, y_val)

# train everything but embedding
hist = model.fit(X_t,y_train, batch_size=batch_size, epochs=2,
          shuffle=True, callbacks=[es, msave, roc] , validation_data=(X_val, y_val))


# CAN LOWER THE LEARNING RATE MANUALLY ON THESE NEXT EPOCHS! ITS NOT LEARNING DICK I DONT THINK
# EVEN PARTWAY THROUGH THE SECOND?

# ALSO, COULD ALSO COMBINE THE LAST TWO? I.E. NOT HAVE A SEPERATE ONE WHERE WE JUST TRAIN
# THE EMBEDDING LAYER, BUT JUST HAVE ONE WHERE WE TRAIN EVERYTHING?








# # train only the embedding
# model = toggle_train(model)
# model.compile(loss='binary_crossentropy',
#                   optimizer='adam',
#                   metrics=['accuracy'])
# model.summary()
# model.fit(X_t,y_train, batch_size=batch_size, epochs=1,
#           shuffle=True) # callbacks=callbacks , validation_data=(X_val, y_val)


# # train everything
# model = all_layers_train(model)
# model.compile(loss='binary_crossentropy',
#                   optimizer='adam',
#                   metrics=['accuracy'])
# model.summary()
# model.fit(X_t,y_train, batch_size=batch_size, epochs=1,
#           shuffle=True) # callbacks=callbacks , validation_data=(X_val, y_val)


# model.save("bidir_glov_reg001_embed6B200d_3e_multiTRAIN.hdf5")


Train on 12924 samples, validate on 1437 samples
Epoch 1/2
 ROC-AUC - score: 0.807847
Epoch 2/2
 ROC-AUC - score: 0.894225


In [3]:
print(hist.history)
print(roc.aucs)

{'val_loss': [0.26782124882036396, 0.18430817811623496], 'val_acc': [0.9632335930710131, 0.969612609452478], 'loss': [0.4443782192935371, 0.22026551213474033], 'acc': [0.927692660954079, 0.9704554782899605]}
[0.8078472321660612, 0.8942250338662338]


In [None]:
# test on test data here X_te, y_test   Need to do the columwise AUC here and acc


probs = model.predict(X_te)


acc = accuracy(y_test, probs)
print("acc", acc)
mean_col_auc = columnwise_auc(y_test, probs)
print("mean col", mean_col_auc)
    


In [None]:
# submission prediction
model.load_weights("bidir_glov_reg001_embed6B200d_3e_multiTRAIN.hdf5")

sub = pd.read_csv('../../data/test.csv')
list_tokenized_sub = tokenizer.texts_to_sequences(sub["comment_text"])
X_sub = pad_sequences(list_tokenized_sub, maxlen=maxlen)

y_sub = model.predict(X_sub)

sample_submission = pd.read_csv("../../data/sample_submission.csv")
sample_submission[list_classes] = y_sub

sample_submission.to_csv("bidir_glov_reg001_embed6B200d_3e_multiTRAIN.csv", index=False)