In [3]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd
# Extend the import path with the parent directory (presumably where the `utils`
# package imported further down lives — confirm), then pin the random seeds.
sys.path.append('../')
# NOTE(review): PYTHONHASHSEED is read when the interpreter starts, so assigning it
# here does not change hash randomization of this already-running kernel; only child
# processes inherit it. Export it before launching Jupyter if it is needed.
os.environ['PYTHONHASHSEED'] = '0'
np.random.seed(42)
# TensorFlow is imported after the NumPy seed is set and is given its own seed.
import tensorflow as tf
tf.set_random_seed(42)
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
# %matplotlib inline
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D, Flatten, BatchNormalization
from keras.layers import GlobalAvgPool1D, concatenate
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.optimizers import Adam, Nadam

# custom imports
from utils.custom_keras import toggle_train, all_layers_train
from utils.metrics import accuracy
from utils.metrics import columnwise_auc

# Load the labelled training data.
train = pd.read_csv('../../data/train.csv')

# Train cutdown: keep a reproducible 10% sample (just for testing purposes; the
# amount of data can be changed here).
train = train.sample(frac=0.1, random_state=42)

# Separate the label columns into the target matrix.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
y = train[list_classes].values


## CREATE MY SPLIT HERE
# Hold out 10% of the rows as the test set ...
X_train, X_test, y_train, y_test = train_test_split(train, y, test_size=0.1,
                                                    random_state=42)

# ... then 10% of the remaining rows as the validation set.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1,
                                                  random_state=42)

# Raw comment text per split. pandas reads an empty CSV field as NaN (a float), which
# the Keras Tokenizer cannot lower-case/split, so missing comments become "".
list_sentences_train = X_train["comment_text"].fillna("")
list_sentences_val = X_val["comment_text"].fillna("")
list_sentences_test = X_test["comment_text"].fillna("")

# Vocabulary cap for the tokenizer and the fixed length of every padded sequence.
max_features = 20000
maxlen = 200

# Build the word index from the training comments only.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))

# Convert each split's comments to sequences of word indices.
list_tokenized_train, list_tokenized_val, list_tokenized_test = [
    tokenizer.texts_to_sequences(texts)
    for texts in (list_sentences_train, list_sentences_val, list_sentences_test)
]

# Pad/truncate every sequence to maxlen. Note that X_val is rebound here from the
# validation DataFrame to the padded validation array.
X_t, X_val, X_te = [
    pad_sequences(sequences, maxlen=maxlen)
    for sequences in (list_tokenized_train, list_tokenized_val, list_tokenized_test)
]


# Get the embeddings and read them into a dict (word: vector); the dict is then used
# to build the embedding matrix, with random init for words that are not in it.
embed_size = 200
embedding_file = "../glove.6B.200d.txt"


def get_coefs(word, *arr):
    """Return ``(word, vector)``, where vector holds the remaining arguments as float32."""
    vector = np.asarray(arr, dtype='float32')
    return word, vector
# Parse the embedding text file into {word: float32 vector}. The file is opened with
# an explicit UTF-8 encoding and closed when the block exits; blank lines are skipped
# because get_coefs needs at least the word field.
embeddings_index = {}
with open(embedding_file, encoding='utf-8') as glove_file:
    for line in glove_file:
        fields = line.strip().split()
        if not fields:
            continue
        word, vector = get_coefs(*fields)
        embeddings_index[word] = vector

# np.stack requires a real sequence; a dict view is rejected by current NumPy.
all_embs = np.stack(list(embeddings_index.values()))
emb_mean, emb_std = all_embs.mean(), all_embs.std()
word_index = tokenizer.word_index
# Tokenizer word indices start at 1 (0 is never assigned to a word), so the matrix
# needs one more row than the largest usable index: vocab size + 1, capped at
# max_features. Sizing it as len(word_index) left the last word without a row
# whenever the vocabulary was smaller than max_features.
nb_words = min(max_features, len(word_index) + 1)

# init with random ones for words not seen
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))

# Overwrite the random rows with the pretrained vector wherever one exists.
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector


def get_callbacks(filepath, patience=2):
    """Build the training callbacks.

    Returns a two-element list: an EarlyStopping callback monitoring ``val_loss``
    (mode "min") with the given ``patience``, followed by a ModelCheckpoint that
    writes to ``filepath`` with ``save_best_only=True``.
    """
    return [
        EarlyStopping(monitor='val_loss', patience=patience, mode="min"),
        ModelCheckpoint(filepath, save_best_only=True),
    ]
# Checkpoint destination and the callback list built from it.
# NOTE(review): the model.fit call in the training cell does not pass `callbacks`
# (it only appears in a trailing comment there), so as written neither early
# stopping nor checkpointing is active — confirm this is intended.
file_path = "bidir_glov_reg001_embed6B200d_3e_multiTRAIN.hdf5"
callbacks = get_callbacks(filepath=file_path, patience=50)


# model building

# first iteration, embeddings train = False, train for 2 epochs
inp = Input(shape=(maxlen, ))  # maxlen=200 as defined earlier
# The layer's vocabulary size is taken from the weight matrix itself rather than from
# max_features: the matrix can have fewer rows when the vocabulary is small, and
# Keras rejects initial weights whose shape does not match the layer.
x = Embedding(embedding_matrix.shape[0], embed_size, weights=[embedding_matrix],
              name='embedding', trainable=False)(inp)
# Two stacked bidirectional LSTMs, both returning the full sequence so the pooling
# layers below can reduce over the time axis.
x = Bidirectional(LSTM(120, return_sequences=True, name='lstm_layer1',
                       dropout=0.1, recurrent_dropout=0.1,
                       kernel_regularizer=regularizers.l2(0.001),
                       bias_regularizer=regularizers.l2(0.001)))(x)
x = Bidirectional(LSTM(120, return_sequences=True, name='lstm_layer2',
                       dropout=0.1, recurrent_dropout=0.1,
                       kernel_regularizer=regularizers.l2(0.001),
                       bias_regularizer=regularizers.l2(0.001)))(x)
# Max- and average-pool over the sequence and concatenate both summaries.
maxpool = GlobalMaxPool1D()(x)
avgpool = GlobalAvgPool1D()(x)
concat = concatenate([maxpool, avgpool])
# Dense head: two L2-regularized 128-unit ReLU layers with dropout.
x = BatchNormalization()(concat)
x = Dense(128, activation="relu",
          kernel_regularizer=regularizers.l2(0.001),
          bias_regularizer=regularizers.l2(0.001))(x)
x = Dropout(0.3)(x)
x = BatchNormalization()(x)
x = Dense(128, activation="relu",
          kernel_regularizer=regularizers.l2(0.001),
          bias_regularizer=regularizers.l2(0.001))(x)
x = Dropout(0.3)(x)
# One sigmoid output per class column.
x = Dense(6, activation="sigmoid")(x)

model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy',
              optimizer=Adam(lr=0.001),
              metrics=['accuracy'])
model.summary()


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 200)          0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 200, 200)     4000000     input_2[0][0]                    
__________________________________________________________________________________________________
bidirectional_3 (Bidirectional) (None, 200, 240)     308160      embedding[0][0]                  
__________________________________________________________________________________________________
bidirectional_4 (Bidirectional) (None, 200, 240)     346560      bidirectional_3[0][0]            
__________________________________________________________________________________________________
global_max

In [4]:
# training
batch_size = 32

# Train everything but the embedding (that layer was built with trainable=False),
# validating on the validation split after each epoch.
# Not passed here: callbacks=callbacks
model.fit(
    X_t,
    y_train,
    batch_size=batch_size,
    epochs=2,
    shuffle=True,
    validation_data=(X_val, y_val),
)


# train only the embedding
# model = toggle_train(model)
# model.compile(loss='binary_crossentropy',
#                   optimizer='adam',
#                   metrics=['accuracy'])
# model.summary()
# model.fit(X_t,y_train, batch_size=batch_size, epochs=1,
#           shuffle=True) # callbacks=callbacks , validation_data=(X_val, y_val)


# # train everything
# model = all_layers_train(model)
# model.compile(loss='binary_crossentropy',
#                   optimizer='adam',
#                   metrics=['accuracy'])
# model.summary()
# model.fit(X_t,y_train, batch_size=batch_size, epochs=1,
#           shuffle=True) # callbacks=callbacks , validation_data=(X_val, y_val)


# model.save("experimental_glov_reg001_embed6B200d_2e_multiTRAIN.hdf5")


Train on 12924 samples, validate on 1437 samples
Epoch 1/2
  544/12924 [>.............................] - ETA: 9:08 - loss: 1.7951 - acc: 0.6314

KeyboardInterrupt: 

In [None]:
# Evaluate on the held-out test split (X_te, y_test): accuracy and column-wise AUC.
# NOTE(review): `accuracy` and `columnwise_auc` are imported from utils.metrics, which
# is not visible here — presumably both take (true labels, predicted probabilities);
# confirm against their definitions.


probs = model.predict(X_te)


acc = accuracy(y_test, probs)
print("acc", acc)
mean_col_auc = columnwise_auc(y_test, probs)
print("mean col", mean_col_auc)
    

# just putting the numbers from prev runs here
"""
Baseline AUC:


"""


In [None]:
# submission prediction
# NOTE(review): no active code in the visible cells writes this weights file (the
# model.save call with the same name is commented out) — confirm it exists from an
# earlier run, otherwise load_weights fails here.
model.load_weights("experimental_glov_reg001_embed6B200d_2e_multiTRAIN.hdf5")

sub = pd.read_csv('../../data/test.csv')
# pandas reads an empty CSV field as NaN (a float), which the Keras Tokenizer cannot
# process; treat a missing comment as empty text so it becomes an all-padding row.
list_tokenized_sub = tokenizer.texts_to_sequences(sub["comment_text"].fillna(""))
X_sub = pad_sequences(list_tokenized_sub, maxlen=maxlen)

y_sub = model.predict(X_sub)

# Fill the class columns of the sample submission with the predictions and save it.
sample_submission = pd.read_csv("../../data/sample_submission.csv")
sample_submission[list_classes] = y_sub

sample_submission.to_csv("experimental_glov_reg001_embed6B200d_2e_multiTRAIN.csv", index=False)