In [1]:
import sys, os, re, csv, codecs, numpy as np, pandas as pd
# Extend the import path and pin the random seeds. The seeding statements are
# deliberately interleaved with the imports so that they run before Keras is
# imported; keep this ordering when editing the cell.
# '../' is presumably added so project-local packages one directory up can be
# imported -- TODO confirm.
sys.path.append('../')
# NOTE(review): PYTHONHASHSEED is read when the interpreter starts, so setting
# it here most likely does not affect hashing in the already-running kernel
# (only in child processes) -- confirm, and export it before launching Jupyter
# if hash determinism is actually required.
os.environ['PYTHONHASHSEED'] = '0'
np.random.seed(42)
import tensorflow as tf
# Graph-level seed for the TensorFlow 1.x API.
tf.set_random_seed(42)
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
# %matplotlib inline
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Conv1D
from keras.layers import Bidirectional, GlobalMaxPool1D, Flatten, BatchNormalization
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.optimizers import Adam

# Load the labelled data and keep a reproducible 20% sample of the rows.
# The cut-down is only there to speed up experiments; change `frac` to train
# on more (or all) of the data.
train = pd.read_csv('../../data/train.csv').sample(frac=0.2, random_state=42)

# Separate the label columns (one binary column per class) from the frame.
list_classes = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]
y = train[list_classes].values

# Two chained splits: first hold out 10% of the sample as the test set, then
# hold out 10% of the remainder as the validation set.
X_train, X_test, y_train, y_test = train_test_split(
    train, y, test_size=0.1, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42
)

# Only the raw comment text is used as model input.
list_sentences_train, list_sentences_val, list_sentences_test = (
    frame["comment_text"] for frame in (X_train, X_val, X_test)
)

# Tokenize the text and convert it to fixed-length integer sequences.
max_features = 20000  # vocabulary cap passed to the tokenizer (num_words)
maxlen = 200          # every sequence is padded/truncated to this length

# The vocabulary is fitted on the training split only, then reused unchanged
# for the validation and test splits.
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(list_sentences_train))

list_tokenized_train, list_tokenized_val, list_tokenized_test = (
    tokenizer.texts_to_sequences(texts)
    for texts in (list_sentences_train, list_sentences_val, list_sentences_test)
)

# NOTE: `X_val` is rebound here -- it was the validation DataFrame and becomes
# the padded validation array that is passed to `model.fit` later on.
X_t, X_val, X_te = (
    pad_sequences(sequences, maxlen=maxlen)
    for sequences in (list_tokenized_train, list_tokenized_val, list_tokenized_test)
)

# Model definition: embedding -> LSTM -> max-pool over time -> small dense head.

embed_size = 128
# `shape` leaves out the batch axis (Keras adds it); each sample is a vector of
# `maxlen` token ids. The trailing comma only makes this a 1-tuple.
inp = Input(shape=(maxlen, ))
embedded = Embedding(max_features, embed_size)(inp)
# return_sequences=True keeps the output of every timestep so the pooling layer
# can take the maximum over the sequence axis.
lstm_out = LSTM(60, return_sequences=True, name='lstm_layer')(embedded)
pooled = GlobalMaxPool1D()(lstm_out)
normed = BatchNormalization()(pooled)
dropped = Dropout(0.1)(normed)
hidden = Dense(50, activation="relu")(dropped)
hidden_dropped = Dropout(0.1)(hidden)
# Six independent sigmoid outputs, one per entry of `list_classes`.
x = Dense(6, activation="sigmoid")(hidden_dropped)

model = Model(inputs=inp, outputs=x)
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

model.summary()

# Alternatives that were tried and set aside:
# x = Bidirectional(LSTM(60, return_sequences=True, name='lstm_layer'))(x)
# Adam(lr=0.0001)

def get_callbacks(filepath, patience=2):
    """Build the list of Keras callbacks used for training.

    Args:
        filepath: Path handed to ``ModelCheckpoint``.
        patience: Patience value handed to ``EarlyStopping``.

    Returns:
        A two-element list ``[EarlyStopping, ModelCheckpoint]``: early stopping
        on ``val_loss`` (lower is better) and a checkpoint that is created with
        ``save_best_only=True``.
    """
    return [
        EarlyStopping(monitor='val_loss', patience=patience, mode="min"),
        ModelCheckpoint(filepath, save_best_only=True),
    ]
# Checkpoint target, relative to the notebook's working directory.
file_path = "model_weights.hdf5"
# patience=50 is far above the 2 epochs used in the training cell, so early
# stopping cannot trigger there; only the checkpoint callback has any effect.
callbacks = get_callbacks(file_path, patience=50)

print("complete")

  (fname, cnt))
  (fname, cnt))
  return f(*args, **kwds)
  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 200)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 200, 128)          2560000   
_________________________________________________________________
lstm_layer (LSTM)            (None, 200, 60)           45360     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 60)                0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 60)                240       
_________________________________________________________________
dropout_1 (Dropout)          (None, 60)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 50)                3050      
__________

In [2]:
# training
batch_size = 32
epochs = 2
# Pass the callbacks built in the setup cell: without them ModelCheckpoint never
# writes `file_path`, and the submission cell's `model.load_weights(file_path)`
# then fails (or silently loads weights left over from an earlier run).
# Keeping the returned History object allows the loss curves to be inspected.
history = model.fit(X_t, y_train, batch_size=batch_size, epochs=epochs,
                    validation_data=(X_val, y_val), shuffle=True,
                    callbacks=callbacks)




Train on 25849 samples, validate on 2873 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7fdcbc346898>

In [None]:
# Evaluate on the held-out test split (X_te, y_test): accuracy and the
# column-wise AUC.
# NOTE(review): `accuracy` and `columnwise_auc` are project-local helpers; how
# they threshold or average is not visible from this notebook -- confirm.
from utils.metrics import accuracy, columnwise_auc

# Per-class probabilities from the sigmoid output layer.
probs = model.predict(X_te)

acc = accuracy(y_test, probs)
mean_col_auc = columnwise_auc(y_test, probs)

print("acc", acc)
print("mean col", mean_col_auc)
    


In [None]:
# submission prediction
# Load the weights stored at `file_path` before predicting.
model.load_weights(file_path)

sub = pd.read_csv('../../data/test.csv')

# The model expects padded integer sequences of length `maxlen`, not the raw
# DataFrame, so the submission text goes through the same fitted tokenizer and
# padding as the training data.
# Assumes test.csv has a "comment_text" column like train.csv -- TODO confirm.
# Missing comments are replaced by an empty string so the tokenizer does not
# receive NaN; they end up as all-padding sequences.
list_sentences_sub = sub["comment_text"].fillna("")
list_tokenized_sub = tokenizer.texts_to_sequences(list_sentences_sub)
X_sub = pad_sequences(list_tokenized_sub, maxlen=maxlen)
y_sub = model.predict(X_sub)

# NOTE(review): this assumes sample_submission.csv lists its rows in the same
# order as test.csv -- confirm before submitting.
sample_submission = pd.read_csv("../../data/sample_submission.csv")
sample_submission[list_classes] = y_sub

sample_submission.to_csv("baseline_bongo.csv", index=False)