In [0]:
import tensorflow as tf

# Abort early unless TensorFlow reports the first GPU device for this runtime.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
  print('Found GPU at: {}'.format(device_name))
else:
  raise SystemError('GPU device not found')

# Install a Drive FUSE wrapper.
# https://github.com/astrada/google-drive-ocamlfuse
!apt-get update -qq 2>&1 > /dev/null
!apt-get install -y -qq software-properties-common python-software-properties module-init-tools
!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
!apt-get update -qq 2>&1 > /dev/null
!apt-get -y install -qq google-drive-ocamlfuse fuse

# Generate auth tokens for Colab
from google.colab import auth
auth.authenticate_user()

# Generate creds for the Drive FUSE library.
from oauth2client.client import GoogleCredentials
creds = GoogleCredentials.get_application_default()
import getpass
# Work around misordering of STREAM and STDIN in Jupyter.
# https://github.com/jupyter/notebook/issues/3159
prompt = !google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
vcode = getpass.getpass(prompt[0] + '\n\nEnter verification code: ')
!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}

# Create a directory and mount Google Drive using that directory.
!mkdir -p drive
!google-drive-ocamlfuse drive

print('Files in Drive:')
!ls drive/

Found GPU at: /device:GPU:0
E: Package 'python-software-properties' has no installation candidate
Please, open the following URL in a web browser: https://accounts.google.com/o/oauth2/auth?client_id=32555940559.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&response_type=code&access_type=offline&approval_prompt=force

Enter verification code: ··········
Please, open the following URL in a web browser: https://accounts.google.com/o/oauth2/auth?client_id=32555940559.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&response_type=code&access_type=offline&approval_prompt=force
Please enter the verification code: Access token retrieved correctly.
Files in Drive:
 achievement-habit-the-roth-bernard.pdf     MNNIT-F20181227215601383.pdf
 attendance_apps			    Movies
'Colab Notebooks'			   'Mummy weds Papa'
 Cverification_Whitepaper.p

In [0]:
ls

[0m[01;34mbaselines[0m/        LICENSE    train_bgru.ipynb                train_rhn.ipynb
clean_data.ipynb  Readme.md  train_char_level.ipynb
[01;34mDataset[0m/          [01;34msotoxic[0m/   train_cleaned_word_level.ipynb
[01;34mfeatures[0m/         [01;34mtools[0m/     train_meta_label.ipynb


In [0]:
########################################
## import packages
########################################
import os
import re
import csv
import codecs
import numpy as np
import pandas as pd
import operator
import sys

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

import sotoxic.models.keras.model_zoo as model_zoo
from sotoxic.train import trainer
from sotoxic.data_helper.data_loader import DataLoader

Using TensorFlow backend.


In [0]:
# Directory that holds the cleaned train/test CSV files.
path = 'Dataset/'

# Pre-trained word-vector file in use; the commented lines are alternative files.
EMBEDDING_FILE = 'features/crawl-300d-2M.vec'
#EMBEDDING_FILE='features/glove.840B.300d.txt'
#EMBEDDING_FILE='features/glove.twitter.27B.200d.txt'

TRAIN_DATA_FILE, TEST_DATA_FILE = (
    path + csv_name for csv_name in ('cleaned_train.csv', 'cleaned_test.csv')
)

MAX_SEQUENCE_LENGTH = 400   # `maxlen` handed to pad_sequences
MAX_NB_WORDS = 100000       # `num_words` for the Tokenizer and cap on embedding rows
EMBEDDING_DIM = 300         # number of columns in the embedding matrix

In [0]:
# Read the train CSV first, then the test CSV, into DataFrames.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (TRAIN_DATA_FILE, TEST_DATA_FILE)
)

# Load the pre-trained vectors; the result is later queried with `.get(word)`.
data_loader = DataLoader()
embeddings_index = data_loader.load_embedding(EMBEDDING_FILE)

Total 2000000 word vectors.


In [0]:
########################################
## process texts in datasets
########################################
print('Processing text dataset')
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# Missing comments are replaced by the placeholder "no comment".
list_sentences_train = train_df["comment_text"].fillna("no comment").values
list_sentences_test = test_df["comment_text"].fillna("no comment").values
# Label matrix: one column per entry of list_classes.
y = train_df[list_classes].values

# Plain Python lists so the two corpora can be concatenated for fitting.
comments = list(list_sentences_train)
test_comments = list(list_sentences_test)

#tokenizer = Tokenizer(num_words=MAX_NB_WORDS, filters='"#%&()+,-./:;<=>@[\\]^_`{|}~\t\n')
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)

# The vocabulary is fitted on the train and test comments together.
tokenizer.fit_on_texts(comments + test_comments)

sequences, test_sequences = (
    tokenizer.texts_to_sequences(texts) for texts in (comments, test_comments)
)

word_index = tokenizer.word_index
print('Found %s unique tokens' % len(word_index))

# Bring every sequence to MAX_SEQUENCE_LENGTH via pad_sequences.
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', y.shape)
print('Shape of test_data tensor:', test_data.shape)

Processing text dataset
Found 332116 unique tokens
Shape of data tensor: (159571, 400)
Shape of label tensor: (159571, 6)
Shape of test_data tensor: (153164, 400)


In [0]:
########################################
## prepare embeddings
########################################
print('Preparing embedding matrix')
# Keras Tokenizer indices start at 1 (index 0 is never assigned to a word), so
# a vocabulary smaller than the cap needs len(word_index) + 1 rows; otherwise
# the highest word index would fall outside the matrix.
nb_words = min(MAX_NB_WORDS, len(word_index) + 1)
#embedding_matrix = np.random.normal(loc=matrix_mean, scale=matrix_std, size=(nb_words, EMBEDDING_DIM))
embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
null_count = 0
for word, i in word_index.items():
    # Skip every index that has no row in the matrix.
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
    else:
        # words not found in embedding index will be all-zeros.
        null_count += 1
print('Null word embeddings: %d' % null_count)

Preparing embedding matrix
Null word embeddings: 24105


In [0]:
def get_model():
    """Return the model built by `model_zoo.get_dropout_bi_gru`.

    The call uses the notebook-level vocabulary size (`nb_words`), embedding
    width, embedding matrix and sequence length, with `out_size=6`. This
    function is handed to `train_folds` as `get_model_func`.
    """
    model = model_zoo.get_dropout_bi_gru(
        nb_words,
        EMBEDDING_DIM,
        embedding_matrix,
        MAX_SEQUENCE_LENGTH,
        out_size=6,
    )
    return model

# Trainer settings used for the cross-validated run.
keras_model_trainer = trainer.KerasModelTrainer(
    model_stamp='dropout_bi_gru',
    epoch_num=8,
    learning_rate=1e-3,
)

In [0]:
# 5-fold training run; the four returned values are reused by the reporting
# and prediction cells that follow.
models, val_loss, total_auc, fold_predictions = keras_model_trainer.train_folds(
    data,
    y,
    fold_count=5,
    batch_size=256,
    get_model_func=get_model,
)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Deprecated in favor of operator or tf.math.divide.
Train on 127657 samples, validate on 31914 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
AUC Score 0.990209129571145
Train on 127657 samples, validate on 31914 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
AUC Score 0.9890318491810173
Train on 127657 samples, validate on 31914 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
AUC Score 0.9900564970566146
Train on 127657 samples, validate on 31914 samples
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
AUC Score 0.9895665829033845
Train on 127656 samples, validate on 31915 sampl

In [0]:
# Report the aggregate validation loss and AUC returned by train_folds.
summary_fields = ("Overall val-loss:", val_loss, "AUC", total_auc)
print(*summary_fields)

Overall val-loss: 0.03882668310386659 AUC 0.9892441929070168


## Predictions

In [0]:
# Stack the per-fold prediction arrays along the first axis (np.concatenate's default).
# NOTE(review): row order follows fold order; presumably this matches the row order of `y` - confirm.
train_fold_preditcions = np.concatenate(fold_predictions)

In [0]:
from sklearn.metrics import roc_auc_score

# ROC AUC of the concatenated fold predictions against the full label matrix.
training_auc = roc_auc_score(y, train_fold_preditcions)
print("Training AUC", training_auc)

Training AUC 0.9884331287570647


In [0]:
CLASSES = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
submit_path_prefix = "predict_path"
# Directory for the raw per-fold prediction arrays.
fold_predict_dir = "predict_path"

print("Predicting testing results...")
# np.save does not create missing directories.
os.makedirs(fold_predict_dir, exist_ok=True)

test_predicts_list = []
for fold_id, model in enumerate(models):
    test_predicts = model.predict(test_data, batch_size=256, verbose=1)
    test_predicts_list.append(test_predicts)
    # One file per fold. The previous target "predict_path/" resolved to the
    # single hidden file "predict_path/.npy", which every fold overwrote.
    fold_file = os.path.join(fold_predict_dir, "fold{}.npy".format(fold_id))
    np.save(fold_file, test_predicts)

# Average the fold predictions element-wise; float64 accumulation matches the
# former np.zeros-based running sum.
test_predicts = np.mean(test_predicts_list, axis=0, dtype=np.float64)

test_ids = test_df["id"].values
test_ids = test_ids.reshape((len(test_ids), 1))

# Assemble the submission frame with "id" first, followed by the class columns.
test_predicts = pd.DataFrame(data=test_predicts, columns=CLASSES)
test_predicts["id"] = test_ids
test_predicts = test_predicts[["id"] + CLASSES]
# `{:4f}` is a minimum field width of 4 with the default six decimals.
submit_path = submit_path_prefix + "-L{:4f}-A{:4f}.csv".format(val_loss, total_auc)
test_predicts.to_csv(submit_path, index=False)

Predicting testing results...


In [0]:
print("Predicting training results...")

# Column vector of training ids, one row per training example.
train_ids = train_df["id"].values.reshape(-1, 1)

# Frame of concatenated fold predictions, with "id" moved to the front.
train_predicts = pd.DataFrame(data=train_fold_preditcions, columns=CLASSES)
train_predicts["id"] = train_ids
ordered_columns = ["id"] + CLASSES
train_predicts = train_predicts[ordered_columns]

score_suffix = "-Train-L{:4f}-A{:4f}.csv".format(val_loss, training_auc)
submit_path = submit_path_prefix + score_suffix
train_predicts.to_csv(submit_path, index=False)

Predicting training results...
