In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split, KFold
import random

from sklearn.model_selection import cross_validate
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.feature_extraction.text import TfidfVectorizer

from keras.preprocessing import text, sequence
from keras.layers import Embedding, SpatialDropout1D
from keras.models import Model, Sequential
from keras.layers import Dense, Embedding, Input
from keras.optimizers import RMSprop
import keras.backend as K
from keras.layers import Dense, Input, GRU, LSTM, Bidirectional, Dropout, CuDNNLSTM, CuDNNGRU, Conv1D, MaxPooling1D, GlobalMaxPooling1D
from sklearn.model_selection import train_test_split

Using TensorFlow backend.


In [2]:
# GOOGLE COLAB SETUP

# Load the Drive helper and mount
from google.colab import drive

# This will prompt for authorization.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
#2. Get the file
data_path     = 'drive/My Drive/Colab Notebooks/adaptHAN/AOBDL_code/data'
codes_path    = 'drive/My Drive/Colab Notebooks/adaptHAN/AOBDL_code/codes'


#3. Read file as panda dataframe
train         = pd.read_csv(f'{data_path}/train_cleaned_no_punkt.csv') 
test_labelled = pd.read_csv(f'{data_path}/test_labelled_cleaned_no_punkt.csv') 
test_unlabelled = pd.read_csv(f'{data_path}/test_unlabelled_cleaned_no_punkt.csv') 

In [0]:
train['mal'] = train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].sum(axis=1) >= 1  
train.drop(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'], axis=1, inplace=True)
train.comment_text.fillna("empty", inplace=True)

test_labelled['mal'] = test_labelled[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].sum(axis=1) >= 1  
test_labelled.drop(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'], axis=1, inplace=True)
test_labelled.comment_text.fillna("empty", inplace=True)

test_unlabelled.comment_text.fillna("empty", inplace=True)

# CHANGE TRAIN AND TEST, MIX TO GET SIMILAR DISTRIBUTION
rs=42
X_train1, X_test1, y_train1, y_test1  = train_test_split(train.drop('mal', axis=1), train.mal, stratify=train.mal, test_size=0.29, random_state=rs )
X_train2, X_test2, y_train2, y_test2  = train_test_split(test_labelled.drop('mal', axis=1), test_labelled.mal, stratify=test_labelled.mal, test_size=0.29, random_state=rs)

X = np.concatenate((X_train1.comment_text, X_train2.comment_text))
y = np.concatenate((y_train1, y_train2))

X_test = np.concatenate((X_test1.comment_text, X_test2.comment_text))
y_test = np.concatenate((y_test1, y_test2))

In [0]:
#X = train.comment_text
#y = train.mal

max_features = 40000
maxlen = 400
dropout_rate = 0
rs = 42
epochs = 4
batch_size = 250
embed_dim = 50
rec_units = 150
num_filters = 300

In [0]:
def gru_keras(max_features, maxlen, dropout_rate, embed_dim, rec_units):
    if K.backend == 'tensorflow':        
        K.clear_session()
    input_layer = Input(shape=(maxlen,))
    embedding_layer = Embedding(max_features, output_dim=embed_dim, trainable=True)(input_layer)
    x = SpatialDropout1D(dropout_rate)(embedding_layer)
    x = Conv1D(num_filters, 7, activation='relu', padding='same')(x)
    x = MaxPooling1D(2)(x)
    x = Conv1D(num_filters, 7, activation='relu', padding='same')(x)
    x = GlobalMaxPooling1D()(x)
    output_layer = Dense(1, activation="sigmoid")(x)
    model = Model(inputs=input_layer, outputs=output_layer)
    model.compile(loss='binary_crossentropy',
                  optimizer=RMSprop(clipvalue=1, clipnorm=1),
                  metrics=['acc'])
    print( model.summary())
    return model

In [0]:
from sklearn.model_selection import StratifiedKFold
kf = StratifiedKFold(n_splits=5, random_state=rs)
auc = []
roc = []
c = 0
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(X)

In [8]:
from sklearn.metrics import average_precision_score, roc_auc_score
for train_index, val_index in kf.split(X, y):
    print(f' fold {c}')
    X_train, X_val = X[train_index], X[val_index]
    y_train, y_val = y[train_index], y[val_index] 
    #tokenizer = text.Tokenizer(num_words=max_features)
    #tokenizer.fit_on_texts(pd.concat([X_train, unlab.comment_text], axis=0))
    list_tokenized_train = tokenizer.texts_to_sequences(X_train)
    list_tokenized_val = tokenizer.texts_to_sequences(X_val)
    X_train = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)
    X_val = sequence.pad_sequences(list_tokenized_val, maxlen=maxlen)
    model = gru_keras(max_features, maxlen, dropout_rate, embed_dim, rec_units)
    print('Fitting')
    model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, shuffle=False, verbose=1)
    probs = model.predict(X_val, batch_size=batch_size, verbose=1)
    auc_f = average_precision_score(y_val, probs)
    auc.append(auc_f)
    roc_f = roc_auc_score(y_val, probs)
    roc.append(roc_f)
    print(f' average precision {auc_f}')
    print(f' roc auc {roc_f}')
    c += 1

 fold 0
Instructions for updating:
Colocations handled automatically by placer.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 400)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 400, 50)           2000000   
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 400, 50)           0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 400, 300)          105300    
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 200, 300)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 200, 300)          630300    
______________________________________________________________

In [9]:
np.array(auc).mean()

0.8178391437895798

In [10]:
np.array(roc).mean()

0.955863628333614

In [11]:
X_train = X
y_train = y

#X_test = test_labelled.comment_text
#y_test = test_labelled.mal


tokenizer = text.Tokenizer(num_words=max_features, oov_token='unknown')
tokenizer.fit_on_texts(X_train)

list_tokenized_train = tokenizer.texts_to_sequences(X_train)
list_tokenized_test = tokenizer.texts_to_sequences(X_test)
X_train = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)
X_test = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)


model = gru_keras(max_features, maxlen, dropout_rate, embed_dim, rec_units)

#X_train = np.array(train_posts)
y_train = np.array(y_train)
#X_val =  np.array(val_posts)
y_test = np.array(y_test)

print('Fitting')
model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, shuffle=False, verbose=1)
probs = model.predict(X_test, batch_size=batch_size, verbose=1)
auc_f = average_precision_score(y_test, probs)
roc_f = roc_auc_score(y_test, probs)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_6 (InputLayer)         (None, 400)               0         
_________________________________________________________________
embedding_6 (Embedding)      (None, 400, 50)           2000000   
_________________________________________________________________
spatial_dropout1d_6 (Spatial (None, 400, 50)           0         
_________________________________________________________________
conv1d_11 (Conv1D)           (None, 400, 300)          105300    
_________________________________________________________________
max_pooling1d_6 (MaxPooling1 (None, 200, 300)          0         
_________________________________________________________________
conv1d_12 (Conv1D)           (None, 200, 300)          630300    
_________________________________________________________________
global_max_pooling1d_6 (Glob (None, 300)               0         
__________

In [12]:
auc_f

0.826133145550906

In [13]:
roc_f

0.9616898956756055