In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split, KFold
import random

from sklearn.model_selection import cross_validate
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

from keras.preprocessing import text, sequence
from keras.layers import Embedding, SpatialDropout1D
from keras.models import Model, Sequential
from keras.layers import Dense, Embedding, Input
from keras.optimizers import RMSprop
import keras.backend as K
from keras.layers import Dense, Input, GRU, LSTM, Bidirectional, Dropout, CuDNNLSTM, CuDNNGRU, GlobalAveragePooling1D, GlobalMaxPool1D
from sklearn.metrics import average_precision_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.engine.topology import Layer, InputSpec
from keras import initializers as initializers, regularizers, constraints

from numpy.random import seed
from tensorflow import set_random_seed
import random as rn
import os

from numpy.random import shuffle

Using TensorFlow backend.


In [3]:
# GOOGLE COLAB SETUP

# Load the Drive helper and mount
from google.colab import drive

# This will prompt for authorization.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
#2. Get the file
data_path          = 'drive/My Drive/Colab Notebooks/adaptHAN/AOBDL_code/data'
codes_path         = 'drive/My Drive/Colab Notebooks/adaptHAN/AOBDL_code/codes'
cv_models_path     = 'drive/My Drive/Colab Notebooks/adaptHAN/AOBDL_code/cv_models'
models_path        = 'drive/My Drive/Colab Notebooks/adaptHAN/AOBDL_code/models'


#3. Read file as panda dataframe
train         = pd.read_csv(f'{data_path}/train_cleaned_no_punkt.csv') 
test_labelled = pd.read_csv(f'{data_path}/test_labelled_cleaned_no_punkt.csv') 
test_unlabelled = pd.read_csv(f'{data_path}/test_unlabelled_cleaned_no_punkt.csv') 

In [0]:
train['mal']         = train[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].sum(axis=1) >= 1  
train.drop(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'], axis=1, inplace=True)
train.comment_text.fillna("empty", inplace=True)

test_labelled['mal'] = test_labelled[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].sum(axis=1) >= 1  
test_labelled.drop(['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate'], axis=1, inplace=True)
test_labelled.comment_text.fillna("empty", inplace=True)

test_unlabelled.comment_text.fillna("empty", inplace=True)

# CHANGE TRAIN AND TEST, MIX TO GET SIMILAR DISTRIBUTION
from sklearn.model_selection import train_test_split
rs=42
X_train1, X_test1, y_train1, y_test1  = train_test_split(train.drop('mal', axis=1), train.mal, stratify=train.mal, test_size=0.29, random_state=rs )
X_train2, X_test2, y_train2, y_test2  = train_test_split(test_labelled.drop('mal', axis=1), test_labelled.mal, stratify=test_labelled.mal, test_size=0.29, random_state=rs)

X = np.concatenate((X_train1.comment_text, X_train2.comment_text))
y = np.concatenate((y_train1, y_train2))

X_test = np.concatenate((X_test1.comment_text, X_test2.comment_text))
y_test = np.concatenate((y_test1, y_test2))

X_unlab = np.array(test_labelled.comment_text)

In [0]:
#X = train.comment_text
#y = train.mal

max_features = 40000
maxlen       = 400
dropout_rate = 0.25
rs           = 42
epochs       = 4
batch_size   = 256
embed_dim    = 50
rec_units    = 150


seed(rs)
set_random_seed(rs)
rn.seed(rs)

os.environ['PYTHONHASHSEED']=str(rs)

In [0]:
def gru_keras(max_features, maxlen, dropout_rate, embed_dim, rec_units, reduction = 'average'):
    if K.backend == 'tensorflow':        
        K.clear_session()
    input_layer = Input(shape=(maxlen,))
    embedding_layer = Embedding(max_features, output_dim=embed_dim, trainable=True)(input_layer)
    x = SpatialDropout1D(dropout_rate)(embedding_layer)
    x = Bidirectional(CuDNNGRU(units=rec_units, return_sequences=True))(x)
    if reduction == 'average':
      x = GlobalAveragePooling1D()(x)
    elif reduction == 'maximum':
      x = GlobalMaxPool1D()(x)
    elif reduction == 'attention':
      x = AttentionWithContext()(x)
      
    output_layer = Dense(1, activation="sigmoid")(x)
    model = Model(inputs=input_layer, outputs=output_layer)
    model.compile(loss='binary_crossentropy',
                  optimizer=RMSprop(clipvalue=1, clipnorm=1),
                  metrics=['acc'])
    #print( model.summary())
    return model

In [0]:
kf = StratifiedKFold(n_splits=5, random_state=rs)
auc = []
roc = []
c = 0
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(train.comment_text[train['mal']==1])

In [0]:
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(train.comment_text[train['mal']==1])
tokenizer_non_mal = text.Tokenizer(num_words=max_features)
tokenizer_non_mal.fit_on_texts(train.comment_text[train['mal']!=1])


counts = pd.DataFrame({'word': list(tokenizer.word_counts.keys()),
                       'count': list(tokenizer.word_counts.values())})
counts = counts.sort_values(['count'], ascending=False)


counts_non_mal = pd.DataFrame({'word': list(tokenizer_non_mal.word_counts.keys()),
                              'non_mal_counts': list(tokenizer_non_mal.word_counts.values())})
counts_        = counts.merge(counts_non_mal, how='left', left_on='word', right_on='word')

counts_['count']          = counts_['count'] / max(counts_['count']) 
counts_['non_mal_counts'] = counts_['non_mal_counts'] / max(counts_['non_mal_counts']) 

counts_less = counts_.loc[(counts_['non_mal_counts']<0.01) ,:]
counts_less = counts_less[counts_less['count']>=0.001]
counts_less = counts_less[counts_less.word.apply(len)>=4]

In [0]:
unlab      = test_unlabelled[test_unlabelled.comment_text.str.contains(' | '.join(counts_less.word.values))]
unlab      = unlab[['comment_text']]
unlab['y'] = 1

In [0]:
kf = StratifiedKFold(n_splits=5, random_state=rs)
auc = []
roc = []
c = 0
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(np.concatenate((X,unlab['comment_text'])))

### Average Pooling

###Rule based




In [0]:
#X = np.array(X)
#y = np.array(y)

In [110]:
for train_index, val_index in kf.split(X, y):
    print(f' fold {c}')
    X_train, X_val       = X[train_index], X[val_index]
    y_train, y_val       = y[train_index], y[val_index] 
       
    X_train = np.array(X)
    y_train = np.array(y)
    
    X_train = np.concatenate((X_train, unlab['comment_text']))
    y_train = np.concatenate((y_train, unlab['y']))
    
    list_tokenized_train = tokenizer.texts_to_sequences(X_train)
    list_tokenized_val   = tokenizer.texts_to_sequences(X_val)
    X_train              = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)
    X_val                = sequence.pad_sequences(list_tokenized_val, maxlen=maxlen)
    
    
    
    #import pdb
    #pdb.set_trace()
    
    
    model                = gru_keras(max_features, maxlen, dropout_rate, embed_dim, rec_units)
    print('Fitting')
    history              = model.fit(X_train, y_train, batch_size=batch_size, validation_data=(X_val, y_val), epochs=4, shuffle=True, verbose=1)
    probs                = model.predict(X_val, batch_size=batch_size, verbose=1)
    
    model.save_weights(f'{cv_models_path}/BGRU_avpool_SEMI_RULES_train_{c}.h5')
    
    auc_f                = average_precision_score(y_val, probs)
    auc.append(auc_f)
    roc_f                = roc_auc_score(y_val, probs)
    roc.append(roc_f)
    print(f' average precision {auc_f}')
    print(f' roc auc {roc_f}')
    c += 1
    del model

 fold 0
Fitting
Train on 235732 samples, validate on 31745 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
 average precision 0.8919143287580729
 roc auc 0.9605868087838187
 fold 1
Fitting
Train on 235732 samples, validate on 31745 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
 average precision 0.8834534358921754
 roc auc 0.9555452909634324
 fold 2
Fitting
Train on 235732 samples, validate on 31743 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
 average precision 0.8474606020039083
 roc auc 0.9472066849889339
 fold 3
Fitting
Train on 235732 samples, validate on 31743 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
 average precision 0.8295511259434667
 roc auc 0.954849936986786
 fold 4
Fitting
Train on 235732 samples, validate on 31743 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
 average precision 0.824490149924532
 roc auc 0.9679104644752918


In [0]:
data = pd.DataFrame({'acc':history.history['acc'],
                    'loss': history.history['loss'],
                    'val_acc': history.history['val_acc'],
                    'val_loss': history.history['val_loss']})

In [112]:
np.array(auc).mean()

0.8553739285044311

In [113]:
np.array(roc).mean()

0.9572198372396526

In [0]:
X_train   = X
y_train   = y


tokenizer = text.Tokenizer(num_words=max_features, oov_token='unknown')
tokenizer.fit_on_texts(X_train)

list_tokenized_train = tokenizer.texts_to_sequences(X_train)
list_tokenized_test  = tokenizer.texts_to_sequences(X_test)
X_train = sequence.pad_sequences(list_tokenized_train, maxlen=maxlen)
X_test  = sequence.pad_sequences(list_tokenized_test, maxlen=maxlen)

In [115]:
model   = gru_keras(max_features, maxlen, dropout_rate, embed_dim, rec_units, reduction='average')

y_train = np.array(y_train)
y_test  = np.array(y_test)

print('Fitting')
model.fit(X_train, y_train,   batch_size=batch_size, epochs=4, shuffle=True, verbose=1)
probs = model.predict(X_test, batch_size=batch_size, verbose=1)
auc_f = average_precision_score(y_test, probs)
roc_f = roc_auc_score(y_test, probs)
model.save_weights(f'{models_path}/BGRU_SEMI_RULES_onlyTrain.h5')
del model

Fitting
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


In [116]:
auc_f

0.8564520742886612

In [117]:
roc_f

0.9722216406035167

In [118]:
test_unlabelled.shape

(89186, 8)