In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import re
from sklearn.utils import shuffle
from sklearn.cross_validation import train_test_split
from unidecode import unidecode
from utils import *



In [2]:
def textcleaning(string):
    """Normalise a raw comment into lowercase, space-separated word tokens.

    Processing order:
      1. Drop every whitespace-delimited token that contains ``#`` or ``@``.
      2. Remove URLs (``http...`` and ``www.<something>``).
      3. Transliterate to ASCII with ``unidecode`` and insert a space after
         every ``.`` and ``,``.
      4. Replace each run of characters other than ASCII letters, quotes,
         hyphen and space with a single space.
      5. Tokenise, discard tokens of length 1, join with spaces and lowercase.

    Args:
        string: the raw text to clean.

    Returns:
        The cleaned, lowercased text (possibly empty).
    """
    kept = ' '.join(tok for tok in string.split() if '#' not in tok and '@' not in tok)
    # Raw strings keep the regex escapes out of Python's own string-escape
    # handling. The dot after "www" is escaped so that it only matches a literal
    # '.'; unescaped it also matched a space, which made the bare word "www"
    # swallow the word that followed it.
    string = re.sub(r'http\S+|www\.\S+', '', kept)
    string = unidecode(string).replace('.', '. ').replace(',', ', ')
    string = re.sub(r'[^\'"A-Za-z\- ]+', ' ', string)
    tokens = re.findall(r"[\w']+|[;:\-\(\)&.,!?\"]", string)
    # Single-character tokens (stray letters, lone quotes/hyphens) are dropped.
    return ' '.join(tok for tok in tokens if len(tok) > 1).lower()

In [3]:
# Load the labelled corpus and discard every row that has a missing value
# in any column; the resulting shape is displayed as the cell output.
df = pd.read_csv('toxic-bm.csv').dropna()
df.shape

(40911, 7)

In [4]:
# Clean the text column (column 0) in a single pass instead of writing
# cell-by-cell through df.iloc, which pays pandas indexing overhead per row.
# `.values` keeps the assignment purely positional, like the original loop.
df.iloc[:, 0] = df.iloc[:, 0].map(textcleaning).values

In [5]:
# Flatten the cleaned corpus into one token stream and size the vocabulary
# by the number of distinct tokens.
texts = df.iloc[:, 0].tolist()
concat = [token for text in texts for token in text.split()]
vocabulary_size = len(set(concat))

# build_dataset is provided by the star-import of `utils`; presumably it returns
# the id-encoded token stream, (word, frequency) counts, and the word->id /
# id->word mappings -- confirm against utils.py.
data, count, dictionary, rev_dictionary = build_dataset(concat, vocabulary_size)

print('vocab from size: %d' % vocabulary_size)
# The first four entries of `count` are skipped here (presumably reserved
# special tokens -- TODO confirm).
print('Most common words', count[4:10])
sample_ids = data[:10]
print('Sample data', sample_ids, [rev_dictionary[idx] for idx in sample_ids])

vocab from size: 65530
Most common words [('yang', 103249), ('anda', 68130), ('dan', 59109), ('tidak', 54237), ('untuk', 50427), ('di', 36812)]
Sample data [455, 91, 199, 4, 242, 9, 106, 835, 3243, 8165] ['penjelasan', 'mengapa', 'pengeditan', 'yang', 'dibuat', 'di', 'bawah', 'peminat', 'tegar', 'metallica']


In [6]:
def str_idx(corpus, dic, maxlen, UNK=3):
    """Encode texts as a left-padded matrix of vocabulary ids.

    Each text is split on whitespace and truncated to its first ``maxlen``
    tokens. The ids are right-aligned in the row, so shorter texts are padded
    with zeros on the left.

    Args:
        corpus: sequence of strings to encode.
        dic: mapping from token to integer id.
        maxlen: number of columns in the output.
        UNK: id used for tokens that are missing from ``dic``.

    Returns:
        A float ``numpy`` array of shape ``(len(corpus), maxlen)``.
    """
    X = np.zeros((len(corpus), maxlen))
    for row, text in enumerate(corpus):
        # Only a missing token should fall back to UNK; a plain dict lookup with
        # a default avoids a blanket `except Exception` that could hide real bugs.
        ids = [dic.get(token, UNK) for token in text.split()[:maxlen]]
        if ids:
            # Right-align the ids; the untouched leading cells stay 0 (padding).
            X[row, maxlen - len(ids):] = ids
    return X

In [7]:
# Names of the six label columns; the classifier emits one output per entry,
# in this order.
list_classes = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]
# Multi-label target matrix: one row per comment, one column per label.
Y = df[list_classes].values
Y.shape

(40911, 6)

In [8]:
# Encode every cleaned comment as a fixed-width (200 column) row of vocabulary ids.
vectors = str_idx(corpus=texts, dic=dictionary, maxlen=200)

In [9]:
# Hold out 20% of the rows for evaluation. Fixing random_state makes the
# train/test membership repeatable across kernel restarts; the seed value
# itself is arbitrary.
train_X, test_X, train_Y, test_Y = train_test_split(vectors, Y, test_size = 0.2, random_state = 42)

In [10]:
from keras.optimizers import Adam, RMSprop
from keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler
from keras.layers import GRU, BatchNormalization, Conv1D, MaxPooling1D
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, Conv1D, GRU
from keras.layers import Bidirectional, GlobalMaxPool1D, MaxPooling1D, Add, Flatten
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D, concatenate, SpatialDropout1D
from keras.models import Model, load_model
from keras import initializers, regularizers, constraints, optimizers, layers, callbacks
from keras import backend as K
from keras.engine import InputSpec, Layer

Using TensorFlow backend.


In [11]:
# Training callbacks, both driven by validation loss (lower is better):
#   * check_point keeps only the best-scoring model on disk at `file_path`;
#   * early_stop ends training after 5 epochs without improvement.
file_path = "best_model.hdf5"
check_point = ModelCheckpoint(
    file_path,
    monitor="val_loss",
    mode="min",
    save_best_only=True,
    verbose=1,
)
early_stop = EarlyStopping(monitor="val_loss", mode="min", patience=5)

In [12]:
# Architecture: one shared embedding (with spatial dropout) feeds two parallel
# recurrent branches -- a bidirectional GRU and a bidirectional LSTM. Each branch
# is followed by a width-2 convolution, then summarised by global average and
# global max pooling. The four pooled vectors are concatenated and mapped to six
# independent sigmoid outputs (multi-label classification).
inp = Input(shape=(None,))
embedded = Embedding(len(dictionary), 256, trainable=True)(inp)
dropped = SpatialDropout1D(0.2)(embedded)

gru_features = Bidirectional(GRU(128, return_sequences=True))(dropped)
gru_features = Conv1D(
    64, kernel_size=2, padding="valid", kernel_initializer="he_uniform"
)(gru_features)

lstm_features = Bidirectional(LSTM(128, return_sequences=True))(dropped)
lstm_features = Conv1D(
    64, kernel_size=2, padding="valid", kernel_initializer="he_uniform"
)(lstm_features)

pooled = [
    GlobalAveragePooling1D()(gru_features),
    GlobalMaxPooling1D()(gru_features),
    GlobalAveragePooling1D()(lstm_features),
    GlobalMaxPooling1D()(lstm_features),
]
merged = concatenate(pooled)

outputs = Dense(6, activation="sigmoid")(merged)
model = Model(inputs=inp, outputs=outputs)
model.compile(
    loss="binary_crossentropy",
    optimizer=Adam(lr=1e-4),
    metrics=["accuracy"],
)

# NOTE(review): validation_data is the same (test_X, test_Y) split that the
# later classification report is computed on, so checkpoint selection and early
# stopping both see the evaluation data. Consider a separate validation split.
history = model.fit(
    train_X,
    train_Y,
    batch_size=128,
    epochs=10,
    validation_data=(test_X, test_Y),
    verbose=1,
    callbacks=[check_point, early_stop],
)

Train on 32728 samples, validate on 8183 samples
Epoch 1/10

Epoch 00001: val_loss improved from inf to 0.14315, saving model to best_model.hdf5
Epoch 2/10

Epoch 00002: val_loss improved from 0.14315 to 0.13274, saving model to best_model.hdf5
Epoch 3/10

Epoch 00003: val_loss improved from 0.13274 to 0.08320, saving model to best_model.hdf5
Epoch 4/10

Epoch 00004: val_loss improved from 0.08320 to 0.07247, saving model to best_model.hdf5
Epoch 5/10

Epoch 00005: val_loss improved from 0.07247 to 0.06903, saving model to best_model.hdf5
Epoch 6/10

Epoch 00006: val_loss did not improve from 0.06903
Epoch 7/10

Epoch 00007: val_loss improved from 0.06903 to 0.06827, saving model to best_model.hdf5
Epoch 8/10

Epoch 00008: val_loss did not improve from 0.06827
Epoch 9/10

Epoch 00009: val_loss did not improve from 0.06827
Epoch 10/10

Epoch 00010: val_loss did not improve from 0.06827


In [13]:
# Replace the in-memory model with the one stored at `file_path`, the path the
# ModelCheckpoint callback saves the best epoch to.
model = load_model(filepath=file_path)

In [14]:
# Per-label sigmoid outputs for every held-out row.
predicted = model.predict(x=test_X, batch_size=128, verbose=1)



In [15]:
# Round each output to the nearest integer so it can be compared with the labels.
around_predicted = predicted.round()

In [16]:
from sklearn import metrics

# Per-label precision / recall / F1 on the held-out rows; label indices follow
# the column order of the target matrix.
report = metrics.classification_report(y_true=test_Y, y_pred=around_predicted)
print(report)

             precision    recall  f1-score   support

          0       0.78      0.63      0.69       817
          1       0.58      0.19      0.28        81
          2       0.79      0.61      0.69       427
          3       0.00      0.00      0.00        30
          4       0.72      0.50      0.59       398
          5       0.00      0.00      0.00        80

avg / total       0.71      0.54      0.61      1833



  'precision', 'predicted', average, warn_for)


In [17]:
# Smoke test on one hand-written sentence, encoded to a width of 7 ids.
sample_vectors = str_idx(['bodoh lah anti sosial'], dictionary, 7)
model.predict(sample_vectors)

array([[0.94984066, 0.7050257 , 0.88807845, 0.40766585, 0.8686688 ,
        0.55271745]], dtype=float32)

In [18]:
import json

# Persist both vocabulary mappings next to the model so inference code can
# rebuild the same encoding.
# NOTE(review): JSON object keys are always strings; if rev_dictionary is keyed
# by integer ids (as the lookups earlier in the notebook suggest), those keys
# come back as strings when the file is reloaded.
payload = {'dictionary': dictionary, 'reverse_dictionary': rev_dictionary}
serialized = json.dumps(payload)
with open('fast-text-toxic.json', 'w') as fopen:
    fopen.write(serialized)