## Setup

In [1]:
from imports import *
from global_variables import *
from load_files import *

## Preprocessing

In [2]:
# Raw comment column; the tokenizer is fitted on these values and they are
# later converted to padded index sequences.
X = train['comment_text'].values

In [3]:
# Target column that is split alongside the padded inputs and passed to
# model.fit (presumably 0/1 labels, given the sigmoid output -- confirm).
y = train['toxic'].values

In [4]:
# Fit the vocabulary on the training comments only; MAX_FEATURES is handed
# to the tokenizer as its `num_words` limit.
tokenizer = text.Tokenizer(num_words=MAX_FEATURES)
tokenizer.fit_on_texts(texts=list(X))

In [5]:
# Map every training comment to its sequence of vocabulary indices.
tokenized = tokenizer.texts_to_sequences(texts=X)

In [6]:
# Bring every index sequence to the common length MAX_TEXT_LENGTH.
X_train_padded = sequence.pad_sequences(
    sequences=tokenized,
    maxlen=MAX_TEXT_LENGTH,
)

## Embedding Matrix

In [7]:
# Width of each embedding vector (second axis of the matrix below).
EMBEDDING_DIMS = 100

# One row per vocabulary index; rows stay all-zero unless a pre-trained
# vector is copied in by a later cell.
embedding_matrix = np.zeros(shape=(MAX_FEATURES, EMBEDDING_DIMS))

# Text handle on the GloVe file; it is consumed and closed by the parsing
# cell that follows.
glove100 = open(GLOVE100, mode='r', encoding='utf-8')

In [8]:
# Parse the opened GloVe file into EMBEDDINGS_W2V (word -> float32 vector).
# Each non-empty line is split on whitespace: the first field is the word,
# the remaining fields are its coefficients.
#
# The handle is used as a context manager so it is closed even when a line
# fails to parse, instead of leaking until the kernel is restarted.
with glove100:
    for line in glove100:
        values = line.split()

        # A blank (e.g. trailing) line has no word field; skip it rather
        # than crash on values[0].
        if not values:
            continue

        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')

        EMBEDDINGS_W2V[word] = coefs

In [9]:
# Copy the pre-trained vector of every in-vocabulary word into its row of
# embedding_matrix. Words without a pre-trained vector keep their zero row.
for word, index in tokenizer.word_index.items():

    # Rows only exist for indices 0 .. MAX_FEATURES - 1. Skip out-of-range
    # words instead of breaking out of the loop: `break` is only correct if
    # word_index happens to be iterated in ascending index order, and would
    # otherwise silently drop valid words that come later.
    if index >= MAX_FEATURES:
        continue

    embedding_vector = EMBEDDINGS_W2V.get(word)
    if embedding_vector is not None:
        embedding_matrix[index] = embedding_vector

## Embedding Layer

In [16]:
# Start a fresh sequential model whose first layer looks up the pre-computed
# embedding_matrix. trainable=False keeps those weights fixed during fitting.
model = Sequential()

model.add(
    Embedding(
        input_dim=MAX_FEATURES,
        output_dim=EMBEDDING_DIMS,
        embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
        trainable=False,
    )
)

# Dropout applied directly to the embedded sequence.
model.add(Dropout(rate=0.2))

## Build Model

In [17]:
# NOTE: this cell appends layers to the existing `model`; running it twice
# without re-running the previous cell stacks the layers a second time.

# First convolution over the embedded sequence, then default max pooling.
model.add(
    Conv1D(
        filters=FILTERS_COUNT,
        kernel_size=FILTER_SIZE,
        padding='valid',
        activation='relu',
    )
)
model.add(MaxPooling1D())

# Second convolution uses a literal kernel size of 5 rather than FILTER_SIZE.
model.add(
    Conv1D(
        filters=FILTERS_COUNT,
        kernel_size=5,
        padding='valid',
        activation='relu',
    )
)

# Collapse the sequence axis to one vector per sample.
model.add(GlobalMaxPooling1D())

# Fully connected head with dropout.
model.add(Dense(units=HIDDEN_DIMS, activation='relu'))
model.add(Dropout(rate=0.2))

# Single sigmoid unit: one score in [0, 1] per comment.
model.add(Dense(units=1, activation='sigmoid'))

In [18]:
# Print the layer-by-layer architecture and parameter counts of `model`.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 100)         2000000   
_________________________________________________________________
dropout_2 (Dropout)          (None, None, 100)         0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, None, 250)         75250     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, None, 250)         0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, None, 250)         312750    
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 250)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 250)              

In [19]:
# Binary cross-entropy pairs with the single sigmoid output unit; accuracy
# is reported alongside the loss during fit/evaluate.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

## Train Model

In [20]:
# Hold out 15% of the padded training data for validation.
# random_state pins the shuffle so the split -- and therefore the reported
# validation loss/accuracy -- is reproducible across kernel restarts.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_padded,
    y,
    test_size=0.15,
    random_state=42,
)

In [21]:
# Mini-batch size and number of epochs passed to model.fit.
batch_size, epochs = 64, 3

In [23]:
# Train on the 85% split and report metrics on the held-out 15% after each
# epoch. The call is the cell's last expression, so the returned History
# object is displayed as the cell output.
model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_valid, y_valid),
    epochs=epochs,
    batch_size=batch_size,
)

Train on 135635 samples, validate on 23936 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x19074e08188>

## Evaluate Model

In [39]:
# Score the trained model on the validation split; the result unpacks into
# the loss followed by the single compiled metric (accuracy).
loss, accuracy = model.evaluate(
    x=X_valid,
    y=y_valid,
    batch_size=128,
)



In [42]:
# Report the validation metrics, one aligned "label value" line each.
for metric_label, metric_value in (('Loss    :', loss), ('Accuracy:', accuracy)):
    print(metric_label, metric_value)

Loss    : 0.11273200857766809
Accuracy: 0.9585144


In [43]:
# Raw comment strings of the test frame, to be tokenized with the
# already-fitted tokenizer.
X_test = test['comment_text'].values

In [44]:
# Encode the test comments with the tokenizer fitted on the training data,
# then pad to the same length the model was trained on.
# NOTE: this rebinds `tokenized`, which previously held the training sequences.
tokenized = tokenizer.texts_to_sequences(texts=X_test)
X_test_padded = sequence.pad_sequences(
    sequences=tokenized,
    maxlen=MAX_TEXT_LENGTH,
)

In [45]:
# Sigmoid scores for every padded test comment (progress bar enabled).
y_pred = model.predict(
    x=X_test_padded,
    batch_size=64,
    verbose=1,
)



In [46]:
# Threshold each predicted score at 0.5: below -> 'Non_toxic', otherwise 'Toxic'.
predictions = [
    'Non_toxic' if pred < 0.5 else 'Toxic'
    for pred in y_pred
]

In [47]:
# Attach the string labels to the test frame as a 'toxic' column.
# This mutates `test` in place (any existing 'toxic' column is overwritten).
test['toxic'] = predictions

In [48]:
# Eyeball the first 50 labelled test comments as a sanity check.
test.head(50)

Unnamed: 0,id,comment_text,toxic
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...,Toxic
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...,Non_toxic
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap...",Non_toxic
3,00017563c3f7919a,":If you have a look back at the source, the in...",Non_toxic
4,00017695ad8997eb,I don't anonymously edit articles at all.,Non_toxic
5,0001ea8717f6de06,Thank you for understanding. I think very high...,Non_toxic
6,00024115d4cbde0f,Please do not add nonsense to Wikipedia. Such ...,Non_toxic
7,000247e83dcc1211,:Dear god this site is horrible.,Toxic
8,00025358d4737918,""" \n Only a fool can believe in such numbers. ...",Non_toxic
9,00026d1092fe71cc,== Double Redirects == \n\n When fixing double...,Non_toxic
