In [2]:
import numpy as np
np.random.seed(42)
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

from keras.models import Model
from keras.layers import Input, Dense, Dropout, Conv1D, Embedding, SpatialDropout1D, concatenate
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback

import warnings
warnings.filterwarnings('ignore')

import os
import pathlib
# Request 4 threads from OpenMP-based libraries.
# NOTE(review): this is set after numpy and keras were imported above; their thread
# pools may already be initialised by then - confirm it takes effect, or move it
# before those imports.
os.environ['OMP_NUM_THREADS'] = '4'
# Current working directory; later cells build every data path from it.
# NOTE: this name shadows the built-in dir() for the rest of the notebook.
dir = os.getcwd()
print(dir)

Using TensorFlow backend.


C:\Users\marti\Documents\Python_Scripts


In [18]:
# Pre-trained word vectors, read by the embedding cell further down.
EMBEDDING_FILE = os.path.join(dir, 'toxic_comments_data', 'crawl-300d-2M.vec')

# Train / test / sample-submission frames, read in that order.
train, test, submission = (
    pd.read_csv(os.path.join(dir, 'toxic_comments_data', csv_name))
    for csv_name in ('train.csv', 'test.csv', 'sample_submission.csv')
)

# Raw comment strings; missing comments become the literal text "fillna".
X_train, X_test = (
    frame["comment_text"].fillna("fillna").values
    for frame in (train, test)
)
# One binary column per label, in this fixed order.
y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values


# max_features -> Tokenizer num_words; maxlen -> padded sequence length;
# embed_size -> width of each embedding vector.
max_features, maxlen, embed_size = 30000, 100, 300

# The vocabulary is fitted on train and test comments together.
tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts([*X_train, *X_test])

# From here on X_train / X_test hold integer sequences instead of text;
# x_train / x_test are the fixed-length padded arrays.
X_train, X_test = (tokenizer.texts_to_sequences(comments) for comments in (X_train, X_test))
x_train, x_test = (sequence.pad_sequences(seqs, maxlen=maxlen) for seqs in (X_train, X_test))

In [3]:
def get_coefs(word, *arr):
    """Split one embedding row into its token and its coefficient vector.

    ``word`` is returned unchanged; every remaining positional value is
    converted into a single ``float32`` NumPy array.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector
# Load the pre-trained vectors into {word: float32 vector}. The file is read
# inside a ``with`` block so the handle is closed even if parsing raises.
embeddings_index = {}
with open(EMBEDDING_FILE, encoding='utf-8') as embedding_file:
    for line in embedding_file:
        fields = line.rstrip().rsplit(' ')
        # Keep only rows shaped "<word> <embed_size numbers>". This drops the
        # "<count> <dim>" header that fastText .vec files start with, which
        # would otherwise be stored as a bogus 1-element vector under the
        # count string.
        if len(fields) != embed_size + 1:
            continue
        word, coefs = get_coefs(*fields)
        embeddings_index[word] = coefs

word_index = tokenizer.word_index
# Tokenizer indices start at 1 (index 0 is reserved), so the rows that can be
# populated number len(word_index) + 1, capped at max_features.
nb_words = min(max_features, len(word_index) + 1)
# Allocate max_features rows so the matrix always has the shape expected by
# the Embedding(max_features, embed_size) layer in get_model, even when the
# vocabulary is smaller than max_features. Rows without a vector stay zero.
embedding_matrix = np.zeros((max_features, embed_size))
for word, i in word_index.items():
    # Indices at or beyond the cap are never produced by the tokenizer.
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector


class RocAucEvaluation(Callback):
    """Keras callback that prints ROC-AUC on a held-out set at epoch end.

    Parameters
    ----------
    validation_data : sequence of two items
        ``(X_val, y_val)``. It is unpacked in ``__init__``, so anything that
        does not hold exactly two items (including the ``()`` default) raises
        ``ValueError``.
    interval : int
        The score is computed on epochs whose zero-based index is a multiple
        of ``interval``.
    """

    def __init__(self, validation_data=(), interval=1):
        # super(Callback, self) starts the lookup *after* Callback in the MRO
        # and therefore never runs Callback.__init__; the zero-argument form
        # initialises the base class properly.
        super().__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs=None):
        """Predict on the stored validation inputs and print the ROC-AUC.

        ``logs`` is accepted for interface compatibility and is not read; it
        defaults to ``None`` instead of a shared mutable dict.
        """
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            score = roc_auc_score(self.y_val, y_pred)
            # epoch is zero-based; report it one-based.
            print("\n ROC-AUC - epoch: %d - score: %.6f \n" % (epoch+1, score))


def get_model():
    """Build and compile the pooled bidirectional-GRU classifier.

    Reads the module-level ``maxlen``, ``max_features``, ``embed_size`` and
    ``embedding_matrix``. The embedding layer is initialised from
    ``embedding_matrix`` and is not trainable. Returns a compiled Keras
    ``Model`` with six sigmoid outputs, binary cross-entropy loss, the Adam
    optimizer and an accuracy metric.
    """
    token_ids = Input(shape=(maxlen, ))
    embedded = Embedding(max_features, embed_size, weights=[embedding_matrix], trainable=False)(token_ids)
    embedded = SpatialDropout1D(0.2)(embedded)

    # Two recurrent branches of different width read the same embedded input.
    wide_branch = Bidirectional(GRU(128, return_sequences=True))(embedded)
    narrow_branch = Bidirectional(GRU(64, return_sequences=True))(embedded)
    sequence_features = concatenate([wide_branch, narrow_branch])

    # Summarise the time axis with both average and max pooling.
    pooled = concatenate([
        GlobalAveragePooling1D()(sequence_features),
        GlobalMaxPooling1D()(sequence_features),
    ])

    hidden = Dense(64, activation='relu')(pooled)
    hidden = Dropout(0.2)(hidden)
    label_probs = Dense(6, activation="sigmoid")(hidden)

    network = Model(inputs=token_ids, outputs=label_probs)
    network.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return network

In [4]:
# Fresh compiled network from get_model().
model = get_model()


batch_size, epochs = 32, 4

# 95% of the padded training rows are used for fitting, the rest for
# validation; the split seed is fixed.
X_tra, X_val, y_tra, y_val = train_test_split(
    x_train,
    y_train,
    train_size=0.95,
    random_state=233,
)
# Prints ROC-AUC on the validation split after every epoch.
RocAuc = RocAucEvaluation(validation_data=(X_val, y_val), interval=1)

hist = model.fit(
    X_tra,
    y_tra,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_val, y_val),
    callbacks=[RocAuc],
    verbose=2,
)

Train on 151592 samples, validate on 7979 samples
Epoch 1/4
 - 3061s - loss: 0.0534 - acc: 0.9808 - val_loss: 0.0473 - val_acc: 0.9815

 ROC-AUC - epoch: 1 - score: 0.984913 

Epoch 2/4
 - 2360s - loss: 0.0442 - acc: 0.9832 - val_loss: 0.0477 - val_acc: 0.9824

 ROC-AUC - epoch: 2 - score: 0.986456 

Epoch 3/4
 - 2521s - loss: 0.0414 - acc: 0.9839 - val_loss: 0.0434 - val_acc: 0.9834

 ROC-AUC - epoch: 3 - score: 0.987820 

Epoch 4/4
 - 2793s - loss: 0.0391 - acc: 0.9846 - val_loss: 0.0442 - val_acc: 0.9836

 ROC-AUC - epoch: 4 - score: 0.987418 



In [4]:
# Keras imports for saving and loading the model and its weights
from keras.models import model_from_json
from keras.models import load_model

# Serialize the model architecture to JSON.
# `model` is the trained Keras model; it must already exist in the kernel
# (the training cell assigns it).
model_json = model.to_json()
with open("model_num.json", "w") as json_file:
    json_file.write(model_json)

# Serialize the weights to HDF5, in a separate file from the architecture.
model.save_weights("model_num.h5")
print("Saved model to disk")

NameError: name 'model' is not defined

In [12]:
from keras.models import model_from_json
from keras.models import load_model
# Read the JSON architecture description. The context manager closes the file
# even if reading raises, which a bare open()/close() pair does not guarantee.
with open('model_num.json', 'r') as json_file:
    loaded_model_json = json_file.read()

# Rebuild the (untrained) model from the JSON description.
loaded_model = model_from_json(loaded_model_json)

# Load the saved weights into the rebuilt model.
loaded_model.load_weights("model_num.h5")
print("Loaded model from disk")

# submission[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]] = y_pred
# submission.to_csv('submission.csv', index=False)

Loaded model from disk


In [34]:
# No cell above this one assigns y_pred, so on a fresh kernel this cell raised
# NameError. Score the padded test set with the reloaded model first.
# This runs inference over every row of x_test and can take a while.
y_pred = loaded_model.predict(x_test, batch_size=1024)
print(y_pred[0])

[0.9992951  0.49124226 0.9698933  0.11353455 0.9058463  0.54024   ]


In [37]:
# Display the first padded test sequence (padded to maxlen by pad_sequences);
# the output below shows the leading zeros used as padding.
x_test[0]

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,  2030,   378,  4878,   723,     8,    58, 20911,
          84,   888,   349,    16,  3439,    73,    21,     6,     5,
        6226,     6,  1555,     7,    56,   378,  5462,  1488,   578,
        5869,     5,    94,     6,     2,  3771,    30,   340,     6,
         742,    37,  4878,   723,     8,    35,  4222,    10,  1205,
         653,   400,   476, 17214,     9,   227,    15,   154,     5,
       20074,     8,   247, 23545,    48,  4329,    52,    24,     4,
        2108,   155,  2432,   578,  2428,    94,   218,   143,   490,
          85])

In [24]:
# Rebind X_train / X_test to the raw comment strings and refit the tokenizer.
# These are the same steps as in the data-loading cell near the top.
X_train, X_test = (
    frame["comment_text"].fillna("fillna").values
    for frame in (train, test)
)
y_train = train[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]].values

tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts([*X_train, *X_test])

In [69]:
# Score one hand-written comment with the reloaded model.
test_text = np.asarray([' The full podcast is basically Joe slowly realising how stupid she is.'])
# Text -> integer sequence -> fixed-length padded row.
test_text = sequence.pad_sequences(tokenizer.texts_to_sequences(test_text), maxlen=maxlen)
y_pred = loaded_model.predict(test_text, batch_size=1)
print(y_pred)

[[8.9901292e-01 4.6842294e-03 4.4172296e-01 1.1461467e-04 7.0217258e-01
  4.6742605e-03]]


In [None]:
import time
max_features = 30000
maxlen = 100
comments_1 = pd.read_csv(os.path.join(dir, 'toxic_comments_data', 'Test_comments', 'comments_1.csv'))
comments_2 = pd.read_csv(os.path.join(dir, 'toxic_comments_data', 'Test_comments', 'comments_2.csv'))
comments_3 = pd.read_csv(os.path.join(dir, 'toxic_comments_data', 'Test_comments', 'comments_3.csv'))
comments_6 = pd.read_csv(os.path.join(dir, 'toxic_comments_data', 'Test_comments', 'comments_6.csv'))
comments_7 = pd.read_csv(os.path.join(dir, 'toxic_comments_data', 'Test_comments', 'comments_7.csv'))
test_set = pd.read_csv(os.path.join(dir, 'toxic_comments_data', 'Test_comments', 'test_set.csv'))

# Non-null comments to scan. Swap in the commented line to scan test_set instead.
comments = comments_1.commentText.dropna().tolist()
# comments = test_set.response_text.dropna().tolist()

# Tokenise, pad and score all comments in one batched predict call instead of
# one model call per comment. Nothing is predicted when there are no comments.
if comments:
    test_text = sequence.pad_sequences(tokenizer.texts_to_sequences(comments), maxlen=maxlen)
    y_pred = loaded_model.predict(test_text, batch_size=256)
    for comment, scores in zip(comments, y_pred):
        # Report a comment when any of its label scores exceeds 0.7.
        if np.max(scores) > 0.7:
            print(comment)
            # Re-add the leading axis so each row prints as a (1, n_labels) array.
            print(scores[np.newaxis, :])

In [10]:
# Rebind X_train to the raw comment strings (missing values replaced by the
# literal text "fillna") and print the first comment.
X_train = train["comment_text"].fillna("fillna").values
print(X_train[0])

Explanation
Why the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27
