In [None]:
%matplotlib inline
from keras.preprocessing.text import text_to_word_sequence
from utils import *
from __future__ import division, print_function
from keras.preprocessing.text import Tokenizer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

## Load the data

In [None]:
# Notebook-wide settings.
path = "data/"          # prefix for train.csv, test.csv and sample_submission.csv
maxlen = 100            # passed as maxlen= when padding the index sequences
max_features = 20000    # passed as num_words= to the Tokenizer
batch_size = 64         # mini-batch size

In [None]:
# Read the training CSV into a DataFrame.
train_csv = path + 'train.csv'
_trainData = pd.read_csv(train_csv)

In [None]:
# Target columns, in the order used for the label matrix and the submission.
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

# comment_text column as an array; missing entries become the placeholder "_na_".
train_text_column = _trainData["comment_text"]
list_sentences_train = train_text_column.fillna("_na_").values

# Label matrix: one column per entry of list_classes.
labels_train = _trainData[list_classes].values

In [None]:
# Read the test CSV into a DataFrame.
test_csv = path + 'test.csv'
_testData = pd.read_csv(test_csv)

In [None]:
# comment_text column of the test set; missing entries become the placeholder "_na_".
test_text_column = _testData["comment_text"]
list_sentences_test = test_text_column.fillna("_na_").values

## Process sentences

In [None]:
# Pre-compiled patterns used by text_to_wordlist.

# Matches any single character that is not a letter, a digit or a space.
special_character_removal = re.compile(r'[^a-z\d ]', re.I)

# Matches each run of one or more digits.
replace_numbers = re.compile(r'\d+', re.I)

In [None]:
# Lazily-built NLTK resources shared by every call to text_to_wordlist, so the
# stop-word corpus is read and the stemmer constructed only once instead of
# once per comment.
_english_stops = None
_english_stemmer = None


def text_to_wordlist(text, remove_stopwords=True, stem_words=True):
    """Normalise one comment into a single space-separated string of tokens.

    Processing order:
      1. collapse every whitespace run (spaces, tabs, newlines) to one space,
      2. delete every character matched by ``special_character_removal``,
      3. replace every digit run matched by ``replace_numbers`` with ``'n'``,
      4. lower-case and split on whitespace,
      5. optionally drop NLTK English stop words,
      6. optionally stem each word with the English Snowball stemmer.

    Args:
        text: the raw comment (a ``str``).
        remove_stopwords: drop English stop words when true.
        stem_words: stem every remaining word when true.

    Returns:
        str: the processed words joined by single spaces, for every
        combination of the two flags. (Previously the result was a list when
        both flags were false, and ``remove_stopwords=False`` together with
        ``stem_words=True`` raised ``AttributeError``.)
    """
    global _english_stops, _english_stemmer

    # Normalise whitespace first: newlines and tabs are not in the character
    # class kept by special_character_removal, so without this step
    # "foo\nbar" would be fused into the single token "foobar".
    text = " ".join(text.split())

    # Remove special characters, then replace numbers.
    text = special_character_removal.sub('', text)
    text = replace_numbers.sub('n', text)

    # Work on a list of lower-cased words from here on; join once at the end.
    words = text.lower().split()

    if remove_stopwords:
        if _english_stops is None:
            _english_stops = frozenset(stopwords.words("english"))
        words = [w for w in words if w not in _english_stops]

    if stem_words:
        if _english_stemmer is None:
            _english_stemmer = SnowballStemmer('english')
        words = [_english_stemmer.stem(w) for w in words]

    return " ".join(words)

In [None]:
def get_glove_dataset(dataset):
    """Download the requested glove dataset from files.fast.ai
    and return a location that can be passed to load_vectors.

    Args:
        dataset: dataset key such as '6B.50d'; it is used both as the file
            name handed to ``get_file`` and to build the download URL
            ``http://files.fast.ai/models/glove/<dataset>.tgz``.

    Returns:
        Whatever ``get_file`` returns for the (untarred) download.

    For keys missing from the checksum table below, ``md5_hash=None`` is
    passed to ``get_file``.
    """
    # see wordvectors.ipynb for info on how these files were
    # generated from the original glove data.
    md5sums = {'6B.50d': '8e1557d1228decbda7db6dfd81cd9909',
               '6B.100d': 'c92dbbeacde2b0384a43014885a60b2c',
               '6B.200d': 'af271b46c04b0b2e41a84d8cd806178d',
               '6B.300d': '30290210376887dcc6d0a5a6374d8255'}
    glove_path = os.path.abspath('data/glove/results')
    # Create the cache directory in pure Python instead of the `%mkdir -p`
    # magic: that only exists under IPython, depends on the host shell, and
    # breaks when the (unquoted) path contains spaces.
    os.makedirs(glove_path, exist_ok=True)
    return get_file(dataset,
                    'http://files.fast.ai/models/glove/' + dataset + '.tgz',
                    cache_subdir=glove_path,
                    md5_hash=md5sums.get(dataset),
                    untar=True)

In [None]:
def load_vectors(loc):
    """Load the three files that share the path prefix ``loc``.

    Args:
        loc: path prefix; ``loc + '.dat'``, ``loc + '_words.pkl'`` and
            ``loc + '_idx.pkl'`` are read.

    Returns:
        A 3-tuple ``(vectors, words, word_index)``: the result of
        ``load_array(loc + '.dat')`` followed by the two unpickled objects,
        in that order.
    """
    vectors = load_array(loc + '.dat')

    # NOTE(review): pickle.load can execute arbitrary code while loading;
    # only point this at files from a trusted source.
    # `with` guarantees both file handles are closed (they used to be leaked).
    with open(loc + '_words.pkl', 'rb') as words_file:
        words = pickle.load(words_file, encoding='latin1')
    with open(loc + '_idx.pkl', 'rb') as idx_file:
        word_index = pickle.load(idx_file, encoding='latin1')

    return (vectors, words, word_index)

In [None]:
# Fetch the '6B.50d' GloVe files and load the vector array, the word list
# and the word -> row lookup.
glove_location = get_glove_dataset('6B.50d')
vecs, words, wordidx = load_vectors(glove_location)


In [None]:
# Clean every training comment with text_to_wordlist's default options.
comments = [text_to_wordlist(raw_comment) for raw_comment in list_sentences_train]

In [None]:
# Sanity check: show the first cleaned training comment.
first_cleaned_comment = comments[0]
print(first_cleaned_comment)

In [None]:
# Clean every test comment the same way as the training comments.
test_comments = list(map(text_to_wordlist, list_sentences_test))

## Vectorize words

In [None]:
# Characters handed to the tokenizer as its `filters` argument.
token_filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\''
tokenizer = Tokenizer(num_words=max_features, filters=token_filters, lower=True)

# Fit the vocabulary on train and test comments together, then turn the
# training comments into sequences of word indices.
all_comments = comments + test_comments
tokenizer.fit_on_texts(all_comments)
comments_sequence = tokenizer.texts_to_sequences(comments)

In [None]:
# Index sequences for the test comments, using the vocabulary fitted above.
test_comments_sequence = tokenizer.texts_to_sequences(test_comments)

# Run both sets through pad_sequences with the same maxlen.
pad_options = dict(maxlen=maxlen)
X_train = sequence.pad_sequences(comments_sequence, **pad_options)
Test_train = sequence.pad_sequences(test_comments_sequence, **pad_options)

In [None]:
# Mean and standard deviation over all entries of the GloVe array; used to
# draw random initial rows for the embedding matrix.
emb_mean = vecs.mean()
emb_std = vecs.std()
(emb_mean, emb_std)


In [None]:
word_index = tokenizer.word_index
embed_size = vecs.shape[1]

# Size the matrix by the tokenizer's vocabulary, not by the GloVe vocabulary:
# the tokenizer was created with num_words=max_features, so the sequences it
# produces only contain indices below max_features (and at most
# len(word_index), index 0 being unused by word_index). Using vecs.shape[0]
# allocated one row per GloVe word, almost all of them never looked up.
nb_words = min(max_features, len(word_index) + 1)

# Start from random rows drawn with the GloVe mean/std, then overwrite the
# row of every word that has a GloVe vector.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= nb_words:
        continue
    vec_idx = wordidx.get(word)
    if vec_idx is not None:
        embedding_matrix[i] = vecs[vec_idx]
   

In [None]:
## Model

In [None]:
# Insert a singleton axis so each sample has shape (1, sequence_length), which
# is what the model's Input(shape=(1, maxlen)) expects.
# The last axis is read with shape[-1] (not shape[1]) so that re-running this
# cell on already-reshaped arrays is a no-op instead of raising.
X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[-1]))
Test_train = Test_train.reshape((Test_train.shape[0], 1, Test_train.shape[-1]))

In [None]:
number_filters = 32


def _conv_branch(embedded, kernel_height, activation):
    """Build one Conv2D -> BatchNormalization -> Activation -> MaxPooling2D branch.

    Args:
        embedded: output tensor of the Embedding layer.
        kernel_height: first dimension of the convolution kernel; the second
            dimension is always ``embed_size``.
        activation: activation name passed to ``Activation`` (e.g. 'relu').

    Returns:
        The max-pooled output tensor of the branch.
    """
    branch = Conv2D(number_filters, (kernel_height, embed_size),
                    data_format='channels_first', padding='same')(embedded)
    # NOTE(review): BatchNormalization is left at its default axis although
    # the data is channels_first — confirm whether axis=1 was intended.
    branch = BatchNormalization()(branch)
    branch = Activation(activation)(branch)
    # Pool window along axis 2 is that axis' length divided by 1.5, truncated.
    pool_height = int(int(branch.shape[2]) / 1.5)
    return MaxPooling2D((pool_height, 1), data_format='channels_first')(branch)


inp = Input(shape=(1, maxlen,))
x = Embedding(nb_words, embed_size, weights=[embedding_matrix], trainable=False)(inp)

# Five parallel branches with kernel heights 3..7, alternating relu / elu.
branch_specs = [(3, 'relu'), (4, 'elu'), (5, 'relu'), (6, 'elu'), (7, 'relu')]
branches = [_conv_branch(x, height, act) for height, act in branch_specs]

# NOTE(review): `merge` is not imported explicitly in this notebook, so it
# presumably comes from the `utils` star import (legacy Keras merge) — confirm
# which merge mode it applies.
x = merge(branches)
x = BatchNormalization()(x)
# The result of this convolution used to be discarded (no assignment), so the
# layer never became part of the model.
x = Conv2D(number_filters * 2, (3, 3), data_format='channels_first', padding='same')(x)
x = BatchNormalization()(x)
x = Activation('relu')(x)
# A (1, 1) pool window leaves the tensor unchanged; kept from the original.
x = MaxPooling2D((1, 1), data_format='channels_first')(x)
x = Flatten()(x)
x = Dropout(0.1)(x)
x = Dense(128, activation="elu")(x)

# One output per column of list_classes. The targets are separate label
# columns (multi-label), so each output gets its own sigmoid and the loss is
# binary cross-entropy. Softmax + categorical cross-entropy made the labels
# compete with each other and gives zero loss for rows without any positive
# label.
x = Dense(len(list_classes), activation="sigmoid")(x)
model2 = Model(inputs=inp, outputs=x)
model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model2.summary()

In [None]:
# Train for one epoch. The mini-batch size comes from the `batch_size`
# setting in the configuration cell instead of a second hard-coded 64.
model2.fit(X_train, labels_train, batch_size=batch_size, epochs=1)

In [None]:
# Unfreeze the embedding layer of model2. The previous line referenced an
# undefined name `model` and index 0; the layer is now located by type
# instead of by position.
# NOTE(review): in Keras a changed `trainable` flag only takes effect after
# the model is compiled again — recompile before any further fit() call.
for layer in model2.layers:
    if isinstance(layer, Embedding):
        layer.trainable = True

In [None]:
# Predict the test set, drop the predictions into the sample-submission
# frame (one column per entry of list_classes) and write it out as CSV.
preds = model2.predict(Test_train)

submission_template = f'{path}sample_submission.csv'
sample_submission = pd.read_csv(submission_template)
sample_submission[list_classes] = preds

sample_submission.to_csv('submission_textcnn.csv', index=False)

In [None]:
# Persist the model's weights to disk.
weights_file = "model_textcnn.h5"
model2.save_weights(weights_file)