In [13]:
%matplotlib inline
from keras.preprocessing.text import text_to_word_sequence
from utils import *
from __future__ import division, print_function
import sys, os, re, csv, codecs, numpy as np, pandas as pd

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers import Bidirectional, GlobalMaxPool1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

## Load the data

In [14]:
path = "data/"
batch_size=64
max_features = 20000
maxlen = 100

In [15]:
_trainData = pd.read_csv(path+'train.csv')

In [16]:
list_sentences_train = _trainData["comment_text"].fillna("_na_").values
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]
labels_train = _trainData[list_classes].values

In [17]:
_testData = pd.read_csv(path+'test.csv')

In [18]:
list_sentences_test = _testData["comment_text"].fillna("_na_").values

## Process sentences

In [19]:
special_character_removal=re.compile(r'[^a-z\d ]',re.IGNORECASE)
replace_numbers=re.compile(r'\d+',re.IGNORECASE)

In [20]:
def text_to_wordlist(text, remove_stopwords=True, stem_words=True):
    #Remove Special Characters
    text=special_character_removal.sub('',text)
    
    #Replace Numbers
    text=replace_numbers.sub('n',text)
    # Clean the text, with the option to remove stopwords and to stem words.
    # Convert words to lower case and split them
    text = text.lower().split()

    # Optionally, remove stop words
    if remove_stopwords:
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
        text = " ".join(text)

    # Optionally, shorten words to their stems
    if stem_words:
        text = text.split()
        stemmer = SnowballStemmer('english')
        stemmed_words = [stemmer.stem(word) for word in text]
        text = " ".join(stemmed_words)
    
    # Return a list of words
    return(text)

In [21]:
def get_glove_dataset(dataset):
    """Download the requested glove dataset from files.fast.ai
    and return a location that can be passed to load_vectors.
    """
    # see wordvectors.ipynb for info on how these files were
    # generated from the original glove data.
    md5sums = {'6B.50d': '8e1557d1228decbda7db6dfd81cd9909',
               '6B.100d': 'c92dbbeacde2b0384a43014885a60b2c',
               '6B.200d': 'af271b46c04b0b2e41a84d8cd806178d',
               '6B.300d': '30290210376887dcc6d0a5a6374d8255'}
    glove_path = os.path.abspath('data/glove/results')
    %mkdir -p $glove_path
    return get_file(dataset,
                    'http://files.fast.ai/models/glove/' + dataset + '.tgz',
                    cache_subdir=glove_path,
                    md5_hash=md5sums.get(dataset, None),
                    untar=True)

In [22]:
def load_vectors(loc):
    return (load_array(loc+'.dat'),
        pickle.load(open(loc+'_words.pkl','rb'),encoding='latin1'),
        pickle.load(open(loc+'_idx.pkl','rb'),encoding='latin1'))

In [23]:
vecs, words, wordidx = load_vectors(get_glove_dataset('6B.50d'))


A subdirectory or file -p already exists.
Error occurred while processing: -p.
A subdirectory or file c:\Dev\learning\data\glove\results already exists.
Error occurred while processing: c:\Dev\learning\data\glove\results.


In [24]:
comments = []
for text in list_sentences_train:
    comments.append(text_to_wordlist(text))

In [25]:
print(comments[0])

nonsens kiss geek said true ill account termin


In [26]:
test_comments=[]
for text in list_sentences_test:
    test_comments.append(text_to_wordlist(text))

## Vectorize words

In [27]:
tokenizer = Tokenizer(num_words=max_features,filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\'', lower=True)
# tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(comments + test_comments))
comments_sequence = tokenizer.texts_to_sequences(comments)

In [28]:
test_comments_sequence = tokenizer.texts_to_sequences(test_comments)    
X_train = sequence.pad_sequences(comments_sequence , maxlen=maxlen)
Test_train = sequence.pad_sequences(test_comments_sequence, maxlen=maxlen)

In [29]:
emb_mean,emb_std = vecs.mean(), vecs.std()
emb_mean,emb_std


(0.020940498, 0.6441043)

In [30]:
word_index = tokenizer.word_index
nb_words = vecs.shape[0]
embed_size= vecs.shape[1]
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_size))
for word, i in word_index.items():
    if i >= nb_words: continue
    vec_idx = wordidx.get(word)
    if vec_idx is not None:
            embedding_vector = vecs[vec_idx]
            if embedding_vector is not None: embedding_matrix[i] = embedding_vector
   

In [31]:
## model

In [32]:
#X_train = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
#Test_train = Test_train.reshape((Test_train.shape[0], 1, Test_train.shape[1]))

In [34]:
inp = Input(shape=(maxlen,))
x = Embedding(nb_words, embed_size, weights=[embedding_matrix])(inp)
x = Bidirectional(LSTM(50, return_sequences=True, dropout=0.1, recurrent_dropout=0.1))(x)
x = GlobalMaxPool1D()(x)
x = Dense(50, activation="relu")(x)
x = Dropout(0.1)(x)
x = Dense(6, activation="sigmoid")(x)
model = Model(inputs=inp, outputs=x)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         (None, 100)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 100, 50)           20000000  
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100, 100)          40400     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 50)                5050      
_________________________________________________________________
dropout_1 (Dropout)          (None, 50)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 6)                 306       
Total para

In [None]:
model.fit(X_train, labels_train, batch_size=256, epochs=1)

#y_test = model2.predict([X_te], batch_size=1024, verbose=1)

Epoch 1/1

In [None]:
model.save_weights("model_LSTM_1.h5")

In [None]:
model.layers[0].trainable=True

In [None]:
model.fit(X_train, labels_train, batch_size=64, epochs=1)

In [None]:
model.save_weights("model_LSTM_2.h5")

In [None]:
preds = model.predict(Test_train)
sample_submission = pd.read_csv(f'{path}sample_submission.csv')
sample_submission[list_classes] = preds
sample_submission.to_csv('submission_textlstm.csv', index=False)