In [1]:
import pandas as pd
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation, GRU, concatenate
from keras.layers import Bidirectional, GlobalMaxPool1D, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.models import Model
from keras import initializers, regularizers, constraints, optimizers, layers
import numpy as np

Using TensorFlow backend.


In [2]:
# Read the labelled training set and the unlabelled test set from the
# relative data/ directory in one pass.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ("data/train.csv", "data/test.csv")
)

In [3]:
# Raw comment text for both splits (pandas Series of the "Comment" column).
texts_train, texts_test = df_train["Comment"], df_test["Comment"]

# Training targets as a plain numpy array; only the training frame is read
# for labels here.
y_train = df_train["Insult"].values

In [4]:
# --- Text preprocessing -----------------------------------------------------
# Passed as `num_words` to the Tokenizer.
vocab_size = 1000
# Length every sequence is padded/truncated to; also the model's input length.
maxlen = 50

# --- Pre-trained embeddings -------------------------------------------------
# Dimensionality of the word vectors; also selects which GloVe file is read.
embed_dim = 50
# Path prefix of the GloVe file; "<embed_dim>d.txt" is appended when loading.
glove_path = "glove/glove.twitter.27B."

In [5]:


# Build the vocabulary from the training comments only; the test comments are
# encoded with the same fitted tokenizer.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(list(texts_train))

# Text -> lists of integer word indices.
tokenized_train, tokenized_test = (
    tokenizer.texts_to_sequences(texts)
    for texts in (texts_train, texts_test)
)

# Integer lists -> fixed-width arrays with `maxlen` columns.
X_train, X_test = (
    pad_sequences(sequences, maxlen=maxlen)
    for sequences in (tokenized_train, tokenized_test)
)



In [6]:
def get_coefs(word, *arr):
    """Turn the fields of one split GloVe line into a ``(word, vector)`` pair.

    Parameters
    ----------
    word : str
        First field of the line (the token).
    *arr : str
        Remaining fields; converted together into one float32 array.

    Returns
    -------
    tuple
        ``(word, numpy.ndarray)`` where the array has dtype float32.
    """
    return word, np.asarray(arr, dtype='float32')


# Map token -> embedding vector, read from "<glove_path><embed_dim>d.txt".
embeddings_index = {}
# The context manager closes the file handle (the bare open() leaked it), and
# the explicit encoding avoids depending on the platform's locale default.
# NOTE(review): the GloVe text file is assumed to be UTF-8 -- confirm.
with open(glove_path + str(embed_dim) + 'd.txt', encoding='utf8') as glove_file:
    for line in glove_file:
        fields = line.strip().split()
        # Skip blank or malformed lines: every vector must have exactly
        # `embed_dim` components, otherwise np.stack below fails on a
        # shape mismatch.
        if len(fields) != embed_dim + 1:
            continue
        word, vector = get_coefs(*fields)
        embeddings_index[word] = vector

# np.stack needs a real sequence; a dict view is rejected by newer NumPy.
all_embs = np.stack(list(embeddings_index.values()))
# Global mean / std over all embedding components; used to initialise rows
# for words that have no pre-trained vector.
emb_mean, emb_std = all_embs.mean(), all_embs.std()

In [7]:
word_index = tokenizer.word_index

# Number of rows in the embedding matrix. Keras word indices start at 1
# (index 0 is reserved), so a vocabulary of N words needs N + 1 rows. Without
# the "+ 1", a corpus with fewer than `vocab_size` distinct words would leave
# its highest-indexed word without a row and push the Embedding lookup out of
# range. When the corpus has at least `vocab_size` words the result is
# unchanged (`vocab_size`).
nb_words = min(vocab_size, len(word_index) + 1)

# Start from random rows drawn with the same mean/std as the pre-trained
# vectors, then overwrite the rows for which a GloVe vector exists.
embedding_matrix = np.random.normal(emb_mean, emb_std, (nb_words, embed_dim))
for word, i in word_index.items():
    # Indices at or beyond nb_words have no row in the matrix.
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector


In [8]:
# Network: pre-initialised embedding -> 2 stacked bidirectional LSTMs ->
# max-pool over the sequence axis -> single sigmoid unit (binary classifier).

lstm_nodes = 32

inp = Input(shape=(maxlen,))

# Embedding layer starts from the GloVe-initialised matrix built earlier.
x = Embedding(nb_words, embed_dim, weights=[embedding_matrix])(inp)

# Two identical BiLSTM layers; each returns the full sequence so the next
# layer (and the pooling) can see every position.
for _depth in range(2):
    recurrent = LSTM(
        lstm_nodes,
        return_sequences=True,
        dropout=0.1,
        recurrent_dropout=0.1,
    )
    x = Bidirectional(recurrent)(x)

x = GlobalMaxPool1D()(x)
x = Dense(1, activation="sigmoid")(x)

model = Model(inputs=inp, outputs=x, name="lstm")
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 50)                0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 50, 50)            50000     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 50, 64)            21248     
_________________________________________________________________
bidirectional_2 (Bidirection (None, 50, 64)            24832     
_________________________________________________________________
global_max_pooling1d_1 (Glob (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 96,145
Trainable params: 96,145
Non-trainable params: 0
_________________________________________________________________


In [9]:
# Train the model; Keras holds out 10% of the training data for validation
# (validation_split) and prints one line per epoch (verbose=2).
history = model.fit(
    X_train,
    y_train,
    batch_size=16,
    epochs=20,
    verbose=2,
    validation_split=0.1,
)

Train on 3552 samples, validate on 395 samples
Epoch 1/20
 - 20s - loss: 0.5446 - acc: 0.7416 - val_loss: 0.4489 - val_acc: 0.7899
Epoch 2/20
 - 18s - loss: 0.4047 - acc: 0.8201 - val_loss: 0.3748 - val_acc: 0.8481
Epoch 3/20
 - 18s - loss: 0.3547 - acc: 0.8502 - val_loss: 0.3729 - val_acc: 0.8456
Epoch 4/20
 - 19s - loss: 0.3310 - acc: 0.8590 - val_loss: 0.3636 - val_acc: 0.8430
Epoch 5/20
 - 22s - loss: 0.3088 - acc: 0.8719 - val_loss: 0.3827 - val_acc: 0.8354
Epoch 6/20
 - 23s - loss: 0.2924 - acc: 0.8801 - val_loss: 0.3762 - val_acc: 0.8456
Epoch 7/20
 - 21s - loss: 0.2764 - acc: 0.8818 - val_loss: 0.3813 - val_acc: 0.8481
Epoch 8/20
 - 21s - loss: 0.2692 - acc: 0.8953 - val_loss: 0.3724 - val_acc: 0.8481
Epoch 9/20
 - 22s - loss: 0.2507 - acc: 0.9026 - val_loss: 0.4557 - val_acc: 0.8076
Epoch 10/20
 - 23s - loss: 0.2326 - acc: 0.9048 - val_loss: 0.4172 - val_acc: 0.8430
Epoch 11/20
 - 22s - loss: 0.2188 - acc: 0.9113 - val_loss: 0.4206 - val_acc: 0.8430
Epoch 12/20
 - 21s - loss: 

In [11]:
# Score the test set; the output comes from the model's sigmoid unit.
y_predict = model.predict(
    [X_test],
    batch_size=1024,
    verbose=1,
)



In [16]:
# Binarise the predicted scores: 1 where the score is strictly above 0.5,
# 0 otherwise. Shape and dtype follow y_predict.
y_pred = (y_predict > 0.5).astype(y_predict.dtype)


In [18]:
# y_pred has one column per model output (a single sigmoid unit here).
# Flatten it to 1-D and cast to int so the "Insult" column holds 0/1 class
# labels instead of 0.0/1.0 floats.
df_test["Insult"] = np.asarray(y_pred).ravel().astype(int)

# Preview the first rows only rather than rendering the whole frame.
df_test.head(10)


Unnamed: 0,id,Date,Comment,Insult
0,1,20120603163526Z,"""like this if you are a tribe fan""",0.0
1,2,20120531215447Z,"""you're idiot.......................""",1.0
2,3,20120823164228Z,"""I am a woman Babs, and the only ""war on women...",0.0
3,4,20120826010752Z,"""WOW & YOU BENEFITTED SO MANY WINS THIS YEAR F...",1.0
4,5,20120602223825Z,"""haha green me red you now loser whos winning ...",1.0
5,6,20120603202442Z,"""\nMe and God both hate-faggots.\n\nWhat's the...",0.0
6,7,20120603163604Z,"""Oh go kiss the ass of a goat....and you DUMMY...",0.0
7,8,20120602223902Z,"""Not a chance Kid, you're wrong.""",0.0
8,9,20120528064125Z,"""On Some real Shit FUck LIVE JASMIN!!!""",0.0
9,10,20120603071243Z,"""ok but where the hell was it released?you all...",0.0
