In [1]:
import pandas as pd

from src.SemEvalData import SemEvalData
from src.JigsawData import JigsawData
from nltk import tokenize
import nltk
import keras
from keras.preprocessing.text import Tokenizer,  text_to_word_sequence
from keras.engine.topology import Layer
from keras import initializers as initializers, regularizers, constraints
from keras.callbacks import Callback, ModelCheckpoint
from keras.utils.np_utils import to_categorical
from keras.layers import Embedding, Input, Dense, LSTM, GRU, Bidirectional, TimeDistributed, Dropout, Flatten, SpatialDropout1D
from keras import backend as K
from keras import optimizers
from keras.models import Model
from src.Attention import Attention
import re
import numpy as np
import os
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.metrics import roc_auc_score, accuracy_score
from src.preprocessing import get_embeddings_index, get_embeddings_matrix, getSpansByToxicWords
from keras import Sequential
from test_sentence import preprocess_lstm, test_lime, vectorize, Transform, getPredictedWordsFromSentence
import pickle
from sklearn.pipeline import make_pipeline
from lime.lime_text import LimeTextExplainer

In [5]:
## TODO: move these settings to a .env file
MAX_FEATURES = 200000 # maximum number of unique words that should be included in the tokenized word index
MAX_WORD_NUM = 40     # maximum number of word tokens kept per sentence (width of the padded index matrix built below)
EMBED_SIZE = 50  ## GloVe dimension; NOTE(review): not referenced in the visible cells - the word2vec matrix below uses EMBEDDING_DIM = 300
VAL_SPLIT = 0.2  # fraction of the shuffled samples held out for validation
REG_PARAM = 1e-13  # coefficient passed to the L2 regularizer below
l2_reg = regularizers.l2(REG_PARAM)  # NOTE(review): not attached to any layer in the visible cells

In [5]:
# Fetch the WordNet corpus (no-op when already present); presumably required by the project's preprocessing helpers - TODO confirm
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/patrycja/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [6]:
## load data
# NOTE(review): this reads data/tsd_trial.csv as the training set, and the evaluation
# section further down loads the same file - confirm that the overlap is intended.
train_data_semeval = SemEvalData(MAX_WORD_NUM)
train_data_semeval.load_data("data/tsd_trial.csv")
# preprocess() is a project helper; judging by the displayed frame it adds per-document
# `sentences` and `toxicity_sentence` list columns.
train_df_preprocessed = train_data_semeval.preprocess()

In [7]:
# Display the preprocessed training frame (one row per document).
train_df_preprocessed

Unnamed: 0,spans,text,toxicity,toxic_words,original_text,sentences,diff,toxicity_sentence
0,"[15, 16, 17, 18, 19, 27, 28, 29, 30, 31]",because hes a moron and a bigot. its not any m...,1,"[moron, bigot]",Because he's a moron and a bigot. It's not any...,"[because hes a moron and a bigot., its not any...","[10, 36]","[1.0, 0.0]"
1,"[29, 30, 31, 32, 33, 34]",how about we stop protecting idiots and let na...,1,[idiots],How about we stop protecting idiots and let na...,[how about we stop protecting idiots and let n...,[],"[1.0, 0.0]"
2,"[166, 167, 168, 169, 170, 171]","if people were smart, they would boycott th...",1,[idiots],"If people were smart, they would Boycott th...","[if people were smart, they would boycott t...",[],[1.0]
3,"[87, 88, 89, 90, 91, 92]",trump claimed that russia will never invade th...,1,[stupid],Trump Claimed that Russia will never invade th...,[trump claimed that russia will never invade t...,[],[1.0]
4,[],as long as your willing to pay a lot more for ...,0,[],As long as your willing to pay a lot more for ...,[as long as your willing to pay a lot more for...,[148],"[0.0, 0.0, 0.0]"
...,...,...,...,...,...,...,...,...
685,"[129, 130, 131, 132, 133, 134]",but ... trumps not bluffing. hes prepared to g...,1,[stupid],But ... Trump's not bluffing. He's prepared to...,"[but ... trumps not bluffing., hes prepared to...","[13, 32, 151, 159, 166]","[0.0, 0.0, 0.0, 1.0, 0.0]"
686,"[126, 127, 128, 129, 130, 131]",cant believe the limited knowledge of this art...,1,[stupid],Can't believe the limited knowledge of this Ar...,[cant believe the limited knowledge of this ar...,[3],"[0.0, 1.0]"
687,"[24, 25, 26, 27, 28, 29]",i think it conservative idiots who cannot reac...,1,[idiots],I think it conservative idiots who cannot reac...,[i think it conservative idiots who cannot rea...,[],[1.0]
688,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",youre an id*ot...go away.,1,[youre an id*ot],You're an id*ot...Go away.,[youre an id*ot...go away.],[3],[1.0]


In [8]:
# Scratch containers; left defined in case other cells still reference them.
paras = []
labels = []
texts = []
sent_lens = []
sent_nums = []

## flatten the per-document lists into one row per sentence
len_tr = len(train_df_preprocessed)
# result = train_df_preprocessed.append(extra_train_df, ignore_index=True, sort=False)
result = train_df_preprocessed
# Explicit flattening instead of `Series.sum()`: summing list cells re-copies the
# accumulated list for every document (quadratic) and yields 0 for an empty frame.
train_data = {
    'sentence': [sentence for doc_sentences in result.sentences for sentence in doc_sentences],
    'toxicity_sentence': [flag for doc_flags in result.toxicity_sentence for flag in doc_flags],
}

# One row per sentence, paired with its sentence-level toxicity flag.
train_df = pd.DataFrame(train_data, columns=['sentence', 'toxicity_sentence'])
###

In [9]:
# Split every sentence string into word tokens with NLTK.
sentences = [nltk.word_tokenize(sentence_text) for sentence_text in train_df.sentence]

In [10]:
# Keep only purely alphabetic tokens (drops punctuation, numbers, mixed tokens).
sentences_filter = []
sentences[:] = [
    [token for token in token_list if token.isalpha()]
    for token_list in sentences
]

In [11]:
# Remove English stop words from every tokenised sentence.
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
# A set gives constant-time membership tests; the list returned by NLTK would be
# scanned linearly for every single token.
stop_word_set = set(stop_words)
sentences = [
    [token for token in token_list if token not in stop_word_set]
    for token_list in sentences
]

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/patrycja/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [12]:
# Drop sentences that lost every token during filtering. The same rows must be
# removed from `train_df`: labels are later read from `train_df.toxicity_sentence`
# by position, so removing rows from `sentences` alone shifts every following label
# onto the wrong sentence.
non_empty_mask = [len(token_list) > 0 for token_list in sentences]
if len(non_empty_mask) != len(train_df):
    raise ValueError(
        "sentences and train_df are out of sync (%d vs %d rows); re-run the cells that build them"
        % (len(non_empty_mask), len(train_df))
    )
sentences = [token_list for token_list, keep in zip(sentences, non_empty_mask) if keep]
train_df = train_df.loc[non_empty_mask].reset_index(drop=True)

In [13]:
# Build the word -> integer index from the filtered token lists.
tokenizer = Tokenizer(
    num_words=MAX_FEATURES,
    lower=True,
    split=" ",
)
tokenizer.fit_on_texts(sentences)
word_index, word_counts = tokenizer.word_index, tokenizer.word_counts

In [14]:
import gensim

# Pre-trained 300-d Google News word2vec vectors; `limit` caps how many vectors are read.
word_vectors = gensim.models.KeyedVectors.load_word2vec_format(
    'data/GoogleNews-vectors-negative300.bin',
    binary=True,
    limit=1000000,
)

In [15]:
# Dimension of the word2vec vectors loaded above.
EMBEDDING_DIM = 300
# +1 because tokenizer indices start at 1 (row 0 is the padding row); capped at MAX_FEATURES.
vocabulary_size = min(len(word_index) + 1, MAX_FEATURES)
embedding_matrix = np.zeros(shape=(vocabulary_size, EMBEDDING_DIM))

In [16]:
# Copy the pre-trained vector of every indexed word into its row of the matrix;
# words missing from word2vec keep an all-zero row and are counted.
absent_words = 0
for word, i in word_index.items():
    if i < MAX_FEATURES:
        try:
            embedding_vector = word_vectors[word]
        except KeyError:
            embedding_matrix[i] = 0
            absent_words += 1
        else:
            embedding_matrix[i] = embedding_vector

In [17]:
### save the tokenizer to a file so that it can be reused later
import pickle

# saving

#with open('tokenizer_nn.pickle', 'wb') as handle:

#    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [18]:
# Encode every sentence as a fixed-width row of word indices, zero-padded on the
# right and truncated to MAX_WORD_NUM tokens.
data_index = np.zeros((len(sentences), MAX_WORD_NUM), dtype='int32')
vocab = tokenizer.word_index
for i, sentence in enumerate(sentences):
    for k, word in enumerate(sentence[:MAX_WORD_NUM]):
        # Explicit lookup instead of a bare `except: pass`, which would also hide
        # unrelated errors; unknown words simply keep the padding value 0.
        token_id = vocab.get(word)
        if token_id is not None and token_id < MAX_FEATURES:
            data_index[i, k] = token_id

In [19]:
print(data_index[:5])
# Seeded generator so the shuffle (and therefore the train/validation split) is
# reproducible after a kernel restart.
split_rng = np.random.RandomState(42)
indices = np.arange(data_index.shape[0])
split_rng.shuffle(indices)
data = data_index[indices].copy()
##IMPORTANT
data = data.astype(np.float32)
# Labels are taken by position, in the same shuffled order as `data`.
labels = train_df.toxicity_sentence.iloc[indices]
# labels = labels.astype(np.float32)
nb_validation_samples = int(VAL_SPLIT * data.shape[0])
# Split on an explicit index: with negative slices (`[:-n]` / `[-n:]`) a validation
# size of 0 would leave the training set empty and put every sample into validation.
split_at = data.shape[0] - nb_validation_samples
label_column = np.asarray(labels.tolist()).reshape(-1, 1)  # column vector, shape (n, 1)
x_train = data[:split_at]
y_train = label_column[:split_at]
x_val = data[split_at:]
y_val = label_column[split_at:]

[[  34   63  762    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0]
 [1785    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0]
 [  47  763   18   52  764 1081 1786 1787 1082    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0]
 [  44 1788 1083  765 1084    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0]
 [   2  564    6 1085  449 1789  564 1790 1791    4    9   48 1792  179
    18 1086    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0  

In [20]:
# Two-column one-hot targets: column 0 = non-toxic, column 1 = toxic (filled in below).
y_softmax_train = np.zeros(shape=(len(y_train), 2))
y_softmax_val = np.zeros(shape=(len(y_val), 2))

In [21]:
# One-hot encode the training labels: label 0 -> column 0, anything else -> column 1.
train_class_column = (np.asarray(y_train).reshape(-1) != 0).astype(int)
y_softmax_train[np.arange(y_softmax_train.shape[0]), train_class_column] = 1

In [22]:
# One-hot encode the validation labels: label 0 -> column 0, anything else -> column 1.
val_class_column = (np.asarray(y_val).reshape(-1) != 0).astype(int)
y_softmax_val[np.arange(y_softmax_val.shape[0]), val_class_column] = 1

### Build model

In [23]:
# Sentence-level toxicity classifier: pre-trained embeddings -> BiLSTM -> two dense
# layers -> 2-way softmax.
model = Sequential()
# input_dim comes from the weight matrix itself: that matrix is capped at
# MAX_FEATURES rows, so `len(word_index) + 1` would not match it once the
# vocabulary grows beyond the cap.
model.add(Embedding(embedding_matrix.shape[0], EMBEDDING_DIM, weights=[embedding_matrix], input_length=MAX_WORD_NUM, trainable=True, name='embedding'))
model.add(SpatialDropout1D(0.3))
model.add(Bidirectional(LSTM(EMBEDDING_DIM, dropout=0.3, recurrent_dropout=0.3), name='bidirectional'))
model.add(Dense(EMBEDDING_DIM, activation='relu', name='dense'))
# NOTE(review): a dropout rate of 0.8 is unusually aggressive - confirm it is intended.
model.add(Dropout(0.8))
model.add(Dense(EMBEDDING_DIM, activation='relu', name='dense2'))
model.add(Dropout(0.8))
model.add(Dense(2, activation='softmax', name='dense_final'))
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 40, 300)           1349100   
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 40, 300)           0         
_________________________________________________________________
bidirectional (Bidirectional (None, 600)               1442400   
_________________________________________________________________
dense (Dense)                (None, 300)               180300    
_________________________________________________________________
dropout (Dropout)            (None, 300)               0         
_________________________________________________________________
dense2 (Dense)               (None, 300)               90300     
_________________________________________________________________
dropout_1 (Dropout)          (None, 300)               0

In [24]:
# NOTE(review): with a 2-unit softmax and one-hot targets, 'categorical_crossentropy'
# is the conventional loss; left unchanged here - confirm before switching.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc']) ##adam
# Keep only the weights with the lowest validation loss; verbose=0 (silent) replaces
# the out-of-range value -2, which had the same effect.
checkpoint = ModelCheckpoint('best_model.h5', verbose=0, monitor='val_loss', save_best_only=True, mode='auto')
# summary() prints on its own and returns None, so wrapping it in print() only adds a stray "None".
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 40, 300)           1349100   
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 40, 300)           0         
_________________________________________________________________
bidirectional (Bidirectional (None, 600)               1442400   
_________________________________________________________________
dense (Dense)                (None, 300)               180300    
_________________________________________________________________
dropout (Dropout)            (None, 300)               0         
_________________________________________________________________
dense2 (Dense)               (None, 300)               90300     
_________________________________________________________________
dropout_1 (Dropout)          (None, 300)               0

In [25]:
# Train, checkpointing the best epoch, then plot the learning curves.
history = model.fit(x_train, y_softmax_train, validation_data=(x_val, y_softmax_val), epochs=25, batch_size=1024, shuffle=True, callbacks=[checkpoint])
print(history.history.keys())


def plot_training_curve(history, metric, label):
    """Plot the per-epoch training and validation values of one recorded metric.

    history: the object returned by ``model.fit``.
    metric:  key in ``history.history`` (its validation twin is ``'val_' + metric``).
    label:   human-readable name used for the title and the y axis.
    """
    plt.plot(history.history[metric])
    plt.plot(history.history['val_' + metric])
    plt.title('model ' + label)
    plt.ylabel(label)
    plt.xlabel('epoch')
    # The second curve comes from validation_data, not from a held-out test set.
    plt.legend(['train', 'validation'], loc='upper left')
    plt.show()


plot_training_curve(history, 'acc', 'accuracy')
plot_training_curve(history, 'loss', 'loss')

Epoch 1/25
Epoch 2/25
Epoch 3/25


KeyboardInterrupt: 

### Evaluation

In [3]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report

from lime import lime_tabular

In [6]:
## load data
# NOTE(review): this is the same file (data/tsd_trial.csv) that the training section
# loads - confirm the evaluation set is meant to be this split.
# Relies on MAX_WORD_NUM from the configuration cell at the top of the notebook.
test_data_semeval = SemEvalData(MAX_WORD_NUM)
test_data_semeval.load_data("data/tsd_trial.csv")
test_df_preprocessed = test_data_semeval.preprocess()

In [7]:
# Flatten the per-document lists into one row per sentence.
result = test_df_preprocessed
# Explicit flattening instead of `Series.sum()`: summing list cells re-copies the
# accumulated list for every document (quadratic) and yields 0 for an empty frame.
test_data = {
    'sentence': [sentence for doc_sentences in result.sentences for sentence in doc_sentences],
    'toxicity_sentence': [flag for doc_flags in result.toxicity_sentence for flag in doc_flags],
}

# One row per sentence, paired with its sentence-level toxicity flag.
test_df = pd.DataFrame(test_data, columns=['sentence', 'toxicity_sentence'])

In [8]:
# Load a previously trained model from disk.
# NOTE(review): this is not the 'best_model.h5' checkpoint written by the training
# section above - confirm which saved model the evaluation should use.
model = keras.models.load_model("lstm_drop_jul_train.h5")

In [9]:
# Print the architecture of the loaded model as a sanity check.
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 40, 300)           5606100   
_________________________________________________________________
spatial_dropout1d_2 (Spatial (None, 40, 300)           0         
_________________________________________________________________
bidirectional (Bidirectional (None, 600)               1442400   
_________________________________________________________________
dense (Dense)                (None, 300)               180300    
_________________________________________________________________
dropout_4 (Dropout)          (None, 300)               0         
_________________________________________________________________
dense2 (Dense)               (None, 300)               90300     
_________________________________________________________________
dropout_5 (Dropout)          (None, 300)              

In [10]:
#y_pred = np.argmax(model.predict(x_test), axis=1)
#y_true = y_test
#print(y_pred)
#print(classification_report(y_true, y_pred))

Adapting explanation to sentences

In [11]:
# Preview the first 18 preprocessed test documents.
test_df_preprocessed[:18]

Unnamed: 0,spans,text,toxicity,toxic_words,original_text,sentences,diff,toxicity_sentence
0,"[15, 16, 17, 18, 19, 27, 28, 29, 30, 31]",because hes a moron and a bigot. its not any m...,1,"[moron, bigot]",Because he's a moron and a bigot. It's not any...,"[because hes a moron and a bigot., its not any...","[10, 36]","[1.0, 0.0]"
1,"[29, 30, 31, 32, 33, 34]",how about we stop protecting idiots and let na...,1,[idiots],How about we stop protecting idiots and let na...,[how about we stop protecting idiots and let n...,[],"[1.0, 0.0]"
2,"[166, 167, 168, 169, 170, 171]","if people were smart, they would boycott th...",1,[idiots],"If people were smart, they would Boycott th...","[if people were smart, they would boycott t...",[],[1.0]
3,"[87, 88, 89, 90, 91, 92]",trump claimed that russia will never invade th...,1,[stupid],Trump Claimed that Russia will never invade th...,[trump claimed that russia will never invade t...,[],[1.0]
4,[],as long as your willing to pay a lot more for ...,0,[],As long as your willing to pay a lot more for ...,[as long as your willing to pay a lot more for...,[148],"[0.0, 0.0, 0.0]"
5,"[8, 9, 10, 11, 12]",only an idiot would use and believe anything t...,1,[idiot],Only an idiot would use and believe anything t...,[only an idiot would use and believe anything ...,[],[1.0]
6,"[265, 266, 267, 268, 269, 270, 271, 272, 273, ...",thanks a lot douchebag. youre the reason the p...,1,[o try to turn salem into some kind of new-stu...,Thanks a lot douchebag. You're the reason the ...,"[thanks a lot douchebag., youre the reason the...","[27, 467]","[0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0]"
7,[],kick all the non human criminal illegals out o...,0,[],kick all the non human criminal illegals out o...,[kick all the non human criminal illegals out ...,[],[0.0]
8,"[38, 39, 40, 41, 42, 43]",because driving under ontario laws is stupid e...,1,[stupid],Because driving under Ontario laws is stupid e...,[because driving under ontario laws is stupid ...,[],[1.0]
9,"[277, 278, 279, 280, 281, 282, 283, 284, 285, ...",youre wrong. the delay between retirement and...,1,[dont make ignorant statements],You're wrong. The delay between retirement an...,"[youre wrong., the delay between retirement an...","[3, 264, 280]","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0]"


In [None]:
# Run the project's LIME-based predictor on the `original_text` of every document.
# What test_lime returns is not visible here; the scoring cells compare it with `spans`.
test_df_preprocessed["predicted_span"] = [
    test_lime(document_text)
    for document_text in test_df_preprocessed["original_text"]
]


['hes', 'a'] because hes a moron and a bigot. its not any more complicated than that. []

['how', 'we', 'stop', 'protecting'] how about we stop protecting idiots and let nature add some bleach to the gene pool. we can always submit their names for the darwin awards. []

[] if people  were  smart, they would  boycott this  inept  airline,  but   they  are  not  smart,  so   rogue  businesses  like  this  one,   still thrive   taking the idiots  for  a ride... []

[] trump claimed that russia will never invade the ukraine, when russia already has - how stupid can people be? []

['call', 'we', 'get', 'to'] as long as your willing to pay a lot more for products you buy, then fine. but you better not be going to costco and walmart to buy stuff because its cheaper. if so, we get to call you a hypocritical wanker. []

['would', 'idiot', 'believe', 'anything'] only an idiot would use and believe anything this this republican propaganda machine publishes: www.realclearpolitics.com []

['thanks


['laziness'] liberalism leads to stupidity and laziness, leading largassitis. []

[] yes, remember the idiocy, hydrogen would make petroleum obsolete. []

[] beijing millionaires and wanna be beijing millionairesses who squeeze money from the rest of us! and stupid blind white men with blinders! []

['article', 'would', 'you', 'so', 'if', 'realize', 'reffer', 'guarantee', 'marijuana', 'alcohol'] are u stupid... did you not read article small amount of marijuana...... massive amounts of meth and herion .... yep blame it on weed.... you obviously  never smoked any. if so you would realize a reffer head would have been to lazy and hungry to steal a boat.. as a matter of fact. i guarantee  alcohol  which is legal same as marijuana  played huge part in this... not defending the meth heads or tweakers out there,  but really your simple minded antics are what is wrong with our community... []

[] this clearly is why junior is not qualified to be pm........hes an idiot ! []

['include'] secti


[] he cancelled because bprder security is bad for the cartels that fund his crooked ass! nice try though, wapodn! []

['learn'] rebuke convicts idiots, the simple will learn to obey. []

['what', 'geez'] geez, what an idiot. []

[] oh my here we go again.....another day in the lives of hysterical hypocrite liberals and their russian boogey man conspiracies. democrats obvisiously  so butthurt over trumps win that all they have left is impeachment nonsense and verbal attacks to distract the public from their own gross ignorance and anti america, un american direction of their political party. []

['to', 'the', 'pathological', 'guy', 'is', 'a'] if youre dumb enough to believe him on this, then ive got a bridge in brooklyn i will sell you... . the guy is a pathological liar.  dont believe anything he says.  its all lies. []

['know', 'really', 'it', 'regards'] i know it....but, damn it.....hes really good at it.  regards, gary []

['that'] really?  are you that stupid? []

[] we have had

In [None]:
def _span_precision(gold, predicted):
    """Share of predicted items that are also in the gold spans.

    Returns 1 when both collections are empty and 0 when only the prediction is empty.
    """
    if len(predicted) == 0:
        return 1 if len(gold) == 0 else 0
    predicted_set = set(predicted)
    return len(set(gold).intersection(predicted_set)) / len(predicted_set)


test_df_preprocessed["Pscore"] = [
    _span_precision(s, ps)
    for s, ps in zip(test_df_preprocessed["spans"], test_df_preprocessed["predicted_span"])
]

In [None]:
def _span_recall(gold, predicted):
    """Share of gold span items that were also predicted.

    Returns 1 when both collections are empty and 0 when only the gold spans are empty.
    """
    if len(gold) == 0:
        return 1 if len(predicted) == 0 else 0
    gold_set = set(gold)
    return len(gold_set.intersection(set(predicted))) / len(gold_set)


test_df_preprocessed["Rscore"] = [
    _span_recall(s, ps)
    for s, ps in zip(test_df_preprocessed["spans"], test_df_preprocessed["predicted_span"])
]

In [None]:
def _f1(p, r):
    """Harmonic mean of precision and recall; defined as 0 when both are 0."""
    if p == 0 and r == 0:
        return 0
    return 2 * p * r / (p + r)


test_df_preprocessed["Fscore"] = [
    _f1(p, r)
    for p, r in zip(test_df_preprocessed["Pscore"], test_df_preprocessed["Rscore"])
]

In [None]:
# Display the test frame with the added predicted_span / Pscore / Rscore / Fscore columns.
test_df_preprocessed

In [None]:
# Average of the per-document F-scores.
F_score = test_df_preprocessed["Fscore"].mean()

In [None]:
# Show the mean F-score over all test documents.
F_score