# Initialise random seeds and the TensorFlow session

In [None]:
# -*- coding: utf-8 -*-
"""
Created on Tue May  1 15:12:54 2018

@author: basharm
"""

import numpy as np
import tensorflow as tf
import random as rn

# Single seed shared by every random number generator configured below
# (Python `random`, NumPy and TensorFlow).
#SEED = 100
SEED = 123

# Reference: https://keras.io/getting-started/faq/#how-can-i-obtain-reproducible-results-using-keras-during-development
# Fixing the hash seed is needed from Python 3.2.3 onwards to get
# reproducible behaviour for certain hash-based operations.
# See these references for further details:
# https://docs.python.org/3.4/using/cmdline.html#envvar-PYTHONHASHSEED
# https://github.com/keras-team/keras/issues/2280#issuecomment-306959926
# NOTE(review): PYTHONHASHSEED is read at interpreter start-up, so assigning
# it here presumably only affects child processes, not this kernel. Export it
# before launching Jupyter if hash randomisation matters - TODO confirm.

import os
os.environ['PYTHONHASHSEED'] = '0'

# Start NumPy's global random number generator in a well-defined state.

np.random.seed(SEED)

# Start Python's built-in `random` module generator in a well-defined state.

rn.seed(SEED)

# Force TensorFlow to use a single thread for both intra-op and inter-op
# parallelism. Multiple threads are a potential source of
# non-reproducible results.
# For further details, see: https://stackoverflow.com/questions/42022950/which-seeds-have-to-be-set-where-to-realize-100-reproducibility-of-training-res
# NOTE(review): tf.ConfigProto / tf.Session / tf.set_random_seed look like
# TensorFlow 1.x-style APIs - confirm the pinned TensorFlow version.

session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)

from keras import backend as K

# Reset the default graph first, then seed it: tf.set_random_seed() gives
# random number generation in the TensorFlow backend a well-defined initial
# state. The seed is set before the session below is created.
# For further details, see: https://www.tensorflow.org/api_docs/python/tf/set_random_seed
tf.reset_default_graph()
tf.set_random_seed(SEED)

# Create the single-threaded session on the freshly seeded default graph and
# register it as the session Keras uses.
sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
K.set_session(sess)

# Rest of code follows ...

# Preprocessing

In [None]:
import re
import html
# Matches any run of one or more space characters.
re1 = re.compile(r' +')

def textFixup(aText):
    """Clean up markup residue in a piece of text.

    Applies a fixed, ordered list of literal substitutions (broken HTML
    entities, escaped newlines/quotes, tokenizer artefacts such as ' @.@ ',
    a mis-encoded apostrophe), then decodes any remaining HTML entities and
    collapses runs of spaces into a single space.

    The order of the substitutions matters: the escaped-newline and
    escaped-quote patterns must be handled before lone backslashes are
    padded with spaces.
    """
    substitutions = (
        ('#39;', "'"),
        ('amp;', '&'),
        ('#146;', "'"),
        ('nbsp;', ' '),
        ('#36;', '$'),
        ('\\n', "\n"),
        ('quot;', "'"),
        ('<br />', "\n"),
        ('\\"', '"'),
        ('<unk>', 'u_n'),
        (' @.@ ', '.'),
        (' @-@ ', '-'),
        ('\\', ' \\ '),
        ('â€™', "'"),
    )
    for needle, replacement in substitutions:
        aText = aText.replace(needle, replacement)

    decoded = html.unescape(aText)
    return re1.sub(' ', decoded)

In [None]:
#from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer

#r_tokenizer = RegexpTokenizer(r'\w+')
# Shared Porter stemmer instance, reused for every tweet.
p_stemmer = PorterStemmer()

def preprocess_aTweet(tweet):
    """Normalise one tweet into a space-separated string of word stems.

    The tweet is lower-cased, cleaned with textFixup(), split into tokens
    with NLTK's word_tokenize(), and each token is reduced to its Porter
    stem before the stems are joined back together with single spaces.
    """
    cleaned = textFixup(tweet.lower())
    stems = (p_stemmer.stem(token) for token in word_tokenize(cleaned))
    return ' '.join(stems)

# Loading Data

In [None]:
import pandas as pd
def load_data_and_labels_csv(fileLoc):
    """Read a labelled-tweet CSV and return (examples, labels).

    The CSV is read with pandas and must contain a 'text' column and an
    'Abuse' column. Each text is passed through preprocess_aTweet(). Each
    label is 0 when the 'Abuse' value equals 0 and 1 for anything else.

    Returns:
        A tuple of two lists of equal length: the preprocessed tweets and
        their binary labels, in file order.
    """
    df = pd.read_csv(fileLoc)
    row_ids = df.index
    examples = [preprocess_aTweet(df['text'][row]) for row in row_ids]
    labels = [0 if df['Abuse'][row] == 0 else 1 for row in row_ids]
    return examples, labels
    
# Load and preprocess every labelled tweet: X holds the cleaned texts,
# y the 0/1 abuse labels.
# NOTE(review): this is an absolute, machine-specific path on a mapped
# Windows drive, so the notebook only runs where that mapping exists -
# consider a configurable data directory.
X, y = load_data_and_labels_csv('U:\\Research\\Projects\\sef\\datamining\\mlonlineabuse\\To_Label\\LabelledTweets_TillNow.csv')

from sklearn.model_selection import train_test_split
# Hold out 20% of the tweets for testing; random_state=SEED keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=SEED)

# Convert the label lists to NumPy arrays: they are later passed to
# model.fit() and used in element-wise arithmetic during evaluation.
ytrain = np.array(y_train)
ytest = np.array(y_test)

# Transforming the data into the format expected by the model

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
# Upper bound on the vocabulary size kept by the tokenizer.
num_words = 100000

# Build the word -> integer index from the training texts only, so the test
# split never influences the vocabulary.
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(X_train)

# Encode the training tweets, then pad every sequence to the length of the
# longest training tweet.
xtrain = tokenizer.texts_to_sequences(X_train)
maxlen = max(len(seq) for seq in xtrain)
xtrain = pad_sequences(xtrain, maxlen=maxlen)

# Encode the test tweets with the same vocabulary and the same length.
xtest = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=maxlen)

# Loading the word embeddings and building the embedding matrix for the vocabulary

In [None]:
from gensim.models import KeyedVectors
# Load the pre-trained word vectors from disk.
# NOTE(review): absolute, machine-specific path - consider making it configurable.
model_ug_cbow = KeyedVectors.load('U:\\Research\\Projects\\sef\\datamining\\mlonlineabuse\\WordEmbedding\\RandomTweet_200d_mincount_100\\vectors.txt')

# Plain dict view of the embedding: word -> vector.
embeddings_index = {}
for w in model_ug_cbow.wv.vocab:
    embeddings_index[w] = model_ug_cbow.wv[w]

# Row i of the matrix holds the vector of the word the tokenizer mapped to
# index i. Rows stay all-zero for words missing from the embedding.
embedding_matrix = np.zeros((num_words, 200))
for word, i in tokenizer.word_index.items():
    # Indices at or beyond num_words fall outside the matrix and are skipped.
    if i < num_words:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

# Creating the CNN model and training it for 10 epochs

In [None]:
from keras.layers import Dense, Dropout
from keras.layers.embeddings import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.layers import Input, concatenate, Activation
from keras.models import Model

def create_cnn_model():
    """Build and compile the multi-branch 1-D CNN tweet classifier.

    Architecture:
      * an integer-sequence input of length ``maxlen``;
      * an embedding layer initialised from ``embedding_matrix`` (200-d,
        trainable) followed by dropout;
      * three parallel Conv1D -> GlobalMaxPooling1D -> Dropout branches with
        kernel sizes 3, 4 and 5 (128, 256 and 512 filters respectively);
      * the concatenated branch outputs fed through a 256-unit ReLU layer,
        dropout, and a single sigmoid output unit.

    Relies on the module-level ``maxlen``, ``num_words`` and
    ``embedding_matrix``.

    Returns:
        The Keras Model, compiled with binary cross-entropy loss, the Adam
        optimizer and the accuracy metric.
    """
    token_ids = Input(shape=(maxlen,), dtype='int32')
    embedded = Embedding(
        num_words,
        200,
        weights=[embedding_matrix],
        input_length=maxlen,
        trainable=True,
    )(token_ids)
    embedded = Dropout(0.5)(embedded)

    # One entry per convolutional branch: (filters, kernel size, dropout rate).
    branch_specs = (
        (128, 3, 0.5),
        (256, 4, 0.2),
        (512, 5, 0.2),
    )
    pooled_branches = []
    for n_filters, kernel_width, drop_rate in branch_specs:
        branch = Conv1D(filters=n_filters, kernel_size=kernel_width, padding='valid',
                        activation='relu', strides=1)(embedded)
        branch = GlobalMaxPooling1D()(branch)
        branch = Dropout(drop_rate)(branch)
        pooled_branches.append(branch)

    hidden = concatenate(pooled_branches, axis=1)
    hidden = Dense(256, activation='relu')(hidden)
    hidden = Dropout(0.5)(hidden)

    # Single output unit followed by a sigmoid activation.
    logit = Dense(1)(hidden)
    probability = Activation('sigmoid')(logit)

    model = Model(inputs=[token_ids], outputs=[probability])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Build the network and train it on the training split for 10 epochs in
# mini-batches of 32. No validation data is passed to fit(), so only
# training loss/accuracy are reported per epoch.
cnn_model = create_cnn_model()
cnn_model.fit(xtrain, ytrain, epochs=10, batch_size=32, verbose=1)

# Evaluating the model on the test dataset

In [None]:
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import roc_curve, auc, roc_auc_score

# Model output for every test tweet: one sigmoid score per row.
p = cnn_model.predict(xtest,verbose=1)
# To ensemble several models, average their outputs here,
# e.g. p = np.mean([p1, p2], axis=0).

# Round each score to a hard 0/1 label.
predicted = np.array([int(round(x[0])) for x in p])
actual = ytest

# Confusion-matrix counts. Both arrays hold only 0/1, so a product is
# non-zero exactly when both factors are "on"; subtracting 1 flips which
# class counts as "on".
tp = np.count_nonzero(predicted * actual)
tn = np.count_nonzero((predicted - 1) * (actual - 1))
fp = np.count_nonzero(predicted * (actual - 1))
fn = np.count_nonzero((predicted - 1) * actual)

print('True Positive', tp)
print('True Negative', tn)
print('False Positive', fp)
print('False Negative', fn)

accuracy = (tp + tn) / (tp + fp + fn + tn)
# Guard the ratios: if the model predicts no positives (or the test set has
# none) the denominators are zero; report 0.0 instead of raising.
precision = tp / (tp + fp) if (tp + fp) else 0.0
recall = tp / (tp + fn) if (tp + fn) else 0.0
fmeasure = (2 * precision * recall) / (precision + recall) if (precision + recall) else 0.0
# Store the result under its own name. Rebinding `cohen_kappa_score` would
# replace the imported function with a float and break any re-run of this cell.
kappa_val = cohen_kappa_score(predicted, actual)
# NOTE: these ROC/AUC values are computed from the thresholded 0/1 labels, so
# the curve has a single operating point. The final line of this cell scores
# the raw model outputs instead.
false_positive_rate, true_positive_rate, thresholds = roc_curve(actual, predicted)
auc_val = auc(false_positive_rate, true_positive_rate)
roc_auc_val = roc_auc_score(actual, predicted)

print('Accuracy', accuracy)
print('Precision', precision)
print('Recall', recall)
print('f-measure', fmeasure)
print('cohen_kappa_score', kappa_val)
print('auc', auc_val)
print('roc_auc', roc_auc_val)

# ROC-AUC computed from the raw model outputs rather than hard labels.
print("Average of ROC-AUC score: %.3f" % roc_auc_score(ytest, p))