# Initialise Random variables and Tensor Board

In [1]:
# -*- coding: utf-8 -*-
"""
Created on Tue May  1 15:12:54 2018

@author: basharm
"""

import numpy as np
import tensorflow as tf
import random as rn

#SEED = 100
SEED = 123

#reference: https://keras.io/getting-started/faq/#how-can-i-obtain-reproducible-results-using-keras-during-development
# The below is necessary in Python 3.2.3 onwards to
# have reproducible behavior for certain hash-based operations.
# See these references for further details:
# https://docs.python.org/3.4/using/cmdline.html#envvar-PYTHONHASHSEED
# https://github.com/keras-team/keras/issues/2280#issuecomment-306959926

import os
os.environ['PYTHONHASHSEED'] = '0'

# The below is necessary for starting Numpy generated random numbers
# in a well-defined initial state.

np.random.seed(SEED)

# The below is necessary for starting core Python generated random numbers
# in a well-defined state.

rn.seed(SEED)

# Force TensorFlow to use single thread.
# Multiple threads are a potential source of
# non-reproducible results.
# For further details, see: https://stackoverflow.com/questions/42022950/which-seeds-have-to-be-set-where-to-realize-100-reproducibility-of-training-res

session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)

from keras import backend as K

# The below tf.set_random_seed() will make random number generation
# in the TensorFlow backend have a well-defined initial state.
# For further details, see: https://www.tensorflow.org/api_docs/python/tf/set_random_seed
tf.reset_default_graph()
tf.set_random_seed(SEED)

sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
K.set_session(sess)

# Rest of code follows ...

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


# Preprocessing

In [2]:
import re
import html
re1 = re.compile(r' +')

def textFixup(aText):
    aText = aText.replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace(
        'nbsp;', ' ').replace('#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace(
        '<br />', "\n").replace('\\"', '"').replace('<unk>','u_n').replace(' @.@ ','.').replace(
        ' @-@ ', '-').replace('\\', ' \\ ').replace('â€™', "'")
    return re1.sub(' ', html.unescape(aText))

In [3]:
#from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer

#r_tokenizer = RegexpTokenizer(r'\w+')
p_stemmer = PorterStemmer()

def preprocess_aTweet(tweet):
    tweet = tweet.lower()
    tweet = textFixup(tweet)
    #tokens = r_tokenizer.tokenize(tweet)
    tokens = word_tokenize(tweet)
    tokens = [p_stemmer.stem(t) for t in tokens]
    return ' '.join(tokens)

# Loading Data

In [4]:
import pandas as pd
def load_data_and_labels_csv(fileLoc):
    examples = []
    labels = []
    df = pd.read_csv(fileLoc)
    for i in df.index:
        examples.append(preprocess_aTweet(df['text'][i]))
        if df['label'][i] == 0:
            labels.append(0)
        else:
            labels.append(1)
    return examples, labels
    
X_train, y_train = load_data_and_labels_csv('U:\\Research\\Projects\\sef\\datamining\\mlonlineabuse\\WorkingFolder\\Train\\train_final.csv')

X_test, y_test = load_data_and_labels_csv('U:\\Research\\Projects\\sef\\datamining\\mlonlineabuse\\WorkingFolder\\Test\\test_final.csv')

ytrain = np.array(y_train)
ytest = np.array(y_test)

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
count_vectorizer = CountVectorizer(binary=True)
count_vectorizer.fit(X_train)
#count_vectorizer.vocabulary_

CountVectorizer(analyzer='word', binary=True, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [6]:
train_vectors = count_vectorizer.transform(X_train)
X_train_vectors = train_vectors.toarray()
test_vectors = count_vectorizer.transform(X_test)
X_test_vectors = test_vectors.toarray()

In [7]:
categories = list([0,1])
y_train_vectors = []
for e in ytrain:
    output_empty = [0] * len(categories)
    output_row = list(output_empty)
    output_row[categories.index(e)] = 1
    y_train_vectors.append(output_row)

In [8]:
y_test_vectors = []
for e in ytest:
    output_empty = [0] * len(categories)
    output_row = list(output_empty)
    output_row[categories.index(e)] = 1
    y_test_vectors.append(output_row)

# Creating DNN model and training it for 10 epoc

In [9]:
import numpy as np
import tflearn
import tensorflow as tf
import random
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import roc_curve, auc, roc_auc_score
from keras.layers import Dense, Dropout

def create_dnn_model():
    
    inputSize = len(X_train_vectors[0])
    outputSize = len(y_train_vectors[0])
    
    # reset underlying graph data
    tf.reset_default_graph()
    # Build neural network
    net = tflearn.input_data(shape=[None, inputSize])
    net = Dropout(0.5)(net)
    net = tflearn.fully_connected(net, 8)
    net = Dropout(0.5)(net)
    net = tflearn.fully_connected(net, 8)
    net = Dropout(0.5)(net)
    net = tflearn.fully_connected(net, 8)
    net = Dropout(0.5)(net)
    net = tflearn.fully_connected(net, 8)
    net = Dropout(0.5)(net)
    net = tflearn.fully_connected(net, 8)
    net = tflearn.fully_connected(net, outputSize, activation='softmax')
    net = tflearn.regression(net,learning_rate=0.04)

    # Define model and setup tensorboard
    model = tflearn.DNN(net, tensorboard_dir='tflearn_logs')
    
    return model

dnn_model = create_dnn_model()
dnn_model.fit(X_train_vectors, y_train_vectors, n_epoch=35, batch_size=32, show_metric=True)


Training Step: 4374  | total loss: [1m[32m0.05876[0m[0m | time: 0.412s
| Adam | epoch: 035 | loss: 0.05876 - acc: 0.9773 -- iter: 3968/3988
Training Step: 4375  | total loss: [1m[32m0.05380[0m[0m | time: 0.415s
| Adam | epoch: 035 | loss: 0.05380 - acc: 0.9796 -- iter: 3988/3988
--


# Evaluating the model with test dataset

In [10]:
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import roc_curve, auc, roc_auc_score

actual = np.argmax(y_test_vectors, axis=1)
#print(actual)
predictions = dnn_model.predict(X_test_vectors)
predicted = np.argmax(predictions, axis=1)
predicted = np.array(predicted)

tp = np.count_nonzero(predicted * actual)
tn = np.count_nonzero((predicted - 1) * (actual - 1))
fp = np.count_nonzero(predicted * (actual - 1))
fn = np.count_nonzero((predicted - 1) * actual)

print('True Positive\t' + str(tp))
print('True Negative\t' + str(tn))
print('False Positive\t' + str(fp))
print('False Negative\t' + str(fn))

accuracy = (tp + tn) / (tp + fp + fn + tn)
precision = tp / (tp + fp)
recall = tp / (tp + fn)
fmeasure = (2 * precision * recall) / (precision + recall)
cohen_kappa_score = cohen_kappa_score(predicted, actual)
false_positive_rate, true_positive_rate, thresholds = roc_curve(actual, predicted)
auc_val = auc(false_positive_rate, true_positive_rate)
roc_auc_val = roc_auc_score(actual, predicted)

print('Accuracy\t' + str(accuracy))
print('Precision\t' + str(precision))
print('Recall\t' + str(recall))
print('f-measure\t' + str(fmeasure))
print('cohen_kappa_score\t' + str(cohen_kappa_score))
print('auc\t' + str(auc_val))
print('roc_auc\t' + str(roc_auc_val))

#print("Average of ROC-AUC score: %.3f" % roc_auc_score(ytest, predictions))

True Positive	146
True Negative	664
False Positive	97
False Negative	100
Accuracy	0.8043694141012909
Precision	0.6008230452674898
Recall	0.5934959349593496
f-measure	0.5971370143149285
cohen_kappa_score	0.46796310748872927
auc	0.7330160358108181
roc_auc	0.7330160358108181
