# Import common libraries

In [None]:
# -*- coding: utf-8 -*-
"""
@author: basharm
"""
import tensorflow as tf
import numpy as np
import pandas as pd
import random as rn
import spacy
import re
import html

# Root directory of the sentiment data files read below.
# It is a machine-specific absolute Windows path; the trailing separator is
# relied on by the plain `BASE+'file'` concatenations in later cells.
BASE = 'C:\\Users\\basharm\\PythonJupyter\\CoVID19CodeGit\\data\\sentiment_data\\'


# Initialise random seeds and the TensorFlow session

In [None]:
# Reproducibility setup: seed every random number generator used by the
# notebook (NumPy, Python's `random`, TensorFlow) and run TensorFlow in a
# single-threaded session that is registered as the Keras backend session.
# The statements below are order-dependent; keep them in this order.
#SEED = 100
SEED = 123

#reference: https://keras.io/getting-started/faq/#how-can-i-obtain-reproducible-results-using-keras-during-development
# The below is necessary in Python 3.2.3 onwards to
# have reproducible behavior for certain hash-based operations.
# See these references for further details:
# https://docs.python.org/3.4/using/cmdline.html#envvar-PYTHONHASHSEED
# https://github.com/keras-team/keras/issues/2280#issuecomment-306959926
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# assigning it here presumably only affects child processes - confirm.

import os
os.environ['PYTHONHASHSEED'] = '0'

# The below is necessary for starting Numpy generated random numbers
# in a well-defined initial state.

np.random.seed(SEED)

# The below is necessary for starting core Python generated random numbers
# in a well-defined state.

rn.seed(SEED)

# Force TensorFlow to use single thread.
# Multiple threads are a potential source of
# non-reproducible results.
# For further details, see: https://stackoverflow.com/questions/42022950/which-seeds-have-to-be-set-where-to-realize-100-reproducibility-of-training-res

session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)

from keras import backend as K

# The below tf.set_random_seed() will make random number generation
# in the TensorFlow backend have a well-defined initial state.
# For further details, see: https://www.tensorflow.org/api_docs/python/tf/set_random_seed
# The default graph is reset first so the seed applies to a fresh graph.
tf.reset_default_graph()
tf.set_random_seed(SEED)

# Create the session on the (fresh) default graph with the single-thread
# configuration and hand it to Keras so all Keras ops run inside it.
sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
K.set_session(sess)

# Rest of code follows ...

# Preprocessing
Preprocessing is done separately in Clean_Sentiment_Tweets_Bashar.ipynb

# Loading Data

In [None]:
# Show the data directory and the two CSV paths read by the cells below,
# one per line.
print(BASE, BASE+'train_pp.csv', BASE+'test_pp.csv', sep='\n')

In [None]:
# Read the training CSV (UTF-8) from the data directory and display five
# random rows as a quick sanity check.
df_train = pd.read_csv(BASE+'train_pp.csv', encoding='utf8')
df_train.sample(5)

In [None]:
# Keep only the label ('target') and tweet text columns; no other column is
# used by the cells below.
df_train = df_train[['target','text']]
df_train.head(5)

In [None]:
# Class distribution of the training labels.
# Label coding: 0 = negative, 2 = neutral, 4 = positive.
df_train['target'].value_counts()

In [None]:
# Read the test CSV (UTF-8) from the data directory and display five random
# rows as a quick sanity check.
df_test = pd.read_csv(BASE+'test_pp.csv', encoding='utf8')
df_test.sample(5)

In [None]:
# Keep only the label ('target') and tweet text columns, mirroring the
# training frame.
df_test = df_test[['target', 'text']]
df_test.head(5)

In [None]:
# Class distribution of the test labels (compare with the training split).
df_test['target'].value_counts()

# Transforming the data into a format suitable for the model

In [None]:
# Plain Python lists of tweet texts; every entry is cast to str first so
# that non-string cells cannot reach the tokenizer.
X_train = df_train['text'].astype(str).tolist()
X_test = df_test['text'].astype(str).tolist()

In [None]:
#X_train

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Vocabulary cap handed to the tokenizer; the same value sizes the embedding
# matrix built later in the notebook.
num_words = 100000

# The tokenizer is fitted on the training texts only.
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(X_train)

# Training texts -> integer sequences, all brought to the length of the
# longest training sequence.
xtrain = tokenizer.texts_to_sequences(X_train)
maxlen = max(map(len, xtrain))
xtrain = pad_sequences(xtrain, maxlen=maxlen)

# Test texts go through the same tokenizer and are forced to the same length.
xtest = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=maxlen)

In [None]:
# Turn the raw training labels into class indices by dividing by 4 and
# truncating to int (0 -> 0, 4 -> 1).
# NOTE(review): a neutral label of 2 would become 0.5 and truncate to 0,
# i.e. be merged into the negative class - confirm the data holds only 0/4.
ytrain = (np.array(df_train['target'])/4).astype(int)
ytrain

In [None]:
# Turn the raw test labels into class indices the same way as the training
# labels (divide by 4, truncate to int).
# NOTE(review): a neutral label of 2 would truncate to 0 and be evaluated as
# negative - confirm the test split holds only 0/4.
ytest = (np.array(df_test['target'])/4).astype(int)
ytest

# Loading word embedding and mapping data to that word embedding

In [None]:
from gensim.models import KeyedVectors

# Load the 200-dimensional tweet word vectors.
# NOTE(review): the path ends in `.txt`; if the file is in word2vec text
# format, `KeyedVectors.load_word2vec_format` may be the intended loader -
# confirm. The `.wv.vocab` access below also presumes an older gensim API.
model_ug_cbow = KeyedVectors.load(BASE+'RandomTweet_200d_mincount_100\\vectors.txt')
print('Loaded en')


# word -> vector lookup table covering the whole embedding vocabulary
embeddings_index = {w: model_ug_cbow.wv[w] for w in model_ug_cbow.wv.vocab.keys()}

# Row i holds the pretrained vector of the tokenizer word with index i.
# Rows for words without a pretrained vector stay all-zero, and indices at
# or beyond the vocabulary cap are skipped.
embedding_matrix = np.zeros((num_words, 200))
for word, i in tokenizer.word_index.items():
    if i < num_words:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

### Preparing the class labels in a CNN-compatible (one-hot) format

In [None]:
# Number of distinct raw label values in the training data; used as the
# width of the one-hot labels and of the model's output layer.
num_classes = len(df_train['target'].unique())
num_classes

In [None]:
# Count the training examples per class index.
# NOTE(review): the counter has exactly two slots, so this assumes ytrain
# only contains the indices 0 and 1.
x = [0, 0]
for val in ytrain:
    x[val]+=1
x

In [None]:
# One-hot encode the class indices: row k of the identity matrix is the
# one-hot vector for class k, so indexing it with ytrain yields one row per
# training example.
ytrain_mc = np.eye(num_classes, dtype=int)[ytrain]
ytrain_mc[400:405]

In [None]:
# Sanity check: recover the class index from each one-hot row and count per
# class; the result should match the counts computed from ytrain.
# NOTE(review): as with the earlier counter, two slots assume two classes.
x = [0, 0]
for val in ytrain_mc:
    x[np.argmax(val)]+=1
x

# Creating the CNN model and training it for 10 epochs

In [None]:
from keras.layers import Dense, Dropout
from keras.layers.embeddings import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.layers import Input, concatenate, Activation
from keras.models import Model

def create_cnn_model():
    """Build and compile the multi-branch 1-D CNN tweet classifier.

    The network embeds the padded token ids with the pretrained embedding
    matrix (kept trainable), applies three parallel convolution branches
    with kernel sizes 3, 4 and 5, max-pools each branch over time,
    concatenates the results and classifies them through a dense layer.

    Reads the notebook-level names ``maxlen``, ``num_words``,
    ``embedding_matrix`` and ``num_classes``.

    Returns:
        The compiled Keras ``Model``.
    """
    tweet_input = Input(shape=(maxlen,), dtype='int32')

    embedded = Embedding(num_words, 200, weights=[embedding_matrix], input_length=maxlen, trainable=True)(tweet_input)
    embedded = Dropout(0.5)(embedded)

    # One entry per convolution branch: (filters, kernel_size, dropout rate).
    # Layers are created branch by branch, in this order.
    branch_specs = ((128, 3, 0.5), (256, 4, 0.2), (512, 5, 0.2))
    pooled_branches = []
    for n_filters, width, drop_rate in branch_specs:
        branch = Conv1D(filters=n_filters, kernel_size=width, padding='valid', activation='relu', strides=1)(embedded)
        branch = GlobalMaxPooling1D()(branch)
        branch = Dropout(drop_rate)(branch)
        pooled_branches.append(branch)

    merged = concatenate(pooled_branches, axis=1)

    hidden = Dense(256, activation='relu')(merged)
    hidden = Dropout(0.5)(hidden)

    # NOTE(review): a sigmoid output with binary cross-entropy treats every
    # class column independently; for one-hot, single-label targets softmax
    # with categorical cross-entropy may be what is intended - confirm.
    scores = Dense(num_classes)(hidden)
    output = Activation('sigmoid')(scores)

    model = Model(inputs=[tweet_input], outputs=[output])
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Build the model and train it on the padded training sequences against the
# one-hot labels for 10 epochs with a batch size of 32.
cnn_model = create_cnn_model()
cnn_model.fit(xtrain, ytrain_mc, epochs=10, batch_size=32, verbose=2)

## store the trained model
1. store tokenizer
2. store model architecture
3. store model weights
4. store maxlen

### 1. Store Tokenizer

In [None]:
# Output directory for the stored model artefacts (tokenizer, architecture,
# weights, maxlen). Machine-specific absolute Windows path; the trailing
# separator is relied on by the `STORE_PATH+...` concatenations below.
STORE_PATH = 'C:\\Users\\basharm\\PythonJupyter\\CoVID19CodeGit\\StoredModels\\CNN\\'

In [None]:
# Pickle the fitted tokenizer so inference code can reuse the exact
# word -> index mapping the model was trained with.
import pickle
with open(STORE_PATH+'tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

### 2. Store Model Architecture

In [None]:
# Serialize the model architecture (not the weights) to JSON and write it
# to model.json in the store directory.
model_json = cnn_model.to_json()
with open(STORE_PATH+"model.json", "w") as json_file:
    json_file.write(model_json)

### 3. Store Model Weights

In [None]:
# Serialize the trained weights to HDF5 (model.h5) in the store directory;
# together with model.json this is enough to rebuild the trained model.
cnn_model.save_weights(STORE_PATH+"model.h5")
print("Saved model to disk")

### 4. Store maxlen

In [None]:
# Persist the padding length so new text can be padded to the length the
# model was trained with.
# NOTE(review): np.save presumably appends `.npy`, giving `maxlen.npy` - confirm.
np.save(STORE_PATH+'maxlen', maxlen)

# Predict on Test data

In [None]:
# Number of test tweets.
len(X_test)

In [None]:
# Peek at the first five test tweets.
X_test[:5]

In [None]:
# Run the trained model on the padded test sequences; `p` holds one row of
# per-class output scores per test tweet. Show the first ten rows.
p = cnn_model.predict(xtest,verbose=0)
p[:10]

In [None]:
# Hard class prediction per tweet: the index of the highest score in each
# row of `p`, kept as a Python list.
predicted = list(np.argmax(p, axis=1))
predicted[:25]

In [None]:
# Distinct class indices the model actually predicted (reveals a model that
# collapsed onto a single class).
set(predicted)

# Evaluate

In [None]:
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import roc_curve, auc, roc_auc_score


def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is 0."""
    return numerator / denominator if denominator else 0.0


# Binary (0/1) evaluation of the hard predictions against the test labels.
predicted = np.array(predicted)
actual = ytest

# For 0/1 labels each product is non-zero for exactly one confusion cell,
# e.g. predicted * (actual - 1) != 0  <=>  predicted == 1 and actual == 0.
tp = np.count_nonzero(predicted * actual)
tn = np.count_nonzero((predicted - 1) * (actual - 1))
fp = np.count_nonzero(predicted * (actual - 1))
fn = np.count_nonzero((predicted - 1) * actual)

print('True Positive', tp)
print('True Negative', tn)
print('False Positive', fp)
print('False Negative', fn)

# Ratios are guarded so that a degenerate run (e.g. no positive predictions)
# reports 0.0 instead of failing on a zero denominator.
accuracy = _safe_ratio(tp + tn, tp + fp + fn + tn)
precision = _safe_ratio(tp, tp + fp)
recall = _safe_ratio(tp, tp + fn)
fmeasure = _safe_ratio(2 * precision * recall, precision + recall)

# The result is bound to its own name: assigning it to `cohen_kappa_score`
# would overwrite the imported function and break re-running this cell.
kappa = cohen_kappa_score(predicted, actual)

# NOTE(review): the ROC/AUC values are computed from hard 0/1 predictions,
# not from the scores in `p`, so the curve has a single operating point -
# confirm this is intended.
false_positive_rate, true_positive_rate, thresholds = roc_curve(actual, predicted)
auc_val = auc(false_positive_rate, true_positive_rate)
roc_auc_val = roc_auc_score(actual, predicted)

print('Accuracy', accuracy)
print('Precision', precision)
print('Recall', recall)
print('f-measure', fmeasure)
print('cohen_kappa_score', kappa)
print('auc', auc_val)
print('roc_auc', roc_auc_val)

In [None]:
# Pair every predicted class index with its test tweet text in a two-column
# frame ('label', 'text') and show five random rows.
df_pred = pd.DataFrame(zip(predicted, X_test), columns=['label', 'text'])
df_pred.sample(5)

In [None]:
# Number of test tweets predicted as class 1.
len(df_pred[df_pred['label']==1])

In [None]:
# Model tag embedded in the prediction file name built in the next cell.
model_name = '_CNN_'

In [None]:
# Build the output path of the prediction file.
# NOTE(review): `base`, `lang` and `task` are not defined anywhere in the
# visible notebook (only the upper-case `BASE` is), so this cell would raise
# NameError as it stands; it looks carried over from another notebook -
# confirm the intended values.
fo_name = base+'TestingDataset\\Predicted\\'+lang+model_name+task
fo_name

In [None]:
# Write the predictions to `fo_name` as UTF-8 CSV without the index column.
df_pred.to_csv(fo_name, encoding='utf8', index=None)

In [None]:
# First rows of the prediction frame.
df_pred.head()

In [None]:
# Sum of the predicted labels; with 0/1 labels this equals the number of
# tweets predicted as class 1.
sum(df_pred['label'])

In [None]:
# Performance test: per-class precision, recall and F1 of the hard
# predictions against the test labels, as reported by scikit-learn.
from sklearn.metrics import classification_report
actual = ytest
print(classification_report(actual, predicted))

# Load Unprocessed Test Data

In [None]:
# Read the original (unprocessed) tab-separated test file and show its
# first rows.
# NOTE(review): `base` and `lang` are not defined anywhere in the visible
# notebook (only the upper-case `BASE` is), and the `hasoc2019_` file name
# suggests this cell was carried over from another notebook - confirm.
df_test_ori = pd.read_csv(base+'TestingDataset\\hasoc2019_'+lang+'_test.tsv', encoding='utf8', sep='\t')
df_test_ori.head()