# Import common libraries

In [1]:
import tensorflow as tf
import numpy as np
import pandas as pd
import random as rn
from model_persistance import ModelPersistance
from evaluate_classification import EvaluateBinaryClassification

C:\Users\User\anaconda3\lib\site-packages\numpy\.libs\libopenblas.NOIJJG62EMASZI6NYURL6JBKM4EVBGM7.gfortran-win_amd64.dll
C:\Users\User\anaconda3\lib\site-packages\numpy\.libs\libopenblas.PYQHXLVVQ7VESDPUVUADXEVJOBGHJPAY.gfortran-win_amd64.dll
  stacklevel=1)


# Initialise random seeds

In [2]:
# Seed every random number generator this notebook has imported so that a
# Restart & Run All starts from the same state each time.
SEED = 123
rn.seed(SEED)             # Python's built-in `random` module (imported as `rn`)
np.random.seed(SEED)      # NumPy global RNG
tf.random.set_seed(SEED)  # TensorFlow global seed

# Loading Data

In [3]:
# Folder containing the dataset CSV files (absolute, machine-specific Windows path).
BASE = 'D:\\ResearchDataGtx1060\\SentimentData\\Hate\\'
# Candidate training / test CSV file names; `track` is the index used to pick
# one entry from each list when the files are read.
fins_train = ['random_hate_train.csv']
fins_test = ['eastasian_hate_test.csv']
track = 0

In [4]:
# We apply only this preprocessing because our data is already preprocessed
def cleanNonAscii(text):
    '''
    Drop every character outside the 7-bit ASCII range from a string.
    Arguments:
        text: str, the text to filter
    returns:
        str, `text` without the characters whose code point is 128 or above
    '''
    # Keep a character only when its code point fits in 7 bits (0..127).
    ascii_only = filter(lambda ch: ord(ch) <= 127, text)
    return ''.join(ascii_only)

In [5]:
# Read the training CSV selected by `track` and preview its first rows.
df_train = pd.read_csv(BASE+fins_train[track])
#df.columns=['label', 'text']
df_train.head()

Unnamed: 0,label,text
0,1,<user> if you are one of the <number> mil <has...
1,0,best <hashtag> law of attraction </hashtag> <h...
2,1,<hashtag> michelle obama </hashtag> is the mos...
3,0,smiling because life is good rite now ! <repea...
4,0,ã ¢ â  â ¤ ã ¯ â ¸ â  ã ¢ â  â ¤ ã ¯ â ¸ â ...


In [6]:
# Count the rows per label value to see how balanced the training classes are.
df_train.groupby('label').count()

Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
0,2242
1,2242


In [7]:
# Total number of training rows.
len(df_train)

4484

In [8]:
# Strip non-ASCII characters from every training text (overwrites the 'text' column).
df_train['text'] = df_train['text'].apply(cleanNonAscii)

In [9]:
# Pull the texts and labels out of the DataFrame as NumPy arrays.
X_train, y_train = df_train['text'].values, df_train['label'].values

In [10]:
# Read the test CSV selected by `track` and preview its first rows.
df_test = pd.read_csv(BASE+fins_test[track])
#df_test.columns = ['label', 'text']
df_test.head()

Unnamed: 0,label,text
0,1,<user> <user> the chinese are probably sprayin...
1,0,rt <user> : unpatriotic losers are tweeting ou...
2,1,<user> thus <hashtag> 2019 n co v </hashtag> i...
3,0,north korea closes borders to avoid coronaviru...
4,1,<user> this is a declaration of war . it prove...


In [11]:
# Apply the same non-ASCII stripping to the test texts (overwrites the 'text' column).
df_test['text'] = df_test['text'].apply(cleanNonAscii)

In [12]:
# Preview the test rows again after the non-ASCII stripping.
df_test.head()

Unnamed: 0,label,text
0,1,<user> <user> the chinese are probably sprayin...
1,0,rt <user> : unpatriotic losers are tweeting ou...
2,1,<user> thus <hashtag> 2019 n co v </hashtag> i...
3,0,north korea closes borders to avoid coronaviru...
4,1,<user> this is a declaration of war . it prove...


In [13]:
# Pull the test texts and labels out of the DataFrame as NumPy arrays.
X_test, y_test = df_test['text'].values, df_test['label'].values

# Transforming data into a format suitable for the model

In [14]:
# X_new = []
# X_new.extend(X_train)
# X_new.extend(X_test)

In [15]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
# Vocabulary cap handed to the tokenizer; the embedding matrix and the
# Embedding layer are sized with the same value.
num_words = 100000

# Build the word index from the training texts only.
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(X_train)

# Encode the training texts as integer sequences, then pad them all to the
# length of the longest training sequence.
xtrain = tokenizer.texts_to_sequences(X_train)
maxlen = max(len(seq) for seq in xtrain)
xtrain = pad_sequences(xtrain, maxlen=maxlen)

# Encode the test texts with the same tokenizer and pad to the same length.
xtest = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=maxlen)

### Loading word embedding and mapping data to that word embedding

In [16]:
from gensim.models import KeyedVectors
# Folder holding the saved word vectors. The original literal contained
# `\\\W`: an escaped backslash followed by the invalid escape sequence `\W`,
# which Python only tolerates with a warning and which left a doubled
# backslash in the path. Every separator is now a properly escaped backslash.
W2V_BASE = 'D:\\ResearchDataGtx1060\\TwitterDataAustralia\\W2V_AusTweets_200d_MinCount100\\'
model_ug_cbow = KeyedVectors.load(W2V_BASE+'vectors.txt')

# Alternative embedding that can be swapped in:
# W2V_BASE = 'D:\\ResearchDataGtx1060\\HASOC2020Datasets\\eng\\w2v_sentiTweets_200d_minCount10\\'
# model_ug_cbow = KeyedVectors.load(W2V_BASE+'vectors.txt')

# Map every word known to the loaded vectors to its embedding vector.
embeddings_index = {}
for w in model_ug_cbow.wv.vocab.keys():
    embeddings_index[w] = model_ug_cbow.wv[w]

# Row i of the matrix holds the vector of the word with tokenizer index i;
# words without a loaded vector keep an all-zero row. The width 200 is
# hard-coded and must match the dimensionality of the loaded vectors.
# NOTE(review): the model-building cell currently does not pass this matrix
# to its Embedding layer, so it only takes effect if that is re-enabled.
embedding_matrix = np.zeros((num_words, 200))
for word, i in tokenizer.word_index.items():
    if i >= num_words:
        # Indices at or beyond num_words have no row in the matrix.
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [30]:
# Inspect the padded training sequences.
xtrain

array([[   0,    0,    0, ..., 1254,   21, 1255],
       [   0,    0,    0, ...,    7, 2860,    1],
       [   0,    0,    0, ...,    1, 4420,    1],
       ...,
       [   0,    0,    0, ...,  214,  207,    1],
       [   0,    0,    0, ..., 3822,  259, 2035],
       [   0,    0,    0, ...,   42,  465,    1]])

### Creating the CNN model and training it for n epochs

In [17]:
from keras.layers import Dense, Dropout
from keras.layers.embeddings import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.layers import Input, concatenate, Activation
from keras.models import Model

def create_cnn_model():
    '''
    Build and compile a multi-branch 1D-CNN binary text classifier.

    Reads the notebook-level globals `maxlen` (padded sequence length) and
    `num_words` (embedding vocabulary size).
    Arguments:
        none
    returns:
        a compiled Keras Model taking one (maxlen,) int32 input and producing
        one sigmoid output
    '''
    token_ids = Input(shape=(maxlen,), dtype='int32')

    print('loading word vectors')
    # The embedding below is trained from scratch. To start from pretrained
    # vectors instead, add weights=[embedding_matrix] to the Embedding call.
    embedded = Embedding(num_words, 200, input_length=maxlen, trainable=True)(token_ids)
    embedded = Dropout(0.5)(embedded)

    # One (filters, kernel_size, dropout_rate) entry per parallel convolution
    # branch; every branch reads the same embedded sequence.
    branch_specs = [
        (128, 3, 0.5),
        (256, 4, 0.2),
        (512, 5, 0.2),
    ]
    pooled_branches = []
    for n_filters, width, drop_rate in branch_specs:
        features = Conv1D(filters=n_filters, kernel_size=width, padding='valid',
                          activation='relu', strides=1)(embedded)
        features = GlobalMaxPooling1D()(features)
        pooled_branches.append(Dropout(drop_rate)(features))

    # Join the pooled branch outputs and classify through one hidden layer.
    hidden = concatenate(pooled_branches, axis=1)
    hidden = Dense(256, activation='relu')(hidden)
    hidden = Dropout(0.5)(hidden)

    logit = Dense(1)(hidden)
    probability = Activation('sigmoid')(logit)

    classifier = Model(inputs=[token_ids], outputs=[probability])
    classifier.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # classifier.summary()  # uncomment to print the layer table
    return classifier

# Build a fresh model and train it on the padded training sequences
# for 3 epochs with a batch size of 32.
cnn_model = create_cnn_model()
cnn_model.fit(xtrain, y_train, epochs=3, batch_size=32, verbose=1)

loading word vectors
Epoch 1/3
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x24ecfed5388>

### Evaluating the model with test dataset

In [18]:
# Predict one sigmoid score per test sequence and round each to a hard 0/1 label.
p = cnn_model.predict(xtest,verbose=1)
predicted = [int(round(x[0])) for x in p]
actual = y_test

# Compare the predictions with the ground-truth labels using the project-local
# evaluation helper and print its full report.
ebc = EvaluateBinaryClassification(gnd_truths = actual, predictions = predicted)
print(ebc.get_full_report())

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
EvaluateBinaryClassification Object Created

Total Samples	7796
Positive Samples	3898
Negative Samples	3898
True Positive	2612
True Negative	1994
False Positive	1904
False Negative	1286
Accuracy	0.5908158029758851
Precision	0.5783879539415412
Recall	0.6700872242175474
F1 Measure	0.6208699786070834
Cohen Kappa Score	0.1816316059517702
Area Under Curve	0.590815802975885

              precision    recall  f1-score   support

           0       0.61      0.51      0.56      3898
           1       0.58      0.67      0.62      3898

    accuracy                  

In [19]:
# Persist the evaluation report.
# NOTE(review): how `model_name` and `path` are combined into the output
# location is defined in evaluate_classification — confirm before relying on it.
ebc.save_full_report(model_name='CNN_no_w2v', path='domain_adaptation_rerun_randomhate_eastasianhate_')

### Store the trained model

In [20]:
'''
mp = ModelPersistance(store_path = BASE + 'stored_models\\cnn_w2v_mincount10')
mp.store_model(tokenizer=tokenizer, model=cnn_model, max_len=maxlen)
'''

"\nmp = ModelPersistance(store_path = BASE + 'stored_models\\cnn_w2v_mincount10')\nmp.store_model(tokenizer=tokenizer, model=cnn_model, max_len=maxlen)\n"

### Load Stored Model to Predict on Unknown Data

In [21]:
'''
mp = ModelPersistance(store_path = BASE + 'stored_models\\cnn_w2v_mincount10')
tokenizer, cnn_model, maxlen = mp.restore_model()
'''

"\nmp = ModelPersistance(store_path = BASE + 'stored_models\\cnn_w2v_mincount10')\ntokenizer, cnn_model, maxlen = mp.restore_model()\n"

### Load Unknown Data and Predict

In [22]:
'''
UNKNOWN_CSV = BASE+'prepro_hasoc_2020_en_test.csv'
df_unk = pd.read_csv(UNKNOWN_CSV, encoding='utf8')
df_unk.head(5)
'''

"\nUNKNOWN_CSV = BASE+'prepro_hasoc_2020_en_test.csv'\ndf_unk = pd.read_csv(UNKNOWN_CSV, encoding='utf8')\ndf_unk.head(5)\n"

In [23]:
'''
X_unk = list(df_unk['text'].astype(str))
xunk = tokenizer.texts_to_sequences(X_unk)
xunk = pad_sequences(xunk, maxlen=maxlen)
#loaded_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
p_unk = cnn_model.predict(xunk,verbose=0)
p_unk[:10]
'''

"\nX_unk = list(df_unk['text'].astype(str))\nxunk = tokenizer.texts_to_sequences(X_unk)\nxunk = pad_sequences(xunk, maxlen=maxlen)\n#loaded_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])\np_unk = cnn_model.predict(xunk,verbose=0)\np_unk[:10]\n"

In [24]:
'''
pred_unk = [int(round(x[0])) for x in p_unk]
pred_unk = np.array(pred_unk)
sum(pred_unk)
'''

'\npred_unk = [int(round(x[0])) for x in p_unk]\npred_unk = np.array(pred_unk)\nsum(pred_unk)\n'

### Store the prediction

In [25]:
'''
LANGUAGE = 'EN'
SUBTASK_NAME = 'A'
pred_fname = 'submission_{}_{}.csv'.format(LANGUAGE, SUBTASK_NAME)
BASE+'Predictions\\'+pred_fname
'''

"\nLANGUAGE = 'EN'\nSUBTASK_NAME = 'A'\npred_fname = 'submission_{}_{}.csv'.format(LANGUAGE, SUBTASK_NAME)\nBASE+'Predictions\\'+pred_fname\n"

In [26]:
'''
df_unk[['tweet_id', 'task1', 'ID']]
'''

"\ndf_unk[['tweet_id', 'task1', 'ID']]\n"

In [27]:
'''
i2t = ['NOT', 'HOF']
df_unk['task1'] = [i2t[i] for i in pred_unk]
df_unk = df_unk[['tweet_id', 'task1', 'ID']]
df_unk.to_csv(BASE+'Predictions\\'+pred_fname, encoding='utf8', index=None)
'''

"\ni2t = ['NOT', 'HOF']\ndf_unk['task1'] = [i2t[i] for i in pred_unk]\ndf_unk = df_unk[['tweet_id', 'task1', 'ID']]\ndf_unk.to_csv(BASE+'Predictions\\'+pred_fname, encoding='utf8', index=None)\n"