# CNN Explained Step by Step

In this notebook we show, at a very high level, what happens inside a CNN model, one step at a time.

### Import common libraries

In [39]:
# General libraries
import numpy as np
import pandas as pd
import random as rn

# neural network libraries
import tensorflow as tf # tensorflow backend
from keras.layers import Input # for input layer
# Embedding is imported from the public `keras.layers` namespace; the internal
# `keras.layers.embeddings` module path is not available in every Keras release.
from keras.layers import Embedding # for embedding
# Fixed: the original line read `import import Dropout`, which is a SyntaxError
# and prevented this whole cell from running.
from keras.layers import Dropout # for random dropout
from keras.layers import Conv1D, GlobalMaxPooling1D # for convolution layer
from keras.layers import concatenate # for concatenation
from keras.layers import Activation # for activation layer
from keras.layers import Dense # for fully connected layer
from keras.models import Model # Model groups layers into an object with training and inference features.

# libraries for data formatting
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# word embedding loading library
from gensim.models import KeyedVectors

# custom libraries
from model_persistance import ModelPersistance
from evaluate_classification import EvaluateBinaryClassification

# Initialise Random variables

In [24]:
# Seed every random number generator this notebook has in scope so that
# repeated runs start from the same state.
SEED = 123
rn.seed(SEED)             # Python's built-in `random` module (imported as `rn`); it was imported but never seeded
np.random.seed(SEED)      # NumPy global generator
tf.random.set_seed(SEED)  # TensorFlow global seed

# Loading Data

In [3]:
# Where the data lives and which files to read.
# BASE is a directory prefix that already ends with a path separator, because
# file names are appended to it with plain string concatenation.
BASE = 'D:\\ResearchDataGtx1060\\SentimentData\\Racism\\'
# Parallel lists of train / test file names; `track` picks the pair to use.
fins_train, fins_test = ['train.csv'], ['test.csv']
track = 0

In [4]:
# Read the training CSV selected by `track` into a dataframe
# (BASE already ends with a path separator, so plain concatenation yields the full path)
df_train = pd.read_csv(BASE+fins_train[track])
# Show the first rows as a quick sanity check of the columns
df_train.head()

Unnamed: 0,text,label
0,rt <user> : deconstructed lemon tart . basical...,0
1,argh <elongated> i want to kick in the televis...,1
2,<hashtag> mkr </hashtag>,0
3,". <user> no , no . <repeated> this is my view ...",1
4,i am just so embarrassed for her . <hashtag> m...,0


In [5]:
# check how many examples there are for each label value (class balance)
df_train.groupby('label').count()

Unnamed: 0_level_0,text
label,Unnamed: 1_level_1
0,2763
1,857


In [6]:
# check the total number of training examples (rows in df_train)
len(df_train)

3620

In [7]:
# separate the independent variable (text) and the dependent variable (label)
# `.values` turns each dataframe column into a NumPy array
X_train, y_train = df_train['text'].values, df_train['label'].values

In [8]:
# check what we got in the independent variable (text): first five tweets
X_train[:5]

array(['rt <user> : deconstructed lemon tart . basically a pile of crap on a plate <hashtag> mkr </hashtag> <url>',
       'argh <elongated> i want to kick in the television set right now , kat you despicable rat <hashtag> mkr </hashtag>',
       '<hashtag> mkr </hashtag>',
       '. <user> no , no . <repeated> this is my view on how to move equality forward . must root out those who claim equality while working against it .',
       'i am just so embarrassed for her . <hashtag> mkr </hashtag>'],
      dtype=object)

In [9]:
# check what we got in the dependent variable (label): first five labels
y_train[:5]

array([0, 1, 0, 1, 0], dtype=int64)

In [10]:
# read the test CSV selected by `track` into a dataframe
df_test = pd.read_csv(BASE+fins_test[track])
# Show the first rows as a quick sanity check of the columns
df_test.head()

Unnamed: 0,text,label
0,<hashtag> mkr </hashtag> yum there cooking up ...,0
1,<url> / / <user>,0
2,<user> why is kat being so nasty ? just showin...,1
3,<user> if katie and nikki scored a point for e...,0
4,<user> i ' d shopped off the website before <h...,0


In [11]:
# separate the variables of the test data: text (input) and label (target), as NumPy arrays
X_test, y_test = df_test['text'].values, df_test['label'].values

# Transforming data into a format suitable for the model

In [20]:
# Upper bound on the vocabulary the tokenizer will use.
# The same value is reused later for the number of rows of the embedding matrix.
num_words = 100000
tokenizer = Tokenizer(num_words=num_words)
# Build the word -> integer index from the training text only
tokenizer.fit_on_texts(X_train)
xtrain = tokenizer.texts_to_sequences(X_train) # converts each text to a list of integer word ids
# Show the first two encoded tweets
print(xtrain[:2])

[[14, 2, 160, 148, 176, 705, 6, 1041, 15, 417, 24, 6, 260, 1, 3, 1, 23], [1501, 89, 4, 97, 8, 896, 21, 5, 1042, 442, 123, 90, 34, 12, 3014, 3015, 1, 3, 1]]


In [21]:
# The longest encoded training tweet fixes the common sequence length
maxlen = max(len(seq) for seq in xtrain)
# Pad every row to `maxlen`; as the printed rows show, the zeros go in front of the word ids
xtrain = pad_sequences(xtrain, maxlen=maxlen)
print(xtrain[:2])

[[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0   14    2  160  148
   176  705    6 1041   15  417   24    6  260    1    3    1   23]
 [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0 1501   89    4   97    8  896
    21    5 1042  442  123   90   34   12 3014 3015    1    3    1]]


In [22]:
# convert the text of the test data with the tokenizer fitted on the training data
xtest = tokenizer.texts_to_sequences(X_test) # converts text to numbers
# `maxlen` comes from the training data, so train and test rows have the same width
xtest = pad_sequences(xtest, maxlen=maxlen) # padding each row to make them same length

### Loading word embedding and mapping data to that word embedding

In [25]:
# Folder that holds the pre-trained word embedding model.
# Fixed: the original literal contained `\\\W`, i.e. an escaped backslash followed by the
# invalid escape sequence `\W`. Python warns about invalid escapes, and the resulting
# path had a doubled separator. Every separator is now written as `\\`.
W2V_BASE = 'D:\\ResearchDataGtx1060\\TwitterDataAustralia\\W2V_AusTweets_200d_MinCount100\\'
# NOTE(review): `load` reads gensim's own save format, and later cells access `.wv` on the
# result, so the stored object is presumably a full Word2Vec model. If 'vectors.txt' is a
# plain word2vec text file instead, `KeyedVectors.load_word2vec_format` would be needed — confirm.
model_ug_cbow = KeyedVectors.load(W2V_BASE+'vectors.txt')

In [30]:
# check (up to) the first 20 words in the word embedding vocabulary
list(model_ug_cbow.wv.vocab.keys())[:20]

['bad',
 'australia',
 'anti',
 'science',
 'experts',
 'government',
 'america',
 'corona',
 'virus',
 'breaking']

In [31]:
# now we need an embedding matrix to use in our cnn.
# first, build a plain dict that maps every word of the pre-trained vocabulary
# to its embedding vector
embeddings_index = {token: model_ug_cbow.wv[token] for token in model_ug_cbow.wv.vocab.keys()}

In [35]:
# check the embedding vector stored for the word 'bad'
embeddings_index['bad']

array([-1.0316982e+00,  2.1833405e-01, -5.6165594e-01,  1.9352080e-01,
       -9.3115920e-01, -5.2442926e-01, -4.1290745e-01, -1.1854973e-01,
        8.6853719e-01, -8.1930059e-01, -6.0529087e-02,  5.9996063e-01,
        3.3220369e-02, -3.7161040e-01,  8.7345284e-01,  6.0169494e-01,
       -1.9462686e+00,  5.9856260e-01,  6.3711649e-01,  1.7781970e-01,
       -3.1967349e-03, -3.7669054e-01, -6.0284066e-01,  1.4469020e-01,
       -4.8053089e-01, -1.7896198e+00, -9.6595472e-01,  9.1273896e-02,
       -8.8656074e-01, -5.3717041e-01,  4.8754922e-01,  8.8729465e-01,
        8.0223280e-01, -2.0967886e+00,  7.5208932e-01,  3.0945593e-01,
       -1.5034870e+00,  3.0100110e-01, -1.5653280e+00, -6.6279548e-01,
        8.9251941e-01,  1.3549447e-01, -3.9278197e-01,  1.3906772e-03,
        5.0559813e-01,  8.8537502e-01,  8.4796727e-01, -1.2684748e+00,
        9.9836671e-01,  1.4374197e+00,  1.2900113e+00,  1.2452897e+00,
       -7.0647120e-01,  2.0335373e-01,  1.1188688e+00,  2.6864177e-01,
      

In [37]:
# second, create an empty embedding matrix filled with zeros:
# one row per possible word id (num_words rows), 200 columns per row
# (200 has to match the length of the pre-trained vectors — presumably 200-d, see the model folder name)
embedding_matrix = np.zeros((num_words, 200))
# Show the first two rows
embedding_matrix[:2]

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0.,

In [38]:
# third, walk over every (word, id) pair known to the tokenizer and copy the
# pre-trained vector of that word into row `id` of the matrix.
# Ids outside the matrix (>= num_words) are skipped, and words without a
# pre-trained vector keep their all-zero row.
for word, i in tokenizer.word_index.items():
    if i < num_words:
        vec = embeddings_index.get(word)
        if vec is not None:
            embedding_matrix[i] = vec
embedding_matrix[:2]

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0. 

### Creating CNN model and training it for n epochs

##### add input layer

In [43]:
# Symbolic input of the network: each example is a row of `maxlen` int32 word ids
# (the padded sequences produced above)
tweet_input = Input(shape=(maxlen,), dtype='int32')
# Display the tensor to see its shape
tweet_input

<tf.Tensor 'input_2:0' shape=(None, 69) dtype=int32>

In [None]:
'''
Someone coming from a C/C++/Java programming background may find Python's inner (nested) functions a bit confusing.
Therefore, we need to explain them before going to the next line of code.

f(a, b) and f(a)(b) are not the same thing in Python:
f(a, b) is a simple call that passes two parameters, i.e. f(a, b) calls f with two parameters a and b.
f(a)(b) involves a nested function: one parameter goes to the outer function and one parameter goes to the inner function.
That is, f(a)(b) calls f with one parameter a, which then returns another function,
which is then called with one parameter b. Consider the following nested function for example:

def func(a):
    def func2(b):
        return a + b
    return func2
    
When you call 'func()' it returns the inner function 'func2'. Then you call that inner function.
>>func2 = func(1)
>>func2(2)
>>3

If you don't need the inner function later on, then there's no need to save it into a variable.
You can just call them one after the other.
>>func(1)(2)
>>3

The Keras layers used below follow the same calling pattern: Embedding(...)(tweet_input) first
creates a layer object and then immediately calls that object on a tensor.
'''

##### add embedding layer

In [44]:
# Embedding layer: turns every word id into a 200-dimensional vector.
# The lookup table has num_words rows, starts from the pre-trained `embedding_matrix`,
# and is left trainable so the vectors keep being adjusted during training.
tweet_encoder = Embedding(num_words, 
                          200, 
                          weights=[embedding_matrix], 
                          input_length=maxlen, 
                          trainable=True)(tweet_input)
# Display the tensor to see its shape
tweet_encoder

<tf.Tensor 'embedding_1/embedding_lookup/Identity_1:0' shape=(None, 69, 200) dtype=float32>

In [45]:
# add a random dropout layer (rate 0.5) on top of the embedded sequence
tweet_encoder = Dropout(0.5)(tweet_encoder)
# Display the tensor; dropout leaves the shape unchanged
tweet_encoder

<tf.Tensor 'dropout_1/cond/Identity:0' shape=(None, 69, 200) dtype=float32>

##### add convolutional layer

In [46]:
# add convolution: 128 filters, each looking at a window of 3 consecutive positions.
# NOTE: despite the variable name `bigram_branch`, kernel_size=3 means 3-token windows.
# With padding='valid' and strides=1 the sequence length shrinks from 69 to 69-3+1 = 67 (see output).
bigram_branch = Conv1D(filters=128, kernel_size=3, padding='valid', activation='relu', strides=1)(tweet_encoder)
bigram_branch

<tf.Tensor 'conv1d/Relu:0' shape=(None, 67, 128) dtype=float32>

In [47]:
# add global max pooling: collapses the sequence axis, leaving one value per filter
# (shape goes from (None, 67, 128) to (None, 128), see output)
bigram_branch = GlobalMaxPooling1D()(bigram_branch)
bigram_branch

<tf.Tensor 'global_max_pooling1d/Max:0' shape=(None, 128) dtype=float32>

In [48]:
# add random dropout (rate 0.5) to this branch
bigram_branch = Dropout(0.5)(bigram_branch)
bigram_branch

<tf.Tensor 'dropout_2/cond/Identity:0' shape=(None, 128) dtype=float32>

In [49]:
# repeat these steps to add another branch of convolution on the same embedded input:
# 256 filters over windows of 4 positions (the name says "trigram", but kernel_size is 4)
trigram_branch = Conv1D(filters=256, kernel_size=4, padding='valid', activation='relu', strides=1)(tweet_encoder)
trigram_branch = GlobalMaxPooling1D()(trigram_branch)
# NOTE(review): dropout rate here is 0.2 while the first branch uses 0.5 — confirm this is intended
trigram_branch = Dropout(0.2)(trigram_branch)
trigram_branch

<tf.Tensor 'dropout_3/cond/Identity:0' shape=(None, 256) dtype=float32>

In [50]:
# add another branch of convolution on the same embedded input:
# 512 filters over windows of 5 positions (the name says "fourgram", but kernel_size is 5)
fourgram_branch = Conv1D(filters=512, kernel_size=5, padding='valid', activation='relu', strides=1)(tweet_encoder)
fourgram_branch = GlobalMaxPooling1D()(fourgram_branch)
fourgram_branch = Dropout(0.2)(fourgram_branch)
fourgram_branch

<tf.Tensor 'dropout_4/cond/Identity:0' shape=(None, 512) dtype=float32>

In [51]:
# now concatenate the outputs of the three convolution branches
# along the feature axis (128+256+512=896 features per example)
merged = concatenate([bigram_branch, trigram_branch, fourgram_branch], axis=1)
merged

<tf.Tensor 'concatenate/concat:0' shape=(None, 896) dtype=float32>

##### add hidden dense layer

In [52]:
# Fully connected hidden layer: 896 concatenated features -> 256 units with ReLU
merged = Dense(256, activation='relu')(merged)
# Dropout (rate 0.5) on the hidden layer
merged = Dropout(0.5)(merged)
merged

<tf.Tensor 'dropout_5/cond/Identity:0' shape=(None, 256) dtype=float32>

##### add output dense layer

In [53]:
# Single output unit without activation ...
merged = Dense(1)(merged)
# ... followed by a sigmoid, so the network emits one value between 0 and 1 per example
output = Activation('sigmoid')(merged)
output

<tf.Tensor 'activation/Sigmoid:0' shape=(None, 1) dtype=float32>

##### Now use 'Model' to group layers into an object with training and inference features.

In [54]:
# Tie the input tensor and the output tensor together into a trainable model,
# then configure the loss, optimizer and reported metric.
model = Model(inputs=[tweet_input], outputs=[output]) # create model object
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) # compile model object
model

<tensorflow.python.keras.engine.functional.Functional at 0x286e42d3fc8>

In [55]:
# let us see the model summary: layers, output shapes, parameter counts and connections
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 69)]         0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 69, 200)      20000000    input_2[0][0]                    
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, 69, 200)      0           embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 67, 128)      76928       dropout_1[0][0]                  
_______________________________________________________________________________________

##### fit the model

In [56]:
# now fit the model to the training dataset: 10 passes over the data in batches of 32.
# No validation data is passed here, so only training metrics are reported per epoch.
model.fit(xtrain, y_train, epochs=10, batch_size=32, verbose=1)

Epoch 1/10
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x286e41e21c8>

### Evaluating the model with test dataset

In [58]:
# Scores predicted for the test tweets (one sigmoid output per row)
p = model.predict(xtest,verbose=1)
# Turn every score into a hard 0/1 label by rounding
predicted = [int(round(x[0])) for x in p]
actual = y_test

# Compare the hard labels with the ground truth using the project's evaluation helper.
# NOTE(review): the 'Area Under Curve' in the printed report equals the mean of the two
# class recalls, which suggests it is derived from these hard labels rather than from the
# scores in `p` — confirm inside EvaluateBinaryClassification.
ebc = EvaluateBinaryClassification(gnd_truths = actual, predictions = predicted)
print(ebc.get_full_report())

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'arguments' object has no attribute 'posonlyargs'
EvaluateBinaryClassification Object Created

Total Samples	906
Positive Samples	222
Negative Samples	684
True Positive	143
True Negative	634
False Positive	50
False Negative	79
Accuracy	0.8576158940397351
Precision	0.7409326424870466
Recall	0.6441441441441441
F1 Measure	0.689156626506024
Cohen Kappa Score	0.5973999131926504
Area Under Curve	0.7855223644697329

              precision    recall  f1-score   support

           0       0.89      0.93      0.91       684
           1       0.74      0.64      0.69       222

    accuracy                           

In [59]:
# Save the evaluation report under the model name 'CNN_w2v'
# (where it is written is decided inside EvaluateBinaryClassification — not visible here)
ebc.save_full_report(model_name='CNN_w2v')

### Store the trained model

In [None]:
# Disabled cell: the code is wrapped in a string so it does not run. Remove the quotes to
# persist the tokenizer, the trained network and the padding length.
# Fixed: the trained network in this notebook is bound to `model`; `cnn_model` only exists
# after the restore cell further down has run, so the original call would fail with a
# NameError on a fresh kernel.
'''
mp = ModelPersistance(store_path = BASE + 'stored_models\\cnn_w2v_mincount10')
mp.store_model(tokenizer=tokenizer, model=model, max_len=maxlen)
'''

### Load Stored Model to Predict on Unknown Data

In [None]:
# Disabled cell (code kept inside a string so it does not run).
# When enabled it restores the tokenizer, the network (as `cnn_model`) and `maxlen`
# from the store path via the project's ModelPersistance helper.
'''
mp = ModelPersistance(store_path = BASE + 'stored_models\\cnn_w2v_mincount10')
tokenizer, cnn_model, maxlen = mp.restore_model()
'''

### Load Unknown Data and Predict

In [None]:
# Disabled cell (code kept inside a string so it does not run).
# When enabled it reads the unlabelled CSV into `df_unk` and shows its first five rows.
'''
UNKNOWN_CSV = BASE+'prepro_hasoc_2020_en_test.csv'
df_unk = pd.read_csv(UNKNOWN_CSV, encoding='utf8')
df_unk.head(5)
'''

In [None]:
# Disabled cell (code kept inside a string so it does not run).
# When enabled it encodes and pads the unknown texts with the restored tokenizer / maxlen
# and predicts with `cnn_model`, which is defined by the restore cell.
'''
X_unk = list(df_unk['text'].astype(str))
xunk = tokenizer.texts_to_sequences(X_unk)
xunk = pad_sequences(xunk, maxlen=maxlen)
#loaded_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
p_unk = cnn_model.predict(xunk,verbose=0)
p_unk[:10]
'''

In [None]:
# Disabled cell (code kept inside a string so it does not run).
# When enabled it rounds the predicted scores to 0/1 labels and sums them,
# i.e. counts how many rows were predicted as 1.
'''
pred_unk = [int(round(x[0])) for x in p_unk]
pred_unk = np.array(pred_unk)
sum(pred_unk)
'''

### Store the prediction

In [None]:
# Disabled cell (code kept inside a string so it does not run).
# When enabled it builds the submission file name from LANGUAGE and SUBTASK_NAME
# and displays the full output path.
'''
LANGUAGE = 'EN'
SUBTASK_NAME = 'A'
pred_fname = 'submission_{}_{}.csv'.format(LANGUAGE, SUBTASK_NAME)
BASE+'Predictions\\'+pred_fname
'''

In [None]:
# Disabled cell (code kept inside a string so it does not run).
# When enabled it displays three columns of `df_unk`.
# NOTE(review): 'task1' is only assigned in the following cell, so this presumably relies
# on the CSV already containing that column — verify before enabling.
'''
df_unk[['tweet_id', 'task1', 'ID']]
'''

In [None]:
# Disabled cell (code kept inside a string so it does not run).
# When enabled it maps predicted label 0 -> 'NOT' and 1 -> 'HOF', keeps the three
# submission columns and writes them to the predictions CSV without the index.
'''
i2t = ['NOT', 'HOF']
df_unk['task1'] = [i2t[i] for i in pred_unk]
df_unk = df_unk[['tweet_id', 'task1', 'ID']]
df_unk.to_csv(BASE+'Predictions\\'+pred_fname, encoding='utf8', index=None)
'''