# Notebook for classifying claims and non-claims with BiLSTM and CNN

In [1]:
from scripts.load_corpus import DaxenbergerModified, StabGurevychCorpus

import numpy as np

from sklearn.model_selection import train_test_split
from sklearn import metrics

from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM, Bidirectional
from tensorflow.keras.layers import Conv1D, Flatten, MaxPooling1D


Loading corpus from script class

In [2]:
#corpus_1 = StabGurevychCorpus()
#df_all = corpus_1.df_all
#print(df_all.head)

corpus_2 = DaxenbergerModified()
df_all = corpus_2.df_all
print(df_all.head)

<bound method NDFrame.head of                                                    text  target
0     How can anyone expect children could do well a...       0
1     Firstly , I think that the new high school wil...       1
2     With technological advances , children have mo...       0
3     Nowadays , many professors conduct research wh...       0
4     In today ' s world there are many great and us...       0
...                                                 ...     ...
7046  Last but not least , knowledge is worth mentio...       1
7047  To illustrate this point , I can write about m...       0
7048  Consider a circumstance in which a student who...       0
7049  in my opinion , reducing stress by listening t...       1
7050  In addition , the basic economic course can al...       1

[7051 rows x 2 columns]>


Hyperparameters used in BiLSTM and CNN models

In [3]:
vocab_size = 25000
embedding_dim = 300
input_n = 50 # padded maximum length for each sample

Encoding words from corpus

In [4]:
encoding = [one_hot(words,vocab_size) for words in df_all.iloc[:, 0]]

print(encoding)

[[23925, 21232, 18845, 7254, 14601, 3197, 19973, 8275, 5513, 10329, 1599, 13809, 14075, 13483, 17488, 23224], [1467, 15505, 8378, 9635, 12401, 2255, 4778, 10329, 11930, 21496, 18758, 4353, 5074, 16402, 8614, 9372, 8140, 21709, 22200], [4756, 15172, 18543, 14601, 14075, 18814, 4649, 7735, 11046, 23380, 10175], [11509, 14726, 6709, 11093, 2469, 10675, 5838, 21709, 92, 21034, 13334], [21709, 20590, 15287, 10134, 9027, 15831, 15789, 14726, 20885, 818, 6846, 2503, 15535, 4445, 5442, 2944, 5442, 12401, 3635, 818, 21563, 818, 6184], [15505, 5133, 6658, 14075, 897, 7735, 19870, 18758, 13173, 21034, 7735, 22539, 4756, 10207, 21034, 16502, 21709, 9942, 3977, 897, 3807, 8614, 7549, 3477, 18397, 17006, 7735, 12649, 18758, 14265, 6426], [15505, 13555, 9635, 21709, 15022, 11453, 12401, 17046, 18397, 6658, 24820, 7735, 8841, 21709], [20010, 7559, 783, 22923, 2853], [20404, 15505, 8378, 21709, 23106, 8146, 818, 17873, 2426, 5902, 18503, 21709, 22284, 16797, 18814, 9059, 21709, 5169, 16797], [20190, 12

Padding to bring all the sequences to the same length

In [5]:
emb_doc = pad_sequences(encoding, padding='pre', maxlen=input_n)
print(emb_doc)

[[    0     0     0 ... 13483 17488 23224]
 [    0     0     0 ...  8140 21709 22200]
 [    0     0     0 ... 11046 23380 10175]
 ...
 [    0     0     0 ... 21709 10363 10329]
 [    0     0     0 ...   818 12979 16797]
 [    0     0     0 ... 18424  7735 24785]]


Converting data back to arrays and splitting it into train, validation, and 
test set

In [6]:
X = np.array(emb_doc)
y = np.array(df_all.iloc[:, 1])

random_seeds = [0, 1, 2, 3, 4]

X_train, X_rem, y_train, y_rem = train_test_split(X, y,
                                                  train_size=0.7,
                                                  random_state=random_seeds[4])
print(X_train.shape)

X_tr = np.column_stack((X_train, y_train))

positive_entries = X_tr[X_tr[:, -1] == 1]
negative_entries = X_tr[X_tr[:, -1] == 0]

min_size = min(len(positive_entries), len(negative_entries))

if len(positive_entries) > len(negative_entries):
    positive_entries = positive_entries[:min_size]
else:
    negative_entries = negative_entries[:min_size]

X_tr = np.concatenate((positive_entries, negative_entries))
np.random.shuffle(X_tr)
X_train = X_tr[:, :50]
y_train = X_tr[:,-1]

print(X_train.shape)
print(y_train.shape)

print(X_rem.shape)
X_valid, X_test, y_valid, y_test = train_test_split(X_rem,y_rem, 
                                                    test_size=0.5,
                                                    random_state=42)
print(X_valid.shape)
print(X_test.shape)

(4935, 50)
(2924, 50)
(2924,)
(2116, 50)
(1058, 50)
(1058, 50)


## BiLSTM

Defining BiLSTM model

In [7]:
bilstm_model = Sequential()
bilstm_model.add(Embedding(vocab_size, embedding_dim, input_length=input_n)) 
bilstm_model.add(Bidirectional(LSTM(100)))
bilstm_model.add(Dense(1, activation='sigmoid'))
bilstm_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(bilstm_model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 50, 300)           7500000   
                                                                 
 bidirectional (Bidirection  (None, 200)               320800    
 al)                                                             
                                                                 
 dense (Dense)               (None, 1)                 201       
                                                                 
Total params: 7821001 (29.83 MB)
Trainable params: 7821001 (29.83 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


Training the Bi-LSTM model

In [8]:
bilstm_model.fit(X_train, y_train, validation_data=(X_valid, y_valid), epochs=10, batch_size=512)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x28a46ab10>

Evaluation with validation set:

Prediction results:
1 = claim
0 = non-claim

In [9]:
y_pred = [value for value in (bilstm_model.predict(X_valid) > 0.5).astype("int32")]
cr = metrics.classification_report(y_valid.tolist(), y_pred, digits=4)
print(cr)

              precision    recall  f1-score   support

           0     0.8651    0.6413    0.7366       750
           1     0.4641    0.7565    0.5753       308

    accuracy                         0.6749      1058
   macro avg     0.6646    0.6989    0.6560      1058
weighted avg     0.7484    0.6749    0.6896      1058



Evaluation with test set:

Prediction results:
1 = claim
0 = non-claim

In [10]:
y_pred_test = [value for value in (bilstm_model.predict(X_test) > 0.5).astype("int32")]
cr = metrics.classification_report(y_test.tolist(), y_pred_test, digits=4)
print(cr)

              precision    recall  f1-score   support

           0     0.8429    0.6422    0.7290       735
           1     0.4719    0.7276    0.5725       323

    accuracy                         0.6682      1058
   macro avg     0.6574    0.6849    0.6507      1058
weighted avg     0.7296    0.6682    0.6812      1058



## CNN

Defining CNN model

In [11]:
cnn_model = Sequential()
cnn_model.add(Embedding(vocab_size, embedding_dim, input_length=input_n))
cnn_model.add(Conv1D(filters=128, kernel_size=4, padding='same', activation='relu'))
cnn_model.add(MaxPooling1D(pool_size=2))
cnn_model.add(Conv1D(filters=64, kernel_size=4, padding='same', activation='relu'))
cnn_model.add(MaxPooling1D(pool_size=2))
cnn_model.add(Conv1D(filters=32, kernel_size=4, padding='same', activation='relu'))
cnn_model.add(MaxPooling1D(pool_size=2))
cnn_model.add(Flatten())
cnn_model.add(Dense(256, activation='relu'))
cnn_model.add(Dense(1, activation='sigmoid'))
cnn_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
cnn_model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 50, 300)           7500000   
                                                                 
 conv1d (Conv1D)             (None, 50, 128)           153728    
                                                                 
 max_pooling1d (MaxPooling1  (None, 25, 128)           0         
 D)                                                              
                                                                 
 conv1d_1 (Conv1D)           (None, 25, 64)            32832     
                                                                 
 max_pooling1d_1 (MaxPoolin  (None, 12, 64)            0         
 g1D)                                                            
                                                                 
 conv1d_2 (Conv1D)           (None, 12, 32)           

Training the CNN model

In [12]:
cnn_model.fit(X_train, y_train, validation_split=0.1, epochs=10, 
              batch_size=512, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x29a42bf50>

Evaluation with validation set:

Prediction results: \
1 = claim \
0 = non-claim

In [13]:
y_pred = [value for value in (cnn_model.predict(X_valid) > 0.5).astype("int32")]
cr = metrics.classification_report(y_valid.tolist(), y_pred, digits=4)
print(cr)

              precision    recall  f1-score   support

           0     0.8293    0.6347    0.7190       750
           1     0.4339    0.6818    0.5303       308

    accuracy                         0.6484      1058
   macro avg     0.6316    0.6582    0.6247      1058
weighted avg     0.7142    0.6484    0.6641      1058



Evaluation with test set:

Prediction results: \
1 = claim \
0 = non-claim 

In [14]:
y_pred_test = [value for value in (cnn_model.predict(X_test) > 0.5).astype("int32")]
cr = metrics.classification_report(y_test.tolist(), y_pred_test, digits=4)
print(cr)

              precision    recall  f1-score   support

           0     0.8122    0.6707    0.7347       735
           1     0.4634    0.6471    0.5401       323

    accuracy                         0.6635      1058
   macro avg     0.6378    0.6589    0.6374      1058
weighted avg     0.7057    0.6635    0.6753      1058



corpus 1 \
model , macro F1, claim F1

1st iteration \
BiLSTM: 0.6534 0.5829 \
CNN: 0.6122; 0.5393

2nd iteration \
BiLSTM: 0.6405; 0.5816 \
CNN: 0.6184; 0.5420

3rd iteration \
BiLSTM: 0.6417; 0.5605 \
CNN: 0.6061; 0.5593

4th iteration \
BiLSTM: 0.6388; 0.5749 \
CNN: 0.6313; 0.5737

5th iteration \
BiLSTM: 0.6446, 0.6098 \
CNN: 0.5876; 0.5586

In [15]:
# average results for corpus 1

bilstm_mean_macro_f1 = (0.6534 + 0.6405 + 0.6417 + 0.6388 + 0.6446) / 5
bilstm_mean_claim_f1 = (0.5829 + 0.5816 + 0.5605 + 0.5749 + 0.6098) / 5

cnn_mean_macro_f1 = (0.6122 + 0.6184 + 0.6061 + 0.6313 + 0.5876) / 5
cnn_mean_claim_f1 = (0.5393 + 0.5420 + 0.5593 + 0.5737 + 0.5586) / 5

print("BiLSTM")
print(bilstm_mean_macro_f1)
print(bilstm_mean_claim_f1)
print("CNN")
print(cnn_mean_macro_f1)
print(cnn_mean_claim_f1)

BiLSTM
0.6437999999999999
0.58194
CNN
0.61112
0.55458


corpus 2 \
model , macro F1, claim F1

1st iteration \
BiLSTM: 0.6638; 0.5911 \
CNN: 0.6430; 0.5803

2nd iteration \
BiLSTM: 0.6470; 0.5687 \
CNN: 0.5975; 0.5545

3rd iteration \
BiLSTM 0.6223; 0.5345 \
CNN: 0.6263; 0.5316

4th iteration \
BiLSTM: 0.6311; 0.5533 \
CNN: 0.6476; 0.5530

5th iteration \
BiLSTM: 0.6568; 0.5413 \
CNN: 0.6314; 0.5569

In [16]:
# average results for corpus 2

bilstm_mean_macro_f1 = (0.6638 + 0.6470 + 0.6223 + 0.6311 + 0.6568) / 5
bilstm_mean_claim_f1 = (0.5911 + 0.5687 + 0.5345 + 0.5533 + 0.5413) / 5

cnn_mean_macro_f1 = (0.6430 + 0.5975 + 0.6263 + 0.6476 + 0.6314) / 5
cnn_mean_claim_f1 = (0.5803 + 0.5545 + 0.5316 + 0.5530 + 0.5569) / 5

print("BiLSTM")
print(bilstm_mean_macro_f1)
print(bilstm_mean_claim_f1)
print("CNN")
print(cnn_mean_macro_f1)
print(cnn_mean_claim_f1) 

BiLSTM
0.6442
0.5577799999999999
CNN
0.62916
0.55526
