# Notebook for classifying claims and non-claims with BiLSTM and CNN

In [34]:
from scripts.load_corpus import DaxenbergerModified
from scripts.load_corpus import StabGurevychCorpus

import numpy as np

from sklearn.model_selection import train_test_split
from sklearn import metrics

from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Bidirectional

from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import MaxPooling1D


Loading corpus from script class

In [35]:
#corpus_1 = StabGurevychCorpus()
#df_all = corpus_1.df_all
#print(df_all.head)

corpus_2 = DaxenbergerModified()
df_all = corpus_2.df_all
print(df_all.head)

Length of minority and majority 

<bound method NDFrame.head of                                                    text  target
0     Finally , some assert that the ability to plan...       0
1     For example , a girl , who is interested in li...       0
2     As a result , experiences of difficulties teac...       1
3     Nowadays , the difference between school and s...       0
4     Last but not least , the Internet offers a mor...       0
...                                                 ...     ...
4181  To give a brief conclusion , Modern technogy h...       1
4182  The main advantage of high-tech medical care i...       1
4183  Nobody likes attending boring conferences , or...       0
4184  Hence , from this case , we are capable of sta...       1
4185  However , since the budget and the recourse ar...       0

[4186 rows x 2 columns]>


Hyperparameters used in BiLSTM and CNN models

In [36]:
vocab_size = 25000
embedding_dim = 300
input_n = 50 # padded maximum length for each sample

Encoding words from corpus

In [37]:
encoding = [one_hot(words,vocab_size) for words in df_all.iloc[:, 0]]

print(encoding)

[[24273, 9307, 11176, 8518, 13335, 3877, 17205, 3264, 24475, 12685, 5804, 8145, 17915, 5365, 22697, 9163, 20787], [13339, 16178, 9163, 8442, 7822, 6775, 16092, 18179, 14750, 6775, 6423, 15036, 22587, 22444, 21332, 12521, 7149, 6775, 1970, 17205, 14165, 9892, 15543, 8626, 24475, 6457, 12144, 5804, 2096, 13335, 8442, 18771, 11227, 23541, 16119, 24475, 10929, 1345], [20684, 9163, 18552, 18777, 21228, 19172, 22857, 5634, 17363, 17205, 11488, 9163, 3486, 5652, 24475, 12263, 21199, 6433, 13335, 20350], [23224, 13335, 15269, 573, 10561, 24475, 11658, 6775, 3024, 24475, 3024], [7357, 12521, 4222, 20118, 13335, 23138, 23474, 9163, 24538, 4826, 24475, 15250, 17208, 21228, 4090], [3479, 13193, 11519, 13767, 8145, 23572, 17205, 11649, 11447, 8518, 659, 9208, 18212, 7318, 16302, 1886, 13189, 3089, 12921], [4091, 6775, 17253, 24653, 14166, 8518, 16523, 20723, 5804, 8145, 302, 17721, 13335, 16456, 10987], [8210, 12110, 7507, 21228, 19979, 9231, 17492, 8650, 4986, 14166, 24373, 24264, 14610, 8518, 760

Padding to bring all the sequences to the same length

In [38]:
emb_doc = pad_sequences(encoding, padding='pre', maxlen=input_n)
print(emb_doc)

[[    0     0     0 ... 22697  9163 20787]
 [    0     0     0 ... 24475 10929  1345]
 [    0     0     0 ...  6433 13335 20350]
 ...
 [    0     0     0 ... 18179 13189  9974]
 [    0     0     0 ... 17205 11741 17760]
 [    0     0     0 ...  7665  4037  7559]]


Converting data back to arrays and splitting it into train, validation, and 
test set

In [39]:
X = np.array(emb_doc)
y = np.array(df_all.iloc[:, 1])

print(X)
print(y)

X_train, X_rem, y_train, y_rem = train_test_split(X, y,
                                                  train_size=0.7,
                                                  random_state=42)
print(X_train.shape)
print(X_rem.shape)
X_valid, X_test, y_valid, y_test = train_test_split(X_rem,y_rem, 
                                                    test_size=0.5,
                                                    random_state=42)
print(X_valid.shape)
print(X_test.shape)

[[    0     0     0 ... 22697  9163 20787]
 [    0     0     0 ... 24475 10929  1345]
 [    0     0     0 ...  6433 13335 20350]
 ...
 [    0     0     0 ... 18179 13189  9974]
 [    0     0     0 ... 17205 11741 17760]
 [    0     0     0 ...  7665  4037  7559]]
[0 0 1 ... 0 1 0]
(2930, 50)
(1256, 50)
(628, 50)
(628, 50)


## BiLSTM

Defining BiLSTM model

In [40]:
bilstm_model = Sequential()
bilstm_model.add(Embedding(vocab_size, embedding_dim, input_length=input_n)) 
bilstm_model.add(Bidirectional(LSTM(100)))
bilstm_model.add(Dense(1, activation='sigmoid'))
bilstm_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
print(bilstm_model.summary())

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 50, 300)           7500000   
                                                                 
 bidirectional_2 (Bidirecti  (None, 200)               320800    
 onal)                                                           
                                                                 
 dense_6 (Dense)             (None, 1)                 201       
                                                                 
Total params: 7821001 (29.83 MB)
Trainable params: 7821001 (29.83 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


Training the Bi-LSTM model

In [41]:
bilstm_model.fit(X_train, y_train, validation_data=(X_valid, y_valid), epochs=10, batch_size=512)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x290aa94d0>

Evaluation with validation set:

Prediction results:
1 = claim
0 = non-claim

In [42]:
y_pred = [value for value in (bilstm_model.predict(X_valid) > 0.5).astype("int32")]
cr = metrics.classification_report(y_valid.tolist(), y_pred, digits=4)
print(cr)

              precision    recall  f1-score   support

           0     0.6578    0.6429    0.6502       308
           1     0.6636    0.6781    0.6708       320

    accuracy                         0.6608       628
   macro avg     0.6607    0.6605    0.6605       628
weighted avg     0.6608    0.6608    0.6607       628



Evaluation with test set:

Prediction results:
1 = claim
0 = non-claim

In [43]:
y_pred_test = [value for value in (bilstm_model.predict(X_test) > 0.5).astype("int32")]
cr = metrics.classification_report(y_test.tolist(), y_pred_test, digits=4)
print(cr)

              precision    recall  f1-score   support

           0     0.6722    0.6634    0.6678       306
           1     0.6840    0.6925    0.6883       322

    accuracy                         0.6783       628
   macro avg     0.6781    0.6780    0.6780       628
weighted avg     0.6783    0.6783    0.6783       628



## CNN

Defining CNN model

In [44]:
cnn_model = Sequential()
cnn_model.add(Embedding(vocab_size, embedding_dim, input_length=input_n))
cnn_model.add(Conv1D(filters=128, kernel_size=4, padding='same', activation='relu'))
cnn_model.add(MaxPooling1D(pool_size=2))
cnn_model.add(Conv1D(filters=64, kernel_size=4, padding='same', activation='relu'))
cnn_model.add(MaxPooling1D(pool_size=2))
cnn_model.add(Conv1D(filters=32, kernel_size=4, padding='same', activation='relu'))
cnn_model.add(MaxPooling1D(pool_size=2))
cnn_model.add(Flatten())
cnn_model.add(Dense(256, activation='relu'))
cnn_model.add(Dense(1, activation='sigmoid'))
cnn_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
cnn_model.summary()

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 50, 300)           7500000   
                                                                 
 conv1d_6 (Conv1D)           (None, 50, 128)           153728    
                                                                 
 max_pooling1d_6 (MaxPoolin  (None, 25, 128)           0         
 g1D)                                                            
                                                                 
 conv1d_7 (Conv1D)           (None, 25, 64)            32832     
                                                                 
 max_pooling1d_7 (MaxPoolin  (None, 12, 64)            0         
 g1D)                                                            
                                                                 
 conv1d_8 (Conv1D)           (None, 12, 32)           

Training the CNN model

In [45]:
cnn_model.fit(X_train, y_train, validation_split=0.1, epochs=10, 
              batch_size=512, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x298574950>

Evaluation with validation set:

Prediction results: \
1 = claim \
0 = non-claim

In [46]:
y_pred = [value for value in (cnn_model.predict(X_valid) > 0.5).astype("int32")]
cr = metrics.classification_report(y_valid.tolist(), y_pred, digits=4)
print(cr)

              precision    recall  f1-score   support

           0     0.6338    0.7305    0.6787       308
           1     0.6960    0.5938    0.6408       320

    accuracy                         0.6608       628
   macro avg     0.6649    0.6621    0.6598       628
weighted avg     0.6655    0.6608    0.6594       628



Evaluation with test set:

Prediction results: \
1 = claim \
0 = non-claim 

In [47]:
y_pred_test = [value for value in (cnn_model.predict(X_test) > 0.5).astype("int32")]
cr = metrics.classification_report(y_test.tolist(), y_pred_test, digits=4)
print(cr)

              precision    recall  f1-score   support

           0     0.6369    0.7222    0.6769       306
           1     0.6975    0.6087    0.6501       322

    accuracy                         0.6640       628
   macro avg     0.6672    0.6655    0.6635       628
weighted avg     0.6680    0.6640    0.6631       628



corpus 1 \
model , macro F1, claim F1

1st iteration \
BiLSTM 0.6614 ; 0.6503 \
CNN 0.6525 ; 0.6422

2nd iteration \
BiLSTM 0.6719 ; 0.6802 \
CNN 0.6488 ; 0.6785

3rd iteration \
BiLSTM 0.6689 ; 0.6773 \
CNN 0.6718 ; 0.6474

4th iteration \
BiLSTM 0.6868 ; 0.6798 \
CNN 0.6391 ; 0.6739

5th iteration \
BiLSTM 0.6660 ; 0.6744 \
CNN 0.6553 ; 0.6292

In [48]:
# average results for corpus 1

bilstm_mean_macro_f1 = (0.6614 + 0.6719 + 0.6689 + 0.6868 + 0.6660) / 5
bilstm_mean_claim_f1 = (0.6503 + 0.6802 + 0.6773 + 0.6798 + 0.6744) / 5

cnn_mean_macro_f1 = (0.6525 + 0.6488 + 0.6718 + 0.6391 + 0.6553) / 5
cnn_mean_claim_f1 = (0.6422 + 0.6785 + 0.6474 + 0.6739 + 0.6292) / 5

print("BiLSTM")
print(bilstm_mean_macro_f1)
print(bilstm_mean_claim_f1)
print("CNN")
print(cnn_mean_macro_f1)
print(cnn_mean_claim_f1)

BiLSTM
0.671
0.6723999999999999
CNN
0.6535
0.6542399999999999


corpus 2 \
model , macro F1, claim F1

1st iteration \
BiLSTM 0.6586 ; 0.6738 \
CNN 0.6711 ; 0.6879

2nd iteration \ 
BiLSTM 0.6695 ; 0.6411 \
CNN 0.6717 ; 0.6963

3rd iteration \
BiLSTM 0.6876 ; 0.6776 \
CNN 0.6803 ; 0.7179

4th iteration \
BiLSTM 0.6842 ; 0.6972 \
CNN 0.6746 ; 0.6881

5th iteration \
BiLSTM 0.6841 ; 0.6982 \
CNN 0.6624 ; 0.7193

In [49]:
# average results for corpus 2

bilstm_mean_macro_f1 = (0.6586 + 0.6695 + 0.6876 + 0.6842 + 0.6841) / 5
bilstm_mean_claim_f1 = (0.6738 + 0.6411 + 0.6776 + 0.6972 + 0.6982) / 5

cnn_mean_macro_f1 = (0.6711 + 0.6717 + 0.6803 + 0.6746 + 0.6624) / 5
cnn_mean_claim_f1 = (0.6879 + 0.6963 + 0.7179 + 0.6881 + 0.7193) / 5

print("BiLSTM")
print(bilstm_mean_macro_f1)
print(bilstm_mean_claim_f1)
print("CNN")
print(cnn_mean_macro_f1)
print(cnn_mean_claim_f1) 

BiLSTM
0.6768
0.6775800000000001
CNN
0.67202
0.7019
