# Imports and Loading Data

Links to Google Colab containing other claim spotting models can be found below:


*   [Transformer/BERT-based](https://colab.research.google.com/github/idirlab/claimspotter/blob/master/adv_transformer/adv_transformer-notebook.ipynb)
*   [SVM](https://colab.research.google.com/github/idirlab/claimspotter/blob/master/svm/svm-notebook.ipynb)



In [1]:
# Copyright (C) 2020 IDIR Lab - UT Arlington
#
#     This program is free software: you can redistribute it and/or modify
#     it under the terms of the GNU General Public License v3 as published by
#     the Free Software Foundation.
#
#     This program is distributed in the hope that it will be useful,
#     but WITHOUT ANY WARRANTY; without even the implied warranty of
#     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
#     GNU General Public License for more details.
#
#     You should have received a copy of the GNU General Public License
#     along with this program.  If not, see <https://www.gnu.org/licenses/>.
#
# Contact Information:
#     See: https://idir.uta.edu/cli.html
#
#     Chengkai Li
#     Box 19015
#     Arlington, TX 76019
#

# %tensorflow_version 2.x
import os
from shutil import rmtree
import math
import pandas as pd
import numpy as np
import tensorflow as tf
print(tf.config.list_physical_devices('GPU'))
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold
from sklearn import preprocessing
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from urllib.request import urlretrieve
from zipfile import ZipFile

print(tf.config.list_physical_devices())
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
    print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    tpu_strategy = tf.distribute.experimental.TPUStrategy(tpu)
except ValueError:
    tpu_strategy = None
    print('Not connected to a TPU runtime; please see the previous cell in this notebook for instructions!')


2024-02-16 16:05:24.501171: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-02-16 16:05:24.590335: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-16 16:05:24.590370: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-16 16:05:24.598004: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-16 16:05:24.610893: I tensorflow/core/platform/cpu_feature_guar

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'), PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
Not connected to a TPU runtime; please see the previous cell in this notebook for instructions!


Define parameters for text vectorization

In [2]:
max_len = 500
top_words = 5000
max_words = 10000
path_to_data = './'
glove_dir = './'
embedding_dim = 300
embedding_file_name = 'glove.6B.300d.txt'

Obtain and process data

In [6]:
urlretrieve("https://github.com/idirlab/claimspotter/raw/master/data/two_class/kfold_25ncs.json", "claimbuster_dataset.json")
urlretrieve("http://nlp.stanford.edu/data/glove.6B.zip", "glove6b.zip", )
with ZipFile('glove6b.zip', 'r') as z:
    z.extractall()
os.remove('glove6b.zip')

In [3]:
dataset_loc = path_to_data + 'claimbuster_dataset.json'
data = pd.read_json(dataset_loc)

##### Read in labels and values (texts) for training and testing data


In [4]:
labels = data.label
texts = data.text

##### Vectorize data

In [5]:
tokenizer = Tokenizer(num_words = max_words)
tokenizer.fit_on_texts(texts)

sequences = tokenizer.texts_to_sequences(texts)                
word_index = tokenizer.word_index   
data = pad_sequences(sequences, maxlen=max_len)
labels = np.asarray(labels)             

##### Shuffle data and labels

In [6]:
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

##### Convert labels to categorical values

In [7]:
labels_categorical = to_categorical(labels)
X_train = data
Y_train = labels_categorical

# Creating Model

##### Parse the GloVe word-embedding and normalize embedding matrix
[https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html](https://blog.keras.io/using-pre-trained-word-embeddings-in-a-keras-model.html)

In [8]:
# create dictionary to map word -> embedding vector
embeddings_index = {}                                        
f = open(os.path.join(glove_dir, embedding_file_name))
i = 0
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print("Found %s word vectors." % len(embeddings_index)) 

embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    if i < max_words:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

Found 400000 word vectors.


In [9]:
embedding_matrix = preprocessing.scale(embedding_matrix)

##### Define the model

In [10]:
def create_model(max_words, embedding_dim, max_len, embedding_matrix):
    hidden_dim = 300 
    model = Sequential()
    model.add(Embedding(max_words, embedding_dim, input_length=max_len))
    model.add(Bidirectional(LSTM(hidden_dim)))
    model.add(Dropout(0.5))
    model.add(Dense(2, activation='softmax'))
    
    # Load embeddings
    model.layers[0].set_weights([embedding_matrix])
    model.layers[0].trainable = False   
    
    # Compile
    model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['acc'])
    
    return model

##### Define functions for computing performance metrics

In [11]:
def compute_average_precision(labels, scores, cutoff=None):
    # https://github.com/apepa/clef2019-factchecking-task1/blob/master/scorer/task1.py#L52
    combined = sorted([(scores[i], labels[i]) for i in range(len(scores))], reverse=True)
    combined = combined if cutoff is None else combined[:(cutoff if cutoff < len(combined) else len(combined))]
    labels = [x[1] for x in combined]
    precisions = []
    num_correct = 0
    num_positive = sum(labels)
    for i, x in enumerate(combined):
        if cutoff is not None and i >= cutoff:
            break
        if x[1] == 1:
            num_correct += 1
            precisions.append(num_correct / (i + 1))
    if precisions:
        avg_prec = sum(precisions) / num_positive
    else:
        avg_prec = 0.0
    return avg_prec

In [12]:
def compute_dcg_term(i, labels, ver=1):
    # Difference between version 0 and 1: https://en.wikipedia.org/wiki/Discounted_cumulative_gain#Discounted_Cumulative_Gain
    return labels[i - 1] / math.log2(i + 1) if ver == 0 else ((1 << labels[i - 1]) - 1) / math.log2(i + 1)

def compute_ndcg(labels, scores, cutoff=None):
    # Precondition: for each index i, scores[i] corresponds with labels[i]
    ver = 0
    combined = sorted([(scores[i], labels[i]) for i in range(len(scores))], reverse=True)
    combined = combined if cutoff is None else combined[:(cutoff if cutoff < len(combined) else len(combined))]
    labels = [x[1] for x in combined]
    dcg = sum([compute_dcg_term(i, labels, ver=ver) for i in range(1, len(labels) + 1, 1)])
    ideal_labels = sorted(labels, reverse=True)
    idcg = sum([compute_dcg_term(i, ideal_labels, ver=ver) for i in range(1, len(labels) + 1, 1)])
    return dcg / idcg

##### K-Folds model training and evaluation

In [13]:
n_folds = 4
shuffle = True
random_state = 1
predicted_y_list = []
true_y_list = []
cfs_probabilities = []

# Train BiLSTM

In [14]:
for train_index, test_index in StratifiedKFold(n_splits=n_folds, shuffle=shuffle, random_state=random_state).split(X_train, labels):
    x_train, x_test = X_train[train_index], X_train[test_index]
    y_train, y_test = Y_train[train_index], Y_train[test_index]
    

    # Enter TPU env:
    print("x_train dimensions: ", x_train.shape)
    print("y_train dimensions: ", y_train.shape)
    if tpu_strategy is not None:
        with tpu_strategy.scope():
            # instantiate model
            model = create_model(max_words, embedding_dim, max_len, embedding_matrix)
            
            # train model
            print("Training ....")
            history = model.fit(x_train, y_train, epochs = 15)
            
            # print fold classification report
            y_hat = model.predict(x_test, verbose = 0)
            cfs_probs = y_hat[:, 1]
            y_hat_classes = tf.argmax(y_hat, axis=1).numpy()
            y_test_classes = tf.argmax(y_test, axis=1).numpy()
    else:
        model = create_model(max_words, embedding_dim, max_len, embedding_matrix)
        print("Training ....")
        history = model.fit(x_train, y_train, epochs = 15)
        # print fold classification report
        y_hat = model.predict(x_test, verbose = 0)
        cfs_probs = y_hat[:, 1]
        y_hat_classes = tf.argmax(y_hat, axis=1).numpy()
        y_test_classes = tf.argmax(y_test, axis=1).numpy()
    
    print(classification_report(y_test_classes, y_hat_classes, ))
    print("Average precision: ", compute_average_precision(y_test_classes, y_hat_classes))
    print("ndcg: ", compute_ndcg(y_test_classes, cfs_probs))
    
    # store predicted and true values for aggregated classification report
    predicted_y_list.extend(y_hat_classes)
    true_y_list.extend(y_test_classes)
    cfs_probabilities.extend(cfs_probs)

x_train dimensions:  (7255, 500)
y_train dimensions:  (7255, 2)


2024-02-16 16:07:29.198252: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1929] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 38374 MB memory:  -> device: 0, name: NVIDIA A100-PCIE-40GB, pci bus id: 0000:c1:00.0, compute capability: 8.0
2024-02-16 16:07:35.204229: I external/local_tsl/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory


Training ....
Epoch 1/15


2024-02-16 16:07:43.964806: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8904
2024-02-16 16:07:47.721891: I external/local_xla/xla/service/service.cc:168] XLA service 0x7fa4e4b9e320 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-02-16 16:07:47.721924: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA A100-PCIE-40GB, Compute Capability 8.0
2024-02-16 16:07:48.355788: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1708096069.231657 4179544 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
              precision    recall  f1-score   support

           0       0.90      0.93      0.91      1728
           1       0.81      0.74      0.77       691

    accuracy                           0.88      2419
   macro avg       0.86      0.83      0.84      2419
weighted avg       0.87      0.88      0.87      2419

Average precision:  0.9565475704143185
ndcg:  0.9808090047439703
x_train dimensions:  (7255, 500)
y_train dimensions:  (7255, 2)
Training ....
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
              precision    recall  f1-score   support

           0       0.92      0.86      0.89      1728
           1       0.71      0.81      0.76       691

    accuracy                       

In [15]:
print("Classification report for all models")
print(classification_report(true_y_list, predicted_y_list, digits=4))

Classification report for all models
              precision    recall  f1-score   support

           0     0.9032    0.9142    0.9087      6910
           1     0.7787    0.7551    0.7667      2764

    accuracy                         0.8687      9674
   macro avg     0.8410    0.8346    0.8377      9674
weighted avg     0.8676    0.8687    0.8681      9674



In [16]:
precisions = compute_average_precision(true_y_list, predicted_y_list)
print(precisions)

0.9516860967094833


In [17]:
ndcg = compute_ndcg(true_y_list, cfs_probabilities)
print(ndcg)

0.9833205508722823


##### Training model on full dataset and save it.

In [None]:
# Reset saved_models dir
rmtree("./saved_models", ignore_errors=True)
os.makedirs("saved_models", exist_ok=False)

In [None]:
with tpu_strategy.scope():
    full_model = create_model(max_words, embedding_dim, max_len, embedding_matrix)

In [None]:
with tpu_strategy.scope():
    history = full_model.fit(X_train, Y_train, epochs = 14)

In [None]:
full_model.save(os.path.join("./saved_models/", 'Full_BiLSTM.h5'))