In [17]:
import pandas as pd
import numpy as np
import nltk
import tensorflow as tf
import keras.backend as K
import tensorflow_addons as tfa
import keras
import torch

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, f1_score

seed = 2000
np.random.seed(seed)
from tensorflow.keras import regularizers
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import BatchNormalization, Dropout, Flatten, Dense, Embedding
from tensorflow.keras.preprocessing import sequence
from keras.wrappers.scikit_learn import KerasClassifier

from focal_loss import SparseCategoricalFocalLoss
from transformers import RobertaTokenizer, TFRobertaForSequenceClassification, AdamW

import warnings
warnings.filterwarnings("ignore")

num_i = 1

In [3]:
train_df = pd.read_csv('/Users/nitanshjain/Documents/Projects/CASE/codefiles/subtask_1_data/train_subtask1_preprocessed_{}.csv'.format(num_i))
print(train_df.head())

               index                                               text  label
0     train_01_0_892  state alleged hacked sabata petros chale thirt...      1
1    train_01_1_2714  chale allegedly chased group thirty people hac...      0
2   train_01_10_2619     farmworkers strike resumed tuesday demands met      1
3  train_01_100_2680  demonstrators filed permit hold rally saturday...      1
4  train_01_101_3090  footage attack included pregnant woman hit pro...      1


In [10]:
dev_df = pd.read_csv('/Users/nitanshjain/Documents/Projects/CASE/codefiles/subtask_1_data/dev_subtask1_preprocessed_{}.csv'.format(num_i))
dev_df.head()

dev_df_labels = pd.read_csv('/Users/nitanshjain/Documents/Projects/CASE/tanfiona CausalNewsCorpus master data-V2/dev_subtask1.csv')
labels = dev_df_labels['label'].values
print(labels)
print(dev_df.head())

[1 1 0 1 1 1 1 1 0 1 0 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 0 1 0 1 1 1 1 0 1 0
 0 1 1 1 0 1 1 0 0 1 0 0 0 1 1 0 1 1 0 1 0 1 1 0 0 0 1 0 0 0 0 1 1 1 1 0 0
 1 0 0 1 1 0 1 1 0 1 0 1 0 0 0 1 0 0 0 1 1 1 1 0 0 1 1 1 1 0 0 1 1 1 1 0 1
 0 1 0 1 1 0 1 0 1 1 1 0 1 0 0 1 1 1 0 0 0 1 1 1 1 1 1 0 1 1 1 0 0 0 1 0 1
 0 0 0 0 0 0 0 0 1 1 1 1 1 1 0 1 0 1 0 0 1 1 0 0 1 0 1 0 0 0 0 1 1 1 0 0 0
 0 1 1 1 0 1 1 0 1 0 0 1 1 0 0 0 0 0 0 0 1 1 1 1 1 1 1 0 1 1 1 1 0 1 0 1 0
 0 1 1 1 0 0 0 1 1 1 1 1 0 0 1 0 1 0 0 0 0 1 0 1 1 0 1 0 0 1 1 1 0 0 0 1 1
 0 1 1 0 0 1 0 0 0 1 1 0 0 1 1 1 0 1 0 1 0 1 1 1 0 0 0 0 1 0 1 0 0 0 1 0 0
 1 1 1 0 1 1 0 0 1 0 1 0 0 0 1 0 0 1 1 1 1 1 1 0 0 1 0 1 0 1 0 0 1 1 1 1 0
 0 1 0 1 1 1 0]
               index                                               text
0    train_10_0_2136  movement catapulted headlines early august sem...
1     train_10_1_350  several thousand protesters took streets six p...
2   train_10_10_3104  protest saving medha life also preserving peop...
3  train_10_100_1188 

In [26]:
x_train = train_df.text.to_numpy().reshape(-1)
y_train = train_df.label.to_numpy().reshape(-1,1)
x_dev = dev_df.text.to_numpy().reshape(-1)
y_dev = dev_df_labels.label.to_numpy().reshape(-1,1)

print(x_train.shape)
print(y_train.shape)
print(x_dev.shape)
print(y_dev.shape)


# creating the tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', num_labels=2, output_attentions=True)

(3075,)
(3075, 1)
(340,)
(340, 1)


In [27]:
# encoding the train and dev values using roberta
def roberta_encode(texts, tokenizer):
    MAX_LEN = 128
    
    ct = len(texts)
    
    input_ids = np.ones((ct, MAX_LEN), dtype='int32')       
    attention_mask = np.zeros((ct, MAX_LEN), dtype='int32')
    token_type_ids = np.zeros((ct, MAX_LEN), dtype='int32') # Not used in text classification

    for k, text in enumerate(texts):
        # Tokenize
        tok_text = tokenizer.tokenize(str(text))
        
        # Truncate and convert tokens to numerical IDs
        enc_text = tokenizer.convert_tokens_to_ids(tok_text[:(MAX_LEN-2)])
        
        input_length = len(enc_text) + 2
        input_length = input_length if input_length < MAX_LEN else MAX_LEN
        
        # Add tokens [CLS] and [SEP] at the beginning and the end
        input_ids[k,:input_length] = np.asarray([0] + enc_text + [2], dtype='int32')
        
        # Set to 1s in the attention input
        attention_mask[k,:input_length] = 1

    return {
        'input_word_ids': input_ids,
        'input_mask': attention_mask,
        'input_type_ids': token_type_ids
    }

In [28]:
# encoding the train and dev values using above function
x_train = roberta_encode(x_train, tokenizer)
x_dev = roberta_encode(x_dev, tokenizer)

y_train = np.asarray(y_train, dtype='int32')
y_dev = np.asarray(y_dev, dtype='int32')

In [29]:
# function to use f1_macro as a metric while compiling neural model
def binary_f1(y_true, y_pred):
    
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    
    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())
    f1_val = 2 * (precision * recall)/(precision + recall + K.epsilon())
    
    return f1_val

In [30]:
# building model with parameter n_categories that represents the number of classes
def build_model(n_categories):
    MAX_LEN = 128
    input_word_ids = tf.keras.Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_word_ids')
    input_mask = tf.keras.Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_mask')
    input_type_ids = tf.keras.Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_type_ids')
    
    

    # Import RoBERTa model from HuggingFace
    roberta_model = TFRobertaForSequenceClassification.from_pretrained('roberta-base')
    # custom_objects = {"TFRobertaForSequenceClassification": TFRobertaForSequenceClassification}
    # config = roberta_model_copy.get_config()
    # with tf.keras.utils.custom_object_scope(custom_objects):
    #     roberta_model = TFRobertaForSequenceClassification.from_config(config)
    x = roberta_model(input_word_ids, attention_mask=input_mask, token_type_ids=input_type_ids)

    # Huggingface transformers have multiple outputs, 
    # embeddings are the first one,
    # so let's slice out the first position
    x = x[0]

    x = tf.keras.layers.Dropout(rate=0.1)(x)
    x = tf.keras.layers.Flatten()(x)
    x = tf.keras.layers.Dense(64, activation='relu')(x)
    x = tf.keras.layers.Dense(n_categories, activation='sigmoid')(x)

    model = tf.keras.Model(inputs=[input_word_ids, input_mask, input_type_ids], outputs=x)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.00001),
        loss=SparseCategoricalFocalLoss(gamma=2),
        metrics=[binary_f1])
    
    

    return model

In [31]:
# building and providing arch of the model
tf.config.run_functions_eagerly(True)
model_copy = build_model(2)
model_copy.summary()

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "model_2"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_word_ids (InputLayer)    [(None, 128)]        0           []                               
                                                                                                  
 input_mask (InputLayer)        [(None, 128)]        0           []                               
                                                                                                  
 input_type_ids (InputLayer)    [(None, 128)]        0           []                               
                                                                                                  
 tf_roberta_for_sequence_classi  TFSequenceClassifie  124647170  ['input_word_ids[0][0]',         
 fication_2 (TFRobertaForSequen  rOutput(loss=None,               'input_mask[0][0]',       

In [32]:
batch_size = 16

# to save the best model
checkpoint_filepath = '/Users/nitanshjain/Documents/Projects/CASE/codefiles/subtask_1/checkpoints/model-improvement-roberta-h5-{epoch:02d}-{val_f1_macro:.2f}.h5'

# parameters based on which model is being saved
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor='val_binary_f1', 
    save_best_only=True,
    save_weights_only=False,
    mode='max'
    )

# fitting the model to training data
model_copy.fit(x=x_train,
                y=y_train,
                batch_size=batch_size,
                epochs=10,
                callbacks=[model_checkpoint_callback],
                validation_data=(x_dev, y_dev),
                shuffle=True,
                verbose=1, 
                )
        

Epoch 1/10
  1/193 [..............................] - ETA: 1:43:34 - loss: 0.1733 - binary_f1: 0.6429

KeyboardInterrupt: 

In [None]:
# loading the best model
from keras.models import load_model
model = build_model(3)

# used as because roberta is a custom object, if not mentioned code throws an error
with tf.keras.utils.CustomObjectScope({'TFRobertaForSequenceClassification': TFRobertaForSequenceClassification.from_pretrained('roberta-base')}):
    model.load_weights('/Users/nitanshjain/Documents/Projects/Sem_Eval/semeval2023task3/codefiles/subtask1/roberta/saved_models/semeval.h5')  
model.summary(print_fn=print)
