# Importing Libraries

In [None]:
# !pip install tensorflow_addons
# !pip install nlpaug
# !pip install focal_loss
# !pip install transformers
# !pip install torch 
# !pip install nltk

In [None]:
import pandas as pd
import numpy as np
import nltk
import tensorflow as tf
import tensorflow_addons as tfa
import keras
import torch

import nlpaug
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc
from nlpaug.util import Action


from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, f1_score

seed = 2000
np.random.seed(seed)
from tensorflow.keras import regularizers
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import BatchNormalization, Dropout, Flatten, Dense, Embedding
from tensorflow.keras.preprocessing import sequence
from keras.wrappers.scikit_learn import KerasClassifier

from focal_loss import SparseCategoricalFocalLoss
from transformers import AlbertConfig, AlbertTokenizer, TFAlbertForSequenceClassification


import warnings
warnings.filterwarnings("ignore")

# Reading and Handling the Data

In [None]:
def _read_and_summarise(csv_path):
    """Read ``csv_path`` and print its shape followed by its genre distribution.

    Returns the loaded DataFrame so the caller can bind it to a name.
    """
    frame = pd.read_csv(csv_path)
    print(frame.shape)
    print(frame.genre.value_counts())
    return frame


# English-only training split
train_sub1_df = _read_and_summarise('semeval/en_train_subtask_1.csv')

# training data from every language (including English) after translation
final_train_sub1_df = _read_and_summarise('semeval/final_train_subtask_1.csv')

# English-only development split
dev_sub1_df = _read_and_summarise('semeval/en_dev_subtask_1.csv')

# test_sub1_df = pd.read_csv('/Users/nitanshjain/Documents/Projects/Sem_Eval/semeval2023task3/preprocessed_data/extra_test.csv')
# print(test_sub1_df.shape)

In [None]:
# Label Encoding the genre columns in all three dataframes.
# A single encoder is fitted on the full (all-language) training labels and
# then reused for every split, so one genre string always maps to the same
# integer id. Fitting a fresh encoder per dataframe would silently shift the
# ids whenever one split lacks a class; with a shared encoder an unseen label
# raises instead of being mis-numbered.
le = LabelEncoder()
le.fit(final_train_sub1_df['genre'])

train_sub1_df['genre'] = le.transform(train_sub1_df['genre'])
print(train_sub1_df.genre.value_counts())

final_train_sub1_df['genre'] = le.transform(final_train_sub1_df['genre'])
print(final_train_sub1_df.genre.value_counts())

dev_sub1_df['genre'] = le.transform(dev_sub1_df['genre'])
print(dev_sub1_df.genre.value_counts())
dev_sub1_df.head()

In [None]:
# Performing Undersampling on final_train_sub1_df
# n0 / n1 are the number of rows removed from class 0 and class 1 respectively.
n0 = 864
n1 = 68

# random_state pins which rows are discarded, so the surviving training set
# does not depend on how much of the global NumPy RNG stream was consumed
# before this cell ran.
final_train_sub1_df = final_train_sub1_df.drop(
    final_train_sub1_df[final_train_sub1_df['genre'].eq(0)].sample(n0, random_state=seed).index
)
final_train_sub1_df = final_train_sub1_df.drop(
    final_train_sub1_df[final_train_sub1_df['genre'].eq(1)].sample(n1, random_state=seed).index
)
print(final_train_sub1_df.genre.value_counts())
final_train_sub1_df.shape

In [None]:
# # 
# new_train_df = pd.DataFrame(final_train_sub1_df.loc[final_train_sub1_df['genre']==0])
# new_train_df = new_train_df.append(final_train_sub1_df.loc[final_train_sub1_df['genre']==1])
# new_train_df = new_train_df.append(final_train_sub1_df.loc[final_train_sub1_df['genre']==2])
# new_train_df.genre.value_counts()

# Augmentation of Data

In [None]:
# augmentation of textual data
# Builds the nlpaug contextual word-embedding augmenter used by the
# augmentation loop: it works in "substitute" mode with the
# 'bert-base-uncased' model and aug_max=3.
# NOTE(review): presumably aug_max caps the number of substituted words per
# text -- confirm against the nlpaug documentation.
aug = naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="substitute", aug_max=3)

In [None]:
# Model inputs are the article column followed directly by the headline column;
# targets are kept as single-column DataFrames holding the encoded genre.
x_train = final_train_sub1_df['preprocessed_articles'] + final_train_sub1_df['preprocessed_headlines']
x_dev = dev_sub1_df['preprocessed_articles'] + dev_sub1_df['preprocessed_headlines']

y_train = final_train_sub1_df[['genre']]
y_dev = dev_sub1_df[['genre']]

In [None]:
# creating augmented sentences and labels using aug object
#
# Every training text with label 0, 1 or 2 is passed to the augmenter with
# n=2, and each returned variant is stored together with the label of the
# text it was derived from.

augmented_sentences = []
augmented_sentences_labels = []
count = 0
for i in x_train.index:
    count += 1
    if count % 10 == 0:
        print("Running count = ", count)

    label = y_train.genre[i]
    if label not in (0, 1, 2):
        # only the three known classes are augmented
        continue

    try:
        temps = aug.augment(x_train[i], n=2)
    except Exception as exc:
        # Best effort: a text the augmenter cannot handle is skipped, but the
        # failure is reported. Catching Exception (not a bare except) keeps
        # Ctrl-C / kernel interrupt working during this long-running loop.
        print("Augmentation failed for index {}: {!r}".format(i, exc))
        continue

    for sent in temps:
        augmented_sentences.append(sent)                 # augmented text
        augmented_sentences_labels.append(int(label))    # label of the source text

In [None]:
# creating augmented dataframes
# Build the Series directly from the lists: Series.append was removed in
# pandas 2.0, and starting from an empty Series was unnecessary. Explicit
# dtypes keep the result well-defined even when no sentence was produced.
x_train_aug = pd.Series(augmented_sentences, dtype="object")
y_train_aug = pd.Series(augmented_sentences_labels, dtype="int64")

In [None]:
# saving the augmented dataframes to csv
# The two unnamed Series become columns 0 (text) and 1 (label). The file is
# written to the same 'semeval/aug.csv' location that the modelling section
# reads back, so a fresh top-to-bottom run uses the data generated here.
aug_df = pd.concat([x_train_aug, y_train_aug], axis=1)
aug_df.to_csv('semeval/aug.csv')

# Implementing the Albert Model

In [None]:
# reading the augmented dataframes
# Columns are addressed by the string names '0' and '1'.
# NOTE(review): presumably '0' holds the augmented text and '1' its label,
# matching the order in which the two Series were concatenated before
# saving -- confirm against the file on disk.
aug_df = pd.read_csv('semeval/aug.csv')
x_train_aug = aug_df['0']
y_train_aug = aug_df['1']

In [None]:
# Balanced per-class weights for the augmented training labels,
# stored as a {label: weight} mapping and displayed as the cell output.
from sklearn.utils.class_weight import compute_class_weight

label_values = np.unique(y_train_aug)
balanced_weights = compute_class_weight(class_weight="balanced", classes=label_values, y=y_train_aug)

class_weights = {label: weight for label, weight in zip(label_values, balanced_weights)}
class_weights

In [None]:
# reshaping the train and dev values
# Texts become flat 1-D arrays; labels become column vectors of shape (n, 1).
# Note that x_train / y_train are rebound here to the *augmented* data read
# from the CSV, replacing the earlier un-augmented objects of the same name.
x_train = x_train_aug.to_numpy().reshape(-1)
y_train = y_train_aug.to_numpy().reshape(-1,1)
x_dev = x_dev.to_numpy().reshape(-1)
y_dev = y_dev.to_numpy().reshape(-1,1)

# creating the tokenizer
# NOTE(review): num_labels / output_attentions look like model-configuration
# arguments rather than tokenizer options -- confirm they are needed here.
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2', num_labels=3, output_attentions=True)

In [None]:
# encoding the train and dev values for the transformer model
def roberta_encode(texts, tokenizer, max_len=512):
    """Tokenize ``texts`` into fixed-length id / mask / type arrays.

    The name is historical; the function works with any tokenizer exposing
    ``tokenize`` and ``convert_tokens_to_ids``. The special-token ids are read
    from the tokenizer (``cls_token_id``, ``sep_token_id``, ``pad_token_id``)
    instead of being hard-coded: the previous constants (start=0, end=2,
    padding=1) match RoBERTa's vocabulary, not that of the ALBERT tokenizer
    this notebook actually passes in. If a tokenizer does not define one of
    these attributes, the legacy constant is used as a fallback.

    NOTE: weights fine-tuned on inputs produced with the old hard-coded ids
    were trained on different special tokens and should be retrained.

    Parameters
    ----------
    texts : sequence
        Items to encode; each one is converted with ``str()`` first.
    tokenizer : object
        Tokenizer providing ``tokenize`` and ``convert_tokens_to_ids``.
    max_len : int, default 512
        Length of every output row, including the two special tokens.

    Returns
    -------
    dict
        ``'input_word_ids'``, ``'input_mask'`` and ``'input_type_ids'``, each
        an int32 array of shape ``(len(texts), max_len)``.

    Raises
    ------
    ValueError
        If ``max_len`` is smaller than 2 (no room for the special tokens).
    """
    if max_len < 2:
        raise ValueError("max_len must be at least 2 to hold the special tokens")

    def _token_id(attr, legacy_default):
        # Prefer the tokenizer's own id; fall back to the old constant.
        value = getattr(tokenizer, attr, None)
        return legacy_default if value is None else value

    cls_id = _token_id('cls_token_id', 0)
    sep_id = _token_id('sep_token_id', 2)
    pad_id = _token_id('pad_token_id', 1)

    ct = len(texts)

    input_ids = np.full((ct, max_len), pad_id, dtype='int32')
    attention_mask = np.zeros((ct, max_len), dtype='int32')
    token_type_ids = np.zeros((ct, max_len), dtype='int32')  # single segment, so all zeros

    for k, text in enumerate(texts):
        tok_text = tokenizer.tokenize(str(text))

        # Truncate so that the two special tokens always fit, then map to ids
        enc_text = tokenizer.convert_tokens_to_ids(tok_text[:max_len - 2])

        row = [cls_id] + list(enc_text) + [sep_id]
        input_ids[k, :len(row)] = np.asarray(row, dtype='int32')

        # Real tokens (including the special ones) are attended to; padding is not
        attention_mask[k, :len(row)] = 1

    return {
        'input_word_ids': input_ids,
        'input_mask': attention_mask,
        'input_type_ids': token_type_ids
    }

In [None]:
# Turn the raw texts of both splits into model inputs with roberta_encode,
# and make sure both label arrays are int32.
x_train, x_dev = (roberta_encode(split_texts, tokenizer) for split_texts in (x_train, x_dev))

y_train, y_dev = (np.asarray(split_labels, dtype='int32') for split_labels in (y_train, y_dev))

In [None]:
# metric callable handed to model.compile: macro-averaged F1
def f1_macro(y_true, y_pred_func):
    """Return the macro-averaged F1 score.

    ``y_pred_func`` holds per-class scores; the predicted label of each row is
    the argmax along axis 1, which is then compared with ``y_true``.
    """
    predicted_labels = np.argmax(y_pred_func, axis=1).astype(int)
    return f1_score(y_true, predicted_labels, average='macro')

def f1_micro(y_true, y_pred_func):
    """Return the micro-averaged F1 score.

    ``y_pred_func`` holds per-class scores; the predicted label of each row is
    the argmax along axis 1, which is then compared with ``y_true``.
    """
    predicted_labels = np.argmax(y_pred_func, axis=1).astype(int)
    return f1_score(y_true, predicted_labels, average='micro')

In [None]:
# building model with parameter n_categories that represents the number of classes
def build_model(n_categories):
    """Build and compile the ALBERT-based classifier.

    Three int32 inputs of length 512 (token ids, attention mask, token type
    ids) are fed to a pretrained ``albert-base-v2`` sequence-classification
    model. Its first output is passed through dropout, flattened, and followed
    by a 256-unit ReLU layer and a softmax layer with ``n_categories`` units.

    Parameters
    ----------
    n_categories : int
        Number of target classes (size of the final softmax layer).

    Returns
    -------
    tf.keras.Model
        Model compiled with Adam (learning rate 1e-5), sparse categorical
        focal loss (gamma=2) and the metrics f1_macro, f1_micro and accuracy.
    """
    MAX_LEN = 512
    input_word_ids = tf.keras.Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_word_ids')
    input_mask = tf.keras.Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_mask')
    input_type_ids = tf.keras.Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_type_ids')

    # Import the pretrained ALBERT model from HuggingFace
    albert_model = TFAlbertForSequenceClassification.from_pretrained('albert-base-v2')
    x = albert_model(input_word_ids, attention_mask=input_mask, token_type_ids=input_type_ids)

    # Huggingface models return several outputs; keep only the first one.
    # NOTE(review): for a *ForSequenceClassification model the first output is
    # the classification-head logits rather than token embeddings, so the
    # layers below are stacked on top of those logits -- confirm this is the
    # intended architecture before changing it (saved weights depend on it).
    x = x[0]

    x = tf.keras.layers.Dropout(rate=0.1)(x)
    x = tf.keras.layers.Flatten()(x)
    x = tf.keras.layers.Dense(256, activation='relu')(x)
    x = tf.keras.layers.Dense(n_categories, activation='softmax')(x)

    model = tf.keras.Model(inputs=[input_word_ids, input_mask, input_type_ids], outputs=x)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.00001),
        loss=SparseCategoricalFocalLoss(gamma=2),
        metrics=[f1_macro, f1_micro, 'accuracy'])

    return model

In [None]:
# building and providing arch of the model
# model_copy is the instance that the training cell fits; 3 is the number of
# output classes passed to build_model.
model_copy = build_model(3)
model_copy.summary()

In [None]:
# Switch TensorFlow functions to eager execution.
# NOTE(review): presumably required because the f1_macro / f1_micro metrics
# call NumPy and scikit-learn on the batch values -- confirm before removing.
tf.config.run_functions_eagerly(True)

In [None]:
batch_size = 16

# file name pattern for saved models: epoch number and validation macro-F1
checkpoint_filepath = 'checkpoints/model-improvement-roberta-h5-{epoch:02d}-{val_f1_macro:.2f}.h5'

# keep a full-model checkpoint only when validation macro-F1 reaches a new maximum
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    checkpoint_filepath,
    mode='max',
    monitor='val_f1_macro',
    save_weights_only=False,
    save_best_only=True,
)

# train on the encoded training data, validating on the dev split each epoch
model_copy.fit(
    x_train,
    y_train,
    validation_data=(x_dev, y_dev),
    class_weight=class_weights,
    callbacks=[model_checkpoint_callback],
    epochs=20,
    batch_size=batch_size,
    shuffle=True,
    verbose=1,
)
        

# Loading the Best Model and Preparing Submissions for dev and test data

In [None]:
# loading the best model
from keras.models import load_model

# NOTE(review): absolute, machine-specific location of the saved weights --
# point this at the checkpoint produced by the training cell when running
# on another machine.
best_weights_path = '/Users/nitanshjain/Documents/Projects/Sem_Eval/semeval2023task3/codefiles/subtask1/roberta/saved_models/semeval.h5'

# Re-create the architecture, then restore the trained weights into it.
# No custom-object scope is needed here: build_model already constructs the
# transformer layer, and load_weights only fills in weights. (The previous
# scope also referenced TFRobertaForSequenceClassification, which this
# notebook never imports.)
model = build_model(3)
model.load_weights(best_weights_path)
model.summary(print_fn=print)


## Dev Data

In [None]:
# predicting on the dev data using the best model loaded above
y_pred = model.predict(x_dev)

# converting the predicted values to the required format
# Label names in encoded-id order (0, 1, 2). argmax picks the highest-scoring
# class per row; ties go to the lowest class id instead of always falling
# through to 'satire'.
genre_names = ['opinion', 'reporting', 'satire']
y_pred_final = np.argmax(y_pred, axis=1).tolist()
y_pred_sub = [genre_names[class_id] for class_id in y_pred_final]

print(y_pred_final)

# calculating the f1 score and classification report
from sklearn.metrics import f1_score, classification_report
print('Macro f1_score = {}'.format(f1_score(y_dev, y_pred_final, average='macro')))
print('Micro f1_score = {}'.format(f1_score(y_dev, y_pred_final, average='micro')))
print(classification_report(y_dev, y_pred_final))

In [None]:
# getting the id of each article in dev data
y_dev_id = dev_sub1_df.id

# creating a dataframe with the predicted values, indexed by article id.
# The column mapping deliberately avoids the name `dict`: rebinding the
# builtin would break every later dict(...) call in the kernel, e.g. when the
# class-weight cell is re-run.
dev_submission_columns = {'article_id': y_dev_id, 'class': y_pred_sub}
submit_df = pd.DataFrame(dev_submission_columns).set_index('article_id')
print(submit_df.head())

# saving the dataframe to a tab-separated txt file in the required format
submit_df.to_csv('dev_data_25_16.txt', sep='\t')

## Test Data

In [None]:
# loading the test data
# NOTE(review): this is an absolute, machine-specific path, unlike the
# relative 'semeval/...' paths used for the train/dev files -- adjust it
# when running elsewhere.
test_df = pd.read_csv('/Users/nitanshjain/Documents/Projects/Sem_Eval/semeval2023task3/preprocessed_data/subtask1/en_test_subtask_1.csv')
print(test_df.shape)
print(test_df.head())

In [None]:
# encoding the test data
# The text is assembled as article followed by headline, the same order used
# for the training and dev inputs. Inputs are truncated to a fixed token
# length, so the reversed order (headline first) would give the model a
# differently structured input than the one it was trained on.
x_test = test_df.preprocessed_articles + test_df.preprocessed_headlines
x_test = roberta_encode(x_test, tokenizer)

In [None]:
# predicting the test data
# Uses `model` (the instance with the best weights loaded), the same model
# used for the dev predictions, rather than `model_copy`, which holds
# whatever weights the final training epoch left behind.
y_test = model.predict(x_test)

# converting the predicted values to the required format
# Label names in encoded-id order (0, 1, 2). argmax picks the highest-scoring
# class per row; ties go to the lowest class id instead of always falling
# through to 'satire'.
test_genre_names = ['opinion', 'reporting', 'satire']
y_test_final = np.argmax(y_test, axis=1).tolist()
y_test_sub = [test_genre_names[class_id] for class_id in y_test_final]

print(y_test_final)

In [None]:
# article ids of the test set, paired with the predicted class names
y_test_id = test_df.id
dict_test = {'article_id': y_test_id, 'class': y_test_sub}

# submission table indexed by article id
submit_test_df = pd.DataFrame(dict_test).set_index('article_id')

# written as a tab-separated txt file, the format required for submission
submit_test_df.to_csv('2_test_data_25_16.txt', sep='\t')