# Importing Libraries

In [1]:
# !pip install tensorflow_addons
# !pip install nlpaug
# !pip install focal_loss
# !pip install transformers
# !pip install torch 
# !pip install nltk

Collecting torch
  Downloading torch-1.13.1-cp310-none-macosx_10_9_x86_64.whl (135.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.3/135.3 MB[0m [31m352.6 kB/s[0m eta [36m0:00:00[0m00:01[0m00:10[0m
Installing collected packages: torch
Successfully installed torch-1.13.1


In [4]:
import pandas as pd
import numpy as np
import nltk
import tensorflow as tf
import tensorflow_addons as tfa
import keras
import torch

import nlpaug
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as nafc
from nlpaug.util import Action


from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, f1_score

seed = 2000
np.random.seed(seed)
from tensorflow.keras import regularizers
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import BatchNormalization, Dropout, Flatten, Dense, Embedding
from tensorflow.keras.preprocessing import sequence
from keras.wrappers.scikit_learn import KerasClassifier

from focal_loss import SparseCategoricalFocalLoss
from transformers import RobertaTokenizer, TFRobertaForSequenceClassification, AdamW

import warnings
warnings.filterwarnings("ignore")

  from .autonotebook import tqdm as notebook_tqdm


# Reading and Handling the Data

In [6]:
# reading the only english train data
train_sub1_df = pd.read_csv('semeval/en_train_subtask_1.csv')
print(train_sub1_df.shape)
train_sub1_df.head()
print(train_sub1_df.genre.value_counts())

# reading data from all the languages including english post translation
final_train_sub1_df = pd.read_csv('semeval/final_train_subtask_1.csv')
print(final_train_sub1_df.shape)
final_train_sub1_df.head()
print(final_train_sub1_df.genre.value_counts())

# reading the only english dev data
dev_sub1_df = pd.read_csv('semeval/en_dev_subtask_1.csv')
print(dev_sub1_df.shape)
dev_sub1_df.head()
print(dev_sub1_df.genre.value_counts())

# test_sub1_df = pd.read_csv('/Users/nitanshjain/Documents/Projects/Sem_Eval/semeval2023task3/preprocessed_data/extra_test.csv')
# print(test_sub1_df.shape)

(433, 10)
opinion      382
reporting     41
satire        10
Name: genre, dtype: int64
(1437, 11)
opinion      1073
reporting     271
satire         93
Name: genre, dtype: int64
(83, 10)
reporting    54
opinion      20
satire        9
Name: genre, dtype: int64


In [7]:
# Label Encoding the genre columns in all three dataframes
le = LabelEncoder()
train_sub1_df['genre'] = le.fit_transform(train_sub1_df['genre'])
print(train_sub1_df.genre.value_counts())

le = LabelEncoder()
final_train_sub1_df['genre'] = le.fit_transform(final_train_sub1_df['genre'])
print(final_train_sub1_df.genre.value_counts())

le = LabelEncoder()
dev_sub1_df['genre'] = le.fit_transform(dev_sub1_df['genre'])
print(dev_sub1_df.genre.value_counts())
dev_sub1_df.head()

0    382
1     41
2     10
Name: genre, dtype: int64
0    1073
1     271
2      93
Name: genre, dtype: int64
1    54
0    20
2     9
Name: genre, dtype: int64


Unnamed: 0,id,genre,headlines,articles,preprocessed_headlines,pos_tags_headlines,er_tags_headlines,preprocessed_articles,pos_tags_articles,er_tags_articles
0,820791520,1,George III Lost America.\n,Theresa May Could Lose the United Kingdom Over...,george iii lose america,"[(george, 'NNP'), (iii, 'NNP'), (lose, 'VB'), ...","[(george iii, 'PERSON', 380), (america, 'GPE',...",theresa may could lose the united kingdom over...,"[(theresa, 'NN'), (may, 'MD'), (could, 'MD'), ...","[(the united kingdom, 'GPE', 384), (the europe..."
1,821040551,1,Queen Elizabeth Would Be Evacuated in Event of...,If Britain leaves the European Union without a...,queen elizabeth would be evacuate in event of ...,"[(queen, 'NNP'), (elizabeth, 'NNP'), (would, '...","[(elizabeth, 'PERSON', 380), (brexit riot repo...",if britain leave the european union without tr...,"[(if, 'IN'), (britain, 'NNP'), (leave, 'VBP'),...","[(britain, 'GPE', 384), (the european union, '..."
2,813552066,1,"You insult us, ambassador: Woody Johnson flagr...",With three months until Britain leaves the Eur...,you insult us ambassador woody johnson flagran...,"[(you, 'PRP'), (insult, 'VBP'), (us, 'NNP'), (...","[(woody johnson, 'PERSON', 380), (peter, 'PERS...",with three month until britain leave the europ...,"[(with, 'IN'), (three, 'CD'), (month, 'NN'), (...","[(three month, 'DATE', 391), (britain, 'GPE', ..."
3,817176202,1,"The British People, as Well as the Politicians...",The British Parliament just handed Prime Minis...,the british people as well as the politician d...,"[(the, 'DT'), (british, 'JJ'), (people, 'NNS')...","[(british, 'NORP', 381)]",the british parliament just hand prime ministe...,"[(the, 'DT'), (british, 'JJ'), (parliament, 'N...","[(british, 'NORP', 381), (british, 'NORP', 381..."
4,820419869,1,No break from Brexit: RT takes a look at lates...,As British MPs are told that their February br...,no break from brexit rt take look at late deve...,"[(no, 'DT'), (break, 'NN'), (from, 'IN'), (bre...","[(brexit rt, 'ORG', 383)]",as british mp be tell that their february brea...,"[(as, 'IN'), (british, 'NNP'), (mp, 'NNP'), (b...","[(british, 'NORP', 381), (february, 'DATE', 39..."


In [None]:
# Performing Undersampling on final_train_sub1_df
n0 = 864
n1 = 68
final_train_sub1_df = final_train_sub1_df.drop(final_train_sub1_df[final_train_sub1_df['genre'].eq(0)].sample(n0).index)
final_train_sub1_df = final_train_sub1_df.drop(final_train_sub1_df[final_train_sub1_df['genre'].eq(1)].sample(n1).index)
print(final_train_sub1_df.genre.value_counts())
final_train_sub1_df.shape

0    209
1    203
2     93
Name: genre, dtype: int64


(505, 11)

In [None]:
# # 
# new_train_df = pd.DataFrame(final_train_sub1_df.loc[final_train_sub1_df['genre']==0])
# new_train_df = new_train_df.append(final_train_sub1_df.loc[final_train_sub1_df['genre']==1])
# new_train_df = new_train_df.append(final_train_sub1_df.loc[final_train_sub1_df['genre']==2])
# new_train_df.genre.value_counts()

0    209
1    203
2     93
Name: genre, dtype: int64

# Augmentation of Data

In [None]:
# augmentation of textual data
aug = naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="substitute", aug_max=3)

In [None]:
# creating x train, y train and x dev, y dev
x_train = final_train_sub1_df.preprocessed_articles + final_train_sub1_df.preprocessed_headlines
y_train = final_train_sub1_df[['genre']]

x_dev = dev_sub1_df.preprocessed_articles + dev_sub1_df.preprocessed_headlines
y_dev = dev_sub1_df[['genre']]

In [None]:
# creating augmented sentences and labels using aug object

augmented_sentences=[]
augmented_sentences_labels=[]
count = 0
for i in x_train.index:
    count+=1
    if count%10==0:
        print("Running count = ", count)
    if y_train.genre[i]==0:
        try:
            temps=aug.augment(x_train[i], n=2)
            for sent in temps:
                augmented_sentences.append(sent)        # appending the augmented sentences to list
                augmented_sentences_labels.append(0)    # appending the label '0' to list
        except:
            continue
    if y_train.genre[i]==1:
        try:
            temps=aug.augment(x_train[i], n=2)
            for sent in temps:
                augmented_sentences.append(sent)        # appending the augmented sentences to list
                augmented_sentences_labels.append(1)    # appending the label '1' to list
        except:
            continue
    if y_train.genre[i]==2:
        try:
            temps=aug.augment(x_train[i], n=2)
            for sent in temps:
                augmented_sentences.append(sent)        # appending the augmented sentences to list
                augmented_sentences_labels.append(2)    # appending the label '2' to list
        except:
            continue

Running count =  10
Running count =  20
Running count =  30
Running count =  40
Running count =  50
Running count =  60
Running count =  70
Running count =  80
Running count =  90
Running count =  100
Running count =  110
Running count =  120
Running count =  130
Running count =  140
Running count =  150
Running count =  160
Running count =  170
Running count =  180
Running count =  190
Running count =  200
Running count =  210
Running count =  220
Running count =  230
Running count =  240
Running count =  250
Running count =  260
Running count =  270
Running count =  280
Running count =  290
Running count =  300
Running count =  310
Running count =  320
Running count =  330
Running count =  340
Running count =  350
Running count =  360
Running count =  370
Running count =  380
Running count =  390
Running count =  400
Running count =  410
Running count =  420
Running count =  430
Running count =  440
Running count =  450
Running count =  460
Running count =  470
Running count =  480
R

In [None]:
# creating augmented dataframes
x_train_aug = pd.Series()
y_train_aug = pd.Series()
x_train_aug = x_train_aug.append(pd.Series(augmented_sentences), ignore_index=True)
y_train_aug = y_train_aug.append(pd.Series(augmented_sentences_labels), ignore_index=False)

In [None]:
# saving the augmented dataframes to csv
aug_df=pd.concat([x_train_aug,y_train_aug],axis=1)
aug_df.to_csv('aug.csv')

# Implementing the Roberta Model

In [None]:
# reading the augmented dataframes
aug_df = pd.read_csv('semeval/aug.csv')
x_train_aug = aug_df['0']
y_train_aug = aug_df['1']

In [None]:
# Calculating Classweights
from sklearn.utils.class_weight import compute_class_weight
class_weights = compute_class_weight(
    class_weight = "balanced",
    classes = np.unique(y_train_aug),
    y = y_train_aug
)

class_weights = dict(zip(np.unique(y_train_aug), class_weights))
class_weights

In [None]:
# reshaping the train and dev values
x_train = x_train_aug.to_numpy().reshape(-1)
y_train = y_train_aug.to_numpy().reshape(-1,1)
x_dev = x_dev.to_numpy().reshape(-1)
y_dev = y_dev.to_numpy().reshape(-1,1)

# creating the tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', num_labels=3, output_attentions=True)

In [None]:
# encoding the train and dev values using roberta
def roberta_encode(texts, tokenizer):
    MAX_LEN = 512
    
    ct = len(texts)
    
    input_ids = np.ones((ct, MAX_LEN), dtype='int32')       
    attention_mask = np.zeros((ct, MAX_LEN), dtype='int32')
    token_type_ids = np.zeros((ct, MAX_LEN), dtype='int32') # Not used in text classification

    for k, text in enumerate(texts):
        # Tokenize
        tok_text = tokenizer.tokenize(str(text))
        
        # Truncate and convert tokens to numerical IDs
        enc_text = tokenizer.convert_tokens_to_ids(tok_text[:(MAX_LEN-2)])
        
        input_length = len(enc_text) + 2
        input_length = input_length if input_length < MAX_LEN else MAX_LEN
        
        # Add tokens [CLS] and [SEP] at the beginning and the end
        input_ids[k,:input_length] = np.asarray([0] + enc_text + [2], dtype='int32')
        
        # Set to 1s in the attention input
        attention_mask[k,:input_length] = 1

    return {
        'input_word_ids': input_ids,
        'input_mask': attention_mask,
        'input_type_ids': token_type_ids
    }

In [None]:
# encoding the train and dev values using above function
x_train = roberta_encode(x_train, tokenizer)
x_dev = roberta_encode(x_dev, tokenizer)

y_train = np.asarray(y_train, dtype='int32')
y_dev = np.asarray(y_dev, dtype='int32')

In [None]:
# function to use f1_macro as a metric while compiling neural model
def f1_macro(y_true, y_pred_func):
    y_pred_func = np.argmax(y_pred_func, axis=1).astype(int)
    macro_f1 = f1_score(y_true, y_pred_func, average='macro')
    return macro_f1

In [None]:
# building model with parameter n_categories that represents the number of classes
def build_model(n_categories):
    MAX_LEN = 512
    input_word_ids = tf.keras.Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_word_ids')
    input_mask = tf.keras.Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_mask')
    input_type_ids = tf.keras.Input(shape=(MAX_LEN,), dtype=tf.int32, name='input_type_ids')
    
    

    # Import RoBERTa model from HuggingFace
    roberta_model = TFRobertaForSequenceClassification.from_pretrained('roberta-base')
    # custom_objects = {"TFRobertaForSequenceClassification": TFRobertaForSequenceClassification}
    # config = roberta_model_copy.get_config()
    # with tf.keras.utils.custom_object_scope(custom_objects):
    #     roberta_model = TFRobertaForSequenceClassification.from_config(config)
    x = roberta_model(input_word_ids, attention_mask=input_mask, token_type_ids=input_type_ids)

    # Huggingface transformers have multiple outputs, 
    # embeddings are the first one,
    # so let's slice out the first position
    x = x[0]

    x = tf.keras.layers.Dropout(rate=0.1)(x)
    x = tf.keras.layers.Flatten()(x)
    x = tf.keras.layers.Dense(256, activation='relu')(x)
    x = tf.keras.layers.Dense(n_categories, activation='softmax')(x)

    model = tf.keras.Model(inputs=[input_word_ids, input_mask, input_type_ids], outputs=x)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.00001),
        loss=SparseCategoricalFocalLoss(gamma=2),
        metrics=[f1_macro])
    
    

    return model

In [None]:
# building and providing arch of the model
model_copy = build_model(3)
model_copy.summary()

2023-01-31 11:34:16.321252: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_word_ids (InputLayer)    [(None, 512)]        0           []                               
                                                                                                  
 input_mask (InputLayer)        [(None, 512)]        0           []                               
                                                                                                  
 input_type_ids (InputLayer)    [(None, 512)]        0           []                               
                                                                                                  
 tf_roberta_for_sequence_classi  TFSequenceClassifie  124647170  ['input_word_ids[0][0]',         
 fication (TFRobertaForSequence  rOutput(loss=None,               'input_mask[0][0]',         

In [None]:
tf.config.run_functions_eagerly(True)

In [None]:
batch_size = 16

# to save the best model
checkpoint_filepath = 'checkpoints/model-improvement-roberta-h5-{epoch:02d}-{val_f1_macro:.2f}.h5'

# parameters based on which model is being saved
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor='val_f1_macro', 
    save_best_only=True,
    save_weights_only=False,
    mode='max'
    )

# fitting the model to training data
model_copy.fit(x=x_train,
                y=y_train,
                batch_size=batch_size,
                epochs=10,
                callbacks=[model_checkpoint_callback],
                validation_data=(x_dev, y_dev),
                shuffle=True,
                verbose=1, class_weight=class_weights
                )
        

Epoch 1/10
 3/63 [>.............................] - ETA: 2:30:43 - loss: 0.4684 - accuracy: 0.4375

KeyboardInterrupt: 

# Loading the Best Model and Preparing Submissions for dev and test data

In [None]:
# loading the best model
from keras.models import load_model
model = build_model(3)

# used as because roberta is a custom object, if not mentioned code throws an error
with tf.keras.utils.CustomObjectScope({'TFRobertaForSequenceClassification': TFRobertaForSequenceClassification.from_pretrained('roberta-base')}):
    model.load_weights('/Users/nitanshjain/Documents/Projects/Sem_Eval/semeval2023task3/codefiles/subtask1/roberta/saved_models/semeval.h5')  
model.summary(print_fn=print)


All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_word_ids (InputLayer)    [(None, 512)]        0           []                               
                                                                                                  
 input_mask (InputLayer)        [(None, 512)]        0           []                               
                                                                                                  
 input_type_ids (InputLayer)    [(None, 512)]        0           []                               
                                                                                                  
 tf_roberta_for_sequence_classi  TFSequenceClassifie  124647170  ['input_word_ids[0][0]',         
 fication_4 (TFRobertaForSequen  rOutput(loss=None,               'input_mask[0][0]',       

## Dev Data

In [None]:
# prediciting model based on dev data using the best model acheived before
y_pred = model.predict(x_dev)

# converting the predicted values to the required format
y_pred_final = list()
y_pred_sub = list()
for val in y_pred:
    if val[0]>val[1] and val[0]>val[2]:
        y_pred_final.append(0)
        y_pred_sub.append('opinion')
    elif val[1]>val[0] and val[1]>val[2]:
        y_pred_final.append(1)
        y_pred_sub.append('reporting')
    else:
        y_pred_final.append(2)
        y_pred_sub.append('satire')

print(y_pred_final)

# calculating the f1 score and classification report
from sklearn.metrics import f1_score, classification_report
print('Macro f1_score = {}'.format(f1_score(y_dev, y_pred_final, average='macro')))
print('Micro f1_score = {}'.format(f1_score(y_dev, y_pred_final, average='micro')))
print(classification_report(y_dev, y_pred_final))

[[0.6076755  0.09752647 0.29479805]
 [0.07354168 0.4099992  0.5164591 ]
 [0.29761648 0.13507716 0.56730634]
 [0.24889196 0.12301996 0.6280881 ]
 [0.11061794 0.5260354  0.3633466 ]
 [0.58988655 0.09426451 0.31584892]
 [0.07691354 0.48831424 0.43477225]
 [0.24900638 0.10243633 0.64855725]
 [0.08745711 0.4918792  0.4206638 ]
 [0.15074833 0.65005827 0.19919336]
 [0.16665484 0.6515295  0.18181562]
 [0.08090598 0.48015743 0.4389366 ]
 [0.61772084 0.10087476 0.28140438]
 [0.18013841 0.61986285 0.19999874]
 [0.65231043 0.17238237 0.17530717]
 [0.5682245  0.28098202 0.15079357]
 [0.14330181 0.64278287 0.21391536]
 [0.07587258 0.4786329  0.4454946 ]
 [0.15035337 0.6446922  0.20495437]
 [0.1801468  0.64452446 0.17532872]
 [0.13460378 0.63756865 0.22782758]
 [0.21477118 0.6218781  0.16335076]
 [0.558794   0.2825773  0.15862872]
 [0.64987504 0.11430971 0.23581523]
 [0.12162676 0.60454935 0.27382395]
 [0.13389395 0.6408172  0.22528872]
 [0.10773923 0.57667625 0.3155845 ]
 [0.45118672 0.40414113 0.14

In [None]:
# getting the id of each article in dev data
y_dev_id = dev_sub1_df.id

# creating a dataframe with the predicted values using dictionary
dict = {'article_id': y_dev_id, 'class': y_pred_sub} 
submit_df = pd.DataFrame(dict) 
submit_df.set_index('article_id', inplace=True)
print(submit_df.head())

# saving the dataframe to a txt file in the required format
submit_df.to_csv('dev_data_25_16.txt', sep='\t')

Unnamed: 0_level_0,class
article_id,Unnamed: 1_level_1
820791520,reporting
821040551,reporting
813552066,reporting
817176202,reporting
820419869,reporting


## Test Data

In [None]:
# loading the test data
test_df = pd.read_csv('/Users/nitanshjain/Documents/Projects/Sem_Eval/semeval2023task3/preprocessed_data/subtask1/en_test_subtask_1.csv')
print(test_df.shape)
print(test_df.head())

In [None]:
# encoding the test data
x_test = test_df.preprocessed_headlines + test_df.preprocessed_articles
x_test = roberta_encode(x_test, tokenizer)

In [None]:
# predicting the test data
y_test = model_copy.predict(x_test)

# converting the predicted values to the required format
y_test_final = list()
y_test_sub = list()
for val in y_test:
    if val[0]>val[1] and val[0]>val[2]:
        y_test_final.append(0)
        y_test_sub.append('opinion')
    elif val[1]>val[0] and val[1]>val[2]:
        y_test_final.append(1)
        y_test_sub.append('reporting')
    else:
        y_test_final.append(2)
        y_test_sub.append('satire')

print(y_test_final) 

In [None]:
# getting the id of each article in test data
y_test_id = test_df.id

# creating a dataframe with the predicted values using dictionary
dict_test = {'article_id': y_test_id, 'class': y_test_sub} 
submit_test_df = pd.DataFrame(dict_test) 
submit_test_df.set_index('article_id', inplace=True)
submit_test_df.head()

# saving the dataframe to a txt file in the required format
submit_test_df.to_csv('2_test_data_25_16.txt', sep='\t')