### Loading Data

This notebook is heavily inspired by the work of Coan et al.: https://www.nature.com/articles/s41598-021-01714-4

In [None]:
import pandas as pd
import numpy as np
#import preprocess
import torch
from sklearn.preprocessing import LabelEncoder
import re
import unicodedata
from sklearn.utils.class_weight import compute_class_weight
from simpletransformers.classification import ClassificationModel
from google.colab import drive
from sklearn.utils.class_weight import compute_class_weight
import sklearn.metrics as m



In [None]:
# Mount Google Drive at /content/gdrive/ (the CSV files loaded below live under it).
# force_remount=True asks Colab to remount even when the drive is already mounted.
drive.mount('/content/gdrive/', force_remount=True)

In [None]:
# Read the three dataset splits from the mounted Drive folder, in the order
# training -> validation -> test, and bind each to its own DataFrame.
training, validation, testing = map(
    pd.read_csv,
    (
        "/content/gdrive/MyDrive/Contrarian_Claims/training.csv",
        "/content/gdrive/MyDrive/Contrarian_Claims/validation.csv",
        "/content/gdrive/MyDrive/Contrarian_Claims/test.csv",
    ),
)

In [None]:
# Preview the first three rows of the training split.
training.head(3)

In [None]:
# Pick the compute device: CUDA when a GPU is visible, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# Show the GPU name as the cell output. torch.cuda.get_device_name(0) raises when
# no CUDA device exists, so report a plain message on CPU-only runtimes instead.
torch.cuda.get_device_name(0) if torch.cuda.is_available() else "No CUDA device available - running on CPU"

### Pre-processing

In [None]:
# Define text pre-processing functions as the original authors do it.
def remove_between_square_brackets(text):
    """Delete every ``[...]`` span, brackets included, from ``text``.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with each opening bracket, everything up to the next closing
        bracket, and that closing bracket removed. An opening bracket that is
        never closed is left untouched.
    """
    # Raw string: in a normal literal '\[' is an invalid escape sequence, which
    # newer Python versions warn about and will eventually reject.
    return re.sub(r'\[[^]]*\]', '', text)
def remove_non_ascii(text):
    """Return ``text`` with every non-ASCII character dropped.

    The string is NFKD-normalised first, so characters that decompose into an
    ASCII base plus combining marks (for example an accented letter) keep their
    ASCII base; characters with no ASCII decomposition disappear entirely.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_only = decomposed.encode('ascii', 'ignore')
    return ascii_only.decode('utf-8', 'ignore')
def strip_underscores(text):
    """Replace each run of one or more underscores in ``text`` with one space."""
    underscore_run = re.compile(r'_+')
    return underscore_run.sub(' ', text)
def remove_multiple_spaces(text):
    """Collapse each run of two or more whitespace characters into one space.

    The pattern matches any whitespace (tabs and newlines included), not only
    the space character. A whitespace character standing alone is kept as is.
    """
    return re.sub(pattern=r'\s{2,}', repl=' ', string=text)

# Merge text pre-processing functions
def denoise_text(text):
    """Run the complete cleaning pipeline on a single string.

    The steps are applied in a fixed order: bracketed spans are removed,
    non-ASCII characters are dropped, underscores become spaces and whitespace
    runs are collapsed. Leading and trailing whitespace is stripped last.
    """
    pipeline = (
        remove_between_square_brackets,
        remove_non_ascii,
        strip_underscores,
        remove_multiple_spaces,
    )
    cleaned = text
    for step in pipeline:
        cleaned = step(cleaned)
    return cleaned.strip()

In [None]:
# Pre-process the text of every split with the same cleaning pipeline.
training['text'] = training['text'].astype(str).apply(denoise_text)
validation['text'] = validation['text'].astype(str).apply(denoise_text)
testing['text'] = testing['text'].astype(str).apply(denoise_text)

# Load the label encoder
label_encoder = LabelEncoder()

# Encode the labels.
# The encoder is fitted on the training split ONLY and then re-used for the other
# splits. Calling fit_transform on each split refits the encoder, so a split that
# lacks a class would get different integer ids for the same claim and the
# evaluation would compare mismatched labels. transform() raises ValueError if a
# split contains a claim that never occurs in the training data.
training['labels'] = label_encoder.fit_transform(training["claim"])
validation['labels'] = label_encoder.transform(validation["claim"])
testing['labels'] = label_encoder.transform(testing["claim"])


In [None]:
# Calculate weights
# compute_class_weight returns one weight per entry of `classes`, in that order.
# Series.unique() lists labels in order of first appearance, which would leave
# weights[i] belonging to an arbitrary class. np.unique returns the encoded labels
# sorted ascending, so weights[i] is the weight of class index i, which is what a
# per-class loss weight vector needs.
weights = compute_class_weight('balanced', classes=np.unique(training.labels), y=training.labels)
weights = [*weights]


### RoBERTa

In [None]:
# Train model

# Fine-tune roberta-large with class-weighted loss on the training split.
# num_labels is derived from the weight vector (one weight per class) so the
# classifier head and the loss weights cannot disagree.
model = ClassificationModel('roberta', 'roberta-large',
                            num_labels = len(weights), weight = weights,
                            # Use the GPU when one is available instead of failing on CPU-only runtimes.
                            use_cuda = torch.cuda.is_available(),
                            args={'reprocess_input_data': True,
                                  'overwrite_output_dir': False,
                                  # Model-specific directories: with overwrite_output_dir=False,
                                  # train_model refuses to write into a non-empty directory, so
                                  # models trained in the same session must not share one.
                                  'output_dir': 'models/roberta/',
                                  'best_model_dir': 'models/roberta/best_model/',
                                  # Hyperparameters
                                  'train_batch_size': 6,
                                  'num_train_epochs': 3,
                                  'learning_rate': 1e-5,
                                  # Text processing
                                  'max_seq_length': 256,
                                  'sliding_window': True,
                                  'stride': 0.6,
                                  'do_lower_case': False,
                                  # Saving
                                  'save_model_every_epoch': True,
                                  'save_eval_checkpoints': True,
                                  'weight_decay': 0
                                  })
model.train_model(training)

In [None]:
# Define additional model performance scores (F1)
def f1_multiclass_macro(labels, preds):
    """Macro-averaged F1: the unweighted mean of the per-class F1 scores."""
    return m.f1_score(y_true=labels, y_pred=preds, average='macro')
def f1_multiclass_micro(labels, preds):
    """Micro-averaged F1, computed from counts pooled over all classes."""
    return m.f1_score(y_true=labels, y_pred=preds, average='micro')
def f1_multiclass_weighted(labels, preds):
    """Per-class F1 averaged with each class weighted by its support."""
    return m.f1_score(y_true=labels, y_pred=preds, average='weighted')
def f1_class(labels, preds):
    """Un-averaged F1: one score per class instead of a single number."""
    return m.f1_score(y_true=labels, y_pred=preds, average=None)
def precision(labels, preds):
    """Macro-averaged precision: the unweighted mean over classes."""
    return m.precision_score(y_true=labels, y_pred=preds, average='macro')
def recall(labels, preds):
    """Macro-averaged recall: the unweighted mean over classes."""
    return m.recall_score(y_true=labels, y_pred=preds, average='macro')


# Evaluate the model on the validation split. Each entry of the mapping is passed
# to eval_model as a keyword argument: metric name -> metric function.
result, model_outputs, wrong_predictions = model.eval_model(
    validation,
    **{
        'f1_macro': f1_multiclass_macro,
        'precision': precision,
        'recall': recall,
        'f1_micro': f1_multiclass_micro,
        'f1_weighted': f1_multiclass_weighted,
        'f1_class': f1_class,
    }
)

In [None]:
# Show the validation metrics returned by eval_model for the RoBERTa model.
print(result)

### XLNet

In [None]:
# Fine-tune xlnet-large-cased with class-weighted loss on the training split.
# num_labels is derived from the weight vector (one weight per class) so the
# classifier head and the loss weights cannot disagree.
# NOTE(review): the original hard-coded use_cuda=False, which forces this large
# model to train on the CPU even when a GPU is present; it now follows GPU
# availability. Set it back to False if CPU training was intentional.
xlnet = ClassificationModel("xlnet", "xlnet-large-cased", num_labels=len(weights), weight=weights,
                            use_cuda=torch.cuda.is_available(),
                            args={'reprocess_input_data': True,
                                  'overwrite_output_dir': True,
                                  # Relative, model-specific directories instead of absolute
                                  # paths at the filesystem root (which need root write access).
                                  'output_dir': 'models/xlnet/',
                                  'best_model_dir': 'models/xlnet/best_model/',

                                  # Hyperparameters
                                  'train_batch_size': 6,
                                  'num_train_epochs': 3,
                                  'learning_rate': 1e-5,

                                  # Text processing
                                  'max_seq_length': 256,
                                  'sliding_window': True,
                                  'stride': 0.6,
                                  'do_lower_case': False,

                                  # Saving
                                  'save_model_every_epoch': True,
                                  'save_eval_checkpoints': False,
                                  'weight_decay': 0,
                                  "save_steps" : 8000
                                  })

xlnet.train_model(training)

In [None]:
# Evaluate the XLNet model on the validation split. Each entry of the mapping is
# passed to eval_model as a keyword argument: metric name -> metric function.
result, model_outputs, wrong_predictions = xlnet.eval_model(
    validation,
    **{
        'f1_macro': f1_multiclass_macro,
        'precision': precision,
        'recall': recall,
        'f1_micro': f1_multiclass_micro,
        'f1_weighted': f1_multiclass_weighted,
        'f1_class': f1_class,
    }
)


In [None]:
# Show the validation metrics returned by eval_model for the XLNet model.
print(result)

### BERT

In [None]:
# Train model

# Fine-tune bert-large-cased with class-weighted loss on the training split.
# num_labels is derived from the weight vector (one weight per class) so the
# classifier head and the loss weights cannot disagree.
model = ClassificationModel('bert', 'bert-large-cased',
                            num_labels = len(weights), weight = weights,
                            # Use the GPU when one is available instead of failing on CPU-only runtimes.
                            use_cuda = torch.cuda.is_available(),
                            args={'reprocess_input_data': True,
                                  'overwrite_output_dir': False,
                                  # Model-specific directories: with overwrite_output_dir=False,
                                  # train_model refuses to write into a non-empty directory, so
                                  # models trained in the same session must not share one.
                                  'output_dir': 'models/bert/',
                                  'best_model_dir': 'models/bert/best_model/',
                                  # Hyperparameters
                                  'train_batch_size': 6,
                                  'num_train_epochs': 3,
                                  'learning_rate': 1e-5,
                                  # Text processing
                                  'max_seq_length': 256,
                                  'sliding_window': True,
                                  'stride': 0.6,
                                  'do_lower_case': False,
                                  # Saving
                                  'save_model_every_epoch': True,
                                  'save_eval_checkpoints': True,
                                  'weight_decay': 0
                                  })
model.train_model(training)

In [None]:
# Evaluate the model on the validation split. Each entry of the mapping is passed
# to eval_model as a keyword argument: metric name -> metric function.
result, model_outputs, wrong_predictions = model.eval_model(
    validation,
    **{
        'f1_macro': f1_multiclass_macro,
        'precision': precision,
        'recall': recall,
        'f1_micro': f1_multiclass_micro,
        'f1_weighted': f1_multiclass_weighted,
        'f1_class': f1_class,
    }
)


In [None]:
# Show the validation metrics returned by eval_model for the BERT model.
print(result)

### DistilBERT (more epochs)

In [None]:
# Train model

# Fine-tune distilbert-base-cased with class-weighted loss on the training split,
# using a larger batch and more epochs than the other models in this notebook.
# num_labels is derived from the weight vector (one weight per class) so the
# classifier head and the loss weights cannot disagree.
model = ClassificationModel('distilbert', 'distilbert-base-cased',
                            num_labels = len(weights), weight = weights,
                            # Use the GPU when one is available instead of failing on CPU-only runtimes.
                            use_cuda = torch.cuda.is_available(),
                            args={'reprocess_input_data': True,
                                  'overwrite_output_dir': False,
                                  # Model-specific directories: with overwrite_output_dir=False,
                                  # train_model refuses to write into a non-empty directory, so
                                  # models trained in the same session must not share one.
                                  'output_dir': 'models/distilbert/',
                                  'best_model_dir': 'models/distilbert/best_model/',
                                  # Hyperparameters
                                  'train_batch_size': 10,
                                  'num_train_epochs': 5,
                                  'learning_rate': 1e-5,
                                  # Text processing
                                  'max_seq_length': 256,
                                  'sliding_window': True,
                                  'stride': 0.6,
                                  'do_lower_case': False,
                                  # Saving
                                  'save_model_every_epoch': True,
                                  'save_eval_checkpoints': True,
                                  'weight_decay': 0
                                  })
model.train_model(training)

In [None]:
# Evaluate the model on the validation split. Each entry of the mapping is passed
# to eval_model as a keyword argument: metric name -> metric function.
result, model_outputs, wrong_predictions = model.eval_model(
    validation,
    **{
        'f1_macro': f1_multiclass_macro,
        'precision': precision,
        'recall': recall,
        'f1_micro': f1_multiclass_micro,
        'f1_weighted': f1_multiclass_weighted,
        'f1_class': f1_class,
    }
)


In [None]:
# Show the validation metrics returned by eval_model for the DistilBERT model.
print(result)