In [None]:
!pip install simpletransformers transformers==4.40.2

In [None]:

# Load the required packages

# Dataframes
import pandas as pd, numpy as np

# Regular expressions
import re

# Unidecoder
import unicodedata

# Timestamp / time measurement
import time

# for train/test data preparation
from sklearn.model_selection import train_test_split

# Label encode
from sklearn.preprocessing import LabelEncoder

# Class weights
from sklearn.utils.class_weight import compute_class_weight

# Model performance scores
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

# PyTorch: enable GPU access
import torch

# Simpletransformers classifier
from simpletransformers.classification import ClassificationModel, ClassificationArgs


## Load data

In [None]:
# Read the training data and cast the final_climate column to integers in one step
dat = pd.read_csv('training_data.csv').astype({'final_climate': int})

# Cell output: sum of the integer final_climate column
dat['final_climate'].sum()

In [None]:
# Use qs_id as the index while keeping it as a regular column (drop=False);
# verify_integrity makes pandas raise if any qs_id value is duplicated.
dat = dat.set_index("qs_id", drop=False, verify_integrity=True)

In [None]:
# Encode the final_broad categories as integer codes
broad_categories = dat["final_broad"].astype("category")
dat["label"] = broad_categories.cat.codes

# Cell output: number of rows per integer code
dat["label"].value_counts()

In [None]:
# Assign one stratum id to every distinct (language, label) combination,
# following https://stackoverflow.com/a/62918682
strata_index = dat.set_index(['language', 'label']).index
strata_codes = strata_index.factorize()[0]
dat["strata_"] = strata_codes

## Set up GPU

In [None]:
# To pin a specific GPU, uncomment and set it here:
# gpu = 0
# torch.cuda.set_device(gpu)

if not torch.cuda.is_available():
    # No CUDA device visible: fall back to the CPU
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # Point PyTorch at the GPU, then report how many exist and which one is active
    device = torch.device("cuda")

    gpu_count = torch.cuda.device_count()
    print('There are %d GPU(s) available.' % gpu_count)

    active_gpu = torch.cuda.current_device()
    print('We will use GPU {}:'.format(active_gpu), torch.cuda.get_device_name(active_gpu))

## Set up for cross validation

In [None]:
# Hold out a random 20% validation set (frac=.2); seeded so that
# Restart & Run All reproduces the same split.
val_df = dat.sample(frac=.2, random_state=4)

# Everything not sampled above forms the training pool. The rows are removed by
# index *label* (qs_id): the labels are not row positions, so they must not be
# fed to .iloc.
train_data = dat.drop(index=val_df.index)

# Sanity check: the two parts partition dat without overlap
assert len(val_df) + len(train_data) == len(dat)

In [None]:
# Model identifiers handed to simpletransformers
model_type, model_name = "xlmroberta", "xlm-roberta-base"

# Argument values based on the optimisation results, grouped by purpose
arg_values = {
    # run control
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    "evaluate_during_training": False,
    "manual_seed": 4,
    "use_multiprocessing": True,
    # training hyper-parameters
    "train_batch_size": 8,
    "eval_batch_size": 8,
    "num_train_epochs": 2,
    "learning_rate": 1e-5,
    "max_seq_length": 256,
    "sliding_window": True,
    "stride": 0.6,
    "weight_decay": 0,
    # labels and checkpointing (nothing is written to disk)
    "labels_list": [0, 1],
    "no_save": True,
    "save_model_every_epoch": False,
    "save_optimizer_and_scheduler": False,
}

# Start from the library defaults and override the values listed above
model_args = ClassificationArgs()
for arg_name, arg_value in arg_values.items():
    setattr(model_args, arg_name, arg_value)

In [None]:
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import classification_report

# Cross-validation settings
seed=4
n=5  # number of folds
kf = StratifiedKFold(n_splits=n, random_state=seed, shuffle=True)

# Fit the label encoder once on all labels so that every train and test fold
# shares the same label -> integer mapping.
label_encoder = LabelEncoder()
label_encoder.fit(dat['clim_pol_rel'])

results_dict = dict()  # fold number -> metrics dict returned by eval_model
preds_dict = dict()    # fold number -> test rows of dat plus text / labels / preds

# Folds are stratified on the strata_ column of dat
for fold, (train_index, test_index) in enumerate(kf.split(dat.index, dat.strata_)):

    train_rows = dat.iloc[train_index]
    test_rows = dat.iloc[test_index]

    # Two-column frames (text, labels) with integer-encoded labels
    train_df = pd.DataFrame({
        'text': train_rows['original_text'].values,
        'labels': label_encoder.transform(train_rows['clim_pol_rel']),
    })
    test_df = pd.DataFrame({
        'text': test_rows['original_text'].values,
        'labels': label_encoder.transform(test_rows['clim_pol_rel']),
    })

    # Balanced class weights computed on the training fold only.
    # `classes` is passed as an ndarray, which current scikit-learn requires.
    weights = compute_class_weight(class_weight='balanced', classes=np.array([0, 1]), y=train_df['labels'])
    weights = weights.tolist()

    print(train_df.shape, test_df.shape)

    # use_cuda follows GPU availability so the notebook also runs on CPU-only machines
    model = ClassificationModel(model_type, model_name, weight=weights, num_labels=2,
                                args=model_args, use_cuda=torch.cuda.is_available())
    model.train_model(train_df)

    # Extra keyword arguments are metric functions; each result is stored under its keyword
    result, model_outputs, wrong_predictions = model.eval_model(test_df,
                                                                f1_score=f1_score,
                                                                acc=accuracy_score,
                                                                recall=recall_score,
                                                                precision=precision_score)

    print(f'Fold: {fold}')
    print(result)
    results_dict[fold] = result

    preds, output = model.predict(test_df['text'].tolist())
    print(classification_report(test_df['labels'], preds))

    # Attach predictions to the fold's own rows by position. Merging on the text
    # would duplicate rows, or pull in training rows, whenever a text occurs more than once.
    test_df['preds'] = preds
    preds_dict[fold] = pd.concat([test_rows.reset_index(drop=True), test_df], axis=1)

In [None]:
# compile overall results: one row per fold (in fold order), one column per metric.
# Built in a single step from the per-fold metric dicts rather than by repeated concat.
results = pd.DataFrame(list(results_dict.values()))
results

In [None]:
# Write the fold-level metrics table to disk
overall_results_path = 'cv_overall_results.csv'
results.to_csv(overall_results_path)

In [None]:
# compile per-language results: F1 of class 1 for every language in every fold
lang_perf_rows = []
for fold_preds in preds_dict.values():
  lang_fold_perf = dict()
  for lang in fold_preds['language'].unique():
    s = fold_preds[fold_preds['language'] == lang]
    # labels=[0, 1] guarantees the '1' entry exists even when a language has no
    # class-1 labels or predictions in this fold; zero_division=0 scores that case as 0.
    report = classification_report(s['labels'], s['preds'], labels=[0, 1],
                                   output_dict=True, zero_division=0)
    lang_fold_perf[lang] = report['1']['f1-score']
  lang_perf_rows.append(lang_fold_perf)

# One row per fold (indexed by fold number), one column per language
overall_lang_perf = pd.DataFrame(lang_perf_rows, index=list(preds_dict.keys()))


In [None]:
# Write the per-language F1 table to disk
language_results_path = 'language_performance.csv'
overall_lang_perf.to_csv(language_results_path)