In [None]:
!pip install simpletransformers transformers==4.40.2

In [None]:

# Load the required packages

# Regular expressions
import re

# Timestamp / time measurement
import time

# Unidecoder
import unicodedata

# Dataframes
import numpy as np
import pandas as pd

# PyTorch: enable GPU access
import torch

# for train/test data preparation
from sklearn.model_selection import train_test_split

# Label encode
from sklearn.preprocessing import LabelEncoder

# Class weights
from sklearn.utils.class_weight import compute_class_weight

# Model performance scores
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    f1_score,
    precision_score,
    recall_score,
)

# Simpletransformers classifier
from simpletransformers.classification import ClassificationModel, ClassificationArgs


## Load data
The cross-validation code below is adapted from Hauke Licht's "Cross-lingual supervised text classification" tutorial: https://github.com/haukelicht/crosslingual-supervised-text-classification-tuorial?tab=readme-ov-file

In [None]:
# Training data: read the CSV and cast the `final_climate` flag to integers
dat = pd.read_csv('training_data.csv').astype({'final_climate': int})

# Sum of the flag column, displayed as the cell output
dat['final_climate'].sum()

In [None]:
# Index the rows by the "qs_new" identifier while keeping it as a regular column.
# verify_integrity=True makes pandas raise if any identifier occurs twice.
dat = dat.set_index("qs_new", drop=False, verify_integrity=True)

In [None]:
# make numeric labels: integer category codes of the `final_climate` column
climate_as_category = dat["final_climate"].astype("category")
dat["label"] = climate_as_category.cat.codes

# Class balance, displayed as the cell output
dat["label"].value_counts()

In [None]:
# Make stratifications of data by language and climate relevance, from https://stackoverflow.com/a/62918682
# Every distinct (language, label) pair receives its own integer code.
language_label_index = dat.set_index(['language', 'label']).index
strata_codes, _strata_uniques = language_label_index.factorize()
dat["strata_"] = strata_codes

## Set up GPU

In [None]:
# If you want to select a specific GPU, set it here:
# gpu = 0
# torch.cuda.set_device(gpu)

# Choose the torch device: CPU fallback first, GPU otherwise.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # Tell PyTorch to use the GPU.
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    # Report index and name of the currently selected CUDA device.
    active_gpu = torch.cuda.current_device()
    print('We will use GPU {}:'.format(active_gpu), torch.cuda.get_device_name(active_gpu))

## Set up for cross validation

In [None]:
# CREATE [train+test]/validation split (80-20)

# The split is stratified on the (language, label) strata. The random state is
# fixed so that the held-out validation set -- and every file and score derived
# from it -- is identical after a kernel restart.
train_test_ids, val_ids = train_test_split(
    dat.index.values,
    test_size=.2,
    stratify=dat.strata_.values,
    random_state=4,  # same value as model_args.manual_seed
)

print(len(train_test_ids), "training samples")
print(len(val_ids), "val samples")


In [None]:
# create train+test and val dfs: one row per selected ID, in the order of the ID arrays,
# with a fresh 0..n-1 row index and the column names `text` / `labels`
split_column_names = {'original_text': 'text', 'final_climate': 'labels'}

train_test_df = (
    dat.loc[train_test_ids, ['original_text', 'final_climate', 'strata_']]
    .rename(columns=split_column_names)
    .reset_index(drop=True)
)
val_df = (
    dat.loc[val_ids, ['original_text', 'final_climate']]
    .rename(columns=split_column_names)
    .reset_index(drop=True)
)

In [None]:
# Load the label encoder
label_encoder = LabelEncoder()

# Encode the labels.
# The class -> integer mapping is learned on the train+test portion only and the
# *same* fitted mapping is then applied to the validation set. Re-fitting on the
# validation labels could silently assign different integers to the same class
# whenever the two splits do not contain an identical set of labels.
train_test_df['labels'] = label_encoder.fit_transform(train_test_df.labels)
val_df['labels'] = label_encoder.transform(val_df.labels)

In [None]:
# Persist both splits as CSV (header row written, row index omitted)
for split_frame, csv_path in ((train_test_df, 'cv_train_test.csv'), (val_df, 'cv_val.csv')):
    split_frame.to_csv(csv_path, header=True, index=False)

In [None]:
# MAKE THE FOLDS
N_FOLDS = 5

# Seeded generator so that fold membership is reproducible across kernel restarts.
fold_rng = np.random.default_rng(4)  # same value as model_args.manual_seed

# Assign every train+test ID to one of the N_FOLDS folds uniformly at random.
# Fold sizes are therefore only approximately equal.
# NOTE(review): this assignment does not use the `strata_` column, so the folds
# are not stratified by language/label -- confirm this is intended.
cv_idxs = fold_rng.integers(0, N_FOLDS, size=len(train_test_ids))

cv_folds_ids = list()

# Iterate over N_FOLDS (not a hard-coded 5) so changing the constant is enough.
for fold in range(N_FOLDS):
  idxs = cv_idxs == fold
  # train IDs, test IDs
  cv_folds_ids.append( ( train_test_ids[np.logical_not(idxs)], train_test_ids[idxs] ) )


In [None]:
# ARRANGE INTO DICTIONARY: fold number -> (train IDs, test IDs)
cv_folds_dict = dict(enumerate(cv_folds_ids))

In [None]:
# set up model arguments

model_args = ClassificationArgs()

# Run control
model_args.reprocess_input_data = True
model_args.overwrite_output_dir = True
model_args.evaluate_during_training = True
model_args.manual_seed = 4
model_args.use_multiprocessing = True

# Optimisation and input handling
model_args.train_batch_size = 8
model_args.eval_batch_size = 8
model_args.num_train_epochs = 2
model_args.learning_rate= 1e-5
model_args.max_seq_length = 256
model_args.do_lower_case=True
model_args.sliding_window = True
model_args.stride = 0.6

# Regularisation
model_args.weight_decay = .187677

# The dropout rates are options of the underlying transformer *config*, not of
# ClassificationArgs. Assigning them directly as attributes on `model_args`
# only creates extra attributes that never reach the model, so they are routed
# through `config`, whose entries override the model configuration at load time.
model_args.config = {
    "hidden_dropout_prob": .188775,
    "attention_probs_dropout_prob": .330174,
}

# Labels and checkpointing: nothing is written to disk during cross-validation
model_args.labels_list = [0,1]
model_args.no_save = True
model_args.save_model_every_epoch=False
model_args.save_optimizer_and_scheduler=False


In [None]:
# Per-fold outputs: evaluation metrics and validation predictions merged with `dat`
results_dict = dict()
preds_dict = dict()

# NOTE(review): `model_type`, `model_name` and the metric callables `f1_class`,
# `recall` and `precision` are not defined in the visible part of this notebook;
# they have to be defined before this cell is run.

for n,fold in cv_folds_dict.items():
  train_ids, test_ids = fold

  # create train, test dfs
  train_df = pd.DataFrame(zip(dat.loc[train_ids]['original_text'].values,dat.loc[train_ids]['label'].values),columns=['text','labels'])
  test_df = pd.DataFrame(zip(dat.loc[test_ids]['original_text'].values,dat.loc[test_ids]['label'].values),columns=['text','labels'])

  # Examine size of splits
  print(train_df.shape[0], test_df.shape[0])

  # TEST STRATA DISTRIBUTIONS ACROSS FOLDS
  # train_dist = dat.loc[train_ids].groupby(["language", "label"]).size().unstack()
  # test_dist  = dat.loc[test_ids].groupby(["language", "label"]).size().unstack()

  # Load the label encoder
  label_encoder = LabelEncoder()

  # Encode the labels: fit on the training fold only and reuse that mapping for
  # the test fold, so both folds are guaranteed to share one class -> integer map.
  train_df['labels'] = label_encoder.fit_transform(train_df.labels)
  test_df['labels'] = label_encoder.transform(test_df.labels)


  print(f'Fold: {n}')

  # Balanced class weights computed from this fold's training labels.
  # `classes` is passed as an ndarray: recent scikit-learn releases validate the
  # argument type and reject a plain Python list.
  weights = compute_class_weight(class_weight = 'balanced', classes=np.array([0, 1]), y=train_df.labels)
  weights = [*weights]

  # A fresh model per fold; the fold's test split is used for evaluation during training.
  model = ClassificationModel(model_type, model_name,  weight=weights, num_labels=2, args=model_args)
  print(model.train_model(train_df, eval_df = test_df))

  # Score the fold's model on the held-out validation set (shared by all folds).
  eval_result, eval_model_outputs, eval_wrong_predictions = model.eval_model(val_df,
                                                              f1_score = f1_class,
                                                              acc=accuracy_score,
                                                              recall=recall,
                                                              precision=precision)


  print(eval_result)
  results_dict[n]=eval_result

  preds,output = model.predict(val_df['text'].tolist())
  true_labels = val_df['labels']
  print(classification_report(true_labels, preds))

  # Attach the predictions to the validation frame and join them back onto the
  # full data (`dat`; the original referenced an undefined `df`) so that columns
  # such as `language` are available for the per-language breakdown.
  # NOTE(review): the join key is the raw text; if the same text occurs more than
  # once the inner join multiplies those rows -- TODO confirm texts are unique.
  val_df['preds']=preds
  val_df['original_text']=val_df['text']
  preds_merged = dat.merge(val_df,on='original_text',how='inner')
  preds_dict[n]=preds_merged


In [None]:
# Save overall results: one column per fold, rows are the keys of each fold's eval_result
fold_results = pd.DataFrame.from_dict(results_dict, orient='columns')
fold_results.to_csv('results_5-fold_cv.csv')

In [None]:
# language specific f1 score
# For every fold, compute the F1 score of class 1 separately for each language
# present in that fold's merged predictions.
lang_fold_records = dict()
for k, fold_preds in preds_dict.items():
  lang_fold_perf = dict()
  for i in fold_preds['language'].unique():
    s = fold_preds[fold_preds['language']==i]
    try:
      report = classification_report(s['labels'], s['preds'],output_dict=True)
      lang_fold_perf[i]=report['1']['f1-score']
    except (KeyError, ValueError) as err:
      # Best effort: skip a language whose subset cannot be scored (e.g. class 1
      # absent from the report, or an empty subset) and say why. Only these two
      # exception types are caught so that genuine programming errors, such as
      # a missing import, are no longer silently swallowed.
      print('Error with', i, '-', repr(err))
  lang_fold_records[k] = lang_fold_perf

# Build the table once: one row per fold (indexed by fold number), one column
# per language; languages that could not be scored in a fold are left as NaN.
lang_perf = pd.DataFrame(list(lang_fold_records.values()), index=list(lang_fold_records.keys()))


In [None]:
# Write the per-language F1 table to disk
language_perf_csv = 'language_performance.csv'
lang_perf.to_csv(language_perf_csv)