In [None]:
!pip install simpletransformers transformers==4.40.2

In [None]:
# Load the required packages

# Dataframes
import pandas as pd, numpy as np

# Regular expressions
import re

# Unidecoder
import unicodedata

# Timestamp / time measurement
import time

# for train/test data preparation
from sklearn.model_selection import train_test_split

# Label encode
from sklearn.preprocessing import LabelEncoder

# Class weights
from sklearn.utils.class_weight import compute_class_weight

# Model performance scores
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

# PyTorch: enable GPU access
import torch

# Simpletransformers classifier
from simpletransformers.classification import ClassificationModel, ClassificationArgs


## Load data

In [None]:
# Training data: read the labelled set and store the final_climate flag as integers
dat = pd.read_csv('training_data.csv').astype({'final_climate': int})

# Sum of the flag column (the number of positive rows if the flag is 0/1)
dat['final_climate'].sum()

In [None]:
# Use qs_new as the index while keeping it as a regular column;
# verify_integrity makes this fail if qs_new contains duplicates
dat = dat.set_index("qs_new", drop=False, verify_integrity=True)

In [None]:
# Integer class codes derived from the categories of final_climate
dat["label"] = pd.Categorical(dat["final_climate"]).codes
dat["label"].value_counts()

In [None]:
# Stratification key: one integer id per (language, label) combination,
# following https://stackoverflow.com/a/62918682
strata_index = pd.MultiIndex.from_frame(dat[['language', 'label']])
dat["strata_"] = strata_index.factorize()[0]

## Load GPU

In [None]:
# To pin a specific GPU, uncomment and set it here:
# gpu = 0
# torch.cuda.set_device(gpu)

# Fall back to the CPU when CUDA is unavailable; otherwise report the GPU in use.
if not torch.cuda.is_available():
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    device = torch.device("cuda")
    gpu_count = torch.cuda.device_count()
    active_gpu = torch.cuda.current_device()
    print('There are %d GPU(s) available.' % gpu_count)
    print('We will use GPU {}:'.format(active_gpu), torch.cuda.get_device_name(active_gpu))

## Set up for training final model

In [None]:
# Create the label encoder
label_encoder = LabelEncoder()

# Encode the labels of the training frame into the 'labels' column.
# The training data in this notebook lives in `dat`; the previously referenced
# `train_df` is not defined in the visible cells and raised a NameError.
dat['labels'] = label_encoder.fit_transform(dat['label'])


In [None]:
# Balanced class weights for the two classes.
# `classes` is passed as a NumPy array: scikit-learn documents it as an ndarray,
# and recent releases reject a plain list during parameter validation.
weights = compute_class_weight(class_weight='balanced',
                               classes=np.array([0, 1]),
                               y=dat['labels'])

# Plain Python floats, one per class, in the order given by `classes`
weights = weights.tolist()
print(weights)

In [None]:
# Model family and pretrained checkpoint name passed to ClassificationModel
model_type, model_name = "xlmroberta", "xlm-roberta-base"

In [None]:
%%time

# Create a ClassificationModel with optimised hyperparams
model = ClassificationModel(model_type, model_name, weight=weights,
                            num_labels = 2,
                            args={'reprocess_input_data': True,
                                  'overwrite_output_dir': True,
                                  'output_dir': 'results_22May/',
                                  # Hyperparameters
                                  'train_batch_size': 8,
                                  'num_train_epochs': 2,
                                  'learning_rate': 1e-5,
                                  'weight_decay': .187677,
                                  'hidden_dropout_prob': .188775,
                                  'attention_probs_dropout_prob': .330174,
                                  # Text processing
                                  'max_seq_length': 256,
                                  'sliding_window': True,
                                  'stride': 0.6,
                                  'do_lower_case': True,
                                  # Evaluation
                                  'evaluate_during_training': False,
                                  'evaluate_during_training_verbose': True,
                                  'evaluate_during_training_steps': -1,
                                  # Saving
                                  'save_model_every_epoch': False,
                                  'save_eval_checkpoints': True,
                                  })



In [None]:
%%time
# Train and evaluate the model
model.train_model(train_df = dat,
                  f1_eval = f1_class)

 ## Load model and collect predictions

In [None]:
# Reload the fine-tuned model from the training output directory.
# use_cuda follows actual availability, because the default (True) raises on
# machines without a CUDA device.
model = ClassificationModel(
    model_type, model_name = 'results_22May',
    use_cuda = torch.cuda.is_available(),
)

In [None]:
# Full data set that the trained model will be applied to
master = pd.read_csv(filepath_or_buffer='total_predictions_training_15May_trimmed_for_pred.csv')

In [None]:
# Drop every row whose qs_new also appears in the training set, then renumber the
# remaining rows (the previous index is kept as an extra column)
in_training = master['qs_new'].isin(dat['qs_new'])
master = master.loc[~in_training].reset_index()

In [None]:
# (rows, columns) of the data left to predict on once training rows are removed
master.shape

In [None]:
%%time
# Code for running (and saving) the predictions in batches in case GPU resources limited. Set variable stop to size of dataset or simply remove the row indexing  to do it all at once.
start=0
stop=int(2e5)
preds,output = model.predict(master['original_text'][start:stop].tolist())
pd.DataFrame(zip(master['qs_new'][start:stop],preds,output),columns=['qs_new','preds','output']).to_csv(f'results_22May/pred_outputs_{start}_{stop}.csv')
