#Training the model and using it for prediction.
On Google Colab, only the Simpletransformers and wget libraries need to be installed: Simpletransformers for working with the transformer models, and wget for retrieving the zip file from the GitHub repository.

In [None]:
# Install the two extra libraries this notebook needs (see the imports in the next cell).
!pip install simpletransformers
!pip install wget

In [None]:
import pandas as pd
import numpy as np
import wget
from simpletransformers.classification import ClassificationModel, ClassificationArgs
from sklearn.model_selection import train_test_split

#Evaluation
import sklearn
from scipy.special import softmax

#PARAMETERS
CLASSES = 3  # number of stance classes: passed to the model as num_labels and used when computing class weights
seed = 1  # one seed for everything: random_state of the train/evaluation split and manual_seed of the model

In [None]:
#Load the training data
# The annotated CSV is read directly from the GitHub repository, so this cell needs network access.
url = 'https://github.com/markmets/immigration-prediction-EST/raw/main/Annotated_Dataset.csv'
dataset = pd.read_csv(url)
dataset  # last expression of the cell: shows the loaded frame in the notebook

# DATA PREPARATION

In [None]:
# Change the dataset to 0-2 scale (against, neutral, supportive). Model requires numerical input.
# Annotated values '1'/'2' become 0, '3' becomes 1 and '4'/'5' become 2.
# A single mapping replaces the three chained replace() calls, whose result
# depended on the order in which they were executed.
STANCE_TO_LABEL = {'1': '0', '2': '0', '3': '1', '4': '2', '5': '2'}

# .copy() detaches the filtered rows from the loaded frame, so the column
# assignment below writes to an independent frame rather than to a slice
# (this is what triggers pandas' SettingWithCopyWarning).
dataset = dataset[dataset.stanceConsolidated != 'MH'].copy() #remove nonevaluative
dataset['stanceConsolidated'] = dataset['stanceConsolidated'].replace(STANCE_TO_LABEL)

dataset.reset_index(drop=True, inplace=True) #reset indexing after removing rows
dataset = dataset[['sentence', 'stanceConsolidated']] #Drop extra columns
dataset = dataset.astype({"stanceConsolidated": int}) #label has to be numerical type

# Class distribution and total size after the conversion.
print(dataset['stanceConsolidated'].value_counts(), '\nfull len:', len(dataset))

dataset.columns = ["text", "labels"] #rename columns

In [None]:
# Hold out 20% of the data as the evaluation set.
# Stratifying on the label column is requested so both parts are split per class;
# random_state ties the split to the notebook-wide seed.
train_df, eval_df = train_test_split(
    dataset,
    test_size=0.2,
    stratify=dataset['labels'],
    random_state=seed,
)
print(f'train data size: {len(train_df)},   evaluation data size: {len(eval_df)}')

In [None]:
# Get weights
# Necessary with highly unbalanced datasets: every class gets a weight that is
# inversely proportional to how often it occurs in the training split.

unique, counts = np.unique(train_df['labels'], return_counts=True)

# The weight list is indexed by label, so the labels present in the training
# split have to be exactly 0 .. CLASSES-1. Fail early with a clear message
# instead of producing a list of the wrong length.
if list(unique) != list(range(CLASSES)):
    raise ValueError(
        'expected training labels 0..{} but found {}'.format(CLASSES - 1, list(unique))
    )

# One weight per class, for any number of classes (the previous version only
# worked for exactly 3). The factor len(train_df) / 2.0 is kept from the
# original formula; it is the same for every class, so only the class counts
# make the weights differ from each other.
class_weight = [(1 / count) * (len(train_df) / 2.0) for count in counts]

#Weights output
print(class_weight)

# TRAIN

In [None]:
### Adds Classification report
def clasreport(labels, preds):
    """Return scikit-learn's classification report for the predictions as a dict.

    Args:
        labels: true labels.
        preds: predicted labels.
    """
    # Import the submodule explicitly: a bare ``import sklearn`` does not by
    # itself guarantee that ``sklearn.metrics`` has been loaded.
    from sklearn.metrics import classification_report

    return classification_report(labels, preds, output_dict=True)
def confmatrix(labels, preds):
    """Return scikit-learn's confusion matrix for the predictions.

    Args:
        labels: true labels.
        preds: predicted labels.
    """
    # Import the submodule explicitly: a bare ``import sklearn`` does not by
    # itself guarantee that ``sklearn.metrics`` has been loaded.
    from sklearn.metrics import confusion_matrix

    return confusion_matrix(labels, preds)

### Set model parameters
# All options are collected on one ClassificationArgs object, which is passed to the model when it is created.
  #Main settings
model_args = ClassificationArgs()
model_args.manual_seed = seed #manually set seed (same seed as the train/evaluation split)
model_args.use_multiprocessing = False #Not working with XLMRoberta
#model_args.labels_list = ["0", "1", "2"] #Errors

  #Memory relevant and hyperparameters
model_args.num_train_epochs = 2
model_args.learning_rate = 5e-5
model_args.train_batch_size = 16 #default 8
model_args.eval_batch_size = 64
model_args.max_seq_length = 512 
model_args.warmup_ratio = 0.1

  #Saving models
model_args.overwrite_output_dir = True
model_args.reprocess_input_data = True 
model_args.no_save = False #Saving models, False is default

  #Evaluation during training is switched off here (original author's note: "currently does it anyways" - unverified)
model_args.evaluate_during_training = False
model_args.best_model_dir='output' #directory for output model
# NOTE(review): the prediction cell loads the model from "/content/outputs/", not from 'output'.
# Presumably best_model_dir only matters when evaluate_during_training is enabled and the final
# model goes to the library's default output directory - confirm against the Simple Transformers docs.

### TRAIN
# Create the classifier from the pretrained checkpoint, fine-tune it on train_df
# and evaluate it on the held-out eval_df.
model = ClassificationModel("camembert", "EMBEDDIA/est-roberta", #Choose model
                            use_cuda=True, #True applies GPU
                             # cuda_device=1, #Choose GPU if multiple
                            num_labels=CLASSES, #number of classes (3 for negative, neutral, positive)
                            weight= class_weight, #class weights
                        args= model_args) #load parameters defined above

model.train_model(train_df)
# clasreport/confmatrix are passed as extra metrics by keyword; their results are read back
# from `result` under those same keyword names below.
result, model_outputs,wrong_predictions  = model.eval_model(eval_df, classification_report=clasreport, confusion_matrix=confmatrix)

print(pd.DataFrame(result['classification_report']).transpose()) #show classification report
#result["confusion_matrix"] #for numerical confusion matrix
#result["confusion_matrix"] #for numerical confusion matrix

#PREDICT

In [None]:
# Download the archive with the sentences to classify and unpack it.
# The absolute /content/ paths below assume the Colab working directory.
!wget https://github.com/markmets/immigration-prediction-EST/raw/main/All_immigration_sentences.zip #get zip file
!unzip /content/All_immigration_sentences.zip #unzip

predict_df = pd.read_csv('/content/All_immigration_sentences.csv')  #load csv
predict_df  # last expression of the cell: shows the loaded frame in the notebook

In [None]:
# PREDICT
# Load the saved model, use GPU
loaded_model = ClassificationModel(
    "camembert",
    "/content/outputs/",
    use_cuda=True,
)

# PREDICT EXAMPLE SENTENCES
#predictions, raw_outputs = loaded_model.predict(["Ma armastan immigrante!", "immigratsiooni teemal on palju diskuteeritud", "Rohkem immigrante tähendab vähem töökohti kõikidele põliseestlastele ja see on probleem."])
#predictions

# PREDICT LOADED DATASET
# The first progress bar is misleading: the real duration is much shorter than it
# estimates. The second bar is accurate. Takes about 20 minutes.
first_column = predict_df.columns[0]  # the sentences are in the first column
sentences = list(predict_df[first_column])
predictions, raw_outputs = loaded_model.predict(sentences)
print('done:', len(predictions), 'predictions')

In [None]:
#SAVE THE PREDICTIONS
# Turn the raw model outputs into per-row probabilities (softmax over the class axis).
probabilities_outputs = softmax(raw_outputs, axis=1)

# Store the predicted class and, next to it, the probability vector of each row.
predict_df['predictions_stance'] = predictions
predict_df['probabilities_outputs_stance'] = list(probabilities_outputs)
predict_df

#SAVE AS CSV FILE
#predict_df.to_csv('predictedData.csv', index=False)

# Extra - Adding threshold predictions
By default, the prediction is simply the most probable class. The threshold variant below makes it possible to distinguish the cases in which the probability of the most likely class exceeds that of the second most likely class by at least a given threshold from the cases in which it does not.

In [None]:
def check_difference(valueList, threshold=0.3):
    """Pick the most probable class only if it clearly beats the runner-up.

    Args:
        valueList: class probabilities of one sentence.
        threshold: minimum absolute gap between the largest and the second
            largest probability (0.3 = the top class must lead by at least
            0.3 in probability).

    Returns:
        The index of the largest value when the gap reaches the threshold,
        otherwise the string 'Uncertain'.
    """
    ranked = np.sort(valueList)  # ascending: the two largest values are at the end
    margin = ranked[-1] - ranked[-2]
    if margin >= threshold:
        return np.argmax(valueList)
    return 'Uncertain'
    
# Apply the threshold check once per row and store the result.
# (The earlier version ran the same apply twice and threw the first result away.)
predict_df['predictions_stance_0.3'] = predict_df['probabilities_outputs_stance'].apply(check_difference)
predict_df

In [None]:
#Return the certainty of the class (in relation to second best)
def check_certainty(valueList):
    """Return how far the most probable class is ahead of the second one.

    Args:
        valueList: class probabilities of one sentence.

    Returns:
        The largest value minus the second largest value.
    """
    top_two = np.sort(valueList)[-2:]  # ascending order: [second largest, largest]
    return top_two[1] - top_two[0]
    
# Gap between the best and the second-best class probability, computed per row.
certainty_per_row = predict_df['probabilities_outputs_stance'].apply(check_certainty)
predict_df['stance_certainty'] = certainty_per_row
predict_df