# Data Processing

We annotate for 5 dimensions and prepare labeled datasets for each dimension:

*   cm_labeled
*   punitiveness_labeled
*   praise_labeled
*   otr_labeled
*   rationale_labeled



In [None]:
from google.colab import files
import io
import os
import re
import pandas as pd
import numpy as np

# Select one dataset, expose its annotation column under the generic name
# 'label', and number the rows from 1 so predictions can be traced back to them.
df = (
    pd.read_csv("cm_labeled.csv")
    .rename(columns={'cm': 'label'})
    .assign(pred_index=lambda frame: range(1, len(frame) + 1))
)

In [None]:
# Mask out numbers and student identifiers.
# Series.str.replace applies the same regular expressions as re.sub but passes
# missing values through as NaN instead of raising TypeError, so rows without
# text survive until the later dropna() in the setup cell removes them.
# NOTE(review): the lowercase pattern also replaces the single character that
# follows the identifier (it is part of the match); the patterns are kept as-is
# so training and application text are masked identically.
df["text"] = df["text"].str.replace('[0-9]+', "<num>", regex=True)
df["text"] = df["text"].str.replace('(student )[a-z][^a-z]', "<student>", regex=True)
df["text"] = df["text"].str.replace('(Student )[A-Z]', "<student>", regex=True)

# Utility

Install packages and set up the train, predict, and evaluation functions.

In [None]:
!pip install transformers==4.24.0
!pip install simpletransformers==0.63.11
from simpletransformers.classification import ClassificationModel, ClassificationArgs
from argparse import ArgumentParser
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import KFold, train_test_split
from scipy.stats import pearsonr, spearmanr
import warnings
import pandas as pd
from sys import exit
import logging
import torch
warnings.filterwarnings("ignore")

In [None]:
def pearson_corr(preds, labels):
    """Return the Pearson correlation coefficient between preds and labels."""
    coefficient, _p_value = pearsonr(preds, labels)
    return coefficient

def spearman_corr(preds, labels):
    """Return the Spearman rank correlation coefficient between preds and labels."""
    coefficient, _p_value = spearmanr(preds, labels)
    return coefficient

def accuracy(preds, labels):
    """Return the fraction of paired positions where prediction and label are equal."""
    n_correct = sum(pred == gold for pred, gold in zip(preds, labels))
    return n_correct / len(labels)

def precision(preds, labels):
    """Return scikit-learn's precision score, treating labels as truth and preds as predictions."""
    # Add average="weighted" to the call to score with weighted averaging instead.
    score = precision_score(labels, preds)
    return score

def recall(preds, labels):
    """Return scikit-learn's recall score, treating labels as truth and preds as predictions."""
    # Add average="weighted" to the call to score with weighted averaging instead.
    score = recall_score(labels, preds)
    return score

def f1(preds, labels):
    """Return scikit-learn's F1 score, treating labels as truth and preds as predictions."""
    # Add average="weighted" to the call to score with weighted averaging instead.
    score = f1_score(labels, preds)
    return score

In [None]:
def _labels_first(metric):
    """Adapt a ``metric(preds, labels)`` function to be called as ``(labels, preds)``."""
    def call_with_swapped_args(labels, preds):
        return metric(preds, labels)
    return call_with_swapped_args


def train(colname, train_df, eval_df, text_cols,
          output_dir, model="roberta", num_labels=2,
          num_train_epochs=10,
          train_batch_size=8, gradient_accumulation_steps=2,
          max_seq_length=512,
          cross_validate=False,
          balance_labels_trim=False,
          balance_labels_weights=False,
          weights = None):
    """Train a simpletransformers ClassificationModel and return it.

    Args:
        colname: Name of the target; used in the run directory name and as the
            Weights & Biases project name.
        train_df: Training DataFrame. Must contain a "labels" column when
            ``balance_labels_trim`` is set.
        eval_df: DataFrame evaluated during training.
        text_cols: List of text column names; only its length is used here, to
            divide ``max_seq_length`` between the texts.
        output_dir: Base directory. Run files go to
            ``<output_dir>/<colname>_train_size=<n>``.
        model: Pretrained model name; the part before the first "-" is used as
            the model type (e.g. "roberta-base" -> "roberta").
        num_labels: Number of classes; 1 switches the model to regression.
        num_train_epochs: Number of training epochs.
        train_batch_size: Training batch size.
        gradient_accumulation_steps: Batches accumulated per optimizer step.
        max_seq_length: Total sequence-length budget across all text columns.
        cross_validate: When true, the model is not saved (``no_save``).
        balance_labels_trim: When true, every label other than the most common
            one is resampled with replacement up to the most common label's
            count (despite the name, nothing is trimmed).
        balance_labels_weights: When true, ``weights`` is passed to the model
            as its ``weight`` argument.
        weights: Per-class weights used when ``balance_labels_weights`` is set.

    Returns:
        The ClassificationModel after ``train_model`` has run.
    """
    print("Train size: %d" % len(train_df))
    print("Eval size: %d" % len(eval_df))

    print(train_df.head())
    print(eval_df.head())

    print("Is CUDA available? " + str(torch.cuda.is_available()))

    print("balance_labels_trim: " + str(balance_labels_trim))
    if balance_labels_trim:
        most_common = train_df["labels"].value_counts().idxmax()
        print("Most common label is: %s" % most_common)
        most_common_df = train_df[train_df["labels"]==most_common]
        concat_list = [most_common_df]
        # Oversample each remaining label up to the size of the most common one.
        for label, group in train_df[train_df["labels"]!=most_common].groupby("labels"):
            concat_list.append(group.sample(replace=True, n=len(most_common_df)))
        train_df = pd.concat(concat_list)
        print("Train size: %d" % len(train_df))
        print(train_df["labels"].value_counts())

    # Shuffle training data
    train_df = train_df.sample(frac=1)
    save_dir = output_dir + "/" + colname + "_train_size=" + str(len(train_df))

    model_args = ClassificationArgs()
    model_args.reprocess_input_data = True
    model_args.overwrite_output_dir = True
    model_args.evaluate_during_training = True  # change if needed
    model_args.max_seq_length = int(max_seq_length / len(text_cols))
    model_args.num_train_epochs = num_train_epochs
    # Number of batches in one pass over the training data.
    # NOTE(review): with gradient accumulation an optimizer step spans several
    # batches, so this may not line up with epoch boundaries -- confirm.
    model_args.evaluate_during_training_steps = int(len(train_df) / train_batch_size)
    model_args.save_eval_checkpoints = False
    model_args.save_model_every_epoch = False
    model_args.wandb_project = colname
    model_args.train_batch_size = train_batch_size
    model_args.output_dir = save_dir
    model_args.best_model_dir = save_dir +"/best_model"
    model_args.cache_dir = save_dir + "/cache"
    model_args.tensorboard_dir = save_dir + "/tensorboard"
    model_args.regression = num_labels == 1
    model_args.gradient_accumulation_steps = gradient_accumulation_steps
    model_args.wandb_kwargs = {"reinit": True}
    model_args.fp16 = False
    model_args.fp16_opt_level = "O0"
    model_args.no_cache = False
    model_args.no_save = cross_validate
    model_args.save_optimizer_and_scheduler = True

    print("balance_labels_weights: " + str(balance_labels_weights))
    print("weights: " + str(weights))
    model_kwargs = {"use_cuda": torch.cuda.is_available(),
                    "num_labels": num_labels,
                    "args": model_args}
    if balance_labels_weights:
        model_kwargs["weight"] = weights
    # Use a separate local name so the `model` argument (a model name string)
    # is not rebound to the model object.
    clf = ClassificationModel(model.split("-")[0], model, **model_kwargs)

    print("regression: " + str(model_args.regression))
    print("num_labels: " + str(num_labels))

    train_args = {"use_multiprocessing": False,
                  "process_count": 1,
                  "use_multiprocessing_for_evaluation": False}
    if model_args.regression:
        extra_metrics = {"pearson": pearson_corr, "spearman": spearman_corr}
    else:
        extra_metrics = {"accuracy": accuracy, "precision": precision,
                         "recall": recall, "f1": f1}
    # simpletransformers calls every extra metric as metric(true_labels, preds),
    # while the notebook's metric functions are declared (preds, labels).
    # Without this adapter the reported precision and recall are swapped.
    extra_metrics = {name: _labels_first(fn) for name, fn in extra_metrics.items()}
    clf.train_model(train_df, eval_df=eval_df, args=train_args, **extra_metrics)
    return clf

In [None]:
def predict(fname, model_path, model=None,
            model_type="roberta-base", predict_list=None,
          index_list=None, index_colname="index"):
    """Run the model on ``predict_list`` and write the predictions to a text file.

    Args:
        fname: Prefix of the output file; ``<model_path>/<fname>_preds.txt`` is written.
        model_path: Directory the output file is written to. When ``model`` is
            None the model is also loaded from this path.
        model: Object with a ``predict(list)`` method returning
            ``(preds, outputs)``; loaded from ``model_path`` when None.
        model_type: Model name; the part before the first "-" selects the model
            type when loading.
        predict_list: Inputs to pass to ``model.predict``. Required.
        index_list: One identifier per input, written in the first column.
            Defaults to 0-based positions when None.
        index_colname: Header name for the identifier column.

    Returns:
        The predictions returned by ``model.predict``.

    Raises:
        ValueError: If ``predict_list`` is None.
    """
    print(model_path)

    if predict_list is None:
        raise ValueError("predict_list is required")

    if model is None:
        model = ClassificationModel(model_type.split("-")[0], model_path)

    preds, outputs = model.predict(predict_list)

    if index_list is None:
        # Fall back to positions so zip() below does not fail on None.
        index_list = range(len(preds))

    # Create the target directory so callers do not have to make it by hand.
    os.makedirs(model_path, exist_ok=True)
    with open(model_path + '/' + fname + '_preds.txt', 'w') as f:
        # Header kept byte-for-byte: a literal backslash (not a tab) separates
        # "pred" and "outputs", so existing readers of these files keep working.
        f.write(f"{index_colname}\tpred\\outputs\n")
        for index, pred, output in zip(index_list, preds, outputs):
            f.write(f"{index}\t{pred}\t{output}\n")

    return preds

In [None]:
def save_errors(fname, df):
  """Write over- and under-predicted rows to CSV files and download them.

  Rows where "pred" is greater than "labels" go to ``fp_<fname>.csv``; rows
  where "pred" is less than "labels" go to ``fn_<fname>.csv``.
  """
  error_selectors = (
      ("fp_", lambda frame: frame["pred"] > frame["labels"]),
      ("fn_", lambda frame: frame["pred"] < frame["labels"]),
  )
  for prefix, select in error_selectors:
    error_filename = prefix + fname + ".csv"
    df[select(df)].to_csv(error_filename)
    files.download(error_filename)

# Setup

Set parameters.

In [None]:
# Only show warnings and errors from the transformers library.
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

train_data = df
label_col = "label"
text_cols = "text"  # comma-separated names of one or two text columns
predict_index_col = "pred_index"
model_type = "roberta-base"
#model_type = "bert-base-cased"
text_cols = text_cols.split(",")
output_dir = "outputs/roberta"
#output_dir = "outputs/bert"
model = None
dev_split_size = 0
num_train_epochs = 5
train_batch_size = 8
gradient_accumulation_steps = 2
# Plain booleans: a trailing comma after the value would create a one-element
# tuple, and (False,) is truthy.
balance_labels_trim = False
balance_labels_weights = True
weights = [1, 0.15]  # passed to the model as `weight` when balance_labels_weights is set

print("Loading data from DataFrame with %d rows" % len(train_data))
train_data = train_data[~train_data[label_col].isnull()]
print("Loaded %d training examples." % len(train_data))
print("Using %s as label" % label_col)

# Rename to the column names used downstream and keep the prediction index in
# both layouts, since the cross-validation cell reads it from eval_df.
if len(text_cols) == 1:
  train_data = train_data.rename(columns={text_cols[0]: 'text', label_col: 'labels'})
  cols = ["text", "labels", predict_index_col]
elif len(text_cols) == 2:
  train_data = train_data.rename(columns={text_cols[0]: 'text_a',
                                          text_cols[1]: 'text_b',
                                          label_col: 'labels'})
  cols = ["text_a", "text_b", "labels", predict_index_col]
else:
  # Raise instead of calling exit(), which would stop the notebook kernel session.
  raise ValueError("You can have up to 2 texts to classify!")

train_data = train_data[cols].dropna()
train_data.head()

# Cross Validation

This workflow will ask you to sign in to Weights & Biases. Track model performance metrics on the W&B dashboards.

In [None]:
n = 5
kf = KFold(n_splits=n, random_state=42, shuffle=True)

# Validate the text layout once, before any fold is trained.
if len(text_cols) not in (1, 2):
  raise ValueError("You can have up to 2 texts to classify!")

for k, (train_index, val_index) in enumerate(kf.split(train_data)):
  print("Split %d" % k)
  output_dir_k = output_dir + "/" + label_col + "_k%d" % k

  train_df = train_data.iloc[train_index]
  eval_df = train_data.iloc[val_index]
  # Use the values from the setup cell rather than repeating literals here.
  model = train(label_col, train_df, eval_df, text_cols, output_dir=output_dir_k,
                model=model_type, num_labels=2, num_train_epochs=num_train_epochs,
                train_batch_size=train_batch_size,
                gradient_accumulation_steps=gradient_accumulation_steps,
                balance_labels_weights=balance_labels_weights, weights=weights,
                cross_validate=True)
  # Alternatively, use 1 label for training regression models
  # model = train(label_col, train_df, eval_df, text_cols, output_dir=output_dir_k,
  #               model=model_type, num_labels=1, num_train_epochs=num_train_epochs,
  #               cross_validate=True)

  if len(text_cols) == 1:
    predict_list = eval_df["text"].tolist()
  else:
    predict_list = eval_df[["text_a", "text_b"]].values.tolist()

  # Write this fold's held-out predictions next to its run directory.
  index_list = eval_df[predict_index_col].tolist()
  fname = label_col + "_pred" + "_split_%d" % k
  preds = predict(fname, output_dir_k, model, model_type, predict_list=predict_list,
                  index_list=index_list, index_colname=predict_index_col)

  # on the last split, save the false positives/negatives
  # if k == (n-1):
  #   save_errors(fname, eval_df.assign(pred=preds))

# Predict Final

This workflow will ask you to sign in to Weights & Biases. Track model performance metrics on the W&B dashboards.

In [None]:
# Fit the final model on every labeled example. eval_df is the same data, so
# the metrics logged during this run are computed on the training set itself.
train_df = train_data
eval_df = train_data
# train() has no `balance_labels` parameter; pass the same settings as the
# cross-validation run so the final model matches what was evaluated there.
model = train(label_col, train_df, eval_df, text_cols, output_dir,
              model_type, num_labels=2, num_train_epochs=num_train_epochs,
              train_batch_size=train_batch_size,
              gradient_accumulation_steps=gradient_accumulation_steps,
              balance_labels_weights=balance_labels_weights, weights=weights)

# Alternatively, use 1 label for training regression models
# model = train(label_col, train_df, eval_df, text_cols, output_dir=output_dir,
#               model=model_type, num_labels=1, num_train_epochs=num_train_epochs)

In [None]:
from google.colab import files
import io
import os
import re
import pandas as pd
import numpy as np

# TODO: Read in the file you want to predict
# Reminder: Remove training data from the application task set prior to uploading here
# Rows are numbered from 1 so each prediction can be joined back to its utterance.
app_df = pd.read_csv("YOURFILE.csv").assign(
    pred_index=lambda frame: range(1, len(frame) + 1)
)

In [None]:
# Mask out numbers and student identifiers, applying the substitutions in order.
for pattern, placeholder in (
    ('[0-9]+', "<num>"),
    ('(student )[a-z][^a-z]', "<student>"),
    ('(Student )[A-Z]', "<student>"),
):
    app_df["text"] = app_df["text"].apply(lambda x: re.sub(pattern, placeholder, x))

In [None]:
# If the application set is very large, predict in chunks
# Otherwise, adapt to predict once
n_chunks = 25
output_dir_k = output_dir + "/preds"
os.makedirs(output_dir_k, exist_ok=True)  # no need to create the folder by hand

# Split row positions (same boundaries as splitting the frame itself) and
# select each slice with iloc, instead of handing the DataFrame to NumPy.
chunks = [app_df.iloc[positions]
          for positions in np.array_split(np.arange(len(app_df)), n_chunks)]
for chunk_n, chunk in enumerate(chunks):
  if chunk.empty:
    # Happens when there are fewer rows than chunks; nothing to predict.
    continue
  predict_list = chunk["text"].tolist()
  index_list = chunk[predict_index_col].tolist()
  fname = label_col + "_pred_%d" % chunk_n
  predict(fname, output_dir_k, model, model_type, predict_list=predict_list,
        index_list=index_list, index_colname=predict_index_col)

In [None]:
# Piece predicted chunks back together again
preds_dir = output_dir + "/preds"  # same directory the chunked prediction cell writes to
outputfiles = sorted(os.listdir(preds_dir))
output_dfs = []
for ofile in outputfiles:
  if not ofile.startswith(label_col + "_pred_"):
    continue
  # Each data row has three tab-separated fields (index, prediction, raw
  # outputs). Skip the header line and name the fields explicitly instead of
  # relying on pandas to infer an index column from the header.
  ofile_df = pd.read_csv(preds_dir + "/" + ofile, sep="\t", skiprows=1,
                         header=None, names=["pred_index", "pred", "raw"])
  # Keep the layout written so far: pred_index doubles as the unnamed row
  # index, followed by the pred, raw and pred_index columns.
  ofile_df = ofile_df.set_index("pred_index", drop=False).rename_axis(None)
  output_dfs.append(ofile_df[["pred", "raw", "pred_index"]])

if not output_dfs:
  raise FileNotFoundError("No prediction files starting with '%s_pred_' in %s"
                          % (label_col, preds_dir))

pred_df = pd.concat(output_dfs)
# Join with original utterances
combined_df = pred_df.join(app_df.set_index('pred_index'), on="pred_index")
combined_df_sorted = combined_df.sort_values(by=['pred_index'])
combined_df_sorted.to_csv("predictions.csv")
files.download('predictions.csv')