# Emotion Regression: How angry are you?

## Load data and packages

In [1]:
# import the required packages and select the compute device (GPU if available)
import simpletransformers
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import logging
import time
import torch
import pandas as pd
import numpy as np
from datasets import load_dataset

# Prefer the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"CUDA available is '{torch.cuda.is_available()}' --> device={device}")

CUDA available is 'True' --> device=cuda


In [2]:
# load and preprocess data as pandas.DataFrame

def load_dataframe(key="train"):
    """Read one split of the anger-intensity data into a DataFrame.

    The selected file is read from the current working directory. It is
    parsed as tab-separated text with one header line; the second column
    becomes the text and the fourth column is converted to a float label.
    A short preview of the parsed rows is printed as a side effect.

    Args:
        key: "train" or "test"; selects which hard-coded file name is read.

    Returns:
        pandas.DataFrame with the columns "text" and "labels".

    Raises:
        AssertionError: if ``key`` is neither "train" nor "test".
        FileNotFoundError: if the selected file does not exist.
        IndexError / ValueError: if a non-blank row has fewer than four
            columns or a non-numeric fourth column.
    """
    assert key in ["train", "test"]
    if key == "train":
        fname = "EI-reg-En-anger-train.txt"
    else:
        fname = "2018-EI-reg-En-anger-test-gold.txt"
    with open(fname, encoding="utf-8") as f:
        # Split on "\n" only: str.splitlines() would also break on Unicode
        # separators that may occur inside a tweet.
        lines = f.read().split("\n")

    data = []
    for line in lines[1:]:  # lines[0] is the header
        # Skip blank lines instead of blindly dropping the final element, so
        # the last row survives when the file has no trailing newline.
        if not line.strip():
            continue
        cols = line.split("\t")
        data.append([cols[1], float(cols[3])])

    print(f"* {key}-data (list-of-list):")
    for row in data[:5]:
        print(row)
    print("\n")
    print(f"* DataFrame ({key}):")
    # Passing the column names here also works when no rows were parsed.
    df = pd.DataFrame(data, columns=["text", "labels"])
    # sample() raises if asked for more rows than exist, so cap the preview.
    print(df.sample(min(5, len(df))))
    print("\n")
    return df

    
# Load both splits once; each call also prints a preview of the parsed rows.
train_df = load_dataframe("train")
test_df = load_dataframe("test")

* train-data (list-of-list):
['@xandraaa5 @amayaallyn6 shut up hashtags are cool #offended', 0.562]
['it makes me so fucking irate jesus. nobody is calling ppl who like hajime abusive stop with the strawmen lmao', 0.75]
['Lol Adam the Bull with his fake outrage...', 0.417]
["@THATSSHAWTYLO passed away early this morning in a fast and furious styled car crash as he was leaving an ATL strip club. That's rough stuff", 0.354]
['@Kristiann1125 lol wow i was gonna say really?! haha have you seen chris or nah? you dont even snap me anymore dude!', 0.438]


* DataFrame (train):
                                                   text  labels
1334  It's the most magical time of the year......Xm...   0.429
1445  @luxbet Did you even give out any pizzas ? Ser...   0.400
57    @NJDDanin123 I personally liked #relentless …d...   0.354
528   @LaraTheIrish &lt; feel everything. She would ...   0.479
1257                         Literally fuming fuck sake   0.860


* test-data (list-of-list):
["@PageSh

## Setup model architecture

In [3]:
# setup regression transformer model (BERT)

def get_reg_transformer(epochs=20, batch_size=32, learning_rate=4e-5):
    """Build a `bert-base-uncased` ClassificationModel configured for regression.

    Args:
        epochs: number of training epochs.
        batch_size: batch size used for both training and evaluation.
        learning_rate: learning rate stored in the model arguments.

    Returns:
        A ClassificationModel with a single output (``num_labels=1``) and
        ``regression=True``.
    """
    model_args = ClassificationArgs()
    model_args.num_train_epochs = epochs
    model_args.regression = True
    # NOTE(review): ClassificationArgs does not appear to define a `device`
    # option, so this assignment is presumably ignored by the library; it is
    # kept for compatibility. Device selection is done via `use_cuda` below.
    model_args.device = device
    model_args.train_batch_size = batch_size
    model_args.eval_batch_size = model_args.train_batch_size
    model_args.learning_rate = learning_rate
    model = ClassificationModel(
        "bert",
        "bert-base-uncased",
        num_labels=1,
        args=model_args,
        # Follow the module-level `device`, so a CPU-only machine does not
        # request CUDA (the library default is use_cuda=True).
        use_cuda=(device.type == "cuda"),
    )
    return model

# Instantiate the regression model with the function's default settings.
model = get_reg_transformer()


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Train model and evaluate test-set (PearsonR)

In [4]:
# Fine-tune the regression model on the training split ("text" / "labels" columns).
model.train_model(train_df)

  0%|          | 0/1701 [00:00<?, ?it/s]

Epoch:   0%|          | 0/20 [00:00<?, ?it/s]

Running Epoch 0 of 20:   0%|          | 0/54 [00:00<?, ?it/s]

Running Epoch 1 of 20:   0%|          | 0/54 [00:00<?, ?it/s]

Running Epoch 2 of 20:   0%|          | 0/54 [00:00<?, ?it/s]

Running Epoch 3 of 20:   0%|          | 0/54 [00:00<?, ?it/s]

Running Epoch 4 of 20:   0%|          | 0/54 [00:00<?, ?it/s]

Running Epoch 5 of 20:   0%|          | 0/54 [00:00<?, ?it/s]

Running Epoch 6 of 20:   0%|          | 0/54 [00:00<?, ?it/s]

Running Epoch 7 of 20:   0%|          | 0/54 [00:00<?, ?it/s]

Running Epoch 8 of 20:   0%|          | 0/54 [00:00<?, ?it/s]

Running Epoch 9 of 20:   0%|          | 0/54 [00:00<?, ?it/s]

Running Epoch 10 of 20:   0%|          | 0/54 [00:00<?, ?it/s]

Running Epoch 11 of 20:   0%|          | 0/54 [00:00<?, ?it/s]

Running Epoch 12 of 20:   0%|          | 0/54 [00:00<?, ?it/s]

Running Epoch 13 of 20:   0%|          | 0/54 [00:00<?, ?it/s]

Running Epoch 14 of 20:   0%|          | 0/54 [00:00<?, ?it/s]

Running Epoch 15 of 20:   0%|          | 0/54 [00:00<?, ?it/s]

Running Epoch 16 of 20:   0%|          | 0/54 [00:00<?, ?it/s]

Running Epoch 17 of 20:   0%|          | 0/54 [00:00<?, ?it/s]

Running Epoch 18 of 20:   0%|          | 0/54 [00:00<?, ?it/s]

Running Epoch 19 of 20:   0%|          | 0/54 [00:00<?, ?it/s]

(1080, 0.008126266685199139)

In [5]:
# evaluate model on train-set (to optimize hyperparameters)
import scipy
def evaluate_pearsonR(key="train", train=train_df, test=test_df):
    """Print the Pearson correlation between model predictions and gold labels.

    Uses the module-level ``model`` to predict a score for every text in the
    selected split and correlates the predictions with the "labels" column.

    Args:
        key: "train" or "test"; selects which DataFrame is scored.
        train: DataFrame with "text" and "labels" columns, used for "train".
        test: DataFrame with "text" and "labels" columns, used for "test".

    Note:
        The defaults for ``train`` and ``test`` are bound when this function
        is defined; re-run the definition (or pass the frames explicitly)
        after reloading the data.

    Raises:
        AssertionError: if ``key`` is neither "train" nor "test".
    """
    assert key in ["train", "test"]
    df = train if key == "train" else test
    x = list(df["text"])
    y = list(df["labels"])
    predictions, _ = model.predict(x)
    pearsonR = scipy.stats.pearsonr(predictions, y)
    # Name the split that was actually scored so train and test results
    # cannot be confused in the output.
    print(f"* Pearson Correlation on entire {key}-set: {round(100 * pearsonR.statistic, 2)}%")
    
# Score the model on the same split it was trained on.
evaluate_pearsonR("train")

  0%|          | 0/1701 [00:00<?, ?it/s]

  0%|          | 0/54 [00:00<?, ?it/s]

* Pearson Correlation on entire test-set: 98.92%


In [6]:
# evaluate on the separate test-set (not used for training or for optimization of hyperparameters)
evaluate_pearsonR("test")

  0%|          | 0/1002 [00:00<?, ?it/s]

  0%|          | 0/32 [00:00<?, ?it/s]

* Pearson Correlation on entire test-set: 73.61%
