#**Exercise 5 - Sequence and Anger Regression using Transformers**#
#Part 2 Emotion Regression: How angry are you?

# Prepare the Environment

In [None]:
!pip install simpletransformers
from simpletransformers.classification import ClassificationModel, ClassificationArgs

Collecting simpletransformers
  Downloading simpletransformers-0.64.3-py3-none-any.whl (250 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m250.8/250.8 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
Collecting datasets (from simpletransformers)
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
Collecting seqeval (from simpletransformers)
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting wandb>=0.10.32 (from simpletransformers)
  Downloading wandb-0.16.0-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m39.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting streamlit (from simpletransformers)
  Downloading

In [None]:
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

# prepare the dataset

In [58]:
!gdown 1pCHvfO36_wI_AEc_LhjWVtprBALh_CcS

Downloading...
From: https://drive.google.com/uc?id=1pCHvfO36_wI_AEc_LhjWVtprBALh_CcS
To: /content/2018-EI-reg-En-anger-test-gold.txt
  0% 0.00/124k [00:00<?, ?B/s]100% 124k/124k [00:00<00:00, 109MB/s]


In [59]:
!gdown 1JgrpjG4JENGq4XDuYjNS9wAiJ0EfTfSx

Downloading...
From: https://drive.google.com/uc?id=1JgrpjG4JENGq4XDuYjNS9wAiJ0EfTfSx
To: /content/EI-reg-En-anger-train.txt
  0% 0.00/208k [00:00<?, ?B/s]100% 208k/208k [00:00<00:00, 133MB/s]


In [60]:
import pandas as pd

# Read the text file into a Pandas DataFrame
train = pd.read_csv('EI-reg-En-anger-train.txt', delimiter='\t')
test = pd.read_csv('2018-EI-reg-En-anger-test-gold.txt', delimiter='\t')

# Display the DataFrame
train.head()

Unnamed: 0,ID,Tweet,Affect Dimension,Intensity Score
0,2017-En-10264,@xandraaa5 @amayaallyn6 shut up hashtags are c...,anger,0.562
1,2017-En-10072,it makes me so fucking irate jesus. nobody is ...,anger,0.75
2,2017-En-11383,Lol Adam the Bull with his fake outrage...,anger,0.417
3,2017-En-11102,@THATSSHAWTYLO passed away early this morning ...,anger,0.354
4,2017-En-11506,@Kristiann1125 lol wow i was gonna say really?...,anger,0.438


In [61]:
# keep only the "text" and "score" columns in the dataset

train = train.rename(columns={'Tweet' : 'text', 'Intensity Score':'score'}).drop({'Affect Dimension','ID'}, axis=1)
test = test.rename(columns={'Tweet' : 'text', 'Intensity Score':'score'}).drop({'Affect Dimension','ID'}, axis=1)
train.head()

Unnamed: 0,text,score
0,@xandraaa5 @amayaallyn6 shut up hashtags are c...,0.562
1,it makes me so fucking irate jesus. nobody is ...,0.75
2,Lol Adam the Bull with his fake outrage...,0.417
3,@THATSSHAWTYLO passed away early this morning ...,0.354
4,@Kristiann1125 lol wow i was gonna say really?...,0.438


# preprocessing the data

In [62]:
# install the emoji library before running, if necessary
!pip install emoji



In [63]:
import re
import html
import emoji

import spacy
nlp = spacy.load('en_core_web_sm')

# remove emojis
def remove_emojis(text):
    return emoji.demojize(text).replace(":", ": ")

def handle_special_cases(text):
    new_text = text.replace("\\n", "")
    new_text = re.sub(r'(?<!\s)#', ' #', new_text) # add space before hashtags
    return new_text

# lowercasing
def lowercasing(text):
    return text.lower()

# handle HTML characters
def handle_html(text):
    return(html.unescape(text))

# tokenization & lemmatization
# along with digit, punctuation and stop word removal
def tokenize_lemmatize(text):
    doc = nlp(text)
    return [t.lemma_ for t in doc if (not t.is_digit) and (not t.is_punct) and not (t.is_stop)]

def text_preprocessing(text_df):
    # function assumes input is a dataframe with a 'text' column
    new_text_df = text_df.copy()
    new_text_df['text'] = new_text_df['text'].apply(remove_emojis)
    new_text_df['text'] = new_text_df['text'].apply(handle_html)
    new_text_df['text'] = new_text_df['text'].apply(handle_special_cases)
    new_text_df['text'] = new_text_df['text'].apply(lowercasing)
    new_text_df['text'] = new_text_df['text'].apply(tokenize_lemmatize)
    return new_text_df

In [64]:
# Apply preprocessing to all data (might take about 1min to run!)
test = text_preprocessing(test)
train = text_preprocessing(train)

In [65]:
# check preprocessed data

train.head()

Unnamed: 0,text,score
0,"[@xandraaa5, @amayaallyn6, shut, hashtag, cool...",0.562
1,"[make, fucking, irate, jesus, call, ppl, like,...",0.75
2,"[lol, adam, bull, fake, outrage]",0.417
3,"[@thatsshawtylo, pass, away, early, morning, f...",0.354
4,"[@kristiann1125, lol, wow, go, to, haha, see, ...",0.438


# Set the model

In [77]:
# creating a model on simpletransformers
model_args = ClassificationArgs(num_train_epochs=1, regression = True, manual_seed=42, train_batch_size=4, max_seq_length=128)

# Create a ClassificationModel
bert_model = ClassificationModel(
    "bert",
    "bert-base-cased",
    num_labels=1,
    args=model_args,
    use_cuda=True
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [78]:
#training the model

bert_model.train_model(train, output_dir='test_5')

  0%|          | 0/1701 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/426 [00:00<?, ?it/s]



(426, 0.04221889619254223)

# prediction

In [79]:
# model prediction on test dataset
bert_predictions, _ = bert_model.predict(test['text'].tolist())

  0%|          | 0/1002 [00:00<?, ?it/s]

  0%|          | 0/126 [00:00<?, ?it/s]

# Evaluation

In [82]:
# Calculate Pearson correlation coefficient and p-value
from scipy.stats import pearsonr

correlation_coefficient, p_value = pearsonr(bert_predictions, test['score'])

In [86]:
print(f"correlation_coefficient: {correlation_coefficient}, p_value: {p_value}")

correlation_coefficient: 0.2197896489377823, p_value: 1.9952357830675683e-12
