## Emotion detection from text

In [2]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from transformers import BertTokenizerFast
from transformers import TrainingArguments, Trainer
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import EarlyStoppingCallback

In [2]:
train_df = pd.read_csv('train.tsv', sep='\t', names=['text', 'label', 'user'], usecols=[0,1])
test_df = pd.read_csv('test.tsv', sep='\t', names=['text', 'label', 'user'], usecols=[0,1])

In [3]:
train_df = train_df[~train_df['label'].str.contains(",")]
test_df = test_df[~test_df['label'].str.contains(",")]

In [4]:
'''
{
"anger": ["anger", "annoyance", "disapproval"],
"disgust": ["disgust"],
"fear": ["fear", "nervousness"],
"joy": ["joy", "amusement", "approval", "excitement", "gratitude",  "love", "optimism", "relief", "pride", "admiration", "desire", "caring"],
"sadness": ["sadness", "disappointment", "embarrassment", "grief",  "remorse"],
"surprise": ["surprise", "realization", "confusion", "curiosity"]
}
'''

'\n{\n"anger": ["anger", "annoyance", "disapproval"],\n"disgust": ["disgust"],\n"fear": ["fear", "nervousness"],\n"joy": ["joy", "amusement", "approval", "excitement", "gratitude",  "love", "optimism", "relief", "pride", "admiration", "desire", "caring"],\n"sadness": ["sadness", "disappointment", "embarrassment", "grief",  "remorse"],\n"surprise": ["surprise", "realization", "confusion", "curiosity"]\n}\n'

In [5]:
train_df.label.replace({'0':'joy', '1':'joy', '2':'anger', '3':'anger', '4':'joy', '5':'joy', '6':'surprise', '7':'surprise', '8':'joy', '9':'sadness', '10':'anger', '11':'disgust', '12':'sadness', '13':'joy', '14':'fear', '15':'joy', '16':'sadness', '17':'joy', '18':'joy', '19':'fear', '20':'joy', '21':'joy', '22':'surprise', '23':'joy', '24':'sadness', '25':'sadness', '26':'surprise', '27':'neutral'}, inplace=True)
test_df.label.replace({'0':'joy', '1':'joy', '2':'anger', '3':'anger', '4':'joy', '5':'joy', '6':'surprise', '7':'surprise', '8':'joy', '9':'sadness', '10':'anger', '11':'disgust', '12':'sadness', '13':'joy', '14':'fear', '15':'joy', '16':'sadness', '17':'joy', '18':'joy', '19':'fear', '20':'joy', '21':'joy', '22':'surprise', '23':'joy', '24':'sadness', '25':'sadness', '26':'surprise', '27':'neutral'}, inplace=True)

In [6]:
train_df.label.value_counts()

joy         12920
neutral     12823
anger        3878
surprise     3553
sadness      2121
fear          515
disgust       498
Name: label, dtype: int64

In [7]:
test_df.label.value_counts()

neutral     1606
joy         1603
anger        520
surprise     449
sadness      259
fear          77
disgust       76
Name: label, dtype: int64

In [8]:
train_df = train_df.groupby("label").sample(n=5000, random_state=663, replace=True).drop_duplicates()

In [9]:
train_df.label.value_counts()

neutral     4153
joy         4127
anger       2805
surprise    2676
sadness     1921
fear         514
disgust      497
Name: label, dtype: int64

In [10]:
from sklearn.preprocessing import OrdinalEncoder
enc = OrdinalEncoder()
enc.fit(train_df[['label']])
enc.categories_

[array(['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness',
        'surprise'], dtype=object)]

In [11]:
train_df[['label']] = enc.transform(train_df[['label']])
test_df[['label']] = enc.transform(test_df[['label']])

In [12]:
train_df["label"] = train_df["label"].astype(int)
test_df["label"] = test_df["label"].astype(int)

In [13]:
model_name = "bert-base-uncased"
max_length = 512

In [14]:
tokenizer = BertTokenizer.from_pretrained(model_name)

In [15]:
from transformers import BertTokenizerFast, BertForSequenceClassification

In [16]:
train_encodings = tokenizer(list(train_df.text), truncation=True, padding=True, max_length=max_length)
test_encodings = tokenizer(list(test_df.text), truncation=True, padding=True, max_length=max_length)

In [17]:
model=BertForSequenceClassification.from_pretrained(model_name, num_labels=len(enc.categories_[0]))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [12]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels=None):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        if self.labels:
            item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.encodings["input_ids"])

In [None]:
train_dataset = Dataset(train_encodings, list(train_df.label))
test_dataset = Dataset(test_encodings, list(test_df.label))

In [19]:
def compute_metrics(p):
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred, average='micro')
    precision = precision_score(y_true=labels, y_pred=pred, average='micro')
    f1 = f1_score(y_true=labels, y_pred=pred, average='micro')

    return {"accuracy": accuracy, "precision": precision, "recall": recall, "f1": f1}

In [20]:
training_args = TrainingArguments(
    output_dir="text_emotion_model/",
#     evaluation_strategy="epoch",
    evaluation_strategy="steps",
    eval_steps=250,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    seed=663,
    load_best_model_at_end=True,
    weight_decay=0.01,
    logging_steps=200
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
)

In [21]:
# torch.cuda.empty_cache()

In [22]:
trainer.train()

***** Running training *****
  Num examples = 16693
  Num Epochs = 10
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 2610


Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
250,1.189,0.89239,0.662963,0.662963,0.662963,0.662963
500,0.8398,0.929306,0.663181,0.663181,0.663181,0.663181
750,0.6868,1.035489,0.649237,0.649237,0.649237,0.649237
1000,0.2933,1.253104,0.631155,0.631155,0.631155,0.631155
1250,0.191,1.454075,0.638126,0.638126,0.638126,0.638126


***** Running Evaluation *****
  Num examples = 4590
  Batch size = 32
***** Running Evaluation *****
  Num examples = 4590
  Batch size = 32
Saving model checkpoint to text_emotion_model/checkpoint-500
Configuration saved in text_emotion_model/checkpoint-500/config.json
Model weights saved in text_emotion_model/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 4590
  Batch size = 32
***** Running Evaluation *****
  Num examples = 4590
  Batch size = 32
Saving model checkpoint to text_emotion_model/checkpoint-1000
Configuration saved in text_emotion_model/checkpoint-1000/config.json
Model weights saved in text_emotion_model/checkpoint-1000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 4590
  Batch size = 32


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from text_emotion_model/checkpoint-500 (score: 0.9293056130409241).


TrainOutput(global_step=1250, training_loss=0.5976902160644532, metrics={'train_runtime': 3138.7063, 'train_samples_per_second': 53.184, 'train_steps_per_second': 0.832, 'total_flos': 1.29845461376376e+16, 'train_loss': 0.5976902160644532, 'epoch': 4.79})

In [24]:
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [31]:
df = np.array(['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'surprise']).reshape(-1, 1)

enc_categories = enc.transform(df)

inverted = enc.inverse_transform(enc_categories)

for i,j in zip(enc_categories, inverted):
    print(i, j)

[0.] ['anger']
[1.] ['disgust']
[2.] ['fear']
[3.] ['joy']
[4.] ['neutral']
[5.] ['sadness']
[6.] ['surprise']


### making predictions

In [8]:
model_name = "bert-base-uncased"
max_length = 512
model_path = "text_emotion_model/checkpoint-1000"
model = BertForSequenceClassification.from_pretrained(model_path, num_labels=7) 
test_trainer = Trainer(model) 
tokenizer = BertTokenizer.from_pretrained(model_name)

loading configuration file text_emotion_model/checkpoint-1000/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "problem_type": "single_label_classification",
  

In [29]:
def get_emotion(text, trainer, model_path='text_emotion_model/checkpoint-1000', num_labels=7):
    if trainer is None:
        model = BertForSequenceClassification.from_pretrained(model_path, num_labels=num_labels) 
        trainer = Trainer(model) 
    emotion_dict = {0:'anger', 1:'disgust', 2:'fear', 3:'joy', 4:'neutral', 5:'sadness', 6:'surprise'}
    tokenized = tokenizer([text], padding=True, truncation=True, max_length=512) 
    raw_pred, raw_pred2, raw_pred3 = trainer.predict(Dataset(tokenized)) 
    print(raw_pred)
    print(raw_pred2)
    print(raw_pred3)
    y_pred = np.argmax(raw_pred, axis=1)
    return emotion_dict[y_pred[0]]

In [30]:
text = 'This is an interesting project'
print(text, '- emotion:', get_emotion(text, test_trainer))
text = 'This project sucks'
print(text, '- emotion:', get_emotion(text, test_trainer))

***** Running Prediction *****
  Num examples = 1
  Batch size = 8
***** Running Prediction *****
  Num examples = 1
  Batch size = 8


[[-0.60881066 -1.9285843  -1.5669314   6.0609436   0.25553012 -1.920858
   0.2618818 ]]
None
{'test_runtime': 0.1224, 'test_samples_per_second': 8.168, 'test_steps_per_second': 8.168}
This is an interesting project - emotion: joy
[[ 5.523139    0.24936825 -2.5073912  -1.4559844  -0.5487342   0.32804137
  -1.9395146 ]]
None
{'test_runtime': 0.0627, 'test_samples_per_second': 15.938, 'test_steps_per_second': 15.938}
This project sucks - emotion: anger


In [37]:
text = '''
no no no
'''

In [38]:
print(text, '- emotion:', get_emotion(text, test_trainer))

***** Running Prediction *****
  Num examples = 1
  Batch size = 8


[[ 4.3270106 -1.4724618 -0.6859344 -1.3424183  1.7190559 -1.5251224
  -1.6715395]]
None
{'test_runtime': 0.0912, 'test_samples_per_second': 10.963, 'test_steps_per_second': 10.963}

no no no
 - emotion: anger


In [41]:
np.sum([ 4.3270106, -1.4724618, -0.6859344, -1.3424183,  1.7190559, -1.5251224,
  -1.6715395])

-0.6514099