In [2]:
from datasets import load_dataset, load_metric
from transformers import (AutoTokenizer, GPT2Tokenizer, AutoModelForSequenceClassification, 
                          TrainingArguments, Trainer, AutoModelForCausalLM, GPT2LMHeadModel, AutoModelWithLMHead,
                         DataCollatorForLanguageModeling, pipeline)
import torch
import wandb
import numpy as np
import os
import tensorflow as tf

# Preparing data (the "emotion" dataset of Twitter messages)

In [4]:
# Load the "emotion" dataset via Hugging Face `datasets` (reused from the local
# cache when present). Each record has a `text` string and an integer `label`.
emotion_dataset = load_dataset("emotion")

Using custom data configuration default
Reusing dataset emotion (/home/avsorokina/.cache/huggingface/datasets/emotion/default/0.0.0/348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705)


  0%|          | 0/3 [00:00<?, ?it/s]

In [30]:
# Word prepended to each text, keyed by the dataset's integer class id.
# Every prefix ends with a space so it is separated from the original text.
EMOTION_PREFIXES = {
    0: 'sadness ',
    1: 'joy ',
    2: 'love ',
    3: 'anger ',
    4: 'fear ',
    5: 'surprise ',
}


def add_emotion_prefix(record):
    """Return the updated ``text`` field for one record.

    The new text is the emotion word for ``record['label']`` followed by the
    original text. Any label not listed in ``EMOTION_PREFIXES`` falls back to
    ``'surprise '``, which is what the final ``else`` branch of the previous
    nested conditional did.
    """
    prefix = EMOTION_PREFIXES.get(record['label'], 'surprise ')
    return {'text': prefix + record['text']}


# NOTE: this overwrites `emotion_dataset`, so running the cell a second time on
# the already-prefixed data prepends the emotion word again.
emotion_dataset = emotion_dataset.map(add_emotion_prefix)

Loading cached processed dataset at /home/avsorokina/.cache/huggingface/datasets/emotion/default/0.0.0/348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705/cache-0725efdf174a2254.arrow
Loading cached processed dataset at /home/avsorokina/.cache/huggingface/datasets/emotion/default/0.0.0/348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705/cache-e834b2ee31382d6e.arrow
Loading cached processed dataset at /home/avsorokina/.cache/huggingface/datasets/emotion/default/0.0.0/348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705/cache-500c8bcbfa97f2d6.arrow


In [31]:
# Sanity check: the first training record should now start with its emotion word.
emotion_dataset['train'][0]

{'text': 'sadness i didnt feel humiliated', 'label': 0}

In [27]:
# Tokenizer for the same "gpt2" checkpoint that is fine-tuned further down.
tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [28]:
# Use the end-of-sequence token for padding so that `padding="max_length"` in
# tokenize_func has a pad token available.
# NOTE(review): presumably needed because the GPT-2 tokenizer ships without a
# pad token of its own — confirm.
tokenizer.pad_token = tokenizer.eos_token

In [29]:
def tokenize_func(record):
    """Tokenize ``record["text"]`` into a fixed-length encoding.

    The text is truncated to at most 128 tokens and padded up to exactly
    128 tokens, so every example ends up with the same length.
    """
    encoding = tokenizer(
        record["text"],
        max_length=128,
        truncation=True,
        padding="max_length",
    )
    return encoding

In [32]:
# Tokenize every split. `batched=True` passes tokenize_func whole batches of
# texts instead of one record at a time, which is considerably faster; the
# resulting encodings are the same because every example is padded/truncated
# to the fixed max_length regardless of its batch.
emotion_dataset_tokenized = emotion_dataset.map(tokenize_func, batched=True)

  0%|          | 0/16000 [00:00<?, ?ex/s]

  0%|          | 0/2000 [00:00<?, ?ex/s]

  0%|          | 0/2000 [00:00<?, ?ex/s]

In [33]:
# Reshape the tokenized splits for training:
#   1. drop the raw `text` column (only the token ids are needed from here on),
#   2. rename the class-id column `label` to `labels`,
#   3. have the dataset return torch tensors.
# NOTE(review): DataCollatorForLanguageModeling(mlm=False) builds `labels` from
# `input_ids`, so this class-id column is presumably overwritten at batch time —
# confirm whether it is still needed.
emotion_dataset_tokenized = (
    emotion_dataset_tokenized
    .remove_columns(['text'])
    .rename_column('label', 'labels')
    .with_format('torch')
)


In [34]:
# Pick the splits used below. Note that the evaluation set is taken from the
# "test" split.
emotion_train_dataset = emotion_dataset_tokenized["train"]
emotion_eval_dataset = emotion_dataset_tokenized["test"]

# Modelling: fine-tuning GPT-2 on the emotion-prefixed texts

In [35]:
# Training configuration handed to the Trainer below.
# Checkpoints are written to ./emotions every `save_steps` optimisation steps and
# the training loss is logged every `logging_steps` steps.
# NOTE(review): no `save_total_limit` is set, so every checkpoint appears to be
# kept on disk — confirm this is intended for a long run.
training_args = TrainingArguments(
    output_dir="./emotions",
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    logging_steps=500,
    save_steps=500,
)

## Full fine-tuning (all GPT-2 weights)

In [36]:
# Load pretrained GPT-2 with its language-modelling head.
# AutoModelForCausalLM replaces the deprecated AutoModelWithLMHead; for "gpt2"
# it resolves to GPT2LMHeadModel.
model = AutoModelForCausalLM.from_pretrained("gpt2")
# Tell the model which id is used for padding (the tokenizer reuses EOS as pad).
model.config.pad_token_id = model.config.eos_token_id

# mlm=False selects causal (next-token) language modelling rather than masked LM.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=emotion_train_dataset)
trainer.train()

# Persist the fine-tuned weights together with the tokenizer so the directory
# can be reloaded with from_pretrained().
trainer.save_model("emotion_fine_tuned/gpt2_full")
tokenizer.save_pretrained("emotion_fine_tuned/gpt2_full")

***** Running training *****
  Num examples = 16000
  Num Epochs = 10
  Instantaneous batch size per device = 2
  Total train batch size (w. parallel, distributed & accumulation) = 2
  Gradient Accumulation steps = 1
  Total optimization steps = 80000
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
[34m[1mwandb[0m: Currently logged in as: [33mnastyatrvl[0m (use `wandb login --relogin` to force relogin)


Step,Training Loss
500,4.2284
1000,3.9418
1500,3.8591
2000,3.884
2500,3.8056
3000,3.7926
3500,3.7385
4000,3.7146
4500,3.7725
5000,3.7463


Saving model checkpoint to ./emotions/checkpoint-500
Configuration saved in ./emotions/checkpoint-500/config.json
Model weights saved in ./emotions/checkpoint-500/pytorch_model.bin
Saving model checkpoint to ./emotions/checkpoint-1000
Configuration saved in ./emotions/checkpoint-1000/config.json
Model weights saved in ./emotions/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to ./emotions/checkpoint-1500
Configuration saved in ./emotions/checkpoint-1500/config.json
Model weights saved in ./emotions/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to ./emotions/checkpoint-2000
Configuration saved in ./emotions/checkpoint-2000/config.json
Model weights saved in ./emotions/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to ./emotions/checkpoint-2500
Configuration saved in ./emotions/checkpoint-2500/config.json
Model weights saved in ./emotions/checkpoint-2500/pytorch_model.bin
Saving model checkpoint to ./emotions/checkpoint-3000
Configuration saved in ./emot

('emotion_fine_tuned/gpt2_full/tokenizer_config.json',
 'emotion_fine_tuned/gpt2_full/special_tokens_map.json',
 'emotion_fine_tuned/gpt2_full/vocab.json',
 'emotion_fine_tuned/gpt2_full/merges.txt',
 'emotion_fine_tuned/gpt2_full/added_tokens.json',
 'emotion_fine_tuned/gpt2_full/tokenizer.json')

In [38]:
# Reload the fine-tuned tokenizer and model from disk.
# AutoModelForCausalLM replaces the deprecated AutoModelWithLMHead; the saved
# config's architecture is GPT2LMHeadModel.
tokenizer = AutoTokenizer.from_pretrained("emotion_fine_tuned/gpt2_full")
model = AutoModelForCausalLM.from_pretrained("emotion_fine_tuned/gpt2_full")

Didn't find file emotion_fine_tuned/gpt2_full/added_tokens.json. We won't load it.
loading file emotion_fine_tuned/gpt2_full/vocab.json
loading file emotion_fine_tuned/gpt2_full/merges.txt
loading file emotion_fine_tuned/gpt2_full/tokenizer.json
loading file None
loading file emotion_fine_tuned/gpt2_full/special_tokens_map.json
loading file emotion_fine_tuned/gpt2_full/tokenizer_config.json
loading configuration file emotion_fine_tuned/gpt2_full/config.json
Model config GPT2Config {
  "_name_or_path": "emotion_fine_tuned/gpt2_full",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "pad_token_id": 50256,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale

In [39]:
# Text-generation pipeline wrapping the fine-tuned model and its tokenizer.
gpt2_finetune_pipe = pipeline('text-generation', model=model, tokenizer=tokenizer)

In [40]:
# Sentence openers that the fine-tuned model is asked to continue.
# Note: "I am " carries a trailing space, unlike the other prompts.
test_prompts = [
    "I am ",
    "I think that",
    "I like",
    "I don't like",
    "I want",
    "My dream is",
]

In [41]:
# Control words prepended to each prompt; each ends with a space so it is
# separated from the prompt that follows it.
emotions = [
    'sadness ',
    'joy ',
    'love ',
    'anger ',
    'fear ',
    'surprise ',
]

In [42]:
# Reference emotion classifier (DistilBERT) used to score the generated text.
# return_all_scores=True is relied on by the loop below, which sorts the
# per-label scores of the first result to pick the top one.
classifier = pipeline("text-classification",model='bhadresh-savani/distilbert-base-uncased-emotion', return_all_scores=True)

loading configuration file https://huggingface.co/bhadresh-savani/distilbert-base-uncased-emotion/resolve/main/config.json from cache at /home/avsorokina/.cache/huggingface/transformers/690674b44bd5b1a7ef81fea02641d3b53827649f92ae54381924832f1edefaac.49a3ba1a12c5b0c12c1f5d39ce0fc262dc3810bdc41be4d875eaf3181375d3f3
Model config DistilBertConfig {
  "_name_or_path": "bhadresh-savani/distilbert-base-uncased-emotion",
  "activation": "gelu",
  "architectures": [
    "DistilBertForSequenceClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "sadness",
    "1": "joy",
    "2": "love",
    "3": "anger",
    "4": "fear",
    "5": "surprise"
  },
  "initializer_range": 0.02,
  "label2id": {
    "anger": 3,
    "fear": 4,
    "joy": 1,
    "love": 2,
    "sadness": 0,
    "surprise": 5
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout"

In [43]:
# Second reference classifier (EmoRoBERTa) used as an additional opinion.
# return_all_scores=True is relied on by the loop below, which sorts the
# per-label scores of the first result to pick the top one.
classifier2 = pipeline("text-classification",model='arpanghoshal/EmoRoBERTa', return_all_scores=True)

loading configuration file https://huggingface.co/arpanghoshal/EmoRoBERTa/resolve/main/config.json from cache at /home/avsorokina/.cache/huggingface/transformers/9c70675f5e9090fae783ea1880a8e785e68bda247a0ae8fb5cd1bbdc86cbc4c7.cd7e8a4fe6807ed49afe99d5a84290c6c94c4855bef92225d127edbbf327a2c9
Model config RobertaConfig {
  "_name_or_path": "arpanghoshal/EmoRoBERTa",
  "architectures": [
    "RobertaForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "admiration",
    "1": "amusement",
    "2": "anger",
    "3": "annoyance",
    "4": "approval",
    "5": "caring",
    "6": "confusion",
    "7": "curiosity",
    "8": "desire",
    "9": "disappointment",
    "10": "disapproval",
    "11": "disgust",
    "12": "embarrassment",
    "13": "excitement",
    "14": "fear",


In [44]:
# Decoding settings shared by every generation call (hoisted out of the loop).
# do_sample is set explicitly: top_k / top_p / temperature only take effect when
# sampling, and previously this relied on whatever the model config enabled.
generation_kwargs = dict(
    do_sample=True,
    top_k=50,
    top_p=0.95,
    no_repeat_ngram_size=3,
    repetition_penalty=1.2,
    max_length=50,
    temperature=0.7,
)


def rank_by_score(label_scores):
    """Return the ``{'label', 'score'}`` entries ordered from highest to lowest score."""
    return sorted(label_scores, key=lambda d: d['score'], reverse=True)


# For every emotion word, continue each prompt with the fine-tuned model and
# print the continuation together with the top label from both classifiers.
for emotion in emotions:
    print(emotion, 2*"\n")
    for prompt in test_prompts:
        res = gpt2_finetune_pipe(text_inputs=emotion + prompt, **generation_kwargs)
        # Drop the leading emotion word so the classifiers only judge the
        # prompt and its continuation, not the control word itself.
        res = res[0]['generated_text'].split(' ', 1)[1]
        # Each classifier returns one list of per-label scores per input text.
        prediction1 = rank_by_score(classifier(res)[0])
        prediction2 = rank_by_score(classifier2(res)[0])
        print(res, 2*"\n", prediction1[0], 2*"\n", prediction2[0])
        print(100*'-')

sadness  


I am ive been feeling so depressed lately that i want to quit my job and cross the finish line somewhere in between episodes of housewife drama or sitcom limbo where everything just feels resolved and thrown into place like a thrown jigsaw which 

 {'label': 'sadness', 'score': 0.999117910861969} 

 {'label': 'disappointment', 'score': 0.7120047807693481}
----------------------------------------------------------------------------------------------------
I think that i shouldnt have left this place feeling humiliated and alone so i did what every mother does when she has a child who is born without her or no one but the thought of leaving was making me feel stressed out worried upset 

 {'label': 'sadness', 'score': 0.9989664554595947} 

 {'label': 'disappointment', 'score': 0.7229012250900269}
----------------------------------------------------------------------------------------------------
I like i was on top of my game and i just kind of lost myself feeling unhappy abo