<a href="https://colab.research.google.com/github/matchten/textmsg-analyzer/blob/main/textmsg_analyzer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install required packages
!pip install transformers
!pip install datasets --upgrade
!pip install evaluate
!pip install transformers[torch]
!pip install numpy
!pip install huggingface_hub

In [None]:
# Authenticate this notebook session with the Hugging Face Hub so the
# push_to_hub calls later in the notebook can upload the model and tokenizer.
from huggingface_hub import notebook_login

notebook_login()

In [22]:
from transformers.pipelines.pt_utils import KeyDataset
from datasets import concatenate_datasets, load_dataset
from datasets import Dataset, DatasetDict
import numpy as np
import sklearn

# Fine-tuning data: daily dialogues and their emotion sequences. Only the two
# columns needed downstream are kept, renamed to "text" and "label".
dialogue_data = (
    load_dataset("daily_dialog")
    .select_columns(["dialog", "emotion"])
    .rename_column("dialog", "text")
    .rename_column("emotion", "label")
)

def list_to_string(example):
    """Collapse the list of messages in ``example["text"]`` into one string.

    The messages are concatenated in their original order with no separator
    inserted between them. The example dict is updated in place and returned,
    so the function can be passed to ``Dataset.map``.
    """
    example["text"] = "".join(example["text"])
    return example

def normalize_labels(example):
    """Reduce a dialogue's per-message emotion ids to one sentiment label.

    Each id in ``example["label"]`` is first mapped to a sentiment class:
    ids 1, 2, 3 and 5 become 0, id 0 becomes 1, and every other id becomes 2.
    Class 0 is the one treated as "negative" later in the notebook; classes
    1 and 2 are presumably neutral and positive (the ids look like the
    DailyDialog emotion encoding -- confirm against the dataset card).

    The dialogue-level label is then:

    * 1 when every message maps to class 1 (this includes an empty sequence);
    * otherwise the mean of the remaining (non-1) classes decides:
      mean <= 2/3 gives 0, mean >= 4/3 gives 2, anything in between gives 1.

    Args:
        example: mapping whose ``"label"`` entry is a sequence of integer
            emotion ids.

    Returns:
        The same ``example`` object with ``"label"`` replaced by the
        dialogue-level class as ``np.int32``. The original id sequence is
        left untouched (it is no longer rewritten element by element).
    """
    negative_ids = {1, 2, 3, 5}

    # Build a fresh list instead of overwriting the caller's sequence in place.
    sentiments = [
        0 if emotion in negative_ids else 1 if emotion == 0 else 2
        for emotion in example["label"]
    ]

    # Class-1 messages carry no polarity, so they are excluded from the vote.
    polar_sentiments = [sentiment for sentiment in sentiments if sentiment != 1]

    if not polar_sentiments:
        normalized_emotion = 1
    else:
        average_emotion = np.average(polar_sentiments)
        if average_emotion <= 2/3:
            normalized_emotion = 0
        elif average_emotion < 4/3:
            normalized_emotion = 1
        else:
            normalized_emotion = 2

    example["label"] = np.int32(normalized_emotion)
    return example

# Flatten every dialogue to a single string, then reduce its emotion sequence
# to one label per dialogue.
dialogue_data = dialogue_data.map(list_to_string).map(normalize_labels)

# Oversample label 0: append three extra copies of the label-0 training rows
# to the training split (presumably to counter class imbalance).
negative_text = dialogue_data.filter(lambda example: example["label"] == 0)
dialogue_data["train"] = concatenate_datasets(
    [dialogue_data["train"]] + [negative_text["train"]] * 3
)

# Print the number of training rows for each of the three labels.
for label in range(3):
    print(dialogue_data["train"]["label"].count(label))

In [23]:
# Name the three dialogue splits used for fine-tuning. Nothing is combined at
# this point: an earlier variant concatenated a second dataset (`new_data`)
# onto the train/test splits, but only `dialogue_data` is used now.
finetune_train, finetune_val, finetune_test = (
    dialogue_data[split_name] for split_name in ("train", "validation", "test")
)

In [None]:
# Tokenizing data
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig

# Checkpoint to fine-tune. The tokenizer, config and sequence-classification
# model are all loaded from this one identifier; the model is requested with
# three output labels, matching the 0/1/2 labels produced above.
MODEL = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=3)

def tokenize_function(examples):
    """Tokenize the ``"text"`` entries of a batch with the notebook's ``tokenizer``.

    The tokenizer is called with ``padding="max_length"``, ``truncation=True``
    and ``max_length=512``, and is asked to return PyTorch tensors.
    """
    # NOTE(review): `return_tensors` is probably redundant when this runs
    # inside Dataset.map, which stores the result as dataset columns -- confirm.
    encoding_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 512,
        "return_tensors": 'pt',
    }
    return tokenizer(examples["text"], **encoding_options)

# Tokenize each split in batches, then shuffle it with a fixed seed so the
# row order is reproducible.
tokenized_finetune_train, tokenized_finetune_val, tokenized_finetune_test = (
    split.map(tokenize_function, batched=True).shuffle(seed=42)
    for split in (finetune_train, finetune_val, finetune_test)
)

# Print a summary of each tokenized split, one per line.
print(
    tokenized_finetune_train,
    tokenized_finetune_val,
    tokenized_finetune_test,
    sep="\n",
)

In [25]:
# Creating Model
import numpy as np
import evaluate
import accelerate
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    Args:
        eval_pred: a ``(logits, labels)`` pair; the predicted class for each
            row is the argmax of its logits over the last axis.

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)

    # Per-class scores are averaged with weights proportional to class support.
    weighted_scores = precision_recall_fscore_support(
        labels, predicted_classes, average='weighted'
    )
    precision, recall, f1 = weighted_scores[:3]

    return {
        'accuracy': accuracy_score(labels, predicted_classes),
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }

# `evaluation_strategy` was renamed to `eval_strategy` in newer transformers
# releases and the old spelling was later removed. The install cell does not
# pin a version, so use whichever keyword the installed TrainingArguments
# actually declares instead of failing with a TypeError.
_eval_strategy_kwarg = (
    "eval_strategy"
    if "eval_strategy" in TrainingArguments.__dataclass_fields__
    else "evaluation_strategy"
)

# One epoch of fine-tuning, evaluating on the validation split every
# `eval_steps` optimizer steps and pushing the result to the Hub.
# NOTE(review): eval_steps=5 triggers a full validation pass every 5 steps
# (the recorded run had 922 steps), which likely dominates the runtime;
# consider a larger value. The "No log" training-loss column in the recorded
# output suggests the logging interval was never reached between evaluations.
training_args = TrainingArguments(
    output_dir="text-message-analyzer-finetuned",
    eval_steps=5,
    num_train_epochs=1,
    per_device_train_batch_size=15,
    per_device_eval_batch_size=8,
    push_to_hub=True,
    **{_eval_strategy_kwarg: "steps"},
)

# Bundle the model, the training configuration, the metric callback and the
# train/validation splits into a Trainer. The test split is not passed here;
# it is only used for prediction after training.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_finetune_train,
    eval_dataset=tokenized_finetune_val,
)


In [26]:
# Run fine-tuning with the configuration above; the returned TrainOutput
# (global step, training loss, runtime metrics) is shown as the cell result.
trainer.train()

Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
5,No log,0.866567,0.589,0.590262,0.610382,0.589
10,No log,0.759624,0.661,0.659042,0.66032,0.661
15,No log,1.178285,0.521,0.524374,0.724187,0.521
20,No log,0.890905,0.615,0.631751,0.691022,0.615
25,No log,0.799527,0.666,0.674278,0.69182,0.666
30,No log,0.769942,0.65,0.658548,0.693536,0.65
35,No log,0.734438,0.662,0.669118,0.685719,0.662
40,No log,0.732556,0.654,0.667526,0.703576,0.654
45,No log,0.960775,0.603,0.570456,0.721139,0.603
50,No log,0.859287,0.628,0.633797,0.726168,0.628


TrainOutput(global_step=922, training_loss=0.6292309874826295, metrics={'train_runtime': 6826.5258, 'train_samples_per_second': 2.024, 'train_steps_per_second': 0.135, 'total_flos': 3635701206165504.0, 'train_loss': 0.6292309874826295, 'epoch': 1.0})

In [31]:
# Run the fine-tuned model on the held-out test split. The displayed
# PredictionOutput contains the per-class scores (`predictions`) and the
# reference labels (`label_ids`).
trainer.predict(tokenized_finetune_test)

PredictionOutput(predictions=array([[-2.3932734 ,  2.1236422 ,  0.17131135],
       [-2.7597926 ,  3.500655  , -0.53153944],
       [-3.403019  ,  1.0126785 ,  2.275654  ],
       ...,
       [-3.4283621 ,  1.296931  ,  2.0177114 ],
       [ 0.5950537 ,  1.1233617 , -1.9834183 ],
       [-2.3246453 ,  3.321121  , -0.8039319 ]], dtype=float32), label_ids=array([1, 1, 2, 2, 1, 2, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 0, 2, 1, 1, 1, 2,
       2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 2, 2, 1, 0, 1, 1, 2,
       1, 2, 0, 2, 2, 2, 1, 2, 2, 2, 1, 1, 1, 1, 1, 0, 1, 0, 0, 2, 1, 1,
       2, 1, 1, 1, 2, 1, 2, 1, 2, 2, 1, 1, 1, 1, 2, 1, 2, 1, 0, 1, 1, 0,
       0, 1, 1, 2, 1, 2, 2, 1, 1, 2, 2, 2, 2, 0, 1, 0, 2, 2, 1, 1, 1, 2,
       1, 1, 2, 1, 2, 2, 2, 1, 0, 2, 1, 1, 0, 2, 1, 2, 2, 2, 1, 2, 0, 1,
       2, 2, 1, 2, 0, 1, 2, 2, 1, 2, 2, 0, 2, 2, 2, 2, 1, 2, 2, 1, 0, 2,
       1, 1, 1, 1, 2, 0, 0, 2, 1, 2, 2, 1, 0, 2, 2, 0, 1, 1, 2, 1, 2, 1,
       1, 0, 2, 2, 1, 2, 2, 1, 1, 2, 1, 1, 2, 1, 2, 2, 1, 2,

In [33]:
# Upload the fine-tuned model through the Trainer, then upload the tokenizer
# explicitly, since the Trainer above was constructed without it.
# NOTE(review): the repository id is hard-coded; it is presumably the same
# repository the Trainer pushes to -- confirm before running under another account.
trainer.push_to_hub()
tokenizer.push_to_hub("matchten/text-message-analyzer-finetuned")

CommitInfo(commit_url='https://huggingface.co/matchten/text-message-analyzer-finetuned/commit/939033535f7abd9be2a46cfd01e7e48ce9d3c069', commit_message='Upload tokenizer', commit_description='', oid='939033535f7abd9be2a46cfd01e7e48ce9d3c069', pr_url=None, pr_revision=None, pr_num=None)