<a href="https://colab.research.google.com/github/neculaluana/Twitter-emotion-analysis/blob/main/emotion_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import numpy as np
import re
import torch
import plotly.express as px

from google.colab import drive
from datasets import Dataset, DatasetDict, Features, Value, ClassLabel
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
from transformers import DataCollatorWithPadding
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from transformers import pipeline

In [None]:
# The three CSV splits are read from the project's GitHub repository.
# The previous URLs carried hard-coded `?token=...` query parameters; access
# tokens are credentials and must not be committed to a notebook, so the files
# are now requested by their plain raw URLs.
# NOTE(review): this assumes the repository is publicly readable - confirm.
DATA_BASE_URL = 'https://raw.githubusercontent.com/neculaluana/Twitter-emotion-analysis/main/input'

df_train = pd.read_csv(f'{DATA_BASE_URL}/training.csv')
df_test = pd.read_csv(f'{DATA_BASE_URL}/test.csv')
df_valid = pd.read_csv(f'{DATA_BASE_URL}/validation.csv')

In [None]:
def clean_tweet(tweet):
  """Normalise a raw tweet into lower-case text made of letters and spaces only.

  Steps, in order:
    1. drop links (``http://`` / ``https://`` up to the next space),
    2. drop @mentions (up to the next space),
    3. drop the ``#`` symbol but keep the hashtag word,
    4. collapse a letter repeated three or more times to one ("heeeey" -> "hey"),
    5. spell out a stand-alone ``0`` as "zero" (it can carry emotional meaning),
    6. strip every remaining character that is not an ASCII letter or a space,
    7. lower-case the result.

  Args:
    tweet: the raw tweet; any non-string value is converted with ``str()``.

  Returns:
    The cleaned tweet as a ``str``. Whitespace is not collapsed, so removed
    tokens can leave leading or doubled spaces.
  """
  text = str(tweet)                                      # convert once instead of on every step
  text = re.sub(r'https?://[^ ]+', '', text)             # removes links
  text = re.sub(r'@[^ ]+', '', text)                     # removes mentions
  text = re.sub(r'#', '', text)                          # removes hashtag symbol
  text = re.sub(r'([A-Za-z])\1{2,}', r'\1', text)        # removes repeated characters ex: heeeeeeey
  # Must run BEFORE digits are stripped below, otherwise there is no 0 left to
  # match. The lookarounds match a 0 delimited by whitespace or the string
  # edges without consuming the whitespace, so neighbouring words stay apart.
  text = re.sub(r'(?<!\S)0(?!\S)', 'zero', text)         # transforms 0 to zero
  text = re.sub(r'[^A-Za-z ]', '', text)                 # removes unwanted characters and punctuation
  return text.lower()                                    # lower-casing

In [None]:
# Stack the three source splits into one frame (fresh 0..n-1 index, column
# order left as-is) so that the data can be cleaned once and re-split later.
all_splits = [df_train, df_valid, df_test]
df = pd.concat(all_splits, ignore_index=True, sort=False)

# Run every tweet through the text normaliser.
df["text"] = df["text"].apply(clean_tweet)
print(df)

In [None]:
# Class names, listed in the order of their integer label ids (0..5).
emotion_names = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
custom_features = Features({
    'text': Value(dtype='string'),
    'label': ClassLabel(names=emotion_names)
})

# Load the merged, cleaned frame as one dataset so it can be re-split below.
emotions_full_dataset = Dataset.from_pandas(df, features=custom_features)
data_column = emotions_full_dataset["text"]
label_column = emotions_full_dataset["label"]

# 60/20/20 split: keep 60% for training, then cut the held-out 40% in half to
# obtain validation and test sets.
# The second call must split the HELD-OUT part. Splitting the training part
# again (as this cell used to) makes validation and test subsets of the
# training data, so every reported metric would be measured on seen examples.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    data_column, label_column, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, random_state=42)

dataset = DatasetDict({"train": Dataset.from_dict({"text": X_train, "label": y_train}),
                        "validation": Dataset.from_dict({"text": X_val, "label": y_val}),
                        "test": Dataset.from_dict({"text": X_test, "label": y_test})})

In [None]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

def tokenize(batch):
    """Tokenize the ``text`` field of a batch, truncating over-long inputs.

    No padding is applied here on purpose: `data_collator` (a
    DataCollatorWithPadding) pads each training/evaluation batch to the length
    of its own longest sequence. Padding inside `map` as well would stretch
    every example to the longest sequence of the whole split and make the
    collator's dynamic padding pointless.

    Args:
        batch: a mapping with a ``'text'`` entry holding the tweets to encode.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(batch['text'], truncation=True)

tokenized_datasets = dataset.map(tokenize, batched=True)
# Pads each batch at collation time, using the same tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Google Drive is mounted because checkpoints, logs and the final model are
# written under /content/gdrive.
drive.mount('/content/gdrive')

# Prefer the GPU when the runtime has one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Pre-trained DistilBERT with a 6-way classification head, placed on `device`.
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=6)
model = model.to(device)

In [None]:
training_args = TrainingArguments(
    # --- where artefacts are written (on the mounted Google Drive) ---
    output_dir='/content/gdrive/MyDrive/EmotionAnalysis/checkpoints60',   # checkpoints
    logging_dir='/content/gdrive/MyDrive/EmotionAnalysis/logs60',         # training logs

    # --- optimisation ---
    num_train_epochs=3,                 # total number of training epochs
    learning_rate=2e-5,                 # learning rate
    warmup_steps=500,                   # warm-up steps for the learning rate scheduler
    weight_decay=0.01,                  # strength of weight decay

    # --- batching ---
    per_device_train_batch_size=64,     # batch size per device during training
    per_device_eval_batch_size=64,      # batch size per device during evaluation

    # --- logging / evaluation / checkpoint cadence ---
    logging_steps=10,                   # log every 10 steps
    evaluation_strategy="epoch",        # evaluate once per epoch (not per `logging_steps`)
    save_strategy="epoch",              # checkpoint once per epoch (not per `logging_steps`)
    save_total_limit=1,                 # cap on the number of checkpoints kept

    # --- best-model selection ---
    load_best_model_at_end=True,        # reload the best checkpoint when training finishes
    metric_for_best_model="accuracy",   # rank checkpoints by accuracy instead of the default loss
    greater_is_better=True,             # a higher accuracy is better
)

In [None]:
def compute_metrics(pred):
    """Score one evaluation pass with accuracy and weighted F1.

    Args:
        pred: object exposing ``label_ids`` (the reference labels) and
            ``predictions`` (per-class scores; the arg-max over the last axis
            is taken as the predicted class).

    Returns:
        dict with the keys ``"accuracy"`` and ``"f1"``.
    """
    true_ids = pred.label_ids
    predicted_ids = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(true_ids, predicted_ids),
        "f1": f1_score(true_ids, predicted_ids, average="weighted"),
    }

# Bundle the model, its hyper-parameters, the tokenized train/validation
# splits, the padding collator and the metric callback into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

# Fine-tune, then write the resulting model to Google Drive.
trainer.train()
trainer.save_model('/content/gdrive/MyDrive/EmotionAnalysis/models60')

In [None]:
# Final check: score the trained model on the held-out test split.
test_split = tokenized_datasets["test"]
result_eval = trainer.evaluate(test_split)
print(result_eval)

In [None]:
# The saved model is read from Google Drive, so the drive has to be mounted.
drive.mount('/content/gdrive')

# Inference pipeline: fine-tuned weights from Drive plus the stock DistilBERT
# tokenizer.
base_tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
classifier = pipeline(
    "text-classification",
    model="/content/gdrive/MyDrive/EmotionAnalysis/models60",
    tokenizer=base_tokenizer,
)

In [None]:
# Prediction for one tweet.
# Class names, listed in the order of their integer label ids (0..5).
emotion_names = ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']
# Translation table from generic pipeline labels ('LABEL_0' ... 'LABEL_5') to
# the emotion names above.
emotion_mapping = {f'LABEL_{idx}': name for idx, name in enumerate(emotion_names)}

# With return_all_scores=True the result holds, for the single input tweet,
# one {'label', 'score'} record per class.
predict_emotions = classifier("I can't believe someone would do such a thing", return_all_scores=True)
print(predict_emotions)

df_preds = pd.DataFrame.from_records(predict_emotions[0])
# Name each bar from the label the pipeline actually returned rather than from
# its position in the list, so the chart stays correct if the records ever
# arrive in a different order. Labels missing from the table are shown as-is.
df_preds['emotion'] = df_preds['label'].map(lambda label: emotion_mapping.get(label, label))
df_preds['score_pct'] = 100 * df_preds['score']
px.bar(df_preds, x='emotion', y='score_pct',
       labels={'emotion': 'emotion', 'score_pct': 'score (%)'},
       template='plotly_white')