In [None]:
import os
from google.colab import drive
drive.mount('/content/drive')
os.chdir('drive/My Drive/Colab Notebooks/EECS595/final_proj')
!ls

Mounted at /content/drive
emoji_bert_sentiment_analysis.ipynb  logs4			res
emotag-dimension-as-emojis.txt	     rehydrated_tweets_parquet	Untitled0.ipynb


In [None]:
import pandas as pd
from transformers import BertTokenizerFast, BertModel, Trainer, TrainingArguments
import torch
from torch.utils.data import Dataset
from torch import nn
import emoji as emo
from tqdm import tqdm
import concurrent.futures
import multiprocessing as mp
import time
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from collections import Counter
import unicodedata as ud
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from itertools import repeat
import numpy as np
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

In [None]:
# Random seed for reproducibility; passed as `random_state` to train_test_split below.
SEED = 595

# Helpful Links
- [`DOCS` emoji package docs](https://carpedm20.github.io/emoji/docs/)
- [`HUGGINGFACEFORUM` Adding new tokens while preserving tokenization of adjacent tokens](https://discuss.huggingface.co/t/adding-new-tokens-while-preserving-tokenization-of-adjacent-tokens/12604)
- [`MEDIUM` NLP | How to add a domain-specific vocabulary (new tokens) to a subword tokenizer already trained like BERT WordPiece](https://medium.com/@pierre_guillou/nlp-how-to-add-a-domain-specific-vocabulary-new-tokens-to-a-subword-tokenizer-already-trained-33ab15613a41)
- [`KAGGLE_CODE` Tutorial: How to add a new token to Bert](https://www.kaggle.com/code/xhlulu/tutorial-how-to-add-a-new-token-to-bert)
- [`KAGGLE_CODE` Sentiment Classification | BERT | HuggingFace](https://www.kaggle.com/code/ahmedabdulhamid/sentiment-classification-bert-huggingface/notebook)
- [`MEDIUM` Twitter Sentiment Analysis with Deep Learning using BERT and Hugging Face](https://medium.com/mlearning-ai/twitter-sentiment-analysis-with-deep-learning-using-bert-and-hugging-face-830005bcdbbf)

# BERT Emoji Model Pipeline
1. Load data
2. Split into train and test sets
3. Get emojis from emotag 620
4. Get unique emoticons in train `emojis` col. (We don't have a way of finding emoticons in the text)
5. Demojize train & test sets, also emotag 620 emojis
6. Create a dict for emoticons and apply a similar "demojize" operation in the same fashion as 5.
7. Add demojized emotag 620 emojis and emoticons to tokenizer and resize BERT
8. Perform sentiment analysis

# 1. Load Data

In [None]:
# Root directory that is walked recursively to collect the .parquet files.
parquet_dir = './rehydrated_tweets_parquet'

Get the paths of all parquet files

In [None]:
# Recursively collect every non-hidden *.parquet file under `parquet_dir`.
# os.walk yields entries in arbitrary filesystem order, so the paths are sorted
# to make any later positional subsetting of the list reproducible across runs.
parquet_list = sorted(
    os.path.join(root, file)
    for root, _, files in os.walk(parquet_dir)
    for file in files
    if not file.startswith('.') and file.endswith('.parquet')
)

### Subset Data
We temporarily subset the data for quick testing and preliminary results.

In [None]:
# Keep only the first 2 parquet files so that test runs stay small;
# everything downstream works on this reduced list.
parquet_list = parquet_list[:2]

Load subset and apply basic pre-processing

In [None]:
# Read each parquet file (with a progress bar), stack the frames, then
# round-trip the index through a column so that it is labelled 'index'.
df = (
    pd.concat(map(pd.read_parquet, tqdm(parquet_list)))
    .reset_index()
    .set_index('index')
)

100%|██████████| 2/2 [00:00<00:00, 37.20it/s]


In [None]:
df

Unnamed: 0_level_0,tweet_date,tweet_id,sentiment,emojis,tweet_day,tweet_month,tweet_year,tweet_year_month,tweet_text
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,2019-04-08,1115139461875097600,1,😘,8,4,2019,2019-04,RT @tpagon: ระดับเดซิเบลของเสียงต่างๆ https://...
1,2019-04-08,1115139461875097600,1,😘,8,4,2019,2019-04,GOT7 fighting😘😘😘\n#GOT7 \n#BBMAs\n@GOT7Official
2,2019-04-08,1115139461912731648,1,🤗,8,4,2019,2019-04,I love it when ppl come to me to help with dog...
5,2019-04-08,1115139512231792640,1,😆,8,4,2019,2019-04,Aaaw you're a hella sweet bros😆
6,2019-04-08,1115139516438724608,1,😍,8,4,2019,2019-04,That booty😍😍
...,...,...,...,...,...,...,...,...,...
33970,2019-04-22,1120567851780378624,0,":(,:(",22,4,2019,2019-04,@Psycopathhh_ noted ate :(
33971,2019-04-22,1120567855978811392,1,😝,22,4,2019,2019-04,"Craving something sweet, like donuts😝😝"
33973,2019-04-22,1120567885305245696,1,😘,22,4,2019,2019-04,@paraadonat Miss you too 😘😂
33974,2019-04-22,1120567902095249408,0,😬,22,4,2019,2019-04,shidd mike b never even had one😬


# 2. Split into train and test sets

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
train_df, test_df = train_test_split(
    df,
    random_state=SEED,
    test_size=0.2,
)

In [None]:
train_df

Unnamed: 0_level_0,tweet_date,tweet_id,sentiment,emojis,tweet_day,tweet_month,tweet_year,tweet_year_month,tweet_text
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
4086,2019-04-22,1120278935525150720,0,"😭,😭",22,4,2019,2019-04,@jeonierals FCK PIPAY. FCK! 😭
13895,2019-04-08,1115434778654859264,0,😞,8,4,2019,2019-04,Those late night thoughts 😞
8045,2019-04-22,1120323957163548672,0,😩,22,4,2019,2019-04,“EVERYBODY AIN’T PROUD OF U🎯\nThey just surpri...
27610,2019-04-22,1120500289902120960,0,🙃,22,4,2019,2019-04,@stefanypackk @SsssamanthaaMUA I just started ...
17083,2019-04-08,1115470434420396032,1,"😌,😌",8,4,2019,2019-04,"so help me, help you 😌🤝"
...,...,...,...,...,...,...,...,...,...
15400,2019-04-08,1115450884782161920,1,😍,8,4,2019,2019-04,@podgie_TV oh my goodness😍
22619,2019-04-22,1120449773738315776,0,😬,22,4,2019,2019-04,RT @LaQuica91: aretter de faire sortir votre c...
22103,2019-04-22,1120444555986386944,0,😫,22,4,2019,2019-04,@chandlerrkentt_ My baby 😫😫😫😫 fuck I love him
11629,2019-04-08,1115409898043576320,1,"😌,😌",8,4,2019,2019-04,All I can do is smile 😌😌


# 3. Get emojis from emotag 620
We use emotag 620 emojis instead of all emojis in our dataset to avoid catastrophic forgetting when adding too many tokens. 

Citation:

["When the datasets for pre-training and fine-tuning are too distant, for example
two completely different languages, the weights of the model can revert to a random state and
lead to the observation of the phenomena called catastrophic forgetting."](https://essay.utwente.nl/80128/1/Yeung_InteractionTechnology_EEMCS.pdf)

In [None]:
# Text file with the EmoTag emoji entries; it is read and converted to emoji
# characters in the following cells.
emotag620_path = './emotag-dimension-as-emojis.txt'

In [None]:
# Read the whitespace-separated EmoTag entries and keep them as a list.
# The encoding is stated explicitly so the result does not depend on the
# platform's default locale.
with open(emotag620_path, 'r', encoding='utf-8') as f:
    emotag620_str = f.read()
# Drop the literal 'word' token (presumably a column header -- TODO confirm).
# Filtering, unlike list.remove, does not raise when the token is absent.
emotag620_list = [token for token in emotag620_str.split() if token != 'word']

[Convert a Unicode code-point string (e.g. `1f600`) to the emoji character it represents](https://stackoverflow.com/questions/69133311/unicode-for-an-emoji-in-a-string-variable-isnt-shown-as-the-emoji)

In [None]:
def unicode_to_emoji(unicode_str):
    """Convert a hexadecimal Unicode code point string to its character.

    Args:
        unicode_str: Code point written in hex, e.g. ``"1f600"``. Surrounding
            whitespace and an optional ``U+`` prefix (any case) are accepted.

    Returns:
        The one-character string for that code point.

    Raises:
        ValueError: If the text is not valid hexadecimal or is not a valid
            Unicode code point.
    """
    hex_digits = unicode_str.strip()
    if hex_digits[:2].upper() == "U+":
        hex_digits = hex_digits[2:]
    # int() already ignores leading zeros, so no zero-padding is needed.
    return chr(int(hex_digits, 16))

# Turn every hex code point entry into the emoji character it denotes.
emotag620_list = list(map(unicode_to_emoji, emotag620_list))

In [None]:
emotag620_list[:5]

['🀄', '🃏', '🌀', '🌁', '🌂']

# 4. Get unique emoticons in train `emojis` column
(We don't have a way of finding emoticons in the text)

In [None]:
# Collect every distinct entry of the train `emojis` column that the emoji
# package does not recognise as an emoji; these are treated as emoticons.
# Entries are comma-separated; str.split(',') also covers the single-entry case.
emoticons = {
    candidate
    for emoji_str in train_df['emojis'].values
    for candidate in emoji_str.split(',')
    # Skip empty pieces (e.g. from a trailing comma): an empty "emoticon" would
    # match at every position of a tweet when emoticons are replaced later.
    if candidate and not emo.is_emoji(candidate)
}
# Sort so that the emoticon numbering derived from this list stays the same
# between kernel restarts (iteration order of a set of strings is not stable).
emoticons = sorted(emoticons)

In [None]:
emoticons[:5]

[":')", ';)', 'D;', ':\\', '=]']

# 5. Demojize train & test sets, also emotag 620 emojis

In [None]:
# Replace every emoji in the raw tweet text with its ":name:" alias.
# .assign builds a new frame instead of writing a column into the objects
# returned by train_test_split, which can trigger pandas'
# SettingWithCopyWarning; the cell also stays safe to re-run.
train_df = train_df.assign(demojized_tweet_text=train_df['tweet_text'].apply(emo.demojize))
test_df = test_df.assign(demojized_tweet_text=test_df['tweet_text'].apply(emo.demojize))

In [None]:
train_df.head(3)

Unnamed: 0_level_0,tweet_date,tweet_id,sentiment,emojis,tweet_day,tweet_month,tweet_year,tweet_year_month,tweet_text,demojized_tweet_text
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
4086,2019-04-22,1120278935525150720,0,"😭,😭",22,4,2019,2019-04,@jeonierals FCK PIPAY. FCK! 😭,@jeonierals FCK PIPAY. FCK! :loudly_crying_face:
13895,2019-04-08,1115434778654859264,0,😞,8,4,2019,2019-04,Those late night thoughts 😞,Those late night thoughts :disappointed_face:
8045,2019-04-22,1120323957163548672,0,😩,22,4,2019,2019-04,“EVERYBODY AIN’T PROUD OF U🎯\nThey just surpri...,“EVERYBODY AIN’T PROUD OF U:bullseye:\nThey ju...


In [None]:
# Alias form (":name:") of every EmoTag emoji, in the same order as the emoji list.
demojized_emotag620 = list(map(emo.demojize, emotag620_list))

In [None]:
demojized_emotag620[:5]

[':mahjong_red_dragon:',
 ':joker:',
 ':cyclone:',
 ':foggy:',
 ':closed_umbrella:']

# 6. Create a dict for emoticons and apply a similar "demojize" operation in the same fashion as 5.

In [None]:
# Give each emoticon a numbered placeholder (":emoticon1:", ":emoticon2:", ...)
# and keep the emoticon -> placeholder mapping for the replacement step.
demojize_emoticon_names = [f":emoticon{n}:" for n in range(1, len(emoticons) + 1)]
emoticon_dict = dict(zip(emoticons, demojize_emoticon_names))

In [None]:
dict(list(emoticon_dict.items())[:5])

{":')": ':emoticon1:',
 ';)': ':emoticon2:',
 'D;': ':emoticon3:',
 ':\\': ':emoticon4:',
 '=]': ':emoticon5:'}

In [None]:
def replace_emoticon_with_name(tweet, emoticon_dict):
    """Replace each emoticon in ``tweet`` with its placeholder name.

    The text is scanned once, left to right. At each position the longest
    matching emoticon wins, so ``":))"`` is not broken up by ``":)"``, and
    text produced by a replacement is never scanned again. Colon-delimited
    aliases made of word characters, such as ``":pensive_face:"`` or
    ``":emoticon7:"``, are left untouched so that an emoticon like ``":p"``
    cannot corrupt them. Aliases containing other punctuation are not
    recognised by this guard.

    Args:
        tweet: Text to process.
        emoticon_dict: Mapping from emoticon to replacement string. Empty
            keys are ignored.

    Returns:
        The tweet with every emoticon occurrence replaced.
    """
    import re  # local import keeps this cell self-contained

    # Longest first: regex alternation takes the first alternative that matches.
    emoticon_keys = sorted((key for key in emoticon_dict if key), key=len, reverse=True)
    if not emoticon_keys:
        return tweet

    # An alias starts with a letter or digit and has at least three characters
    # between its colons; this keeps emoticon runs such as ":-D:-D" replaceable.
    alias = r":[A-Za-z0-9][\w\-]{2,}:"
    emoticon_alternatives = "|".join(re.escape(key) for key in emoticon_keys)
    pattern = re.compile(f"(?P<alias>{alias})|(?P<emoticon>{emoticon_alternatives})")

    def _substitute(match):
        found = match.group("emoticon")
        # An alias match leaves `found` as None: hand the text back unchanged.
        return match.group(0) if found is None else emoticon_dict[found]

    return pattern.sub(_substitute, tweet)

In [None]:
# Second normalisation pass: swap every known emoticon in the demojized text
# for its placeholder, in both splits.
train_df['demojized_tweet_text'] = train_df['demojized_tweet_text'].map(
    lambda text: replace_emoticon_with_name(text, emoticon_dict=emoticon_dict)
)
test_df['demojized_tweet_text'] = test_df['demojized_tweet_text'].map(
    lambda text: replace_emoticon_with_name(text, emoticon_dict=emoticon_dict)
)

In [None]:
train_df.head(3)

Unnamed: 0_level_0,tweet_date,tweet_id,sentiment,emojis,tweet_day,tweet_month,tweet_year,tweet_year_month,tweet_text,demojized_tweet_text
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
4086,2019-04-22,1120278935525150720,0,"😭,😭",22,4,2019,2019-04,@jeonierals FCK PIPAY. FCK! 😭,@jeonierals FCK PIPAY. FCK! :loudly_crying_face:
13895,2019-04-08,1115434778654859264,0,😞,8,4,2019,2019-04,Those late night thoughts 😞,Those late night thoughts :disappointed_face:
8045,2019-04-22,1120323957163548672,0,😩,22,4,2019,2019-04,“EVERYBODY AIN’T PROUD OF U🎯\nThey just surpri...,“EVERYBODY AIN’T PROUD OF U:bullseye:\nThey ju...


# 7. Add demojized emotag 620 emojis and emoticons to tokenizer and resize BERT

In [None]:
# Load the pretrained checkpoint's fast tokenizer and a BERT model with a
# sequence-classification head.
# The checkpoint name gets its own variable so that `model` only ever refers to
# the network object, never to a string.
model_name = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast = True)
# Two output labels, matching the 0/1 values of the `sentiment` column.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels = 2)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-cased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.25.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 28996
}

loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--bert-base-cased/snapshots/5532cc56f74641d4bb33641f5c76a55d11f846e0/vocab.txt
loading file tokenize

In [None]:
# Tokens to register with the tokenizer: emoji aliases first, then the emoticon
# placeholders. Print both ends of the list as a quick sanity check.
new_tokens = [*demojized_emotag620, *demojize_emoticon_names]
print(new_tokens[:3], new_tokens[-3:], sep="\n")

[':mahjong_red_dragon:', ':joker:', ':cyclone:']
[':emoticon30:', ':emoticon31:', ':emoticon32:']


In [None]:
# Register the new emoji/emoticon tokens; add_tokens returns how many entries
# were added to the vocabulary.
num_add_toks = tokenizer.add_tokens(new_tokens)
print(f"Added {num_add_toks}, tokens")
# Resize the model's token embedding matrix to the enlarged vocabulary. This
# has to run after add_tokens so that len(tokenizer) counts the new tokens.
model.resize_token_embeddings(len(tokenizer))

Added 652, tokens


Embedding(29648, 768)

# 8. Perform Sentiment Analysis

In [None]:
# Prefer the first CUDA device when one is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda:0'
else:
    device = 'cpu'
model = model.to(device)

In [None]:
# Pull the model inputs (processed text) and targets (sentiment) out of each
# split as plain Python lists.
train_texts, train_labels = (
    train_df[column].tolist() for column in ('demojized_tweet_text', 'sentiment')
)
test_texts, test_labels = (
    test_df[column].tolist() for column in ('demojized_tweet_text', 'sentiment')
)

In [None]:
# Tokenize both splits into padded, truncated PyTorch tensors.
# The encodings stay on the CPU: Trainer moves each batch to the model's device
# itself, so copying the whole dataset to the GPU up front only costs GPU memory.
train_encodings = tokenizer(train_texts, truncation=True, padding=True, return_tensors='pt')
test_encodings = tokenizer(test_texts, truncation=True, padding=True, return_tensors='pt')

In [None]:
class SentimentDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with sentiment labels.

    Args:
        encodings: Mapping from field name (e.g. ``input_ids``) to a sequence
            or tensor that is indexable by example position.
        labels: Sequence of labels, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _to_tensor(value):
        """Return ``value`` as a tensor that does not share its source's storage."""
        # torch.tensor(existing_tensor) emits a UserWarning on every call;
        # clone().detach() is the equivalent copy that PyTorch recommends.
        if isinstance(value, torch.Tensor):
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, including ``labels``."""
        item = {key: self._to_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._to_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

In [None]:
# Wrap the tokenized splits and their labels for use with Trainer.
train_dataset = SentimentDataset(encodings=train_encodings, labels=train_labels)
test_dataset = SentimentDataset(encodings=test_encodings, labels=test_labels)

In [None]:
def compute_metrics(p):
    """Compute evaluation metrics from a (predictions, labels) pair.

    Args:
        p: Pair ``(pred, labels)``. ``pred`` holds per-class scores with the
            classes along axis 1; ``labels`` holds the true class ids.

    Returns:
        dict with the keys ``accuracy``, ``recall``, ``precision`` and
        ``f1_score``. Recall, precision and F1 are all computed for the
        positive class (label 1), so F1 is the harmonic mean of the reported
        precision and recall.
    """
    pred, labels = p
    pred = np.argmax(pred, axis=1)

    # One averaging mode for all three class-wise scores keeps them mutually
    # consistent; mixing binary precision/recall with a weighted F1 does not.
    average = 'binary'

    accuracy = accuracy_score(y_true=labels, y_pred=pred)
    recall = recall_score(y_true=labels, y_pred=pred, average=average)
    precision = precision_score(y_true=labels, y_pred=pred, average=average)
    f1 = f1_score(y_true=labels, y_pred=pred, average=average)

    return {"accuracy": accuracy, "recall": recall, "precision": precision, "f1_score": f1}

In [None]:
training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir='./res',
    logging_dir='./logs4',
    # Training schedule.
    num_train_epochs=5,
    warmup_steps=500,
    weight_decay=0.01,
    # Batch sizes per device.
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    # Evaluate on a step schedule and reload the best checkpoint when done.
    evaluation_strategy="steps",
    load_best_model_at_end=True,
    # Data loading.
    dataloader_pin_memory=False,
)

using `logging_steps` to initialize `eval_steps` to 500
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
# Wire the model, datasets and metric function into a Trainer; the held-out
# split doubles as the evaluation set during training.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=test_dataset,
    train_dataset=train_dataset,
)
trainer.train()

***** Running training *****
  Num examples = 24248
  Num Epochs = 5
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 3790
  Number of trainable parameters = 108812546
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


Step,Training Loss,Validation Loss,Accuracy,Recall,Precision,F1 Score
500,0.3321,0.182067,0.91044,0.868758,0.953284,0.910378
1000,0.1366,0.164645,0.922646,0.992318,0.874471,0.922103
1500,0.1329,0.148465,0.922316,0.988476,0.876526,0.921819
2000,0.1209,0.164306,0.926769,0.922215,0.934783,0.926781
2500,0.1135,0.189866,0.922481,0.96959,0.889835,0.922204
3000,0.0901,0.176319,0.923305,0.93822,0.91508,0.923262
3500,0.0661,0.242538,0.921161,0.949424,0.902617,0.92104


***** Running Evaluation *****
  Num examples = 6063
  Batch size = 64
Saving model checkpoint to ./res/checkpoint-500
Configuration saved in ./res/checkpoint-500/config.json
Model weights saved in ./res/checkpoint-500/pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
***** Running Evaluation *****
  Num examples = 6063
  Batch size = 64
Saving model checkpoint to ./res/checkpoint-1000
Configuration saved in ./res/checkpoint-1000/config.json
Model weights saved in ./res/checkpoint-1000/pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
***** Running Evaluation *****
  Num examples = 6063
  Batch size = 64
Saving model checkpoint to ./res/checkpoint-1500
Configuration saved in ./res/checkpoint-1500/config.json
Model weights saved in ./res/checkpoint-1500/pytorch_model.bin
  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
***** Running Evaluation *****
  Num examples = 6

TrainOutput(global_step=3790, training_loss=0.13547678069263147, metrics={'train_runtime': 7755.206, 'train_samples_per_second': 15.633, 'train_steps_per_second': 0.489, 'total_flos': 1.75073890681032e+16, 'train_loss': 0.13547678069263147, 'epoch': 5.0})

In [None]:
# Run evaluation on the Trainer's eval dataset and display the metrics.
# The parentheses matter: a bare `trainer.evaluate` only shows the bound
# method and never runs the evaluation.
trainer.evaluate()