In [None]:
# Fine-tune a BERT-style model (RoBERTa) for tweet emotion classification
import torch
from transformers import RobertaTokenizer, RobertaModel
from transformers import BertTokenizer, BertModel
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AdamW, get_linear_schedule_with_warmup
import time
import datetime
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import f1_score, accuracy_score
import json
from tqdm import tqdm
import random
from collections import Counter

# Seed every random number generator this notebook relies on, not just
# Python's `random`: NumPy and PyTorch are seeded too so that weight
# initialisation and batch ordering are repeatable across fresh kernels.
random.seed(1)
np.random.seed(1)
torch.manual_seed(1)
# check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

In [None]:

def read_json_file(file_path):
    """Load a JSON-Lines file into a list of parsed records.

    Args:
        file_path: Path to a text file holding one JSON document per line.

    Returns:
        A ``(data, total_line)`` tuple. ``data`` is the list of parsed
        objects in file order; ``total_line`` is the number of records
        parsed and therefore always equals ``len(data)``.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    data = []
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default text encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in tqdm(file):
            # Blank lines (e.g. a stray newline at the end of the file)
            # carry no record and would otherwise make json.loads fail.
            if not line.strip():
                continue
            data.append(json.loads(line))
    return data, len(data)

# Parse the raw tweet dump; `lines` is the count returned alongside the
# parsed records by read_json_file.
file_path = 'tweets_DM.json'
tweets_data, lines = read_json_file(file_path)

In [None]:
# Show the first tweet payload to confirm the nesting of the raw records.
print((tweets_data[0]["_source"]["tweet"]))


def _flatten_record(raw_record):
    """Reduce one raw record to its id and text, provisionally typed 'train'."""
    tweet = raw_record["_source"]["tweet"]
    return {"ids": tweet["tweet_id"], "text": tweet["text"], "Type": "train"}


# One flat dict per tweet; "Type" is corrected later for test tweets.
clean_dicts = [_flatten_record(tweets_data[i]) for i in range(lines)]

In [None]:
# Read the emotion label of each tweet (emotion.csv) and the table that
# assigns each tweet id to the train or test split (data_identification.csv).
emotion_df = pd.read_csv('emotion.csv')
data_type = pd.read_csv('data_identification.csv')

In [None]:
# Ids flagged as training tweets (a set, for fast membership tests) and the
# ids flagged as test tweets (kept as a pandas Series).
train_id_set = set(data_type.loc[data_type['identification'] == 'train', 'tweet_id'])
test_id_set = data_type.loc[data_type['identification'] == 'test', 'tweet_id']
# tweet_id -> emotion lookup built from the label table.
emotion_dict = {
    tweet_id: emotion
    for tweet_id, emotion in zip(emotion_df['tweet_id'], emotion_df['emotion'])
}
# Attach a label to every training tweet; any tweet whose id is not in the
# training set is re-typed as 'test' and left without a label.
for record in tqdm(clean_dicts):
    tweet_id = record['ids']
    if tweet_id not in train_id_set:
        record['Type'] = 'test'
        continue
    record['label'] = emotion_dict[tweet_id]

In [None]:
# Partition the flattened tweets by their "Type" field, preserving order.
train_dicts = [record for record in clean_dicts if record['Type'] == 'train']
test_dicts = [record for record in clean_dicts if record['Type'] != 'train']
print(len(train_dicts))
print(len(test_dicts))

In [None]:
# Count how many training tweets carry each emotion label.
emotion_counter = Counter(record['label'] for record in train_dicts)
print(emotion_counter)
# Size of the rarest class. It is only reported: the selection below keeps
# every tweet of every class, so no down-sampling to this size takes place.
emotion_num = emotion_counter.most_common()[-1][1]
print(emotion_num)

# Bucket the training tweets by label (buckets follow the order in which the
# labels first appear), shuffle each bucket, then concatenate the buckets.
tweets_by_label = {emotion: [] for emotion in emotion_counter}
for record in train_dicts:
    tweets_by_label[record['label']].append(record)

train_dicts_same = []
for bucket in tweets_by_label.values():
    random.shuffle(bucket)
    train_dicts_same.extend(bucket)
print(len(train_dicts_same))

In [None]:
# do one-hot encoding on train_dicts_same
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Parallel lists of tweet texts and their string labels.
X = [record['text'] for record in train_dicts_same]
y = [record['label'] for record in train_dicts_same]

# Map each label string to an integer class id.
label_encoder = LabelEncoder()
label_y = label_encoder.fit_transform(y)

# Expand the class ids into dense one-hot rows (one column per class).
one_hot_encoder = OneHotEncoder()
label_column = label_y.reshape(-1, 1)
encode_y = one_hot_encoder.fit_transform(label_column).toarray()



In [None]:
# train test split
from sklearn.model_selection import train_test_split

# Hold out 10% of the data, then cut that hold-out in half to obtain the
# validation and test splits.
train_X, holdout_X, train_y, holdout_y = train_test_split(
    X, encode_y, test_size=0.1, random_state=1
)
val_X, test_X, val_y, test_y = train_test_split(
    holdout_X, holdout_y, test_size=0.5, random_state=1
)

# Collapse the one-hot rows back to integer class ids.
train_y, val_y, test_y = (
    np.argmax(one_hot_rows, axis=1) for one_hot_rows in (train_y, val_y, test_y)
)

# Class distribution of each split: train, validation, test.
for split_labels in (train_y, val_y, test_y):
    print(Counter(split_labels))

In [None]:
# Load the pretrained RoBERTa tokenizer that matches the 'roberta-base' encoder.
# NOTE(review): `do_lower_case` is a BERT WordPiece option; it presumably has
# no effect on RoBERTa's tokenizer -- confirm and drop it if so.
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=True)

In [None]:
# define the classifier model: a RoBERTa encoder with a feed-forward head

class BertClassifier(torch.nn.Module):
    """RoBERTa encoder followed by a small feed-forward classification head.

    ``forward`` returns raw, unnormalised logits. The head deliberately has no
    softmax layer: ``torch.nn.CrossEntropyLoss`` applies log-softmax itself,
    so normalising here as well would squash the gradients. ``argmax`` over
    the logits yields the same predicted class as ``argmax`` over softmax
    probabilities. The removed softmax layer held no parameters, so the
    parameter names in ``state_dict()`` are unaffected.

    Args:
        freeze_bert: When true, the encoder weights are excluded from
            gradient updates and only the classification head is trained.
        num_labels: Number of output classes (defaults to 8).
    """

    def __init__(self, freeze_bert=False, num_labels=8):
        super(BertClassifier, self).__init__()
        # encoder hidden size, hidden size of our classifier, number of labels
        D_in, H, D_out = 768, 50, num_labels

        # instantiate the pretrained RoBERTa encoder
        self.bert = RobertaModel.from_pretrained('roberta-base')

        # two linear layers with a ReLU in between; the output is raw logits
        self.classifier = torch.nn.Sequential(
            torch.nn.Linear(D_in, H),
            torch.nn.ReLU(),
            torch.nn.Linear(H, D_out),
        )

        # optionally freeze the encoder so only the head is trained
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False

    def forward(self, input_ids, attention_mask):
        """Return the class logits for a batch of tokenised inputs.

        Args:
            input_ids: Token id tensor passed to the encoder.
            attention_mask: Attention mask tensor passed to the encoder.

        Returns:
            The head's output for the first token of every sequence, one
            score per class (unnormalised).
        """
        # feed input to the encoder
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask)
        # take the last hidden state of the first token of each sequence
        last_hidden_state_cls = outputs[0][:, 0, :]
        # feed it to the classifier head to compute logits
        logits = self.classifier(last_hidden_state_cls)
        return logits

In [None]:
model = BertClassifier(freeze_bert=False)
model.to(device)

# define optimizer and learning rate scheduler
param_optimizer = list(model.named_parameters())
# Parameters exempt from weight decay: biases and LayerNorm weights. These
# substrings are matched against the parameter names reported by
# model.named_parameters().
no_decay = ['bias', 'LayerNorm.weight']
# The per-group option must be spelled 'weight_decay'; any other key is not
# read by the optimizer, which would then apply its default to every group.
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5)
# number of training epochs
epochs = 1
# number of samples per batch
batch_size = 32
# Total optimizer steps: one per batch, counting the final partial batch
# (ceiling division), so the linear schedule reaches zero on the last step.
num_train_steps = -(-len(train_X) // batch_size) * epochs
# create the learning rate scheduler
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=num_train_steps)
# define loss function (expects raw logits and integer class ids)
loss_fn = torch.nn.CrossEntropyLoss()

In [None]:
# define function to train model
def train(model, train_dataloader, val_dataloader=None, epochs=4, evaluation=False):
    """Fine-tune ``model`` and optionally validate it after every epoch.

    Relies on the notebook-level ``optimizer``, ``scheduler``, ``loss_fn`` and
    ``device`` objects and calls ``evaluate`` for validation. Progress is
    printed as a table with one row every 500 steps and at the end of each
    epoch.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        train_dataloader: Yields ``(input_ids, attention_mask, labels)`` batches.
        val_dataloader: Batches in the same format, used when ``evaluation``
            is true.
        epochs: Number of passes over ``train_dataloader``.
        evaluation: Whether to run ``evaluate`` after each epoch.

    Raises:
        ValueError: If ``evaluation`` is true but ``val_dataloader`` is None.
    """
    # Fail before training starts rather than after a full epoch.
    if evaluation and val_dataloader is None:
        raise ValueError("evaluation=True requires a val_dataloader")

    # start training loop
    print("Start training...\n")
    for epoch_i in range(epochs):
        # =======================================
        #               Training
        # =======================================
        # print the header of the result table
        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Val F1 ':^9} | {'Elapsed':^9}")
        print("-" * 70)
        # measure the elapsed time of each epoch
        t0_epoch, t0_batch = time.time(), time.time()
        # reset tracking variables at the beginning of each epoch
        total_loss, batch_loss, batch_counts = 0, 0, 0
        # put the model into the training mode
        model.train()
        # for each batch of training data
        for step, batch in enumerate(tqdm(train_dataloader)):
            batch_counts += 1
            # move the batch to the training device
            b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)
            # zero out any previously calculated gradients
            model.zero_grad()
            # perform forward pass
            logits = model(b_input_ids, b_attn_mask)
            # compute loss and accumulate the loss values
            loss = loss_fn(logits, b_labels)
            batch_loss += loss.item()
            total_loss += loss.item()
            # perform backward pass to calculate gradients
            loss.backward()
            # clip the norm of the gradients to 1.0 to prevent "exploding gradients"
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            # update parameters and advance the learning-rate schedule
            optimizer.step()
            scheduler.step()
            # report the running loss every 500 steps and on the last step
            if (step % 500 == 0 and step != 0) or (step == len(train_dataloader) - 1):
                # time elapsed since the previous report
                time_elapsed = time.time() - t0_batch
                # One cell per header column: the three validation columns
                # (loss, accuracy, F1) are filled with '-' placeholders.
                print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {'-':^9} | {time_elapsed:^9.2f}")
                # reset batch tracking variables
                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()
        # calculate the average loss over the entire training data
        avg_train_loss = total_loss / len(train_dataloader)
        print("-" * 70)
        # =======================================
        #               Evaluation
        # =======================================
        if evaluation:
            # after the completion of each training epoch, measure the model's
            # performance on the validation set
            val_loss, val_accuracy, val_f1_score = evaluate(model, val_dataloader)
            # print validation results
            time_elapsed = time.time() - t0_epoch
            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {val_f1_score:^9.2f} | {time_elapsed:^9.2f}")
            print("-" * 70)
        print("\n")
    print("Training complete!")
    
# define function for evaluation
def evaluate(model, val_dataloader):
    """Compute loss, accuracy and macro-F1 of ``model`` over a labelled loader.

    Predictions and labels are collected across all batches and the metrics
    are computed once over the whole set. Averaging per-batch scores instead
    would distort macro-F1 (classes missing from a small batch) and would
    over-weight a smaller final batch.

    Relies on the notebook-level ``device`` and ``loss_fn`` objects.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        val_dataloader: Yields ``(input_ids, attention_mask, labels)`` batches.

    Returns:
        ``(val_loss, val_accuracy, val_f1_score)``: the sample-weighted mean
        loss, the accuracy in percent, and the macro-averaged F1 score. All
        three are NaN when the loader yields no batches.
    """
    # put the model into the evaluation mode
    model.eval()
    # per-batch losses with their batch sizes, plus all predictions/labels
    batch_losses = []
    batch_sizes = []
    all_preds = []
    all_labels = []
    for batch in val_dataloader:
        # move the batch to the evaluation device
        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)
        # no gradients are needed for evaluation
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)
            loss = loss_fn(logits, b_labels)
        batch_losses.append(loss.item())
        batch_sizes.append(b_labels.size(0))
        # predicted class = index of the largest score
        preds = torch.argmax(logits, dim=1).flatten()
        all_preds.append(preds.cpu().numpy())
        all_labels.append(b_labels.cpu().numpy())

    # an empty loader has nothing to score
    if not all_labels:
        return float('nan'), float('nan'), float('nan')

    y_true = np.concatenate(all_labels)
    y_pred = np.concatenate(all_preds)
    # Weight each batch loss by its size so the result is a per-sample mean
    # (assumes loss_fn averages within a batch -- confirm if it is replaced).
    val_loss = np.average(batch_losses, weights=batch_sizes)
    val_accuracy = (y_pred == y_true).mean() * 100
    val_f1_score = f1_score(y_true, y_pred, average='macro')
    return val_loss, val_accuracy, val_f1_score

In [None]:
def _encode_texts(texts):
    """Tokenise a list of texts into padded PyTorch tensors (at most 256 tokens each)."""
    return tokenizer(texts, padding=True, truncation=True, max_length=256, return_tensors="pt")


# training split: tensors, dataset, and a loader that samples in random order
train_inputs = _encode_texts(train_X)
train_labels = torch.tensor(train_y)
train_data = TensorDataset(train_inputs['input_ids'], train_inputs['attention_mask'], train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

# validation split: same layout, read in a fixed sequential order
val_inputs = _encode_texts(val_X)
val_labels = torch.tensor(val_y)
val_data = TensorDataset(val_inputs['input_ids'], val_inputs['attention_mask'], val_labels)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

# labelled hold-out test split: same layout, read sequentially
test_inputs = _encode_texts(test_X)
test_labels = torch.tensor(test_y)
test_data = TensorDataset(test_inputs['input_ids'], test_inputs['attention_mask'], test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)
                                    

In [None]:
# Fine-tune for `epochs` epochs, validating on val_dataloader after each one.
train(model=model, train_dataloader=train_dataloader, val_dataloader=val_dataloader, epochs=epochs, evaluation=True)

In [None]:
# Score the fine-tuned model on the labelled hold-out test split.
test_loss, test_accuracy, test_f1_score = evaluate(model=model, val_dataloader=test_dataloader)
# Report the accuracy and the F1 score (the loss is computed but not printed).
print(f"Test Accuracy: {test_accuracy}")
print(f"Test F1 Score: {test_f1_score}")


In [None]:
# Save only the model's parameters (its state dict), not the whole module.
torch.save(model.state_dict(), 'BERT_model_v3.bin')

In [None]:
# Rebuild the architecture, then restore the saved weights.
model = BertClassifier(freeze_bert=False)
# map_location moves the stored tensors onto the current device, so a
# checkpoint written on a GPU machine can be restored on a CPU-only one.
model.load_state_dict(torch.load('BERT_model_v3.bin', map_location=device))
model.to(device)

In [None]:
# Tokenise the unlabeled competition tweets (test_dicts).
# Note: the names test_inputs / test_data / test_sampler / test_dataloader are
# rebound here and from now on refer to the unlabeled set; its batches hold
# only input ids and attention masks.
unlabeled_texts = [record['text'] for record in test_dicts]
test_inputs = tokenizer(unlabeled_texts, padding=True, truncation=True, max_length=256, return_tensors="pt")

# Wrap the tensors in a sequential loader so prediction order matches test_dicts.
test_data = TensorDataset(test_inputs['input_ids'], test_inputs['attention_mask'])
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, batch_size=batch_size, sampler=test_sampler)

In [None]:


# Switch to evaluation mode before running inference.
model.eval()
# Predicted class ids, accumulated in loader order.
predictions = []
for batch in tqdm(test_dataloader):
    # move the (input ids, attention mask) pair to the inference device
    b_input_ids, b_attn_mask = (tensor.to(device) for tensor in batch)
    # forward pass without gradient tracking
    with torch.no_grad():
        logits = model(b_input_ids, b_attn_mask)
    # the highest-scoring class is the prediction
    preds = logits.argmax(dim=1).flatten()
    predictions.extend(preds.cpu().numpy().tolist())
# translate class ids back into the original emotion strings
predictions = label_encoder.inverse_transform(predictions)
# pair every test tweet id with its predicted emotion
test_ids = [record['ids'] for record in test_dicts]
output_df = pd.DataFrame({'id': test_ids, 'emotion': predictions})



In [None]:
# Write the predictions to the submission file, without the index column.
output_df.to_csv('submission_v3.csv', index=False)