In [None]:
# https://medium.com/@xiaohan_63326/fine-tune-fine-tuning-bert-for-sentiment-analysis-f5002b08f10a

In [1]:
import operator
import os
import random
import re
import string

import numpy as np
import pandas as pd
import torch
import transformers
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
from torch.utils.data import random_split
from transformers import AdamW
from transformers import BertForSequenceClassification
from transformers import BertTokenizer
from transformers import get_linear_schedule_with_warmup

from text_classification_dataset import TextClassificationDataset

In [2]:
# Quick sanity check: does PyTorch see a usable CUDA device on this machine?
torch.cuda.is_available()

False

#### CUDA check

In [None]:
print(transformers.__version__)
seed = 38
print('\n')
# Run on the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Seed every random number generator the notebook touches so that
# Restart & Run All reproduces the same split, shuffling and initialisation.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
# Deterministic cuDNN kernels only help if the autotuner is also disabled;
# with benchmark mode on, cuDNN may pick a different algorithm per run.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

#### Functions

In [None]:
def encoding_process(_content, max_length=256, encoder=None):
    """Tokenize every text into a fixed-length row of token ids.

    Args:
        _content: iterable of strings to encode.
        max_length: number of token ids per row; shorter texts are padded and
            longer ones truncated to this length. Defaults to 256, the value
            that was previously hard-coded.
        encoder: object exposing a Hugging Face style ``encode`` method. When
            omitted, the notebook-level ``tokenizer`` is used.

    Returns:
        A tensor with one row per input text and ``max_length`` columns. An
        empty input yields a ``(0, max_length)`` long tensor instead of
        raising inside ``torch.cat``.
    """
    # Resolved lazily so the global is only required when no encoder is passed.
    active = tokenizer if encoder is None else encoder

    rows = [
        active.encode(
            text,
            add_special_tokens=True,
            max_length=max_length,
            # `padding=` / `truncation=` replace the deprecated
            # `pad_to_max_length=True`; explicit truncation guarantees every
            # row has the same width so the concatenation below cannot fail.
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        for text in _content
    ]

    if not rows:
        return torch.empty((0, max_length), dtype=torch.long)
    return torch.cat(rows, dim=0)

In [None]:
output_model = './content/model/tweet_bert.pth'

def save(model, optimizer, path=None):
    """Write a checkpoint holding the model and optimizer state dicts.

    Args:
        model: object exposing ``state_dict()`` (the fine-tuned network).
        optimizer: object exposing ``state_dict()``.
        path: destination file. Defaults to the notebook-level ``output_model``.

    The parent directory is created when missing, so a fresh checkout does not
    lose a finished training run to a missing-folder error at the last step.
    """
    target = output_model if path is None else path

    parent = os.path.dirname(target)
    if parent:
        os.makedirs(parent, exist_ok=True)

    torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict()
    }, target)

In [None]:
def accuracy_calc(preds, labels):
    """Return the accuracy of the arg-max class of ``preds`` against ``labels``.

    ``preds`` is reduced along axis 1 (one score per class for each row) and
    both arrays are flattened before being handed to ``accuracy_score``.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    return accuracy_score(labels.flatten(), predicted_classes)

In [None]:
def f1_accuracy(preds, labels):
    """Return the F1 score of the arg-max class of ``preds`` against ``labels``.

    Args:
        preds: 2-D array of per-class scores, one row per example.
        labels: array of true class ids; flattened before scoring.

    Returns:
        ``f1_score`` with its default averaging. When the score is undefined
        (no positive label and no positive prediction in the input) the result
        is 0.0.
    """
    pre = np.argmax(preds, axis=1).flatten()
    real = labels.flatten()
    # zero_division=0 keeps the value sklearn would return anyway for the
    # undefined case, but without emitting an UndefinedMetricWarning on every
    # small batch that happens to contain no positive example.
    return f1_score(real, pre, zero_division=0)

#### Data

In [None]:
# Forward slashes resolve on Windows as well as Linux/macOS; the previous
# backslash-separated literals only worked on Windows.
data_dir = "../../../../data/twitter_hate-speech"

# Rows whose cleaned tweet is missing cannot be tokenized, so they are dropped.
train = pd.read_csv(f"{data_dir}/train_cleaned.csv", index_col=0)
df_train = train[train.tweet_cleaned.notna()]
test = pd.read_csv(f"{data_dir}/test_cleaned.csv", index_col=0)
df_test = test[test.tweet_cleaned.notna()]

In [None]:
# (rows, columns) of the filtered train and test frames, one per line.
print(df_train.shape, df_test.shape, sep='\n')

In [None]:
# Run configuration — everything a reader is likely to tune lives here.
model_name = 'google-bert/bert-base-uncased'  # checkpoint id used when loading the tokenizer
num_classes = 2   # number of output classes
max_length = 128  # sequence length handed to TextClassificationDataset
batch_size = 16   # batch size for both DataLoaders
num_epochs = 2    # passes over the training set; also sizes the LR schedule

In [None]:
# Hold out 20% of the labelled tweets for validation.
# - stratify keeps the label proportions identical in both splits, so the
#   validation metrics are not skewed by an unlucky draw of the rarer class.
# - random_state reuses the notebook-wide `seed` instead of a second magic number.
train_tweets, val_tweets, train_labels, val_labels = train_test_split(
    df_train.tweet_cleaned,
    df_train.label,
    test_size=0.2,
    random_state=seed,
    stratify=df_train.label,
)

In [None]:
# Tokenizer for the checkpoint named in model_name; lower-casing is requested explicitly.
tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=True)

In [None]:
# Flag kept for any cell that reads it; the training loop passes
# return_dict=False to the model explicitly.
return_dict = False

# Training dataset — built from the training split only. Encoding all of
# df_train here would put the validation tweets into the "training" tensors.
content = train_tweets.values
get_ids = encoding_process(content)
labels = torch.tensor(np.asarray(train_labels))

# Validation dataset
val_content = val_tweets.values
val_get_ids = encoding_process(val_content)
# np.asarray accepts both the pandas Series from train_test_split and the
# tensor left behind by a previous run of this cell, so the cell can be
# re-executed without failing on `.values`.
val_labels = torch.tensor(np.asarray(val_labels))

In [None]:
# Wrap each split in the project's TextClassificationDataset, handing it the
# tokenizer and the configured max_length.
# NOTE(review): val_labels was rebound to a torch tensor in the previous cell,
# whereas train_labels is still the pandas Series returned by train_test_split
# — confirm the dataset class accepts both kinds of label container.
# NOTE(review): the training loop indexes batches positionally (batch[0] as
# input ids, batch[1] as labels) — confirm that is the item layout this
# dataset class produces.
train_dataset = TextClassificationDataset(train_tweets, train_labels, tokenizer, max_length)
val_dataset = TextClassificationDataset(val_tweets, val_labels, tokenizer, max_length)

# Only the training loader is shuffled; the validation loader keeps dataset order.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

#### Model

In [None]:
# Load the same checkpoint the tokenizer was built from and size the
# classification head from the config cell rather than repeating literals.
model = BertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_classes,
    output_attentions=False,
    output_hidden_states=False,
)
# Move to whichever device was selected earlier; an unconditional .cuda()
# raises on machines where torch.cuda.is_available() is False.
model.to(device)
# NOTE(review): transformers' AdamW is deprecated upstream in favour of
# torch.optim.AdamW; if switching, pass weight_decay=0.0 to keep this behaviour
# — confirm against the installed transformers version.
optimizer = AdamW(model.parameters(), lr=2e-5)

In [None]:
# The scheduler is stepped once per training batch, so its horizon is the
# number of batches per epoch times the number of epochs. No warmup steps.
total_steps = num_epochs * len(train_dataloader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [None]:
# Fine-tuning loop: one training pass and one validation pass per epoch.
#
# Accuracy and F1 are computed once per epoch over ALL collected predictions.
# Averaging per-batch scores (the previous approach) is not the dataset-level
# metric: F1 is not additive across batches, any batch without a positive
# example contributes 0, and a smaller final batch is over-weighted.
#
# NOTE(review): batches are indexed positionally — batch[0] is treated as the
# input-id tensor and batch[1] as the label tensor; confirm the dataset yields
# items in that layout.
for epoch in range(num_epochs):
    # ---- training pass ----
    model.train()
    total_loss, total_val_loss = 0, 0
    train_logits, train_targets = [], []
    for step, batch in enumerate(train_dataloader):
        input_ids = batch[0].to(device)
        targets = batch[1].to(device)

        model.zero_grad()
        # Ids greater than 0 are treated as real tokens when building the mask.
        loss, tval_ = model(
            input_ids,
            token_type_ids=None,
            attention_mask=(input_ids > 0),
            labels=targets,
            return_dict=False,
        )
        total_loss += loss.item()
        loss.backward()
        # Clip the global gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        train_logits.append(tval_.detach().cpu().numpy())
        train_targets.append(batch[1].to('cpu').numpy())

    # ---- validation pass ----
    model.eval()
    val_logits, val_targets = [], []
    with torch.no_grad():
        for i, batch in enumerate(val_dataloader):
            input_ids = batch[0].to(device)
            targets = batch[1].to(device)
            loss, val_ = model(
                input_ids,
                token_type_ids=None,
                attention_mask=(input_ids > 0),
                labels=targets,
                return_dict=False,
            )
            total_val_loss += loss.item()

            val_logits.append(val_.detach().cpu().numpy())
            val_targets.append(batch[1].to('cpu').numpy())

    # ---- epoch-level metrics ----
    training_loss = total_loss / len(train_dataloader)
    valid_loss = total_val_loss / len(val_dataloader)

    epoch_val_logits = np.concatenate(val_logits, axis=0)
    epoch_val_targets = np.concatenate(val_targets, axis=0)
    _accuracy = accuracy_calc(epoch_val_logits, epoch_val_targets)
    _f1_score = f1_accuracy(epoch_val_logits, epoch_val_targets)
    # Training F1 uses the logits produced while the weights were still being
    # updated, i.e. a running estimate rather than a re-evaluation.
    train_f1_score = f1_accuracy(
        np.concatenate(train_logits, axis=0),
        np.concatenate(train_targets, axis=0),
    )

    print('Training loss is', training_loss)
    print('Valid loss is:', valid_loss)
    print('Acc score is:', _accuracy)
    print('F1_score is:', _f1_score)
    print('train_F1_score is:', train_f1_score)
    print('\n')

In [None]:
# Persist the fine-tuned model and optimizer state dicts to the path in output_model.
save(model, optimizer)