# Defining notebook parameters

In [26]:
# --- Model / output locations ------------------------------------------------
# Checkpoint that both the tokenizer and the classifier are loaded from.
PRE_TRAINED_MODEL_NAME = 'HooshvareLab/bert-fa-base-uncased-sentiment-snappfood'
# Prefix for the "<name>_checkpoint" and "<name>_final" output directories.
BASE_FILE_NAME = 'bert_fine_tuned'
# When True, every parameter under `model.bert` is frozen before training.
FREEZE_BERT = True

# --- Input data --------------------------------------------------------------
TRAIN_FILE = './data/train.csv'
CV_FILE = './data/eval.csv'
TEST_FILE = './data/test.csv'

# --- Tokenization / batching -------------------------------------------------
# Every comment is padded or truncated to exactly this many tokens.
MAX_LENGTH = 128
# Used for both the per-device train and eval batch sizes.
BATCH_SIZE = 8

# --- Optimization ------------------------------------------------------------
EPOCHS = 1800
LR = 5e-5
# Set to True to show progress bars while training (passed on as `disable_tqdm=not ...`).
TQDM_TRAINING = False

# --- Labelling ---------------------------------------------------------------
# A row whose `rate` is >= this threshold is labelled POSITIVE, otherwise NEGATIVE.
POSITIVE_NEGATIVE_THRESHOLD = 0
POSITIVE = 1
NEGATIVE = 0

# Read all provided files as pandas DataFrames

In [27]:
import pandas as pd

# Load the eval, train and test splits (in that order) with default CSV parsing.
cv_df, train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in (CV_FILE, TRAIN_FILE, TEST_FILE)
)

# Download and load the pre-trained tokenizer and model

In [28]:
from transformers import pipeline, AutoTokenizer, AutoConfig, BertForSequenceClassification

# Tokenizer that belongs to the pre-trained checkpoint; it is reused for the
# train, eval and test splits when the datasets are built.
tokenizer = AutoTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

# Sequence-classification model initialized from the same checkpoint. This is
# the object that gets (optionally) frozen, fine-tuned and saved further down.
model =  BertForSequenceClassification.from_pretrained(PRE_TRAINED_MODEL_NAME)

# Freeze BERT weights

In [29]:
def freeze_submodel(submodel):
    """Exclude every parameter of ``submodel`` from gradient computation.

    Args:
        submodel: Any object exposing ``parameters()`` that yields tensors
            (e.g. a ``torch.nn.Module``). It is modified in place.

    Returns:
        None.
    """
    for weight in submodel.parameters():
        # In-place switch; equivalent to assigning `weight.requires_grad = False`.
        weight.requires_grad_(False)
        
# Only the `model.bert` sub-module is frozen; any parameters that live outside
# it (for this model class, the classification head) keep receiving gradients.
if FREEZE_BERT:
    freeze_submodel(model.bert)

# Convert all DataFrames to PyTorch datasets

In [30]:
from torch.utils.data import Dataset

class SentimentDataset(Dataset):
    """Map-style dataset of pre-tokenized comments with binary sentiment labels.

    All rows are tokenized once, up front, in the constructor. Each item is the
    tokenizer's output for one comment with an extra ``'label'`` entry.
    """

    def __init__(self, df: pd.DataFrame, tokenizer):
        """Tokenize every row of ``df``.

        Args:
            df: Frame with a ``comment`` column (text passed to the tokenizer)
                and a ``rate`` column (compared against
                ``POSITIVE_NEGATIVE_THRESHOLD`` to derive the label).
            tokenizer: Callable invoked as
                ``tokenizer(text, padding=..., truncation=..., max_length=...)``
                whose result supports item assignment.
        """

        def encode(record):
            # Pad/truncate every comment to exactly MAX_LENGTH tokens.
            encoding = tokenizer(
                record.comment,
                padding='max_length',
                truncation='longest_first',
                max_length=MAX_LENGTH,
            )
            if record.rate >= POSITIVE_NEGATIVE_THRESHOLD:
                encoding['label'] = POSITIVE
            else:
                encoding['label'] = NEGATIVE
            return encoding

        self.__rows = [encode(record) for record in df.itertuples()]

    def __len__(self):
        """Number of tokenized rows."""
        return len(self.__rows)

    def __getitem__(self, idx):
        """Return the tokenized row (including its ``'label'``) at ``idx``."""
        return self.__rows[idx]

In [31]:
# Tokenize the eval, train and test frames (in that order) with the shared tokenizer.
cv_dataset, train_dataset, test_dataset = (
    SentimentDataset(frame, tokenizer) for frame in (cv_df, train_df, test_df)
)

# Initialize trainer

In [7]:
from transformers import Trainer, TrainingArguments, TrainerCallback

training_args = TrainingArguments(
    # Checkpointing: write to "<BASE_FILE_NAME>_checkpoint" every 1,000 steps,
    # keeping at most two checkpoints on disk.
    output_dir=f"./{BASE_FILE_NAME}_checkpoint",
    overwrite_output_dir=True,
    save_steps=1_000,
    save_total_limit=2,
    # Optimization schedule.
    num_train_epochs=EPOCHS,
    learning_rate=LR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # Evaluation and logging: step-based evaluation that reports the loss only.
    evaluation_strategy='steps',
    prediction_loss_only=True,
    logging_steps=100,
    disable_tqdm=not TQDM_TRAINING,
    # To force CPU-only training, additionally pass `no_cuda=True`.
)

# Fine-tunes `model` on the training split and evaluates on the eval (CV) split.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=cv_dataset,
)

# Fine-tune the model

In [None]:
# Run fine-tuning with the arguments given to `trainer`: trains on
# `train_dataset`, evaluates on `cv_dataset`, and writes checkpoints to the
# configured output directory. With EPOCHS as set above this is a long-running cell.
trainer.train()

# Save model for later use

In [None]:
# Persist the model to "./<BASE_FILE_NAME>_final"; the accuracy cell reloads it
# from this directory. Only the model is saved here, not the tokenizer.
model.save_pretrained(f"./{BASE_FILE_NAME}_final")

# Calculate the accuracy of the model

In [36]:
import torch

def calc_accuracy_bert(model, dataset):
    """Return the fraction of rows in ``dataset`` that ``model`` classifies correctly.

    Every row is evaluated on its own as a batch of size one. The prediction is
    class 1 when the class-1 logit is strictly greater than the class-0 logit,
    otherwise class 0.

    Args:
        model: Model that accepts the row's non-label entries as keyword
            tensors and returns an object with a ``logits`` attribute holding
            at least two class scores per example. It is switched to eval mode
            and left in that mode.
        dataset: Iterable of mappings. The ``'label'`` entry is the expected
            class (treated as 0 when absent); every other entry is wrapped in
            a batch dimension and passed to the model.

    Returns:
        Accuracy as a float in ``[0, 1]``, or ``float('nan')`` when ``dataset``
        yields no rows (accuracy is undefined there).
    """
    # Create the inputs on the device the model lives on, so the function also
    # works for a model that is on the GPU (e.g. directly after training).
    # `None` falls back to torch's default device for parameter-less models.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    correct = 0
    total = 0

    model.eval()
    with torch.no_grad():
        for row in dataset:
            expected_output = 0
            model_input = {}
            for key, val in row.items():
                if key == 'label':
                    expected_output = val
                else:
                    # Leading list adds the batch dimension (batch size 1).
                    model_input[key] = torch.tensor([val], device=device)

            # Softmax is strictly increasing, so comparing the raw logits
            # gives the same decision without the extra computation.
            scores = model(**model_input).logits[0].tolist()
            predicted_output = 1 if scores[1] > scores[0] else 0

            total += 1
            if predicted_output == expected_output:
                correct += 1

    if total == 0:
        return float('nan')
    return correct / total

In [37]:
# Reload the saved fine-tuned weights, then report accuracy on every split.
model = BertForSequenceClassification.from_pretrained(f"./{BASE_FILE_NAME}_final")
for dataset_label, dataset in (
    ('train', train_dataset),
    ('test', test_dataset),
    ('eval', cv_dataset),
):
    print(f'{dataset_label} Accuracy: {calc_accuracy_bert(model, dataset)}')

train Accuracy: 0.81125
test Accuracy: 0.7411764705882353
eval Accuracy: 0.745
