# Import libraries

In [None]:
import pickle
import torch
from transformers import BertModel
from transformers import BertJapaneseTokenizer, BertTokenizer
import random
import pandas as pd
import torch.nn as nn
import numpy as np
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from torch.utils.data import TensorDataset
from transformers import BertForSequenceClassification, AdamW, BertConfig
from transformers import get_linear_schedule_with_warmup
import random
import numpy as np
from tqdm import tqdm_notebook as tqdm
from tqdm import tqdm
from codecarbon import EmissionsTracker

# Define variables

In [None]:
# --- Run configuration -------------------------------------------------------

# Hugging Face checkpoint used for both the tokenizer and the classifier.
MODEL_NAME = "cl-tohoku/bert-base-japanese"

# Filesystem locations: training CSVs are read from DATA_DIR,
# per-epoch checkpoints are written to MODEL_DIR.
DATA_DIR = "train"
MODEL_DIR = "models"

# Every text is padded/truncated to this many tokens when encoded.
MAX_SEQ_LEN = 512
# Number of samples per optimisation step.
BATCH_SIZE = 8
# Number of full passes over the training set.
EPOCHS = 3

# Seed shared by all random number generators in this notebook.
SEED = 42

# Train on the GPU when one is visible, otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Fix random seeds for reproducibility

In [None]:
# Seed every random number generator the notebook relies on with the same
# value: Python's `random`, NumPy, and PyTorch (CPU, plus CUDA if present).
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

if torch.cuda.is_available():
    # Seed the current CUDA device first, then all visible devices.
    for seed_cuda in (torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        seed_cuda(SEED)

# Load train data

In [None]:
# Read the positive examples, discard rows with missing values, and tag
# every remaining row with class label 1.
positive_df = pd.read_csv(f"{DATA_DIR}/train_positive.csv", lineterminator="\n")
positive_df = positive_df.dropna().assign(Label=1)

# Same for the negative examples, tagged with class label 0.
negative_df = pd.read_csv(f"{DATA_DIR}/train_negative.csv", lineterminator="\n")
negative_df = negative_df.dropna().assign(Label=0)

# Report the number of usable rows per class (positive first, then negative).
for class_frame in (positive_df, negative_df):
    print(len(class_frame))

# Concat train data

In [None]:
# Stack both classes row-wise (positive rows first, then negative rows).
# The original row indices of each frame are kept as-is.
df = pd.concat((positive_df, negative_df))

# Train

## Tokenize

In [None]:
# Tokenizer matching the pretrained checkpoint.
tokenizer = BertJapaneseTokenizer.from_pretrained(MODEL_NAME)

# Encode every training text in one call.
# The tokenizer API is documented to take a list of strings, so the pandas
# column is converted with `.tolist()` instead of handing over a NumPy
# object array.
encoded_data_train = tokenizer.batch_encode_plus(
    df["Text"].tolist(),
    add_special_tokens=True,       # add the model's special tokens
    return_attention_mask=True,    # mask distinguishes real tokens from padding
    padding="max_length",          # pad every sequence up to MAX_SEQ_LEN
    max_length=MAX_SEQ_LEN,
    truncation=True,               # cut sequences longer than MAX_SEQ_LEN
    return_tensors="pt",           # return PyTorch tensors
)

## Make train_dataset

In [None]:
# Pull the model inputs out of the tokenizer output and build the label
# tensor from the "Label" column (1 = positive, 0 = negative).
input_ids = encoded_data_train["input_ids"]
attention_masks = encoded_data_train["attention_mask"]
labels = torch.tensor(df["Label"].values)

# Each dataset item is an (input_ids, attention_mask, label) triple.
train_dataset = TensorDataset(input_ids, attention_masks, labels)

train_dataloader = DataLoader(
    train_dataset,          # The training samples.
    shuffle=True,           # Select batches randomly
    batch_size=BATCH_SIZE,  # Trains with this batch size.
)

## Train main process

In [None]:
import os  # local import: only needed here to create the checkpoint directory

# Pretrained encoder with a 2-class sequence classification head.
model = BertForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=2,
    output_attentions=False,
    output_hidden_states=False,
)

model.to(DEVICE)

optimizer = AdamW(
    model.parameters(),
    lr=1e-5,
    eps=1e-8,
)

# One scheduler step is taken per batch, so the schedule spans every batch
# of every epoch.
total_steps = len(train_dataloader) * EPOCHS
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=50,  # original author's note: default value in run_glue.py (unverified)
    num_training_steps=total_steps,
)

# Create the checkpoint directory up front so that torch.save cannot fail on a
# missing directory after a full epoch of training has already been spent.
os.makedirs(MODEL_DIR, exist_ok=True)

for epoch in range(1, EPOCHS + 1):
    model.train()

    loss_train_total = 0
    progress_bar = tqdm(train_dataloader, desc="Epoch {:1d}".format(epoch), leave=False, disable=False)

    for batch in progress_bar:
        model.zero_grad()
        # Move the (input_ids, attention_mask, labels) triple to the device.
        batch = tuple(b.to(DEVICE) for b in batch)

        inputs = {
            "input_ids": batch[0],
            "attention_mask": batch[1],
            "labels": batch[2],
        }

        outputs = model(**inputs)

        # When labels are supplied, the first output element is the loss.
        loss = outputs[0]
        loss.backward()

        # Clip the global gradient norm to 1.0 before the optimizer update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        # Read the scalar once and reuse it for both the running total and
        # the progress bar.
        batch_loss = loss.item()
        loss_train_total += batch_loss

        # Show the batch loss as-is. The previous code divided by len(batch),
        # which is the length of the 3-tuple (always 3), not the batch size,
        # and the model's loss is already averaged over the batch by default.
        progress_bar.set_postfix({"training_loss": "{:.3f}".format(batch_loss)})

    tqdm.write(f'\nEpoch {epoch}')
    # Mean of the per-batch losses over the epoch.
    loss_train_avg = loss_train_total / len(train_dataloader)
    tqdm.write(f'Training loss: {loss_train_avg}')
    print('----------------------------------')

    # Save the weights after every epoch as model_<epoch>.pth.
    torch.save(model.state_dict(), f"{MODEL_DIR}/model_{epoch}.pth")