In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold 
import torch 
from torch.utils.data import Dataset, DataLoader 
from transformers import AutoTokenizer 
import os 
import random 
import torch.nn as nn 
from transformers import AutoConfig 
from transformers import AutoModel  
from transformers import get_cosine_schedule_with_warmup 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error 
from transformers import AdamW 
import wandb 
from tqdm.notebook import tqdm 

# Authenticate with Weights & Biases up front so later wandb.init / wandb.log calls work.
wandb.login()

# --- Reproducibility and data ------------------------------------------------
SEED = 0  # shared seed: default for set_seed and random_state inside create_folds
N_FOLDS = 5  # number of folds produced by create_folds
INPUT_DIR = '/content/drive/MyDrive/data'  # NOTE(review): not referenced by the visible cells
MAX_LEN = 320  # tokenizer max_length; every text is truncated/padded to this many tokens

# --- Model and optimisation --------------------------------------------------
MODEL_NAME = 'bert-base-uncased'  # checkpoint used for tokenizer, config and encoder
TOKENIZER = AutoTokenizer.from_pretrained(MODEL_NAME)
LR = 2e-5  # learning rate passed to AdamW
WEIGHT_DECAY = 1e-6  # weight decay passed to AdamW
N_EPOCHS = 5  # number of passes over the training loader
WARM_UP_RATIO = 0.1  # fraction of all optimiser steps used for LR warm-up

BS = 32  # training batch size
ACCUMULATE = 100  # NOTE(review): only referenced from commented-out gradient-accumulation code
MIXED_PRECISION = False  # when True the training loop runs under autocast and uses `scaler`

EXP_NAME = 'baseline'  # NOTE(review): not referenced by the visible cells

# --- Hardware ----------------------------------------------------------------
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
# The scaler is only consulted when MIXED_PRECISION is on; building it disabled
# otherwise avoids the "CUDA is not available" warning on CPU-only machines.
scaler = torch.cuda.amp.GradScaler(
    enabled=MIXED_PRECISION and torch.cuda.is_available()
)

def create_folds(data):
    """Return a shuffled copy of ``data`` with a stratified ``kfold`` column.

    The continuous ``likes_count`` target is discretised into
    ``floor(1 + log2(n))`` equal-width bins (Sturges' rule) and those bins are
    used as the stratification labels for ``StratifiedKFold``.

    Args:
        data: DataFrame containing at least a ``likes_count`` column.

    Returns:
        A new DataFrame (rows shuffled with ``SEED``, index reset to 0..n-1)
        whose ``kfold`` column holds the validation fold id (0..N_FOLDS-1) of
        each row. The input frame itself is left untouched.
    """
    # Shuffle first: sample() + reset_index() yield a fresh frame, so every
    # later assignment lands on the copy instead of the caller's DataFrame.
    data = data.sample(frac=1, random_state=SEED).reset_index(drop=True)
    data["kfold"] = -1

    num_bins = int(np.floor(1 + np.log2(len(data))))

    # Keep the bin codes in a local Series; they are only needed for the split
    # and never have to become a (temporary) column of the frame.
    bins = pd.cut(data["likes_count"], bins=num_bins, labels=False)

    kf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)
    for fold, (_, valid_idx) in enumerate(kf.split(X=data, y=bins.values)):
        # valid_idx is positional, which equals the label index after reset_index.
        data.loc[valid_idx, "kfold"] = fold

    return data

def set_seed(seed=SEED):
  """Seed the Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

  Args:
    seed: integer seed applied to every generator (defaults to ``SEED``).
  """
  # Seed with the integer itself (the function object used to be passed here).
  random.seed(seed)
  # PYTHONHASHSEED is read at interpreter start-up, so this only influences
  # processes launched after this call, not the hashing of the running kernel.
  os.environ["PYTHONHASHSEED"] = str(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  torch.cuda.manual_seed(seed)
  torch.cuda.manual_seed_all(seed)
  # Prefer deterministic cuDNN kernels and turn off the autotuner, which may
  # otherwise pick different algorithms from run to run.
  torch.backends.cudnn.deterministic = True
  torch.backends.cudnn.benchmark = False

class QiitaDataset(Dataset):
  """Dataset yielding tokenised article bodies together with their like counts.

  Args:
    df: DataFrame with a ``body`` (text) column and a ``likes_count`` column.
      Its index may be arbitrary; rows are addressed by position.
  """

  def __init__(self, df):
    self.texts = df['body'].tolist()
    # Stay a Series (validation_loop reads it as y_true) but renumber it 0..n-1
    # so it lines up positionally with self.texts whatever index df carried.
    self.labels = df['likes_count'].reset_index(drop=True)

  def __len__(self):
    return len(self.texts)

  def __getitem__(self, item):
    """Tokenise sample ``item`` and return its tensors.

    Returns:
      dict with ``input_ids``, ``attention_mask`` and ``token_type_ids``
      (long tensors of length ``MAX_LEN``) and ``label`` (double scalar).
    """
    text = self.texts[item]
    # Positional lookup, matching the list indexing used for the text above.
    label = self.labels.iloc[item]

    tok = TOKENIZER.encode_plus(
        text,
        max_length=MAX_LEN,
        truncation=True,
        padding='max_length',
        return_attention_mask=True,
        return_token_type_ids=True,
    )

    return {
        "input_ids": torch.tensor(tok['input_ids'], dtype=torch.long),
        "attention_mask": torch.tensor(tok["attention_mask"], dtype=torch.long),
        "token_type_ids": torch.tensor(tok["token_type_ids"], dtype=torch.long),
        "label": torch.tensor(label, dtype=torch.double),
    }

class QiitaModel(nn.Module):
  """Pretrained encoder followed by a single linear unit for scalar regression."""

  def __init__(self):
    super().__init__()
    self.config = AutoConfig.from_pretrained(MODEL_NAME)
    self.bert = AutoModel.from_pretrained(MODEL_NAME)
    hidden_dim = self.config.hidden_size
    self.regressor = nn.Linear(hidden_dim, 1)

  def forward(self, input_ids, attention_mask, token_type_ids):
    """Encode the batch and regress from the hidden state at position 0.

    Returns:
      Tensor with one prediction per sample in its last dimension of size 1.
    """
    encoded = self.bert(
        input_ids,
        attention_mask=attention_mask,
        token_type_ids=token_type_ids,
    )
    # Position 0 of the sequence axis ([CLS] for BERT-style tokenizers).
    first_token = encoded['last_hidden_state'][:, 0]
    return self.regressor(first_token)

  def loss_fn(self, logits, label):
    """Mean absolute error between column 0 of ``logits`` and ``label``."""
    predictions = logits[:, 0]
    return nn.functional.l1_loss(predictions, label, reduction='mean')

def validation_loop(valid_loader, model):
  """Run ``model`` over ``valid_loader`` and return the mean absolute error.

  Args:
    valid_loader: loader yielding dicts with ``input_ids``, ``attention_mask``
      and ``token_type_ids``; its ``dataset.labels`` supplies the targets.
    model: callable returning a tensor whose column 0 holds the predictions.

  Returns:
    MAE between ``valid_loader.dataset.labels`` and the collected predictions.
  """
  model.eval()
  collected = []
  # Gradients are never needed here, so the whole pass runs under no_grad.
  with torch.no_grad():
    for batch in tqdm(valid_loader):
      batch_out = model(
          batch['input_ids'].to(device),
          batch['attention_mask'].to(device),
          batch['token_type_ids'].to(device),
      )
      collected.append(batch_out[:, 0])
  y_pred = torch.hstack(collected).cpu().numpy()
  targets = valid_loader.dataset.labels
  return mean_absolute_error(targets, y_pred)


In [None]:
# Read the twelve monthly 2020 exports and stack them with one concat call;
# growing a frame inside the loop re-copies all earlier rows on every pass.
monthly_frames = [
  pd.read_csv(f"../data_collection/data/2020-{month:02}.csv", encoding='utf8')
  for month in range(1, 13)
]
df = pd.concat(monthly_frames, ignore_index=True)

# Keep only the regression target and the text, dropping rows missing either.
df = df[['likes_count', 'body']].dropna()

In [None]:
# Seed every RNG before anything stochastic runs (split, weight init of the
# regression head, loader shuffling) so that a fresh run is repeatable.
set_seed()

# Hold-out split: 70% train / 30% validation, fixed by SEED.
# (create_folds(df) is available when a k-fold split is wanted instead.)
train_df, valid_df = train_test_split(df, test_size=0.3, random_state=SEED)

# drop=True: renumber rows 0..n-1 without keeping the old index as a column.
train_df = train_df.reset_index(drop=True)
valid_df = valid_df.reset_index(drop=True)

# set dataset
train_dataset = QiitaDataset(train_df)
valid_dataset = QiitaDataset(valid_df)

train_loader = DataLoader(train_dataset, batch_size=BS,
                          pin_memory=True, shuffle=True, drop_last=True, num_workers=0)
valid_loader = DataLoader(valid_dataset, batch_size=32,
                          pin_memory=True, shuffle=False, drop_last=False, num_workers=0)


# set models
model = QiitaModel()
model.to(device)

# set optimizer; the scheduler is stepped once per batch, hence the step count
optimizer = AdamW(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
max_train_steps = N_EPOCHS * len(train_loader)
warmup_steps = int(max_train_steps * WARM_UP_RATIO)
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=max_train_steps
)

wandb.init(project='Qiita_BERT')


def _forward_loss(batch):
  """Run the model on one batch and return its training loss."""
  logits = model(
      batch['input_ids'].to(device),
      batch['attention_mask'].to(device),
      batch['token_type_ids'].to(device)
  )
  return model.loss_fn(logits, batch['label'].float().to(device))


optimizer.zero_grad()
# Start from +inf so the first validated epoch is always checkpointed, even
# when the MAE is larger than any fixed sentinel value.
valid_best_loss, all_step = float('inf'), 0

for epoch in range(N_EPOCHS):
  print('epoch: ', epoch)
  # validation_loop leaves the model in eval mode, so switch back every epoch.
  model.train()
  for d in tqdm(train_loader):
    all_step += 1

    # One optimiser update per batch (gradient accumulation is not used).
    if MIXED_PRECISION:
      with torch.cuda.amp.autocast():
        loss = _forward_loss(d)
      scaler.scale(loss).backward()
      scaler.step(optimizer)
      scaler.update()
    else:
      loss = _forward_loss(d)
      loss.backward()
      optimizer.step()
    optimizer.zero_grad()
    scheduler.step()

    wandb.log({
        "train_loss": loss.item(),
    })

  valid_loss = validation_loop(valid_loader, model)
  if valid_best_loss > valid_loss:
    valid_best_loss = valid_loss
    # Save CPU weights, then move the model back for the next epoch.
    torch.save(model.to('cpu').state_dict(), '/content/drive/MyDrive/Colab Notebooks/modelv2.pth')
    model.to(device)

  wandb.log({
      "valid_loss": valid_loss,
      "valid_best_loss": valid_best_loss,
  })

wandb.finish()

print(valid_best_loss)