In [None]:
# Install the Hugging Face `transformers` package that the import cell below relies on.
!pip install transformers

In [None]:
import numpy as np
import pandas as pd
import copy
import gc
import os
import re
import time
from tqdm import tqdm
from time import sleep

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F

import transformers
from transformers import AutoTokenizer, AutoModel, AutoTokenizer, AutoConfig, DataCollatorWithPadding, RobertaPreTrainedModel

### Config

In [None]:
# Notebook-wide settings shared by the cells below.
# In the cells visible here only model_name, val_bsize, max_length, device and
# tokenizer are read; the remaining entries are kept as-is.
CONFIG = dict(
    seed=42,
    epochs=3,
    model_name="huggingface/CodeBERTa-small-v1",
    train_bsize=16,
    val_bsize=64,        # batch size of the inference DataLoader
    max_length=256,      # padding / truncation length used when tokenizing
    learning_rate=0.004,
    scheduler='get_linear_schedule_with_warmup',
    weight_decay=0.01,
    n_fold=2,
    # Use the first GPU when one is available, otherwise fall back to the CPU.
    device=torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu"),
)

# The tokenizer is stored next to the other settings so helpers can reach it via CONFIG.
CONFIG["tokenizer"] = AutoTokenizer.from_pretrained(CONFIG["model_name"])
print(CONFIG["device"])

### Load the test data

In [None]:
# Read the test pairs from the mounted drive; the trailing `.shape` is shown
# as the cell output.
test_csv_path = './drive/MyDrive/data/test.csv'
test_df = pd.read_csv(test_csv_path)
test_df.shape

(179700, 3)

### step function

In [None]:
# Decision cutoff that step_function reads from CONFIG when it is called.
CONFIG['threshold'] = 0.5
def step_function(value, threshold=None):
    """Binarise scores: 1 where a score reaches the threshold, 0 otherwise.

    Args:
        value: Tensor of scores of any shape; it is flattened to 1-D.
        threshold: Cutoff to compare against. When omitted,
            ``CONFIG['threshold']`` is looked up at call time.

    Returns:
        A 1-D ``torch.int32`` tensor on the same device as ``value``.
    """
    if threshold is None:
        threshold = CONFIG['threshold']
    # Comparing against a plain number (instead of a tensor pinned to
    # CONFIG['device']) works whichever device `value` lives on, and
    # reshape() also accepts non-contiguous input where view() would raise.
    return (value.reshape(-1) >= threshold).int()

In [None]:
def sub_tokenizing(dataset):
    """Tokenize the `code1` and `code2` columns into a single encoding.

    Each column is tokenized separately with ``CONFIG['tokenizer']``, padded
    and truncated to ``CONFIG['max_length']``. Every field produced for
    `code2` is stored in the `code1` encoding under the same key with a
    "2" suffix.

    Returns:
        (encoding, number of `code1` entries)
    """
    first_codes = dataset['code1'].tolist()
    second_codes = dataset['code2'].tolist()
    print("codes1, codes2 length:", len(first_codes), len(second_codes))

    tokenizer = CONFIG['tokenizer']
    # Identical options for both sides of the pair.
    shared_options = dict(
        return_tensors='pt',
        padding='max_length',
        truncation=True,
        max_length=CONFIG['max_length'],
    )

    encoded = tokenizer(first_codes, **shared_options)
    encoded_second = tokenizer(second_codes, **shared_options)

    # Merge the second snippet's fields in under suffixed keys.
    for field, tensor in encoded_second.items():
        encoded[f"{field}2"] = tensor

    return encoded, len(first_codes)


# Dataset construction.
class SubCustomDataset(Dataset):
    """Map-style dataset over a dict-like of index-aligned fields.

    Indexing returns a plain dict holding, for every field, the entry at the
    requested position.
    """

    def __init__(self, tokenized_dataset, length):
        # The length is supplied by the caller rather than derived from the data.
        self.length = length
        self.tokenized_dataset = tokenized_dataset

    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        sample = {}
        for field, column in self.tokenized_dataset.items():
            sample[field] = column[idx]
        return sample
    
    
def sub_pro_dataset(dataset):
    """Turn a frame of code pairs into an inference DataLoader.

    The frame is tokenized with `sub_tokenizing`, wrapped in
    `SubCustomDataset`, and served in batches of ``CONFIG['val_bsize']``
    without shuffling and without dropping the last partial batch.
    """
    encoded, n_rows = sub_tokenizing(dataset)
    inference_set = SubCustomDataset(encoded, n_rows)
    print("Custom dataset size:", len(inference_set))

    loader_options = dict(
        batch_size=CONFIG['val_bsize'],
        shuffle=False,
        drop_last=False,
    )
    return DataLoader(inference_set, **loader_options)

In [None]:
# Tokenize the test pairs and wrap them in the unshuffled inference DataLoader.
test_dataloader = sub_pro_dataset(test_df)

codes1, codes2 length: 179700 179700
Custom dataset size: 179700


In [None]:
# Customized model
class CustomModel(nn.Module):
    """Pair classifier built on one shared pretrained encoder.

    Both inputs go through the same encoder, the per-token outputs are
    averaged over dim 1, and the cosine similarity of the two pooled vectors
    is mapped to two output scores by a small head.
    """

    def __init__(self, config):
        """Load the encoder named by ``CONFIG['model_name']`` and build the head.

        Args:
            config: Configuration object forwarded to ``AutoModel.from_pretrained``.
        """
        super().__init__()
        # The attribute names below determine the state_dict keys, so they
        # must stay as they are for saved checkpoints to load.
        self.model = AutoModel.from_pretrained(CONFIG['model_name'], config=config)
        self.similarity_fn = nn.CosineSimilarity()
        self.sequential = nn.Sequential(
            nn.Linear(1, 64),
            nn.BatchNorm1d(64),
            nn.Linear(64, 2)
        )

    def forward(self, input_ids=None, attention_mask=None,
                input_ids2=None, attention_mask2=None, labels=None):
        """Score one batch of input pairs.

        Args:
            input_ids, attention_mask: Encoder inputs for the first element of each pair.
            input_ids2, attention_mask2: Encoder inputs for the second element.
            labels: Accepted for interface compatibility; not used here.

        Returns:
            Tensor with two raw scores per pair (no softmax or sigmoid is applied).
        """
        # No gc.collect() calls here: forcing a full collection several times
        # per forward pass only adds per-batch overhead and does not affect
        # the result.
        hidden1 = self.model(input_ids, attention_mask=attention_mask)[0]
        hidden2 = self.model(input_ids2, attention_mask=attention_mask2)[0]

        # Mean over dim 1.
        # NOTE(review): this is an unweighted mean, so the attention mask is
        # not used in the average. Kept unchanged on purpose -- switching to a
        # masked mean would presumably change the outputs of already-trained
        # checkpoints; confirm against the training code before touching it.
        pooled1 = hidden1.mean(dim=1)
        pooled2 = hidden2.mean(dim=1)

        # Normalize
        a_norm = F.normalize(pooled1, p=2, dim=1)
        b_norm = F.normalize(pooled2, p=2, dim=1)

        # One similarity value per pair, given a trailing feature axis so it
        # matches the head's Linear(1, 64) input.
        sim_score = self.similarity_fn(a_norm, b_norm).unsqueeze(-1)
        return self.sequential(sim_score)

In [None]:
# Force garbage collection before the model is instantiated in the next cell;
# the value echoed under this cell is the return value of the last call.
gc.collect()
gc.collect()
gc.collect()

0

In [None]:
# Saved checkpoints, loaded one after another by the prediction loop.
MODEL_PATH = ['/content/drive/MyDrive//Loss-Fold-%d.pt' % fold for fold in (0, 1)]

# A single model object is created here; the prediction loop swaps each
# checkpoint's weights into it.
MODEL_CONFIG = AutoConfig.from_pretrained(CONFIG['model_name'])
model = CustomModel(MODEL_CONFIG)
model = model.to(CONFIG['device'])

In [None]:
# Prediction store: preds_lst[m][k] is the model output for batch k under checkpoint m.
preds_lst = []

with torch.no_grad():
  for i, path in enumerate(MODEL_PATH):
    print(f'===getting predictions for model {i+1}===')

    # The same model object is reused; only its weights change per checkpoint.
    model.load_state_dict(torch.load(path, map_location=CONFIG['device']))
    model.eval()

    preds = []

    # Iterate the loader directly: the batch index was never used, and the
    # former per-batch sleep(0.1) only added idle time to every batch.
    bar = tqdm(test_dataloader, total=len(test_dataloader))
    for items in bar:
      item = {key: val.to(CONFIG['device']) for key, val in items.items()}
      outputs = model(**item)

      # Keep finished batches on the CPU so results do not accumulate on the GPU.
      preds.append(outputs.cpu())
    preds_lst.append(preds)

In [None]:
a = preds_lst
# Average the outputs batch-by-batch across however many models were run
# (previously hard-coded to exactly two). zip(*a) groups the k-th batch of
# every model together. `b` keeps its original layout: a one-element list
# wrapping the list of per-batch averages.
b = [[torch.stack(batch_outputs).mean(dim=0) for batch_outputs in zip(*a)]]

In [None]:
# Predicted class per pair = index of the larger averaged score.
# The model head emits two raw scores per pair, so they are compared directly.
# The earlier `np.argmax(scores > 0.5)` thresholded the raw scores first and
# therefore returned the first column above 0.5 (e.g. [1.0, 3.0] -> 0)
# instead of the larger one.
# `c` keeps its original layout: a one-element list wrapping per-batch tensors.
c = [[torch.argmax(avg_scores, dim=-1).cpu() for avg_scores in b[0]]]
c_lst = [batch_labels.tolist() for batch_labels in c[0]]

In [None]:
# Flatten the per-batch label lists into one list, preserving order.
# A nested comprehension is linear in the number of labels, whereas
# sum(list_of_lists, []) re-copies the growing result for every batch.
final_preds = [label for batch_labels in c_lst for label in batch_labels]
len(final_preds)

In [None]:
# Load the sample submission and fill its `similar` column with the predictions.
sub_f = pd.read_csv('./drive/MyDrive/data/sample_submission.csv').assign(similar=final_preds)

In [None]:
## Save the prediction results
sub_f.to_csv('./drive/MyDrive/data/subff.csv', index = False)