In [None]:
import os
from tqdm import tqdm
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.cuda.amp as amp
import warnings
warnings.filterwarnings('ignore')

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's hash seed environment variable, the `random` module, NumPy,
    and PyTorch (CPU and all CUDA devices), and puts cuDNN into deterministic
    mode.

    Args:
        seed: Integer seed applied to every generator.
    """
    import random  # local import: the notebook's import cell does not bring in `random`

    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seeds every visible GPU; silently ignored when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # benchmark=True lets cuDNN auto-tune and pick different kernels from run to run,
    # which defeats the deterministic setting above, so it must be off.
    torch.backends.cudnn.benchmark = False

seed_everything(41)

In [None]:
from transformers import AutoModel,AutoModelForSequenceClassification, AutoConfig, AutoTokenizer
from transformers import get_scheduler, get_cosine_with_hard_restarts_schedule_with_warmup

# Hugging Face checkpoint name shared by the tokenizer, the config and the model weights.
checkpoint = "microsoft/codebert-base"
# Alternative checkpoints (disabled):
# checkpoint = "microsoft/graphcodebert-base"
# checkpoint = 'klue/roberta-large'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
config = AutoConfig.from_pretrained(checkpoint)
# One output per input pair; the training and inference cells pass it through a sigmoid.
config.num_labels = 1
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, config=config).to(device)

In [None]:
# When an input has to be truncated, drop tokens from the left so the end of the text is kept.
tokenizer.truncation_side = 'left'

# `fold` picks which saved checkpoint to load. It is assigned here because this cell runs
# before the later `fold = 0` cell; without it a fresh Restart & Run All fails with NameError.
# Keep the two assignments in sync when switching folds.
fold = 0
# map_location lets a checkpoint that was saved on a GPU be loaded on a CPU-only machine.
model.load_state_dict(
    torch.load('../input/codesim/model{}.pth'.format(fold), map_location=device)
)

In [None]:
# Root folder whose sub-folders each hold the solution scripts of one problem.
code_folder = '../input/codesim/open/code'
# os.listdir returns entries in arbitrary order; sorting keeps the row order of the
# resulting DataFrame (and therefore the seeded fold split) reproducible across runs.
problem_folders = sorted(os.listdir(code_folder))

In [None]:
def preprocess_script(script):
    """Read a source file and return a lightly normalized copy of its text.

    Normalization steps, applied line by line:
      * lines whose first non-blank character is '#' are dropped;
      * everything from the first '#' on a line is cut off;
      * every run of four spaces becomes a tab;
      * lines that end up empty are dropped, so consecutive blank lines vanish.

    Args:
        script: Path of the UTF-8 encoded file to read.

    Returns:
        The surviving lines joined with a single newline.
    """
    with open(script, 'r', encoding='utf-8') as source:
        raw_lines = source.readlines()

    cleaned = []
    for raw in raw_lines:
        # Whole-line comment: nothing to keep.
        if raw.lstrip().startswith('#'):
            continue
        text = raw.rstrip()
        hash_pos = text.find('#')
        if hash_pos != -1:
            # Trailing comment: keep only what precedes the first '#'.
            text = text[:hash_pos]
        text = text.replace('\n', '').replace('    ', '\t')
        if text:
            cleaned.append(text)
    return '\n'.join(cleaned)

# Parallel lists: the cleaned text of every script and the problem it belongs to.
preproc_scripts = []
problem_nums = []

for problem_folder in tqdm(problem_folders):
    folder_path = os.path.join(code_folder, problem_folder)
    # os.listdir order is arbitrary; sort so the collected rows are reproducible.
    scripts = sorted(os.listdir(folder_path))
    if not scripts:
        # An empty folder has no file name to read the problem number from.
        continue
    # The problem number is the part of the file name before the first underscore.
    problem_num = scripts[0].split('_')[0]
    preproc_scripts.extend(
        preprocess_script(os.path.join(folder_path, script)) for script in scripts
    )
    problem_nums.extend([problem_num] * len(scripts))

In [None]:
# One row per script: its cleaned source text and the problem number it was filed under.
df = pd.DataFrame(data = {'code':preproc_scripts, 'problem_num':problem_nums})
df.head()

In [None]:
# Tokenize every script once and record how many tokens it produced.
df['tokens'] = df['code'].apply(tokenizer.tokenize)
df['len'] = df['tokens'].apply(len)

# Keep only scripts of at most 512 tokens; the index is reset so it runs 0..n-1 again.
ndf = df[df['len'] <= 512].reset_index(drop=True)
# Work on a copy so the fold column added later does not touch `ndf`.
train_df = ndf.copy()

In [None]:
from sklearn.model_selection import StratifiedKFold

# 5-fold split stratified on the problem number, shuffled with a fixed seed.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=41)
for f, (train_idx, val_idx) in enumerate(kf.split(range(len(train_df)), y=train_df['problem_num'])):
    # Tag each row with the index of the split in which it is part of the validation side.
    train_df.loc[val_idx, 'fold'] = f

In [None]:
# Fold held out for validation; also used in the checkpoint and output file names.
fold = 0

In [None]:
# Rows outside the selected fold are used for training, rows inside it for validation.
# Both frames get a fresh 0..n-1 index.
df_train = train_df[train_df['fold'] != fold].reset_index(drop=True)
df_val = train_df[train_df['fold'] == fold].reset_index(drop=True)

In [None]:
!pip install rank_bm25

In [None]:
from rank_bm25 import BM25Okapi
from itertools import combinations

# Training-fold scripts, in the same row order as `df_train`.
codes = df_train['code'].to_list()

# Reuse the token lists already stored in the `tokens` column (computed with the same
# tokenizer when the frame was built) instead of tokenizing every script a second time.
tokenized_corpus = df_train['tokens'].to_list()
# BM25 index over the training scripts; corpus position i corresponds to row i of `df_train`.
bm25 = BM25Okapi(tokenized_corpus)

In [None]:
# Pre-generated training pair files (disabled; pairs are built in the notebook instead):
# train = pd.read_csv("../input/codesim/train_data.csv")
# train = pd.read_csv("../input/codesim/trainfold0_1.csv")
# Test pairs to score and the submission template to fill in.
test = pd.read_csv("../input/codesim/open/test.csv")
sub = pd.read_csv("../input/codesim/open/sample_submission.csv")

In [None]:
def process(script):
    """Normalize source code given as a string.

    Applies the same line-by-line cleaning as the file-based preprocessing:
    whole-line '#' comments are dropped, anything from the first '#' on a line
    is cut off, every run of four spaces becomes a tab, and lines that end up
    empty are removed.

    Args:
        script: Source code as a single string.

    Returns:
        The surviving lines joined with a single newline.
    """
    kept = []
    for raw in script.split('\n'):
        # Whole-line comment: nothing to keep.
        if raw.lstrip().startswith('#'):
            continue
        text = raw.rstrip()
        cut = text.find('#')
        if cut >= 0:
            # Trailing comment: keep only what precedes the first '#'.
            text = text[:cut]
        text = text.replace('\n', '').replace('    ', '\t')
        if text:
            kept.append(text)
    return '\n'.join(kept)

# Clean both sides of every test pair with the string-based preprocessing function.
test['code1'] = test['code1'].map(process)
test['code2'] = test['code2'].map(process)

In [None]:
class BertSet(Dataset):
    """Dataset of code pairs that are tokenized on access.

    Args:
        dataset: DataFrame with `code1` and `code2` columns, looked up by
            index label through `.loc`.
        labels: Sequence of labels indexed by the same `idx`; only read when
            `mode` is 'train'.
        mode: 'train' to attach a `label` entry to every item; any other value
            returns the encoded inputs only.
    """

    def __init__(self, dataset, labels, mode):
        self.dataset, self.labels, self.mode = dataset, labels, mode

    def __len__(self):
        """Number of pairs in the underlying frame."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Encode pair `idx` with the module-level tokenizer."""
        first = self.dataset.loc[idx, 'code1']
        second = self.dataset.loc[idx, 'code2']
        encoded = tokenizer(
            first,
            second,
            return_tensors='pt',
            max_length=512,
            padding='max_length',
            truncation=True,
            add_special_tokens=True,
            return_token_type_ids=True,
        )
        # return_tensors='pt' adds a leading batch dimension of size 1; remove it so the
        # DataLoader can stack items itself.
        for key in ('input_ids', 'attention_mask', 'token_type_ids'):
            encoded[key] = encoded[key].squeeze(0)

        if self.mode == 'train':
            encoded['label'] = self.labels[idx]
        return encoded

In [None]:
# def get_dataloader(data):
#     problems = data['problem_num'].unique().tolist()
#     problems.sort()
#     total_positive_pairs = []
#     total_negative_pairs = []

#     for problem in tqdm(problems):
#         solution_codes = data[data['problem_num'] == problem]['code']
#         solution_codes_indices = solution_codes.index.to_list()
#         sample = data[data['problem_num'] == problem]['code'].sample(frac=1).reset_index(drop=True)

#         length = len(sample) // 2
#         positive_pairs = [(sample[2*n], sample[2*n + 1]) for n in range(length)]
#         negative_pairs = []

#         first_tokenized_code = tokenizer.tokenize(positive_pairs[0][0])
#         negative_code_scores = bm25.get_scores(first_tokenized_code)
#         negative_code_ranking = negative_code_scores.argsort()[::-1] # 내림차순
#         ranking_idx = 0
        
#         for i in range(len(sample)):
#             if length*3/2 < i:
#                 break
#             solution_code = sample[i]
#             negative_solutions = []
#             high_score_idx = negative_code_ranking[ranking_idx]

#             if high_score_idx not in solution_codes_indices:
#                 negative_solutions.append(data['code'].iloc[high_score_idx])
#             ranking_idx += 1

#             for negative_solution in negative_solutions:
#                 negative_pairs.append((solution_code, negative_solution))

#         total_positive_pairs.extend(positive_pairs)
#         total_negative_pairs.extend(negative_pairs)

#     pos_code1 = list(map(lambda x:x[0],total_positive_pairs))
#     pos_code2 = list(map(lambda x:x[1],total_positive_pairs))

#     neg_code1 = list(map(lambda x:x[0],total_negative_pairs))
#     neg_code2 = list(map(lambda x:x[1],total_negative_pairs))

#     pos_label = [1]*len(pos_code1)
#     neg_label = [0]*len(neg_code1)
    
#     pos_code1.extend(neg_code1)
#     total_code1 = pos_code1
#     pos_code2.extend(neg_code2)
#     total_code2 = pos_code2
#     pos_label.extend(neg_label)
#     total_label = pos_label
#     df = pd.DataFrame(data={
#         'code1':total_code1,
#         'code2':total_code2,
#         'similar':total_label
#     })
#     df = df.sample(frac=1).reset_index(drop=True)
    
#     dataset = BertSet(df, df['similar'].values, 'train')
#     data_loader = DataLoader(dataset, batch_size=16, shuffle=True, num_workers=4)
    
#     return data_loader, df

In [None]:
def get_dataloader(data, batch_size=16, num_workers=4):
    """Build a shuffled training DataLoader of positive and hard-negative code pairs.

    For every problem, its solutions are shuffled and paired off two by two as
    positive pairs (label 1). The first code of the first positive pair is used
    as a BM25 query; the highest-scoring corpus entries that do not belong to
    the same problem are paired with randomly drawn solutions of the problem as
    negative pairs (label 0) until there are as many negatives as positives.

    Args:
        data: DataFrame with `code` and `problem_num` columns.
            NOTE(review): BM25 ranks are corpus positions and are compared with
            the index labels of `data` and used with `.iloc`, so `data` is
            assumed to be the frame the module-level `bm25` index was built
            from, with a 0..n-1 index — confirm for any new caller.
        batch_size: Batch size of the returned loader.
        num_workers: Worker processes of the returned loader.

    Returns:
        A shuffling DataLoader over a `BertSet` in 'train' mode.
    """
    total_positive_pairs = []
    total_negative_pairs = []

    for problem in tqdm(sorted(data['problem_num'].unique().tolist())):
        solution_codes = data.loc[data['problem_num'] == problem, 'code']
        if len(solution_codes) < 2:
            # A single solution cannot form a positive pair (and would leave no query code).
            continue
        # Set membership is O(1); this is tested once per ranked candidate below.
        own_indices = set(solution_codes.index.to_list())
        sample = solution_codes.sample(frac=1).reset_index(drop=True)

        positive_pairs = [
            (sample[2 * n], sample[2 * n + 1]) for n in range(len(sample) // 2)
        ]

        # Rank the whole corpus against one solution of this problem, best match first.
        query_tokens = tokenizer.tokenize(positive_pairs[0][0])
        ranking = bm25.get_scores(query_tokens).argsort()[::-1]

        negative_pairs = []
        # Iterating the ranking (instead of indexing with a counter) stops cleanly if the
        # corpus runs out of candidates before enough negatives are found.
        for candidate_idx in ranking:
            if len(negative_pairs) >= len(positive_pairs):
                break
            anchor = sample.sample().values[0]
            if candidate_idx not in own_indices:
                negative_pairs.append((anchor, data['code'].iloc[candidate_idx]))

        total_positive_pairs.extend(positive_pairs)
        total_negative_pairs.extend(negative_pairs)

    # Positives first, then negatives; the frame is shuffled right after.
    all_pairs = total_positive_pairs + total_negative_pairs
    labels = [1] * len(total_positive_pairs) + [0] * len(total_negative_pairs)
    pairs_df = pd.DataFrame(data={
        'code1': [first for first, _ in all_pairs],
        'code2': [second for _, second in all_pairs],
        'similar': labels,
    })
    pairs_df = pairs_df.sample(frac=1).reset_index(drop=True)

    dataset = BertSet(pairs_df, pairs_df['similar'].values, 'train')
    return DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)

In [None]:
# for i in range(15):
#     train_loader, train = get_dataloader(df_train)
#     train.to_csv('./train{}.csv'.format(i), index=False)

In [None]:
# Test pairs carry no labels, so the dataset is built in 'test' mode.
testset = BertSet(test, None, 'test')

# train_loader, train = get_dataloader(df_train)
# shuffle=False keeps predictions in the same row order as the test frame.
test_loader = DataLoader(testset, batch_size=16, shuffle=False, num_workers=4)

In [None]:
# train = pd.read_csv("../input/codesim/train1/train2.csv")
# trainset = BertSet(train, train['similar'].values, 'train')
# valset = BertSet(df_val, df_val['similar'].values, 'train')
# testset = BertSet(test, None, 'test')

# train_loader = DataLoader(trainset, batch_size=16, shuffle=True, num_workers=4)
# val_loader = DataLoader(valset, batch_size=16, shuffle=True, num_workers=4)
# test_loader = DataLoader(testset, batch_size=16, shuffle=False, num_workers=4)

In [None]:
# AdamW over all model parameters with a fixed learning rate (the scheduler below is disabled).
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
# BCELoss expects probabilities; the training loop applies a sigmoid to the model output first.
criterion = nn.BCELoss()
# NOTE(review): this scaler is created but the training loop never uses it — confirm whether
# mixed-precision training was intended.
scaler = amp.GradScaler()
# scheduler = get_cosine_with_hard_restarts_schedule_with_warmup(optimizer=optimizer, num_warmup_steps=1, num_training_steps=len(train_loader)*epochs,)

In [None]:
def cal_acc(x, y):
    """Fraction of entries where thresholded predictions and targets agree.

    Both tensors are binarized at 0.5 and squeezed before being compared.

    Args:
        x: Tensor of predicted probabilities.
        y: Tensor of targets; its first dimension is used as the sample count.

    Returns:
        Number of agreeing entries divided by `y.size(0)`, as a NumPy float.
    """
    predicted = x.gt(0.5).long().squeeze()
    target = y.gt(0.5).long().squeeze()
    n_correct = predicted.eq(target).sum().detach().cpu().numpy()
    return n_correct / y.size(0)

In [None]:
epochs = 15

for epoch in range(epochs):
    # Pairs are re-sampled at the start of every epoch, so each epoch sees a fresh set.
    train_loader = get_dataloader(df_train)

    train_acc = 0
    train_losses = 0
    model.train()
    for batch_id, batch in enumerate(tqdm(train_loader)):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        label = batch['label'].to(device)

        # Keyword arguments guard against a mismatch with the model's positional order.
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )

        # With num_labels = 1 the model emits one logit per pair. Squeeze only that trailing
        # dimension: a bare squeeze() would also collapse a final batch of size 1 to a 0-dim
        # tensor, which BCELoss rejects against a target of shape (1,).
        outputs = torch.sigmoid(outputs[0]).squeeze(-1)

        loss = criterion(outputs, label.float())

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # No learning-rate scheduler is stepped; the optimizer's learning rate stays fixed.
        train_acc += cal_acc(outputs.detach().cpu(), label.detach().cpu())
        train_losses += loss.item()

    # Mean accuracy and loss over the batches of this epoch.
    print("Epoch: {}, acc: {}, loss: {}".format(epoch+1, train_acc/(batch_id+1), train_losses/(batch_id+1)))

# Save the weights once, after the last epoch, under the current fold's name.
torch.save(model.state_dict(), "./model{}.pth".format(fold))

In [None]:
# Per-batch probability arrays, concatenated after the loop.
batch_probs = []

model.eval()
with torch.no_grad():
    for batch in tqdm(test_loader):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch['token_type_ids'].to(device)
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        # Flatten to one probability per pair so the result is 1-D, not (batch, 1).
        batch_probs.append(torch.sigmoid(outputs[0]).reshape(-1).cpu().numpy())

# 1-D array of probabilities in test-row order (empty if the loader yielded nothing).
probs = np.concatenate(batch_probs) if batch_probs else np.empty(0, dtype=np.float32)
# Round to the nearest class and store as integers so the submission holds 0/1, not 0.0/1.0.
preds = np.around(probs).astype(int)

In [None]:
# Save the raw predicted probabilities for this fold and preview them.
df_prob = pd.DataFrame(probs)
df_prob.to_csv('./prob{}.csv'.format(fold), index=False)
df_prob.head()

In [None]:
# Fill the submission template with the rounded predictions and write it out per fold.
sub['similar'] = preds
sub.to_csv('./sub{}.csv'.format(fold), index=False)
sub.head()

In [None]:
# How many test pairs were predicted as each class.
sub['similar'].value_counts()