# CodeSimilarity file using graphcodebert

## Import module and library

In [1]:
import os, sys
import copy
import numpy as np
import pandas as pd
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import Counter
from datasets import load_metric, load_dataset, load_from_disk
from transformers import DataCollatorWithPadding, TrainingArguments, Trainer, EarlyStoppingCallback
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import RobertaForSequenceClassification

### Load Dataset and device setup

In [2]:
train_dataset = load_from_disk('../data/train_dataset_lv1')
valid_dataset = load_from_disk('../data/valid_dataset_lv1')

In [3]:
print(os.getcwd())

/home/piai/.jupyter/lab/workspaces/code_clone_detection/notebooks


In [3]:
MODEL = "/home/piai/.jupyter/lab/workspaces/code_clone_detection/notebooks/models/best_models"
MODEL2 = "/home/piai/.jupyter/lab/workspaces/code_clone_detection/notebooks/models/1/checkpoint-19000"
MODEL3 = "/home/piai/.jupyter/lab/workspaces/code_clone_detection/notebooks/models/fold3/checkpoint-1500"
MODEL4 = "/home/piai/.jupyter/lab/workspaces/code_clone_detection/notebooks/models/fold4/checkpoint-1000"
MODEL5 = "/home/piai/.jupyter/lab/workspaces/code_clone_detection/notebooks/models/fold4/checkpoint-2500"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MAX_LEN = 512

model = AutoModelForSequenceClassification.from_pretrained(MODEL)
model.to(device)
model2 = AutoModelForSequenceClassification.from_pretrained(MODEL2)
model2.to(device)
model3 = AutoModelForSequenceClassification.from_pretrained(MODEL3)
model3.to(device)
model4 = AutoModelForSequenceClassification.from_pretrained(MODEL4)
model4.to(device)
model5 = AutoModelForSequenceClassification.from_pretrained(MODEL5)
model5.to(device)

tokenizer = AutoTokenizer.from_pretrained(MODEL)
tokenizer.truncation_side = 'left'

In [4]:
_collator = DataCollatorWithPadding(tokenizer=tokenizer)
_metric = load_metric("glue", "sst2")

def example_fn(examples):
    outputs = tokenizer(examples['code1'], examples['code2'], padding=True, max_length=MAX_LEN,truncation=True)
    if 'similar' in examples:
        outputs["labels"] = examples["similar"]
    return outputs

def metric_fn(p):
    preds, labels = p
    output =  _metric.compute(references=labels, predictions=np.argmax(preds, axis=-1))
    return output

In [5]:
def seed_everything(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(seed)
    random.seed(seed)
    
seed_everything(42)

In [6]:
import gc
from knockknock import discord_sender

webhook_url='https://discord.com/api/webhooks/982252843270561912/N-lIX9ZEyAnlTJpYn-2Z7IwczmyKbqmZOM-g_fh0XrVsHiWUfj1fknlaA33aGy9JSwwh'

@discord_sender(webhook_url=webhook_url)
def do_train():
    gc.collect()
    torch.cuda.empty_cache()
    trainer.train()

In [None]:
gc.collect()
torch.cuda.empty_cache()

TEST = "../data/test.csv"
SUB = "../data/sample_submission.csv"

test_dataset = load_dataset("csv", data_files=TEST)["train"]
test_dataset = test_dataset.map(example_fn, remove_columns=["code1", "code2"])

Using custom data configuration default-4848ef6d457f4595
Reusing dataset csv (/home/piai/.cache/huggingface/datasets/csv/default-4848ef6d457f4595/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/179700 [00:00<?, ?ex/s]

In [8]:
test_size = len(test_dataset)
predictions = []
labels = []

## Setting for K-fold train and inference test dataset

In [None]:
k_fold = 5
gap = int(len(train_dataset) / k_fold)

for fold in range(k_fold):
    print("-"*100)
    print(f"{fold+1}th fold - Training start")
    print("-"*100)
    
    output_dir = './models/' + f"fold{fold+1}"
    dataset_size = len(train_dataset)
    total_ids = list(range(dataset_size))      # 전체 dataset 크기 index
    del_ids = list(range(fold*gap, (fold+1)*gap))    # 0 ~ 120,000, 120,000 ~ 240,000 과 같이 나눠줌
    training_ids = set(total_ids) - set(del_ids) # 0 ~ 600,000에서 del_ids를 빼면 그만큼 training fold가 됨
    
    fold_train_dataset = train_dataset.select(list(training_ids))
    fold_valid_dataset = train_dataset.select(del_ids)
    
    args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=3,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=32,
        gradient_accumulation_steps=4,
        warmup_steps=250,
        disable_tqdm = False,
        do_train=True,
        do_eval=True,
        save_strategy="steps",
        logging_strategy="steps",
        evaluation_strategy="steps",
        eval_steps=500,
        learning_rate=2e-5,
        optim='adamw_torch',
        # metric_for_best_model= "f1",
        save_total_limit=5,
        load_best_model_at_end=True,
    )

    trainer = Trainer(
            model=model,
            args=args,
            data_collator=_collator,
            train_dataset=fold_train_dataset,
            eval_dataset=fold_valid_dataset,
            tokenizer=tokenizer,
            compute_metrics= metric_fn,
            callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
    )
    
    do_train()
    
    print("-"*100)
    print(f"{fold+1}th fold - Inference start")
    print("-"*100)
    pred = trainer.predict(test_dataset)
    predictions.append(pred.predictions)
    
    label = np.argmax(pred.predictions, axis=-1)
    labels.append(label)

----------------------------------------------------------------------------------------------------
1th fold - Training start
----------------------------------------------------------------------------------------------------
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- E

    There is an imbalance between your GPUs. You may want to exclude GPU 0 which
    has less than 75% of the memory or cores of GPU 1. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
***** Running training *****
  Num examples = 480000
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 4
  Total optimization steps = 45000
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mnahyeonkang[0m. Use [1m`wandb login --relogin`[0m to force relogin


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




Step,Training Loss,Validation Loss,Accuracy
500,0.0098,0.02719,0.995108
1000,0.0115,0.029157,0.994017
1500,0.0144,0.033534,0.993342
2000,0.0091,0.042651,0.992383


    There is an imbalance between your GPUs. You may want to exclude GPU 0 which
    has less than 75% of the memory or cores of GPU 1. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
***** Running Evaluation *****
  Num examples = 120000
  Batch size = 32
Saving model checkpoint to ./models/fold1/checkpoint-500
Configuration saved in ./models/fold1/checkpoint-500/config.json
Model weights saved in ./models/fold1/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./models/fold1/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./models/fold1/checkpoint-500/special_tokens_map.json
Deleting older checkpoint [models/fold1/checkpoint-2000] due to args.save_total_limit
    There is an imbalance between your GPUs. You may want to exclude GPU 0 which
    has less than 75% of the memory or cores of GPU 1. You can do so by setting
    the device_ids argument to DataParallel, or by

----------------------------------------------------------------------------------------------------
1th fold - Inference start
----------------------------------------------------------------------------------------------------


----------------------------------------------------------------------------------------------------
2th fold - Training start
----------------------------------------------------------------------------------------------------


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 480000
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 4
  Total optimization steps = 45000
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss,Accuracy
500,0.0068,0.032458,0.994283
1000,0.0176,0.027827,0.993883
1500,0.0151,0.027394,0.99435
2000,0.0122,0.039131,0.991883
2500,0.0101,0.028691,0.99355
3000,0.0119,0.029672,0.994692


***** Running Evaluation *****
  Num examples = 120000
  Batch size = 32
Saving model checkpoint to ./models/fold2/checkpoint-500
Configuration saved in ./models/fold2/checkpoint-500/config.json
Model weights saved in ./models/fold2/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./models/fold2/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./models/fold2/checkpoint-500/special_tokens_map.json
Deleting older checkpoint [models/fold2/checkpoint-4500] due to args.save_total_limit
    There is an imbalance between your GPUs. You may want to exclude GPU 0 which
    has less than 75% of the memory or cores of GPU 1. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
***** Running Evaluation *****
  Num examples = 120000
  Batch size = 32
Saving model checkpoint to ./models/fold2/checkpoint-1000
Configuration saved in ./models/fold2/checkpoint-1000/config.json
Model weights 

----------------------------------------------------------------------------------------------------
2th fold - Inference start
----------------------------------------------------------------------------------------------------


----------------------------------------------------------------------------------------------------
3th fold - Training start
----------------------------------------------------------------------------------------------------


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 480000
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 4
  Total optimization steps = 45000
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss,Accuracy
500,0.0063,0.042992,0.992833
1000,0.021,0.027065,0.993967
1500,0.0184,0.015511,0.995692
2000,0.0178,0.024676,0.99515
2500,0.0235,0.021644,0.995258
3000,0.0201,0.022549,0.994817


***** Running Evaluation *****
  Num examples = 120000
  Batch size = 32
Saving model checkpoint to ./models/fold3/checkpoint-500
Configuration saved in ./models/fold3/checkpoint-500/config.json
Model weights saved in ./models/fold3/checkpoint-500/pytorch_model.bin
tokenizer config file saved in ./models/fold3/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./models/fold3/checkpoint-500/special_tokens_map.json
    There is an imbalance between your GPUs. You may want to exclude GPU 0 which
    has less than 75% of the memory or cores of GPU 1. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
***** Running Evaluation *****
  Num examples = 120000
  Batch size = 32
Saving model checkpoint to ./models/fold3/checkpoint-1000
Configuration saved in ./models/fold3/checkpoint-1000/config.json
Model weights saved in ./models/fold3/checkpoint-1000/pytorch_model.bin
tokenizer config file saved 

----------------------------------------------------------------------------------------------------
3th fold - Inference start
----------------------------------------------------------------------------------------------------


----------------------------------------------------------------------------------------------------
4th fold - Training start
----------------------------------------------------------------------------------------------------


PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 480000
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 4
  Total optimization steps = 45000
Automatic Weights & Biases logging enabled, to disable set os.environ["WANDB_DISABLED"] = "true"


Step,Training Loss,Validation Loss


***** Running Evaluation *****
  Num examples = 120000
  Batch size = 32


In [13]:
output_dir = './models/' + f"final"

args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True,
        num_train_epochs=3,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=32,
        gradient_accumulation_steps=4,
        warmup_steps=250,
        disable_tqdm = False,
        do_train=True,
        do_eval=True,
        save_strategy="steps",
        logging_strategy="steps",
        evaluation_strategy="steps",
        eval_steps=500,
        learning_rate=2e-5,
        optim='adamw_torch',
        # metric_for_best_model= "f1",
        save_total_limit=5,
        load_best_model_at_end=True,
    )

trainer = Trainer(
            model=model,
            args=args,
            data_collator=_collator,
            train_dataset=train_dataset,
            eval_dataset=valid_dataset,
            tokenizer=tokenizer,
            compute_metrics= metric_fn,
            callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
    )
############################
trainer2 = Trainer(
            model=model2,
            args=args,
            data_collator=_collator,
            train_dataset=train_dataset,
            eval_dataset=valid_dataset,
            tokenizer=tokenizer,
            compute_metrics= metric_fn,
            callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
    )
#####################
trainer3 = Trainer(
            model=model3,
            args=args,
            data_collator=_collator,
            train_dataset=train_dataset,
            eval_dataset=valid_dataset,
            tokenizer=tokenizer,
            compute_metrics= metric_fn,
            callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
    )
################
trainer4 = Trainer(
            model=model4,
            args=args,
            data_collator=_collator,
            train_dataset=train_dataset,
            eval_dataset=valid_dataset,
            tokenizer=tokenizer,
            compute_metrics= metric_fn,
            callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
    )
############################
trainer5 = Trainer(
            model=model5,
            args=args,
            data_collator=_collator,
            train_dataset=train_dataset,
            eval_dataset=valid_dataset,
            tokenizer=tokenizer,
            compute_metrics= metric_fn,
            callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
    )

print("-"*100)
print(f"Inference start")
print("-"*100)
pred = trainer.predict(test_dataset)
pred2 = trainer2.predict(test_dataset)
pred3 = trainer3.predict(test_dataset)
pred4 = trainer4.predict(test_dataset)
pred5 = trainer5.predict(test_dataset)
predictions.append(pred.predictions)
predictions.append(pred2.predictions)
predictions.append(pred3.predictions)
predictions.append(pred4.predictions)
predictions.append(pred5.predictions)

label = np.argmax(pred.predictions, axis=-1)
label2 = np.argmax(pred2.predictions, axis=-1)
label3 = np.argmax(pred3.predictions, axis=-1)
label4 = np.argmax(pred4.predictions, axis=-1)
label5 = np.argmax(pred5.predictions, axis=-1)
labels.append(label)
labels.append(label2)
labels.append(label3)
labels.append(label4)
labels.append(label5)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The following columns in the test set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: pair_id. If pair_id are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
    There is an imbalance between your GPUs. You may want to exclude GPU 0 which
    has less than 75% of the memory or cores of GPU 1. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.
***** Running Prediction *****
  Num examples = 179700
  Batch size = 32


----------------------------------------------------------------------------------------------------
Inference start
----------------------------------------------------------------------------------------------------


The following columns in the test set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: pair_id. If pair_id are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 179700
  Batch size = 32


The following columns in the test set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: pair_id. If pair_id are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 179700
  Batch size = 32


The following columns in the test set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: pair_id. If pair_id are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 179700
  Batch size = 32


The following columns in the test set  don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: pair_id. If pair_id are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 179700
  Batch size = 32


In [14]:
print(predictions)
print(labels)

[array([[ 5.27169  , -5.3973827],
       [-5.06581  ,  5.21526  ],
       [ 4.717493 , -4.670257 ],
       ...,
       [ 5.1468363, -5.170413 ],
       [-5.061063 ,  5.208819 ],
       [ 2.6489146, -2.2357914]], dtype=float32), array([[ 4.8886895, -4.8665314],
       [-3.874259 ,  3.765397 ],
       [ 2.589425 , -2.2947364],
       ...,
       [ 3.7384713, -3.5434034],
       [-3.8515565,  3.749201 ],
       [ 1.6035545, -1.3466036]], dtype=float32), array([[ 5.034718 , -5.234578 ],
       [-4.4402866,  4.5899   ],
       [ 4.8308353, -4.8489704],
       ...,
       [ 4.168168 , -4.001863 ],
       [-4.5176706,  4.670958 ],
       [-1.4308709,  1.6540235]], dtype=float32), array([[ 5.3510113, -5.5393786],
       [-5.1544895,  5.301077 ],
       [ 5.1196322, -5.1616297],
       ...,
       [ 5.2493987, -5.3089194],
       [-5.161056 ,  5.309693 ],
       [ 4.140885 , -3.833423 ]], dtype=float32), array([[ 5.215371 , -5.473607 ],
       [-5.0005484,  5.215244 ],
       [ 5.040931 , -5.13

### Inference test dataset by using Hard and Soft voting

In [17]:
# Hard voting
hard_voted_labels = []
for i in range(test_size):
    label_list = [label[i] for label in labels]
    counter = Counter(label_list)
    selected = sorted(counter.items(), key=lambda x : x[1], reverse=True)[0][0]
    hard_voted_labels.append(selected)
    
hard_voted_df = pd.read_csv(SUB)
hard_voted_df['similar'] = hard_voted_labels
hard_voted_df.to_csv("./submissions/final_submission_HardVoting.csv", index=False)

In [16]:
# Softmax voting
probs = np.sum(predictions, axis=0)/5
soft_voted_labels = np.argmax(probs, axis=-1)

soft_voted_df = pd.read_csv(SUB)
soft_voted_df['similar'] = soft_voted_labels
soft_voted_df.to_csv("./submissions/final_submission_SoftVoting.csv", index=False)