# Code Similarity Detection (Monthly Dacon21)

## Import libraries & packages

In [26]:
import os
import numpy as np
import pandas as pd
import torch

from tqdm import tqdm
from itertools import combinations
from transformers import AutoTokenizer, BertForSequenceClassification
from transformers import AutoModel, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding, EarlyStoppingCallback
from datasets import load_dataset, load_metric
from rank_bm25 import BM25Okapi
from sklearn.model_selection import train_test_split

------------------
## Model Training

In [27]:
# Checkpoint id, input CSV locations and the tokenizer length cap.
MODEL = "microsoft/graphcodebert-base"
INPUT_TRAIN = "data/train_data_for_epoch.csv"
INPUT_VALID = "data/valid_data_for_epoch.csv"
MAX_LEN = 512  # passed to the tokenizer as max_length (with truncation)

# Tokenizer belonging to the checkpoint named by MODEL.
tokenizer = AutoTokenizer.from_pretrained(MODEL)

# A lone CSV is exposed by `load_dataset` under the "train" split, so that
# split is requested directly for both the training and the validation file.
train_dataset = load_dataset("csv", data_files=INPUT_TRAIN, split="train")
valid_dataset = load_dataset("csv", data_files=INPUT_VALID, split="train")

Using custom data configuration default-63dcb23fc60f1973
Reusing dataset csv (/home/piai/.cache/huggingface/datasets/csv/default-63dcb23fc60f1973/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/1 [00:00<?, ?it/s]

Using custom data configuration default-c226f61a3c327361
Reusing dataset csv (/home/piai/.cache/huggingface/datasets/csv/default-c226f61a3c327361/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/1 [00:00<?, ?it/s]

loading configuration file https://huggingface.co/microsoft/graphcodebert-base/resolve/main/config.json from cache at /home/piai/.cache/huggingface/transformers/8edef9fb59cf1f2670191d673b13a719a79361a2ae12cc806f942649b8b90db8.62db6c94b05689b7cb238a1a38840e19d1014fc755a9e328ab74a6c672db2d3d
Model config RobertaConfig {
  "_name_or_path": "microsoft/graphcodebert-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.19.2",
  "type_vocab_size": 1,
  "use_ca

In [28]:
def example_fn(examples):
    """Tokenize a (code1, code2) pair as one two-segment model input.

    Args:
        examples: Mapping holding 'code1' and 'code2' and, for labelled
            data, 'similar'.

    Returns:
        The tokenizer output, truncated to MAX_LEN tokens, with a "labels"
        entry copied from 'similar' when that key is present.
    """
    # No padding at tokenization time: every batch is padded to its own
    # longest sequence by the DataCollatorWithPadding handed to the Trainer,
    # so padding here would at best be a no-op and at worst store pad tokens
    # in the dataset.
    outputs = tokenizer(
        examples['code1'],
        examples['code2'],
        max_length=MAX_LEN,
        truncation=True,
    )
    if 'similar' in examples:
        outputs["labels"] = examples["similar"]
    return outputs

# Tokenize both splits. The raw text columns and the original label column are
# dropped afterwards; example_fn re-emits the label under "labels".
# NOTE(review): this rebinds train_dataset/valid_dataset, so re-running the
# cell on the already-mapped datasets presumably fails (columns are gone).
_raw_columns = ['code1', 'code2', 'similar']
train_dataset = train_dataset.map(example_fn, remove_columns=_raw_columns)
valid_dataset = valid_dataset.map(example_fn, remove_columns=_raw_columns)

  0%|          | 0/582622 [00:00<?, ?ex/s]

  0%|          | 0/59389 [00:00<?, ?ex/s]

In [5]:
# Dump the tokenized splits to CSV (no index column is written).
_train_csv = "data/train_dataset.csv"
_valid_csv = "data/valid_dataset.csv"
train_dataset.to_csv(_train_csv, index=False)
valid_dataset.to_csv(_valid_csv, index=False)

Creating CSV from Arrow format:   0%|          | 0/810 [00:00<?, ?ba/s]

Creating CSV from Arrow format:   0%|          | 0/49 [00:00<?, ?ba/s]

1202631041

In [29]:
# Batch collator built around the tokenizer; it is handed to the Trainer below.
_collator = DataCollatorWithPadding(tokenizer)
# Metric object used by metric_fn: the "glue" metric in its "sst2" configuration.
_metric = load_metric(path="glue", config_name="sst2")

def metric_fn(p):
    """Score one evaluation pass with the module-level `_metric`.

    Args:
        p: A two-item iterable ``(preds, labels)``; ``preds`` holds per-class
            scores along its last axis.

    Returns:
        Whatever ``_metric.compute`` returns for the arg-max class predictions.
    """
    scores, references = p
    predicted_classes = np.argmax(scores, axis=-1)
    return _metric.compute(predictions=predicted_classes, references=references)

In [30]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Load from the MODEL constant (the same id the tokenizer was built from) so
# the two cannot drift apart. num_labels=2 equals the library default and is
# spelled out because predictions are later reduced with a two-way argmax.
model = AutoModelForSequenceClassification.from_pretrained(MODEL, num_labels=2)
model.to(device)

loading configuration file https://huggingface.co/microsoft/graphcodebert-base/resolve/main/config.json from cache at /home/piai/.cache/huggingface/transformers/8edef9fb59cf1f2670191d673b13a719a79361a2ae12cc806f942649b8b90db8.62db6c94b05689b7cb238a1a38840e19d1014fc755a9e328ab74a6c672db2d3d
Model config RobertaConfig {
  "_name_or_path": "microsoft/graphcodebert-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 514,
  "model_type": "roberta",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "transformers_version": "4.19.2",
  "type_vocab_size": 1,
  "use_ca

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerN

In [77]:
# Diagnostic cell: print a per-GPU table of load and memory utilisation.
from GPUtil import showUtilization as gpu_usage
gpu_usage()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
| ID | GPU | MEM |
------------------
|  0 |  3% | 39% |
|  1 |  0% |  8% |


In [76]:
# Housekeeping cell: try to free memory before starting a training run.
import gc
import torch

# Run Python's garbage collector first, then ask PyTorch to release its cached
# CUDA memory (presumably so that tensors freed by the collector are returned
# to the device as well - verify against the PyTorch docs).
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Fine-tuning configuration. Evaluation, logging and checkpointing all happen
# once per epoch, which is also the granularity at which early stopping counts
# its patience.
args = TrainingArguments(
    'runs/',
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    num_train_epochs=30,
    save_total_limit=7,
    do_train=True,
    do_eval=True,
    # `eval_steps` is intentionally not set: it only applies to the "steps"
    # strategy, and the previous value of 1 would have triggered a full
    # validation pass after every optimizer step if that strategy were enabled.
    save_strategy="epoch",
    logging_strategy="epoch",
    evaluation_strategy="epoch",
    load_best_model_at_end=True,
    # Spell out the selection criterion used for the best checkpoint and for
    # early stopping (validation loss, lower is better) instead of relying on
    # the implicit default.
    metric_for_best_model="loss",
    greater_is_better=False,
)

trainer = Trainer(
    model=model,
    args=args,
    data_collator=_collator,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
    compute_metrics=metric_fn,
    # Stop once the monitored metric has failed to improve for 5 evaluations.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
)

trainer.train()

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running training *****
  Num examples = 582622
  Num Epochs = 5
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 182070


Step,Training Loss,Validation Loss,Accuracy
500,0.1985,0.251296,0.912846
1000,0.2244,0.292724,0.909192
1500,0.1995,0.191365,0.942683


***** Running Evaluation *****
  Num examples = 59389
  Batch size = 32
Saving model checkpoint to runs/checkpoint-500
Configuration saved in runs/checkpoint-500/config.json
Model weights saved in runs/checkpoint-500/pytorch_model.bin
tokenizer config file saved in runs/checkpoint-500/tokenizer_config.json
Special tokens file saved in runs/checkpoint-500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 59389
  Batch size = 32
Saving model checkpoint to runs/checkpoint-1000
Configuration saved in runs/checkpoint-1000/config.json
Model weights saved in runs/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in runs/checkpoint-1000/tokenizer_config.json
Special tokens file saved in runs/checkpoint-1000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 59389
  Batch size = 32
Saving model checkpoint to runs/checkpoint-1500
Configuration saved in runs/checkpoint-1500/config.json
Model weights saved in runs/checkpoint-1500/pytorch_model

In [24]:
# Predict on the unlabelled test pairs and write the submission file.
TEST = "data/test.csv"
SUB = "data/sample_submission.csv"
SUBMISSION_OUT = "./result/submission2.csv"

test_dataset = load_dataset("csv", data_files=TEST)["train"]
# The test file has no 'similar' column, so only the raw code columns are dropped.
test_dataset = test_dataset.map(example_fn, remove_columns=["code1", "code2"])

predictions = trainer.predict(test_dataset)
predicted_labels = np.argmax(predictions.predictions, axis=-1)

df = pd.read_csv(SUB)
# Labels are assigned positionally, so a row-count mismatch would silently
# misalign predictions with the submission rows; fail loudly instead.
if len(predicted_labels) != len(df):
    raise ValueError(
        f"Got {len(predicted_labels)} predictions for {len(df)} submission rows"
    )
df["similar"] = predicted_labels

# Create the output directory first so a long prediction run is not lost to a
# missing folder at the final write.
os.makedirs(os.path.dirname(SUBMISSION_OUT), exist_ok=True)
df.to_csv(SUBMISSION_OUT, index=False)

Using custom data configuration default-cc6c8aa016161c6d


Downloading and preparing dataset csv/default to /home/piai/.cache/huggingface/datasets/csv/default-cc6c8aa016161c6d/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /home/piai/.cache/huggingface/datasets/csv/default-cc6c8aa016161c6d/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/179700 [00:00<?, ?ex/s]

The following columns in the test set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: pair_id. If pair_id are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 179700
  Batch size = 64
