In [None]:
# Mount Google Drive so the dataset and model paths under /content/drive are reachable.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
from tqdm import tqdm
import pandas as pd
import re

## 데이터 다운로드

In [None]:
# 데이터 압축 풀기
%cd /content/drive/MyDrive/determining_code_similarity_AI_competition/data
!unzip -qq "/content/drive/MyDrive/open.zip"

## 데이터 전처리

In [None]:
# Load the training pairs (new_train_sample_large.csv) from Drive
file_path = '/content/drive/MyDrive/determining_code_similarity_AI_competition/data/new_train_sample_large.csv'
train = pd.read_csv(file_path, encoding='utf-8')
# Print column dtypes and non-null counts as a quick sanity check
train.info()

In [None]:
# Text clean-up rules as (compiled pattern, replacement) pairs.
# They are applied in this order to the string values of the frame.
_train_cleanup_rules = [
    # (re.compile('(^import.*|^from.*)',re.MULTILINE), ""),  # drop import/from lines (disabled)
    (re.compile('(#.*)', re.MULTILINE), ""),                  # strip single-line comments
    (re.compile('[\'\"]{3}.*?[\'\"]{3}', re.DOTALL), ""),     # strip triple-quoted blocks
    (re.compile('[\n]{2,}', re.MULTILINE), "\n"),             # collapse runs of newlines into one
    (re.compile('[ ]{4}', re.MULTILINE), "\t"),               # four spaces -> one tab
    (re.compile('[ ]{1,3}', re.MULTILINE), " "),              # 1-3 remaining spaces -> one space
]
for _pattern, _replacement in _train_cleanup_rules:
    train = train.replace(_pattern, _replacement, regex=True)
train

In [None]:
# Hold out 10% of the pairs for validation, stratified on the 'similar' label
from sklearn.model_selection import train_test_split

_similar_labels = train['similar']
train_df, valid_df, train_label, valid_label = train_test_split(
    train, _similar_labels,
    test_size=0.1,
    stratify=_similar_labels,
    random_state=100,
)

train_df.head()

In [None]:
# Wrap the train/validation frames in a Hugging Face DatasetDict.
# The imports live in this cell because the notebook's main import cell comes
# later; without them a fresh-kernel Run All stops here with a NameError.
# (The `datasets` package must already be installed in the runtime.)
import datasets
from datasets import Dataset

train_dataset = Dataset.from_dict(train_df)
valid_dataset = Dataset.from_dict(valid_df)
dataset = datasets.DatasetDict({"train":train_dataset,"valid":valid_dataset})
dataset

## 토큰화

In [None]:
# Install the Hugging Face libraries used by the rest of the notebook.
# NOTE(review): versions are unpinned, and `load_metric` (imported further down)
# is deprecated in newer `datasets` releases — consider pinning; TODO confirm versions.
!pip install transformers
!pip install datasets

In [None]:
import numpy as np
import torch
import datasets
import pickle
import torch,gc
from datasets import load_metric, Dataset, load_dataset
from transformers import RobertaTokenizer, RobertaForSequenceClassification, TrainingArguments, Trainer

In [None]:
# Training hyper-parameters and evaluation metric
batch_size = 16  # per-device batch size for both training and evaluation
epoch_num = 7  # number of training epochs
MAX_LEN = 256  # max_length handed to the tokenizer; longer inputs are truncated
metric = load_metric("accuracy")  # metric object used by compute_metrics
metric_name = "accuracy"  # metric key used to pick the best checkpoint

In [None]:
def compute_metrics(eval_pred):
    """Compute classification accuracy for the Trainer's evaluation loop.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` holds the
            per-class scores with classes on axis 1 (or a tuple whose first
            item is that array); ``labels`` holds the reference class ids.

    Returns:
        dict: ``{"accuracy": value}`` where ``value`` is the fraction of rows
        whose highest-scoring class equals the label.
    """
    predictions, labels = eval_pred
    # Some models return extra outputs next to the logits; keep the logits only.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predicted_ids = np.argmax(predictions, axis=1)
    # Computed directly with NumPy so evaluation does not depend on the
    # externally loaded (and deprecated) `load_metric` object.
    accuracy = float(np.mean(predicted_ids == np.asarray(labels)))
    return {"accuracy": accuracy}

In [None]:
# Load the tokenizer that ships with the GraphCodeBERT base checkpoint
tokenizer = RobertaTokenizer.from_pretrained("microsoft/graphcodebert-base")
# Truncate from the left: over-long inputs lose their leading tokens and keep the tail
tokenizer.truncation_side = 'left'

In [None]:
def tokenize_function(sample):
    """Tokenize the two code snippets of ``sample`` as one paired input.

    ``sample['code1']`` and ``sample['code2']`` are passed to the module-level
    ``tokenizer`` as the first and second sequence. Inputs are truncated to
    ``MAX_LEN`` tokens, padded to the longest item of the call, and token type
    ids are included in the returned encoding.
    """
    encode_options = dict(
        padding=True,
        max_length=MAX_LEN,
        truncation=True,
        return_token_type_ids=True,
    )
    first_snippet = sample['code1']
    second_snippet = sample['code2']
    return tokenizer(first_snippet, second_snippet, **encode_options)

In [None]:
# Smoke-test the tokenizer function on the first five training rows
tokenize_function(dataset["train"][:5])

In [None]:
# Tokenize every split in batches; the raw code columns are dropped from the result
encoded_dataset = dataset.map(tokenize_function, remove_columns=['code1', 'code2'], batched=True)

In [None]:
# Rename the target column to 'labels' so the Trainer picks it up as the label column.
# NOTE: not re-runnable as-is — on a second run the 'similar' column no longer exists.
encoded_dataset=encoded_dataset.rename_column(original_column_name='similar',new_column_name='labels')

In [None]:
# Cache the tokenized DatasetDict on Drive so it can be reloaded without re-tokenizing
with open("/content/drive/MyDrive/determining_code_similarity_AI_competition/data/encoded_new_dataset_larger", "wb" ) as file:
  pickle.dump(encoded_dataset, file)

In [None]:
# Reload the cached tokenized DatasetDict and print its summary.
# NOTE: pickle.load can execute arbitrary code from the file — only load caches you wrote yourself.
with open("/content/drive/MyDrive/determining_code_similarity_AI_competition/data/encoded_new_dataset_larger", "rb" ) as file:
  encoded_dataset = pickle.load(file)
  print(encoded_dataset)

## 모델 구성

In [None]:
# Load the sequence-classification model from the `model_1` checkpoint directory on Drive
model = RobertaForSequenceClassification.from_pretrained("/content/drive/MyDrive/determining_code_similarity_AI_competition/data/model_1")

In [None]:
import dataclasses

# Newer transformers releases renamed `evaluation_strategy` to `eval_strategy`
# (the old name is deprecated); use whichever field the installed
# TrainingArguments actually declares so the cell works on both.
_training_arg_names = {field.name for field in dataclasses.fields(TrainingArguments)}
_eval_strategy_key = (
    "eval_strategy" if "eval_strategy" in _training_arg_names else "evaluation_strategy"
)

# Evaluate, checkpoint and log once per epoch; after training, reload the
# checkpoint with the best validation accuracy.
args = TrainingArguments(
    "test_GraphCodeBERT",  # output directory for checkpoints and logs
    **{_eval_strategy_key: "epoch"},
    save_strategy="epoch",
    logging_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epoch_num,
    weight_decay=0.01,
    do_train=True,
    do_eval=True,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
)

In [None]:
# Assemble the Trainer from the model, training arguments, tokenized splits,
# tokenizer and the accuracy callback.
trainer = Trainer(
    model=model,
    args=args,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    train_dataset=encoded_dataset["train"],
    eval_dataset=encoded_dataset["valid"],
)

In [None]:
# Run the garbage collector and release cached CUDA memory before training starts
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Fine-tune the model, then save it to the `model_2` directory on Drive.
# With load_best_model_at_end=True the trainer holds the best checkpoint after training.
trainer.train()
trainer.save_model("/content/drive/MyDrive/determining_code_similarity_AI_competition/data/model_2")

## TEST

In [None]:
# Load test.csv for preprocessing
file_path = "/content/drive/MyDrive/determining_code_similarity_AI_competition/data/test.csv"
test = pd.read_csv(file_path, encoding='utf-8')
# Print column dtypes and non-null counts as a quick sanity check
test.info()

In [None]:
# Text clean-up rules as (compiled pattern, replacement) pairs — the same
# patterns, in the same order, as used for the training frame.
_test_cleanup_rules = [
    # (re.compile('(^import.*|^from.*)',re.MULTILINE), ""),  # drop import/from lines (disabled)
    (re.compile('(#.*)', re.MULTILINE), ""),                  # strip single-line comments
    (re.compile('[\'\"]{3}.*?[\'\"]{3}', re.DOTALL), ""),     # strip triple-quoted blocks
    (re.compile('[\n]{2,}', re.MULTILINE), "\n"),             # collapse runs of newlines into one
    (re.compile('[ ]{4}', re.MULTILINE), "\t"),               # four spaces -> one tab
    (re.compile('[ ]{1,3}', re.MULTILINE), " "),              # 1-3 remaining spaces -> one space
]
for _pattern, _replacement in _test_cleanup_rules:
    test = test.replace(_pattern, _replacement, regex=True)
test

In [None]:
# Write the cleaned test frame to CSV (without the index); it is read back with load_dataset below
test.to_csv('/content/drive/MyDrive/determining_code_similarity_AI_competition/data/preprocessed_test.csv',index=False)

In [None]:
# Paths of the cleaned test set and of the sample submission file
TEST = "/content/drive/MyDrive/determining_code_similarity_AI_competition/data/preprocessed_test.csv"
SUB = "/content/drive/MyDrive/determining_code_similarity_AI_competition/data/sample_submission.csv"

# Read the cleaned CSV and take its 'train' split, then tokenize it
# example by example and drop the raw code columns.
_raw_test_split = load_dataset("csv", data_files=TEST)['train']
test_dataset = _raw_test_split.map(tokenize_function, remove_columns=['code1', 'code2'])

In [None]:
# Run inference on the tokenized test set
predictions = trainer.predict(test_dataset)

# Start from the sample submission and store the predicted class
# (argmax over the last axis of the model outputs) in a new 'labels' column
df = pd.read_csv(SUB)
df['labels'] = np.argmax(predictions.predictions, axis=-1)
# NOTE: at this point the frame presumably still carries the sample file's own
# 'similar' column next to 'labels'; the following cell rewrites this same path.
df.to_csv('/content/drive/MyDrive/determining_code_similarity_AI_competition/data/submission_large.csv', index=False)

In [None]:
# Replace the sample file's 'similar' column with the predictions and write the submission.
# Guarded so that re-running the cell is harmless: without the check, a second
# run would drop the freshly renamed prediction column and save a file without it.
if "labels" in df.columns:
    df = df.drop(columns=["similar"]).rename(columns={"labels": "similar"})
df.to_csv("/content/drive/MyDrive/determining_code_similarity_AI_competition/data/submission_large.csv",index = None)

In [None]:
# Run the garbage collector and release cached CUDA memory after inference
gc.collect()
torch.cuda.empty_cache()