<a href="https://colab.research.google.com/github/ohilikeit/Dacon_code_similarity/blob/main/%EC%8B%A4%ED%97%98%EC%9A%A9_Preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
class FunEvent:
    """An event described by a collection of tags and a year.

    The ``tags`` object is stored as given (no copy is made), so the
    instance and the caller share the same object.
    """

    def __init__(self, tags, year):
        """Remember ``tags`` and ``year`` on the instance."""
        self.tags, self.year = tags, year

    def __str__(self):
        """Return a readable ``FunEvent(tags=..., year=...)`` description."""
        description = f"FunEvent(tags={self.tags}, year={self.year})"
        return description

# Demo: mutating a shared list vs. rebinding a name after constructing FunEvent.
tags = ["google", "ml"]
year = 2022
bootcamp = FunEvent(tags, year)
# FunEvent stores this very list object, so the append is visible through bootcamp.tags.
tags.append("bootcamp")
# Rebinding the name `year` does not change bootcamp.year, which still holds 2022.
year = 2023
print(bootcamp)

# 1. 데이터 로드 및 전처리

## 1) 데이터 불러오기
간단한 전처리를 하면서 불러온다.

In [None]:
import re 
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

# Root directory whose entries are listed as the set of problems.
folder = '/content/drive/MyDrive/code_similarity/code'
# os.listdir returns entries in arbitrary order; sort them so the row order of
# everything built from this list (and any seeded split) is reproducible.
problems = sorted(os.listdir(folder))

def preprocess(script):
    """Read a source file and return its lightly normalised text.

    Per line, in this order:
      1. the trailing newline is removed;
      2. comment-only lines (optionally indented, starting with '#') are skipped;
      3. everything from the first '#' onwards is cut off (trailing comment);
      4. every run of four spaces becomes a tab;
      5. triple double-quotes are removed;
      6. the characters '/', '?' and '@' are removed.

    Args:
        script: Path of the UTF-8 encoded file to read.

    Returns:
        The kept lines joined with '\\n ' (newline followed by one space).
        An empty file yields an empty string.
    """
    processed_line = []
    with open(script, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.rstrip('\n')

            # The comment-only check must come BEFORE the trailing-comment cut:
            # once the line is truncated at '#', it can no longer start with '#'
            # and the comment line would survive as an empty/whitespace line.
            if line.lstrip().startswith('#'):
                continue
            if '#' in line:
                # NOTE: naive cut - a '#' inside a string literal is cut as well.
                line = line[:line.index('#')]

            line = line.replace('    ', '\t')
            line = line.replace('"""', '')
            # The character class already covers '?', so no separate replace is needed.
            line = re.sub(r"[/\?@]", "", line)
            processed_line.append(line)

    return '\n '.join(processed_line)

# Collect the preprocessed text of every script together with its problem label.
final_scripts = []
numbers = []
for problem in tqdm(problems):
    # Sorted for a reproducible row order (os.listdir order is arbitrary).
    scripts = sorted(os.listdir(os.path.join(folder, problem)))
    if not scripts:
        # An empty folder has no file name to derive a label from; skip it
        # instead of failing with IndexError on scripts[0].
        continue
    # Label = text before the first '_' in the first script's file name.
    number = scripts[0].split('_')[0]

    for script in scripts:
        each_script = os.path.join(folder, problem, script)
        final_script = preprocess(each_script)
        final_scripts.append(final_script)
    numbers.extend([number] * len(scripts))

# Show a short summary rather than dumping every script into the cell output.
print(f'{len(final_scripts)} scripts loaded from {len(set(numbers))} problems')
print(numbers[:5])

In [None]:
# One row per script: its preprocessed text and its problem label.
columns = {'code': final_scripts, 'number': numbers}
data = pd.DataFrame(columns)
data.head()

Unnamed: 0_level_0,code,tokens,len
number,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
problem001,149,149,149
problem002,150,150,150
problem003,150,150,150
problem004,150,150,150
problem005,148,148,148
...,...,...,...
problem296,73,73,73
problem297,150,150,150
problem298,150,150,150
problem299,150,150,150


## 2) 토크나이징
코드 데이터로 pre-train된 Hugging Face의 CodeBERTa(`huggingface/CodeBERTa-small-v1`) 토크나이저를 가져와 토크나이징한다. 토크나이저는 해당 도메인(소스 코드)의 corpus로 학습된 것을 쓰는 편이 좋다.

In [None]:
# Install the Hugging Face `transformers` package, from which the tokenizer classes are imported.
! pip install transformers

In [None]:
from transformers import AutoTokenizer, RobertaTokenizer
# Hugging Face model id whose tokenizer is loaded.
MODEL = "huggingface/CodeBERTa-small-v1"
tokenizer = RobertaTokenizer.from_pretrained(MODEL)

# Token list for every script, then the number of tokens in each list.
data['tokens'] = data['code'].map(tokenizer.tokenize)
data['len'] = data['tokens'].map(len)
data.describe()

Unnamed: 0,len
count,45101.0
mean,158.847099
std,388.52385
min,5.0
25%,62.0
50%,109.0
75%,199.0
max,71795.0


## 3) 길이 조절하기
input으로 code1과 code2를 동시에 받기에 토큰의 최대 사이즈 512를 고려하여 256을 넘기는 code들을 버린다. pair를 만들기 위한 현실적인 시간을 고려한 점도 있다.

In [None]:
MAX_TOKENS = 256      # per-script token budget
MIN_SOLUTIONS = 10    # minimum number of remaining scripts a problem must have

# Keep only scripts that fit the token budget.
df = data[data['len'] <= MAX_TOKENS].reset_index(drop=True)

# Count the remaining scripts of each problem, aligned row by row with `df`.
# transform('count') keeps `df` at row level; assigning the result of
# groupby(...).count() to `df` would replace the scripts with a count table
# and move 'number' into the index, which breaks the later split on df['number'].
solutions_per_problem = df.groupby('number')['code'].transform('count')
df = df[solutions_per_problem >= MIN_SOLUTIONS].reset_index(drop=True)

Unnamed: 0,code,tokens,len
0,149,149,149
1,150,150,150
2,150,150,150
3,150,150,150
4,148,148,148
...,...,...,...
294,73,73,73
295,150,150,150
296,150,150,150
297,150,150,150


## 4) train, val 구성
train pair 셋 안에 val의 pair 중 하나라도 들어가면 악영향을 줄 수 있기에 미리 분리해서 pair를 만든다. 

In [None]:
from sklearn.model_selection import train_test_split

# 90/10 split, stratified on the problem label, with a fixed seed.
train_df, val_df, train_y, val_y = train_test_split(
    df,
    df['number'],
    test_size=0.1,
    random_state=42,
    stratify=df['number'],
)
# Give both splits a fresh 0..n-1 index.
train_df, val_df = (
    split.reset_index(drop=True) for split in (train_df, val_df)
)

ValueError: ignored

In [None]:
# Write both splits to CSV without the index column.
train_df.to_csv(
    path_or_buf='/content/drive/MyDrive/code_similarity/train_df.csv',
    index=False,
)
val_df.to_csv(
    path_or_buf='/content/drive/MyDrive/code_similarity/val_df.csv',
    index=False,
)

# 2. pair 구성하기
BM25 알고리즘으로 각 코드 파일마다 가장 유사한 상위 25개 후보를 뽑고, 그중 같은 문제의 풀이를 제외하여(따라서 25개 이하) 가깝지만 서로 다른 문제에 속하는 hard negative pair를 구성한다. 이렇게 하면 모델이 더 robust해질 것으로 기대한다. positive pair는 같은 문제의 풀이들끼리의 조합(combination)으로 만든다.

In [None]:
! pip install rank_bm25
! pip install transformers

from numba import jit
! @jit(nopython=True, cache=True)

In [None]:
import re 
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from rank_bm25 import BM25Okapi
from itertools import combinations
from tqdm import tqdm
from transformers import AutoTokenizer, RobertaTokenizer

# Reload the train / validation splits from their CSV files.
train_df = pd.read_csv('/content/drive/MyDrive/code_similarity/train_df.csv')
val_df = pd.read_csv('/content/drive/MyDrive/code_similarity/val_df.csv')

# Load the tokenizer
MODEL = "huggingface/CodeBERTa-small-v1"
tokenizer = RobertaTokenizer.from_pretrained(MODEL)

## 1) train set pair 구성

In [None]:
codes = train_df['code'].to_list()
problem_num = sorted(train_df['number'].unique().tolist())

# BM25 index over the tokenized training scripts.
tokenized_corpus = [tokenizer.tokenize(code) for code in codes]
bm25 = BM25Okapi(tokenized_corpus)

# Sanity check on a single problem: score every training script against the
# first script of the first positive pair.
solution_codes = train_df[train_df['number'] == 'problem001']['code']
positive_pairs = list(combinations(solution_codes.to_list(), 2))
first_tokenized_code = tokenizer.tokenize(positive_pairs[0][0])
negative_code_scores = bm25.get_scores(first_tokenized_code)

# Positions of the training scripts from highest to lowest score. Computed
# once instead of re-sorting the whole score array for every lookup.
ranking = negative_code_scores.argsort()[::-1]

print(positive_pairs[0][0])
# Print the second-ranked script; the top hit is presumably the query script
# itself, since the query comes from the indexed corpus - TODO confirm.
print(train_df['code'].iloc[ranking[1]])

for i in range(1, 10):
 	for j in range(1, 10):
 		print '%sx%s=%s' % (i, j, i * j)


In [None]:
# Build the pairs.
# Each problem number takes about 4-5 minutes and there are 300 of them, so a
# full run takes roughly 20-25 hours; only the first 20 problems are done here.
N_CANDIDATES = 25    # BM25 hits retrieved per script before filtering

final_positive_pairs = []
final_negative_pairs = []
testprob = problem_num[0:20]
test = []
for num in tqdm(testprob):
    working_codes = train_df[train_df['number'] == num]['code']
    # Set of this problem's scripts for membership tests. `x in Series` checks
    # the index labels, not the values, so it cannot be used for this.
    same_problem = set(working_codes)
    positive_pairs = list(combinations(working_codes, 2))
    negative_pairs = []

    for working_code in tqdm(working_codes):
        # The index was built from token lists, so the query must be tokenized
        # the same way; a raw string would be iterated character by character.
        query_tokens = tokenizer.tokenize(working_code)
        candidates = bm25.get_top_n(query_tokens, codes, n=N_CANDIDATES)
        # Keep only hits from other problems. Iterating the result directly
        # also works when fewer than N_CANDIDATES documents come back.
        negative_pairs.extend(
            (working_code, candidate)
            for candidate in candidates
            if candidate not in same_problem
        )

    final_positive_pairs.extend(positive_pairs)
    final_negative_pairs.extend(negative_pairs)


# So far this only covers problems 0 to 19; the run must be continued from 20.

pos_code1 = [first for first, _ in final_positive_pairs]
pos_code2 = [second for _, second in final_positive_pairs]
pos_label = [1] * len(pos_code1)

neg_code1 = [first for first, _ in final_negative_pairs]
neg_code2 = [second for _, second in final_negative_pairs]
neg_label = [0] * len(neg_code1)

# list.extend() mutates in place and returns None, so concatenate with `+`
# to actually obtain the combined lists.
total_code1 = pos_code1 + neg_code1
total_code2 = pos_code2 + neg_code2
total_label = pos_label + neg_label

full_data = pd.DataFrame({
    'code1' : total_code1,
    'code2' : total_code2,
    'similar' : total_label
})
# sample() returns a new frame, so the result has to be assigned back;
# an in-place reset_index on that temporary copy would be thrown away.
# The seed makes the shuffle reproducible.
full_data = full_data.sample(frac=1, random_state=42).reset_index(drop=True)
# The index is just 0..n-1 after the shuffle, so it is not written out.
full_data.to_csv('/content/drive/MyDrive/code_similarity/full_data', index=False)

'p'

In [None]:
# Shallow copies of the mined pair lists.
fpp = list(final_positive_pairs)
fnp = list(final_negative_pairs)
len(fpp)
len(fnp)
# First and second element of every positive pair.
pos_code1 = [pair[0] for pair in final_positive_pairs]
pos_code2 = [pair[1] for pair in final_positive_pairs]