# What does this file do?

- Create the message and code dictionaries
- Preprocess commit messages and code
- Remove duplicate files within each commit

## Directories
- `data/{project_name}`: data that has not been preprocessed yet
- `data/{project_name}/clean`: preprocessed data
- `{project_name}_(train/test)_(dextend).pkl`: commit data name
- `{project_name}_feature_(train/test).pkl`: feature data name
- `{project_name}_dict.pkl`: dictionary name

### Import libraries

In [None]:
import pickle
from Dict import Dict

### Load train and test set

Set up:
1. Change the `train` and `test` data paths
2. Change `project_name` (optional if the project name is written directly in the paths)

In [None]:
project_name = 'moment'

# NOTE(review): both inputs are read from `clean/`, and the later cells write
# their preprocessed output to these very same `_train.pkl` / `_test.pkl`
# paths. Re-running the notebook would therefore load already-preprocessed
# data; confirm the intended input location.
# NOTE: unpickling can execute arbitrary code, so only load trusted files.
with open(f'data/{project_name}/clean/{project_name}_train.pkl', 'rb') as train_file:
    train = pickle.load(train_file)
with open(f'data/{project_name}/clean/{project_name}_test.pkl', 'rb') as test_file:
    test = pickle.load(test_file)

In [None]:
# Display one loaded commit record to sanity-check its structure.
train[6]

### Create message dictionary and code dictionary

In [None]:
# Two separate vocabularies: one fed with commit-message words, one fed with
# code words. `lower=True` presumably makes Dict lowercase its entries —
# confirm against the Dict class.
msg_dict, code_dict = Dict(lower=True), Dict(lower=True)

### Define functions for preprocessing data

In [None]:
# Built once at definition time: each listed punctuation mark is padded with a
# space on both sides so that it becomes a token of its own, while an
# underscore is turned into a plain space (snake_case -> separate words).
_SPLIT_TABLE = str.maketrans({
    **{symbol: f' {symbol} ' for symbol in '.@-~%^&*()+={}|\\[]:;,<>?/'},
    '_': ' ',
})


def split_sentence(sentence):
    """Tokenize ``sentence`` into a single-space-separated string.

    Punctuation marks from a fixed set are isolated as individual tokens,
    underscores are replaced by spaces, and every run of whitespace
    (including leading/trailing whitespace and newlines) is collapsed to a
    single space.

    Args:
        sentence: The raw text (commit message or code snippet) as ``str``.

    Returns:
        The normalized text; an empty string if ``sentence`` holds no tokens.
    """
    # A single translate pass replaces the former chain of str.replace calls
    # (one of which used the invalid escape sequence '\ ').
    return ' '.join(sentence.translate(_SPLIT_TABLE).split())

### Filter out duplicate files in commit

In [None]:
import json

def filter_duplicate(my_list: list):
    """Return the entries of ``my_list`` with duplicates removed.

    Two entries are duplicates when their JSON serialization with sorted keys
    is identical. The result holds one entry per distinct serialization, in
    order of first appearance; among duplicates, the object that is kept is
    the last one encountered.
    """
    # Later entries with an already-seen fingerprint overwrite the stored
    # value but keep the position of the first occurrence.
    by_fingerprint = {
        json.dumps(entry, sort_keys=True): entry
        for entry in my_list
    }
    return [*by_fingerprint.values()]

In [None]:
# Swap every commit's list of file changes for its de-duplicated version,
# covering both the train and the test split.
for commit in train + test:
    commit['main_language_file_changes'] = filter_duplicate(
        commit['main_language_file_changes']
    )

### Creating dictionaries and preprocessing code in train-set

In [None]:
# Parallel lists: index i of each list describes the i-th training commit.
ids = []
messages = []
cc2vec_commits = []
deepjit_commits = []
labels = []

for commit in train:
    # split_sentence already trims and collapses whitespace, so no extra
    # strip/split/join passes are needed before lowercasing.
    message = split_sentence(commit['commit_message']).lower()

    for word in message.split():
        msg_dict.add(word)

    # cc2vec layout: one {'added_code': [...], 'removed_code': [...]} per file.
    cc2vec_commit = []
    # deepjit layout: flat list added, removed, added, removed, ... over all
    # hunks of all files of the commit.
    deepjit_commit = []
    for file in commit['main_language_file_changes']:
        list_of_added_code = []
        list_of_removed_code = []
        for hunk in file['code_changes']:
            added_code = split_sentence(hunk['added_code'])
            removed_code = split_sentence(hunk['removed_code'])

            list_of_added_code.append(added_code)
            list_of_removed_code.append(removed_code)
            deepjit_commit.extend((added_code, removed_code))

            # Only the training set feeds the code vocabulary.
            for word in added_code.split():
                code_dict.add(word)
            for word in removed_code.split():
                code_dict.add(word)

        cc2vec_commit.append({
            'added_code': list_of_added_code,
            'removed_code': list_of_removed_code
        })

    ids.append(commit['commit_hash'])
    messages.append(message)
    cc2vec_commits.append(cc2vec_commit)
    deepjit_commits.append(deepjit_commit)
    labels.append(commit['bug_inducing'])

# presumably caps each vocabulary at 100000 entries — confirm against Dict.prune
msg_dict = msg_dict.prune(100000)
code_dict = code_dict.prune(100000)
project_dict = [msg_dict.get_dict(), code_dict.get_dict()]

cc2vec_preprocessed_train = [ids, messages, cc2vec_commits, labels]
deepjit_preprocessed_train = [ids, messages, deepjit_commits, labels]

# Context managers guarantee each file is flushed and closed after the dump.
with open(f"data/{project_name}/clean/{project_name}_dict.pkl", 'wb') as dict_file:
    pickle.dump(project_dict, dict_file)
# NOTE(review): this is the same path the raw train set was loaded from, so
# the input file is overwritten with the preprocessed layout.
with open(f"data/{project_name}/clean/{project_name}_train.pkl", 'wb') as cc2vec_file:
    pickle.dump(cc2vec_preprocessed_train, cc2vec_file)
with open(f"data/{project_name}/clean/{project_name}_train_dextend.pkl", 'wb') as deepjit_file:
    pickle.dump(deepjit_preprocessed_train, deepjit_file)

### Preprocessing code in test-set

In [None]:
# Parallel lists: index i of each list describes the i-th test commit.
ids = []
messages = []
cc2vec_commits = []
deepjit_commits = []
labels = []

for commit in test:
    # split_sentence already trims and collapses whitespace, so no extra
    # strip/split/join passes are needed before lowercasing.
    message = split_sentence(commit['commit_message']).lower()

    # cc2vec layout: one {'added_code': [...], 'removed_code': [...]} per file.
    cc2vec_commit = []
    # deepjit layout: flat list added, removed, added, removed, ... over all
    # hunks of all files of the commit.
    deepjit_commit = []
    for file in commit['main_language_file_changes']:
        list_of_added_code = []
        list_of_removed_code = []
        for hunk in file['code_changes']:
            added_code = split_sentence(hunk['added_code'])
            removed_code = split_sentence(hunk['removed_code'])

            list_of_added_code.append(added_code)
            list_of_removed_code.append(removed_code)
            deepjit_commit.extend((added_code, removed_code))

        cc2vec_commit.append({
            'added_code': list_of_added_code,
            'removed_code': list_of_removed_code
        })

    ids.append(commit['commit_hash'])
    messages.append(message)
    cc2vec_commits.append(cc2vec_commit)
    deepjit_commits.append(deepjit_commit)
    labels.append(commit['bug_inducing'])

cc2vec_preprocessed_test = [ids, messages, cc2vec_commits, labels]
deepjit_preprocessed_test = [ids, messages, deepjit_commits, labels]

# Context managers guarantee each file is flushed and closed after the dump.
# NOTE(review): `_test.pkl` is the same path the raw test set was loaded from,
# so the input file is overwritten with the preprocessed layout.
with open(f"data/{project_name}/clean/{project_name}_test.pkl", 'wb') as cc2vec_file:
    pickle.dump(cc2vec_preprocessed_test, cc2vec_file)
with open(f"data/{project_name}/clean/{project_name}_test_dextend.pkl", 'wb') as deepjit_file:
    pickle.dump(deepjit_preprocessed_test, deepjit_file)