### Import libraries

In [1]:
import os, pickle
import pandas as pd

### Load data-set's commits and its test-set

In [2]:
project_name = 'moment'

# Load the full commit list and the pre-defined bugged test-set for the project.
# The files are opened with `with` so the handles are closed right after loading.
# NOTE(review): pickle.load can execute arbitrary code - only load trusted files.
with open(f'data/{project_name}/{project_name}.pkl', 'rb') as data_file:
    data = pickle.load(data_file)
with open(f'data/test-set-bugged/{project_name}.pkl', 'rb') as test_file:
    test = pickle.load(test_file)

print('Number of examples: ', len(data))
print('Number of test examples: ', len(test))

Number of examples:  3980
Number of test examples:  19


### Load data-set's features

In [3]:
# Location of the per-commit feature table for this project
csv_file = f'data/{project_name}/{project_name}_{project_name}_2023-05-01.csv'

# Read the feature table into a pandas DataFrame
df_features = pd.read_csv(csv_file)

# Summarise the table: every column from position 3 onwards is treated as a feature
column_names = list(df_features.columns)
features = column_names[3:]
num_rows = len(df_features)
print('Number of features: ', len(features))
print('Features: ', features)
print('Number of examples: ', num_rows)

Number of features:  14
Features:  ['ns', 'nd', 'nf', 'entrophy', 'la', 'ld', 'lt', 'fix', 'ndev', 'age', 'nuc', 'exp', 'rexp', 'sexp']
Number of examples:  2482


### Get dictionary mapping commit_hash to label

In [4]:
# Pair every commit hash with its bug-inducing label, then index the labels by hash
commit_result = [(commit['commit_hash'], commit['bug_inducing']) for commit in data]
result_dict = {commit_hash: label for commit_hash, label in commit_result}

### Get list of commits for train, test

In [5]:
# Collect the commit hashes known to each source: full data, test-set, feature table
data_commit_hash_list = [commit['commit_hash'] for commit in data]
test_commit_hash_list = [commit['commit_hash'] for commit in test]
features_id_list = df_features['_id'].to_list()
print('All commit: ', len(data_commit_hash_list))
print('Test commit: ', len(test_commit_hash_list))
print('Features id: ', len(features_id_list))

All commit:  3980
Test commit:  19
Features id:  2482


### Define functions for preprocessing data

In [6]:
# Find common elements in two lists, used to extract test-set
def find_common_elements(list1, list2):
    """Return the distinct elements of ``list1`` that also occur in ``list2``.

    The result keeps the order in which elements first appear in ``list1``.
    Building the result from a set intersection would yield an arbitrary
    order, which for strings changes between interpreter runs (hash
    randomisation); any later step that depends on list position would then
    not be reproducible.

    Args:
        list1: Iterable of hashable elements; defines the output order.
        list2: Iterable of hashable elements to intersect with.

    Returns:
        A list without duplicates, ordered by first occurrence in ``list1``.
    """
    set2 = set(list2)
    # dict.fromkeys de-duplicates while preserving insertion order
    common_elements = [elem for elem in dict.fromkeys(list1) if elem in set2]
    return common_elements

# Find elements in list A but not in list B, used to extract train-set
def find_unique_elements(list_a, list_b):
    """Return the elements of ``list_a`` that do not occur in ``list_b``.

    Order and duplicates of ``list_a`` are preserved. Membership is tested
    against a set so the cost is linear instead of quadratic; if any element
    is unhashable the function falls back to a plain list scan.

    Args:
        list_a: Iterable of candidate elements.
        list_b: Iterable of elements to exclude.

    Returns:
        A new list with the elements of ``list_a`` not found in ``list_b``.
    """
    excluded_items = list(list_b)
    try:
        excluded_lookup = set(excluded_items)
        unique_elements = [elem for elem in list_a if elem not in excluded_lookup]
    except TypeError:
        # Unhashable elements cannot live in a set: use the slower list scan
        unique_elements = [elem for elem in list_a if elem not in excluded_items]
    return unique_elements

# Save data using pickle to a directory with given filename
def save_list_to_file(directory, filename, data):
    """Pickle ``data`` to ``directory/filename``, creating the directory if needed.

    Args:
        directory: Target directory. An empty string means the current
            working directory, in which case nothing is created.
        filename: Name of the file to write inside ``directory``.
        data: Any picklable object.

    Side effects:
        Writes (or overwrites) the file and prints the path that was written.
    """
    # exist_ok=True avoids the race between an existence check and creation;
    # the guard skips os.makedirs(''), which would raise FileNotFoundError.
    if directory:
        os.makedirs(directory, exist_ok=True)

    filepath = os.path.join(directory, filename)

    with open(filepath, 'wb') as file:
        pickle.dump(data, file)

    print(f'Saved the list to file: {filepath}')

# Create list of commits for the data-set using a given list of commit_hash
def filter_elements_by_commit_hash(commit_hash_list, elements_list):
    """Keep the elements whose ``'commit_hash'`` is in ``commit_hash_list``.

    The hashes are put into a set first so each lookup is constant-time
    rather than a scan of the whole hash list for every element.

    Args:
        commit_hash_list: Iterable of commit hashes to keep (must be hashable).
        elements_list: Iterable of mappings that each have a ``'commit_hash'`` key.

    Returns:
        A list of the matching elements, in the order of ``elements_list``.
    """
    wanted_hashes = set(commit_hash_list)
    filtered_elements = [element for element in elements_list if element['commit_hash'] in wanted_hashes]
    return filtered_elements

# Look up the 'bug' label that belongs to a row's '_id'
def map_bug_value(row):
    """Return the label stored in the global ``result_dict`` for ``row['_id']``.

    Yields ``None`` when the id has no entry in ``result_dict``.
    """
    return result_dict.get(row['_id'])

# Count clean and defective commits in a data-set
def data_info_extract(data):
    """Count how many commits are clean and how many are defect-inducing.

    A commit counts as clean when its ``'bug_inducing'`` value equals 0;
    every other value counts as a defect.

    Args:
        data: Iterable of mappings that each have a ``'bug_inducing'`` key.

    Returns:
        A ``(number_clean, number_defect)`` tuple.
    """
    number_clean = 0
    number_defect = 0
    # The loop variable gets its own name so it no longer shadows the `data` parameter
    for commit in data:
        if commit['bug_inducing'] == 0:
            number_clean += 1
        else:
            number_defect += 1
    return number_clean, number_defect

### Get train/test-set and total data-set

In [7]:
# Keep only the commits that also have a feature row; train is cleaned data minus cleaned test
cleaned_data = find_common_elements(data_commit_hash_list, features_id_list)
cleaned_test = find_common_elements(test_commit_hash_list, features_id_list)
cleaned_train = find_unique_elements(cleaned_data, cleaned_test)
print(*(len(split) for split in (cleaned_data, cleaned_train, cleaned_test)))

2422 2408 14


### Re-Split the train and test set (test / total = 0.1)

In [8]:
# list_a / list_b are aliases of the split lists, so the moves below change
# cleaned_train and cleaned_test in place
list_a = cleaned_train
list_b = cleaned_test

# The test split should hold 10% of all cleaned commits
target_length = int(0.1 * (len(list_a) + len(list_b)))
print(target_length)

# Move commits one by one from the tail of the train list onto the test list
# until the test list has reached the target size
for _move in range(max(0, target_length - len(list_b))):
    last_element = list_a.pop()
    list_b.append(last_element)

print("Modified Train:", len(cleaned_train))
print("Modified Test:", len(cleaned_test))

242
Modified Train: 2180
Modified Test: 242


### Get the list of commits for the train and test sets, and save them as .pkl/.csv

In [9]:
# Materialise the commit records of each split from the full data-set
train = filter_elements_by_commit_hash(cleaned_train, data)
test = filter_elements_by_commit_hash(cleaned_test, data)

# Attach the label to every feature row, then slice the feature table per split
df_features['bug'] = df_features.apply(map_bug_value, axis=1)
train_features = df_features[df_features['_id'].isin(cleaned_train)]
test_features = df_features[df_features['_id'].isin(cleaned_test)]
print(len(train), len(test))
print(len(train_features), len(test_features))

# The pickle saves run first: they create the clean/ directory the CSV writes rely on
save_list_to_file(f'data/{project_name}/clean', f'{project_name}_train.pkl', train)
save_list_to_file(f'data/{project_name}/clean', f'{project_name}_test.pkl', test)
train_features.to_csv(f'data/{project_name}/clean/{project_name}_feature_train.csv', index=False)
test_features.to_csv(f'data/{project_name}/clean/{project_name}_feature_test.csv', index=False)

2180 242
2180 242


### Data-set commits information

In [10]:
# Report the class balance of the raw data-set and of both cleaned splits
for split_label, split_commits in (
    ("Uncleaned Data  ", data),
    ("Cleaned Train   ", train),
    ("Cleaned Test    ", test),
):
    number_clean, number_defect = data_info_extract(split_commits)
    print(f"{split_label}| Total: {number_clean + number_defect} - Clean: {number_clean} - Defect: {number_defect} - Rate: {number_defect / (number_clean + number_defect)}")

Uncleaned Data  | Total: 3980 - Clean: 3005 - Defect: 975 - Rate: 0.2449748743718593
Cleaned Train   | Total: 2180 - Clean: 1464 - Defect: 716 - Rate: 0.3284403669724771
Cleaned Test    | Total: 242 - Clean: 163 - Defect: 79 - Rate: 0.32644628099173556


### Data-set features information

In [11]:
# Class balance of the train feature table (label counts computed once)
train_bug_counts = train_features['bug'].value_counts()
number_clean = train_bug_counts.get(0, 0)
number_defect = train_bug_counts.get(1, 0)
print(f"Cleaned Train   | Total: {number_clean + number_defect} - Clean: {number_clean} - Defect: {number_defect} - Rate: {number_defect / (number_clean + number_defect)}")

# Class balance of the test feature table; this line was previously mislabelled "Cleaned Train"
test_bug_counts = test_features['bug'].value_counts()
number_clean = test_bug_counts.get(0, 0)
number_defect = test_bug_counts.get(1, 0)
print(f"Cleaned Test    | Total: {number_clean + number_defect} - Clean: {number_clean} - Defect: {number_defect} - Rate: {number_defect / (number_clean + number_defect)}")

Cleaned Train   | Total: 2180 - Clean: 1464 - Defect: 716 - Rate: 0.3284403669724771
Cleaned Train   | Total: 242 - Clean: 163 - Defect: 79 - Rate: 0.32644628099173556
