In [1]:
# Reference: PyDriller tutorial - https://pydriller.readthedocs.io/en/latest/tutorial.html

In [2]:
import pydriller
from pydriller import Repository
import json
import pandas
import os
from tqdm import tqdm as tq
import statistics
import csv

In [3]:
# Keywords that mark a commit as bug-related. bugs_from_commits_in_repo tests
# each one with a plain `in` check against the commit message, i.e. a
# case-sensitive substring match (so 'fix' also matches 'prefix', but not 'Fix').
bug_related_msg = ['bug','bugs','fix','fixes','patch','fault','corrected','tweaked','problem','problems','issue']

In [4]:
# exception_messages = []
def _median_and_mean(values):
    """Return ``(median, mean)`` of ``values``, or ``(None, None)`` if it is empty."""
    if not values:
        return None, None
    return statistics.median(values), sum(values) / len(values)


def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or ``None`` when the denominator is 0."""
    if denominator == 0:
        return None
    return numerator / denominator


def bugs_from_commits_in_repo(repo_name):
    """Classify the main-branch commits of a repository as bug/DVC related.

    A commit is *bug related* when its message contains any entry of the
    module-level ``bug_related_msg`` list, and *DVC related* when the old or
    new path of at least one modified file contains ``'dvc'`` (compared in
    lower case).

    Args:
        repo_name: Repository location handed unchanged to
            ``Repository(repo_name)``.

    Returns:
        A ``(commits_info, summary)`` tuple.

        ``commits_info`` maps the hash of every bug-related commit to a dict
        with its dates, message, committer e-mail, modified files and
        ``is_dvc_related`` flag. It also carries the four counters
        ``non_dvc_not_bug_count``, ``dvc_not_bug_count``,
        ``non_dvc_bug_count`` and ``dvc_bug_count`` as extra top-level keys.

        ``summary`` is a flat dict with the repository name, the bug ratios
        of DVC and non-DVC commits, median/average statistics computed over
        the commits that are both bug and DVC related, the four counters and
        their total. A ratio or statistic that has no data to be computed
        from (for example a repository without any DVC-related bug commit)
        is reported as ``None`` instead of raising.
    """
    commits_info = {}
    non_dvc_not_bug_count = 0
    dvc_not_bug_count = 0
    non_dvc_bug_count = 0
    dvc_bug_count = 0
    number_of_exceptions = 0
    # One entry per commit that is both bug related and DVC related.
    dvc_files_changes_count = []
    real_dvc_files_changes_count = []
    dvc_added_any_type_lines_per_repo = []
    dvc_deleted_any_type_lines_per_repo = []
    dvc_added_dvc_lines_per_repo = []
    dvc_deleted_dvc_lines_per_repo = []

    for commit in Repository(repo_name).traverse_commits():
        if not commit.in_main_branch:
            continue
        # NOTE(review): this is a case-sensitive substring test, so "Fix ..."
        # is missed while "prefix" matches 'fix'. Kept as is because changing
        # it would change the collected data - confirm whether it is intended.
        is_bug = any(bug_msg in commit.msg for bug_msg in bug_related_msg)
        is_dvc = False
        single_commmit = {}
        changed_files = []
        try:
            how_many_dvc_files = 0
            dvc_added_any_type_lines_per_commit = []
            dvc_deleted_any_type_lines_per_commit = []
            dvc_added_dvc_lines_per_commit = []
            dvc_deleted_dvc_lines_per_commit = []
            for file in commit.modified_files:
                if 'dvc' in str(file.old_path).lower() or 'dvc' in str(file.new_path).lower():
                    is_dvc = True
                    if not is_bug:
                        # Only the DVC flag matters for non-bug commits.
                        break
                    how_many_dvc_files += 1
                    dvc_added_dvc_lines_per_commit.append(file.added_lines)
                    dvc_deleted_dvc_lines_per_commit.append(file.deleted_lines)
                dvc_added_any_type_lines_per_commit.append(file.added_lines)
                dvc_deleted_any_type_lines_per_commit.append(file.deleted_lines)
                if is_bug:
                    changed_files.append({
                        'file_name': file.filename,
                        'old_path': file.old_path,
                        'new_path': file.new_path,
                        'nloc': file.nloc,
                        'added_lines': file.added_lines,
                        'deleted_lines': file.deleted_lines,
                    })

            if not is_bug:
                if is_dvc:
                    dvc_not_bug_count += 1
                else:
                    non_dvc_not_bug_count += 1
                continue
            if is_dvc:
                dvc_bug_count += 1
            else:
                non_dvc_bug_count += 1

            single_commmit['committer_date'] = str(commit.committer_date)
            single_commmit['author_date'] = str(commit.author_date)
            single_commmit['commit_message'] = commit.msg
            single_commmit['in_main_branch'] = commit.in_main_branch
            single_commmit['committer'] = commit.committer.email
            single_commmit["modified_files"] = changed_files
            single_commmit['is_dvc_related'] = is_dvc
            commits_info[commit.hash] = single_commmit

            if is_dvc:
                dvc_files_changes_count.append(len(commit.modified_files))
                real_dvc_files_changes_count.append(how_many_dvc_files)
                dvc_added_dvc_lines_per_repo.append(sum(dvc_added_dvc_lines_per_commit))
                dvc_deleted_dvc_lines_per_repo.append(sum(dvc_deleted_dvc_lines_per_commit))
                dvc_added_any_type_lines_per_repo.append(sum(dvc_added_any_type_lines_per_commit))
                dvc_deleted_any_type_lines_per_repo.append(sum(dvc_deleted_any_type_lines_per_commit))
        except Exception:
            # Best effort: a commit that cannot be analysed is left out of
            # every counter; the number of such commits is reported below.
            number_of_exceptions += 1

    commits_info["non_dvc_not_bug_count"] = non_dvc_not_bug_count
    commits_info["dvc_not_bug_count"] = dvc_not_bug_count
    commits_info["non_dvc_bug_count"] = non_dvc_bug_count
    commits_info["dvc_bug_count"] = dvc_bug_count

    any_files_med, any_files_avg = _median_and_mean(dvc_files_changes_count)
    dvc_files_med, dvc_files_avg = _median_and_mean(real_dvc_files_changes_count)
    added_any_med, added_any_avg = _median_and_mean(dvc_added_any_type_lines_per_repo)
    added_dvc_med, added_dvc_avg = _median_and_mean(dvc_added_dvc_lines_per_repo)
    deleted_any_med, deleted_any_avg = _median_and_mean(dvc_deleted_any_type_lines_per_repo)
    deleted_dvc_med, deleted_dvc_avg = _median_and_mean(dvc_deleted_dvc_lines_per_repo)

    # Key order matters: it becomes the column order of the summary CSV.
    summary = {}
    summary["repo_name"] = repo_name
    summary["DVC_Bug_Ratio_Per_Commit"] = _safe_ratio(
        dvc_bug_count, dvc_bug_count + dvc_not_bug_count)
    summary["NON_DVC_Bug_Ratio_Per_Commit"] = _safe_ratio(
        non_dvc_bug_count, non_dvc_bug_count + non_dvc_not_bug_count)
    summary["DVC_Files_Modified_Any_Type_Files_Med"] = any_files_med
    summary["DVC_Files_Modified_Any_Type_Files_Avg"] = any_files_avg
    summary["DVC_Files_Modified_DVC_Files_Med"] = dvc_files_med
    summary["DVC_Files_Modified_DVC_Files_Avg"] = dvc_files_avg
    summary["DVC_Files_Added_Any_Type_lines_Med"] = added_any_med
    summary["DVC_Files_Added_Any_Type_lines_Avg"] = added_any_avg
    summary["DVC_Files_Added_DVC_lines_Med"] = added_dvc_med
    summary["DVC_Files_Added_DVC_lines_Avg"] = added_dvc_avg
    summary["DVC_Files_Deleted_Any_Type_lines_Med"] = deleted_any_med
    summary["DVC_Files_Deleted_Any_Type_lines_Avg"] = deleted_any_avg
    summary["DVC_Files_Deleted_DVC_lines_Med"] = deleted_dvc_med
    summary["DVC_Files_Deleted_DVC_lines_Avg"] = deleted_dvc_avg
    summary["non_dvc_not_bug_count"] = non_dvc_not_bug_count
    summary["dvc_not_bug_count"] = dvc_not_bug_count
    summary["non_dvc_bug_count"] = non_dvc_bug_count
    summary["dvc_bug_count"] = dvc_bug_count
    summary["total"] = non_dvc_not_bug_count + dvc_not_bug_count + non_dvc_bug_count + dvc_bug_count

    print(summary)
    if number_of_exceptions:
        # Surface silently dropped commits so the counts can be interpreted.
        print("Warning: {} commit(s) of {} raised during analysis and were skipped".format(
            number_of_exceptions, repo_name))
    print()
    print()

    return commits_info, summary

In [5]:
def save_to_file(root, repo_name, file_name, json_obj):
    """Write ``json_obj`` as indented JSON to ``<root>/<repo>/<file_name>``.

    Args:
        root: Directory under which the per-repository folder lives.
        repo_name: Repository identifier of the form ``"owner/repo"``; the
            part after the first ``/`` is used as the folder name.
        file_name: Name of the JSON file to create inside that folder.
        json_obj: JSON-serialisable object to store.

    Raises:
        IndexError: If ``repo_name`` contains no ``/``.
    """
    repo_name = repo_name.split("/")[1]
    folder_path = os.path.join(root, repo_name)
    file_path = os.path.join(folder_path, file_name)
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(folder_path, exist_ok=True)
    with open(file_path, 'w') as f:
        json.dump(json_obj, f, indent=4)

In [6]:
def save_summary_to_file(root, file_name, all_summary):
    """Write a list of summary dicts as a CSV file at ``<root>/<file_name>``.

    The keys of the first summary are used as the header row and column
    order. ``root`` is created if it does not exist yet.

    Args:
        root: Directory the CSV file is written into.
        file_name: Name of the CSV file.
        all_summary: List of dicts sharing the same keys, one per row. When
            it is empty, an empty file is written.
    """
    # The joined path must be the one that is opened; opening the bare file
    # name would ignore ``root`` and write into the current working directory.
    file_path = os.path.join(root, file_name)
    if root:
        os.makedirs(root, exist_ok=True)
    with open(file_path, 'w', newline='') as output_file:
        if not all_summary:
            # No rows means no header can be derived; leave the file empty.
            return
        dict_writer = csv.DictWriter(output_file, list(all_summary[0].keys()))
        dict_writer.writeheader()
        dict_writer.writerows(all_summary)

In [7]:
def load_from_file(root, repo_name, file_name):
    """Read and return the JSON document stored at ``<root>/<repo>/<file_name>``.

    ``repo_name`` is expected in ``"owner/repo"`` form; the segment after the
    first ``/`` selects the folder below ``root``.
    """
    repo_dir = repo_name.split("/")[1]
    source_path = os.path.join(os.path.join(root, repo_dir), file_name)
    with open(source_path, 'r') as handle:
        parsed = json.load(handle)
    return parsed

In [8]:
# GitHub repositories to analyse, written as "owner/name". The driver loop
# prefixes each entry with "https://github.com/" before mining it, and
# save_to_file / load_from_file use the part after the "/" as the folder name.
repos = [
    'weiji14/deepicedrain',
    'ipno-llead/processing',
    'nasaharvest/crop-mask',
    'BlueBrain/Search',
    'rootski-io/rootski',
    'dna-seq/dna-seq',
    'GenericMappingTools/pygmt',
    'GenericMappingTools/gmt',
]

In [9]:
# Mine every repository in turn, keeping the per-commit details and the
# summary in memory and writing the details to disk right away so that a
# failure on a later repository does not lose earlier results.
commits_infos = []
all_summary = []
output_root = "C:\\Users\\F_RABBI\\Desktop\\se4a1_class_group_project\\data"
for repo in tq(repos):
    commit_info, summary = bugs_from_commits_in_repo("https://github.com/" + repo)
    commits_infos.append(commit_info)
    all_summary.append(summary)
    save_to_file(output_root, repo, "bugs_from_commits.json", commit_info)

 12%|██████████▌                                                                         | 1/8 [01:37<11:21, 97.29s/it]

{'repo_name': 'https://github.com/weiji14/deepicedrain', 'DVC_Bug_Ratio_Per_Commit': 0.5, 'NON_DVC_Bug_Ratio_Per_Commit': 0.12452107279693486, 'DVC_Files_Modified_Any_Type_Files_Med': 10, 'DVC_Files_Modified_Any_Type_Files_Avg': 10.0, 'DVC_Files_Modified_DVC_Files_Med': 5, 'DVC_Files_Modified_DVC_Files_Avg': 5.0, 'DVC_Files_Added_Any_Type_lines_Med': 67, 'DVC_Files_Added_Any_Type_lines_Avg': 67.0, 'DVC_Files_Added_DVC_lines_Med': 18, 'DVC_Files_Added_DVC_lines_Avg': 18.0, 'DVC_Files_Deleted_Any_Type_lines_Med': 39, 'DVC_Files_Deleted_Any_Type_lines_Avg': 39.0, 'DVC_Files_Deleted_DVC_lines_Med': 0, 'DVC_Files_Deleted_DVC_lines_Avg': 0.0, 'non_dvc_not_bug_count': 457, 'dvc_not_bug_count': 1, 'non_dvc_bug_count': 65, 'dvc_bug_count': 1, 'total': 524}




 25%|████████████████████▊                                                              | 2/8 [05:25<17:26, 174.43s/it]

{'repo_name': 'https://github.com/ipno-llead/processing', 'DVC_Bug_Ratio_Per_Commit': 0.02127659574468085, 'NON_DVC_Bug_Ratio_Per_Commit': 0.20943952802359883, 'DVC_Files_Modified_Any_Type_Files_Med': 3.0, 'DVC_Files_Modified_Any_Type_Files_Avg': 3.0, 'DVC_Files_Modified_DVC_Files_Med': 1.5, 'DVC_Files_Modified_DVC_Files_Avg': 1.5, 'DVC_Files_Added_Any_Type_lines_Med': 5.0, 'DVC_Files_Added_Any_Type_lines_Avg': 5.0, 'DVC_Files_Added_DVC_lines_Med': 1.5, 'DVC_Files_Added_DVC_lines_Avg': 1.5, 'DVC_Files_Deleted_Any_Type_lines_Med': 23.0, 'DVC_Files_Deleted_Any_Type_lines_Avg': 23.0, 'DVC_Files_Deleted_DVC_lines_Med': 7.5, 'DVC_Files_Deleted_DVC_lines_Avg': 7.5, 'non_dvc_not_bug_count': 1340, 'dvc_not_bug_count': 92, 'non_dvc_bug_count': 355, 'dvc_bug_count': 2, 'total': 1789}




 38%|███████████████████████████████▏                                                   | 3/8 [10:10<18:45, 225.01s/it]

{'repo_name': 'https://github.com/nasaharvest/crop-mask', 'DVC_Bug_Ratio_Per_Commit': 0.016666666666666666, 'NON_DVC_Bug_Ratio_Per_Commit': 0.06510969568294409, 'DVC_Files_Modified_Any_Type_Files_Med': 4.5, 'DVC_Files_Modified_Any_Type_Files_Avg': 4.5, 'DVC_Files_Modified_DVC_Files_Med': 3.5, 'DVC_Files_Modified_DVC_Files_Avg': 3.5, 'DVC_Files_Added_Any_Type_lines_Med': 16.5, 'DVC_Files_Added_Any_Type_lines_Avg': 16.5, 'DVC_Files_Added_DVC_lines_Med': 9.0, 'DVC_Files_Added_DVC_lines_Avg': 9.0, 'DVC_Files_Deleted_Any_Type_lines_Med': 15.0, 'DVC_Files_Deleted_Any_Type_lines_Avg': 15.0, 'DVC_Files_Deleted_DVC_lines_Med': 9.0, 'DVC_Files_Deleted_DVC_lines_Avg': 9.0, 'non_dvc_not_bug_count': 1321, 'dvc_not_bug_count': 118, 'non_dvc_bug_count': 92, 'dvc_bug_count': 2, 'total': 1533}




 50%|█████████████████████████████████████████▌                                         | 4/8 [11:05<10:30, 157.68s/it]

{'repo_name': 'https://github.com/BlueBrain/Search', 'DVC_Bug_Ratio_Per_Commit': 0.034482758620689655, 'NON_DVC_Bug_Ratio_Per_Commit': 0.12285012285012285, 'DVC_Files_Modified_Any_Type_Files_Med': 55, 'DVC_Files_Modified_Any_Type_Files_Avg': 55.0, 'DVC_Files_Modified_DVC_Files_Med': 35, 'DVC_Files_Modified_DVC_Files_Avg': 35.0, 'DVC_Files_Added_Any_Type_lines_Med': 1359, 'DVC_Files_Added_Any_Type_lines_Avg': 1359.0, 'DVC_Files_Added_DVC_lines_Med': 698, 'DVC_Files_Added_DVC_lines_Avg': 698.0, 'DVC_Files_Deleted_Any_Type_lines_Med': 77, 'DVC_Files_Deleted_Any_Type_lines_Avg': 77.0, 'DVC_Files_Deleted_DVC_lines_Med': 0, 'DVC_Files_Deleted_DVC_lines_Avg': 0.0, 'non_dvc_not_bug_count': 357, 'dvc_not_bug_count': 28, 'non_dvc_bug_count': 50, 'dvc_bug_count': 1, 'total': 436}




 62%|███████████████████████████████████████████████████▉                               | 5/8 [12:00<06:02, 120.71s/it]

{'repo_name': 'https://github.com/rootski-io/rootski', 'DVC_Bug_Ratio_Per_Commit': 0.2, 'NON_DVC_Bug_Ratio_Per_Commit': 0.16777041942604856, 'DVC_Files_Modified_Any_Type_Files_Med': 51, 'DVC_Files_Modified_Any_Type_Files_Avg': 51.0, 'DVC_Files_Modified_DVC_Files_Med': 21, 'DVC_Files_Modified_DVC_Files_Avg': 21.0, 'DVC_Files_Added_Any_Type_lines_Med': 227, 'DVC_Files_Added_Any_Type_lines_Avg': 227.0, 'DVC_Files_Added_DVC_lines_Med': 0, 'DVC_Files_Added_DVC_lines_Avg': 0.0, 'DVC_Files_Deleted_Any_Type_lines_Med': 83, 'DVC_Files_Deleted_Any_Type_lines_Avg': 83.0, 'DVC_Files_Deleted_DVC_lines_Med': 0, 'DVC_Files_Deleted_DVC_lines_Avg': 0.0, 'non_dvc_not_bug_count': 377, 'dvc_not_bug_count': 4, 'non_dvc_bug_count': 76, 'dvc_bug_count': 1, 'total': 458}




 75%|███████████████████████████████████████████████████████████████                     | 6/8 [12:15<02:49, 84.69s/it]

{'repo_name': 'https://github.com/dna-seq/dna-seq', 'DVC_Bug_Ratio_Per_Commit': 0.18181818181818182, 'NON_DVC_Bug_Ratio_Per_Commit': 0.20689655172413793, 'DVC_Files_Modified_Any_Type_Files_Med': 6.5, 'DVC_Files_Modified_Any_Type_Files_Avg': 8.0, 'DVC_Files_Modified_DVC_Files_Med': 3.5, 'DVC_Files_Modified_DVC_Files_Avg': 3.75, 'DVC_Files_Added_Any_Type_lines_Med': 26.0, 'DVC_Files_Added_Any_Type_lines_Avg': 55.75, 'DVC_Files_Added_DVC_lines_Med': 19.5, 'DVC_Files_Added_DVC_lines_Avg': 29.0, 'DVC_Files_Deleted_Any_Type_lines_Med': 23.0, 'DVC_Files_Deleted_Any_Type_lines_Avg': 20.0, 'DVC_Files_Deleted_DVC_lines_Med': 9.5, 'DVC_Files_Deleted_DVC_lines_Avg': 9.25, 'non_dvc_not_bug_count': 69, 'dvc_not_bug_count': 18, 'non_dvc_bug_count': 18, 'dvc_bug_count': 4, 'total': 109}




 88%|████████████████████████████████████████████████████████████████████████▋          | 7/8 [17:35<02:41, 161.53s/it]

{'repo_name': 'https://github.com/GenericMappingTools/pygmt', 'DVC_Bug_Ratio_Per_Commit': 0.1595744680851064, 'NON_DVC_Bug_Ratio_Per_Commit': 0.14724354501046755, 'DVC_Files_Modified_Any_Type_Files_Med': 7, 'DVC_Files_Modified_Any_Type_Files_Avg': 6.266666666666667, 'DVC_Files_Modified_DVC_Files_Med': 1, 'DVC_Files_Modified_DVC_Files_Avg': 1.6, 'DVC_Files_Added_Any_Type_lines_Med': 13, 'DVC_Files_Added_Any_Type_lines_Avg': 59.13333333333333, 'DVC_Files_Added_DVC_lines_Med': 1, 'DVC_Files_Added_DVC_lines_Avg': 4.466666666666667, 'DVC_Files_Deleted_Any_Type_lines_Med': 1, 'DVC_Files_Deleted_Any_Type_lines_Avg': 8.4, 'DVC_Files_Deleted_DVC_lines_Med': 1, 'DVC_Files_Deleted_DVC_lines_Avg': 1.0, 'non_dvc_not_bug_count': 1222, 'dvc_not_bug_count': 79, 'non_dvc_bug_count': 211, 'dvc_bug_count': 15, 'total': 1527}


{'repo_name': 'https://github.com/GenericMappingTools/gmt', 'DVC_Bug_Ratio_Per_Commit': 0.35, 'NON_DVC_Bug_Ratio_Per_Commit': 0.12768607910308316, 'DVC_Files_Modified_Any_Type_File

100%|█████████████████████████████████████████████████████████████████████████████████| 8/8 [1:57:55<00:00, 884.44s/it]


In [13]:
# Persist the per-repository summaries collected by the mining loop as a CSV file.
save_summary_to_file("C:\\Users\\F_RABBI\\Desktop\\se4a1_class_group_project\\data","commits_summary.csv",all_summary)

In [10]:
# commits_infos = []
# for i in tq(range(len(repos))):
#     commits_infos.append(load_from_file("C:\\Users\\F_RABBI\\Desktop\\se4a1_class_group_project\\data",repos[i],"bugs_from_commits.json"))

In [11]:
# Sanity check: one entry is expected per repository in `repos`.
len(commits_infos)

8

In [12]:
# for index in range(8):
#     commits_info = commits_infos[index]
#     dvc_files_changes_count = []
#     non_dvc_files_changes_count = []

#     for commit_hash in commits_info.keys():
#         try:
#             number = len(commits_info[commit_hash]['modified_files'])
#             if commits_info[commit_hash]["is_dvc_related"]:
#                 dvc_files_changes_count.append(number)
#             else:
#                 non_dvc_files_changes_count.append(number)
#         except:
#             continue

#     print("DVC Files::Median and Average for Nmber of Modified Files:     ", statistics.median(dvc_files_changes_count), ",", sum(dvc_files_changes_count)/len(dvc_files_changes_count))
#     print("NON DVC Files::Median and Average for Nmber of Modified Files: ", statistics.median(non_dvc_files_changes_count), ",", sum(non_dvc_files_changes_count)/len(non_dvc_files_changes_count))
#     print()