In [31]:
# https://pydriller.readthedocs.io/en/latest/tutorial.html

In [32]:
import pydriller
from pydriller import Repository
import json
import pandas
import os
from tqdm import tqdm as tq
import statistics

In [33]:
bug_related_msg = ['bug','bugs','fix','fixes','patch','fault','corrected','tweaked','problem','problems','issue']

In [39]:
# exception_messages = []
def bugs_from_commits_in_repo(repo_name):
    commits_info = {}
    non_dvc_not_bug_count = 0
    dvc_not_bug_count = 0
    non_dvc_bug_count = 0
    dvc_bug_count = 0
    number_of_exceptions = 0
    total_commits = 0
    dvc_files_changes_count = []
    non_dvc_files_changes_count = []
    
    for commit in Repository(repo_name).traverse_commits():
        total_commits += 1
        if commit.in_main_branch == False:
            continue
        is_bug = False
        is_dvc = False
        for bug_msg in bug_related_msg:
            if bug_msg in commit.msg:
                is_bug = True
                break
        single_commmit = {}
        changed_files = []
        try:            
            for file in commit.modified_files:
                if 'dvc' in str(file.old_path).lower() or 'dvc' in str(file.new_path).lower():
                    is_dvc = True
                    if is_bug == False:
                        break
                if is_bug == True:
                    file_obj = {}
                    file_obj['file_name'] = file.filename
                    file_obj['old_path'] = file.old_path
                    file_obj['new_path'] = file.new_path
                    file_obj['nloc'] = file.nloc
                    changed_files.append(file_obj)

            if is_bug == False:
                if is_dvc == False:
                    non_dvc_not_bug_count += 1
                else:
                    dvc_not_bug_count += 1
                continue
            if is_dvc == True:
                dvc_bug_count += 1
            else:
                non_dvc_bug_count += 1

            single_commmit['committer_date'] = str(commit.committer_date) 
            single_commmit['author_date'] = str(commit.author_date)
            single_commmit['commit_message'] = commit.msg
            single_commmit['in_main_branch'] = commit.in_main_branch
            single_commmit['committer'] = commit.committer.email
            single_commmit["modified_files"]=changed_files
            single_commmit['is_dvc_related'] = is_dvc
            commits_info[commit.hash] = single_commmit

            if is_dvc:
                dvc_files_changes_count.append(len(commit.modified_files))
            else:
                non_dvc_files_changes_count.append(len(commit.modified_files))
        except Exception as e:
            number_of_exceptions += 1
            # exception_messages.append(str(e))
            # print(str(e))
    commits_info["non_dvc_not_bug_count"] = non_dvc_not_bug_count
    commits_info["dvc_not_bug_count"] = dvc_not_bug_count
    commits_info["non_dvc_bug_count"] = non_dvc_bug_count
    commits_info["dvc_bug_count"] = dvc_bug_count
    print("Repo Name", repo_name)
    print("DVC Bug Ratio Per Commit: ", dvc_bug_count/(dvc_bug_count+dvc_not_bug_count))
    print("NON DVC Bug Ratio Per Commit: ", non_dvc_bug_count/(non_dvc_bug_count+non_dvc_not_bug_count))
    print("DVC Files Median and Average for Number of Modified Files:     ", statistics.median(dvc_files_changes_count), ",", sum(dvc_files_changes_count)/len(dvc_files_changes_count))
    print("NON DVC Files Median and Average for Number of Modified Files: ", statistics.median(non_dvc_files_changes_count), ",", sum(non_dvc_files_changes_count)/len(non_dvc_files_changes_count))
    print("non_dvc_not_bug_count, dvc_not_bug_count, non_dvc_bug_count, dvc_bug_count, non_dvc_not_bug_count + dvc_not_bug_count + non_dvc_bug_count + dvc_bug_count, total_commits")
    print(non_dvc_not_bug_count, dvc_not_bug_count, non_dvc_bug_count, dvc_bug_count, non_dvc_not_bug_count + dvc_not_bug_count + non_dvc_bug_count + dvc_bug_count, total_commits)
    print("Number of Exceptions", str(number_of_exceptions))
    print()
    return commits_info

In [40]:
def save_to_file(root, repo_name, file_name, json_obj):
    repo_name = repo_name.split("/")[1]
    folder_path = os.path.join(root,repo_name)
    file_path = os.path.join(folder_path,file_name)
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
    with open(file_path, 'w') as f:
        json.dump(json_obj, f, indent=4)

In [41]:
def load_from_file(root, repo_name, file_name):
    repo_name = repo_name.split("/")[1]
    folder_path = os.path.join(root,repo_name)
    file_path = os.path.join(folder_path,file_name)
    with open(file_path, 'r') as f:
        return json.load(f)

In [42]:
repos = [
    'weiji14/deepicedrain',
    'ipno-llead/processing',
    'nasaharvest/crop-mask',
    'BlueBrain/Search',
    'rootski-io/rootski',
    'dna-seq/dna-seq',
    'GenericMappingTools/pygmt',
    'GenericMappingTools/gmt',
]

In [None]:
commits_infos = []
for i in tq(range(len(repos))):
    commits_infos.append(bugs_from_commits_in_repo("https://github.com/"+repos[i]))
    save_to_file("C:\\Users\\F_RABBI\\Desktop\\se4a1_class_group_project\\data",repos[i],"bugs_from_commits.json",commits_infos[i])

 12%|██████████▌                                                                         | 1/8 [01:29<10:25, 89.32s/it]

Repo Name https://github.com/weiji14/deepicedrain
DVC Bug Ratio Per Commit:  0.5
NON DVC Bug Ratio Per Commit:  0.12452107279693486
DVC Files Median and Average for Number of Modified Files:      10 , 10.0
NON DVC Files Median and Average for Number of Modified Files:  2 , 3.076923076923077
non_dvc_not_bug_count, dvc_not_bug_count, non_dvc_bug_count, dvc_bug_count, non_dvc_not_bug_count + dvc_not_bug_count + non_dvc_bug_count + dvc_bug_count, total_commits
457 1 65 1 524 536
Number of Exceptions 12



 25%|████████████████████▊                                                              | 2/8 [05:04<16:19, 163.18s/it]

Repo Name https://github.com/ipno-llead/processing
DVC Bug Ratio Per Commit:  0.021505376344086023
NON DVC Bug Ratio Per Commit:  0.20855614973262032
DVC Files Median and Average for Number of Modified Files:      3.0 , 3.0
NON DVC Files Median and Average for Number of Modified Files:  1 , 1.6980056980056981
non_dvc_not_bug_count, dvc_not_bug_count, non_dvc_bug_count, dvc_bug_count, non_dvc_not_bug_count + dvc_not_bug_count + non_dvc_bug_count + dvc_bug_count, total_commits
1332 91 351 2 1776 1776
Number of Exceptions 0



 38%|███████████████████████████████▏                                                   | 3/8 [10:17<19:19, 231.88s/it]

Repo Name https://github.com/nasaharvest/crop-mask
DVC Bug Ratio Per Commit:  0.016666666666666666
NON DVC Bug Ratio Per Commit:  0.06510969568294409
DVC Files Median and Average for Number of Modified Files:      4.5 , 4.5
NON DVC Files Median and Average for Number of Modified Files:  1.0 , 1.3478260869565217
non_dvc_not_bug_count, dvc_not_bug_count, non_dvc_bug_count, dvc_bug_count, non_dvc_not_bug_count + dvc_not_bug_count + non_dvc_bug_count + dvc_bug_count, total_commits
1321 118 92 2 1533 1533
Number of Exceptions 0



 50%|█████████████████████████████████████████▌                                         | 4/8 [11:11<10:46, 161.71s/it]

Repo Name https://github.com/BlueBrain/Search
DVC Bug Ratio Per Commit:  0.034482758620689655
NON DVC Bug Ratio Per Commit:  0.12285012285012285
DVC Files Median and Average for Number of Modified Files:      55 , 55.0
NON DVC Files Median and Average for Number of Modified Files:  3.0 , 4.74
non_dvc_not_bug_count, dvc_not_bug_count, non_dvc_bug_count, dvc_bug_count, non_dvc_not_bug_count + dvc_not_bug_count + non_dvc_bug_count + dvc_bug_count, total_commits
357 28 50 1 436 436
Number of Exceptions 0



 62%|███████████████████████████████████████████████████▉                               | 5/8 [12:06<06:08, 122.98s/it]

Repo Name https://github.com/rootski-io/rootski
DVC Bug Ratio Per Commit:  0.2
NON DVC Bug Ratio Per Commit:  0.16777041942604856
DVC Files Median and Average for Number of Modified Files:      51 , 51.0
NON DVC Files Median and Average for Number of Modified Files:  1.0 , 2.8289473684210527
non_dvc_not_bug_count, dvc_not_bug_count, non_dvc_bug_count, dvc_bug_count, non_dvc_not_bug_count + dvc_not_bug_count + non_dvc_bug_count + dvc_bug_count, total_commits
377 4 76 1 458 458
Number of Exceptions 0



 75%|███████████████████████████████████████████████████████████████                     | 6/8 [12:21<02:52, 86.29s/it]

Repo Name https://github.com/dna-seq/dna-seq
DVC Bug Ratio Per Commit:  0.18181818181818182
NON DVC Bug Ratio Per Commit:  0.20689655172413793
DVC Files Median and Average for Number of Modified Files:      6.5 , 8.0
NON DVC Files Median and Average for Number of Modified Files:  2.0 , 2.5
non_dvc_not_bug_count, dvc_not_bug_count, non_dvc_bug_count, dvc_bug_count, non_dvc_not_bug_count + dvc_not_bug_count + non_dvc_bug_count + dvc_bug_count, total_commits
69 18 18 4 109 109
Number of Exceptions 0



 88%|████████████████████████████████████████████████████████████████████████▋          | 7/8 [17:39<02:42, 162.07s/it]

Repo Name https://github.com/GenericMappingTools/pygmt
DVC Bug Ratio Per Commit:  0.1595744680851064
NON DVC Bug Ratio Per Commit:  0.1473463687150838
DVC Files Median and Average for Number of Modified Files:      7 , 6.266666666666667
NON DVC Files Median and Average for Number of Modified Files:  2 , 4.1753554502369665
non_dvc_not_bug_count, dvc_not_bug_count, non_dvc_bug_count, dvc_bug_count, non_dvc_not_bug_count + dvc_not_bug_count + non_dvc_bug_count + dvc_bug_count, total_commits
1221 79 211 15 1526 1526
Number of Exceptions 0



In [8]:
# commits_infos = []
# for i in tq(range(len(repos))):
#     commits_infos.append(load_from_file("C:\\Users\\F_RABBI\\Desktop\\se4a1_class_group_project\\data",repos[i],"bugs_from_commits.json"))

100%|███████████████████████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 119.65it/s]


In [10]:
len(commits_infos)

8

In [None]:
# for index in range(8):
#     commits_info = commits_infos[index]
#     dvc_files_changes_count = []
#     non_dvc_files_changes_count = []

#     for commit_hash in commits_info.keys():
#         try:
#             number = len(commits_info[commit_hash]['modified_files'])
#             if commits_info[commit_hash]["is_dvc_related"]:
#                 dvc_files_changes_count.append(number)
#             else:
#                 non_dvc_files_changes_count.append(number)
#         except:
#             continue

#     print("DVC Files::Median and Average for Nmber of Modified Files:     ", statistics.median(dvc_files_changes_count), ",", sum(dvc_files_changes_count)/len(dvc_files_changes_count))
#     print("NON DVC Files::Median and Average for Nmber of Modified Files: ", statistics.median(non_dvc_files_changes_count), ",", sum(non_dvc_files_changes_count)/len(non_dvc_files_changes_count))
#     print()