## Imports

In [1]:
import itertools
import sys
from collections import Counter

import numpy as np
import pandas as pd
from scipy import stats

sys.path.append('..')
from const import *

## Support Functions

In [2]:
def fix_proportion2(fixes, mode='both'):
    absolute = [
        sum(fixes['by'] == 'human') + sum(fixes['by'] == 'bot'),
        len(fixes.index) - sum(fixes['by'] == 'human') - sum(fixes['by'] == 'bot'),
        sum(fixes['by'] == 'bot'),
        sum(fixes['by'] == 'human')
    ]
    relative = [
        str(round((sum(fixes['by'] == 'human') + sum(fixes['by'] == 'bot'))/len(fixes.index) * 100, 2)) + "%",
        str(round((len(fixes.index) - sum(fixes['by'] == 'human') - sum(fixes['by'] == 'bot'))/len(fixes.index) * 100, 2)) + "%",
        str(round(sum(fixes['by'] == 'bot') / (sum(fixes['by'] == 'human') + sum(fixes['by'] == 'bot')) * 100, 2)) + "%",
        str(round(sum(fixes['by'] == 'human') / (sum(fixes['by'] == 'human') + sum(fixes['by'] == 'bot')) * 100, 2)) + "%"
    ]
    if mode == 'both':
        return absolute, relative
    elif mode == 'absolute':
        return absolute
    elif mode == 'relative':
        return relative
    else:
        return None
    
def fix_proportion_constrained(fixes, prs, lower, upper, mode='both'):
    combo = fixes.merge(prs, how='inner', on='repository')
    return fix_proportion2(combo[(combo['prs'] > lower) & (combo['prs'] < upper)], mode)

## Load Data

In [3]:
fixes = pd.read_csv(CSV_DATA['fixes_labels_round_2'], index_col=False)

reasons = pd.read_csv(CSV_DATA['stage_2_second_rater_true'], index_col=False)
pr_vuln = pd.read_csv(CSV_DATA['pr_vulnerabilities'], index_col=False)
pr_vuln = pr_vuln[pr_vuln['state'] != 'OPEN']
pr_nums = []
projects = set(pr_vuln['repository'].to_list())
for project in projects:
    pr_nums.append(len(pr_vuln[pr_vuln['repository'] == project].index))
pr_nums = pd.DataFrame({'repository': list(projects), 'prs': pr_nums})

## Overall Analysis

In [4]:
len(pr_vuln)

4195

In [5]:
len(fixes)

4978

In [6]:
len(fixes[fixes['fixed']==True])

4169

In [7]:
#Statistics only on fixing vulnerabilities
statisticsgrouped1 = fixes.groupby(by=['fixed'])['case'].count().reset_index()
statisticsgrouped1['case_perc'] = (statisticsgrouped1['case'] / statisticsgrouped1.case.sum())*100
statisticsgrouped1

Unnamed: 0,fixed,case,case_perc
0,False,809,16.251507
1,True,4169,83.748493


In [8]:
# statistics of all vulnerabilities
statisticsgrouped2 = fixes.fillna('')

statisticsgrouped2 = statisticsgrouped2.groupby(by=['fixed','by'])['case'].count().reset_index()
statisticsgrouped2['case_perc'] = (statisticsgrouped2['case'] / statisticsgrouped2.case.sum())*100
statisticsgrouped2

Unnamed: 0,fixed,by,case,case_perc
0,False,,809,16.251507
1,True,bot,2662,53.475291
2,True,human,1507,30.273202


In [9]:
fixes[(fixes['fixed']==False) & (fixes['B'].str.find('[\'merged\']')>=0)]

Unnamed: 0.1,Unnamed: 0,case,repository,number,associated,package,ghsa,fixed,by,A,B,C,D
20,20,20,ACM-UCI/ACM-UCI-Website,82,[],acorn,GHSA-6chw-6frg-f759,False,,,['merged'],[],[]
64,64,64,AlexsLemonade/refinebio-frontend,882,[],acorn,GHSA-6chw-6frg-f759,False,,,['merged'],[],[]
417,433,436,FUB-HCC/IKON-projektor,225,[],acorn,GHSA-6chw-6frg-f759,False,,,['merged'],[],[]
630,647,650,Integreat/integreat-react-native-app,223,[],acorn,GHSA-6chw-6frg-f759,False,,,['merged'],[],[]
635,652,655,Integreat/integreat-webapp,330,[],acorn,GHSA-6chw-6frg-f759,False,,,['merged'],[],[]
971,1002,1005,SofthouseVxo/Education,6,[],acorn,GHSA-6chw-6frg-f759,False,,,['merged'],[],[]
1089,1120,1123,Thorium-Sim/thorium,2842,[],acorn,GHSA-6chw-6frg-f759,False,,,['merged'],[],[]
1146,1177,1180,VizierDB/web-ui,132,[],handlebars,GHSA-q42p-pg8m-cqh6,False,,True,['merged'],['other'],[]
1147,1178,1181,VizierDB/web-ui,133,[],webpack-dev-server,GHSA-cf66-xwfp-gvc4,False,,True,['merged'],['other'],[]
1269,1300,1303,appbaseio/reactivesearch,1384,[],acorn,GHSA-6chw-6frg-f759,False,,,['merged'],[],[]


In [10]:
#Statistics of all vulnerabilities
statisticsgrouped3 = fixes[(fixes['by']!='') & (fixes['fixed']==True)]
statisticsgrouped3 = statisticsgrouped3.groupby(by=['by'])['case'].count().reset_index()
statisticsgrouped3['case_perc'] = (statisticsgrouped3['case'] / statisticsgrouped3.case.sum())*100
statisticsgrouped3

Unnamed: 0,by,case,case_perc
0,bot,2662,63.852243
1,human,1507,36.147757


In [11]:
pd.merge(
    statisticsgrouped2.groupby('by')['case'].sum().reset_index(),
    statisticsgrouped2.groupby('by')['case_perc'].sum().reset_index(),
    on='by',
    how='inner'
).rename(columns={'by':'fixed by'})

Unnamed: 0,fixed by,case,case_perc
0,,809,16.251507
1,bot,2662,53.475291
2,human,1507,30.273202


## Project-level Analysis

In [12]:
prs_per_project = pr_vuln.groupby('repository')['number'].count().reset_index()

def get_category(num):
    if num>=1 and num<=2:
        return '[1,2]'
    elif num>=3 and num<=4:
        return '[3,4]'
    elif num>=5 and num<=10:
        return '[5,10]'
    elif num>=11 and num<=67:
        return '[11,67]'
    
prs_per_project['category'] = prs_per_project.apply(lambda x: get_category(x['number']), axis=1)

prs_per_project.head()

Unnamed: 0,repository,number,category
0,11ways/hawkejs,1,"[1,2]"
1,18F/fedramp-dashboard,3,"[3,4]"
2,2600hz/monster-ui,1,"[1,2]"
3,30-seconds/30-seconds-of-code,8,"[5,10]"
4,ACM-UCI/ACM-UCI-Website,9,"[5,10]"


In [13]:
len(prs_per_project)

962

In [14]:
len(fixes.repository.unique())

978

In [15]:
len(pr_vuln)

4195

In [16]:
fixes.fixed.unique()

array([ True, False])

In [17]:
pr_vuln[pr_vuln['state']=='CLOSED']

Unnamed: 0.1,Unnamed: 0,repository,number,url,date,state,package,from,to,vulnerabilities,severities,maximal_severity
0,0,b2wads/grimorio-ui,74,https://github.com/b2wads/grimorio-ui/pull/74,2019-10-10T19:21:58Z,CLOSED,webpack-bundle-analyzer,3.0.3,3.3.2,['GHSA-pgr8-jg6h-8gw6'],['MODERATE'],MODERATE
4,4,idena-network/idena-desktop,35,https://github.com/idena-network/idena-desktop...,2019-09-11T10:02:08Z,CLOSED,mixin-deep,1.3.1,1.3.2,['GHSA-fhjf-83wg-r2j9'],['HIGH'],HIGH
5,5,idena-network/idena-desktop,36,https://github.com/idena-network/idena-desktop...,2019-09-11T10:02:13Z,CLOSED,eslint-utils,1.3.1,1.4.2,['GHSA-3gx7-xhv7-5mx3'],['CRITICAL'],CRITICAL
7,7,idena-network/idena-desktop,215,https://github.com/idena-network/idena-desktop...,2020-03-31T12:14:36Z,CLOSED,next,9.3.1,9.3.2,['GHSA-fq77-7p7r-83rj'],['MODERATE'],MODERATE
8,8,rand256/valetudo,7,https://github.com/rand256/valetudo/pull/7,2019-07-13T13:49:01Z,CLOSED,mqtt-packet,5.6.0,5.6.1,['GHSA-wv67-9jq7-8r69'],['HIGH'],HIGH
...,...,...,...,...,...,...,...,...,...,...,...,...
4411,4479,GMOD/jbrowse,1436,https://github.com/GMOD/jbrowse/pull/1436,2019-10-14T14:46:39Z,CLOSED,mixin-deep,1.3.1,1.3.2,['GHSA-fhjf-83wg-r2j9'],['HIGH'],HIGH
4412,4480,GMOD/jbrowse,1468,https://github.com/GMOD/jbrowse/pull/1468,2020-02-13T22:21:45Z,CLOSED,dojox,1.16.0,1.16.1,['GHSA-pg97-ww7h-5mjr'],['LOW'],LOW
4413,4481,GMOD/jbrowse,1470,https://github.com/GMOD/jbrowse/pull/1470,2020-02-14T23:40:19Z,CLOSED,yarn,1.21.1,1.22.0,['GHSA-5xf4-f2fq-f69j'],['MODERATE'],MODERATE
4414,4482,GMOD/jbrowse,1481,https://github.com/GMOD/jbrowse/pull/1481,2020-03-10T18:06:20Z,CLOSED,dojox,1.16.0,1.16.2,"['GHSA-3hw5-q855-g6cw', 'GHSA-pg97-ww7h-5mjr']","['LOW', 'LOW']",LOW


In [18]:
# there are 10 repositories in the fixes dataframe that do not exist in the pr_vuln dataframe
if 'category' not in fixes.columns:
    fixes = pd.merge(
        fixes,
        prs_per_project[['repository','category']],
        on='repository',
        how='inner'
    )
    fixes['category'] = fixes['category'].astype(str)
fixes.head()

# hamid
# this is where fixes records are reduce

Unnamed: 0.1,Unnamed: 0,case,repository,number,associated,package,ghsa,fixed,by,A,B,C,D,category
0,0,0,11ways/hawkejs,10,[],codecov,GHSA-5q88-cjfq-g2mh,True,human,False,['other'],['other'],[],"[1,2]"
1,1,1,18F/fedramp-dashboard,83,"[83, 84]",angular,GHSA-28hp-fgcr-2r4h,True,bot,True,['merged'],['other'],[],"[3,4]"
2,2,2,18F/fedramp-dashboard,91,[],angular,GHSA-89mq-4x47-5v83,True,human,False,['update dependencies'],['other'],[],"[3,4]"
3,3,3,18F/fedramp-dashboard,91,[],angular,GHSA-28hp-fgcr-2r4h,True,human,False,['update dependencies'],['other'],[],"[3,4]"
4,4,4,2600hz/monster-ui,596,[],acorn,GHSA-6chw-6frg-f759,True,human,True,['update dependencies'],['other'],[],"[1,2]"


In [19]:
resultsrq21 = fixes.groupby(by=['category','fixed'])['case'].count().reset_index()

# hamid
total_fixed_or_not = fixes.groupby(by=['fixed'])['case'].count().reset_index()
total_fixed_or_not['perc'] = (total_fixed_or_not['case'] / total_fixed_or_not.case.sum())*100
print(total_fixed_or_not)

resultsrq21['total'] = 0
for cat in resultsrq21.category.unique():
    tot = resultsrq21[resultsrq21['category']==cat].case.sum()
    resultsrq21.loc[resultsrq21['category']==cat,'total'] = tot
resultsrq21['perc'] = (resultsrq21['case']/resultsrq21['total'])*100
pd.pivot_table(resultsrq21,values='perc',index=['category'],columns=['fixed'],aggfunc=np.sum)

   fixed  case       perc
0  False   757  15.379927
1   True  4165  84.620073


fixed,False,True
category,Unnamed: 1_level_1,Unnamed: 2_level_1
"[1,2]",23.64946,76.35054
"[11,67]",16.071429,83.928571
"[3,4]",9.225513,90.774487
"[5,10]",13.960114,86.039886


In [20]:
resultsrq22 = fixes.groupby(by=['category','by'])['case'].count().reset_index()
resultsrq22['total'] = 0
for cat in resultsrq22.category.unique():
    tot = resultsrq22[resultsrq22['category']==cat].case.sum()
    resultsrq22.loc[resultsrq22['category']==cat,'total'] = tot
resultsrq22['perc'] = (resultsrq22['case']/resultsrq22['total'])*100
pd.pivot_table(resultsrq22,values='perc',index=['category'],columns=['by'],aggfunc=np.sum)

by,bot,human
category,Unnamed: 1_level_1,Unnamed: 2_level_1
"[1,2]",52.830189,47.169811
"[11,67]",71.11293,28.88707
"[3,4]",61.355082,38.644918
"[5,10]",64.10596,35.89404


In [21]:
from statsmodels.stats.multitest import multipletests

observed_values = np.array([fix_proportion_constrained(fixes, pr_nums, 0, 3, mode='absolute')[0:2],
                            fix_proportion_constrained(fixes, pr_nums, 2, 5, mode='absolute')[0:2],
                            fix_proportion_constrained(fixes, pr_nums, 4, 11, mode='absolute')[0:2],
                            fix_proportion_constrained(fixes, pr_nums, 10, 68, mode='absolute')[0:2]])
print(observed_values)
# perform Pearson's chi-squared test
stat, p, dof, expected = stats.chi2_contingency(observed_values)
# retrieve the number of samples in the table
n = observed_values.sum()
# compute the Cramer's V; since the minimum between number of columns/rows - 1 in table is equal to 1
# the formula is square root of (test value, i.e., chi^2, divided by umber of samples in the table)
cramer_v = ((stat/n)/1)**0.5
# create a dataframe
print("\033[1m" + 'Overall Perason Chisquare:' + "\033[0m")
print('p-value: {}; Cramer\'s V: {}'.format(p, cramer_v))
print("\033[1m" + 'Post-hoc pairwise comparisons:' + "\033[0m")
post_hoc_p = np.zeros((4, 4))
post_hoc_sig = np.zeros((4, 4))
p_values = []
combinations = list(itertools.combinations(range(0,4), 2))
for i, j in combinations:
    _, p, _, _ = stats.chi2_contingency(observed_values[[i,j], :])
    p_values.append(p)
pd.DataFrame({'p': p_values}).to_csv('data.csv', index=False)
print(p_values)

# mptest = multipletests(p_values)
# print(mptest)

command = f'Rscript adjust_p.R {DIR_RQ} data.csv'
os.system(command)
os.remove('data.csv')
p_adjusted = []
with open("adjusted_p.txt") as f:
    for line in f:
        p_adjusted_str = line.split()
        p_adjusted = [float(x) for x in p_adjusted_str]
os.remove('adjusted_p.txt')
for index, combo in enumerate(combinations):
    if p_adjusted[index] < 0.001:
        post_hoc_sig[combo[0],combo[1]] = 4
    elif p_adjusted[index] < 0.01:
        post_hoc_sig[combo[0],combo[1]] = 3
    elif p_adjusted[index] < 0.05:
        post_hoc_sig[combo[0],combo[1]] = 2
    else:
        post_hoc_p[combo[0],combo[1]] = 1
    post_hoc_p[combo[0],combo[1]] = p_adjusted[index]
print("\033[1m" + '[p-value]' + "\033[0m")
print(post_hoc_p)
print("\033[1m" + '[different significance levels]' + "\033[0m")
print(post_hoc_sig)

[[ 636  197]
 [ 797   81]
 [1510  245]
 [1222  234]]
[1mOverall Perason Chisquare:[0m
p-value: 1.1978691437080447e-15; Cramer's V: 0.12143036533791564
[1mPost-hoc pairwise comparisons:[0m
[1.0698036662289223e-15, 1.3315975824241285e-09, 1.0512689067747762e-05, 0.000638505584757772, 3.718244115053478e-06, 0.10479294728876147]
[1m[p-value][0m
[[0.000000e+00 6.418822e-15 3.994793e-09 1.576903e-05]
 [0.000000e+00 0.000000e+00 7.662067e-04 7.436488e-06]
 [0.000000e+00 0.000000e+00 0.000000e+00 1.047929e-01]
 [0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00]]
[1m[different significance levels][0m
[[0. 4. 4. 4.]
 [0. 0. 4. 4.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]


In [22]:
observed_values = np.array([fix_proportion_constrained(fixes, pr_nums, 0, 3, mode='absolute')[2:4],
                            fix_proportion_constrained(fixes, pr_nums, 2, 5, mode='absolute')[2:4],
                            fix_proportion_constrained(fixes, pr_nums, 4, 11, mode='absolute')[2:4],
                            fix_proportion_constrained(fixes, pr_nums, 10, 68, mode='absolute')[2:4]])
print(observed_values)
# perform Pearson's chi-squared test
stat, p, dof, expected = stats.chi2_contingency(observed_values)
# retrieve the number of samples in the table
n = observed_values.sum()
# compute the Cramer's V; since the minimum between number of columns/rows - 1 in table is equal to 1
# the formula is square root of (test value, i.e., chi^2, divided by umber of samples in the table)
cramer_v = ((stat/n)/1)**0.5
# create a dataframe
print("\033[1m" + 'Overall Perason Chisquare:' + "\033[0m")
print('p-value: {}; Cramer\'s V: {}'.format(p, cramer_v))
print("\033[1m" + 'Post-hoc pairwise comparisons:' + "\033[0m")
post_hoc_p = np.zeros((4, 4))
post_hoc_sig = np.zeros((4, 4))
p_values = []
combinations = list(itertools.combinations(range(0,4), 2))
for i, j in combinations:
    _, p, _, _ = stats.chi2_contingency(observed_values[[j,i], :])
    p_values.append(p)
pd.DataFrame({'p': p_values}).to_csv('data.csv', index=False)
command = f'Rscript adjust_p.R {DIR_RQ} data.csv'
os.system(command)
os.remove('data.csv')
p_adjusted = []
with open("adjusted_p.txt") as f:
    for line in f:
        p_adjusted_str = line.split()
        p_adjusted = [float(x) for x in p_adjusted_str]
os.remove('adjusted_p.txt')
for index, combo in enumerate(combinations):
    if p_adjusted[index] < 0.001:
        post_hoc_sig[combo[1],combo[0]] = 4
    elif p_adjusted[index] < 0.01:
        post_hoc_sig[combo[1],combo[0]] = 3
    elif p_adjusted[index] < 0.05:
        post_hoc_sig[combo[1],combo[0]] = 2
    else:
        post_hoc_p[combo[1],combo[0]] = 1
    post_hoc_p[combo[1],combo[0]] = p_adjusted[index]
print("\033[1m" + '[p-value]' + "\033[0m")
print(post_hoc_p)
print("\033[1m" + '[different significance levels]' + "\033[0m")
print(post_hoc_sig)

[[336 300]
 [489 308]
 [968 542]
 [869 353]]
[1mOverall Perason Chisquare:[0m
p-value: 9.890043486117468e-14; Cramer's V: 0.12359308525266517
[1mPost-hoc pairwise comparisons:[0m
[1m[p-value][0m
[[0.000000e+00 0.000000e+00 0.000000e+00 0.000000e+00]
 [1.705587e-03 0.000000e+00 0.000000e+00 0.000000e+00]
 [3.958292e-06 2.087154e-01 0.000000e+00 0.000000e+00]
 [4.320853e-14 1.245332e-05 1.853202e-04 0.000000e+00]]
[1m[different significance levels][0m
[[0. 0. 0. 0.]
 [3. 0. 0. 0.]
 [4. 0. 0. 0.]
 [4. 4. 4. 0.]]


## Developer Motivation

In [23]:
# all possible reasons for ignoring dependabot pr
column_d1 = dict(sorted(Counter(reasons['D1'].to_list()).items(),
                       key=lambda item: item[1]))
total_column_d1 = 0
case_name = []
case_abs_value = []
case_rel_value = []
for key in column_d1:
    total_column_d1 += column_d1[key]
for key in column_d1:
    case_name.append(key)
    case_abs_value.append(column_d1[key])
    case_rel_value.append(str(round(column_d1[key]/total_column_d1*100,2))+'%')

column_d1 = pd.DataFrame({
    'Absolute': case_abs_value,
    'Relative': case_rel_value},
    index = case_name
)
print("\033[1m" + 'Meta Reasons' + "\033[0m")
print(column_d1)
print("\n")

column_d = dict(sorted(Counter(reasons['D'].to_list()).items(),
                       key=lambda item: item[1]))

total_column_d = 0
case_name = []
case_abs_value = []
case_rel_value = []
for key in column_d:
    total_column_d += column_d[key]
for key in column_d:
    case_name.append(key)
    case_abs_value.append(column_d[key])
    case_rel_value.append(str(round(column_d[key]/total_column_d*100,2))+'%')

column_d = pd.DataFrame({
    'Absolute': case_abs_value,
    'Relative': case_rel_value},
    index = case_name
)
print("\033[1m" + 'Precise Reasons' + "\033[0m")
print(column_d)
print("\n")

[1mMeta Reasons[0m
                               Absolute Relative
misc                                  5    2.35%
bot dissatisfaction                  20    9.39%
bot limitations                      22   10.33%
dependency usage                     39   18.31%
compatibility challenges             59    27.7%
project management challenges        68   31.92%


[1mPrecise Reasons[0m
                       Absolute Relative
advanced                      1    0.47%
trial run                     1    0.47%
postpone                      1    0.47%
no tests                      1    0.47%
tolerable severity            1    0.47%
ignore minor                  1    0.47%
grouping                      1    0.47%
confusion                     3    1.41%
higher version                3    1.41%
anti-dependabot               4    1.88%
wrong branch                  4    1.88%
spam                          5    2.35%
cla                           6    2.82%
bot extra changes             7    3