In [455]:
import sys
sys.path.append('..')
import sql
import pandas as pd
from functools import reduce
from tabulate import tabulate
import datetime
import matplotlib.pyplot as plt
import numpy as np


In [456]:
q = '''select p.ecosystem, p.name as package, pu.* from package_usage pu
join package p on pu.package_id = p.id'''
usage = pd.DataFrame(sql.execute(q))

In [457]:
# query used to extract random samples
q = ''' -- insert into manual_sample
    with rand_cves as (
    select ecosystem, a.id as advisory_id, p.id as package_id, a.type,
           row_number() over (partition by ecosystem order by rand()) as rand_sample
    from snykvuln.advisory a
    join package p on a.package_id = p.id
    where a.id in (select advisory_id from fixing_releases)
    and year(publish_date) >= 2018
            and ecosystem != 'cocoapods'
        and a.id in (select distinct advisory_id from advisoryCVE)
    ),
    rand_noncves as (
    select ecosystem, a.id as advisory_id, p.id as package_id, a.type,
           row_number() over (partition by ecosystem order by rand()) as rand_sample
    from snykvuln.advisory a
    join package p on a.package_id = p.id
    where a.id in (select advisory_id from fixing_releases)
    and year(publish_date) >= 2018
            and ecosystem != 'cocoapods'
        and a.id not in (select distinct advisory_id from advisoryCVE)
    )
        select * from
    (select ecosystem, advisory_id, package_id, type
    from rand_cves
    where rand_sample <=25
    union
    select ecosystem, advisory_id, package_id, type
    from rand_noncves
    where rand_sample <=25) as sub; '''


In [458]:
q='''select *,
       case
            when exists(select * from advisoryCVE aC where aC.advisory_id = ms.advisory_id)
            then 'CVE'
            else 'non-CVE'
        end as if_cve
from manual_sample ms
join fixing_releases fr on ms.advisory_id = fr.advisory_id
join advisoryCWE ac on fr.advisory_id = ac.advisory_id
where version != 'manual checkup needed';'''
df = pd.DataFrame(sql.execute(q))
manualsample_sql = df
total_types= df['type'].nunique()
total = ['Total', 
            df['advisory_id'].nunique(),
            #str(df['advisory_id'].nunique()) +' (' +str(df[df['if_cve']=='non-CVE']['advisory_id'].nunique()) + ')', 
            df['package_id'].nunique(), 
            str(len(df.groupby(['package_id','version']))) +' (' + str(len(df.groupby(['advisory_id','package_id','version']))) +')', 
            len(df.groupby(['advisory_id','package_id','version'])), df['cwe'].nunique()]
total

['Total', 350, 285, '465 (499)', 499, 68]

In [459]:
advisory = df.groupby('ecosystem')[['advisory_id']].nunique()
package = df.groupby('ecosystem')[['package_id']].nunique()
noncve = (df.loc[df['if_cve']=='non-CVE']).groupby('ecosystem')[['advisory_id']].nunique()
releases = df.groupby(['ecosystem','package_id','version']).size().groupby(level=0).size()
releases = releases.reset_index()
total_rel = df.groupby(['ecosystem','advisory_id','package_id','version']).size().groupby(level=0).size()
total_rel = total_rel.reset_index()
df=reduce(lambda x,y : pd.merge(x,y,on='ecosystem'),[advisory,package,noncve,releases, total_rel])
df.columns = ['ecosystem','advisory', 'package', 'non-cve','distinct releases', 'releases']
# df['advisory'] = df['advisory'].map(str) + ' (' + df['non-cve'].map(str) + ')'
df['distinct releases'] = df['distinct releases'].map(str) + ' (' + df['releases'].map(str) + ')'
df = df.drop(['non-cve'], axis =1)
df

Unnamed: 0,ecosystem,advisory,package,distinct releases,releases
0,Composer,50,30,72 (98),98
1,Go,50,49,65 (66),66
2,Maven,50,49,76 (76),76
3,NuGet,50,28,56 (60),60
4,RubyGems,50,40,65 (65),65
5,npm,50,44,70 (70),70
6,pip,50,45,61 (64),64


In [460]:
q = '''select p.ecosystem, ac.cwe as cwe
from manual_sample ms
join advisoryCWE aC on ms.advisory_id = aC.advisory_id
join advisory a on aC.advisory_id = a.id
join package p on a.package_id = p.id;'''
cwe = pd.DataFrame(sql.execute(q))
total_cwes = cwe['cwe'].nunique()
cwe = cwe.groupby('ecosystem')[['cwe']].nunique()

In [461]:
df=reduce(lambda x,y : pd.merge(x,y,on='ecosystem'),[df, cwe])
df

Unnamed: 0,ecosystem,advisory,package,distinct releases,releases,cwe
0,Composer,50,30,72 (98),98,18
1,Go,50,49,65 (66),66,25
2,Maven,50,49,76 (76),76,25
3,NuGet,50,28,56 (60),60,14
4,RubyGems,50,40,65 (65),65,20
5,npm,50,44,70 (70),70,18
6,pip,50,45,61 (64),64,31


In [462]:
df.loc[len(df)] = total
df

Unnamed: 0,ecosystem,advisory,package,distinct releases,releases,cwe
0,Composer,50,30,72 (98),98,18
1,Go,50,49,65 (66),66,25
2,Maven,50,49,76 (76),76,25
3,NuGet,50,28,56 (60),60,14
4,RubyGems,50,40,65 (65),65,20
5,npm,50,44,70 (70),70,18
6,pip,50,45,61 (64),64,31
7,Total,350,285,465 (499),499,68


In [463]:
print(tabulate(df, tablefmt='latex', headers='keys',showindex=False))

\begin{tabular}{lrrlrr}
\hline
 ecosystem   &   advisory &   package & distinct releases   &   releases &   cwe \\
\hline
 Composer    &         50 &        30 & 72 (98)             &         98 &    18 \\
 Go          &         50 &        49 & 65 (66)             &         66 &    25 \\
 Maven       &         50 &        49 & 76 (76)             &         76 &    25 \\
 NuGet       &         50 &        28 & 56 (60)             &         60 &    14 \\
 RubyGems    &         50 &        40 & 65 (65)             &         65 &    20 \\
 npm         &         50 &        44 & 70 (70)             &         70 &    18 \\
 pip         &         50 &        45 & 61 (64)             &         64 &    31 \\
 Total       &        350 &       285 & 465 (499)           &        499 &    68 \\
\hline
\end{tabular}


In [464]:
csv = pd.read_csv('final-frozen.csv')
# csv = csv.rename(columns = {'Is there a documentation source?': 'is_doc','is the fix mentioned?':'is_fix', 'is there unrelated change mentioned?': 'is_uc', 'is breaking change mentioned?':'is_br' , 'main source' :'source', 'How is the fix mentioned? (final codes); separated by semicolon':'fix_pattern',
# 'how the breaking change is mentioned? (final codes); separated by semicolon':'breaking_pattern'})

len(csv)

499

In [465]:
temp = csv[csv['is_doc']=='y']
temp = temp[temp['is_fix']=='n']
temp = temp[temp['is_uc'] == 'n']
temp

Unnamed: 0,snyk_id,vulnerability type,ecosystem,package,security release,repository_url,possible release note,possible changelogs,link,is_doc,...,is_uc,is_br,how the breaking change is mentioned? (initial codes); separated by semicolon,breaking_pattern,Additional Comments,row,agreed,external_codes,Unnamed: 23,Unnamed: 24
6,npm:jsviews:20160320,Template Injection,npm,jsviews,0.9.74,https://github.com/borismoore/jsviews,https://github.com/BorisMoore/jsviews/releases...,no changlog found in script,https://snyk.io/vuln/npm:jsviews:20160320,y,...,n,n,,,release commit include the fix,8,yes,,,
123,SNYK-GOLANG-K8SIOKUBERNETESPKGKUBELETSERVER-56...,Denial of Service (DoS),Go,k8s.io/kubernetes/pkg/kubelet/server,1.17.2,https://github.com/kubernetes/kubernetes,https://github.com/kubernetes/kubernetes/relea...,https://github.com/kubernetes/kubernetes/blob/...,https://snyk.io/vuln/SNYK-GOLANG-K8SIOKUBERNET...,y,...,n,n,,,wtf? no notable changes,126,resolved,f,,
144,SNYK-JAVA-ORGAPACHEJACKRABBIT-460325,Cross-site Scripting (XSS),Maven,org.apache.jackrabbit:jackrabbit-jcr-commons,2.10.2,https://github.com/apache/jackrabbit,not found through script,https://github.com/apache/jackrabbit/blob/mast...,https://snyk.io/vuln/SNYK-JAVA-ORGAPACHEJACKRA...,y,...,n,n,,,,147,resolved,f,,
145,SNYK-JAVA-ORGAPACHEJACKRABBIT-460325,Cross-site Scripting (XSS),Maven,org.apache.jackrabbit:jackrabbit-jcr-commons,2.12.1,https://github.com/apache/jackrabbit,not found through script,https://github.com/apache/jackrabbit/blob/mast...,https://snyk.io/vuln/SNYK-JAVA-ORGAPACHEJACKRA...,y,...,n,n,,,,148,resolved,f,,
148,SNYK-JAVA-ORGAPACHEJACKRABBIT-460325,Cross-site Scripting (XSS),Maven,org.apache.jackrabbit:jackrabbit-jcr-commons,2.8.2,https://github.com/apache/jackrabbit,not found through script,https://github.com/apache/jackrabbit/blob/mast...,https://snyk.io/vuln/SNYK-JAVA-ORGAPACHEJACKRA...,y,...,n,n,,,,151,resolved,f,,
196,SNYK-JS-ACTIONSCORE-1015402,Improper Input Validation,npm,@actions/core,1.2.6,https://github.com/actions/toolkit/tree/master...,not found through script,https://github.com/actions/toolkit/blob/master...,https://snyk.io/vuln/SNYK-JS-ACTIONSCORE-1015402,y,...,n,n,,,discussed offline? sneaky?,204,yes,,,
329,SNYK-PYTHON-APACHESUPERSET-1014658,Information Exposure,pip,apache-superset,0.37.2,https://github.com/apache/superset,https://github.com/apache/superset/releases/ta...,https://github.com/apache/superset/blob/master...,https://snyk.io/vuln/SNYK-PYTHON-APACHESUPERSE...,y,...,n,n,,,,365,yes,y,,
428,SNYK-RUBY-SPREEBACKEND-20476,Cross-site Scripting (XSS),RubyGems,spree_backend,1.1.3,https://github.com/spree/spree,not found through script,https://github.com/spree/spree/blob/master/CHA...,https://snyk.io/vuln/SNYK-RUBY-SPREEBACKEND-20476,y,...,n,n,,,why no cve,474,yes,y,,


In [466]:
assert df.iloc[7,4] == len(csv)

In [467]:
q='''select id as snyk_id,
       case
        when id in (select advisory_id from advisoryCVE) then 'cve'
        else 'noncve'
end as ifcve
from advisory;'''
ifcve = pd.DataFrame(sql.execute(q))
ifcve

Unnamed: 0,snyk_id,ifcve
0,SNYK-PYTHON-SALT-1080588,cve
1,SNYK-PYTHON-SALT-1080589,cve
2,SNYK-PYTHON-SALT-1080590,cve
3,SNYK-PYTHON-SALT-1080591,cve
4,SNYK-PYTHON-SALT-1080592,cve
...,...,...
6951,SNYK-JS-GETIPRANGE-1073612,cve
6952,SNYK-JAVA-COMVAADIN-1074927,noncve
6953,SNYK-JAVA-ORGAPACHEXMLGRAPHICS-1074910,cve
6954,SNYK-JS-PROGFAYSCRAPBOXPARSER-1076803,cve


In [468]:
cvecsv = pd.merge(csv, ifcve, on ='snyk_id')
cvecsv

Unnamed: 0,snyk_id,vulnerability type,ecosystem,package,security release,repository_url,possible release note,possible changelogs,link,is_doc,...,is_br,how the breaking change is mentioned? (initial codes); separated by semicolon,breaking_pattern,Additional Comments,row,agreed,external_codes,Unnamed: 23,Unnamed: 24,ifcve
0,npm:address-rfc2822:20180225,Regular Expression Denial of Service (ReDoS),npm,address-rfc2822,2.0.2,https://github.com/haraka/node-address-rfc2822,not found through script,https://github.com/haraka/node-address-rfc2822...,https://snyk.io/vuln/npm:address-rfc2822:20180225,y,...,n,,,,2,yes,,,,noncve
1,npm:angular-jwt:20180605,Access Restriction Bypass,npm,angular-jwt,0.1.10,https://github.com/auth0/angular-jwt,https://github.com/auth0/angular-jwt/releases/...,https://github.com/auth0/angular-jwt/blob/mast...,https://snyk.io/vuln/npm:angular-jwt:20180605,y,...,n,,,,3,resolved,f,,,cve
2,npm:braces:20180219,Regular Expression Denial of Service (ReDoS),npm,braces,2.3.1,https://github.com/micromatch/braces,not found through script,https://github.com/micromatch/braces/blob/mast...,https://snyk.io/vuln/npm:braces:20180219,y,...,n,,,,4,yes,,,,noncve
3,npm:fastify:20180107,Denial of Service (DoS),npm,fastify,0.38.0,https://github.com/fastify/fastify,https://github.com/fastify/fastify/releases/ta...,no changlog found in script,https://snyk.io/vuln/npm:fastify:20180107,y,...,y,breaking change notice; pr reference,breaking change notice; code change reference,,5,yes,,,,cve
4,npm:is-my-json-valid:20180214,Regular Expression Denial of Service (ReDoS),npm,is-my-json-valid,1.4.1,https://github.com/mafintosh/is-my-json-valid,not found through script,no changlog found in script,https://snyk.io/vuln/npm:is-my-json-valid:2018...,n,...,n,,,"the package that does not have a release note,...",6,yes,,,,noncve
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494,SNYK-RUBY-LODASHRAILS-559448,Prototype Pollution,RubyGems,lodash-rails,4.17.11,https://github.com/lodash/lodash,not found through script,https://github.com/lodash/lodash/blob/master/C...,https://snyk.io/vuln/SNYK-RUBY-LODASHRAILS-559448,y,...,n,,,changelog in wiki,438,resolved,d,Nasif could not find the changelog?,,cve
495,SNYK-RUBY-REDPARQUET-483028,Use of Uninitialized Variable,RubyGems,red-parquet,0.15.1,https://github.com/apache/arrow,not found through script,https://github.com/apache/arrow/blob/master/CH...,https://snyk.io/vuln/SNYK-RUBY-REDPARQUET-483028,y,...,n,,,automatically genrated?,455,resolved,d,The issue number is listed,,cve
496,SNYK-DOTNET-FAVICOJS-60151,Cross-site Scripting (XSS),NuGet,favico.js,0.3.10,https://github.com/ejci/favico.js,not found through script,no changlog found in script,https://snyk.io/vuln/SNYK-DOTNET-FAVICOJS-60151,y,...,n,,,,482,yes,y,is_doc is empty in process-frozen-nasif,,noncve
497,SNYK-DOTNET-KNOCKOUT-60180,Cross-site Scripting (XSS),NuGet,knockout,3.0.0,https://github.com/knockout/knockout,https://github.com/knockout/knockout/releases/...,no changlog found in script,https://snyk.io/vuln/SNYK-DOTNET-KNOCKOUT-60180,y,...,y,warning; action required,breaking change notice;action required,,483,yes,y,is_doc is empty in process-frozen-nasif,,noncve


In [469]:
doc = csv
doc['is_doc'] = doc['is_doc'].mask(doc['is_doc'].ne('y'))
temp_count = doc['is_doc'].count()
temp_rate = round(temp_count/total[4]*100,1)
temp = str(temp_count) + ' (' + str(temp_rate) + '%)'
total.append(temp)
doc = doc.groupby('ecosystem')[['is_doc']].count()
doc, 

(           is_doc
 ecosystem        
 Composer       63
 Go             54
 Maven          48
 NuGet          49
 RubyGems       49
 npm            43
 pip            57,)

In [470]:
fix = csv
fix['is_fix'] = fix['is_fix'].mask(fix['is_fix'].ne('y'))
temp_count = fix['is_fix'].count()
temp_rate = round(temp_count/total[4]*100,1)
temp = str(temp_count) + ' (' + str(temp_rate) + '%)'
total.append(temp)
fix = fix.groupby('ecosystem')[['is_fix']].count()
fix

Unnamed: 0_level_0,is_fix
ecosystem,Unnamed: 1_level_1
Composer,58
Go,43
Maven,31
NuGet,40
RubyGems,43
npm,39
pip,53


In [471]:
uc = csv
uc['is_uc'] = uc['is_uc'].mask(uc['is_uc'].ne('y'))
temp_count = uc['is_uc'].count()
temp_rate = round(temp_count/total[4]*100,1)
temp = str(temp_count) + ' (' + str(temp_rate) + '%)'
total.append(temp)
uc = uc.groupby('ecosystem')[['is_uc']].count()
uc


Unnamed: 0_level_0,is_uc
ecosystem,Unnamed: 1_level_1
Composer,51
Go,43
Maven,32
NuGet,46
RubyGems,24
npm,24
pip,41


In [472]:
br = csv
br['is_br'] = br['is_br'].mask(br['is_br'].ne('y'))
temp_count =br['is_br'].count()
temp_rate = round(temp_count/total[4]*100,1)
temp = str(temp_count) + ' (' + str(temp_rate) + '%)'
total.append(temp)
br = br.groupby('ecosystem')[['is_br']].count()
br

Unnamed: 0_level_0,is_br
ecosystem,Unnamed: 1_level_1
Composer,2
Go,10
Maven,2
NuGet,1
RubyGems,1
npm,8
pip,8


In [473]:
df=reduce(lambda x,y : pd.merge(x,y,on='ecosystem'),[df, doc, fix, uc, br])
df

Unnamed: 0,ecosystem,advisory,package,distinct releases,releases,cwe,is_doc,is_fix,is_uc,is_br
0,Composer,50,30,72 (98),98,18,63,58,51,2
1,Go,50,49,65 (66),66,25,54,43,43,10
2,Maven,50,49,76 (76),76,25,48,31,32,2
3,NuGet,50,28,56 (60),60,14,49,40,46,1
4,RubyGems,50,40,65 (65),65,20,49,43,24,1
5,npm,50,44,70 (70),70,18,43,39,24,8
6,pip,50,45,61 (64),64,31,57,53,41,8


In [474]:
df['is_doc_rate'] = round(df['is_doc']/df['releases']* 100,1) 
df['is_doc'] = df['is_doc'].astype(str)
df['is_doc_rate'] = df['is_doc_rate'].astype(str)
df['is_doc'] = df['is_doc'] + ' (' + df['is_doc_rate'] +'%)'
df = df.drop('is_doc_rate', axis=1)
df['is_fix_rate'] = round(df['is_fix']/df['releases'] * 100 ,1)
df['is_fix'] = df['is_fix'].astype(str)
df['is_fix_rate'] = df['is_fix_rate'].astype(str)
df['is_fix'] = df['is_fix'] + ' (' + df['is_fix_rate'] +'%)'
df = df.drop('is_fix_rate', axis=1)
df['is_uc_rate'] = round(df['is_uc']/df['releases'] * 100,1)
df['is_uc'] = df['is_uc'].astype(str)
df['is_uc_rate'] = df['is_uc_rate'].astype(str)
df['is_uc'] = df['is_uc'] + ' (' + df['is_uc_rate'] +'%)'
df = df.drop('is_uc_rate', axis=1)
df['is_br_rate'] = round(df['is_br']/df['releases']* 100,1)
df['is_br'] = df['is_br'].astype(str)
df['is_br_rate'] = df['is_br_rate'].astype(str)
df['is_br'] = df['is_br'] + ' (' + df['is_br_rate'] +'%)'
df = df.drop('is_br_rate', axis=1)
df

Unnamed: 0,ecosystem,advisory,package,distinct releases,releases,cwe,is_doc,is_fix,is_uc,is_br
0,Composer,50,30,72 (98),98,18,63 (64.3%),58 (59.2%),51 (52.0%),2 (2.0%)
1,Go,50,49,65 (66),66,25,54 (81.8%),43 (65.2%),43 (65.2%),10 (15.2%)
2,Maven,50,49,76 (76),76,25,48 (63.2%),31 (40.8%),32 (42.1%),2 (2.6%)
3,NuGet,50,28,56 (60),60,14,49 (81.7%),40 (66.7%),46 (76.7%),1 (1.7%)
4,RubyGems,50,40,65 (65),65,20,49 (75.4%),43 (66.2%),24 (36.9%),1 (1.5%)
5,npm,50,44,70 (70),70,18,43 (61.4%),39 (55.7%),24 (34.3%),8 (11.4%)
6,pip,50,45,61 (64),64,31,57 (89.1%),53 (82.8%),41 (64.1%),8 (12.5%)


In [475]:
total

['Total',
 350,
 285,
 '465 (499)',
 499,
 68,
 '363 (72.7%)',
 '307 (61.5%)',
 '261 (52.3%)',
 '32 (6.4%)']

In [476]:
df.loc[len(df)]=total
df['temp'] = df['ecosystem'].str.lower()
df = df.sort_values(by = 'temp', ascending=True)
df = df.drop(['releases','temp'], axis= 1)
print(tabulate(df, tablefmt='latex', headers='keys',showindex=False))

\begin{tabular}{lrrlrllll}
\hline
 ecosystem   &   advisory &   package & distinct releases   &   cwe & is\_doc      & is\_fix      & is\_uc       & is\_br      \\
\hline
 Composer    &         50 &        30 & 72 (98)             &    18 & 63 (64.3\%)  & 58 (59.2\%)  & 51 (52.0\%)  & 2 (2.0\%)   \\
 Go          &         50 &        49 & 65 (66)             &    25 & 54 (81.8\%)  & 43 (65.2\%)  & 43 (65.2\%)  & 10 (15.2\%) \\
 Maven       &         50 &        49 & 76 (76)             &    25 & 48 (63.2\%)  & 31 (40.8\%)  & 32 (42.1\%)  & 2 (2.6\%)   \\
 npm         &         50 &        44 & 70 (70)             &    18 & 43 (61.4\%)  & 39 (55.7\%)  & 24 (34.3\%)  & 8 (11.4\%)  \\
 NuGet       &         50 &        28 & 56 (60)             &    14 & 49 (81.7\%)  & 40 (66.7\%)  & 46 (76.7\%)  & 1 (1.7\%)   \\
 pip         &         50 &        45 & 61 (64)             &    31 & 57 (89.1\%)  & 53 (82.8\%)  & 41 (64.1\%)  & 8 (12.5\%)  \\
 RubyGems    &         50 &        40 & 65 (65)  

In [477]:
csv['source'] = csv['source'].str.strip()
csv['source'] = csv['source'].str.lower()
t =  csv[csv['is_doc']=='y']
t['source'].value_counts()

changelog                201
github release note      116
homepage announcement     45
security notice            1
Name: source, dtype: int64

In [478]:
t = t[t['source']=='github release note']
len(t[t['possible release note']== 'not found through script'])
# t = csv[csv['source']=='n']
# t

23

In [479]:
csv['fix_pattern'].value_counts()

fix reference                                                                             33
security notice;advisory reference;fix reference                                          17
security notice;vulnerability description                                                 13
security notice;vulnerability description;advisory reference;affected component listed    13
Fix reference                                                                             12
                                                                                          ..
affected component listed                                                                  1
vulnerability description;fix reference; security notice                                   1
security notice;vulnerability description;exploit                                          1
security notice;vulnerability description;advisory reference;exploit                       1
vulnerability description; advisory reference                         

In [480]:
fp  = csv[['ecosystem', 'snyk_id','fix_pattern']]
fp = fp.values.tolist()
final = []
for row in fp:
    eco = row[0]
    aid = row[1]
    if row[-1] != 'nan' and isinstance(row[-1],str):
        l = row[-1].split(';')
        for elem in l:
            if elem.lower().strip() == 'n':
                print(row)
            final.append([eco,aid,elem.lower().strip()])
fp = pd.DataFrame(final, columns=['ecosystem','advisory_id','fix_pattern'])
fp['fix_pattern'].value_counts()

security notice              198
fix reference                181
vulnerability description    150
advisory reference           121
affected component listed     79
affected version listed       33
exploit                       26
affected configuration         3
Name: fix_pattern, dtype: int64

In [481]:
fp['temp'] = fp['ecosystem'].str.lower()
fp = fp.sort_values(by = 'temp', ascending=True)
fp.groupby(['fix_pattern', 'temp']).size()

fix_pattern                temp    
advisory reference         composer    14
                           go          12
                           maven       12
                           npm         13
                           nuget       27
                           pip         25
                           rubygems    18
affected component listed  composer    10
                           go           9
                           maven        2
                           npm          9
                           nuget        8
                           pip         24
                           rubygems    17
affected configuration     go           1
                           pip          2
affected version listed    composer    17
                           go           3
                           maven        5
                           npm          3
                           nuget        4
                           pip          1
exploit                    composer    1

In [482]:
bp  = csv[['ecosystem', 'snyk_id','breaking_pattern']]
bp = bp.values.tolist()
final = []
for row in bp:
    eco = row[0]
    aid = row[1]
    if row[-1] != 'nan' and isinstance(row[-1],str):
        l = row[-1].split(';')
        for elem in l:
            if elem.strip():
                final.append([eco,aid,elem.lower().strip()])
bp = pd.DataFrame(final, columns=['ecosystem','advisory_id','breaking_pattern'])
bp['breaking_pattern'].unique()

array(['breaking change notice', 'code change reference',
       'affected api listed', 'action required', 'affected configuration'],
      dtype=object)

In [483]:
temp = csv[csv['is_fix']=='y']
temp = temp.to_dict('records')
y = n = 0
for row in temp:
    s = row['fix_pattern']
    s = str(s).split(';')
    for i,e in enumerate(s):
        s[i] =  e.lower().strip()
    if len(s) == 1 and s[0] =='fix reference':
        y+=1
    else:
        n+=1
         
y, n, len(temp)

(44, 263, 307)

In [484]:
bp['temp'] = bp['ecosystem'].str.lower()
bp = bp.sort_values(by = 'temp', ascending=True)
bp.groupby(['breaking_pattern', 'temp']).size()

breaking_pattern        temp    
action required         go          2
                        maven       1
                        npm         1
                        nuget       1
                        pip         4
affected api listed     composer    2
                        go          6
                        maven       1
                        npm         6
                        pip         5
affected configuration  go          3
                        npm         1
                        pip         2
                        rubygems    1
breaking change notice  composer    2
                        go          9
                        maven       2
                        npm         5
                        nuget       1
                        pip         3
code change reference   composer    1
                        go          5
                        maven       1
                        npm         2
dtype: int64

In [485]:
cvecsv['is_fix'] = cvecsv['is_fix'].fillna(0)
cvecsv[cvecsv['is_fix'].isnull()]

Unnamed: 0,snyk_id,vulnerability type,ecosystem,package,security release,repository_url,possible release note,possible changelogs,link,is_doc,...,is_br,how the breaking change is mentioned? (initial codes); separated by semicolon,breaking_pattern,Additional Comments,row,agreed,external_codes,Unnamed: 23,Unnamed: 24,ifcve


In [486]:
cvecsv = cvecsv[cvecsv['is_doc']=='y']
cvecsv['is_fix'] = cvecsv['is_fix'].fillna(0)
cvecsv['is_fix'] = cvecsv['is_fix'].map({'y':1,'n':0})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cvecsv['is_fix'] = cvecsv['is_fix'].fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cvecsv['is_fix'] = cvecsv['is_fix'].map({'y':1,'n':0})


In [487]:
usage = pd.merge(csv, usage, on = ['ecosystem', 'package'])
fy = usage[usage['is_fix'] == 'y']
fn = usage[usage['is_fix'] != 'y']
from scipy.stats import mannwhitneyu
mannwhitneyu(fy['dependent'], fn['dependent']), fy['dependent'].median(), fn['dependent'].median()

(MannwhitneyuResult(statistic=25253.0, pvalue=0.973865463469377), 31.0, 21.0)

In [488]:
mannwhitneyu(fy['dependent_repos'], fn['dependent_repos']), fy['dependent_repos'].median(), fn['dependent_repos'].median()

(MannwhitneyuResult(statistic=25124.5, pvalue=0.9529115486264055), 97.0, 141.0)

In [489]:
cvecsv[cvecsv['is_fix'].isnull()]

Unnamed: 0,snyk_id,vulnerability type,ecosystem,package,security release,repository_url,possible release note,possible changelogs,link,is_doc,...,is_br,how the breaking change is mentioned? (initial codes); separated by semicolon,breaking_pattern,Additional Comments,row,agreed,external_codes,Unnamed: 23,Unnamed: 24,ifcve


In [490]:
cve = cvecsv[cvecsv['ifcve']=='cve']
noncve = cvecsv[cvecsv['ifcve']=='noncve']
len(cve), len(noncve)

(210, 153)

In [491]:
list(cve['is_fix'].value_counts()), list(noncve['is_fix'].value_counts())

([184, 26], [123, 30])

In [492]:
from scipy.stats import chi2_contingency
chi2_contingency([list(cve['is_fix'].value_counts()),list(noncve['is_fix'].value_counts())])

(3.0109251640395556,
 0.08270507405423237,
 1,
 array([[177.60330579,  32.39669421],
        [129.39669421,  23.60330579]]))

In [493]:
cve = cve.to_dict('records')
noncve= noncve.to_dict('records')

In [494]:
print(len(cve))
y = n = 0
for row in cve:
    s = row['fix_pattern']
    s = str(s).split(';')
    for i,e in enumerate(s):
        s[i] =  e.lower().strip()
    if 'security notice' in s:
        y+=1
    else:
        n+=1
         
y, n

210


(128, 82)

In [495]:
print(len(noncve))
y = n = 0
for row in noncve:
    s = row['fix_pattern']
    s = str(s).split(';')
    for i,e in enumerate(s):
        s[i] =  e.lower().strip()
    if 'security notice' in s:
        y+=1
    else:
        n+=1
         
y, n

153


(70, 83)

In [496]:
chi2_contingency([[128,82],[70,83]])

(7.647233893557422,
 0.005685944009974247,
 1,
 array([[114.54545455,  95.45454545],
        [ 83.45454545,  69.54545455]]))

In [497]:
128+82+70+83

363

In [498]:
print(len(cve))
y = n = 0
for row in cve:
    s = row['fix_pattern']
    s = str(s).split(';')
    for i,e in enumerate(s):
        s[i] =  e.lower().strip()
    if 'advisory reference' in s:
        y+=1
    else:
        n+=1
         
y, n

210


(108, 102)

In [499]:
print(len(noncve))
y = n = 0
for row in noncve:
    s = row['fix_pattern']
    s = str(s).split(';')
    for i,e in enumerate(s):
        s[i] =  e.lower().strip()
    if 'advisory reference' in s:
        y+=1
    else:
        n+=1
         
y, n

153


(13, 140)

In [500]:
q = 'select id as snyk_id, severity from advisory'
sev = pd.DataFrame(sql.execute(q))
sev = pd.merge(sev,csv, on = 'snyk_id')


In [501]:
sev = sev[sev['is_doc']=='y']
sev['is_fix'] = sev['is_fix'].map({'y':1,'n':0})
sev['is_fix'] = sev['is_fix'].fillna(0)


In [502]:
sev['is_fix'].value_counts(), sev[sev['is_fix'].isnull()]

(1.0    307
 0.0     56
 Name: is_fix, dtype: int64,
 Empty DataFrame
 Columns: [snyk_id, severity, vulnerability type, ecosystem, package, security release, repository_url, possible release note, possible changelogs, link, is_doc, list of sources (separated by semicolon), source, snyk version wrong listed, is_fix, How is the fix mentioned? (initial codes); separated by semicolon, fix_pattern, is_uc, is_br, how the breaking change is mentioned? (initial codes); separated by semicolon, breaking_pattern, Additional Comments, row, agreed, external_codes, Unnamed: 23, Unnamed: 24]
 Index: []
 
 [0 rows x 27 columns])

In [503]:
h = sev[sev['severity'] == 'H']
m = sev[sev['severity'] == 'M']
l = sev[sev['severity'] == 'L']
list(h['is_fix'].value_counts()), list(m['is_fix'].value_counts()), len(h), len(m), len(l), len(sev)

([125, 23], [164, 31], 148, 195, 20, 363)

In [504]:
list(h['is_fix'].value_counts()), list(m['is_fix'].value_counts())

([125, 23], [164, 31])

In [505]:
chi2_contingency([[125, 23],[164,31]])

(0.0,
 1.0,
 1,
 array([[124.69970845,  23.30029155],
        [164.30029155,  30.69970845]]))

In [506]:
125+23+164+31

343

In [507]:
h = h.to_dict('records')
m = m.to_dict('records')

In [508]:
print(len(h))
y = n = 0
for row in h:
    s = row['fix_pattern']
    s = str(s).split(';')
    for i,e in enumerate(s):
        s[i] =  e.lower().strip()
    if 'security notice' in s:
        y+=1
    else:
        n+=1
         
y, n

148


(90, 58)

In [509]:
print(len(m))
y = n = 0
for row in m:
    s = row['fix_pattern']
    s = str(s).split(';')
    for i,e in enumerate(s):
        s[i] =  e.lower().strip()
    if 'security notice' in s:
        y+=1
    else:
        n+=1
         
y, n

195


(100, 95)

In [510]:
chi2_contingency([[90, 58],[100, 95]])

(2.7182352056152395,
 0.09920764842932461,
 1,
 array([[ 81.98250729,  66.01749271],
        [108.01749271,  86.98250729]]))

In [511]:
90+58+100+95

343

In [512]:
q = '''select a.id as snyk_id, a.severity, p.ecosystem, a.disclose_date, a.publish_date,
fr.version, ri.publish_date as release_date
from advisory a
join package p on a.package_id = p.id
join fixing_releases fr on a.id = fr.advisory_id
join release_info ri on p.id = ri.package_id and ri.version = fr.version'''
frd = pd.DataFrame(sql.execute(q))
frd = frd.rename(columns = {'version':'security release'})



In [513]:
nf = csv[(csv['is_doc'] == 'y') & (csv['is_fix'] != 'y')]
nf = pd.merge(nf, frd, on = ['snyk_id', 'security release'])
nf['rtp'] = (nf['disclose_date'] -  nf['release_date'].dt.date).dt.days

In [514]:
temp = nf[nf['rtp'] > 90]
temp = temp.sample(frac=1).reset_index(drop=True)
temp.to_csv('notmentionpublishafterninety.csv')

In [515]:
temp = csv[csv['is_doc'] == 'y']
temp = temp[temp['is_fix']=='y']
len(temp)

307

In [516]:

temp = pd.merge(temp, frd, on =['snyk_id', 'security release'])
len(temp)

297

In [517]:
temp['delay'] = (temp['disclose_date'] - temp['publish_date']).dt.days
temp = temp[temp['delay'] < -30]

In [518]:
temp = pd.merge(temp, ifcve, on ='snyk_id')

In [519]:
y = n = 0
temp = temp.to_dict('records')
for row in temp:
    s = row['fix_pattern']
    s = str(s).split(';')
    for i,e in enumerate(s):
        s[i] =  e.lower().strip()
    if 'security notice' in s and row['ifcve'] == 'noncve':
        y+=1
    else:
        n+=1
         
y, n

(33, 54)

In [520]:
y = 0
patterns = csv.to_dict('records')
for row in patterns:
    s = row['fix_pattern']
    s = str(s).split(';')
    for i,e in enumerate(s):
        s[i] =  e.lower().strip()
    if 'advisory reference' in s and 'security notice' not in s:
        y+=1
y

26

In [521]:
br = csv[csv['is_br']=='y']
br = list(br['security release'])
list(br)

['0.38.0',
 '3.0.0',
 '5.4.0',
 '1.11.2',
 '1.22.0',
 '1.18.0',
 '1.20.0-alpha.2',
 '4.2.0',
 '0.10.0-beta',
 '1.18.0',
 '1.14.0-alpha.2',
 '1.20.0-alpha.2',
 '6.0.0',
 '6.1.0-RC5',
 '7.0.0',
 '25.0.0',
 '4.1.2',
 '4.0.0-beta.1',
 '11.0.0',
 '3.9.2',
 '1.0.469',
 '4.0.1',
 '0.7',
 '2.0.0',
 '1.3.0',
 '0.9.0',
 '0.56.1',
 '1.1a1',
 '1.0a3',
 '1.8',
 '1.2.2',
 '3.0.0']

In [522]:
csv

Unnamed: 0,snyk_id,vulnerability type,ecosystem,package,security release,repository_url,possible release note,possible changelogs,link,is_doc,...,is_uc,is_br,how the breaking change is mentioned? (initial codes); separated by semicolon,breaking_pattern,Additional Comments,row,agreed,external_codes,Unnamed: 23,Unnamed: 24
0,npm:address-rfc2822:20180225,Regular Expression Denial of Service (ReDoS),npm,address-rfc2822,2.0.2,https://github.com/haraka/node-address-rfc2822,not found through script,https://github.com/haraka/node-address-rfc2822...,https://snyk.io/vuln/npm:address-rfc2822:20180225,y,...,,,,,,2,yes,,,
1,npm:angular-jwt:20180605,Access Restriction Bypass,npm,angular-jwt,0.1.10,https://github.com/auth0/angular-jwt,https://github.com/auth0/angular-jwt/releases/...,https://github.com/auth0/angular-jwt/blob/mast...,https://snyk.io/vuln/npm:angular-jwt:20180605,y,...,,,,,,3,resolved,f,,
2,npm:braces:20180219,Regular Expression Denial of Service (ReDoS),npm,braces,2.3.1,https://github.com/micromatch/braces,not found through script,https://github.com/micromatch/braces/blob/mast...,https://snyk.io/vuln/npm:braces:20180219,y,...,,,,,,4,yes,,,
3,npm:fastify:20180107,Denial of Service (DoS),npm,fastify,0.38.0,https://github.com/fastify/fastify,https://github.com/fastify/fastify/releases/ta...,no changlog found in script,https://snyk.io/vuln/npm:fastify:20180107,y,...,y,y,breaking change notice; pr reference,breaking change notice; code change reference,,5,yes,,,
4,npm:is-my-json-valid:20180214,Regular Expression Denial of Service (ReDoS),npm,is-my-json-valid,1.4.1,https://github.com/mafintosh/is-my-json-valid,not found through script,no changlog found in script,https://snyk.io/vuln/npm:is-my-json-valid:2018...,,...,,,,,"the package that does not have a release note,...",6,yes,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494,SNYK-RUBY-SECUREHEADERS-543239,Command Injection,RubyGems,secure_headers,3.8.0,https://github.com/twitter/secureheaders,not found through script,https://github.com/twitter/secureheaders/blob/...,https://snyk.io/vuln/SNYK-RUBY-SECUREHEADERS-5...,y,...,,,,,,460,resolved,d,changelog in the different branch,
495,SNYK-RUBY-SECUREHEADERS-543239,Command Injection,RubyGems,secure_headers,6.2.0,https://github.com/twitter/secureheaders,not found through script,https://github.com/twitter/secureheaders/blob/...,https://snyk.io/vuln/SNYK-RUBY-SECUREHEADERS-5...,y,...,,,,,releases in the old branch did not have a rele...,462,resolved,d,,
496,SNYK-DOTNET-FAVICOJS-60151,Cross-site Scripting (XSS),NuGet,favico.js,0.3.10,https://github.com/ejci/favico.js,not found through script,no changlog found in script,https://snyk.io/vuln/SNYK-DOTNET-FAVICOJS-60151,y,...,y,,,,,482,yes,y,is_doc is empty in process-frozen-nasif,
497,SNYK-DOTNET-KNOCKOUT-60180,Cross-site Scripting (XSS),NuGet,knockout,3.0.0,https://github.com/knockout/knockout,https://github.com/knockout/knockout/releases/...,no changlog found in script,https://snyk.io/vuln/SNYK-DOTNET-KNOCKOUT-60180,y,...,y,y,warning; action required,breaking change notice;action required,,483,yes,y,is_doc is empty in process-frozen-nasif,


In [523]:
q = '''select a.id as snyk_id, p.ecosystem, a.publish_date as snyk_date, ac.cve_date,
       ri.version, ri.publish_date as release_date
from advisory a
    left join (select advisory_id, min(C.publish_Date) as cve_date
from advisoryCVE aC
join CVE C on aC.cve = C.cve
group by advisory_id) as ac
    on ac.advisory_id=a.id
join package p on a.package_id = p.id
join fixing_releases fr on a.id = fr.advisory_id
join release_info ri on p.id = ri.package_id and fr.version=ri.version
where ri.publish_date is not null '''
df = pd.DataFrame(sql.execute(q))
df['cve_date'] = df['cve_date'].dt.date
t1 = df[~df['cve_date'].isna()].copy()
t1['advisory_date'] = t1[['cve_date', 'snyk_date']].min(axis=1)
t2 = df[df['cve_date'].isna()].copy()
t2['advisory_date'] = t2['snyk_date']
df = pd.concat([t1, t2], ignore_index=True, sort=False)
df = df.rename(columns = {'version':'security release'})
df['rtp_delay'] = (df['advisory_date'] - df['release_date'].dt.date).dt.days
df = df[df['rtp_delay'] >= 0]
len(df)

4396

In [524]:
rtp = pd.merge(csv, df, on =['snyk_id', 'security release'])
len(rtp)

362

In [525]:
nodoc = rtp[rtp['is_doc'] != 'y']
len(nodoc), nodoc['rtp_delay'].median()

(89, 21.0)

In [526]:
rtp = rtp[rtp['is_doc'] == 'y']
nofix = rtp[rtp['is_fix']!='y']
nofix['rtp_delay'].describe()

count      44.000000
mean      536.659091
std       810.669231
min         2.000000
25%        21.000000
50%       161.000000
75%       843.000000
max      3424.000000
Name: rtp_delay, dtype: float64

In [527]:
rtp = rtp[rtp['is_fix'] == 'y']
rtp['rtp_delay'].describe()

count     229.000000
mean      167.310044
std       450.383165
min         0.000000
25%         1.000000
50%         9.000000
75%        54.000000
max      2726.000000
Name: rtp_delay, dtype: float64

In [528]:
rtp= rtp.to_dict('records')
delays = []
y=0
for row in rtp:
    s = row['fix_pattern']
    s = str(s).split(';')
    for i,e in enumerate(s):
        s[i] =  e.lower().strip()
    if 'security notice' in s:
        y+=1
        delays.append(row['rtp_delay'])
    else:
        n+=1
y, len(delays)

(156, 156)

In [529]:
delays = pd.Series(delays)
delays.describe()

count     156.000000
mean      132.775641
std       363.849164
min         0.000000
25%         1.000000
50%         8.000000
75%        30.500000
max      2544.000000
dtype: float64

In [530]:
len(delays[delays> 30])

39