In [1]:
import sys
sys.path.append('..')
import sql
import pandas as pd
from functools import reduce
from tabulate import tabulate
import datetime
import matplotlib.pyplot as plt

In [2]:
q= '''with rand_advisory as (
    select distinct ecosystem, a.id as advisory_id,
                row_number() over (partition by ecosystem order by rand()) as rand_sort
    from fixing_releases fr
    join advisory a on fr.advisory_id = a.id
    join snykvuln.package p on a.package_id = p.id
    left join advisoryCVE aC on a.id = aC.advisory_id
    where year(publish_date) >= 2018
        and ecosystem != 'cocoapods'
)
    select ecosystem, advisory_id, a.package_id,
       case
            when exists(select * from advisoryCVE aC where aC.advisory_id = ra.advisory_id)
            then 'CVE'
            else 'non-CVE'
        end as if_cve
    from rand_advisory ra
    join snykvuln.advisory a on a.id = ra.advisory_id
    where rand_sort <= 50;'''
df = pd.DataFrame(sql.execute(q))
advisory = df.groupby('ecosystem')[['advisory_id']].count()
package = df.groupby('ecosystem')[['package_id']].nunique()
noncve = (df.loc[df['if_cve']=='non-CVE']).groupby('ecosystem')[['advisory_id']].count()
df=reduce(lambda x,y : pd.merge(x,y,on='ecosystem'),[advisory,package,noncve])
df.columns = ['advisory', 'package', 'non-cve']
df

Unnamed: 0_level_0,advisory,package,non-cve
ecosystem,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Composer,50,33,10
Go,50,40,6
Maven,50,42,6
RubyGems,50,32,7
npm,50,43,15
pip,50,34,9


In [3]:
# query used to extract random samples
''' insert into manual_sample
    with rand_cves as (
    select ecosystem, a.id as advisory_id, p.id as package_id, a.type,
           row_number() over (partition by ecosystem order by rand()) as rand_sample
    from snykvuln.advisory a
    join package p on a.package_id = p.id
    where a.id in (select advisory_id from fixing_releases)
    and year(publish_date) >= 2018
            and ecosystem != 'cocoapods'
        and a.id in (select distinct advisory_id from advisoryCVE)
    ),
    rand_noncves as (
    select ecosystem, a.id as advisory_id, p.id as package_id, a.type,
           row_number() over (partition by ecosystem order by rand()) as rand_sample
    from snykvuln.advisory a
    join package p on a.package_id = p.id
    where a.id in (select advisory_id from fixing_releases)
    and year(publish_date) >= 2018
            and ecosystem != 'cocoapods'
        and a.id not in (select distinct advisory_id from advisoryCVE)
    )
        select * from
    (select ecosystem, advisory_id, package_id, type
    from rand_cves
    where rand_sample <=25
    union
    select ecosystem, advisory_id, package_id, type
    from rand_noncves
    where rand_sample <=25) as sub; '''

" insert into manual_sample\n    with rand_cves as (\n    select ecosystem, a.id as advisory_id, p.id as package_id, a.type,\n           row_number() over (partition by ecosystem order by rand()) as rand_sample\n    from snykvuln.advisory a\n    join package p on a.package_id = p.id\n    where a.id in (select advisory_id from fixing_releases)\n    and year(publish_date) >= 2018\n            and ecosystem != 'cocoapods'\n        and a.id in (select distinct advisory_id from advisoryCVE)\n    ),\n    rand_noncves as (\n    select ecosystem, a.id as advisory_id, p.id as package_id, a.type,\n           row_number() over (partition by ecosystem order by rand()) as rand_sample\n    from snykvuln.advisory a\n    join package p on a.package_id = p.id\n    where a.id in (select advisory_id from fixing_releases)\n    and year(publish_date) >= 2018\n            and ecosystem != 'cocoapods'\n        and a.id not in (select distinct advisory_id from advisoryCVE)\n    )\n        select * from\n    (

In [4]:
q='''select *,
       case
            when exists(select * from advisoryCVE aC where aC.advisory_id = ms.advisory_id)
            then 'CVE'
            else 'non-CVE'
        end as if_cve
from manual_sample ms
join fixing_releases fr on ms.advisory_id = fr.advisory_id;'''
df = pd.DataFrame(sql.execute(q))
total_types= df['type'].nunique()
advisory = df.groupby('ecosystem')[['advisory_id']].nunique()
package = df.groupby('ecosystem')[['package_id']].nunique()
type = df.groupby('ecosystem')[['type']].nunique()
noncve = (df.loc[df['if_cve']=='non-CVE']).groupby('ecosystem')[['advisory_id']].nunique()
releases = df.groupby('ecosystem')[['advisory_id']].count()
df=reduce(lambda x,y : pd.merge(x,y,on='ecosystem'),[advisory,package,noncve,type,releases])
df.columns = ['advisory', 'package', 'non-cve','type','releases']
df

Unnamed: 0_level_0,advisory,package,non-cve,type,releases
ecosystem,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Composer,50,30,25,22,98
Go,50,49,25,25,67
Maven,50,49,25,28,76
NuGet,40,18,15,13,50
RubyGems,39,32,14,21,54
npm,50,44,25,20,70
pip,50,45,25,33,64


In [5]:
df = df.reset_index()
df =df.drop(['type'],axis=1)

In [6]:
q = '''select p.ecosystem, ac.cwe as cwe
from manual_sample ms
join advisoryCWE aC on ms.advisory_id = aC.advisory_id
join advisory a on aC.advisory_id = a.id
join package p on a.package_id = p.id;'''
cwe = pd.DataFrame(sql.execute(q))
total_cwes = cwe['cwe'].nunique()
cwe = cwe.groupby('ecosystem')[['cwe']].nunique()
cwe

Unnamed: 0_level_0,cwe
ecosystem,Unnamed: 1_level_1
Composer,18
Go,25
Maven,25
NuGet,13
RubyGems,16
npm,18
pip,31


In [7]:
df=reduce(lambda x,y : pd.merge(x,y,on='ecosystem'),[df, cwe])
df

Unnamed: 0,ecosystem,advisory,package,non-cve,releases,cwe
0,Composer,50,30,25,98,18
1,Go,50,49,25,67,25
2,Maven,50,49,25,76,25
3,NuGet,40,18,15,50,13
4,RubyGems,39,32,14,54,16
5,npm,50,44,25,70,18
6,pip,50,45,25,64,31


In [8]:
df.loc[len(df)] = ['Total', df['advisory'].sum(), df['package'].sum(), df['non-cve'].count(),  df['releases'].sum(), total_cwes]
df

Unnamed: 0,ecosystem,advisory,package,non-cve,releases,cwe
0,Composer,50,30,25,98,18
1,Go,50,49,25,67,25
2,Maven,50,49,25,76,25
3,NuGet,40,18,15,50,13
4,RubyGems,39,32,14,54,16
5,npm,50,44,25,70,18
6,pip,50,45,25,64,31
7,Total,329,267,7,479,67


In [9]:
q = '''select distinct a.id, p.ecosystem, a.severity
from manual_sample ms
join advisoryCWE aC on ms.advisory_id = aC.advisory_id
join advisory a on aC.advisory_id = a.id
join package p on a.package_id = p.id;;'''
sev = pd.DataFrame(sql.execute(q))
#TODO this code is wrong. Replace this style with categorical count
l, m, h = sev[sev['severity']=='L'],sev[sev['severity']=='M'],sev[sev['severity']=='H']
eco_l = l.groupby('ecosystem').agg(lambda x: x.ne(0).sum())
eco_l = eco_l.rename(columns={'severity':'L'})
eco_m = m.groupby('ecosystem').agg(lambda x: x.ne(0).sum())
eco_m = eco_m.rename(columns={'severity':'M'})
eco_h = h.groupby('ecosystem').agg(lambda x: x.ne(0).sum())
eco_h = eco_h.rename(columns={'severity':'H'})
eco_l, eco_m, eco_h
# sf=reduce(lambda x,y : pd.merge(x,y,on='ecosystem'),[eco_l, eco_m, eco_h])
# sf = sf.reset_index()
# l,m,h = len(l), len(m), len(h)
# sf.loc[len(df)] = ['Total',l,m,h]
# sf['T'] = sf.L + sf.M + sf.H
# sf['L'] = round(sf['L'] / sf['T'],2)
# sf['M'] = round(sf['M'] / sf['T'],2)
# sf['H'] = round(sf['H'] / sf['T'],2)
# sf['severity'] = sf[['L','M','H']].apply(tuple, axis=1)
# sf = sf.drop(['L','M','H','T'], axis=1)
# sf

(           id  L
 ecosystem       
 Go          4  4
 Maven       8  8
 NuGet       2  2
 npm         6  6
 pip         4  4,
            id   M
 ecosystem        
 Composer   29  29
 Go         28  28
 Maven      23  23
 NuGet      15  15
 RubyGems   24  24
 npm        17  17
 pip        32  32,
            id   H
 ecosystem        
 Composer   21  21
 Go         18  18
 Maven      19  19
 NuGet      23  23
 RubyGems   15  15
 npm        27  27
 pip        14  14)

In [10]:
print(tabulate(df, tablefmt='latex', headers='keys',showindex=False))

\begin{tabular}{lrrrrr}
\hline
 ecosystem   &   advisory &   package &   non-cve &   releases &   cwe \\
\hline
 Composer    &         50 &        30 &        25 &         98 &    18 \\
 Go          &         50 &        49 &        25 &         67 &    25 \\
 Maven       &         50 &        49 &        25 &         76 &    25 \\
 NuGet       &         40 &        18 &        15 &         50 &    13 \\
 RubyGems    &         39 &        32 &        14 &         54 &    16 \\
 npm         &         50 &        44 &        25 &         70 &    18 \\
 pip         &         50 &        45 &        25 &         64 &    31 \\
 Total       &        329 &       267 &         7 &        479 &    67 \\
\hline
\end{tabular}


In [11]:
csv = pd.read_csv('qual.csv')
csv = csv.rename(columns = {'Is there a documentation source?': 'is_doc','is the fix mentioned?':'is_fix', 'is there unrelated change mentioned?': 'is_uc', 'is breaking change mentioned?':'is_br' , 'main source' :'doc_source', 'How is the fix mentioned? (final codes); separated by semicolon':'fix_pattern'})
csv

Unnamed: 0,snyk_id,vulnerability type,ecosystem,package,security release,repository,link,is_doc,list of sources (separated by semicolon),doc_source,is_fix,How is the fix mentioned? (initial codes); separated by semicolon,fix_pattern,is_uc,is_br,how the breaking change is mentioned? (initial codes); separated by semicolon,how the breaking change is mentioned? (final codes); separated by semicolon,Additional Comments
0,npm:address-rfc2822:20180225,Regular Expression Denial of Service (ReDoS),npm,address-rfc2822,2.0.2,https://github.com/haraka/node-address-rfc2822,https://snyk.io/vuln/npm:address-rfc2822:20180225,Y,changes.md,changelog,Y,vulnerability type mentioned; PR referenced,vulnerability description; fix referenced,N,N,,,
1,npm:angular-jwt:20180605,Access Restriction Bypass,npm,angular-jwt,0.1.10,https://github.com/auth0/angular-jwt,https://snyk.io/vuln/npm:angular-jwt:20180605,Y,github release note; Security-notices.md,security notice,Y,release note: summary of fix; security-notices...,fix referenced; security notice; vulnerability...,N,N,,,
2,npm:braces:20180219,Regular Expression Denial of Service (ReDoS),npm,braces,2.3.1,https://github.com/micromatch/braces,https://snyk.io/vuln/npm:braces:20180219,Y,changelog.MD,changelog,Y,summary fix,fix referenced,N,N,,,
3,npm:fastify:20180107,Denial of Service (DoS),npm,fastify,0.38.0,https://github.com/fastify/fastify,https://snyk.io/vuln/npm:fastify:20180107,Y,github release note,github release note,Y,CVE mentioned;,advisory reference,Y,Y,breaking changes,breaking change notice,
4,npm:is-my-json-valid:20180214,Regular Expression Denial of Service (ReDoS),npm,is-my-json-valid,1.4.1,https://github.com/mafintosh/is-my-json-valid,https://snyk.io/vuln/npm:is-my-json-valid:2018...,N,N,N,,,,,,,,"the package that does not have a release note,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
474,SNYK-RUBY-SPREEBACKEND-20476,Cross-site Scripting (XSS),RubyGems,spree_backend,1.2.1,https://github.com/spree/spree,,,,,,,,,,,,
475,SNYK-RUBY-SPREEFRONTEND-20477,Json Hijacking,RubyGems,spree_frontend,3.0.7,https://github.com/spree/spree,,,,,,,,,,,,
476,SNYK-RUBY-SQLITE3RUBY-536098,Access Restriction Bypass,RubyGems,sqlite3-ruby,1.2.4,https://github.com/luislavena/sqlite3-ruby,,,,,,,,,,,,
477,SNYK-RUBY-USERAGENTPARSER-559792,Regular Expression Denial of Service (ReDoS),RubyGems,user_agent_parser,2.6.0,https://github.com/ua-parser/uap-core,,,,,,,,,,,,


In [12]:
doc = csv
doc['is_doc'] = doc['is_doc'].mask(doc['is_doc'].ne('Y'))
doc = doc.groupby('ecosystem')[['is_doc']].count()
doc

Unnamed: 0_level_0,is_doc
ecosystem,Unnamed: 1_level_1
Composer,0
Go,0
Maven,0
NuGet,0
RubyGems,0
npm,43
pip,0


In [13]:
fix = csv
fix['is_fix'] = fix['is_fix'].mask(fix['is_fix'].ne('Y'))
fix = fix.groupby('ecosystem')[['is_fix']].count()
fix

Unnamed: 0_level_0,is_fix
ecosystem,Unnamed: 1_level_1
Composer,0
Go,0
Maven,0
NuGet,0
RubyGems,0
npm,31
pip,0


In [14]:
uc = csv
uc['is_uc'] = uc['is_uc'].mask(uc['is_uc'].ne('Y'))
uc = uc.groupby('ecosystem')[['is_uc']].count()
uc


Unnamed: 0_level_0,is_uc
ecosystem,Unnamed: 1_level_1
Composer,0
Go,0
Maven,0
NuGet,0
RubyGems,0
npm,21
pip,0


In [15]:
br = csv
br['is_br'] = br['is_br'].mask(br['is_br'].ne('Y'))
br = br.groupby('ecosystem')[['is_br']].count()
br

Unnamed: 0_level_0,is_br
ecosystem,Unnamed: 1_level_1
Composer,0
Go,0
Maven,0
NuGet,0
RubyGems,0
npm,8
pip,0


In [16]:
df=reduce(lambda x,y : pd.merge(x,y,on='ecosystem'),[df, doc, fix, uc, br])
df

Unnamed: 0,ecosystem,advisory,package,non-cve,releases,cwe,is_doc,is_fix,is_uc,is_br
0,Composer,50,30,25,98,18,0,0,0,0
1,Go,50,49,25,67,25,0,0,0,0
2,Maven,50,49,25,76,25,0,0,0,0
3,NuGet,40,18,15,50,13,0,0,0,0
4,RubyGems,39,32,14,54,16,0,0,0,0
5,npm,50,44,25,70,18,43,31,21,8
6,pip,50,45,25,64,31,0,0,0,0


In [17]:
df['is_doc_rate'] = round(df['is_doc']/df['releases']* 100,1) 
df['is_doc'] = df['is_doc'].astype(str)
df['is_doc_rate'] = df['is_doc_rate'].astype(str)
df['is_doc'] = df['is_doc'] + ' (' + df['is_doc_rate'] +'%)'
df = df.drop('is_doc_rate', axis=1)
df['is_fix_rate'] = round(df['is_fix']/df['releases'] * 100 ,1)
df['is_fix'] = df['is_fix'].astype(str)
df['is_fix_rate'] = df['is_fix_rate'].astype(str)
df['is_fix'] = df['is_fix'] + ' (' + df['is_fix_rate'] +'%)'
df = df.drop('is_fix_rate', axis=1)
df['is_uc_rate'] = round(df['is_uc']/df['releases'] * 100,1)
df['is_uc'] = df['is_uc'].astype(str)
df['is_uc_rate'] = df['is_uc_rate'].astype(str)
df['is_uc'] = df['is_uc'] + ' (' + df['is_uc_rate'] +'%)'
df = df.drop('is_uc_rate', axis=1)
df['is_br_rate'] = round(df['is_br']/df['releases']* 100,1)
df['is_br'] = df['is_br'].astype(str)
df['is_br_rate'] = df['is_br_rate'].astype(str)
df['is_br'] = df['is_br'] + ' (' + df['is_br_rate'] +'%)'
df = df.drop('is_br_rate', axis=1)
df

Unnamed: 0,ecosystem,advisory,package,non-cve,releases,cwe,is_doc,is_fix,is_uc,is_br
0,Composer,50,30,25,98,18,0 (0.0%),0 (0.0%),0 (0.0%),0 (0.0%)
1,Go,50,49,25,67,25,0 (0.0%),0 (0.0%),0 (0.0%),0 (0.0%)
2,Maven,50,49,25,76,25,0 (0.0%),0 (0.0%),0 (0.0%),0 (0.0%)
3,NuGet,40,18,15,50,13,0 (0.0%),0 (0.0%),0 (0.0%),0 (0.0%)
4,RubyGems,39,32,14,54,16,0 (0.0%),0 (0.0%),0 (0.0%),0 (0.0%)
5,npm,50,44,25,70,18,43 (61.4%),31 (44.3%),21 (30.0%),8 (11.4%)
6,pip,50,45,25,64,31,0 (0.0%),0 (0.0%),0 (0.0%),0 (0.0%)


In [18]:
print(tabulate(df, tablefmt='latex', headers='keys',showindex=False))

\begin{tabular}{lrrrrrllll}
\hline
 ecosystem   &   advisory &   package &   non-cve &   releases &   cwe & is\_doc     & is\_fix     & is\_uc      & is\_br     \\
\hline
 Composer    &         50 &        30 &        25 &         98 &    18 & 0 (0.0\%)   & 0 (0.0\%)   & 0 (0.0\%)   & 0 (0.0\%)  \\
 Go          &         50 &        49 &        25 &         67 &    25 & 0 (0.0\%)   & 0 (0.0\%)   & 0 (0.0\%)   & 0 (0.0\%)  \\
 Maven       &         50 &        49 &        25 &         76 &    25 & 0 (0.0\%)   & 0 (0.0\%)   & 0 (0.0\%)   & 0 (0.0\%)  \\
 NuGet       &         40 &        18 &        15 &         50 &    13 & 0 (0.0\%)   & 0 (0.0\%)   & 0 (0.0\%)   & 0 (0.0\%)  \\
 RubyGems    &         39 &        32 &        14 &         54 &    16 & 0 (0.0\%)   & 0 (0.0\%)   & 0 (0.0\%)   & 0 (0.0\%)  \\
 npm         &         50 &        44 &        25 &         70 &    18 & 43 (61.4\%) & 31 (44.3\%) & 21 (30.0\%) & 8 (11.4\%) \\
 pip         &         50 &        45 &        25 &    

In [19]:
csv['doc_source'].value_counts()

N                               27
github release note             27
changelog                       11
readme                           2
security notice                  1
mailing list                     1
package homepage releasenote     1
Name: doc_source, dtype: int64

In [20]:
csv['fix_pattern'].value_counts()

fix referenced                                                                           5
Fix referenced; vulnerability description                                                4
Security notice; advisory referenced; fix referenced                                     4
security notice; vulnerability description; fix referenced                               3
vulnerability description; fix referenced                                                3
security notice; fix referenced; vulnerability description; affected component listed    2
Security notice; advisory referenced; vulnerability description; Fix referenced          2
advisory referenced; vulnerability description; fix referenced                           1
advisory reference                                                                       1
affected component listed                                                                1
security notice; advisory referenced; vulnerability description                          1

In [21]:
fp  = csv[['ecosystem', 'snyk_id','fix_pattern']]
fp = fp.values.tolist()
final = []
for row in fp:
    eco = row[0]
    aid = row[1]
    if row[-1] != 'nan' and isinstance(row[-1],str):
        l = row[-1].split(';')
        for elem in l:
            if elem.strip() == 'fix reference':
                print(row[1])
            final.append([eco,aid,elem.lower().strip()])
df = pd.DataFrame(final, columns=['ecosystem','advisory_id','fix_pattern'])
df['fix_pattern'].value_counts()

fix referenced               32
security notice              22
vulnerability description    21
advisory referenced           9
affected component listed     9
advisory reference            4
Name: fix_pattern, dtype: int64