In [107]:
import sys
sys.path.append('..')
import sql
import pandas as pd
from functools import reduce
from tabulate import tabulate
import datetime
import matplotlib.pyplot as plt

In [108]:
q= '''with rand_advisory as (
    select distinct ecosystem, a.id as advisory_id,
                row_number() over (partition by ecosystem order by rand()) as rand_sort
    from fixing_releases fr
    join advisory a on fr.advisory_id = a.id
    join snykvuln.package p on a.package_id = p.id
    left join advisoryCVE aC on a.id = aC.advisory_id
    where year(publish_date) >= 2018
        and ecosystem != 'cocoapods'
)
    select ecosystem, advisory_id, a.package_id,
       case
            when exists(select * from advisoryCVE aC where aC.advisory_id = ra.advisory_id)
            then 'CVE'
            else 'non-CVE'
        end as if_cve
    from rand_advisory ra
    join snykvuln.advisory a on a.id = ra.advisory_id
    where rand_sort <= 50;'''
df = pd.DataFrame(sql.execute(q))
advisory = df.groupby('ecosystem')[['advisory_id']].count()
package = df.groupby('ecosystem')[['package_id']].nunique()
noncve = (df.loc[df['if_cve']=='non-CVE']).groupby('ecosystem')[['advisory_id']].count()
df=reduce(lambda x,y : pd.merge(x,y,on='ecosystem'),[advisory,package,noncve])
df.columns = ['advisory', 'package', 'non-cve']
df

Unnamed: 0_level_0,advisory,package,non-cve
ecosystem,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Composer,50,27,14
Go,50,43,7
Maven,50,40,3
NuGet,50,18,1
RubyGems,50,30,7
npm,50,44,21
pip,50,27,6


In [109]:
# query used to extract random samples
''' insert into manual_sample
    with rand_cves as (
    select ecosystem, a.id as advisory_id, p.id as package_id, a.type,
           row_number() over (partition by ecosystem order by rand()) as rand_sample
    from snykvuln.advisory a
    join package p on a.package_id = p.id
    where a.id in (select advisory_id from fixing_releases)
    and year(publish_date) >= 2018
            and ecosystem != 'cocoapods'
        and a.id in (select distinct advisory_id from advisoryCVE)
    ),
    rand_noncves as (
    select ecosystem, a.id as advisory_id, p.id as package_id, a.type,
           row_number() over (partition by ecosystem order by rand()) as rand_sample
    from snykvuln.advisory a
    join package p on a.package_id = p.id
    where a.id in (select advisory_id from fixing_releases)
    and year(publish_date) >= 2018
            and ecosystem != 'cocoapods'
        and a.id not in (select distinct advisory_id from advisoryCVE)
    )
        select * from
    (select ecosystem, advisory_id, package_id, type
    from rand_cves
    where rand_sample <=25
    union
    select ecosystem, advisory_id, package_id, type
    from rand_noncves
    where rand_sample <=25) as sub; '''

" insert into manual_sample\n    with rand_cves as (\n    select ecosystem, a.id as advisory_id, p.id as package_id, a.type,\n           row_number() over (partition by ecosystem order by rand()) as rand_sample\n    from snykvuln.advisory a\n    join package p on a.package_id = p.id\n    where a.id in (select advisory_id from fixing_releases)\n    and year(publish_date) >= 2018\n            and ecosystem != 'cocoapods'\n        and a.id in (select distinct advisory_id from advisoryCVE)\n    ),\n    rand_noncves as (\n    select ecosystem, a.id as advisory_id, p.id as package_id, a.type,\n           row_number() over (partition by ecosystem order by rand()) as rand_sample\n    from snykvuln.advisory a\n    join package p on a.package_id = p.id\n    where a.id in (select advisory_id from fixing_releases)\n    and year(publish_date) >= 2018\n            and ecosystem != 'cocoapods'\n        and a.id not in (select distinct advisory_id from advisoryCVE)\n    )\n        select * from\n    (

In [110]:
q='''select *,
       case
            when exists(select * from advisoryCVE aC where aC.advisory_id = ms.advisory_id)
            then 'CVE'
            else 'non-CVE'
        end as if_cve
from manual_sample ms
join fixing_releases fr on ms.advisory_id = fr.advisory_id;'''
df = pd.DataFrame(sql.execute(q))
total_types= df['type'].nunique()
advisory = df.groupby('ecosystem')[['advisory_id']].nunique()
package = df.groupby('ecosystem')[['package_id']].nunique()
type = df.groupby('ecosystem')[['type']].nunique()
noncve = (df.loc[df['if_cve']=='non-CVE']).groupby('ecosystem')[['advisory_id']].nunique()
releases = df.groupby('ecosystem')[['advisory_id']].count()
df=reduce(lambda x,y : pd.merge(x,y,on='ecosystem'),[advisory,package,noncve,type,releases])
df.columns = ['advisory', 'package', 'non-cve','type','releases']
df

Unnamed: 0_level_0,advisory,package,non-cve,type,releases
ecosystem,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Composer,50,30,25,22,98
Go,50,49,25,25,67
Maven,50,49,25,28,76
NuGet,40,18,15,13,50
RubyGems,39,32,14,21,54
npm,50,44,25,20,70
pip,50,45,25,33,64


In [111]:
df = df.reset_index()
df =df.drop(['type'],axis=1)

In [112]:
q = '''select p.ecosystem, ac.cwe as cwe
from manual_sample ms
join advisoryCWE aC on ms.advisory_id = aC.advisory_id
join advisory a on aC.advisory_id = a.id
join package p on a.package_id = p.id;'''
cwe = pd.DataFrame(sql.execute(q))
total_cwes = cwe['cwe'].nunique()
cwe = cwe.groupby('ecosystem')[['cwe']].nunique()
cwe

Unnamed: 0_level_0,cwe
ecosystem,Unnamed: 1_level_1
Composer,18
Go,25
Maven,25
NuGet,13
RubyGems,16
npm,18
pip,31


In [113]:
df=reduce(lambda x,y : pd.merge(x,y,on='ecosystem'),[df, cwe])
df

Unnamed: 0,ecosystem,advisory,package,non-cve,releases,cwe
0,Composer,50,30,25,98,18
1,Go,50,49,25,67,25
2,Maven,50,49,25,76,25
3,NuGet,40,18,15,50,13
4,RubyGems,39,32,14,54,16
5,npm,50,44,25,70,18
6,pip,50,45,25,64,31


In [114]:
df.loc[len(df)] = ['Total', df['advisory'].sum(), df['package'].sum(), df['non-cve'].count(),  df['releases'].sum(), total_cwes]
df

Unnamed: 0,ecosystem,advisory,package,non-cve,releases,cwe
0,Composer,50,30,25,98,18
1,Go,50,49,25,67,25
2,Maven,50,49,25,76,25
3,NuGet,40,18,15,50,13
4,RubyGems,39,32,14,54,16
5,npm,50,44,25,70,18
6,pip,50,45,25,64,31
7,Total,329,267,7,479,67


In [115]:
q = '''select distinct a.id, p.ecosystem, a.severity
from manual_sample ms
join advisoryCWE aC on ms.advisory_id = aC.advisory_id
join advisory a on aC.advisory_id = a.id
join package p on a.package_id = p.id;;'''
sev = pd.DataFrame(sql.execute(q))
#this code is wrong. Replace this style with categorical count
l, m, h = sev[sev['severity']=='L'],sev[sev['severity']=='M'],sev[sev['severity']=='H']
eco_l = l.groupby('ecosystem').agg(lambda x: x.ne(0).sum())
eco_l = eco_l.rename(columns={'severity':'L'})
eco_m = m.groupby('ecosystem').agg(lambda x: x.ne(0).sum())
eco_m = eco_m.rename(columns={'severity':'M'})
eco_h = h.groupby('ecosystem').agg(lambda x: x.ne(0).sum())
eco_h = eco_h.rename(columns={'severity':'H'})
eco_l, eco_m, eco_h
# sf=reduce(lambda x,y : pd.merge(x,y,on='ecosystem'),[eco_l, eco_m, eco_h])
# sf = sf.reset_index()
# l,m,h = len(l), len(m), len(h)
# sf.loc[len(df)] = ['Total',l,m,h]
# sf['T'] = sf.L + sf.M + sf.H
# sf['L'] = round(sf['L'] / sf['T'],2)
# sf['M'] = round(sf['M'] / sf['T'],2)
# sf['H'] = round(sf['H'] / sf['T'],2)
# sf['severity'] = sf[['L','M','H']].apply(tuple, axis=1)
# sf = sf.drop(['L','M','H','T'], axis=1)
# sf

(           id  L
 ecosystem       
 Go          4  4
 Maven       8  8
 NuGet       2  2
 npm         6  6
 pip         4  4,
            id   M
 ecosystem        
 Composer   29  29
 Go         28  28
 Maven      23  23
 NuGet      15  15
 RubyGems   24  24
 npm        17  17
 pip        32  32,
            id   H
 ecosystem        
 Composer   21  21
 Go         18  18
 Maven      19  19
 NuGet      23  23
 RubyGems   15  15
 npm        27  27
 pip        14  14)

In [116]:
print(tabulate(df, tablefmt='latex', headers='keys',showindex=False))

\begin{tabular}{lrrrrr}
\hline
 ecosystem   &   advisory &   package &   non-cve &   releases &   cwe \\
\hline
 Composer    &         50 &        30 &        25 &         98 &    18 \\
 Go          &         50 &        49 &        25 &         67 &    25 \\
 Maven       &         50 &        49 &        25 &         76 &    25 \\
 NuGet       &         40 &        18 &        15 &         50 &    13 \\
 RubyGems    &         39 &        32 &        14 &         54 &    16 \\
 npm         &         50 &        44 &        25 &         70 &    18 \\
 pip         &         50 &        45 &        25 &         64 &    31 \\
 Total       &        329 &       267 &         7 &        479 &    67 \\
\hline
\end{tabular}
