In [45]:
import sys
sys.path.append('..')
import sql
import pandas as pd
from functools import reduce
from tabulate import tabulate
import datetime
import matplotlib.pyplot as plt

In [46]:
# Query used to extract the random manual sample. This cell only stores the SQL
# text in `q`; nothing is executed here.
#
# Per ecosystem, advisories are ranked in random order (`order by rand()`),
# separately for advisories that have an advisoryCVE entry (rand_cves) and for
# those that do not (rand_noncves); the first 25 of each group are kept.
# Only advisories that appear in fixing_releases, were published in 2018 or
# later, and are not in the 'cocoapods' ecosystem are candidates.
#
# NOTE(review): the leading `-- insert into manual_sample` is commented out in
# the SQL, so presumably this was run once to populate manual_sample -- confirm.
# NOTE(review): rand() is called without a seed argument, so re-running the
# query is not expected to reproduce the same sample.
q = ''' -- insert into manual_sample
    with rand_cves as (
    select ecosystem, a.id as advisory_id, p.id as package_id, a.type,
           row_number() over (partition by ecosystem order by rand()) as rand_sample
    from snykvuln.advisory a
    join package p on a.package_id = p.id
    where a.id in (select advisory_id from fixing_releases)
    and year(publish_date) >= 2018
            and ecosystem != 'cocoapods'
        and a.id in (select distinct advisory_id from advisoryCVE)
    ),
    rand_noncves as (
    select ecosystem, a.id as advisory_id, p.id as package_id, a.type,
           row_number() over (partition by ecosystem order by rand()) as rand_sample
    from snykvuln.advisory a
    join package p on a.package_id = p.id
    where a.id in (select advisory_id from fixing_releases)
    and year(publish_date) >= 2018
            and ecosystem != 'cocoapods'
        and a.id not in (select distinct advisory_id from advisoryCVE)
    )
        select * from
    (select ecosystem, advisory_id, package_id, type
    from rand_cves
    where rand_sample <=25
    union
    select ecosystem, advisory_id, package_id, type
    from rand_noncves
    where rand_sample <=25) as sub; '''


In [47]:
# Load the manual sample joined with its fixing releases and CWEs. Each row is
# tagged 'CVE' or 'non-CVE' depending on whether the advisory has an advisoryCVE
# entry; releases still marked 'manual checkup needed' are excluded.
q = '''select *,
       case
            when exists(select * from advisoryCVE aC where aC.advisory_id = ms.advisory_id)
            then 'CVE'
            else 'non-CVE'
        end as if_cve
from manual_sample ms
join fixing_releases fr on ms.advisory_id = fr.advisory_id
join advisoryCWE a on fr.advisory_id = a.advisory_id
where version != 'manual checkup needed';'''
manualsample_sql = pd.DataFrame(sql.execute(q))
df = manualsample_sql  # same object: `df` is the working name used below
total_types = df['type'].nunique()

# Overall counts that make up the 'Total' row of the summary table.
n_advisories = df['advisory_id'].nunique()
n_noncve_advisories = df.loc[df['if_cve'] == 'non-CVE', 'advisory_id'].nunique()
n_packages = df['package_id'].nunique()
n_distinct_releases = len(df.groupby(['package_id', 'version']))
n_releases = len(df.groupby(['advisory_id', 'package_id', 'version']))
n_cwes = df['cwe'].nunique()

# Layout: [label, 'advisories (non-CVE)', packages,
#          'distinct releases (releases)', releases, CWEs]
total = [
    'Total',
    '{} ({})'.format(n_advisories, n_noncve_advisories),
    n_packages,
    '{} ({})'.format(n_distinct_releases, n_releases),
    n_releases,
    n_cwes,
]
total

['Total', '350 (175)', 285, '465 (499)', 499, 68]

In [48]:
# Per-ecosystem overview of the manual sample.
#
# Read from `manualsample_sql` (the raw query result) instead of `df`: this cell
# rebinds `df` to the summary table, so when it read its input from `df` it
# could not be executed a second time without re-running the query cell.
sample = manualsample_sql

# Distinct advisories / packages / non-CVE advisories per ecosystem.
advisory = sample.groupby('ecosystem')[['advisory_id']].nunique()
package = sample.groupby('ecosystem')[['package_id']].nunique()
noncve = (sample.loc[sample['if_cve']=='non-CVE']).groupby('ecosystem')[['advisory_id']].nunique()

# Number of distinct (package, version) pairs per ecosystem.
releases = sample.groupby(['ecosystem','package_id','version']).size().groupby(level=0).size()
releases = releases.reset_index()

# Number of distinct (advisory, package, version) triples per ecosystem.
total_rel = sample.groupby(['ecosystem','advisory_id','package_id','version']).size().groupby(level=0).size()
total_rel = total_rel.reset_index()

# Inner-merge everything on 'ecosystem'. The merged columns are then renamed
# positionally, so the order of the list below must match the names assigned.
df=reduce(lambda x,y : pd.merge(x,y,on='ecosystem'),[advisory,package,noncve,releases, total_rel])
df.columns = ['ecosystem','advisory', 'package', 'non-cve','distinct releases', 'releases']

# Present counts as 'advisories (non-CVE)' and 'distinct releases (releases)'.
df['advisory'] = df['advisory'].map(str) + ' (' + df['non-cve'].map(str) + ')'
df['distinct releases'] = df['distinct releases'].map(str) + ' (' + df['releases'].map(str) + ')'
df = df.drop(['non-cve'], axis =1)
df

Unnamed: 0,ecosystem,advisory,package,distinct releases,releases
0,Composer,50 (25),30,72 (98),98
1,Go,50 (25),49,65 (66),66
2,Maven,50 (25),49,76 (76),76
3,NuGet,50 (25),28,56 (60),60
4,RubyGems,50 (25),40,65 (65),65
5,npm,50 (25),44,70 (70),70
6,pip,50 (25),45,61 (64),64


In [49]:
# Distinct CWEs per ecosystem for the manually sampled advisories.
#
# The select list now uses the alias exactly as declared (`aC`). MySQL treats
# table aliases as case-sensitive on case-sensitive file systems, so the
# previous `ac.cwe` only resolved on servers configured case-insensitively.
#
# NOTE(review): unlike the query that feeds the 'Total' row, this one is not
# joined to fixing_releases and has no `version != 'manual checkup needed'`
# filter -- confirm both are meant to cover the same advisories.
q = '''select p.ecosystem, aC.cwe as cwe
from manual_sample ms
join advisoryCWE aC on ms.advisory_id = aC.advisory_id
join advisory a on aC.advisory_id = a.id
join package p on a.package_id = p.id;'''
cwe = pd.DataFrame(sql.execute(q))
total_cwes = cwe['cwe'].nunique()
# From here on `cwe` is the per-ecosystem count (indexed by ecosystem).
cwe = cwe.groupby('ecosystem')[['cwe']].nunique()

In [50]:
# Attach the per-ecosystem CWE counts to the overview table (inner join on ecosystem).
df = df.merge(cwe, on='ecosystem')
df

Unnamed: 0,ecosystem,advisory,package,distinct releases,releases,cwe
0,Composer,50 (25),30,72 (98),98,18
1,Go,50 (25),49,65 (66),66,25
2,Maven,50 (25),49,76 (76),76,25
3,NuGet,50 (25),28,56 (60),60,14
4,RubyGems,50 (25),40,65 (65),65,20
5,npm,50 (25),44,70 (70),70,18
6,pip,50 (25),45,61 (64),64,31


In [51]:
# Add the overall 'Total' row directly below the per-ecosystem rows.
next_row_label = df.shape[0]
df.loc[next_row_label] = total
df

Unnamed: 0,ecosystem,advisory,package,distinct releases,releases,cwe
0,Composer,50 (25),30,72 (98),98,18
1,Go,50 (25),49,65 (66),66,25
2,Maven,50 (25),49,76 (76),76,25
3,NuGet,50 (25),28,56 (60),60,14
4,RubyGems,50 (25),40,65 (65),65,20
5,npm,50 (25),44,70 (70),70,18
6,pip,50 (25),45,61 (64),64,31
7,Total,350 (175),285,465 (499),499,68


In [52]:
# Emit the overview table as a LaTeX tabular (column names as header, no index column).
overview_latex = tabulate(df, headers='keys', tablefmt='latex', showindex=False)
print(overview_latex)

\begin{tabular}{llrlrr}
\hline
 ecosystem   & advisory   &   package & distinct releases   &   releases &   cwe \\
\hline
 Composer    & 50 (25)    &        30 & 72 (98)             &         98 &    18 \\
 Go          & 50 (25)    &        49 & 65 (66)             &         66 &    25 \\
 Maven       & 50 (25)    &        49 & 76 (76)             &         76 &    25 \\
 NuGet       & 50 (25)    &        28 & 56 (60)             &         60 &    14 \\
 RubyGems    & 50 (25)    &        40 & 65 (65)             &         65 &    20 \\
 npm         & 50 (25)    &        44 & 70 (70)             &         70 &    18 \\
 pip         & 50 (25)    &        45 & 61 (64)             &         64 &    31 \\
 Total       & 350 (175)  &       285 & 465 (499)           &        499 &    68 \\
\hline
\end{tabular}


In [53]:
# Short aliases for the verbose question-style headers of the manual coding sheet.
column_aliases = {
    'Is there a documentation source?': 'is_doc',
    'is the fix mentioned?': 'is_fix',
    'is there unrelated change mentioned?': 'is_uc',
    'is breaking change mentioned?': 'is_br',
    'main source': 'doc_source',
    'How is the fix mentioned? (final codes); separated by semicolon': 'fix_pattern',
    'how the breaking change is mentioned? (final codes); separated by semicolon': 'breaking_pattern',
}

# Load the coding sheet and apply the aliases; the cell displays its row count.
csv = pd.read_csv('process-frozen-aniqa.csv').rename(columns=column_aliases)
len(csv)

499

In [54]:
# Sanity check: the coding sheet must contain one row per sampled release.
# The 'Total' row is looked up by label rather than by position (row 7, col 4),
# so the check keeps working if rows or columns are added or reordered.
sampled_releases = df.loc[df['ecosystem'] == 'Total', 'releases'].iloc[0]
assert sampled_releases == len(csv), (
    'coding sheet has {} rows but the sample contains {} releases'.format(len(csv), sampled_releases)
)

In [55]:
# Releases with a documentation source.
# Every answer other than 'Y' is replaced by NaN so that .count() counts only
# the 'Y' rows. The masking is applied to `csv['is_doc']` itself.
csv['is_doc'] = csv['is_doc'].mask(csv['is_doc'].ne('Y'))
temp_count = csv['is_doc'].count()
temp_rate = round(temp_count/total[4]*100,1)  # total[4] is the overall number of releases
temp = str(temp_count) + ' (' + str(temp_rate) + '%)'
# Slot 6 of `total` holds the documentation summary. Slice assignment appends
# when the slot does not exist yet and overwrites it otherwise, so re-running
# this cell no longer adds a duplicate entry to `total`.
total[6:7] = [temp]
# Per-ecosystem number of releases with a documentation source.
doc = csv.groupby('ecosystem')[['is_doc']].count()
doc

(           is_doc
 ecosystem        
 Composer       57
 Go             50
 Maven          44
 NuGet          52
 RubyGems       43
 npm            38
 pip            53,)

In [56]:
# Releases whose documentation mentions the fix.
# Every answer other than 'Y' is replaced by NaN so that .count() counts only
# the 'Y' rows. The masking is applied to `csv['is_fix']` itself.
csv['is_fix'] = csv['is_fix'].mask(csv['is_fix'].ne('Y'))
temp_count = csv['is_fix'].count()
temp_rate = round(temp_count/total[4]*100,1)  # total[4] is the overall number of releases
temp = str(temp_count) + ' (' + str(temp_rate) + '%)'
# Slot 7 of `total` holds the fix summary. Slice assignment appends when the
# slot does not exist yet and overwrites it otherwise, so re-running this cell
# no longer adds a duplicate entry to `total`.
total[7:8] = [temp]
# Per-ecosystem number of releases that mention the fix.
fix = csv.groupby('ecosystem')[['is_fix']].count()
fix

Unnamed: 0_level_0,is_fix
ecosystem,Unnamed: 1_level_1
Composer,60
Go,43
Maven,37
NuGet,38
RubyGems,40
npm,31
pip,49


In [57]:
# Releases whose documentation mentions unrelated changes.
# Every answer other than 'Y' is replaced by NaN so that .count() counts only
# the 'Y' rows. The masking is applied to `csv['is_uc']` itself.
csv['is_uc'] = csv['is_uc'].mask(csv['is_uc'].ne('Y'))
temp_count = csv['is_uc'].count()
temp_rate = round(temp_count/total[4]*100,1)  # total[4] is the overall number of releases
temp = str(temp_count) + ' (' + str(temp_rate) + '%)'
# Slot 8 of `total` holds the unrelated-change summary. Slice assignment appends
# when the slot does not exist yet and overwrites it otherwise, so re-running
# this cell no longer adds a duplicate entry to `total`.
total[8:9] = [temp]
# Per-ecosystem number of releases that mention unrelated changes.
uc = csv.groupby('ecosystem')[['is_uc']].count()
uc


Unnamed: 0_level_0,is_uc
ecosystem,Unnamed: 1_level_1
Composer,58
Go,37
Maven,33
NuGet,44
RubyGems,23
npm,21
pip,43


In [58]:
# Releases whose documentation mentions a breaking change.
# Every answer other than 'Y' is replaced by NaN so that .count() counts only
# the 'Y' rows. The masking is applied to `csv['is_br']` itself.
csv['is_br'] = csv['is_br'].mask(csv['is_br'].ne('Y'))
temp_count = csv['is_br'].count()
temp_rate = round(temp_count/total[4]*100,1)  # total[4] is the overall number of releases
temp = str(temp_count) + ' (' + str(temp_rate) + '%)'
# Slot 9 of `total` holds the breaking-change summary. Slice assignment appends
# when the slot does not exist yet and overwrites it otherwise, so re-running
# this cell no longer adds a duplicate entry to `total`.
total[9:10] = [temp]
# Per-ecosystem number of releases that mention a breaking change.
br = csv.groupby('ecosystem')[['is_br']].count()
br

Unnamed: 0_level_0,is_br
ecosystem,Unnamed: 1_level_1
Composer,2
Go,7
Maven,2
NuGet,0
RubyGems,1
npm,7
pip,9


In [59]:
# Attach the four per-ecosystem documentation counts to the overview table,
# one inner join on 'ecosystem' at a time. Rows of `df` whose ecosystem has no
# match in the count tables do not survive the inner joins.
for ecosystem_counts in (doc, fix, uc, br):
    df = df.merge(ecosystem_counts, on='ecosystem')
df

Unnamed: 0,ecosystem,advisory,package,distinct releases,releases,cwe,is_doc,is_fix,is_uc,is_br
0,Composer,50 (25),30,72 (98),98,18,57,60,58,2
1,Go,50 (25),49,65 (66),66,25,50,43,37,7
2,Maven,50 (25),49,76 (76),76,25,44,37,33,2
3,NuGet,50 (25),28,56 (60),60,14,52,38,44,0
4,RubyGems,50 (25),40,65 (65),65,20,43,40,23,1
5,npm,50 (25),44,70 (70),70,18,38,31,21,7
6,pip,50 (25),45,61 (64),64,31,53,49,43,9


In [60]:
def _count_with_rate(counts, denominators):
    """Render each count as '<count> (<pct>%)'.

    pct is count / denominator * 100, rounded to one decimal place.
    """
    rates = round(counts / denominators * 100, 1)
    return counts.astype(str) + ' (' + rates.astype(str) + '%)'


# Replace each raw count with 'count (share of the ecosystem's releases)'.
for flag_column in ['is_doc', 'is_fix', 'is_uc', 'is_br']:
    df[flag_column] = _count_with_rate(df[flag_column], df['releases'])
df

Unnamed: 0,ecosystem,advisory,package,distinct releases,releases,cwe,is_doc,is_fix,is_uc,is_br
0,Composer,50 (25),30,72 (98),98,18,57 (58.2%),60 (61.2%),58 (59.2%),2 (2.0%)
1,Go,50 (25),49,65 (66),66,25,50 (75.8%),43 (65.2%),37 (56.1%),7 (10.6%)
2,Maven,50 (25),49,76 (76),76,25,44 (57.9%),37 (48.7%),33 (43.4%),2 (2.6%)
3,NuGet,50 (25),28,56 (60),60,14,52 (86.7%),38 (63.3%),44 (73.3%),0 (0.0%)
4,RubyGems,50 (25),40,65 (65),65,20,43 (66.2%),40 (61.5%),23 (35.4%),1 (1.5%)
5,npm,50 (25),44,70 (70),70,18,38 (54.3%),31 (44.3%),21 (30.0%),7 (10.0%)
6,pip,50 (25),45,61 (64),64,31,53 (82.8%),49 (76.6%),43 (67.2%),9 (14.1%)


In [61]:
# Display the 'Total' row, which by now also carries the four
# 'count (percent%)' summaries appended by the documentation cells.
total

['Total',
 '350 (175)',
 285,
 '465 (499)',
 499,
 68,
 '337 (67.5%)',
 '298 (59.7%)',
 '259 (51.9%)',
 '28 (5.6%)']

In [62]:
# Append the 'Total' row, drop the helper 'releases' column (it is already shown
# in parentheses under 'distinct releases'), and emit the table as LaTeX.
df.loc[df.shape[0]] = total
df = df.drop(columns='releases')
results_latex = tabulate(df, headers='keys', tablefmt='latex', showindex=False)
print(results_latex)

\begin{tabular}{llrlrllll}
\hline
 ecosystem   & advisory   &   package & distinct releases   &   cwe & is\_doc      & is\_fix      & is\_uc       & is\_br     \\
\hline
 Composer    & 50 (25)    &        30 & 72 (98)             &    18 & 57 (58.2\%)  & 60 (61.2\%)  & 58 (59.2\%)  & 2 (2.0\%)  \\
 Go          & 50 (25)    &        49 & 65 (66)             &    25 & 50 (75.8\%)  & 43 (65.2\%)  & 37 (56.1\%)  & 7 (10.6\%) \\
 Maven       & 50 (25)    &        49 & 76 (76)             &    25 & 44 (57.9\%)  & 37 (48.7\%)  & 33 (43.4\%)  & 2 (2.6\%)  \\
 NuGet       & 50 (25)    &        28 & 56 (60)             &    14 & 52 (86.7\%)  & 38 (63.3\%)  & 44 (73.3\%)  & 0 (0.0\%)  \\
 RubyGems    & 50 (25)    &        40 & 65 (65)             &    20 & 43 (66.2\%)  & 40 (61.5\%)  & 23 (35.4\%)  & 1 (1.5\%)  \\
 npm         & 50 (25)    &        44 & 70 (70)             &    18 & 38 (54.3\%)  & 31 (44.3\%)  & 21 (30.0\%)  & 7 (10.0\%) \\
 pip         & 50 (25)    &        45 & 61 (64)         

In [63]:
# Normalise the documentation-source labels (trim surrounding whitespace,
# lower-case) before counting how often each source occurs.
# NOTE(review): the counts show both 'homepage announcement' and a single
# 'homepage release announcement' -- possibly the same category; confirm.
csv['doc_source'] = csv['doc_source'].str.strip().str.lower()
csv['doc_source'].value_counts()

changelog                        151
n                                149
github release note              132
homepage announcement             60
mailing list                       2
homepage release announcement      1
Name: doc_source, dtype: int64

In [64]:
# Frequency of each raw `fix_pattern` cell. A cell can hold several codes
# separated by ';', so this counts code combinations, not individual codes.
csv['fix_pattern'].value_counts()

fix reference                                                                                                                                 57
fix reference; advisory reference                                                                                                             23
fix reference; vulnerability description; affected version listed                                                                             21
fix reference; security notice                                                                                                                19
fix reference; security notice; vulnerability description; advisory reference; affected version listed                                        18
fix reference; vulnerability description                                                                                                      15
fix reference; security notice; vulnerability description                                                                         

In [65]:
# Split the ';'-separated fix-pattern codes into one row per
# (ecosystem, advisory, code) and count how often each code occurs.
fp_rows = csv[['ecosystem', 'snyk_id', 'fix_pattern']].values.tolist()
final = []
for eco, aid, raw_codes in fp_rows:
    # Skip missing cells: NaN is not a str, and a literal 'nan' string is ignored too.
    if not isinstance(raw_codes, str) or raw_codes == 'nan':
        continue
    for elem in raw_codes.split(';'):
        code = elem.lower().strip()
        # Ignore empty tokens (e.g. from a trailing ';'), matching how the
        # breaking-pattern codes are split, so no '' code is ever counted.
        if code:
            final.append([eco, aid, code])
fp = pd.DataFrame(final, columns=['ecosystem','advisory_id','fix_pattern'])
fp['fix_pattern'].value_counts()

fix reference                254
vulnerability description    159
security notice              132
advisory reference           126
affected version listed       74
affected component listed     44
exploit                        9
affected configuration         1
Name: fix_pattern, dtype: int64

In [66]:
# Split the ';'-separated breaking-change codes into one row per
# (ecosystem, advisory, code) and count how often each code occurs.
bp_rows = csv[['ecosystem', 'snyk_id', 'breaking_pattern']].values.tolist()
final = [
    [eco, aid, token.lower().strip()]
    for eco, aid, raw_codes in bp_rows
    # missing cells are NaN (not a str); a literal 'nan' string is skipped too
    if isinstance(raw_codes, str) and raw_codes != 'nan'
    for token in raw_codes.split(';')
    if token.strip()  # drop empty tokens such as those left by a trailing ';'
]
bp = pd.DataFrame(final, columns=['ecosystem','advisory_id','breaking_pattern'])
bp['breaking_pattern'].value_counts()

affected api listed       19
breaking change notice    17
code change reference      9
action required            7
affected configuration     5
Name: breaking_pattern, dtype: int64