In [1393]:
import sys
sys.path.append('..')
import sql
import pandas as pd
from functools import reduce
from tabulate import tabulate
import datetime
import matplotlib.pyplot as plt
import numpy as np

In [1394]:
# query used to extract random samples
q = ''' -- insert into manual_sample
    with rand_cves as (
    select ecosystem, a.id as advisory_id, p.id as package_id, a.type,
           row_number() over (partition by ecosystem order by rand()) as rand_sample
    from snykvuln.advisory a
    join package p on a.package_id = p.id
    where a.id in (select advisory_id from fixing_releases)
    and year(publish_date) >= 2018
            and ecosystem != 'cocoapods'
        and a.id in (select distinct advisory_id from advisoryCVE)
    ),
    rand_noncves as (
    select ecosystem, a.id as advisory_id, p.id as package_id, a.type,
           row_number() over (partition by ecosystem order by rand()) as rand_sample
    from snykvuln.advisory a
    join package p on a.package_id = p.id
    where a.id in (select advisory_id from fixing_releases)
    and year(publish_date) >= 2018
            and ecosystem != 'cocoapods'
        and a.id not in (select distinct advisory_id from advisoryCVE)
    )
        select * from
    (select ecosystem, advisory_id, package_id, type
    from rand_cves
    where rand_sample <=25
    union
    select ecosystem, advisory_id, package_id, type
    from rand_noncves
    where rand_sample <=25) as sub; '''


In [1395]:
q='''select *,
       case
            when exists(select * from advisoryCVE aC where aC.advisory_id = ms.advisory_id)
            then 'CVE'
            else 'non-CVE'
        end as if_cve
from manual_sample ms
join fixing_releases fr on ms.advisory_id = fr.advisory_id
join advisoryCWE a on fr.advisory_id = a.advisory_id
where version != 'manual checkup needed';'''
df = pd.DataFrame(sql.execute(q))
manualsample_sql = df
total_types= df['type'].nunique()
total = ['Total', 
            df['advisory_id'].nunique(),
            #str(df['advisory_id'].nunique()) +' (' +str(df[df['if_cve']=='non-CVE']['advisory_id'].nunique()) + ')', 
            df['package_id'].nunique(), 
            str(len(df.groupby(['package_id','version']))) +' (' + str(len(df.groupby(['advisory_id','package_id','version']))) +')', 
            len(df.groupby(['advisory_id','package_id','version'])), df['cwe'].nunique()]
total

['Total', 350, 285, '465 (499)', 499, 68]

In [1396]:
advisory = df.groupby('ecosystem')[['advisory_id']].nunique()
package = df.groupby('ecosystem')[['package_id']].nunique()
noncve = (df.loc[df['if_cve']=='non-CVE']).groupby('ecosystem')[['advisory_id']].nunique()
releases = df.groupby(['ecosystem','package_id','version']).size().groupby(level=0).size()
releases = releases.reset_index()
total_rel = df.groupby(['ecosystem','advisory_id','package_id','version']).size().groupby(level=0).size()
total_rel = total_rel.reset_index()
df=reduce(lambda x,y : pd.merge(x,y,on='ecosystem'),[advisory,package,noncve,releases, total_rel])
df.columns = ['ecosystem','advisory', 'package', 'non-cve','distinct releases', 'releases']
# df['advisory'] = df['advisory'].map(str) + ' (' + df['non-cve'].map(str) + ')'
df['distinct releases'] = df['distinct releases'].map(str) + ' (' + df['releases'].map(str) + ')'
df = df.drop(['non-cve'], axis =1)
df

Unnamed: 0,ecosystem,advisory,package,distinct releases,releases
0,Composer,50,30,72 (98),98
1,Go,50,49,65 (66),66
2,Maven,50,49,76 (76),76
3,NuGet,50,28,56 (60),60
4,RubyGems,50,40,65 (65),65
5,npm,50,44,70 (70),70
6,pip,50,45,61 (64),64


In [1397]:
q = '''select p.ecosystem, ac.cwe as cwe
from manual_sample ms
join advisoryCWE aC on ms.advisory_id = aC.advisory_id
join advisory a on aC.advisory_id = a.id
join package p on a.package_id = p.id;'''
cwe = pd.DataFrame(sql.execute(q))
total_cwes = cwe['cwe'].nunique()
cwe = cwe.groupby('ecosystem')[['cwe']].nunique()

In [1398]:
df=reduce(lambda x,y : pd.merge(x,y,on='ecosystem'),[df, cwe])
df

Unnamed: 0,ecosystem,advisory,package,distinct releases,releases,cwe
0,Composer,50,30,72 (98),98,18
1,Go,50,49,65 (66),66,25
2,Maven,50,49,76 (76),76,25
3,NuGet,50,28,56 (60),60,14
4,RubyGems,50,40,65 (65),65,20
5,npm,50,44,70 (70),70,18
6,pip,50,45,61 (64),64,31


In [1399]:
df.loc[len(df)] = total
df

Unnamed: 0,ecosystem,advisory,package,distinct releases,releases,cwe
0,Composer,50,30,72 (98),98,18
1,Go,50,49,65 (66),66,25
2,Maven,50,49,76 (76),76,25
3,NuGet,50,28,56 (60),60,14
4,RubyGems,50,40,65 (65),65,20
5,npm,50,44,70 (70),70,18
6,pip,50,45,61 (64),64,31
7,Total,350,285,465 (499),499,68


In [1400]:
print(tabulate(df, tablefmt='latex', headers='keys',showindex=False))

\begin{tabular}{lrrlrr}
\hline
 ecosystem   &   advisory &   package & distinct releases   &   releases &   cwe \\
\hline
 Composer    &         50 &        30 & 72 (98)             &         98 &    18 \\
 Go          &         50 &        49 & 65 (66)             &         66 &    25 \\
 Maven       &         50 &        49 & 76 (76)             &         76 &    25 \\
 NuGet       &         50 &        28 & 56 (60)             &         60 &    14 \\
 RubyGems    &         50 &        40 & 65 (65)             &         65 &    20 \\
 npm         &         50 &        44 & 70 (70)             &         70 &    18 \\
 pip         &         50 &        45 & 61 (64)             &         64 &    31 \\
 Total       &        350 &       285 & 465 (499)           &        499 &    68 \\
\hline
\end{tabular}


In [1401]:
csv = pd.read_csv('final-frozen.csv')
# csv = csv.rename(columns = {'Is there a documentation source?': 'is_doc','is the fix mentioned?':'is_fix', 'is there unrelated change mentioned?': 'is_uc', 'is breaking change mentioned?':'is_br' , 'main source' :'source', 'How is the fix mentioned? (final codes); separated by semicolon':'fix_pattern',
# 'how the breaking change is mentioned? (final codes); separated by semicolon':'breaking_pattern'})

len(csv)

499

In [1402]:
assert df.iloc[7,4] == len(csv)

In [1403]:
q='''select id as snyk_id,
       case
        when id in (select advisory_id from advisoryCVE) then 'cve'
        else 'noncve'
end as ifcve
from advisory;'''
ifcve = pd.DataFrame(sql.execute(q))
ifcve

Unnamed: 0,snyk_id,ifcve
0,SNYK-PYTHON-SALT-1080588,cve
1,SNYK-PYTHON-SALT-1080589,cve
2,SNYK-PYTHON-SALT-1080590,cve
3,SNYK-PYTHON-SALT-1080591,cve
4,SNYK-PYTHON-SALT-1080592,cve
...,...,...
6951,SNYK-JS-GETIPRANGE-1073612,cve
6952,SNYK-JAVA-COMVAADIN-1074927,noncve
6953,SNYK-JAVA-ORGAPACHEXMLGRAPHICS-1074910,cve
6954,SNYK-JS-PROGFAYSCRAPBOXPARSER-1076803,cve


In [1404]:
cvecsv = pd.merge(csv, ifcve, on ='snyk_id')
cvecsv

Unnamed: 0,snyk_id,vulnerability type,ecosystem,package,security release,repository_url,possible release note,possible changelogs,link,is_doc,...,is_br,how the breaking change is mentioned? (initial codes); separated by semicolon,breaking_pattern,Additional Comments,row,agreed,external_codes,Unnamed: 23,Unnamed: 24,ifcve
0,npm:address-rfc2822:20180225,Regular Expression Denial of Service (ReDoS),npm,address-rfc2822,2.0.2,https://github.com/haraka/node-address-rfc2822,not found through script,https://github.com/haraka/node-address-rfc2822...,https://snyk.io/vuln/npm:address-rfc2822:20180225,y,...,n,,,,2,yes,,,,noncve
1,npm:angular-jwt:20180605,Access Restriction Bypass,npm,angular-jwt,0.1.10,https://github.com/auth0/angular-jwt,https://github.com/auth0/angular-jwt/releases/...,https://github.com/auth0/angular-jwt/blob/mast...,https://snyk.io/vuln/npm:angular-jwt:20180605,y,...,n,,,,3,resolved,f,,,cve
2,npm:braces:20180219,Regular Expression Denial of Service (ReDoS),npm,braces,2.3.1,https://github.com/micromatch/braces,not found through script,https://github.com/micromatch/braces/blob/mast...,https://snyk.io/vuln/npm:braces:20180219,y,...,n,,,,4,yes,,,,noncve
3,npm:fastify:20180107,Denial of Service (DoS),npm,fastify,0.38.0,https://github.com/fastify/fastify,https://github.com/fastify/fastify/releases/ta...,no changlog found in script,https://snyk.io/vuln/npm:fastify:20180107,y,...,y,breaking change notice; pr reference,breaking change notice; code change reference,,5,yes,,,,cve
4,npm:is-my-json-valid:20180214,Regular Expression Denial of Service (ReDoS),npm,is-my-json-valid,1.4.1,https://github.com/mafintosh/is-my-json-valid,not found through script,no changlog found in script,https://snyk.io/vuln/npm:is-my-json-valid:2018...,n,...,n,,,"the package that does not have a release note,...",6,yes,,,,noncve
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
494,SNYK-RUBY-LODASHRAILS-559448,Prototype Pollution,RubyGems,lodash-rails,4.17.11,https://github.com/lodash/lodash,not found through script,https://github.com/lodash/lodash/blob/master/C...,https://snyk.io/vuln/SNYK-RUBY-LODASHRAILS-559448,y,...,n,,,changelog in wiki,438,resolved,d,Nasif could not find the changelog?,,cve
495,SNYK-RUBY-REDPARQUET-483028,Use of Uninitialized Variable,RubyGems,red-parquet,0.15.1,https://github.com/apache/arrow,not found through script,https://github.com/apache/arrow/blob/master/CH...,https://snyk.io/vuln/SNYK-RUBY-REDPARQUET-483028,y,...,n,,,automatically genrated?,455,resolved,d,The issue number is listed,,cve
496,SNYK-DOTNET-FAVICOJS-60151,Cross-site Scripting (XSS),NuGet,favico.js,0.3.10,https://github.com/ejci/favico.js,not found through script,no changlog found in script,https://snyk.io/vuln/SNYK-DOTNET-FAVICOJS-60151,y,...,n,,,,482,yes,y,is_doc is empty in process-frozen-nasif,,noncve
497,SNYK-DOTNET-KNOCKOUT-60180,Cross-site Scripting (XSS),NuGet,knockout,3.0.0,https://github.com/knockout/knockout,https://github.com/knockout/knockout/releases/...,no changlog found in script,https://snyk.io/vuln/SNYK-DOTNET-KNOCKOUT-60180,y,...,y,warning; action required,breaking change notice;action required,,483,yes,y,is_doc is empty in process-frozen-nasif,,noncve


In [1405]:
doc = csv
doc['is_doc'] = doc['is_doc'].mask(doc['is_doc'].ne('y'))
temp_count = doc['is_doc'].count()
temp_rate = round(temp_count/total[4]*100,1)
temp = str(temp_count) + ' (' + str(temp_rate) + '%)'
total.append(temp)
doc = doc.groupby('ecosystem')[['is_doc']].count()
doc, 

(           is_doc
 ecosystem        
 Composer       63
 Go             54
 Maven          48
 NuGet          49
 RubyGems       49
 npm            43
 pip            57,)

In [1406]:
fix = csv
fix['is_fix'] = fix['is_fix'].mask(fix['is_fix'].ne('y'))
temp_count = fix['is_fix'].count()
temp_rate = round(temp_count/total[4]*100,1)
temp = str(temp_count) + ' (' + str(temp_rate) + '%)'
total.append(temp)
fix = fix.groupby('ecosystem')[['is_fix']].count()
fix

Unnamed: 0_level_0,is_fix
ecosystem,Unnamed: 1_level_1
Composer,58
Go,43
Maven,31
NuGet,40
RubyGems,43
npm,39
pip,53


In [1407]:
uc = csv
uc['is_uc'] = uc['is_uc'].mask(uc['is_uc'].ne('y'))
temp_count = uc['is_uc'].count()
temp_rate = round(temp_count/total[4]*100,1)
temp = str(temp_count) + ' (' + str(temp_rate) + '%)'
total.append(temp)
uc = uc.groupby('ecosystem')[['is_uc']].count()
uc


Unnamed: 0_level_0,is_uc
ecosystem,Unnamed: 1_level_1
Composer,50
Go,43
Maven,32
NuGet,46
RubyGems,24
npm,24
pip,41


In [1408]:
br = csv
br['is_br'] = br['is_br'].mask(br['is_br'].ne('y'))
temp_count =br['is_br'].count()
temp_rate = round(temp_count/total[4]*100,1)
temp = str(temp_count) + ' (' + str(temp_rate) + '%)'
total.append(temp)
br = br.groupby('ecosystem')[['is_br']].count()
br

Unnamed: 0_level_0,is_br
ecosystem,Unnamed: 1_level_1
Composer,2
Go,10
Maven,2
NuGet,1
RubyGems,1
npm,8
pip,8


In [1409]:
df=reduce(lambda x,y : pd.merge(x,y,on='ecosystem'),[df, doc, fix, uc, br])
df

Unnamed: 0,ecosystem,advisory,package,distinct releases,releases,cwe,is_doc,is_fix,is_uc,is_br
0,Composer,50,30,72 (98),98,18,63,58,50,2
1,Go,50,49,65 (66),66,25,54,43,43,10
2,Maven,50,49,76 (76),76,25,48,31,32,2
3,NuGet,50,28,56 (60),60,14,49,40,46,1
4,RubyGems,50,40,65 (65),65,20,49,43,24,1
5,npm,50,44,70 (70),70,18,43,39,24,8
6,pip,50,45,61 (64),64,31,57,53,41,8


In [1410]:
df['is_doc_rate'] = round(df['is_doc']/df['releases']* 100,1) 
df['is_doc'] = df['is_doc'].astype(str)
df['is_doc_rate'] = df['is_doc_rate'].astype(str)
df['is_doc'] = df['is_doc'] + ' (' + df['is_doc_rate'] +'%)'
df = df.drop('is_doc_rate', axis=1)
df['is_fix_rate'] = round(df['is_fix']/df['releases'] * 100 ,1)
df['is_fix'] = df['is_fix'].astype(str)
df['is_fix_rate'] = df['is_fix_rate'].astype(str)
df['is_fix'] = df['is_fix'] + ' (' + df['is_fix_rate'] +'%)'
df = df.drop('is_fix_rate', axis=1)
df['is_uc_rate'] = round(df['is_uc']/df['releases'] * 100,1)
df['is_uc'] = df['is_uc'].astype(str)
df['is_uc_rate'] = df['is_uc_rate'].astype(str)
df['is_uc'] = df['is_uc'] + ' (' + df['is_uc_rate'] +'%)'
df = df.drop('is_uc_rate', axis=1)
df['is_br_rate'] = round(df['is_br']/df['releases']* 100,1)
df['is_br'] = df['is_br'].astype(str)
df['is_br_rate'] = df['is_br_rate'].astype(str)
df['is_br'] = df['is_br'] + ' (' + df['is_br_rate'] +'%)'
df = df.drop('is_br_rate', axis=1)
df

Unnamed: 0,ecosystem,advisory,package,distinct releases,releases,cwe,is_doc,is_fix,is_uc,is_br
0,Composer,50,30,72 (98),98,18,63 (64.3%),58 (59.2%),50 (51.0%),2 (2.0%)
1,Go,50,49,65 (66),66,25,54 (81.8%),43 (65.2%),43 (65.2%),10 (15.2%)
2,Maven,50,49,76 (76),76,25,48 (63.2%),31 (40.8%),32 (42.1%),2 (2.6%)
3,NuGet,50,28,56 (60),60,14,49 (81.7%),40 (66.7%),46 (76.7%),1 (1.7%)
4,RubyGems,50,40,65 (65),65,20,49 (75.4%),43 (66.2%),24 (36.9%),1 (1.5%)
5,npm,50,44,70 (70),70,18,43 (61.4%),39 (55.7%),24 (34.3%),8 (11.4%)
6,pip,50,45,61 (64),64,31,57 (89.1%),53 (82.8%),41 (64.1%),8 (12.5%)


In [1411]:
total

['Total',
 350,
 285,
 '465 (499)',
 499,
 68,
 '363 (72.7%)',
 '307 (61.5%)',
 '260 (52.1%)',
 '32 (6.4%)']

In [1412]:
df.loc[len(df)]=total
df['temp'] = df['ecosystem'].str.lower()
df = df.sort_values(by = 'temp', ascending=True)
df = df.drop(['releases','temp'], axis= 1)
print(tabulate(df, tablefmt='latex', headers='keys',showindex=False))

\begin{tabular}{lrrlrllll}
\hline
 ecosystem   &   advisory &   package & distinct releases   &   cwe & is\_doc      & is\_fix      & is\_uc       & is\_br      \\
\hline
 Composer    &         50 &        30 & 72 (98)             &    18 & 63 (64.3\%)  & 58 (59.2\%)  & 50 (51.0\%)  & 2 (2.0\%)   \\
 Go          &         50 &        49 & 65 (66)             &    25 & 54 (81.8\%)  & 43 (65.2\%)  & 43 (65.2\%)  & 10 (15.2\%) \\
 Maven       &         50 &        49 & 76 (76)             &    25 & 48 (63.2\%)  & 31 (40.8\%)  & 32 (42.1\%)  & 2 (2.6\%)   \\
 npm         &         50 &        44 & 70 (70)             &    18 & 43 (61.4\%)  & 39 (55.7\%)  & 24 (34.3\%)  & 8 (11.4\%)  \\
 NuGet       &         50 &        28 & 56 (60)             &    14 & 49 (81.7\%)  & 40 (66.7\%)  & 46 (76.7\%)  & 1 (1.7\%)   \\
 pip         &         50 &        45 & 61 (64)             &    31 & 57 (89.1\%)  & 53 (82.8\%)  & 41 (64.1\%)  & 8 (12.5\%)  \\
 RubyGems    &         50 &        40 & 65 (65)  

In [1413]:
csv['source'] = csv['source'].str.strip()
csv['source'] = csv['source'].str.lower()
t =  csv[csv['is_doc']=='y']
t['source'].value_counts()

changelog                201
github release note      116
homepage announcement     45
security notice            1
Name: source, dtype: int64

In [1414]:
t = t[t['source']=='github release note']
len(t[t['possible release note']== 'not found through script'])
# t = csv[csv['source']=='n']
# t

23

In [1415]:
csv['fix_pattern'].value_counts()

fix reference                                                                                                 33
security notice;advisory reference;fix reference                                                              17
security notice;vulnerability description;advisory reference;affected component listed                        13
security notice;vulnerability description                                                                     13
security notice;fix reference                                                                                 12
                                                                                                              ..
security notice; advisory reference                                                                            1
security notice; advisory reference; fix reference; affected component listed                                  1
security notice;vulnerability description;advisory reference;exploit                            

In [1416]:
fp  = csv[['ecosystem', 'snyk_id','fix_pattern']]
fp = fp.values.tolist()
final = []
for row in fp:
    eco = row[0]
    aid = row[1]
    if row[-1] != 'nan' and isinstance(row[-1],str):
        l = row[-1].split(';')
        for elem in l:
            if elem.lower().strip() == 'n':
                print(row)
            final.append([eco,aid,elem.lower().strip()])
fp = pd.DataFrame(final, columns=['ecosystem','advisory_id','fix_pattern'])
fp['fix_pattern'].unique()

array(['vulnerability description', 'fix reference', 'security notice',
       'affected version listed', 'advisory reference', 'exploit',
       'affected component listed', 'affected configuration'],
      dtype=object)

In [1417]:
fp['temp'] = fp['ecosystem'].str.lower()
fp = fp.sort_values(by = 'temp', ascending=True)
fp.groupby(['fix_pattern', 'temp']).size()

fix_pattern                temp    
advisory reference         composer    14
                           go          12
                           maven       12
                           npm         13
                           nuget       27
                           pip         25
                           rubygems    18
affected component listed  composer    10
                           go           9
                           maven        2
                           npm          9
                           nuget        8
                           pip         24
                           rubygems    17
affected configuration     go           1
                           pip          2
affected version listed    composer    17
                           go           3
                           maven        5
                           npm          3
                           nuget        4
                           pip          1
exploit                    composer    1

In [1418]:
bp  = csv[['ecosystem', 'snyk_id','breaking_pattern']]
bp = bp.values.tolist()
final = []
for row in bp:
    eco = row[0]
    aid = row[1]
    if row[-1] != 'nan' and isinstance(row[-1],str):
        l = row[-1].split(';')
        for elem in l:
            if elem.strip():
                final.append([eco,aid,elem.lower().strip()])
bp = pd.DataFrame(final, columns=['ecosystem','advisory_id','breaking_pattern'])
bp['breaking_pattern'].unique()

array(['breaking change notice', 'code change reference',
       'affected api listed', 'action required', 'affected configuration'],
      dtype=object)

In [1419]:
bp['temp'] = bp['ecosystem'].str.lower()
bp = bp.sort_values(by = 'temp', ascending=True)
bp.groupby(['breaking_pattern', 'temp']).size()

breaking_pattern        temp    
action required         go          2
                        maven       1
                        npm         1
                        nuget       1
                        pip         4
affected api listed     composer    2
                        go          6
                        maven       1
                        npm         6
                        pip         5
affected configuration  go          3
                        npm         1
                        pip         2
                        rubygems    1
breaking change notice  composer    2
                        go          9
                        maven       2
                        npm         5
                        nuget       1
                        pip         3
code change reference   composer    1
                        go          5
                        maven       1
                        npm         2
dtype: int64

In [1420]:
cvecsv['is_fix'] = cvecsv['is_fix'].fillna(0)
cvecsv[cvecsv['is_fix'].isnull()]

Unnamed: 0,snyk_id,vulnerability type,ecosystem,package,security release,repository_url,possible release note,possible changelogs,link,is_doc,...,is_br,how the breaking change is mentioned? (initial codes); separated by semicolon,breaking_pattern,Additional Comments,row,agreed,external_codes,Unnamed: 23,Unnamed: 24,ifcve


In [1421]:
cvecsv = cvecsv[cvecsv['is_doc']=='y']
cvecsv['is_fix'] = cvecsv['is_fix'].fillna(0)
cvecsv['is_fix'] = cvecsv['is_fix'].map({'y':1,'n':0})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cvecsv['is_fix'] = cvecsv['is_fix'].fillna(0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cvecsv['is_fix'] = cvecsv['is_fix'].map({'y':1,'n':0})


In [1422]:
cvecsv[cvecsv['is_fix'].isnull()]

Unnamed: 0,snyk_id,vulnerability type,ecosystem,package,security release,repository_url,possible release note,possible changelogs,link,is_doc,...,is_br,how the breaking change is mentioned? (initial codes); separated by semicolon,breaking_pattern,Additional Comments,row,agreed,external_codes,Unnamed: 23,Unnamed: 24,ifcve


In [1423]:
cve = cvecsv[cvecsv['ifcve']=='cve']
noncve = cvecsv[cvecsv['ifcve']=='noncve']
len(cve), len(noncve)

(210, 153)

In [1424]:
list(cve['is_fix'].value_counts()), list(noncve['is_fix'].value_counts())

([184, 26], [123, 30])

In [1425]:
from scipy.stats import chi2_contingency
chi2_contingency([list(cve['is_fix'].value_counts()),list(noncve['is_fix'].value_counts())])

(3.0109251640395556,
 0.08270507405423237,
 1,
 array([[177.60330579,  32.39669421],
        [129.39669421,  23.60330579]]))

In [1426]:
cve = cve.to_dict('records')
noncve= noncve.to_dict('records')

In [1427]:
print(len(cve))
y = n = 0
for row in cve:
    s = row['fix_pattern']
    s = str(s).split(';')
    for i,e in enumerate(s):
        s[i] =  e.lower().strip()
    if 'security notice' in s:
        y+=1
    else:
        n+=1
         
y, n

210


(128, 82)

In [1428]:
print(len(noncve))
y = n = 0
for row in noncve:
    s = row['fix_pattern']
    s = str(s).split(';')
    for i,e in enumerate(s):
        s[i] =  e.lower().strip()
    if 'security notice' in s:
        y+=1
    else:
        n+=1
         
y, n

153


(70, 83)

In [1429]:
chi2_contingency([[128,82],[70,83]])

(7.647233893557422,
 0.005685944009974247,
 1,
 array([[114.54545455,  95.45454545],
        [ 83.45454545,  69.54545455]]))