# Challenge
## Part 1

In [1]:
import pandas as pd
from data import load_data
import predict
from pathlib import Path

In [2]:
# Load the dataset through the project's load_data helper; the log printed
# below shows it downloading, unzipping and loading the data and reporting the
# train/test/validation sizes.
# NOTE(review): presumably the two arguments turn the CVE descriptions into at
# most 1,000 text features — confirm against data.load_data.
data = load_data(vectorize_descriptions=True, max_features=1_000)

Downloading and unzipping data: 100%|##########| 20/20 [02:12<00:00,  6.62s/it]
Loading data: 100%|##########| 20/20 [00:15<00:00,  1.32it/s]


Number of CVEs: 161469
Number of non-rejected CVEs: 151908
Number of CVEs missing CVSSv3: 73373
Number of CVEs with CVSSv3: 78535
Number of CVEs for training: 54975
Number of CVEs for testing: 11780
Number of CVEs for validation: 11780


In [3]:
# Stack the train, test and valid feature frames (in that order) and keep only
# the columns whose name matches 'cvssV2'.
cvssV2 = pd.concat(
    [data['predict'][split] for split in ('train', 'test', 'valid')]
).filter(regex='cvssV2')
cvssV2

Unnamed: 0,cvssV2_exploitabilityScore,cvssV2_authentication,cvssV2_availabilityImpact,cvssV2_confidentialityImpact,cvssV2_integrityImpact,cvssV2_accessComplexity,cvssV2_severity,cvssV2_impactScore,cvssV2_obtainAllPrivilege,cvssV2_obtainOtherPrivilege,cvssV2_baseScore,cvssV2_acInsufInfo,cvssV2_obtainUserPrivilege,cvssV2_userInteractionRequired,cvssV2_accessVector
0,8.6,NONE,PARTIAL,PARTIAL,PARTIAL,MEDIUM,MEDIUM,6.4,False,False,6.8,False,False,True,NETWORK
1,10.0,NONE,PARTIAL,NONE,NONE,LOW,MEDIUM,2.9,False,False,5.0,False,False,False,NETWORK
2,8.6,NONE,NONE,NONE,PARTIAL,MEDIUM,MEDIUM,2.9,False,False,4.3,False,False,True,NETWORK
3,8.6,NONE,NONE,PARTIAL,NONE,MEDIUM,MEDIUM,2.9,False,False,4.3,,False,False,NETWORK
4,3.9,NONE,NONE,PARTIAL,NONE,LOW,LOW,2.9,False,False,2.1,False,False,False,LOCAL
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11775,8.6,NONE,COMPLETE,COMPLETE,COMPLETE,MEDIUM,HIGH,10.0,False,False,9.3,False,False,True,NETWORK
11776,10.0,NONE,NONE,PARTIAL,NONE,LOW,MEDIUM,2.9,False,False,5.0,False,False,False,NETWORK
11777,8.0,SINGLE,NONE,PARTIAL,NONE,LOW,MEDIUM,2.9,False,False,4.0,False,False,False,NETWORK
11778,3.4,NONE,COMPLETE,COMPLETE,COMPLETE,MEDIUM,MEDIUM,10.0,False,False,6.9,False,False,True,LOCAL


In [4]:
# Stack the CVSSv3 frames of the train, test and valid splits, in that order.
cvssV3 = pd.concat(
    [data['cvssV3'][split] for split in ('train', 'test', 'valid')]
)
cvssV3

Unnamed: 0,cvssV3_privilegesRequired,cvssV3_integrityImpact,cvssV3_baseSeverity,cvssV3_availabilityImpact,cvssV3_attackComplexity,cvssV3_userInteraction,cvssV3_confidentialityImpact,cvssV3_scope,cvssV3_baseScore,cvssV3_attackVector
0,NONE,HIGH,HIGH,HIGH,LOW,REQUIRED,HIGH,UNCHANGED,8.8,NETWORK
1,NONE,NONE,HIGH,HIGH,LOW,NONE,NONE,UNCHANGED,7.5,NETWORK
2,NONE,HIGH,MEDIUM,NONE,LOW,REQUIRED,NONE,UNCHANGED,6.5,NETWORK
3,NONE,NONE,MEDIUM,NONE,HIGH,NONE,HIGH,UNCHANGED,5.9,NETWORK
4,LOW,NONE,MEDIUM,NONE,LOW,NONE,HIGH,UNCHANGED,5.5,LOCAL
...,...,...,...,...,...,...,...,...,...,...
11775,NONE,HIGH,HIGH,HIGH,LOW,REQUIRED,HIGH,UNCHANGED,7.8,LOCAL
11776,NONE,NONE,MEDIUM,NONE,LOW,NONE,LOW,UNCHANGED,5.3,NETWORK
11777,LOW,NONE,MEDIUM,NONE,LOW,NONE,HIGH,UNCHANGED,6.5,NETWORK
11778,LOW,HIGH,HIGH,HIGH,LOW,REQUIRED,HIGH,UNCHANGED,7.3,LOCAL


In [5]:
def plot_feature(feature):
    """Count how often each CVSSv2 value co-occurs with each CVSSv3 value.

    Despite its name, this function draws nothing: it returns a table.
    It reads the module-level ``cvssV2`` and ``cvssV3`` frames.

    Args:
        feature: Metric name without prefix; the columns ``'cvssV2_' + feature``
            and ``'cvssV3_' + feature`` are compared.

    Returns:
        A pivot table with the CVSSv2 values as rows, the CVSSv3 values as
        columns, and the number of rows per combination in each cell.
    """
    paired = cvssV2['cvssV2_' + feature].to_frame(name='cvssV2')
    paired['cvssV3'] = cvssV3['cvssV3_' + feature]
    # Constant helper column so that 'count' has something to aggregate.
    paired['ones'] = [1] * len(paired)
    return paired.pivot_table(
        index='cvssV2',
        columns='cvssV3',
        values='ones',
        aggfunc='count',
    )

In [6]:
# Count CVSSv2 vs CVSSv3 value pairs for confidentialityImpact.
plot_feature('confidentialityImpact')

cvssV3,HIGH,LOW,NONE
cvssV2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
COMPLETE,13084,7,4
NONE,135,9014,17003
PARTIAL,33113,6117,58


In [7]:
# Count CVSSv2 vs CVSSv3 value pairs for integrityImpact.
plot_feature('integrityImpact')

cvssV3,HIGH,LOW,NONE
cvssV2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
COMPLETE,12756,1,4
NONE,1176,51,24457
PARTIAL,26321,13741,28


In [8]:
# Count CVSSv2 vs CVSSv3 value pairs for availabilityImpact.
plot_feature('availabilityImpact')

cvssV3,HIGH,LOW,NONE
cvssV2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
COMPLETE,15382,4,54
NONE,1236,63,29911
PARTIAL,29346,1993,546


### Answer
There are __73373__ CVEs missing CVSSv3 and __78535__ CVEs with CVSSv3.
The rejected CVEs are the ones without CVSSv2.

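No, it is not possible to map the old metric values one-to-one onto the new ones. As the `plot_feature()` tables above show, a single CVSSv2 value is spread over several CVSSv3 values; for example, a `PARTIAL` confidentiality impact in CVSSv2 corresponds to `HIGH` in 33113 cases but to `LOW` in 6117 cases.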

## Part 2 and 3

In [9]:
# Create the project's predict.Model from the loaded data; this one instance is
# reused (re-fitted) for every target below.
model = predict.Model(data)

In [10]:
def train_predict_summary(target, valid, missing):
    """Fit the shared model on one target, score it, and label the missing rows.

    Reads the module-level ``data`` and ``model``.

    Args:
        target: Column of ``data['cvssV3']['valid']`` to learn, e.g.
            ``'cvssV3_scope'``; it is also what ``model.fit`` is called with.
        valid: Validation features handed to ``model.predict_summary``.
        missing: Features handed to ``model.predict``.

    Returns:
        A ``(missing_labels, valid_summary)`` pair: the first item returned by
        ``model.predict(missing)`` and the third item returned by
        ``model.predict_summary(valid, y_true)``.
    """
    # The ground truth is looked up before fitting, as in the original order.
    expected = data['cvssV3']['valid'][target]
    model.fit(target)
    _first, _second, report = model.predict_summary(valid, expected)
    predicted, _rest = model.predict(missing)
    return predicted, report


def train_predict_summary_all():
    """Run ``train_predict_summary`` for each of the four CVSSv3 targets.

    Works on copies of ``data['predict']['valid']`` and
    ``data['predict']['missing']`` taken from the module-level ``data``.

    Returns:
        A ``(missing_labels, summaries)`` pair of dicts. ``missing_labels`` is
        keyed by ``'pred_' + target`` and ``summaries`` by the bare target name.
    """
    valid_features = data['predict']['valid'].copy()
    missing_features = data['predict']['missing'].copy()

    predicted_by_target = {}
    report_by_target = {}
    for name in ('scope', 'confidentialityImpact', 'integrityImpact',
                 'availabilityImpact'):
        labels, report = train_predict_summary(
            'cvssV3_' + name, valid_features, missing_features)
        predicted_by_target['pred_' + name] = labels
        report_by_target[name] = report
    return predicted_by_target, report_by_target


# Train on every target; keep the labels predicted for the 'missing' rows and
# the validation summary of each target.
missing_labels, summaries = train_predict_summary_all()

{'objective': 'binary', 'metric': 'auc', 'is_unbalance': 'true', 'boosting': 'gbdt', 'num_leaves': 31, 'feature_fraction': 0.5, 'bagging_fraction': 0.5, 'bagging_freq': 20, 'learning_rate': 0.05, 'verbose': 0, 'random_state': 42}
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[1]	valid_0's auc: 0.941424
Training until validation scores don't improve for 100 rounds
[2]	valid_0's auc: 0.959346
[3]	valid_0's auc: 0.958908
[4]	valid_0's auc: 0.961296
[5]	valid_0's auc: 0.963838
[6]	valid_0's auc: 0.96669
[7]	valid_0's auc: 0.967299
[8]	valid_0's auc: 0.968066
[9]	valid_0's auc: 0.968475
[10]	valid_0's auc: 0.968279
[11]	valid_0's auc: 0.968857
[12]	valid_0's auc: 0.969541
[13]	valid_0's auc: 0.969202
[14]	valid_0's auc: 0.96932
[15]	valid_0's auc: 0.969989
[16]	valid_0's auc: 0.969888
[17]	valid_0's auc: 0.969826
[18]	valid_0's auc: 0.96996
[19]	valid_0's auc: 0.970398
[20]	valid_0's auc: 0.970926
[21]	valid_0's auc

[258]	valid_0's auc: 0.980878
[259]	valid_0's auc: 0.980866
[260]	valid_0's auc: 0.98088
[261]	valid_0's auc: 0.980834
[262]	valid_0's auc: 0.980737
[263]	valid_0's auc: 0.980731
[264]	valid_0's auc: 0.980688
[265]	valid_0's auc: 0.980667
[266]	valid_0's auc: 0.980649
[267]	valid_0's auc: 0.980636
[268]	valid_0's auc: 0.980633
[269]	valid_0's auc: 0.980673
[270]	valid_0's auc: 0.980674
[271]	valid_0's auc: 0.980735
[272]	valid_0's auc: 0.980763
[273]	valid_0's auc: 0.980762
[274]	valid_0's auc: 0.980716
[275]	valid_0's auc: 0.980731
[276]	valid_0's auc: 0.980684
[277]	valid_0's auc: 0.980727
[278]	valid_0's auc: 0.980702
[279]	valid_0's auc: 0.980667
[280]	valid_0's auc: 0.980639
[281]	valid_0's auc: 0.980635
[282]	valid_0's auc: 0.980617
[283]	valid_0's auc: 0.980616
[284]	valid_0's auc: 0.980623
[285]	valid_0's auc: 0.980592
[286]	valid_0's auc: 0.980576
[287]	valid_0's auc: 0.980525
[288]	valid_0's auc: 0.980506
[289]	valid_0's auc: 0.980456
[290]	valid_0's auc: 0.980459
[291]	valid

[145]	valid_0's auc_mu: 0.989347
[146]	valid_0's auc_mu: 0.989361
[147]	valid_0's auc_mu: 0.989373
[148]	valid_0's auc_mu: 0.98937
[149]	valid_0's auc_mu: 0.989379
[150]	valid_0's auc_mu: 0.989387
[151]	valid_0's auc_mu: 0.989398
[152]	valid_0's auc_mu: 0.989398
[153]	valid_0's auc_mu: 0.989408
[154]	valid_0's auc_mu: 0.98941
[155]	valid_0's auc_mu: 0.98942
[156]	valid_0's auc_mu: 0.989423
[157]	valid_0's auc_mu: 0.989455
[158]	valid_0's auc_mu: 0.989448
[159]	valid_0's auc_mu: 0.989463
[160]	valid_0's auc_mu: 0.989459
[161]	valid_0's auc_mu: 0.989463
[162]	valid_0's auc_mu: 0.989461
[163]	valid_0's auc_mu: 0.989453
[164]	valid_0's auc_mu: 0.989456
[165]	valid_0's auc_mu: 0.989456
[166]	valid_0's auc_mu: 0.989468
[167]	valid_0's auc_mu: 0.989477
[168]	valid_0's auc_mu: 0.989478
[169]	valid_0's auc_mu: 0.989475
[170]	valid_0's auc_mu: 0.989476
[171]	valid_0's auc_mu: 0.989458
[172]	valid_0's auc_mu: 0.989465
[173]	valid_0's auc_mu: 0.989468
[174]	valid_0's auc_mu: 0.989465
[175]	valid_0

[396]	valid_0's auc_mu: 0.989821
[397]	valid_0's auc_mu: 0.989814
[398]	valid_0's auc_mu: 0.989811
[399]	valid_0's auc_mu: 0.989813
[400]	valid_0's auc_mu: 0.989818
[401]	valid_0's auc_mu: 0.989822
[402]	valid_0's auc_mu: 0.989821
[403]	valid_0's auc_mu: 0.989821
[404]	valid_0's auc_mu: 0.989828
[405]	valid_0's auc_mu: 0.989831
[406]	valid_0's auc_mu: 0.989823
[407]	valid_0's auc_mu: 0.98982
[408]	valid_0's auc_mu: 0.989815
[409]	valid_0's auc_mu: 0.989809
[410]	valid_0's auc_mu: 0.989808
[411]	valid_0's auc_mu: 0.989806
[412]	valid_0's auc_mu: 0.989798
[413]	valid_0's auc_mu: 0.9898
[414]	valid_0's auc_mu: 0.989793
[415]	valid_0's auc_mu: 0.989785
[416]	valid_0's auc_mu: 0.989788
[417]	valid_0's auc_mu: 0.989788
[418]	valid_0's auc_mu: 0.989785
[419]	valid_0's auc_mu: 0.989786
[420]	valid_0's auc_mu: 0.989784
[421]	valid_0's auc_mu: 0.98979
[422]	valid_0's auc_mu: 0.98979
[423]	valid_0's auc_mu: 0.989789
[424]	valid_0's auc_mu: 0.98979
[425]	valid_0's auc_mu: 0.98979
[426]	valid_0's a

[120]	valid_0's auc_mu: 0.993
[121]	valid_0's auc_mu: 0.993007
[122]	valid_0's auc_mu: 0.993001
[123]	valid_0's auc_mu: 0.993002
[124]	valid_0's auc_mu: 0.993012
[125]	valid_0's auc_mu: 0.993015
[126]	valid_0's auc_mu: 0.993026
[127]	valid_0's auc_mu: 0.993035
[128]	valid_0's auc_mu: 0.993051
[129]	valid_0's auc_mu: 0.99305
[130]	valid_0's auc_mu: 0.993057
[131]	valid_0's auc_mu: 0.993069
[132]	valid_0's auc_mu: 0.993077
[133]	valid_0's auc_mu: 0.993078
[134]	valid_0's auc_mu: 0.993078
[135]	valid_0's auc_mu: 0.993085
[136]	valid_0's auc_mu: 0.993086
[137]	valid_0's auc_mu: 0.993088
[138]	valid_0's auc_mu: 0.993087
[139]	valid_0's auc_mu: 0.993087
[140]	valid_0's auc_mu: 0.99309
[141]	valid_0's auc_mu: 0.993083
[142]	valid_0's auc_mu: 0.993075
[143]	valid_0's auc_mu: 0.993073
[144]	valid_0's auc_mu: 0.993073
[145]	valid_0's auc_mu: 0.993074
[146]	valid_0's auc_mu: 0.993072
[147]	valid_0's auc_mu: 0.993067
[148]	valid_0's auc_mu: 0.993064
[149]	valid_0's auc_mu: 0.993058
[150]	valid_0's

[75]	valid_0's auc_mu: 0.968758
[76]	valid_0's auc_mu: 0.968899
[77]	valid_0's auc_mu: 0.968858
[78]	valid_0's auc_mu: 0.968779
[79]	valid_0's auc_mu: 0.968821
[80]	valid_0's auc_mu: 0.969019
[81]	valid_0's auc_mu: 0.969233
[82]	valid_0's auc_mu: 0.96962
[83]	valid_0's auc_mu: 0.969574
[84]	valid_0's auc_mu: 0.969555
[85]	valid_0's auc_mu: 0.969495
[86]	valid_0's auc_mu: 0.969499
[87]	valid_0's auc_mu: 0.96953
[88]	valid_0's auc_mu: 0.969614
[89]	valid_0's auc_mu: 0.969621
[90]	valid_0's auc_mu: 0.969674
[91]	valid_0's auc_mu: 0.969583
[92]	valid_0's auc_mu: 0.969538
[93]	valid_0's auc_mu: 0.969539
[94]	valid_0's auc_mu: 0.969545
[95]	valid_0's auc_mu: 0.969544
[96]	valid_0's auc_mu: 0.96952
[97]	valid_0's auc_mu: 0.96952
[98]	valid_0's auc_mu: 0.969504
[99]	valid_0's auc_mu: 0.969528
[100]	valid_0's auc_mu: 0.969527
[101]	valid_0's auc_mu: 0.969469
[102]	valid_0's auc_mu: 0.969417
[103]	valid_0's auc_mu: 0.969527
[104]	valid_0's auc_mu: 0.969586
[105]	valid_0's auc_mu: 0.969562
[106]	

In [11]:
# Make sure the output directory exists before writing into it.
OUTPUT = Path('output')
OUTPUT.mkdir(exist_ok=True)

# Put the CVSSv2 columns of the 'missing' rows side by side with the predicted
# labels and save the result without the index column.
# NOTE(review): a column-wise concat aligns rows on the index — confirm that
# the predicted labels share the index of data['predict']['missing'].
results = pd.concat(
    [
        data['predict']['missing'].filter(regex='cvssV2'),
        pd.DataFrame(missing_labels),
    ],
    axis='columns',
)
results.to_csv(OUTPUT / 'results.csv', index=False)

In [12]:
# Show the validation summary collected for each target.
summaries

{'scope': {'accuracy': 0.9595076400679117,
  'precision': {'CHANGED': 0.8680935788949726,
   'UNCHANGED': 0.9783031419506704},
  'recall': {'CHANGED': 0.8916155419222904, 'UNCHANGED': 0.9730252442996743},
  'fscore': {'CHANGED': 0.8796973518284994, 'UNCHANGED': 0.9756570553712681},
  'support': {'CHANGED': 1956, 'UNCHANGED': 9824}},
 'confidentialityImpact': {'accuracy': 0.9409168081494058,
  'precision': {'HIGH': 0.9340840428453722,
   'LOW': 0.9064565327910523,
   'NONE': 0.9873567759778744},
  'recall': {'HIGH': 0.9738010021474588,
   'LOW': 0.7844258688957325,
   'NONE': 0.9908802537668517},
  'fscore': {'HIGH': 0.9535291231513282,
   'LOW': 0.8410377358490565,
   'NONE': 0.9891153770037602},
  'support': {'HIGH': 6985, 'LOW': 2273, 'NONE': 2522}},
 'integrityImpact': {'accuracy': 0.9449915110356536,
  'precision': {'HIGH': 0.937956797141465,
   'LOW': 0.9264392324093816,
   'NONE': 0.9658393381371764},
  'recall': {'HIGH': 0.9575526446692091,
   'LOW': 0.8355769230769231,
   'NONE