In [202]:
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import StratifiedShuffleSplit
import pandas as pd
import numpy as np

In [203]:
obs_df = pd.read_csv("../datasets/TrainingV3/observed-examples.csv")
cwe_desc_df = pd.read_csv("../datasets/TrainingV3/cwe-schema-descriptions.csv")
nvd_df = pd.read_csv("../datasets/TrainingV3/nvd-parsed-dataset.csv")
xfrc_df = pd.read_csv("../datasets/TrainingV3/x_force_dataset.csv")

merged_df = pd.concat(
    [obs_df, cwe_desc_df, nvd_df, xfrc_df],
    sort=True
)

print("Length: {}".format(len(merged_df)))

Length: 190685


In [204]:
merged_df.loc[merged_df["CWE-ID"] == 1004]

Unnamed: 0,CVE,CWE-ID,Description
0,CVE-2014-3852,1004,CMS written in Python does not include the HTT...
1,CVE-2015-4138,1004,Appliance for managing encrypted communication...
0,,1004,The software uses a cookie to store sensitive ...
1,,1004,The HttpOnly flag directs compatible browsers ...
2,,1004,An HTTP cookie is a small piece of data attrib...
3,,1004,"If the HttpOnly flag is not set, then sensitiv..."
4,,1004,If the cookie in question is an authentication...
66389,CVE-2014-3852,1004,Pyplate could allow a remote attacker to obtai...
66390,CVE-2015-4138,1004,"Blue Coat SSL Visibility Appliance SV800, SV18..."


In [205]:
class ObsExampleEnrichment(BaseEstimator, TransformerMixin):
    """
    The observed examples often are more specific. 
    """
    def __init__(self, obs_df):
        self.obs_df = obs_df
        
        # If the observed example is the member of a class, do not replace
        self.cwe_classes = [1023, 1038, 1039, 1059, 1061, 1076, 1078, 1093, 1120, 114, 116, 1164, 1176, 1177, 118, 119, 1229, 138, 159, 172, 185, 20, 200, 216, 221, 228, 269, 271, 282, 285, 286, 287, 300, 311, 326, 327, 330, 340, 345, 362, 377, 400, 402, 404, 405, 406, 407, 424, 436, 441, 446, 451, 506, 514, 522, 573, 592, 610, 636, 637, 638, 642, 657, 662, 665, 666, 667, 668, 669, 670, 671, 672, 673, 674, 675, 684, 696, 704, 705, 706, 732, 74, 75, 754, 755, 758, 77, 790, 799, 834, 862, 863, 912, 913, 922, 923, 943, 99]
    def fit(self, frame):
        return self
    def transform(self, frame):
        count = 0
        for row, observed_example in self.obs_df.iterrows():
            if observed_example["CWE-ID"] not in self.cwe_classes:
                frame.loc[frame["CVE"] == observed_example["CVE"], 'CWE-ID'] = observed_example["CWE-ID"]
                count += 1
        print("[+] Found and replaced {} examples with observed examples".format(count))
        return frame

In [206]:

class StripVersionNumbers(BaseEstimator, TransformerMixin):
    """
    Strip all version strings.
    
    Examples:
        1.0
        1.7.6
        7.54
        4.3.2229
        0.7rc1
        1.0b
        0.08
        2.2.x
        2.2.7-dev
        2.6.26.4
        3.x
        0.848b
        1.1.21rc1
        4.1.7.F
        1.913-2.fc7
    """
    def __init__(self):
        self.reg = "(\d+\.)([a-zA-Z-0-9]+\.)?([a-zA-Z-0-9]+\.)?([a-zA-Z-0-9]+\.?)"
    def fit(self, frame):
        return self
    def transform(self, frame):
        import re
        frame["Description"] = frame["Description"].apply(lambda desc: re.sub(self.reg, '', desc))
        return frame

In [207]:
class StripWhitespace(BaseEstimator, TransformerMixin):
    """
    Replace all runs of whitespace with a single space
    """
    def __init__(self):
        pass
    def fit(self, frame):
        return self
    def transform(self, frame):
        import re
        frame["Description"] = frame["Description"].apply(lambda desc: re.sub(r"\s+", ' ', desc))#desc.replace('\n', '').replace('\t', ''))
        return frame

In [208]:
underrep_frame = pd.DataFrame()
class ExtractAndRemoveUnderrespresentedCWEs(BaseEstimator, TransformerMixin):
    """
   Sets a variable `underrep_frame` in the global scope to the extracted frame
    """
    def __init__(self, threshold: int = 5):
        self.threshold = threshold
    def fit(self, frame):
        return self
    def transform(self, frame):
        counts = frame["CWE-ID"].value_counts()
        underrepresented_cwes = counts.where(counts <= self.threshold).dropna().index.tolist()
        ur_frame = frame[frame['CWE-ID'].isin(underrepresented_cwes)]
        
        global underrep_frame 
        underrep_frame = ur_frame
        
        frame = frame[~frame['CWE-ID'].isin(counts[counts <= self.threshold].index)]
        final_counts = frame["CWE-ID"].value_counts()
        print("Original Number of CWEs: {}, final count: {}".format(len(counts), len(final_counts)))
        return frame

In [209]:
lookup_table = None
class CreateAndMapLookupTable(BaseEstimator, TransformerMixin):
    """
    Map all CWEs to numbers `0...num_uniq_cwe`. Set global `lookup_table` variable
    """
    def __init__(self):
        pass
    def fit(self, frame):
        return self
    def transform(self, frame):
        labels = frame['CWE-ID'].unique()
        global lookup_table
        lookup_table = dict(zip(list(labels), range(0, len(labels))))
        frame["CWE-ID"] = frame['CWE-ID'].apply(lambda x:lookup_table[x])
        return frame

In [210]:

cve_transform_pipeline = Pipeline(
                        steps=[
                            
                            ('obs_enrichment',       ObsExampleEnrichment(obs_df=obs_df)),
                            ('strip_version',        StripVersionNumbers()),
                            ('strip_whitespace',     StripWhitespace()),
                            ('extract_underrep_cwe', ExtractAndRemoveUnderrespresentedCWEs(threshold=5)),
                            ('lookup_table',         CreateAndMapLookupTable())
                        ]
                        )

In [211]:
merged_and_pipelined_df = cve_transform_pipeline.fit_transform(merged_df)

[+] Found and replaced 3358 examples with observed examples
Original Number of CWEs: 882, final count: 409


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [212]:
sss_train_test = StratifiedShuffleSplit(n_splits=1, test_size=0.3)
sss_train_test.get_n_splits(merged_and_pipelined_df['Description'], merged_and_pipelined_df['CWE-ID'])

# 70 / 30 train/test split
train_idx, test_idx = [], []
for train, test in sss_train_test.split(merged_and_pipelined_df['Description'], merged_and_pipelined_df['CWE-ID']):
        train_idx.append(train)
        test_idx.append(test)
        
train_frame = merged_and_pipelined_df.iloc[train_idx[0]]
test_frame = merged_and_pipelined_df.iloc[test_idx[0]]

# Split the current test set in two to create a Validation set
# 70/15/15

sss_test_validation = StratifiedShuffleSplit(n_splits=1, test_size=0.5)
sss_train_test.get_n_splits(test_frame['Description'], test_frame['CWE-ID'])
test_idx, validation_idx = [], []
for test, validation in sss_test_validation.split(test_frame['Description'], test_frame['CWE-ID']):
        test_idx.append(test)
        validation_idx.append(validation)

test_frame = merged_and_pipelined_df.iloc[test_idx[0]]
validation_frame = merged_and_pipelined_df.iloc[validation_idx[0]]

In [194]:
output_dir = "../datasets/TrainingV3/TTVDatasets/"

merged_and_pipelined_df.to_csv(output_dir + "all_data.csv", index=False)
train_frame.to_csv(output_dir + "train_cwe_nlp.csv", index=False)
test_frame.to_csv(output_dir + "test_cwe_nlp.csv", index=False)
validation_frame.to_csv(output_dir + "validation_cwe_nlp.csv", index=False)
underrep_frame.to_csv(output_dir + "underrep_cwe_nlp.csv", index=False)

import pickle
pickle.dump( lookup_table, open( output_dir + "lookup_table.p", "wb" ) )

In [213]:
train_labels = train_frame['CWE-ID'].unique()
validation_labels = validation_frame['CWE-ID'].unique()
test_labels = test_frame['CWE-ID'].unique()
merged_labels = np.concatenate((train_labels, validation_labels, test_labels))
merged_labels = np.unique(merged_labels)

print(len(train_labels))
print(len(validation_labels))
print(len(test_labels))
print("Unique: {}".format(len(merged_labels)))

409
407
403
Unique: 409


In [198]:
print(lookup_table)

{1004: 0, 1007: 1, 1021: 2, 1037: 3, 113: 4, 93: 5, 115: 6, 116: 7, 788: 8, 823: 9, 119: 10, 680: 11, 1244: 12, 120: 13, 130: 14, 170: 15, 1236: 16, 787: 17, 786: 18, 125: 19, 129: 20, 781: 21, 833: 22, 131: 23, 682: 24, 155: 25, 783: 26, 134: 27, 426: 28, 138: 29, 150: 30, 141: 31, 48: 32, 143: 33, 144: 34, 147: 35, 151: 36, 157: 37, 154: 38, 152: 39, 56: 40, 156: 41, 158: 42, 42: 43, 177: 44, 626: 45, 166: 46, 239: 47, 85: 48, 841: 49, 174: 50, 176: 51, 33: 52, 46: 53, 54: 54, 178: 55, 58: 56, 439: 57, 433: 58, 180: 59, 57: 60, 181: 61, 55: 62, 182: 63, 35: 64, 502: 65, 183: 66, 184: 67, 86: 68, 79: 69, 78: 70, 28: 71, 625: 72, 50: 73, 185: 74, 209: 75, 186: 76, 187: 77, 305: 78, 416: 79, 190: 80, 191: 81, 193: 82, 617: 83, 194: 84, 681: 85, 95: 86, 89: 87, 20: 88, 824: 89, 204: 90, 213: 91, 208: 92, 250: 93, 215: 94, 207: 95, 449: 96, 206: 97, 532: 98, 210: 99, 211: 100, 212: 101, 214: 102, 219: 103, 22: 104, 73: 105, 98: 106, 222: 107, 223: 108, 65: 109, 226: 110, 316: 111, 394: 11

In [200]:
for i in merged_labels:
    print(lookup_table[i])

KeyError: 0