In [None]:
import pandas as pd
import numpy as np
import string
# Load the training set; the file is pipe-delimited (sep='|').
df = pd.read_csv('drive/MyDrive/QED/cybersecurity_training.csv', sep='|')

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline


In [None]:
# Show how many rows fall into each value of the target column "notified".
df["notified"].value_counts()

0    10865
1      652
Name: notified, dtype: int64

In [None]:
# Column groups; each group is handled by a different transformer in the
# pipelines built further down in this notebook.

# Numeric columns; the final pipeline mean-imputes them.
quantitative = ['overallseverity', 'timestamp_dist', 'correlatedcount',
                'srcip_cd', 'dstip_cd', 'srcport_cd', 'dstport_cd',
                'alerttype_cd', 'direction_cd', 'eventname_cd', 'severity_cd',
                'reportingdevice_cd', 'devicetype_cd', 'devicevendor_cd',
                'domain_cd', 'protocol_cd', 'username_cd',  'srcipcategory_cd',
                'dstipcategory_cd', 'untrustscore', 'flowscore', 'trustscore',
                'enforcementscore', 'thrcnt_month', 'thrcnt_week', 'thrcnt_day',
                'p6', 'p9', 'p5m', 'p5w', 'p5d', 'p8m', 'p8w', 'p8d' ]
# Categorical columns; one-hot encoded by QualitativeToNumericTransformer.
qualitative=['categoryname','ipcategory_name','ipcategory_scope',
             'parent_category', 'grandparent_category', 'isiptrusted',
             'dstipcategory_dominate', 'srcipcategory_dominate', 
             'dstportcategory_dominate',  'srcportcategory_dominate'
             ]
# Columns turned into cosine/sine pairs by CyclicalToNumericTransformer.
cyclic=['start_hour', 'start_minute', 'start_second']
# Column that is ordinal-encoded by SelectedFeaturesOrdinalEncoder.
level=["client_code"]
# Column that is ordinal-encoded first (with an explicit weekday order) and
# then converted to a cosine/sine pair together with the `cyclic` columns.
cyclic_level=['weekday']
# Columns removed from the data right after loading.
deleted=["alert_ids"]

In [None]:
# Remove the columns listed in `deleted`.
# Re-binding the name and passing errors="ignore" keeps this cell re-runnable:
# the original in-place drop raised KeyError on a second execution because the
# columns were already gone.
df = df.drop(columns=deleted, errors="ignore")

In [None]:
# Separate the target column from the feature columns.
y=df["notified"]
X=df.drop(columns=["notified"])

In [None]:
# Print the number of distinct values of every categorical column
# (`unique()` reports a missing value as one more distinct value).
for column_name in qualitative:
    distinct_values = df[column_name].unique()
    print(column_name, '   ', len(distinct_values))

categoryname     9
ipcategory_name     8
ipcategory_scope     4
parent_category     4
grandparent_category     2
isiptrusted     3
dstipcategory_dominate     6
srcipcategory_dominate     8
dstportcategory_dominate     6
srcportcategory_dominate     6


In [None]:
#Transformer to Numeric
class AbstarctFeatureToNumericTransformer:
    """Base class for transformers that operate on a named subset of columns.

    Subclasses implement ``transform``; ``fit`` and ``fit_transform`` are
    provided here.

    Parameters
    ----------
    name_of_features : list of str
        Names of the columns the transformer should work on.
    """

    def __init__(self, name_of_features):
        self.name_of_features = name_of_features

    def fit(self, x, y=None):
        """Keep only the configured feature names that exist in ``x``.

        The filtered list replaces ``self.name_of_features``, so names that
        are absent from ``x`` are forgotten by later calls as well.

        Returns
        -------
        self
        """
        self.name_of_features = [f for f in self.name_of_features if f in x.columns]
        return self

    def fit_transform(self, x, y=None):
        """Call ``fit`` and then return ``transform(x)``."""
        self.fit(x, y)
        return self.transform(x)

    def transform(self, x: pd.DataFrame):
        """Abstract hook; must be overridden by subclasses.

        Raises
        ------
        NotImplementedError
            Always, when called on the base class.
        """
        # The original raised the undefined name `NonImplemented`, which
        # surfaced as a NameError instead of the intended exception.
        raise NotImplementedError("the method is not implemented, instatiate a child class")
     
class SelectedFeaturesOrdinalEncoder(OrdinalEncoder):
    """``OrdinalEncoder`` that is fitted on, and applied to, selected columns only.

    Parameters
    ----------
    name_of_level_features : list of str
        Columns handed to the underlying encoder.
    *args, **kwargs
        Forwarded unchanged to ``OrdinalEncoder.__init__``.
    """
    # NOTE(review): scikit-learn's estimator conventions ask for explicit
    # keyword parameters in __init__ (no *args); get_params()/clone() may
    # reject this signature on some versions - TODO confirm against the
    # installed release.

    def __init__(self, name_of_level_features, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.name_of_level_features = name_of_level_features

    def fit(self, x, y=None):
        """Fit the parent encoder on the selected columns of ``x``."""
        selected = x[self.name_of_level_features]
        return super().fit(selected, y)

    def transform(self, x: pd.DataFrame):
        """Return a copy of ``x`` whose selected columns hold the encoded values."""
        columns = self.name_of_level_features
        encoded = super().transform(x[columns])
        result = x.copy()
        result[columns] = encoded
        return result

class SelectedFeaturesNanSimpleImputer(SimpleImputer):
    """``SimpleImputer`` that is fitted on, and applied to, selected columns only.

    Parameters
    ----------
    feature_names : list of str
        Columns handed to the underlying imputer.
    *args, **kwargs
        Forwarded unchanged to ``SimpleImputer.__init__``.
    """
    # NOTE(review): scikit-learn's estimator conventions ask for explicit
    # keyword parameters in __init__ (no *args); get_params()/clone() may
    # reject this signature on some versions - TODO confirm against the
    # installed release.

    def __init__(self, feature_names, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.feature_names = feature_names

    def fit(self, x, y=None):
        """Fit the parent imputer on the selected columns of ``x``."""
        selected = x[self.feature_names]
        return super().fit(selected, y)

    def transform(self, x: pd.DataFrame):
        """Return a copy of ``x`` whose selected columns hold the imputed values."""
        columns = self.feature_names
        imputed = super().transform(x[columns])
        result = x.copy()
        result[columns] = imputed
        return result
    
class CyclicalToNumericTransformer(AbstarctFeatureToNumericTransformer):
    """Replace each cyclic column by its cosine/sine pair.

    For every configured column ``c`` the output gains ``c_x`` (cosine) and
    ``c_y`` (sine) as its last columns, ``c`` itself is removed, and the row
    index is reset to ``0..n-1``.

    Parameters
    ----------
    name_of_features : list of str
        Columns to convert.
    periods : dict, optional
        Maps a column name to the length of its cycle (for example
        ``{'start_hour': 24}``). Columns without an entry are scaled by the
        largest value found in the frame being transformed, which is the
        original behaviour.
    """

    def __init__(self, name_of_features, periods=None):
        super().__init__(name_of_features)
        self.periods = periods

    def transform(self, x: pd.DataFrame):
        """Return a copy of ``x`` with the cyclic columns converted."""
        x = x.copy()
        periods = self.periods or {}
        for c in self.name_of_features:
            # `np.float` was only an alias of the builtin `float`; it is
            # deprecated since NumPy 1.20 and removed in NumPy 1.24.
            xc = x[c].astype(float)
            period = periods.get(c)
            if period is None:
                # Legacy scaling. Note that the largest value then lands on
                # the same angle as 0, and that the scale depends on the
                # frame passed in (train and test may differ).
                period = np.max(xc)
            angle = 2 * np.pi * xc / period
            x = x.drop(columns=c).reset_index(drop=True)
            # Assign raw arrays so the values are placed by position.
            x[c + '_x'] = np.cos(angle).to_numpy()
            x[c + '_y'] = np.sin(angle).to_numpy()
        return x

class QualitativeToNumericTransformer(AbstarctFeatureToNumericTransformer):
    """One-hot encode categorical columns against a fixed list of categories.

    Parameters
    ----------
    name_of_level_features : list of str
        Columns to encode.
    category_names : list of list, optional
        One list of categories per column, in the same order as
        ``name_of_level_features``. When omitted, the categories are learned
        in ``fit`` from the distinct non-missing values of each column.
    """

    def __init__(self, name_of_level_features, category_names=None):
        super().__init__(name_of_level_features)
        self.category_names = category_names

    def fit(self, X, y=None):
        """Learn (or align) the categories of the columns present in ``X``.

        Columns that are missing from ``X`` are skipped, and the category
        lists are filtered the same way so that features and categories stay
        paired. The original learned categories before the absent columns
        were filtered out, which raised KeyError for learned categories or
        mis-paired user-supplied ones.
        """
        if self.category_names is None:
            # NaN is excluded: pandas refuses null values as categories.
            self.category_names = [
                list(X[feature].dropna().unique())
                for feature in self.name_of_features
                if feature in X.columns
            ]
        else:
            self.category_names = [
                categories
                for feature, categories in zip(self.name_of_features, self.category_names)
                if feature in X.columns
            ]
        return super().fit(X, y)

    def qualitative_features_to_one_hot(self, feature, categories, data):
        """Return the dummy columns of ``data``; the first category is dropped.

        Values outside ``categories`` (including missing ones) produce a row
        of zeros.
        """
        data = pd.Categorical(data, categories=categories)
        one_hot = pd.get_dummies(data, drop_first=True, prefix=feature)
        return one_hot

    def transform(self, x: pd.DataFrame):
        """Return a copy of ``x`` with the encoded columns replaced by dummies.

        The dummy columns are appended after the remaining columns, in
        feature order, and the row index is reset to ``0..n-1``.
        """
        x = x.copy()
        pairs = list(zip(self.name_of_features, self.category_names))
        if not pairs:
            return x
        one_hots = [
            self.qualitative_features_to_one_hot(feature, categories, x[feature])
            for feature, categories in pairs
        ]
        remaining = x.drop(columns=[feature for feature, _ in pairs])
        remaining = remaining.reset_index(drop=True)
        # A single concat instead of one per feature avoids re-copying the
        # whole frame for every encoded column.
        return pd.concat([remaining] + one_hots, axis=1)

class NanReplacer(AbstarctFeatureToNumericTransformer):
    """Fill missing values of the configured columns with the sentinel -1."""

    def transform(self, x):
        """Return a copy of ``x``; only the configured columns are filled."""
        columns = self.name_of_features
        filled = x.copy()
        filled[columns] = filled[columns].fillna(-1)
        return filled

        

In [None]:
class EncryptedIPTransformer:
    """Split the dotted ``ip`` column into four integer-coded columns.

    Each dot-separated segment is looked up in ``self.map``: the strings
    ``'0'``..``'255'`` map to 0..255, the two-letter codes ``'AA'``..``'ZZ'``
    map to 256..931, and the empty string maps to -1. The results are stored
    in ``ip1``..``ip4`` and the ``ip`` column is removed.
    """

    # Output columns, one per segment position.
    _PART_COLUMNS = ["ip1", "ip2", "ip3", "ip4"]

    def __init__(self):
        letters = string.ascii_uppercase
        codes = list(range(256)) + [ll+l for ll in letters for l in letters]
        self.map = {str(c):int(i) for i, c in enumerate(codes)}
        self.map[''] = -1

    def fit(self, x, y = None):
        """Nothing is learned from the data; returns ``self``."""
        return self

    def transform(self, x):
        """Return a copy of ``x`` with ``ip`` replaced by ``ip1``..``ip4``.

        A missing address, or an address with fewer than four segments,
        yields missing values in the affected columns; segments beyond the
        fourth are ignored. A segment that is not in ``self.map`` is left
        unchanged, as in the original implementation.
        """
        x = x.copy()
        # expand=True copes with missing addresses and ragged segment
        # counts, which the original list-of-lists construction did not.
        parts = x["ip"].str.split(pat='.', expand=True)
        parts = parts.reindex(columns=range(len(self._PART_COLUMNS)))
        lookup = self.map
        for position, column in enumerate(self._PART_COLUMNS):
            # A plain dictionary lookup per value replaces the nested-dict
            # DataFrame.replace call, which is slow for ~930 keys.
            coded = parts[position].map(lambda token: lookup.get(token, token))
            x[column] = coded.to_numpy()
        return x.drop(columns="ip")

    def fit_transform(self, x, y=None):
        """Call ``fit`` and then return ``transform(x)``."""
        self.fit(x, y)
        return self.transform(x)



In [None]:
# Without rows containing NaN: discard every row that has a missing value in
# any column from position 26 onward.
mask = X.iloc[:, 26:].isna().any(axis=1)
X = X.loc[~mask]
y = y.loc[~mask]

nan_fetaures = ['n1', 'n2', 'n3', 'n4', 'n5', 'n6', 'n7', 'n8', 'n9','n10', 'score']
weekdays = ['Mon', 'Tue','Wed','Thu','Fri','Sat', 'Sun']

# Preprocessing used for feature selection; the steps run in the listed order.
constant_pipe = Pipeline([
    ["EncryptedIPTransformer()", EncryptedIPTransformer()],
    ["NanReplacer", NanReplacer(nan_fetaures)],
    ["QualitativeToNumericTransformer", QualitativeToNumericTransformer(qualitative)],
    ["SelectedFeaturesOrdinalEncoder1", SelectedFeaturesOrdinalEncoder(level)],
    ["SelectedFeaturesOrdinalEncoder2", SelectedFeaturesOrdinalEncoder(cyclic_level, categories=[weekdays])],
    ["CyclicalToNumericTransformer", CyclicalToNumericTransformer(cyclic+cyclic_level)],
])

X = constant_pipe.fit_transform(X)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


In [None]:
import warnings
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
# Suppress every warning from this point on.
warnings.filterwarnings("ignore")
# Candidate pool: all columns of the preprocessed feature matrix.
features = list(X.columns)
target_feat_cnt = len(features)
# Empty starting state for a selection run from scratch; the hard-coded
# checkpoint assignments that follow in this cell overwrite both lists.
selected_features = []
rocauc_history = []

# Hard-coded checkpoint: one score per already selected feature, in selection
# order (presumably pasted from an earlier run of the loop below - TODO confirm).
rocauc_history = [0.7102294509818448, 0.7474142744195535, 0.7689796377686984, 0.8154099650882308, 0.837725021984364, 0.8554998113333901, 0.868865934027103, 0.8738854199837197, 0.8791823283999429, 0.8880104393659607, 0.8936416087792456, 0.897455449821598, 0.8979330276962469, 0.900245691460815, 0.8979407154151395, 0.8983210850365143, 0.9004484173830545, 0.8998938803288166, 0.9009116069475922, 0.9010200632742981, 0.9003008424925211, 0.9011951536999273, 0.9034135230659835, 0.9019453623466449, 0.9042696627190164, 0.904424551221003, 0.9054036925449569, 0.9046089501759464, 0.9033658426972526, 0.9037443786524714, 0.9060161813967236, 0.9066600318476311, 0.9062571821983431, 0.9049236568240702, 0.9061715291693446, 0.9069639484239449, 0.9070739854304112, 0.9069910211382949, 0.9070618877156372, 0.9062221467202782, 0.9073973342915684, 0.9070091640290748, 0.9069194505425523, 0.9076988378451645, 0.906867939059244, 0.9076533967066146, 0.9071905825483876, 0.9069997997502899, 0.9085697793472078, 0.9064383038980681]
selected_features = ['ip1', 'untrustscore', 'client_code', 'thrcnt_month', 'correlatedcount', 'categoryname_Control and Maintain', 'srcportcategory_dominate_0.0', 'ip2', 'categoryname_Exploit', 'overallseverity', 'srcipcategory_dominate_INTERNET', 'dstportcategory_dominate_1.0', 'n5', 'n7', 'n2', 'flowscore', 'score', 'n4', 'ipcategory_name_PRIV-CGN', 'srcportcategory_dominate_2.0', 'p5d', 'parent_category_1', 'dstport_cd', 'enforcementscore', 'dstipcategory_dominate_PRIV-10', 'dstip_cd', 'ipcategory_scope_Private network', 'dstipcategory_dominate_PRIV-CGN', 'reportingdevice_cd', 'protocol_cd', 'devicevendor_cd', 'categoryname_Attack Preparation', 'srcportcategory_dominate_1.0', 'isiptrusted_1.0', 'srcipcategory_dominate_PRIV-10', 'srcipcategory_dominate_LOOPBACK', 'dstipcategory_cd', 'dstipcategory_dominate_PRIV-192', 'ipcategory_name_BROADCAST', 'categoryname_Suspicious Network Activity', 'ipcategory_scope_Subnet', 'ipcategory_name_PRIV-172', 'alerttype_cd', 'thrcnt_week', 'eventname_cd', 'ipcategory_scope_Host', 'p8d', 'categoryname_Suspicious Reputation', 'devicetype_cd', 'domain_cd']

# Greedy forward selection: every round adds the candidate feature whose
# inclusion gives the highest mean cross-validated ROC AUC.
cv = StratifiedKFold(5)
# A fixed seed makes the candidates of one round comparable and the run repeatable.
clf = RandomForestClassifier(random_state=0)
curr_feat_cnt = len(rocauc_history)

# Candidates still to be tried, kept in column order. The original used a
# set difference, whose iteration order can change between sessions.
already_selected = set(selected_features)
features = [f for f in features if f not in already_selected]

# One round per remaining candidate. The original looped `target_feat_cnt`
# times regardless of how many features were already selected, so it hit
# `np.max([])` (ValueError) once the candidate pool was exhausted.
for _ in range(len(features)):
    candidate_scores = {}
    for f in features:
        candidate_matrix = X[selected_features + [f]].values
        fold_scores = cross_val_score(clf, candidate_matrix, y.values, cv=cv, scoring='roc_auc')
        candidate_scores[f] = fold_scores.mean()
    # Keyed by feature rather than by score, so equal scores cannot
    # overwrite each other.
    best_feature = max(candidate_scores, key=candidate_scores.get)
    rocauc_history.append(candidate_scores[best_feature])
    selected_features.append(best_feature)
    features.remove(best_feature)

In [None]:
# Hard-coded selection results: a list of scores and the list of feature names
# in selection order (presumably recorded from a finished run of the
# forward-selection cell - TODO confirm).
aucs = [0.7102294509818448, 0.7474142744195535, 0.7689796377686984, 0.8154099650882308, 0.837725021984364, 0.8554998113333901, 0.868865934027103, 0.8738854199837197, 0.8791823283999429, 0.8880104393659607, 0.8936416087792456, 0.897455449821598, 0.8979330276962469, 0.900245691460815, 0.8979407154151395, 0.8983210850365143, 0.9004484173830545, 0.8998938803288166, 0.9009116069475922, 0.9010200632742981, 0.9003008424925211, 0.9011951536999273, 0.9034135230659835, 0.9019453623466449, 0.9042696627190164, 0.904424551221003, 0.9054036925449569, 0.9046089501759464, 0.9033658426972526, 0.9037443786524714, 0.9060161813967236, 0.9066600318476311, 0.9062571821983431, 0.9049236568240702, 0.9061715291693446, 0.9069639484239449, 0.9070739854304112, 0.9069910211382949, 0.9070618877156372, 0.9062221467202782, 0.9073973342915684, 0.9070091640290748, 0.9069194505425523, 0.9076988378451645, 0.906867939059244, 0.9076533967066146, 0.9071905825483876, 0.9069997997502899, 0.9085697793472078, 0.9064383038980681, 0.9084540801278929, 0.9078881662377327, 0.9097554468311569, 0.9075764979569143, 0.9080758012497693, 0.9100758761717828, 0.9084066370371705, 0.9068099700280247, 0.9103345376443237, 0.9078587427892838, 0.9078531292368524, 0.9084809795025697, 0.9101616105046846, 0.9076584250500959, 0.908961334539405, 0.9093612021257405, 0.9087945698922659, 0.9089112910027385, 0.9070911131781951, 0.9080801473272215, 0.9071191375027923, 0.9073329039203291, 0.9070008013397324, 0.9033259268674725, 0.9046480674832024, 0.905377143437698, 0.9049296490872221, 0.905916125506268, 0.9045117130986654, 0.903570124483833, 0.9011852912739302, 0.9022620120075209, 0.9020510336432854, 0.9026970508811651, 0.9005856427109297, 0.9017157277511003, 0.9010924651878346, 0.8974779748696037, 0.9006229945054779, 0.8996807261576656, 0.8960330228676785, 0.89730972092655, 0.8970846382846217, 0.8974061026444664, 0.8919056321958205, 0.8920773772127835, 0.8888686061672892, 0.8866868019472351, 0.8824841425675439]
selected_features = ['ip1', 'untrustscore', 'client_code', 'thrcnt_month', 'correlatedcount', 'categoryname_Control and Maintain', 'srcportcategory_dominate_0.0', 'ip2', 'categoryname_Exploit', 'overallseverity', 'srcipcategory_dominate_INTERNET', 'dstportcategory_dominate_1.0', 'n5', 'n7', 'n2', 'flowscore', 'score', 'n4', 'ipcategory_name_PRIV-CGN', 'srcportcategory_dominate_2.0', 'p5d', 'parent_category_1', 'dstport_cd', 'enforcementscore', 'dstipcategory_dominate_PRIV-10', 'dstip_cd', 'ipcategory_scope_Private network', 'dstipcategory_dominate_PRIV-CGN', 'reportingdevice_cd', 'protocol_cd', 'devicevendor_cd', 'categoryname_Attack Preparation', 'srcportcategory_dominate_1.0', 'isiptrusted_1.0', 'srcipcategory_dominate_PRIV-10', 'srcipcategory_dominate_LOOPBACK', 'dstipcategory_cd', 'dstipcategory_dominate_PRIV-192', 'ipcategory_name_BROADCAST', 'categoryname_Suspicious Network Activity', 'ipcategory_scope_Subnet', 'ipcategory_name_PRIV-172', 'alerttype_cd', 'thrcnt_week', 'eventname_cd', 'ipcategory_scope_Host', 'p8d', 'categoryname_Suspicious Reputation', 'devicetype_cd', 'domain_cd', 'dstportcategory_dominate_4.0', 'categoryname_Compromise', 'grandparent_category_B', 'categoryname_Reconnaissance', 'weekday_y', 'ipcategory_name_PRIV-192', 'dstipcategory_dominate_PRIV-172', 'srcportcategory_dominate_3.0', 'severity_cd', 'srcipcategory_cd', 'n8', 'srcipcategory_dominate_PRIV-172', 'n1', 'n6', 'trustscore', 'dstportcategory_dominate_3.0', 'parent_category_4', 'username_cd', 'timestamp_dist', 'ipcategory_name_PRIV-10', 'n9', 'ip3', 'n3', 'categoryname_Malicious Activity', 'direction_cd', 'start_hour_y', 'srcipcategory_dominate_PRIV-CGN', 'dstportcategory_dominate_0.0', 'srcip_cd', 'ip4', 'p9', 'p5w', 'ipcategory_name_LOOPBACK', 'thrcnt_day', 'p8w', 'srcport_cd', 'p6', 'n10', 'p5m', 'start_hour_x', 'srcipcategory_dominate_LINK-LOCAL', 'p8m', 'parent_category_3', 'weekday_x', 'start_minute_x', 'start_minute_y', 'ipcategory_name_LINK-LOCAL', 'start_second_y', 
'start_second_x']
# 0-based position of the highest score in `aucs`.
ind = np.argmax(aucs)

In [None]:
# Reload the raw training and test files so this cell does not depend on the
# filtered/transformed frames produced by earlier cells.
df = pd.read_csv('drive/MyDrive/QED/cybersecurity_training.csv', sep='|')
df = df.drop(columns=deleted)
y_train = df["notified"]
X_train = df.drop(columns=["notified"])
x_test = pd.read_csv('drive/MyDrive/QED/cybersecurity_test.csv', sep='|')
x_test = x_test.drop(columns=deleted)

# Same preprocessing as for feature selection, plus imputation steps, and an
# ordinal encoder that maps unseen `level` values to -1.
final_pipe = Pipeline([
    ["EncryptedIPTransformer()", EncryptedIPTransformer()],
    ["NanReplacer", NanReplacer(nan_fetaures)],
    ["SelectedFeaturesNanSimpleImputerQuan", SelectedFeaturesNanSimpleImputer(quantitative, strategy = 'mean')],
    ["SelectedFeaturesNanSimpleImputerQual", SelectedFeaturesNanSimpleImputer(qualitative+cyclic+level+cyclic_level, strategy='most_frequent')],
    ["QualitativeToNumericTransformer", QualitativeToNumericTransformer(qualitative)],
    ["SelectedFeaturesOrdinalEncoder1", SelectedFeaturesOrdinalEncoder(level,
                                                                       handle_unknown = 'use_encoded_value',
                                                                       unknown_value=-1)],
    ["SelectedFeaturesOrdinalEncoder2", SelectedFeaturesOrdinalEncoder(cyclic_level, categories=[weekdays])],
    ["CyclicalToNumericTransformer", CyclicalToNumericTransformer(cyclic+cyclic_level)],
])

# The selection loop records one score per added feature, so the score at
# position `ind` belongs to the first `ind + 1` features. The original slice
# `[:ind]` dropped the feature that produced the best score (and selected
# nothing at all when ind == 0).
best_features = selected_features[:ind + 1]

X_train = final_pipe.fit_transform(X_train)[best_features]
# A fixed seed makes the submitted predictions repeatable.
clfr = RandomForestClassifier(random_state=0)
clfr.fit(X_train, y_train)

x_test = final_pipe.transform(x_test)[best_features]
# Probability of the positive class, written one value per line.
y_pred = clfr.predict_proba(x_test)[:,1]
y_pred = pd.DataFrame(y_pred)
y_pred.to_csv('drive/MyDrive/QED/mkac_solution.txt', index = False, header = False)