# Anomaly Detection Challenge 4
## Miguel Sandim and Paula Fortuna

## 0 - Library Imports & Utils

In [1]:
# json
import json

# math
import math

# pandas
import pandas as pd
from pandas import Series,DataFrame

# numpy, matplotlib, seaborn
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Random libraries and seeds:
import random
random.seed(2)
np.random.seed(2)

pd.set_option('display.max_columns', None)

In [2]:
# From: http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html
import itertools
from sklearn.metrics import confusion_matrix

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    Print and plot a confusion matrix.

    Parameters
    ----------
    cm : array-like of shape (n_classes, n_classes)
        Confusion matrix, rows are true labels and columns are predictions.
    classes : sequence
        Tick labels, one per class, in the same order as the rows of `cm`.
    normalize : bool, default False
        When True, every row is divided by its total so that each cell holds
        the fraction of that true class.
    title : str
        Title of the plot.
    cmap : matplotlib colormap
        Colormap used for the heat map.

    Returns
    -------
    None. The matrix is printed and drawn on the current matplotlib figure.
    """
    cm = np.asarray(cm)

    # Normalize BEFORE drawing, so the heat map, the colour threshold and the
    # printed cell values all describe the same matrix.
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Rows without any true sample stay at 0 instead of becoming NaN.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype='float'),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Two decimals for fractions, plain integers for raw counts.
    fmt = '.2f' if cm.dtype.kind == 'f' else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

######################################
# Function Save Data To CSV
######################################

def saveDataToCSV(Y_pred):
    """Write predictions to 'submission.csv' with columns Id (1-based) and Label."""
    n_predictions = len(Y_pred)
    columns = {
        "Id": range(1, n_predictions + 1),
        "Label": Y_pred
    }
    # Select the columns explicitly so "Id" always precedes "Label" in the file.
    pd.DataFrame(columns)[['Id', 'Label']].to_csv('submission.csv', index=False)


# 1 - Data Reading

### 1.1 Load data

In [3]:
def readData(filename):
    """Load a JSON-lines file into a flat DataFrame.

    Each non-empty line of `filename` is parsed as one JSON document; nested
    objects are flattened into dot-separated column names
    (e.g. ``results.peinfo.debug``).

    Parameters
    ----------
    filename : str
        Path of the file to read (one JSON document per line, UTF-8).

    Returns
    -------
    pandas.DataFrame
        One row per JSON document.

    Raises
    ------
    ValueError
        (``json.JSONDecodeError``) if a non-empty line is not valid JSON.
    """
    records = []
    with open(filename, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            # Tolerate blank lines (e.g. a trailing newline at the end of the file).
            if line:
                records.append(json.loads(line))

    # `pd.json_normalize` is the public entry point since pandas 1.0; fall back
    # to the old location only on installations that predate it.
    normalize = getattr(pd, "json_normalize", None)
    if normalize is None:
        normalize = pd.io.json.json_normalize
    return normalize(records)

In [4]:
# NOTE: despite the ".csv" extension, readData() parses this file as one JSON document per line.
train_df = readData("data/training_set_dedup.csv")

In [5]:
# NOTE: despite the ".csv" extension, readData() parses this file as one JSON document per line.
test_df = readData("data/test_set_dedup.csv")

In [None]:
train_df

### Save data for Paula:

In [6]:
train_df.to_csv("train_df.csv", index = False, encoding='utf-8')

In [7]:
test_df.to_csv("test_df.csv", index = False, encoding='utf-8')

### Some statistics:

In [8]:
# Preview only the first rows; rendering the whole frame bloats the notebook.
train_df.head(10)

Unnamed: 0,label,results.peinfo.debug,results.peinfo.error,results.peinfo.exports,results.peinfo.imphash,results.peinfo.imports,results.peinfo.pe_sections,results.peinfo.pehash,results.peinfo.rich_header,results.peinfo.rich_header.checksum,results.peinfo.rich_header.sha256,results.peinfo.rich_header.values_parsed,results.peinfo.rich_header.values_raw,results.peinfo.thread_local_storage,results.peinfo.timestamp.human_timestamp,results.peinfo.timestamp.timestamp,results.peinfo.version_info,results.peinfo.version_var,results.sha256,sha256
0,benign,"[{'TimeDateStamp': 1366921401, 'TimeDateString...",,"[{'function': 'GetCommandManager'}, {'function...",4a128e034b29626b284abefbf44f9089,"[{'dll': 'msvcrt.dll', 'function': 'tolower'},...","[{'size': 188928, 'virt_size': 188590, 'virt_a...",510c88d075f79fb337c3d3d77d7963b2bf1f0e78,,1.738446e+08,4947798090ff53af2546e2606c4fad310bfa4480fbb3c6...,"[{'times_used': 1, 'id': 205, 'version': 65501...","[13500381, 1, 13565917, 13, 65536, 141, 133693...",,2013-04-25T20:23:21Z,1.366921e+09,"[{'key': 'LegalCopyright', 'value': 'Copyright...",[],001bd69f3e309ed655400f8145125a79ca0eb93479ac4f...,001bd69f3e309ed655400f8145125a79ca0eb93479ac4f...
1,benign,"[{'TimeDateStamp': 1171692007, 'TimeDateString...",,,ba7cbf0e758b850647e1d0073574c54e,"[{'dll': 'ADVAPI32.dll', 'function': 'RegQuery...","[{'size': 38912, 'virt_size': 38414, 'virt_add...",9dbf036adb61576094245c61b2bff72f4033483c,,9.995327e+08,60d5b7093fa9b1711caf14a3f03270db706261663a30d8...,"[{'times_used': 13, 'id': 93, 'version': 4035}...","[6098883, 13, 65536, 146, 987075, 9, 6295491, ...",,2007-02-17T06:00:07Z,1.171692e+09,"[{'key': 'LegalCopyright', 'value': '\xa9 Micr...",[],001d0ac7bd3ee8e988df300389a37445c54bb264020101...,001d0ac7bd3ee8e988df300389a37445c54bb264020101...
2,benign,"[{'TimeDateStamp': 1377137904, 'TimeDateString...",,"[{'function': 'DllCanUnloadNow'}, {'function':...",528b2992052d989f619b0052eeaeaa20,"[{'dll': 'msvcrt.dll', 'function': '??_V@YAXPA...","[{'size': 101376, 'virt_size': 101211, 'virt_a...",0853f028928ce31adb4e7a690e9f6ea31b87e27b,,3.701114e+08,6bd15d25b5d59324ffda966bc39144f134aaba12c22627...,"[{'times_used': 11, 'id': 126, 'version': 5072...","[8308263, 11, 9664521, 58, 13500381, 3, 136314...",,2013-08-22T02:18:24Z,1.377138e+09,"[{'key': 'LegalCopyright', 'value': '\xa9 Micr...",[],003cf74e9edbe9dfc3e32ade773e2470c91a528859ab2b...,003cf74e9edbe9dfc3e32ade773e2470c91a528859ab2b...
3,benign,"[{'TimeDateStamp': 1111711302, 'TimeDateString...",,"[{'function': 'CsrAddStaticServerThread'}, {'f...",9fdb1479a9bb89029d164541dcbc7fb1,"[{'dll': 'ntdll.dll', 'function': 'RtlFreeHeap...","[{'size': 43520, 'virt_size': 43416, 'virt_add...",6be43b55f661ce9c490f249d1547d32882bb2718,,2.071590e+09,8fa473e8cd57c3ed58445b0c13435da87c1f3b911d69d7...,"[{'times_used': 3, 'id': 123, 'version': 40310...","[8101238, 3, 65536, 117, 8232310, 1, 8035702, ...",,2005-03-25T03:53:52Z,1.111723e+09,"[{'key': 'LegalCopyright', 'value': '\xa9 Micr...",[],004132e2de0b55716f8f3b1c162e881f202945fa8836f9...,004132e2de0b55716f8f3b1c162e881f202945fa8836f9...
4,benign,"[{'TimeDateStamp': 1377166767, 'TimeDateString...",,"[{'function': 'DllRegisterServer'}, {'function...",829e5e983dc65830de4f874e2621dea5,"[{'dll': 'msvcrt.dll', 'function': 'wcschr'}, ...","[{'size': 476160, 'virt_size': 475694, 'virt_a...",034639e71972a27bc0be17a8aadbcb3f429d6a43,,1.196304e+09,723a1c363bd6d2ad676f0d88d0b342848681815332020e...,"[{'times_used': 1, 'id': 205, 'version': 65501...","[13500381, 1, 13565917, 65, 65536, 366, 133693...",,2013-08-22T10:19:27Z,1.377167e+09,"[{'key': 'LegalCopyright', 'value': '\xa9 Micr...",[],0044246249f4b945d72bbc0fef9bf3c31e62f57cbf7761...,0044246249f4b945d72bbc0fef9bf3c31e62f57cbf7761...
5,benign,,,,,,"[{'size': 2048, 'virt_size': 4096, 'virt_addre...",d0165f57b80c120c9c39b923e6341d33952c2133,,2.340238e+09,2ccb618821b38fda1aedff74a76110d6cd2f2371ada49b...,"[{'times_used': 1, 'id': 201, 'version': 65501...","[13238237, 1, 13434845, 1]",,1970-01-01T00:00:00Z,0.000000e+00,"[{'key': 'LegalCopyright', 'value': '\xa9 Micr...",[],0057c6638b45ff94bcbe327c87d36f261666de0e94cfb0...,0057c6638b45ff94bcbe327c87d36f261666de0e94cfb0...
6,benign,"[{'TimeDateStamp': 1111710486, 'TimeDateString...",,,9e4683def9f4497c47282b066fd8888b,"[{'dll': 'KERNEL32.dll', 'function': 'DeleteCr...","[{'size': 14848, 'virt_size': 14658, 'virt_add...",bf2a1bdf6f6be001d23a339714f51adba64c1593,,4.072897e+09,de5d72c001013de03980b4568897c9964fc5bbd88d6eb1...,"[{'times_used': 1, 'id': 15, 'version': 4035},...","[987075, 1, 6098883, 13, 65536, 90, 6229955, 1...",,2005-03-25T00:28:06Z,1.111710e+09,"[{'key': 'LegalCopyright', 'value': '\xa9 Micr...",[],006a8155bb0e93800078a2f9d2cd3a9c7839c99b328e26...,006a8155bb0e93800078a2f9d2cd3a9c7839c99b328e26...
7,benign,"[{'TimeDateStamp': 1132736680, 'TimeDateString...",,"[{'function': 'DllCanUnloadNow'}, {'function':...",0a2f547c09ae773ce66f8387e3db6f07,"[{'dll': 'msvcrt.dll', 'function': 'wcsncmp'},...","[{'size': 213504, 'virt_size': 213319, 'virt_a...",6f030efacc76fe34422313b8cec6bdfc09dea9ba,,2.985514e+09,535364e67b6a735b04dfdbf62fb44256cd60644c274fce...,"[{'times_used': 3, 'id': 110, 'version': 31001...","[7239961, 3, 65536, 288, 6098883, 33, 987075, ...",,2005-11-23T10:18:47Z,1.132741e+09,"[{'key': 'LegalCopyright', 'value': '\xa9 Micr...",[],00a2627a3d699b4ff599f871c4a276ff435ccbc5b685cf...,00a2627a3d699b4ff599f871c4a276ff435ccbc5b685cf...
8,benign,"[{'TimeDateStamp': 1045795662, 'TimeDateString...",,"[{'function': 'CreateCab'}, {'function': 'call...",c0b5767e9f9334929a4df19ca41c9ce0,"[{'dll': 'USER32.dll', 'function': 'GetClassLo...","[{'size': 5632, 'virt_size': 5316, 'virt_addre...",450c58b85a8414d9c69b6a4c2cac4827f93e9c78,,3.656823e+09,7b256a413e25bdb9bd6246c6c7b39b6bb3240ede191f5b...,"[{'times_used': 2, 'id': 93, 'version': 2148},...","[6096996, 2, 985188, 1, 6228068, 9, 1647610, 7...",,2003-02-21T02:47:42Z,1.045796e+09,"[{'key': 'LegalCopyright', 'value': 'Copyright...",[],00a7bffa80d179f7b169d0c3eed177bd6ace267fb3ef43...,00a7bffa80d179f7b169d0c3eed177bd6ace267fb3ef43...
9,benign,"[{'TimeDateStamp': 1329537515, 'TimeDateString...",,,77b35f28a42119f8f0e46a481132f34f,"[{'dll': 'KERNEL32.dll', 'function': 'CloseHan...","[{'size': 8704, 'virt_size': 8620, 'virt_addre...",ebb5895b098e110b42767018f1698b45bcd13a5b,,1.326691e+09,d30f949302e81af749d59fafc81ce41006e3c88e719b29...,"[{'times_used': 2, 'id': 187, 'version': 30716...","[12285948, 2, 12417020, 16, 12154876, 13, 6553...",,2012-02-18T03:58:35Z,1.329538e+09,"[{'key': 'LegalCopyright', 'value': '\xa9 Micr...",[],00ab6a08ec74d80c0d202ef451d4831076ebe80a1cd9cf...,00ab6a08ec74d80c0d202ef451d4831076ebe80a1cd9cf...


In [13]:
type(train_df["results.peinfo.exports"][0][0])

dict

In [33]:
math.isnan(train_df["results.peinfo.debug"][20])

True

In [14]:
train_df["label"].value_counts()

malicious    5000
benign       4754
Name: label, dtype: int64

In [None]:
41/(41 + 56000) * 100

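## About 0.07% of our dataset are anomalous cases (41 out of 56,041, per the computation above).

**Note:** this figure does not agree with the `label` counts shown earlier (5000 malicious vs. 4754 benign); it appears to be left over from a different dataset and should be re-checked.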

# 2 - Data Pre-Processing

In [None]:
import sklearn.preprocessing as skpre

## 2.1 - Solve Missing Values

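No missing values exist! (**TODO:** re-check this claim — the `train_df` preview above shows empty cells, e.g. `results.peinfo.debug` for row 5, and `math.isnan(train_df["results.peinfo.debug"][20])` returned `True`.)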

## 2.2 - Feature Extraction

Change label to numeric

In [18]:
# 1 for "malicious", 0 for anything else; vectorized instead of a row-wise apply.
train_df["label_numeric"] = (train_df["label"] == "malicious").astype(int)

In [43]:
# 1 when the debug field holds a list of entries, 0 when it is missing (NaN).
# Mapping over the single column avoids building a Series for every row.
train_df["has_debug"] = train_df["results.peinfo.debug"].map(lambda value: 1 if type(value) is list else 0)

In [45]:
# 1 when the debug field holds a list of entries, 0 when it is missing (NaN).
# Mapping over the single column avoids building a Series for every row.
train_df["has_debug"] = train_df["results.peinfo.debug"].map(lambda value: 1 if type(value) is list else 0)

0

## 2.4 Sampling

### 2.4.1 Undersampling

In [None]:
def undersampling(data, label_col="label", normal_value=0, anomaly_value=1):
    """Balance `data` by keeping all anomalies and an equally sized random
    subset of the normal rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the label column.
    label_col : str, default "label"
        Name of the column holding the class label.
    normal_value, anomaly_value : default 0 and 1
        Values of `label_col` marking the majority (normal) and minority
        (anomalous) classes.

    Returns
    -------
    pandas.DataFrame
        The sampled normal rows followed by every anomalous row. Normal rows
        are drawn without replacement (via the global NumPy random state), so
        no row appears twice.
    """
    labels = data[label_col]
    # Work with row POSITIONS so the final `.iloc` is correct for any index
    # (the index labels are not positions once the frame was filtered/shuffled).
    normal_positions = np.flatnonzero(np.asarray(labels == normal_value))
    anomaly_positions = np.flatnonzero(np.asarray(labels == anomaly_value))

    # Cannot draw more distinct normal rows than exist.
    sample_size = min(len(anomaly_positions), len(normal_positions))
    if sample_size == 0:
        sampled_normals = normal_positions[:0]
    else:
        sampled_normals = np.random.choice(normal_positions, size=sample_size, replace=False)

    final_positions = np.append(sampled_normals, anomaly_positions)
    return data.iloc[final_positions]
    
#train_df = undersampling(train_df)  

### 2.4.2 Oversampling

In [None]:
def oversampling(data, label_col="fake_review", anomaly_value="Y", n_copies=6):
    """Oversample the anomalous rows by appending extra copies of them.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the label column.
    label_col : str, default "fake_review"
        Column holding the class label. The default is kept for backward
        compatibility; for the frame in this notebook pass ``label_col="label"``
        and ``anomaly_value="malicious"``.
    anomaly_value : default "Y"
        Value of `label_col` that marks an anomalous row.
    n_copies : int, default 6
        How many extra copies of the anomalous rows are appended.

    Returns
    -------
    pandas.DataFrame
        `data` followed by `n_copies` copies of its anomalous rows (original
        index labels are kept, so they repeat).
    """
    anomalies = data[data[label_col] == anomaly_value]
    # One concat instead of repeated DataFrame.append (removed in pandas 2.0).
    return pd.concat([data] + [anomalies] * n_copies)

#train_df_joined = oversampling(train_df_joined)  

## 2.5 Define global variables for the model

In [None]:
def encodeVariables():
    """Build the model matrices from the global ``train_df`` / ``test_df`` frames.

    Steps performed:
      1. drop every column named in ``dropVariableList`` from both frames;
      2. save the reduced training frame (target still included) to ``X_train.csv``;
      3. drop the target column and ``"attack_cat"`` from the training features;
      4. take ``train_df["label"]`` as the target vector;
      5. assert that train and test features have the same number of columns.

    Returns
    -------
    tuple
        ``(X_train, Y_train, X_test)``.

    NOTE(review): the column names listed below ("id", "dur", "proto", ...,
    "attack_cat") do not appear among the ``train_df`` columns displayed earlier
    in this notebook, so ``drop`` would presumably raise ``KeyError`` on the
    current data. The list looks carried over from another dataset — confirm
    and update before enabling the call.
    """
    
    target_variable = "label" # target variable
    
    dropVariableList = [
        
        ### All variables in this list will be deleted
        "id",
        "dur",
        "proto", # categorical
        "service", # categorical
        "state", # categorical
        #"spkts",
        #"dpkts",
        #"sbytes",
        #"dbytes",
        #"rate",
        #"sttl",
        #"dttl",
        #"sload",
        #"dload",
        #"sloss",
        #"dloss",
        ##"sinpkt",
        #"dinpkt",
        ##"sjit",
        #"djit",
        ##"swin",
        #"stcpb",
        ##"dtcpb",
        ##"dwin",
        #"tcprtt",
        #"synack",
        #"ackdat",
        ##"smean",
        #"dmean",
        #"trans_depth",
        ##"response_body_len",
        #"ct_srv_src",
        ##"ct_state_ttl",
        #"ct_dst_ltm",
        ##"ct_src_dport_ltm",
        #"ct_dst_sport_ltm",
        ##"ct_dst_src_ltm",
        #"is_ftp_login",
        ##"ct_ftp_cmd",
        #"ct_flw_http_mthd",
        #"ct_src_ltm",
        ##"ct_srv_dst"
        #"is_sm_ips_ports",
      
        # features deleted by the importance being 0:
       'djit',
       'swin',
       'dtcpb',
       'is_ftp_login',
       'ct_ftp_cmd',
       'ct_flw_http_mthd',
       'proto1',
       'proto2'
        
        ### Our variables:
        #proto1,
        #proto2,
        #service1
        ]
 
    # Drop variables in the X_train:
    X_train = train_df.drop(dropVariableList, axis=1)
    X_train.to_csv("X_train.csv", index = False, encoding='utf-8') # Save this in a CSV before dropping target
    
    # Drop target variables in the X_train:
    X_train = X_train.drop([target_variable, "attack_cat"], axis=1)
    
    # Drop variables in the X_test:
    X_test = test_df.drop(dropVariableList, axis=1)
    
    # Select the target column as Y_train (taken from the untouched train_df):
    Y_train = train_df[target_variable]

    # The following variables are categorical:
    # - proto
    # - service
    # - state
    # - attack_cat (we're not using this one)
    # - label
    
    # Transform categorical variables for X_train:
    # (currently unused: the dummy-encoding code further down is commented out)
    categoricalVariableList = [
        #"proto",
        #"service",
        #"state"
    ]
    
    # Number of dimensions must be the same in X_train and X_test
    assert X_train.shape[1] == X_test.shape[1]
    
    # Apply dummy variables
    #X_all = X_train.append(X_test)
    #X_all = pd.get_dummies(X_all, columns = categoricalVariableList)
    
    #X_train = X_all.iloc[np.arange(0, X_train.shape[0]), :]
    #X_test = X_all.iloc[np.arange(X_train.shape[0], X_all.shape[0]), :]
    
    return X_train, Y_train, X_test

#train_df_joined = train_df_joined.iloc[np.random.permutation(len(train_df_joined))]

#X_train, Y_train, X_test = encodeVariables()

In [None]:
X_train.shape

In [None]:
train_df.shape

In [None]:
X_test.shape

In [None]:
test_df.shape

In [None]:
X_train.columns.values

In [None]:
def plotCorrelations(data):
    """Draw a heat map of the pairwise correlations between the numeric
    (and boolean) columns of `data`.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to analyse. Non-numeric columns (strings, lists, dicts, ...) are
        ignored.

    Returns
    -------
    None. The heat map is drawn on a new matplotlib figure.
    """
    plt.figure(figsize=(8, 6), dpi=500)
    # Restrict to numeric columns explicitly: recent pandas versions no longer
    # drop non-numeric columns silently in `corr()` and raise instead.
    numeric_data = data.select_dtypes(include=["number", "bool"])
    corr = numeric_data.corr()
    sns.heatmap(corr, 
                xticklabels=corr.columns.values,
                yticklabels=corr.columns.values)
    
#plotCorrelations(train_df)

In [None]:
import sklearn.decomposition

def performPCA(n_components = 30):
    """Replace the global `X_train` / `X_test` with their PCA projections.

    The PCA is fitted on `X_train` only and then applied to both sets.

    Parameters
    ----------
    n_components : int, default 30
        Number of principal components to keep.

    Notes
    -----
    This mutates the globals in place, so running it twice applies PCA to
    already-projected data. When the inputs are DataFrames the outputs are
    DataFrames too (same index, columns ``PC1..PCk``), so later cells that use
    ``X_train.columns`` / ``X_train.values`` keep working.
    """
    global X_train
    global X_test

    def project(frame):
        # Keep the DataFrame wrapper (and row index) when there was one.
        components = pca.transform(frame)
        if isinstance(frame, pd.DataFrame):
            names = ["PC%d" % (k + 1) for k in range(components.shape[1])]
            return pd.DataFrame(components, index=frame.index, columns=names)
        return components

    # Fixed seed, consistent with the rest of the notebook, for the solvers
    # that are randomized.
    pca = sklearn.decomposition.PCA(n_components = n_components, random_state = 2)
    pca.fit(X_train)
    X_train, X_test = project(X_train), project(X_test)
    
#performPCA()

### Normalize features

In [None]:
def featureNormalization():
    """Standardize the global `X_train` / `X_test` (zero mean, unit variance).

    The scaler is fitted on `X_train` only and then applied to both sets.

    Notes
    -----
    This mutates the globals in place. When the inputs are DataFrames the
    outputs are DataFrames with the same index and column names, so later
    cells that use ``X_train.columns`` / ``X_train.values`` keep working.
    """
    global X_train
    global X_test

    def scale(frame):
        # Keep the DataFrame wrapper (index and column names) when there was one.
        scaled = scaler.transform(frame)
        if isinstance(frame, pd.DataFrame):
            return pd.DataFrame(scaled, index=frame.index, columns=frame.columns)
        return scaled

    scaler = skpre.StandardScaler().fit(X_train)
    X_train, X_test = scale(X_train), scale(X_test)
    
#featureNormalization()

## 3.1 - Model selection based on which models do best in CV using default settings:

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Fit a default random forest; its feature importances are inspected in the next cells.
# NOTE(review): in the visible notebook X_train / Y_train are only produced by
# encodeVariables(), whose call is commented out — on a fresh kernel this cell
# presumably fails with NameError. Confirm before running top to bottom.
rf = RandomForestClassifier(random_state = 2).fit(X_train, Y_train)

In [None]:
X_train.columns.values

In [None]:
rf.feature_importances_

In [None]:
# Importance scores of the fitted forest and the matching column names.
fe_v = rf.feature_importances_
fe_n = X_train.columns.values

assert len(fe_v) == len(fe_n)

# (importance, name) pairs, most important feature first.
features = sorted(zip(fe_v, fe_n), key = lambda pair: pair[0], reverse = True)
features

In [None]:
# Inspired by http://machinelearningmastery.com/compare-machine-learning-algorithms-python-scikit-learn/

import sklearn.model_selection as mds
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
import xgboost as xgb

def modelSelection():
    """Compare several classifiers with default-ish settings.

    Every model is scored with 10-fold stratified cross-validation (ROC AUC)
    on the global `X_train` / `Y_train`. The mean and standard deviation of
    each model are printed, and the per-fold scores are shown as box plots.
    """
    num_folds = 10
    scoring = 'roc_auc'  # other metrics worth trying: 'f1'

    # Candidate models (the SVM variants were tried and are currently disabled).
    models = [
        ('LR', LogisticRegression(random_state = 2)),
        ('LDA', LinearDiscriminantAnalysis()),
        ('KNN', KNeighborsClassifier()),
        ('CART', DecisionTreeClassifier(random_state = 2)),
        ('NB', GaussianNB()),
        ('NN', MLPClassifier(random_state = 2)),
        ('RF', RandomForestClassifier(criterion="entropy", n_estimators=47, random_state = 2)),
        ('AB', AdaBoostClassifier(random_state = 2)),
        ('XGB', xgb.XGBClassifier()),
    ]

    # The integer seed makes every split() call yield the same folds, so a
    # single splitter can be shared by all models.
    kfold = mds.StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=2)

    results = []
    for model_name, model in models:
        cv_results = mds.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring)
        mean_score = cv_results.mean()
        std_score = cv_results.std()
        results.append({
            "name": model_name,
            "cv_results": cv_results,
            "mean": mean_score,
            "std": std_score,
        })
        print("%s: %f (%f)" % (model_name, mean_score, std_score))

    # Box plot of the per-fold scores, one box per model.
    fig = plt.figure(figsize=(13, 5), dpi=500)
    fig.suptitle('Algorithm Comparison')
    ax = fig.add_subplot(111)
    plt.boxplot([entry["cv_results"] for entry in results])
    ax.set_xticklabels([entry["name"] for entry in results])
    plt.show()

    # order the models by the mean auc
    #results_by_strategy.sort(key=lambda x: x["mean"], reverse=True)
    #print([(x["name"], x["mean"]) for x in results])

#modelSelection()

## 3.2 - Model selection based on anomaly detection techniques

In [None]:
import sklearn.ensemble
import sklearn.covariance
from sklearn import svm
import sklearn.metrics

def modelSelectionAnomaly(n_normal_eval = 41):
    """Evaluate a one-class anomaly detector on the global `X_train` / `Y_train`.

    The detector is fitted on the normal rows only (``Y_train != 1``) and then
    asked to classify (a) every anomalous row and (b) a random sample of
    normal rows. Detection rates and a ROC AUC over the hard predictions are
    printed.

    Parameters
    ----------
    n_normal_eval : int, default 41
        Number of normal rows sampled for the evaluation (capped at the number
        of normal rows available).

    Notes
    -----
    The sampled normal rows are part of the data the model was fitted on, so
    the "ok instances" rate is optimistic — it is not a held-out estimate.
    """
    np.random.seed(2)

    is_anomaly = np.asarray(Y_train == 1)
    X_values = X_train.values if hasattr(X_train, "values") else np.asarray(X_train)
    X_normal = X_values[~is_anomaly]
    X_anomalous = X_values[is_anomaly]

    # Draw the evaluation sample from the NORMAL rows only (sampling from all
    # rows could silently include anomalies among the "ok" instances).
    n_eval = min(n_normal_eval, X_normal.shape[0])
    ids_not_anomaly = np.random.choice(X_normal.shape[0], size = n_eval, replace = False)
    X_train_normal = X_normal[ids_not_anomaly, :]

    #model = sklearn.ensemble.IsolationForest(n_estimators = 60, contamination = 0.4, random_state = 2)
    # OneClassSVM no longer accepts `random_state` in current scikit-learn
    # (it was an unused, deprecated argument), so it is not passed.
    model = svm.OneClassSVM(nu=0.3, kernel="linear", gamma=0.1)
    #model = svm.OneClassSVM(nu=0.07, kernel="rbf", gamma=0.4)
    #model = sklearn.covariance.EllipticEnvelope()

    model.fit(X_normal)
    # predict() returns -1 for outliers (anomalies) and +1 for inliers.
    anomaly_results = model.predict(X_anomalous)
    normal_results = model.predict(X_train_normal)

    print("%f %% of the anomalies were correctly classified" % (100.0 * np.mean(anomaly_results == -1)))
    print("%f %% of the ok instances were correctly classified" % (100.0 * np.mean(normal_results == 1)))

    # Ground truth sized from the actual results rather than a fixed count.
    y_true = np.append(np.repeat(-1, len(anomaly_results)), np.repeat(1, len(normal_results)))
    print("ROC: %f" % 
          sklearn.metrics.roc_auc_score(y_true, np.append(anomaly_results, normal_results)))
    
#modelSelectionAnomaly()

# 4 - Submission to Kaggle

In [None]:
from sklearn.ensemble import RandomForestClassifier

# choose algorithm
def makeSubmissionKaggle(algorithm):
    """Fit `algorithm` on the global training data and write its integer
    predictions for the global `X_test` to the submission CSV."""
    algorithm.fit(X_train, Y_train)
    predictions = algorithm.predict(X_test).astype(int)

    # Persist the predictions via the shared CSV helper.
    saveDataToCSV(predictions)
    
def makeSubmissionAnomalyKaggle(algorithm):
    """Fit an anomaly detector and write a 0/1 submission file.

    Parameters
    ----------
    algorithm : estimator
        Outlier detector whose ``predict`` returns -1 for outliers and +1 for
        inliers (e.g. OneClassSVM, IsolationForest). It is fitted on the
        global `X_train` / `Y_train` and applied to the global `X_test`.

    Notes
    -----
    The submission label is 1 for anomalies and 0 for normal rows, matching
    the training target (anomalies are the rows with ``Y_train == 1``).
    """
    algorithm.fit(X_train, Y_train)
    Y_pred = algorithm.predict(X_test)
    Y_pred = Y_pred.astype(int)
    # Detector convention: -1 = outlier (anomaly) -> label 1; +1 = inlier -> label 0.
    Y_pred = [1 if x == -1 else 0 for x in Y_pred]

    # save data to CSV
    saveDataToCSV(Y_pred)
    
#makeSubmissionKaggle("07-spec-mean", xgb.XGBClassifier( learning_rate=0.01, n_estimators=5000, max_depth=7,
#                      min_child_weight=1, gamma=0, subsample=0.55, colsample_bytree=0.85,
#                      reg_alpha=1e-5, objective= 'binary:logistic', scale_pos_weight=1, seed=2))

#makeSubmissionKaggle(RandomForestClassifier(criterion="entropy", n_estimators=47, random_state = 2))
#makeSubmissionKaggle(xgb.XGBClassifier())

#makeSubmissionKaggle("09-spec-min", LinearDiscriminantAnalysis(n_components = 1, shrinkage = "auto", solver="lsqr"))
#makeSubmissionKaggle(RandomForestClassifier(criterion = 'entropy', max_features = 'log2', n_estimators = 630))
#makeSubmissionKaggle(RandomForestClassifier(criterion = 'entropy', max_features = 'log2', n_estimators = 310))
#makeSubmissionAnomalyKaggle(svm.OneClassSVM(nu=0.07, kernel="linear", gamma=0.1, random_state = 2))
#makeSubmissionKaggle(svm.OneClassSVM(nu=0.07, kernel="poly", gamma=0.1, random_state = 2))
#makeSubmissionAnomalyKaggle(sklearn.ensemble.IsolationForest(contamination = 0.4, random_state = 2))
#makeSubmissionKaggle(RandomForestClassifier())