In [None]:
from time import time
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import cross_val_score

from sklearn.ensemble import IsolationForest
from sklearn.metrics import roc_curve, auc
from sklearn.datasets import fetch_kddcup99
%matplotlib inline

In [None]:
# KDD Cup 1999 task description:
# http://www.kdd.org/kdd-cup/view/kdd-cup-1999/Tasks
# Fetch the shuffled 10% sample without restricting it to a subset.
dataset = fetch_kddcup99(percent10=True, shuffle=True, subset=None)
X, y = dataset.data, dataset.target

In [None]:
# Column names for the 41 KDD Cup 99 features, in the order used to label
# the raw feature array below.
feature_cols = [
    'duration', 'protocol_type', 'service', 'flag', 'src_bytes', 'dst_bytes',
    'land', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins',
    'logged_in', 'num_compromised', 'root_shell', 'su_attempted', 'num_root',
    'num_file_creations', 'num_shells', 'num_access_files',
    'num_outbound_cmds', 'is_host_login', 'is_guest_login', 'count',
    'srv_count', 'serror_rate',
    'srv_serror_rate',  # was misspelled 'srv_serrer_rate'
    'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate',
    'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
    'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
    'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
    'dst_host_serror_rate', 'dst_host_srv_serror_rate',
    'dst_host_rerror_rate', 'dst_host_srv_rerror_rate',
]
X = pd.DataFrame(X, columns=feature_cols)

y = pd.Series(y)

In [None]:
# Preview the first five rows of the labelled feature frame.
X.head(n=5)

In [None]:
# Best-effort numeric cast: every column that can be represented as float is
# converted; columns that raise ValueError are left untouched.
for name in X.columns:
    column = X[name]
    try:
        as_float = column.astype(float)
    except ValueError:
        continue
    X[name] = as_float

In [None]:
# One-hot encode the three categorical KDD features. The columns are named
# explicitly (instead of relying on dtype inference plus a positional prefix
# list) so each prefix is guaranteed to be paired with its own column.
# The prefixes themselves are unchanged, so the resulting column names are
# the same as before.
categorical_cols = ['protocol_type', 'service', 'flag']
X = pd.get_dummies(
    X,
    columns=categorical_cols,
    prefix={col: col + '_' for col in categorical_cols},
    drop_first=True,  # drop the first level of each feature
)

In [None]:
# Preview the first five rows after one-hot encoding.
X.head(n=5)

In [None]:
# Absolute number of rows per label.
y.value_counts(normalize=False)

Será que o problema é a lista `y`, que tem texto e categorias em vez de números?

In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the labels in `y` as integer class ids and show the learned classes.
le = LabelEncoder()
y_integer = le.fit_transform(y)
print(le.classes_)

In [None]:
# Classification tree limited to max_depth=7: estimate accuracy with 5-fold
# cross-validation, then fit the tree on all of the data.
from sklearn.tree import DecisionTreeClassifier, export_graphviz

treeclf = DecisionTreeClassifier(max_depth=7)

scores = cross_val_score(
    estimator=treeclf,
    X=X,
    y=y_integer,
    cv=5,
    scoring='accuracy',
)
print(scores.mean())

treeclf.fit(X, y_integer)

In [None]:
# create a Graphviz file
export_graphviz(treeclf, out_file='tree_kdd.dot', feature_names=X.columns)

# At the command line, run this to convert to PNG:
!dot -Tpng tree_kdd.dot -o tree_kdd.png

<img src="https://github.com/nunoaflopes/IA4cyber-Livro2-Hands-on-ML-for-Cyber-Security-Packt/blob/master/Chapter07/tree_kdd.png?raw=1">

In [None]:
# Ten features with the highest importance in the fitted decision tree.
tree_importances = pd.DataFrame({
    'feature': X.columns,
    'importance': treeclf.feature_importances_,
})
tree_importances.sort_values('importance', ascending=False).head(10)

In [None]:
# How about a Random Forest?
# Same protocol as for the single tree: 5-fold cross-validated accuracy,
# then a fit on all of the data.
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()

scores = cross_val_score(
    estimator=rf,
    X=X,
    y=y_integer,
    cv=5,
    scoring='accuracy',
)
print(scores.mean())  # nicer

rf.fit(X, y_integer)

In [None]:
# Ten features with the highest importance in the fitted random forest
# (more spread out importances than a single decision tree).
rf_importances = pd.DataFrame({
    'feature': X.columns,
    'importance': rf.feature_importances_,
})
rf_importances.sort_values('importance', ascending=False).head(10)

# Anomaly Detection
“An outlier is an observation in a data set which appears to be inconsistent with the remainder of that set of data.”

- Supervised Anomaly Detection

    - Labels available for both normal data and anomalies
    - Similar to rare class mining / imbalanced classification

- Unsupervised Anomaly Detection (Outlier Detection)
    - No labels; the training set contains both normal and abnormal data
    - Assumption: anomalies are very rare

- Semi-supervised Anomaly Detection (Novelty Detection)
    - Only normal data available to train
    - The algorithm learns on normal data only

## Isolation Forest

The IsolationForest ‘isolates’ observations by randomly selecting a feature and then randomly selecting a split value between the maximum and minimum values of the selected feature.

Since recursive partitioning can be represented by a tree structure, the number of splittings required to isolate a sample is equivalent to the path length from the root node to the terminating node.

This path length, averaged over a forest of such random trees, is a measure of normality and our decision function.

Random partitioning produces noticeably shorter paths for anomalies. Hence, when a forest of random trees collectively produces shorter path lengths for particular samples, those samples are highly likely to be anomalies.

In [None]:
# Supervised and Outlier Detection with KDD

# In this example, we will want to use binary data where 1 will represent a "not-normal" attack

In [None]:
from sklearn.model_selection import train_test_split

# Binary target: True for every record that is NOT labelled as normal traffic.
# The label is matched in both text and bytes form because, on Python 3, a
# bytes label never compares equal to the str 'normal.', which would mark
# every row (including normal traffic) as an attack.
normal_labels = ['normal.', b'normal.']
y_binary = ~y.isin(normal_labels)

In [None]:
# Peek at the first five binary labels.
y_binary.head(n=5)


In [None]:
# Hold out a test set. The split is seeded so that the accuracies reported
# further down can be reproduced on a re-run.
X_train, X_test, y_train, y_test = train_test_split(X, y_binary, random_state=42)


In [None]:
# Class proportions in the test labels: the share of the majority class is
# the null accuracy to beat.
y_test.value_counts(normalize=True, sort=True)


In [None]:
# Unsupervised model; seeded so the random trees (and therefore the scores
# examined below) are reproducible across runs.
model = IsolationForest(random_state=42)
model.fit(X_train)  # notice that there is no y in the .fit

In [None]:
# Raw IsolationForest labels for the test set, followed by a count of how
# often each label occurs.
y_predicted = model.predict(X_test)

predicted_counts = pd.Series(y_predicted).value_counts()
predicted_counts

In [None]:
# Collapse the predictions to 0/1: a label of 1 stays 1, anything else
# becomes 0.
# NOTE(review): this maps the forest's label 1 onto the positive ("attack")
# class of y_binary - confirm that this orientation is intended.
y_predicted = (y_predicted == 1).astype(int)

pd.Series(y_predicted).value_counts()  # that's better

In [None]:
# Per-sample decision scores for the test set.
scores = model.decision_function(X=X_test)

scores  # the smaller, the more anomalous

In [None]:
# Histogram of the decision scores.
score_series = pd.Series(scores)
score_series.hist()

In [None]:
from sklearn.metrics import accuracy_score
# Threshold the decision scores at 0: scores below the threshold become 0,
# everything else becomes 1.
preds = np.where(scores < 0, 0, 1)  # customize threshold
# Arguments are given in scikit-learn's documented (y_true, y_pred) order.
accuracy_score(y_test, preds)

In [None]:
# Sweep the decision-score threshold and report the accuracy at each value.
# NOTE(review): the first threshold is -2 while the others step by .05;
# it may have been meant to be -.2 - confirm.
for t in (-2, -.15, -.1, -.05, 0, .05):
    preds = np.where(scores < t, 0, 1)  # customize threshold
    # Single-argument print() gives the same "t accuracy" line on Python 2
    # and Python 3 (the old print statement is a SyntaxError on Python 3).
    print("%s %s" % (t, accuracy_score(y_test, preds)))

In [None]:
## -0.05 0.816988648325 gives us better than the null accuracy, without the model ever seeing a label during training
# This shows how we can achieve predictive results without labeled data


# This is an interesting use case of novelty detection because generally, when given labels,
# we do not use such tactics.

In [None]:
# Detecting Malicious Urls

# Load the data
try:
    from urllib.parse import urlparse  # Python 3 location
except ImportError:
    from urlparse import urlparse  # Python 2 location
import pandas as pd

urls = pd.read_json("../data/urls.json")
print(urls.shape)
# Prefix every raw URL string with an explicit scheme.
urls['string'] = "http://" + urls['string']

In [None]:
# Preview the first ten URL records.
urls.head(n=10)


In [None]:
# Features are the URL strings; the target is the 'truth' column.
X = urls['string']
y = urls['truth']


In [None]:
X.head(n=5)  # look at X



In [None]:
# Null accuracy: class proportions of the target, where 0 is not malicious.
y.value_counts(normalize=True, sort=True)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer


# Create a function called custom_tokenizer that takes in a string and outputs a list of tokens of the string.
import re

def custom_tokenizer(string):
    """Tokenize a URL string.

    The string is parsed with ``urlparse``; every non-empty component of the
    parse result is split on '.' and '-', and the resulting pieces are
    returned as one flat list in component order.
    """
    splitter = re.compile("[.-]")
    pieces = []
    for component in urlparse(string):
        if not component:
            continue  # skip empty URL components
        pieces += splitter.split(component)
    return pieces

In [None]:
# Show the tokens for a bare host name and for a full URL with a query string.
# print() with a single argument behaves the same on Python 2 and Python 3.
print(custom_tokenizer('google.com'))

print(custom_tokenizer('https://google-so-not-fake.com?fake=False&seriously=True'))


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression


# Token counts from the custom URL tokenizer feed a logistic regression.
vect = CountVectorizer(tokenizer=custom_tokenizer)
lr = LogisticRegression()
lr_pipe = Pipeline(steps=[
    ('vect', vect),
    ('model', lr),
])

In [None]:
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split


# Mean 5-fold cross-validation score of the logistic-regression pipeline.
scores = cross_val_score(estimator=lr_pipe, X=X, y=y, cv=5)

scores.mean()  # not good enough!!

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

# Same vectorizer, but with a 500-tree random forest as the model.
rf_model = RandomForestClassifier(n_estimators=500)
rf_pipe = Pipeline(steps=[
    ('vect', vect),
    ('model', rf_model),
])
scores = cross_val_score(estimator=rf_pipe, X=X, y=y, cv=5)

scores.mean()  # not as good

In [None]:
# Hold out a test set (seeded, so the confusion matrices below are
# reproducible on a re-run), fit the random-forest pipeline on the training
# part and inspect its errors on the held-out part.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

from sklearn.metrics import confusion_matrix

rf_pipe.fit(X_train, y_train)

preds = rf_pipe.predict(X_test)
# print() call form works on both Python 2 and Python 3.
print(confusion_matrix(y_test, preds))  # hmmmm

In [None]:
# Second column of predict_proba: predicted probabilities of malicious-ness.
# NOTE(review): assumes column 1 corresponds to the malicious class - confirm
# against the fitted model's classes_.
class_probabilities = rf_pipe.predict_proba(X_test)
probs = class_probabilities[:, 1]

In [None]:
import numpy as np  # play with threshold to alter false positive/negative rate

# For each probability cut-off, label a URL 1 when its predicted probability
# reaches the cut-off and print the resulting confusion matrix.
# print() call form is used so the cell runs on Python 2 and Python 3.
for thresh in [.1, .2, .3, .4, .5, .6, .7, .8, .9]:
    preds = np.where(probs >= thresh, 1, 0)
    print(thresh)
    print(confusion_matrix(y_test, preds))
    print("")  # blank separator line

In [None]:
# Ten tokens with the highest importance in the fitted random-forest pipeline.
url_vectorizer = rf_pipe.steps[0][1]
url_forest = rf_pipe.steps[-1][1]
# Newer scikit-learn releases expose get_feature_names_out() and no longer
# provide get_feature_names(); use whichever this installation has.
get_token_names = (
    getattr(url_vectorizer, 'get_feature_names_out', None)
    or url_vectorizer.get_feature_names
)
pd.DataFrame({
    'feature': get_token_names(),
    'importance': url_forest.feature_importances_,
}).sort_values('importance', ascending=False).head(10)

In [None]:
# Build the vectorizer BEFORE the pipeline so the tree gets its own instance.
# Previously the new vectorizer was created only after the pipeline had been
# assembled, so tree_pipe kept the vectorizer object already used by rf_pipe,
# and tree_pipe.fit() refitted that shared object in place.
vect = CountVectorizer(tokenizer=custom_tokenizer)

treeclf = DecisionTreeClassifier(max_depth=7)

tree_pipe = Pipeline([('vect', vect), ('model', treeclf)])

scores = cross_val_score(tree_pipe, X, y, scoring='accuracy', cv=5)

# print() call form works on both Python 2 and Python 3.
print(np.mean(scores))

tree_pipe.fit(X, y)

# Newer scikit-learn releases expose get_feature_names_out() and no longer
# provide get_feature_names(); use whichever this installation has.
tree_vectorizer = tree_pipe.steps[0][1]
get_token_names = (
    getattr(tree_vectorizer, 'get_feature_names_out', None)
    or tree_vectorizer.get_feature_names
)

# Write the fitted tree to a Graphviz .dot file, labelled with the tokens.
export_graphviz(tree_pipe.steps[1][1], out_file='tree_urls.dot', feature_names=get_token_names())

<img src="https://github.com/nunoaflopes/IA4cyber-Livro2-Hands-on-ML-for-Cyber-Security-Packt/blob/master/Chapter07/tree_urls.png?raw=1">

In [None]:
# eg. if a url has "verifiziren" in it, it is VERY likely malicious

In [None]:
# Longer Credit Card Fraud Detection

https://github.com/sinanuozdemir/blackhat-vegas-ml/blob/master/Fraud%20Detection.ipynb