# Introduction to applying machine learning in security

## Eugene Teo 張漢輝
*   eugene at temasek.org
*   http://www.temasek.org (淡馬錫)

## Syllabus in brief
We will cover:

*   Data wrangling
*   Data visualization
*   Feature engineering
*   RandomForest
*   Model evaluation



In [0]:
%matplotlib inline

In [0]:
# https://github.com/mouradmourafiq/pandas-summary
# http://blog.rstudio.com/2016/03/29/feather/
# !pip install -U pandas-summary tldextract feather-format yellowbrick imblearn
!pip install -U pandas-summary tldextract yellowbrick imblearn

In [0]:
!mkdir data
!wget https://raw.githubusercontent.com/ClickSecurity/data_hacking/master/dga_detection/data/alexa_100k.csv -P ./data
!wget https://raw.githubusercontent.com/ClickSecurity/data_hacking/master/dga_detection/data/dga_domains.txt -P ./data
!wget https://raw.githubusercontent.com/ClickSecurity/data_hacking/master/dga_detection/data/words.txt -P ./data

In [0]:
import pandas as pd
import numpy as np
from pandas_summary import DataFrameSummary

In [0]:
alexa_dataframe = pd.read_csv('data/alexa_100k.csv', names=['rank', 'uri'],
                              header=None, encoding='utf-8')
DataFrameSummary(alexa_dataframe).summary()

In [0]:
alexa_dataframe.sample(5)

In [0]:
import tldextract

def domain_extract(uri):
    """Return the ``domain`` part that tldextract finds in *uri*.

    When the extraction yields an empty suffix, ``np.nan`` is returned
    instead so the row can be removed later with ``dropna()``.
    """
    parts = tldextract.extract(uri)
    return parts.domain if parts.suffix else np.nan

In [0]:
alexa_dataframe['domain'] = [domain_extract(x) for x in alexa_dataframe['uri']]

In [0]:
DataFrameSummary(alexa_dataframe).summary()

In [0]:
alexa_dataframe[alexa_dataframe['domain'].isnull()].sample(5)

In [0]:
!grep -B1 -E '(28713|81751|49033)' data/alexa_100k.csv

In [0]:
del alexa_dataframe['rank']
del alexa_dataframe['uri']
alexa_dataframe['class'] = 'legit'
alexa_dataframe = alexa_dataframe.dropna()
alexa_dataframe = alexa_dataframe.drop_duplicates()
DataFrameSummary(alexa_dataframe).summary()

In [0]:
dga_dataframe = pd.read_csv('data/dga_domains.txt', header=None, names=['raw_domain'],
                            encoding='utf-8')
DataFrameSummary(dga_dataframe).summary()

In [0]:
dga_dataframe['raw_domain'][0]

In [0]:
dga_dataframe['raw_domain'][0].split('.')[0].strip().lower()

In [0]:
# Keep only the first label of each raw DGA entry, trimmed and lower-cased.
# Map over the 'raw_domain' column explicitly: assigning the result of a
# whole-frame applymap() to a single column only works while the frame has
# exactly one column, and DataFrame.applymap is deprecated in recent pandas.
dga_dataframe['domain'] = dga_dataframe['raw_domain'].map(lambda x: x.split('.')[0].strip().lower())

In [0]:
DataFrameSummary(dga_dataframe).summary()

In [0]:
del dga_dataframe['raw_domain']
dga_dataframe['class'] = 'dga'
dga_dataframe = dga_dataframe.drop_duplicates()
DataFrameSummary(dga_dataframe).summary()

In [0]:
# Unbalanced dataset
print(len(alexa_dataframe), len(dga_dataframe))

In [0]:
all_domains = pd.concat([alexa_dataframe, dga_dataframe], ignore_index=True)

In [0]:
# Feature engineering
all_domains['length'] = all_domains['domain'].map(len)

# Drop domains that are length <= 6, even for legit domains.
# .copy() makes the filtered frame independent of the original, so the
# feature columns added to it in later cells do not trigger
# SettingWithCopyWarning.
all_domains = all_domains[all_domains['length'] > 6].copy()

In [0]:
# https://rosettacode.org/wiki/Entropy#Python:_More_succinct_version
import math
from collections import Counter

def entropy(s):
    """Return the Shannon entropy of *s* in bits per symbol.

    Each distinct symbol's relative frequency p contributes
    ``-p * log2(p)``; an empty input has no symbols and yields 0.
    """
    size = float(len(s))
    probabilities = (occurrences / size for occurrences in Counter(s).values())
    return -sum(p * math.log(p, 2) for p in probabilities)

In [0]:
all_domains['entropy'] = [entropy(x) for x in all_domains['domain']]

In [0]:
import seaborn as sns
sns.set_style("whitegrid")

ax = sns.pairplot(data=all_domains, diag_kind = 'kde', hue = 'class',
                  markers = ["o", "D"], aspect = 1, plot_kws = {"s": 6})

In [0]:
ax = sns.boxplot(x='class', y='length', data=all_domains)

In [0]:
ax = sns.boxplot(x='class', y='entropy', data=all_domains)

In [0]:
ax = sns.lmplot(x='length', y='entropy', hue='class', data=all_domains, fit_reg=False)

In [0]:
all_domains.sort_values(['entropy'], ascending=False).head(10)

In [0]:
high_entropy_domains = all_domains[all_domains['entropy'] > 4]

print("entropy>4: {}% ({}) out of {}".format(high_entropy_domains.shape[0]/all_domains.shape[0]*100,
                                                           high_entropy_domains.shape[0], all_domains.shape[0]))
print("Legit domains with high entropy: {}".format(len(high_entropy_domains[high_entropy_domains['class'] == 'legit'])))
print("DGA domains with high entropy: {}".format(len(high_entropy_domains[high_entropy_domains['class'] == 'dga'])))

In [0]:
all_domains.sort_values(['length'], ascending=False).head(10)

In [0]:
all_domains[all_domains['domain'].str.contains('^xn--')]

In [0]:
# Select the punycode ("xn--" prefixed) domains and show their Unicode form
# by round-tripping each one through the 'idna' codec.
is_punycode = all_domains['domain'].str.contains('^xn--')
punycode = all_domains.loc[is_punycode, 'domain']
[label.encode('idna').decode('idna') for label in punycode]

In [0]:
all_domains.columns

In [0]:
# Feature matrix with one row per domain: [length, entropy].
# DataFrame.as_matrix() was removed in pandas 1.0; select the columns and
# convert with to_numpy() instead.
X = all_domains[['length', 'entropy']].to_numpy()
X[:5]

In [0]:
# Binary target vector: 1 for rows whose class is 'dga', 0 for everything else
y = np.where(all_domains['class'] == 'dga', 1, 0)
y[:5]

In [0]:
# http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedShuffleSplit.html
# This cross-validation object is a merge of StratifiedKFold and ShuffleSplit,
# which returns stratified randomized folds. The folds are made by preserving
# the percentage of samples for each class.

from sklearn.model_selection import StratifiedShuffleSplit

# A single stratified 75/25 shuffle split; with n_splits=1 the generator
# yields exactly one (train, test) index pair.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.25)
train_index, test_index = next(split.split(X, y))
X_train, y_train = X[train_index], y[train_index]
X_test, y_test = X[test_index], y[test_index]

# Ratio of class-1 (dga) to class-0 (legit) samples, train then test
for labels in (y_train, y_test):
    print(len(labels[labels == 1])/len(labels[labels == 0]))

In [0]:
from sklearn.ensemble import RandomForestClassifier

In [0]:
from sklearn.metrics import accuracy_score
def print_score(m):
    """Print the scores of the fitted estimator *m* as a list.

    The list holds ``m.score`` on the notebook-level ``X_train``/``y_train``
    split, then on ``X_test``/``y_test``, followed by ``m.oob_score_`` when
    the estimator has that attribute.
    """
    train_score = m.score(X_train, y_train)
    test_score = m.score(X_test, y_test)
    res = [train_score, test_score]
    if hasattr(m, 'oob_score_'):
        res.append(m.oob_score_)
    print(res)
    
clf = RandomForestClassifier(n_jobs=-1)
%time clf.fit(X_train, y_train)
print_score(clf)

In [0]:
# https://en.wikipedia.org/wiki/Cross-validation_(statistics)#/media/File:K-fold_cross_validation_EN.jpg
from sklearn.model_selection import cross_val_score

# 10-fold cross-validation on the training partition. cross_val_score
# refits a clone of clf on every fold, so running it on X_test would both
# train on the held-out data and estimate from a much smaller sample.
scores = cross_val_score(clf, X_train, y_train, cv=10, n_jobs=-1)
print(scores)

In [0]:
from yellowbrick.classifier import ConfusionMatrix

clf = RandomForestClassifier(n_jobs=-1)

# The ConfusionMatrix visualizer takes a model
cm = ConfusionMatrix(clf, classes=[0, 1])

# Fit fits the passed model. This is unnecessary if you pass the visualizer a pre-fitted model
cm.fit(X_train, y_train)

# To create the ConfusionMatrix, we need some test data. Score runs predict() on the data
# and then creates the confusion_matrix from scikit-learn.
cm.score(X_test, y_test)

# How did we do?
cm.poof()

In [0]:
from imblearn.over_sampling import SMOTE

# Oversample the minority class of the training split until both classes
# are the same size. The `ratio` keyword and `fit_sample()` were removed
# from imbalanced-learn; `sampling_strategy` and `fit_resample()` are the
# current equivalents.
sm = SMOTE(sampling_strategy=1.0)
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

In [0]:
from yellowbrick.classifier import ClassBalance

# Fit a fresh random forest through the ClassBalance visualizer, using the
# SMOTE-resampled training data produced in the previous cell.
m = RandomForestClassifier(n_jobs=-1)
visualizer = ClassBalance(m, classes=['legit', 'dga'])
%time visualizer.fit(X_train_res, y_train_res)
visualizer.score(X_train_res, y_train_res)  # Scored on the resampled training data, not the test split
g = visualizer.poof()             # Draw/show/poof the data
# print_score() reads the notebook-level X_train / X_test, i.e. the
# un-resampled split.
print_score(m)

## More feature engineering

In [0]:
from sklearn.feature_extraction.text import CountVectorizer

'''
 |  analyzer : string, {'word', 'char', 'char_wb'} or callable
 |      Whether the feature should be made of word or character n-grams.
 |      Option 'char_wb' creates character n-grams only from text inside
 |      word boundaries; n-grams at the edges of words are padded with space.
 |  
 |      If a callable is passed it is used to extract the sequence of features
 |      out of the raw, unprocessed input.
 |  
 |  ngram_range : tuple (min_n, max_n)
 |      The lower and upper boundary of the range of n-values for different
 |      n-grams to be extracted. All values of n such that min_n <= n <= max_n
 |      will be used.
 |  
 |  max_df : float in range [0.0, 1.0] or int, default=1.0
 |      When building the vocabulary ignore terms that have a document
 |      frequency strictly higher than the given threshold (corpus-specific
 |      stop words).
 |      If float, the parameter represents a proportion of documents, integer
 |      absolute counts.
 |      This parameter is ignored if vocabulary is not None.
 |  
 |  min_df : float in range [0.0, 1.0] or int, default=1
 |      When building the vocabulary ignore terms that have a document
 |      frequency strictly lower than the given threshold. This value is also
 |      called cut-off in the literature.
 |      If float, the parameter represents a proportion of documents, integer
 |      absolute counts.
 |      This parameter is ignored if vocabulary is not None.
 
min_df=1e-4 <- give me ngrams that happen in at least 0.01% of the domains
'''
ngram_vectorizer = CountVectorizer(analyzer='char', ngram_range=(3,5),
                          max_df=1.0, min_df=1e-4)

In [0]:
# Learn the character n-gram vocabulary from the Alexa (legit) domains.
ngram_features = ngram_vectorizer.fit_transform(alexa_dataframe['domain'])
# log10 of each n-gram's total count across all Alexa domains
ngram_counts = np.log10(ngram_features.sum(axis=0).getA1())
# get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() returns the same vocabulary as an ndarray.
ngrams = ngram_vectorizer.get_feature_names_out()
len(ngrams)

In [0]:
# ngram_df = pd.DataFrame(ngram_features.todense(), columns=ngrams)
ngram_df = pd.DataFrame([ngram_counts], columns=ngrams)

In [0]:
#ngram_substr = ngram_df.T[ngram_df.T[0] > 1].sort_values([0], ascending=True).head(10)
ngram_substr = ngram_df.T.sort_values([0], ascending=True).head(10)
ngram_substr

In [0]:
# [all_domains[(all_domains['domain'].str.contains(x) == True) & (all_domains['class'] == 'dga')].sort_values('entropy', ascending=False) for x in ngram_substr.T.columns]
[all_domains[all_domains['domain'].str.contains(x) == True].sort_values('entropy', ascending=False) for x in ngram_substr.T.columns]

In [0]:
ngram_substr = ngram_df.T.sort_values([0], ascending=False).head(10)
ngram_substr

In [0]:
[all_domains[(all_domains['domain'].str.contains(x) == True)].sort_values('entropy', ascending=False) for x in ngram_substr.T.columns]

In [0]:
# Read the dictionary word list as strings. The `np.str` alias was removed
# in NumPy 1.24; the builtin `str` is the exact equivalent.
word_dataframe = pd.read_csv('data/words.txt', header=None, names=['word'], dtype={'word': str}, encoding='utf-8')

In [0]:
DataFrameSummary(word_dataframe).summary()

In [0]:
word_dataframe = word_dataframe[word_dataframe['word'].map(lambda x: str(x).isalpha())]

In [0]:
# Normalise every word (trim whitespace, lower-case). Series.map on the
# single 'word' column replaces DataFrame.applymap, which is deprecated in
# recent pandas; assign() returns a new frame like applymap did.
word_dataframe = word_dataframe.assign(
    word=word_dataframe['word'].map(lambda x: str(x).strip().lower()))
word_dataframe = word_dataframe.dropna()
word_dataframe = word_dataframe.drop_duplicates()
word_dataframe.sample(10)

In [0]:
# Learn the character n-gram vocabulary (3 to 5 characters) from the
# dictionary words.
word_vectorizer = CountVectorizer(analyzer='char', ngram_range=(3,5), min_df=1e-5, max_df=1.0)
word_features = word_vectorizer.fit_transform(word_dataframe['word'])
# log10 of each n-gram's total count across all dictionary words
word_counts = np.log10(word_features.sum(axis=0).getA1())
# get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() returns the same vocabulary as an ndarray.
words = word_vectorizer.get_feature_names_out()
len(words)

# ngram_df = pd.DataFrame(ngram_features.todense(), columns=ngrams)
word_df = pd.DataFrame([word_counts], columns=words)

In [0]:
word_substr = word_df.T.sort_values([0], ascending=True).head(10)
word_substr

In [0]:
[all_domains[(all_domains['domain'].str.contains(x) == True)].sort_values('entropy', ascending=False) for x in word_substr.T.columns]

In [0]:
def ngram_count(domain):
    """Print the Alexa and dictionary n-gram scores of a single domain.

    Each score is the sum, over the character n-grams of *domain* that the
    corresponding fitted vectorizer knows, of that n-gram's log10 corpus
    count (``ngram_counts`` / ``word_counts``) times the number of times
    the n-gram occurs in *domain*.
    """
    # transform() returns a 1 x n_features sparse count row. An explicit
    # matrix-vector product states the intent directly instead of relying
    # on `ndarray * sparse_matrix` being dispatched as a matrix product.
    ngram_match = ngram_vectorizer.transform([domain]).dot(ngram_counts)
    word_match = word_vectorizer.transform([domain]).dot(word_counts)
    print("{} Alexa match: {} Word match: {}".format(domain, ngram_match, word_match))

In [0]:
ngram_count('xcaywkeyptvcvcrkmzrpvjbobgy')
ngram_count('google')
ngram_count('facebook')

In [0]:
# Score every domain against both vocabularies. transform() gives an
# n_domains x n_features sparse count matrix; the explicit .dot() with the
# log10 count vector yields one score per domain and does not depend on
# `ndarray * sparse_matrix` being treated as a matrix product.
all_domains['alexa_grams'] = ngram_vectorizer.transform(all_domains['domain']).dot(ngram_counts)
all_domains['word_grams'] = word_vectorizer.transform(all_domains['domain']).dot(word_counts)
all_domains.sort_values(['alexa_grams', 'word_grams'], ascending=False).head(10)

In [0]:
# There are legit domains that look really like DGAs
all_domains[all_domains['class'] == 'dga'].sort_values(['alexa_grams', 'word_grams'], ascending=True).head(10)

In [0]:
all_domains['diff'] = all_domains['alexa_grams'] - all_domains['word_grams']
# Less dictionary-like? more Internety
all_domains.sort_values(['diff'], ascending=False).head(10)

In [0]:
all_domains[~all_domains['domain'].str.contains('-')].sort_values(['diff'], ascending=False).head(10)

In [0]:
# More dictionary-like, less Internety?
all_domains.sort_values(['diff'], ascending=True).head(10)

In [0]:
ax = sns.pairplot(data=all_domains, diag_kind = 'kde', hue = 'class',
                  markers = ["o", "D"], aspect = 1, plot_kws = {"s": 6})

In [0]:
all_domains[all_domains['word_grams']==0].head()

In [0]:
DataFrameSummary(all_domains[all_domains['class']=='legit']).summary()

In [0]:
%time legit = all_domains[all_domains['class']=='legit']
%time max_grams = np.maximum(legit['alexa_grams'], legit['word_grams'])
%time sns.distplot(max_grams, bins=80, kde=False, rug=True)

In [0]:
all_domains.columns

In [0]:
# Feature matrix that keeps the domain string in column 0 (dropped again
# with np.delete before fitting) followed by the numeric features.
# DataFrame.as_matrix() was removed in pandas 1.0; select the columns and
# convert with to_numpy() instead.
X = all_domains[['domain', 'length', 'entropy', 'alexa_grams', 'word_grams']].to_numpy()
X[:5]

In [0]:
# Binary target vector: 1 for rows whose class is 'dga', 0 for everything else
y = np.where(all_domains['class'] == 'dga', 1, 0)
y[:5]

In [0]:
# http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedShuffleSplit.html
# This cross-validation object is a merge of StratifiedKFold and ShuffleSplit,
# which returns stratified randomized folds. The folds are made by preserving
# the percentage of samples for each class.

from sklearn.model_selection import StratifiedShuffleSplit

# A single stratified 75/25 shuffle split; with n_splits=1 the generator
# yields exactly one (train, test) index pair.
split = StratifiedShuffleSplit(n_splits=1, test_size=0.25)
train_index, test_index = next(split.split(X, y))
X_train, y_train = X[train_index], y[train_index]
X_test, y_test = X[test_index], y[test_index]

# Ratio of class-1 (dga) to class-0 (legit) samples, train then test
for labels in (y_train, y_test):
    print(len(labels[labels == 1])/len(labels[labels == 0]))

In [0]:
# sklearn.metrics.mean_squared_error
#from sklearn.metrics import accuracy_score
#def print_score(m):
#    res = [m.score(X_train, y_train), m.score(X_test, y_test)]
#    if hasattr(m, 'oob_score_'): res.append(m.oob_score_)
#    print(res)
    
#clf = RandomForestClassifier(n_jobs=-1)
#%time clf.fit(X_train, y_train)
#print_score(clf)

from yellowbrick.classifier import ConfusionMatrix

clf = RandomForestClassifier(n_jobs=-1)

# The ConfusionMatrix visualizer takes a model
cm = ConfusionMatrix(clf, classes=[0, 1])

# Fit fits the passed model. This is unnecessary if you pass the visualizer a pre-fitted model
%time cm.fit(np.delete(X_train, [0], axis=1), y_train)

# To create the ConfusionMatrix, we need some test data. Score runs predict() on the data
# and then creates the confusion_matrix from scikit-learn.
cm.score(np.delete(X_test, [0], axis=1), y_test)

# How did we do?
cm.poof()

In [0]:
# Oversample the minority class of the training split until both classes
# are the same size; column 0 (the domain string) is removed first so only
# the numeric features are resampled. The `ratio` keyword and
# `fit_sample()` were removed from imbalanced-learn; `sampling_strategy`
# and `fit_resample()` are the current equivalents.
sm = SMOTE(sampling_strategy=1.0)
X_train_res, y_train_res = sm.fit_resample(np.delete(X_train, [0], axis=1), y_train)

from yellowbrick.classifier import ClassBalance

# Fit a fresh random forest through the ClassBalance visualizer, using the
# SMOTE-resampled training data computed at the top of this cell.
m = RandomForestClassifier(n_jobs=-1)
visualizer = ClassBalance(m, classes=['legit', 'dga'])
%time visualizer.fit(X_train_res, y_train_res)
visualizer.score(X_train_res, y_train_res)  # Scored on the resampled training data, not the test split
g = visualizer.poof()             # Draw/show/poof the data
#print(m.score(X_train_res, y_train_res))

In [0]:
from yellowbrick.classifier import ConfusionMatrix

clf = RandomForestClassifier(n_jobs=-1)

# The ConfusionMatrix visualizer takes a model
cm = ConfusionMatrix(clf, classes=[0, 1])

# Fit fits the passed model. This is unnecessary if you pass the visualizer a pre-fitted model
cm.fit(X_train_res, y_train_res)

# To create the ConfusionMatrix, we need some test data. Score runs predict() on the data
# and then creates the confusion_matrix from scikit-learn.
cm.score(np.delete(X_test, [0], axis=1), y_test)

# How did we do?
cm.poof()

In [0]:
cm.predict([X_test[1337][1:]])

In [0]:
# Show the features and true label of the same test sample (index 1337)
# that the previous cell predicts, so prediction and ground truth can be
# compared; the original printed the neighbouring sample 1338.
print(X_test[1337], y_test[1337])

## References
*  https://github.com/ClickSecurity/data_hacking/tree/master/dga_detection

## Eugene Teo

*  eugene at temasek.org
*  http://www.temasek.org (淡馬錫)