In [136]:
%%bash
python --version

Python 3.6.10


In [336]:
import re
import pickle
import os
from collections import namedtuple
from functools import reduce

import requests
import pandas as pd
import numpy as np
import pyparsing as pp

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import OneClassSVM

# Extraction

In [337]:
# Extract raw data from .webgateway
with open('./aktaion/data/proxyData//benignData/BlueCoat_Large.webgateway','rb') as f:
    data = f.readlines()

In [338]:
# Extract the user agent
user_ag = [str(i).split('"')[11] for i in data]
series_user_ag = pd.Series(user_ag,name = 'rawUserAg')

In [339]:
# Parser for the main products
LPAR,RPAR,SLASH = map(pp.Suppress, "()/")
product = pp.Word(pp.printables).setResultsName('product')
product_version = SLASH+pp.Word(pp.printables)
value = (pp.quotedString 
         | pp.originalTextFor(pp.OneOrMore(pp.Word(pp.printables, excludeChars="();") 
                                     | pp.nestedExpr())))
comments = LPAR + pp.delimitedList(value,delim=";") + RPAR
grammar = pp.Forward()
grammar << product + pp.Optional(product_version).setResultsName('product_version') \
        + pp.Optional(comments).setResultsName('comments') \
        + pp.ZeroOrMore(grammar).setResultsName('cousins')
node = namedtuple("Node", ["product", "product_version", "comments","cousins"])
def parseAction(string, location, tokens):
    return node(tokens.product,tokens.product_version,tokens.comments, tokens.cousins)
grammar.setParseAction(parseAction)

Forward: {{{W:(0123...) [{Suppress:("/") W:(0123...)}]} [{{Suppress:("(") {quotedString using single or double quotes | {{Empty {{W:(0123...) | nested () expression}}...} Empty}} [; {quotedString using single or double quotes | {{Empty {{W:(0123...) | nested () expression}}...} Empty}}]...} Suppress:(")")}]} [: ...]...}

In [389]:
# Wrap everything together
def parse_user_ag(x):
    nodes = grammar.parseString(x)[0]
    comment_pattern = re.compile('^(?P<product>.*?)(?P<comment> \(?.*?\))?$')
    result = list()
    current_node = nodes
    while True:
        product_name = current_node.product
        if len(current_node.comments) > 0:
            for comment in current_node.comments:
                match = re.match(comment_pattern, comment)
                result.append((product_name,match.group('product'),match.group('comment')))
        else:
            result.append((product_name,'not_available','not_available'))
        if current_node.cousins == '':
            break
        else: 
            current_node = current_node.cousins[0]
    return result

# Sample
s_size = .001
cutoff = int(len(user_ag)*s_size)
user_ag_s = user_ag[:cutoff]
user_ag_valid = pd.Series(user_ag_s).str.strip().str.contains('^\w.*')
user_ag_parsed = pd.Series(user_ag_s).loc[user_ag_valid].apply(lambda x: parse_user_ag(x))
user_ag_parsed = user_ag_parsed.append(pd.Series( [[('not_available','not_available','not_available')]] * np.invert(user_ag_valid).sum()
),ignore_index = True)

user_ag_parsed_unpivot = user_ag_parsed.apply(pd.Series).fillna({0:'-'})\
                        .reset_index().melt(id_vars = 'index').drop(columns = ['variable'])\
                        .dropna().rename(columns={'index':'id'})

user_ag_parsed_unpivot_exploded = pd.DataFrame(user_ag_parsed_unpivot.value.tolist()\
                                               ,columns = ['products','comments','comment_details']\
                                              , index = user_ag_parsed_unpivot.id).fillna('not_available')
user_ag_parsed_unpivot_exploded.products = user_ag_parsed_unpivot_exploded.products.where(user_ag_parsed_unpivot_exploded.products != '-','not_available')

#clean
for col in user_ag_parsed_unpivot_exploded.columns:
    user_ag_parsed_unpivot_exploded.loc[:,col] = user_ag_parsed_unpivot_exploded.loc[:,col].str.lower().str.strip()

user_ag_parsed_unpivot_exploded


Unnamed: 0_level_0,products,comments,comment_details
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,mozilla/5.0,windows,not_available
1,mozilla/5.0,windows nt 5.1,not_available
2,mozilla/5.0,ipod,not_available
3,mozilla/5.0,windows nt 5.1,not_available
4,mozilla/4.0,compatible,not_available
...,...,...,...
333,mozilla/4.0,asktbhip/5.12.2.16749,not_available
478,mozilla/4.0,asktbhip/5.12.2.16749,not_available
1020,mozilla/4.0,asktbcwn/5.8.0.12304,not_available
138,mozilla/4.0,masn,not_available


### Feature engineering
3 clear challenges stand out:
* Categorical data
* Cardinality
* Variable number of attributes (products, comments)

We will process feature by feature

### Products
We can isolate the version of the product in most cases

In [391]:
product_lvl_0_dedup = user_ag_parsed_unpivot_exploded.reset_index().drop_duplicates(['id','products'])
product_pat = re.compile('^(?P<products_main>\w*[a-z])\/?(?P<product_versions>.*)')
product_versions = user_ag_parsed_unpivot_exploded.products.str.extract(product_pat)
user_ag_parsed_unpivot_exploded = pd.concat([user_ag_parsed_unpivot_exploded,user_ag_parsed_unpivot_exploded.products.str.extract(product_pat)],axis = 1)

##### Encoding
To deal with cardinality, I suggest a method based on the estimator encoding:
* Principle: replace a category by the value associated with one if its features
* Target variable: as we do not have relevant features (numerical) we will create are own: the relative frequency (lvl_0)
* Relative feature: the relative frequency of the browser version is calculated relative to the same browser versions only (lvl_1)

In [392]:
# Mapping dicts
products_lvl_0_dict = product_versions.products_main.value_counts(normalize = True).to_dict()
products_lvl_1_dict = product_versions.groupby('products_main')['product_versions'].value_counts(normalize = True).to_dict()
#Map
def map_tuple(x, mapping, tup):
    not_av = 0
    return mapping.get((x[tup[0]],x[tup[1]]),not_av)

user_ag_parsed_unpivot_exploded['product_lvl_0'] = user_ag_parsed_unpivot_exploded.products_main.map(products_lvl_0_dict).fillna(0)
user_ag_parsed_unpivot_exploded['product_lvl_1'] = user_ag_parsed_unpivot_exploded.loc[:,['products_main','product_versions']].apply(lambda x: map_tuple(x,products_lvl_1_dict,('products_main','product_versions')),axis = 1)

In [343]:
with open('./model/products_lvl_0_dict.pickle','wb') as e:
    pickle.dump(products_lvl_0_dict,e)
with open('./model/products_lvl_1_dict.pickle','wb') as e:
    pickle.dump(products_lvl_1_dict,e)

### Comments

Same challenges and strategy as for the product

In [393]:
# Nokias
nokia = re.compile('(?P<comment_main>series.*?) (?P<comment_version>nokia.*)')
# WIndows
windows = re.compile('(?P<comment_main>windows \w{2}) (?P<comment_version>.*)')
# v/rv
rv = re.compile('(?P<comment_main>\w*v):(?P<comment_version>.*)')
# Mac
mac = re.compile('^cpu (?P<comment_main>.*) (?P<comment_version>(\d_)+\d)+ like mac.*')
# Else
fall_back = re.compile('^(?P<comment_main>\w*[a-z])\/?(?P<comment_version>.*)')

# Extract
pats = [nokia, windows, rv, mac, fall_back]
pat_extracted = [user_ag_parsed_unpivot_exploded.comments.str.lower().str.extract(pat).loc[:,['comment_main','comment_version']] for pat in pats]

# Reduce
comment_main = reduce(lambda x,y: x.combine_first(y),[i.comment_main for i in pat_extracted])
comment_version = reduce(lambda x,y: x.combine_first(y),[i.comment_version for i in pat_extracted])

# Merge
comments_detailed = pd.concat([user_ag_parsed_unpivot_exploded,comment_main,comment_version], axis = 1)

#Clean
comments_detailed.comment_version = comments_detailed.comment_version.where(comments_detailed.comment_version!='','not_available') 

In [394]:
# Mapping dict
comments_lvl_0_dict = comments_detailed.comments.value_counts(normalize = True).to_dict()
comments_lvl_1_dict = comments_detailed.groupby('comment_main')['comment_version'].value_counts(normalize = True).to_dict()
# Map
comments_detailed['comment_lvl_0'] = comments_detailed.comment_main.map(comments_lvl_0_dict).fillna(0)
comments_detailed['comment_lvl_1'] = comments_detailed.loc[:,['comment_main','comment_version']].apply(lambda x: map_tuple(x,comments_lvl_1_dict,('comment_main','comment_version')),axis = 1)

In [346]:
with open('./model/comments_lvl_0_dict.pickle','wb') as e:
    pickle.dump(comments_lvl_0_dict,e)
with open('./model/comments_lvl_1_dict.pickle','wb') as e:
    pickle.dump(comments_lvl_1_dict,e)

In [395]:
# Mapping relative to product
comments_lvl_0_dict_rel_2_product = comments_detailed.groupby(['products'])['comment_main'].value_counts(normalize = True).to_dict()
comments_lvl_1_dict_rel_2_product = comments_detailed.groupby(['products','comment_main'])['comment_version'].value_counts(normalize = True).to_dict()
# Map func
def map_tuple3(x, mapping, tup):
    not_av = -1
    return mapping.get((x[tup[0]],x[tup[1]],x[tup[2]]),not_av)
# Map
comments_detailed['comment_lvl_0_rel_2_product'] = comments_detailed.loc[:,['products','comment_main']].apply(lambda x: map_tuple(x,comments_lvl_0_dict_rel_2_product,('products','comment_main')),axis = 1)
comments_detailed['comment_lvl_1_rel_2_product'] = comments_detailed.loc[:,['products','comment_main','comment_version']].apply(lambda x: map_tuple3(x,comments_lvl_1_dict_rel_2_product,('products','comment_main','comment_version')),axis = 1)

In [348]:
with open('./model/comments_lvl_0_dict_rel_2_product.pickle','wb') as e:
    pickle.dump(comments_lvl_0_dict_rel_2_product,e)
with open('./model/comments_lvl_1_dict_rel_2_product.pickle','wb') as e:
    pickle.dump(comments_lvl_1_dict_rel_2_product,e)

### Aggregation at user_agent_level
Indeed,the number of products and comment available per user agent or product is  not fixed. We suggest the following method to encode it:
* Encode each system info and its version following the same steps as previously (estimator encoding)
* Summarize the multiple system info at the record level with a set of descriptive statistics (mean, min, max, number)


In [403]:
# Comment details stats
desc_stat = ['mean','max','min']
apply_map = {'comment_lvl_1':desc_stat,
            'comment_lvl_1_rel_2_product':desc_stat}
comment_detail_summary = comments_detailed.groupby(level = 0)[list(apply_map.keys())].agg(desc_stat)
comment_detail_summary.columns = ['_'.join(col) for col in comment_detail_summary.columns]

In [404]:
# product main count
def unique_count(x):
    uniques = list(x.unique())
    return len(uniques) if 'not_available' not in uniques else len(uniques)-1
product_main_count = comments_detailed.groupby(level=0)['products_main'].apply(lambda x: unique_count(x)).to_frame()
product_main_count.columns = ['products_main_count']
#Product main stats
product_main_lvl_0_summary = comments_detailed.reset_index().drop_duplicates(['id','products_main']).groupby('id')['product_lvl_0'].agg(desc_stat)
product_main_lvl_0_summary.columns = ['product_main_lvl_0_'+col for col in product_main_lvl_0_summary.columns]

In [405]:
# Product version stats
product_main_lvl_1_summary = comments_detailed.reset_index().drop_duplicates(['id','products_main','product_versions']).groupby('id')['product_lvl_1'].agg(desc_stat)
product_main_lvl_1_summary.columns = ['product_main_lvl_1_'+col for col in product_main_lvl_1_summary.columns]
product_main_lvl_1_summary
#Product version count stats
product_version_count = comments_detailed.reset_index().groupby(['id','products_main'])['product_versions'].apply(lambda x: unique_count(x))
product_version_count_stats = product_version_count.to_frame().reset_index(level = 1,drop=True).groupby(level = 0).agg(desc_stat)
product_version_count_stats.columns = ['_count_'.join(col) for col in product_version_count_stats.columns]

In [406]:
# Comment main stats
comment_main_summary = comments_detailed.reset_index().drop_duplicates(['id','products','comment_main']).groupby('id')[['comment_lvl_0','comment_lvl_0_rel_2_product']].agg(desc_stat)
comment_main_summary.columns = ['_'.join(col) for col in comment_main_summary.columns]
# Comment main counts
comment_main_count = comments_detailed.reset_index().drop_duplicates(['id','products','comment_main']).groupby(['id','products'])['comment_main'].apply(lambda x: unique_count(x))
comment_main_count_summary = comment_main_count.reset_index('products',drop=True).groupby(level = 0).agg(desc_stat)
comment_main_count_summary.columns = ['comment_main_count_'+col for col in comment_main_count_summary.columns]


In [407]:
dt_agg = pd.concat([comment_detail_summary,\
                    comment_main_count_summary,\
                    comment_main_summary,\
                    product_version_count_stats,\
                    product_main_lvl_1_summary,\
                    product_main_count,\
                    product_main_lvl_0_summary], axis = 1)

In [354]:
dt_agg.to_pickle('./data/training/training_set.pickle')

# Training

In [355]:
dt_agg = pd.read_pickle('./data/training/training_set.pickle')

In [358]:
# Training architecture
scaler = StandardScaler()
svm = OneClassSVM()
pipe = Pipeline([('scaler',scaler),
                 ('model',svm)])

In [359]:
# Training
pipe.fit(test.values)

Pipeline(memory=None,
         steps=[('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('model',
                 OneClassSVM(cache_size=200, coef0=0.0, degree=3, gamma='scale',
                             kernel='rbf', max_iter=-1, nu=0.5, shrinking=True,
                             tol=0.001, verbose=False))],
         verbose=False)

In [360]:
prediction = pipe.predict(dt_agg.values)

In [361]:
pd.Series(prediction).value_counts()

 1    73333
-1    69096
dtype: int64

In [362]:
# Save model
with open('./model/one_class_svm.pickle','wb') as e:
    pickle.dump(pipe,e)

# Anomalies

In [363]:
with open('./aktaion/data/proxyData/exploitData/2014-01-02-neutrino-exploit-traffic.webgateway','rb') as e:
    exploit = e.readlines()

In [364]:
result = list()
for file in os.listdir('./aktaion/data/proxyData/exploitData'):
    with open('./aktaion/data/proxyData/exploitData/'+file,'rb') as e:
        exploit = e.readlines()
        result+=exploit

In [411]:
exploit_user_ag = [str(i).split('"')[11] for i in result]
body = pd.Series(exploit_user_ag).to_json()

In [366]:
url = "http://127.0.0.1:5000/api/classify_bulk"
payload = "{\n    \"raw_user_agent\": \"Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US) AppleWebKit/534.10 (KHTML, like Gecko) Chrome/8.0.552.237 Safari/534.10\"\n}"
headers = {
'Content-Type': 'application/json'
}
response = requests.request("POST", url, headers=headers, data = body)
print(response.text.encode('utf8'))

b'{\n    "predicted_class": {\n        "0": "exploit",\n        "1": "exploit",\n        "10": "exploit",\n        "100": "exploit",\n        "1000": "exploit",\n        "1001": "exploit",\n        "1002": "exploit",\n        "1003": "exploit",\n        "1004": "exploit",\n        "1005": "exploit",\n        "1006": "exploit",\n        "1007": "exploit",\n        "1008": "exploit",\n        "1009": "exploit",\n        "101": "exploit",\n        "1010": "exploit",\n        "1011": "exploit",\n        "1012": "exploit",\n        "1013": "exploit",\n        "1014": "exploit",\n        "1015": "exploit",\n        "1016": "exploit",\n        "1017": "exploit",\n        "1018": "exploit",\n        "1019": "exploit",\n        "102": "exploit",\n        "1020": "exploit",\n        "1021": "exploit",\n        "1022": "exploit",\n        "1023": "exploit",\n        "1024": "exploit",\n        "1025": "exploit",\n        "1026": "exploit",\n        "1027": "exploit",\n        "1028": "exploit",\n