In [41]:
import json
from pprint import pprint
import pandas as pd

def load_mongo_collection(filename):
    """Load a file containing one JSON document per line into a list.

    Blank or whitespace-only lines (for example a trailing empty line at the
    end of the file) are skipped instead of being handed to ``json.loads``,
    which would reject them.

    Args:
        filename: Path of the file to read.

    Returns:
        list: The parsed documents, in the order they appear in the file.
            An empty file yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    collection = []
    # Decode as UTF-8 explicitly so the result does not depend on the
    # platform's default encoding.
    # NOTE(review): assumes the export is UTF-8 encoded -- confirm.
    with open(filename, encoding="utf-8") as fd:
        for line in fd:
            if not line.strip():
                # Nothing to parse on this line.
                continue
            collection.append(json.loads(line))

    return collection

# Read both request logs; each file holds one JSON document per line.
attack_log = load_mongo_collection("./csic_attack_log.json")
normal_log = load_mongo_collection("./csic_normal_log.json")

# Quick sanity check: number of attack records, then the first five of them.
print(len(attack_log))
pprint(attack_log[:5])

21451
[{'_id': {'$oid': '5ecbcc60140ff600050519e2'},
  'matched_rules': [{'id': 920273,
                     'matched_value': 'Matched "Operator `ValidateByteRange\' '
                                      'with parameter '
                                      "`38,44-46,48-58,61,65-90,95,97-122' "
                                      "against variable `REQUEST_BODY' (Value: "
                                      '`modo=entrar&login=arro&pwd=CarMinAR%3C%21--%23include+file%3D%22archivo_secreto%22+--%3E&remember=on '
                                      "(10 characters omitted)' )",
                     'msg': 'Invalid character in request (outside of very '
                            'strict set)'},
                    {'id': 941180,
                     'matched_value': 'Matched "Operator `Pm\' with parameter '
                                      '`document.cookie document.write '
                                      '.parentnode .innerhtml window.location '
                  

In [42]:
# Collect the ID of every rule that matched at least one request in either log.
# The logs are walked attack-first, then normal, and each record contributes
# the "id" of every entry in its "matched_rules" list.
all_rules = {
    rule["id"]
    for log in (attack_log, normal_log)
    for item in log
    for rule in item["matched_rules"]
}

print(len(all_rules))
display(list(all_rules))

53


[942210,
 932100,
 942340,
 941320,
 942480,
 932115,
 942100,
 942360,
 942490,
 941210,
 920220,
 942110,
 921120,
 942240,
 942370,
 911100,
 920230,
 941100,
 942511,
 942130,
 942260,
 920500,
 941110,
 932150,
 942390,
 921151,
 932160,
 942400,
 930120,
 921160,
 942280,
 920270,
 920271,
 920272,
 920273,
 941140,
 942300,
 942200,
 942430,
 942431,
 942432,
 941150,
 932190,
 942180,
 942310,
 942440,
 941160,
 942190,
 942460,
 941170,
 920440,
 942330,
 941180]

In [43]:
from sklearn.model_selection import train_test_split

def transform_log_item(item, rules=None):
    """Turn one log record into a flat feature dict of rule indicators.

    Args:
        item: Mapping describing a single logged request. The keys read are
            "request_method" and "matched_rules" (an iterable of mappings,
            each with an "id" key). A missing or ``None`` "matched_rules"
            is treated as "no rule matched".
        rules: Optional iterable of rule IDs that defines the indicator
            columns. When omitted, the module-level ``all_rules`` is used.

    Returns:
        dict: ``"request_method"`` mapped to the record's value (``None``
            when absent), followed by one ``str(rule_id)`` key per entry in
            ``rules`` whose value is 1 if that rule matched this record and
            0 otherwise. A matched rule that is not in ``rules`` is still
            added with value 1.
    """
    if rules is None:
        # Fall back to the notebook-level rule set; reading a module-level
        # name needs no `global` declaration.
        rules = all_rules

    transformed_data = {"request_method": item.get("request_method")}

    # Start with every known rule switched off ...
    transformed_data.update(dict.fromkeys(map(str, rules), 0))

    # ... then switch on the ones this record actually triggered.
    for rule in item.get("matched_rules") or ():
        transformed_data[str(rule["id"])] = 1

    return transformed_data

# Flatten every log record into a dict of per-rule indicator features.
cleaned_attack_log = list(map(transform_log_item, attack_log))
cleaned_normal_log = list(map(transform_log_item, normal_log))

# Feature matrix: attack rows first, normal rows after them.
X = pd.DataFrame(cleaned_attack_log + cleaned_normal_log)
# One-hot encode request_method, then swap the raw column for the indicators.
temp = pd.get_dummies(X["request_method"], prefix="request_method")
X = pd.concat([X, temp], axis=1).drop(columns=["request_method"])

# Labels in the same row order as X: 1 = attack, 0 = normal.
Y = pd.Series([1] * len(cleaned_attack_log) + [0] * len(cleaned_normal_log))

# Disabled alternative: split the attack and normal records separately
# (each with test_size=0.33, random_state=42) and concatenate the resulting
# train/test parts. The live code below does a single split over the
# combined frame.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.33,
    random_state=42,
)

# Preview the first rows of each split, each preceded by its caption.
display("X train", X_train.head())

display("Y train", Y_train.head())

display("X test", X_test.head())

display("Y test", Y_test.head())

'X train'

Unnamed: 0,942210,932100,942340,941320,942480,932115,942100,942360,942490,941210,...,941160,942190,942460,941170,920440,942330,941180,request_method_GET,request_method_POST,request_method_PUT
45578,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3091,0,0,0,0,0,0,0,0,0,1,...,1,0,1,1,0,0,0,0,1,0
28392,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
21522,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
38109,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


'Y train'

45578    0
3091     1
28392    0
21522    0
38109    0
dtype: int64

'X test'

Unnamed: 0,942210,932100,942340,941320,942480,932115,942100,942360,942490,941210,...,941160,942190,942460,941170,920440,942330,941180,request_method_GET,request_method_POST,request_method_PUT
10867,0,0,0,0,0,1,1,0,1,0,...,0,0,1,0,0,1,0,0,1,0
67152,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
68052,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
13752,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
23436,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


'Y test'

10867    1
67152    0
68052    0
13752    1
23436    0
dtype: int64

X train


Unnamed: 0,uri,942210,932100,942340,941320,942480,932115,942100,942360,942490,...,941160,942190,942460,941170,920440,942330,941180,request_method_GET,request_method_POST,request_method_PUT
45578,/tienda1/imagenes/nuestratierra.jpg,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3091,/tienda1/publico/caracteristicas.jsp,0,0,0,0,0,0,0,0,0,...,1,0,1,1,0,0,0,0,1,0
28392,/tienda1/publico/vaciar.jsp,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
21522,/tienda1/imagenes/3.gif,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
38109,/tienda1/publico/registro.jsp,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


X test


Unnamed: 0,uri,942210,932100,942340,941320,942480,932115,942100,942360,942490,...,941160,942190,942460,941170,920440,942330,941180,request_method_GET,request_method_POST,request_method_PUT
45578,/tienda1/imagenes/nuestratierra.jpg,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3091,/tienda1/publico/caracteristicas.jsp,0,0,0,0,0,0,0,0,0,...,1,0,1,1,0,0,0,0,1,0
28392,/tienda1/publico/vaciar.jsp,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
21522,/tienda1/imagenes/3.gif,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
38109,/tienda1/publico/registro.jsp,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
