In [11]:
import json
from pprint import pprint
import pandas as pd

def load_mongo_collection(filename):
    """Load a line-delimited JSON file into a list of parsed documents.

    Every non-blank line of the file is parsed as one JSON value; the
    values are returned in file order.

    Args:
        filename: Path of the file to read.

    Returns:
        list: One parsed JSON value per non-blank line (empty list for an
        empty file).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default text encoding.
    with open(filename, encoding="utf-8") as fd:
        # Whitespace-only lines (e.g. a trailing blank line) are skipped;
        # json.loads would otherwise reject them.
        return [json.loads(line) for line in fd if line.strip()]

# Read both log exports (attack first, then normal) with the same loader.
attack_log, normal_log = (
    load_mongo_collection(path)
    for path in ("./csic_attack_log.json", "./csic_normal_log.json")
)

# Quick sanity check: number of attack records and the first five of them.
pprint(len(attack_log))
pprint(attack_log[:5])

21451
[{'_id': {'$oid': '5ecbcc60140ff600050519e2'},
  'matched_rules': [{'id': 920273,
                     'matched_value': 'Matched "Operator `ValidateByteRange\' '
                                      'with parameter '
                                      "`38,44-46,48-58,61,65-90,95,97-122' "
                                      "against variable `REQUEST_BODY' (Value: "
                                      '`modo=entrar&login=arro&pwd=CarMinAR%3C%21--%23include+file%3D%22archivo_secreto%22+--%3E&remember=on '
                                      "(10 characters omitted)' )",
                     'msg': 'Invalid character in request (outside of very '
                            'strict set)'},
                    {'id': 941180,
                     'matched_value': 'Matched "Operator `Pm\' with parameter '
                                      '`document.cookie document.write '
                                      '.parentnode .innerhtml window.location '
                  

In [12]:
# Collect the ID of every rule that matched at least once in either log.
# A single set comprehension replaces the list comprehensions that were
# previously evaluated only for their set.add() side effect.
all_rules = {
    rule["id"]
    for log in (attack_log, normal_log)
    for item in log
    for rule in item["matched_rules"]
}

pprint(len(all_rules))
pprint(all_rules)

53
{911100,
 920220,
 920230,
 920270,
 920271,
 920272,
 920273,
 920440,
 920500,
 921120,
 921151,
 921160,
 930120,
 932100,
 932115,
 932150,
 932160,
 932190,
 941100,
 941110,
 941140,
 941150,
 941160,
 941170,
 941180,
 941210,
 941320,
 942100,
 942110,
 942130,
 942180,
 942190,
 942200,
 942210,
 942240,
 942260,
 942280,
 942300,
 942310,
 942330,
 942340,
 942360,
 942370,
 942390,
 942400,
 942430,
 942431,
 942432,
 942440,
 942460,
 942480,
 942490,
 942511}


In [16]:
def transform_log_item(item, rule_ids=None):
    """Flatten one raw log entry into a flat feature dictionary.

    The result holds the entry's ``request_method`` and ``uri`` plus one
    0/1 indicator per rule ID, keyed by the ID converted to ``str``.

    Args:
        item: Mapping for one log entry. ``request_method`` and ``uri`` are
            read with ``.get`` (missing keys yield ``None``); each element
            of ``matched_rules`` must provide an ``"id"`` key.
        rule_ids: Iterable of rule IDs that should each get an indicator
            (initialised to 0). Defaults to the module-level ``all_rules``.

    Returns:
        dict: ``request_method``, ``uri`` and one ``str(rule_id)`` key per
        rule, set to 1 if the rule appears in ``matched_rules`` and 0
        otherwise.
    """
    if rule_ids is None:
        # Reading a module-level name needs no ``global`` declaration.
        rule_ids = all_rules

    transformed_data = {
        "request_method": item.get("request_method"),
        "uri": item.get("uri"),
    }

    # Start with every known rule switched off ...
    transformed_data.update((str(rule_id), 0) for rule_id in rule_ids)

    # ... then switch on the ones this entry matched. ``or []`` covers an
    # absent (or None) "matched_rules", which would otherwise raise TypeError.
    for rule in item.get("matched_rules") or []:
        transformed_data[str(rule["id"])] = 1

    return transformed_data

# Flatten every raw log entry into one feature row per request.
cleaned_attack_log = [transform_log_item(x) for x in attack_log]
cleaned_normal_log = [transform_log_item(x) for x in normal_log]

# Build one frame per source and label it: 1 = attack log, 0 = normal log.
attack_log_df = pd.DataFrame(cleaned_attack_log).assign(is_attack=1)
normal_log_df = pd.DataFrame(cleaned_normal_log).assign(is_attack=0)

# ignore_index=True renumbers the rows 0..n-1. Without it each frame keeps
# its own 0-based labels, so the combined index would contain duplicates
# and label-based lookups (e.g. .loc[0]) would return several rows.
combined_df = pd.concat([attack_log_df, normal_log_df], ignore_index=True)

print("Attack log", attack_log_df.shape)
print(attack_log_df.head())

print("Normal log", normal_log_df.shape)
print(normal_log_df.head())

print("Combined log", combined_df.shape)
combined_df.describe()

Attack log (21451, 56)
  request_method                                                uri  942210  \
0           POST                    /tienda1/publico/autenticar.jsp       0   
1            GET         /tienda1/publico/caracteristicas.jsp?idA=2       0   
2           POST                        /tienda1/publico/anadir.jsp       0   
3           POST                      /tienda1/publico/registro.jsp       0   
4            GET  /tienda1/publico/autenticar.jsp?modo=entrar&lo...       0   

   932100  942340  941320  942480  932115  942100  942360  ...  942310  \
0       0       0       0       0       0       0       0  ...       0   
1       0       0       0       0       0       0       0  ...       0   
2       0       0       0       0       0       0       0  ...       0   
3       0       0       0       0       0       0       0  ...       0   
4       0       0       0       0       0       0       0  ...       0   

   942440  941160  942190  942460  941170  920440  942330

Unnamed: 0,942210,932100,942340,941320,942480,932115,942100,942360,942490,941210,...,942310,942440,941160,942190,942460,941170,920440,942330,941180,is_attack
count,88409.0,88409.0,88409.0,88409.0,88409.0,88409.0,88409.0,88409.0,88409.0,88409.0,...,88409.0,88409.0,88409.0,88409.0,88409.0,88409.0,88409.0,88409.0,88409.0,88409.0
mean,0.001844,0.001866,2.3e-05,0.011198,0.003224,0.012352,0.016062,0.000905,0.01061,0.003405,...,0.000305,0.017634,0.014603,1.1e-05,0.025473,0.003405,0.025167,0.009173,0.010293,0.242634
std,0.042899,0.043161,0.004756,0.105227,0.056686,0.11045,0.125714,0.030068,0.102456,0.05825,...,0.017473,0.131618,0.119956,0.003363,0.157556,0.05825,0.156633,0.095337,0.100932,0.428678
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
