In [2]:
import io
import os
import math
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.metrics import confusion_matrix, classification_report, precision_recall_fscore_support, accuracy_score
from sklearn import metrics

import tensorflow as tf
from tensorflow.keras import Input, Model, layers, losses, optimizers, callbacks
import tensorflow_decision_forests as tfdf


# Dataset identity and split ratios used throughout the notebook.
df_name = "X-IIoTiD"
test_size_ratio = 0.2
val_size_ratio = 0.1
cwd = os.getcwd()

# pandas display limits: None removes the column cap, rows are capped at 10.
default_max_columns = None
default_max_rows = 10
pd.set_option("display.max_columns", default_max_columns)
pd.set_option("display.max_rows", default_max_rows)

In [3]:
#Utility functions

def convert_to_number(x,replacement_number):
    """Convert a raw cell value to a number, or to a replacement if impossible.

    Parameters
    ----------
    x : object
        Raw value, typically a string read from CSV (e.g. ``"42"``, ``"0.5"``,
        ``"-"``) or a float NaN for an empty cell.
    replacement_number : object
        Value returned when ``x`` is missing, NaN, or not parseable.

    Returns
    -------
    int, float, or ``replacement_number``
        An ``int`` when ``x`` represents a whole number, otherwise a ``float``;
        ``replacement_number`` for NaN / unparseable / unsupported input.
    """
    # Float input must not go through int(): that would silently truncate the
    # fractional part (2.9 -> 2) and raise on infinities.
    if isinstance(x, (float, np.floating)):
        if np.isnan(x):
            return replacement_number
        return int(x) if float(x).is_integer() else x

    try:
        return int(x)
    except (ValueError, TypeError):
        # Not an integer literal (or not a supported type, e.g. None); try float.
        pass

    try:
        number = float(x)
    except (ValueError, TypeError):
        return replacement_number
    # Strings such as "nan" parse successfully but still mean "missing".
    return replacement_number if np.isnan(number) else number

def ip_to_octets(ip_adress):
    """Split a dotted IPv4 address into its four integer octets.

    Parameters
    ----------
    ip_adress : object
        Address such as ``"192.168.2.10"``. ``"?"`` and null values are
        treated as missing.

    Returns
    -------
    list
        Four ``int`` octets, or four copies of the module-level
        ``replacement_number`` when the address is missing.

    Raises
    ------
    ValueError
        If the value does not consist of exactly four dot-separated parts
        (message starts with ``"Invalid IP Adress: "``), or if a part is not
        an integer literal.
    """
    # Null check first: comparing pd.NA with "?" yields NA, whose truth value
    # is ambiguous and would raise inside the `or`.
    if pd.isnull(ip_adress) or ip_adress == "?":
        return [replacement_number]*4
    octets = str(ip_adress).split(".")
    if len(octets) != 4:
        raise ValueError("Invalid IP Adress: "+str(ip_adress))
    # Convert all four parts; the last one must not be left as a string.
    return [int(octet) for octet in octets]

def ip_to_octet(ip_adress, octet_index):
    """Return one octet of an IP address.

    The address is split with ``ip_to_octets`` and the element at position
    ``octet_index`` of the resulting list is returned.
    """
    return ip_to_octets(ip_adress)[octet_index]

def frame_ip_to_octet(input_frame, ip_column, cleanup = True):
    """Expand an IP address column into four float32 octet columns.

    Works on a copy of ``input_frame``. For each position 0..3 a column named
    ``<ip_column>_octet<position>`` is added, filled via ``ip_to_octet``.
    When ``cleanup`` is true the original ``ip_column`` is removed from the
    returned frame.
    """
    result = input_frame.copy()
    for position in range(4):
        octet_values = result[ip_column].apply(lambda address : ip_to_octet(address, position))
        result[ip_column+"_octet"+str(position)] = octet_values.astype("float32")
    if not cleanup:
        return result
    return result.drop([ip_column],axis = 1)


#Imputes missing values by sampling from a normal distribution whose mean and std are computed
#separately for each unique value of a target column (most data is very dependent on protocol type)
def impute_by_target_col(input_frame, target_col, cols_to_impute):
    """Fill missing values with random draws from a per-group normal distribution.

    The frame is split by the values of ``target_col``. Within each group,
    every missing entry of a column listed in ``cols_to_impute`` is replaced
    by a draw from ``np.random.normal(mean, std)``, where ``mean`` and ``std``
    come from that group's column. If either statistic is NaN for the group
    (e.g. the group has fewer than two observed values), the statistics of the
    whole input column are used instead.

    Parameters
    ----------
    input_frame : pandas.DataFrame
        Source data; it is not modified.
    target_col : str
        Column whose values define the groups.
    cols_to_impute : collection of str
        Columns to impute. Names not present in the frame are ignored.

    Returns
    -------
    pandas.DataFrame
        All rows of ``input_frame``, ordered group by group (groups in order
        of first appearance, rows with a missing ``target_col`` last), with a
        fresh 0..n-1 index.

    Notes
    -----
    Draws use NumPy's global random state; seed it beforehand for
    reproducible output. The normal distribution is unbounded, so imputed
    values can be negative even if every observed value is non-negative.
    """
    target_values = input_frame[target_col]

    def _iter_group_masks():
        # One boolean row mask per group, in order of first appearance.
        for value in target_values.dropna().unique():
            yield target_values == value
        # NaN never compares equal to anything, so rows with a missing target
        # need their own mask; otherwise they would be silently dropped.
        missing_target = target_values.isna()
        if missing_target.any():
            yield missing_target

    imputed_groups = []
    for group_mask in _iter_group_masks():
        df_temp = input_frame.loc[group_mask].copy()
        for col in df_temp.columns:
            if col == target_col or col not in cols_to_impute:
                continue
            missing = df_temp[col].isna()
            if not missing.any():
                continue
            mean, std = df_temp[col].mean(), df_temp[col].std()
            if pd.isna(mean) or pd.isna(std):
                mean, std = input_frame[col].mean(), input_frame[col].std()
            # Draw all replacements for this column in a single call.
            df_temp.loc[missing, col] = np.random.normal(mean, std, size=int(missing.sum()))
        imputed_groups.append(df_temp)

    if not imputed_groups:
        # Empty input: keep the column layout instead of returning a bare frame.
        return input_frame.copy().reset_index(drop=True)
    # Concatenate once; growing a frame inside the loop copies it every time.
    return pd.concat(imputed_groups, ignore_index=True)

#Removes all features that do not intersect with other datasets
def remove_features(input_frame):
    """Select the shared feature columns from ``input_frame``.

    The result contains the columns named in the module-level
    ``features_to_keep`` list, followed by every column of ``input_frame``
    whose name contains one of the address/port/protocol markers below
    (case-sensitive substring match). ``features_to_keep`` itself is not
    modified.

    NOTE(review): the column listing shown elsewhere in this notebook uses
    ``Scr_IP`` / ``Scr_port`` / ``Des_port``, which the ``Src_*`` and
    ``Des_Port`` markers do not match; confirm the naming of the frames this
    is applied to.
    """
    markers = ("Src_IP","Des_IP","Src_Port","Des_Port","Protocol")
    selected = features_to_keep.copy()
    selected.extend(
        column
        for column in input_frame.columns
        if any(marker in column for marker in markers)
    )
    return input_frame[selected]

In [4]:
#Load Data
# NOTE: file_path is blank in this copy; set it to the dataset CSV before running this cell.
file_path = "" #File paths deleted for privacy reasons

# dtype=str loads every column as text; numeric and boolean conversion is done explicitly afterwards.
df_raw = pd.read_csv(file_path, dtype=str)

In [5]:
#Columns that need cleaning sorted by data types

# Columns parsed as numbers during cleaning (order matches the original listing).
num_cols = [
    # ports, duration and traffic volume
    "Scr_port", "Des_port", "Duration",
    "Scr_bytes", "Des_bytes", "missed_bytes",
    "Scr_pkts", "Scr_ip_bytes", "Des_pkts", "Des_ip_bytes",
    "total_bytes", "total_packet", "paket_rate", "byte_rate",
    "Scr_packts_ratio", "Des_pkts_ratio", "Scr_bytes_ratio", "Des_bytes_ratio",
    # Avg_/Std_ pairs
    "Avg_user_time", "Std_user_time",
    "Avg_nice_time", "Std_nice_time",
    "Avg_system_time", "Std_system_time",
    "Avg_iowait_time", "Std_iowait_time",
    "Avg_ideal_time", "Std_ideal_time",
    "Avg_tps", "Std_tps",
    "Avg_rtps", "Std_rtps",
    "Avg_wtps", "Std_wtps",
    "Avg_ldavg_1", "Std_ldavg_1",
    "Avg_kbmemused", "Std_kbmemused",
    "Avg_num_Proc/s", "Std_num_proc/s",
    "Avg_num_cswch/s", "std_num_cswch/s",
    # alert, login and activity columns
    "OSSEC_alert", "OSSEC_alert_level",
    "Login_attempt", "Succesful_login",
    "File_activity", "Process_activity",
    "read_write_physical.process", "is_privileged",
]

# Boolean-like columns that may contain the "-" placeholder.
bol_cols = ["anomaly_alert"]

# Features retained for alignment with other datasets.
features_to_keep = [
    "Duration",
    "Scr_bytes", "Des_bytes",
    "Scr_pkts", "Des_pkts",
    "Attack",
]

In [6]:
# Preview the first five raw rows; every value is still text because the file was read with dtype=str.
df_raw.head(5)

Unnamed: 0,Date,Timestamp,Scr_IP,Scr_port,Des_IP,Des_port,Protocol,Service,Duration,Scr_bytes,Des_bytes,Conn_state,missed_bytes,is_syn_only,Is_SYN_ACK,is_pure_ack,is_with_payload,FIN or RST,Bad_checksum,is_SYN_with_RST,Scr_pkts,Scr_ip_bytes,Des_pkts,Des_ip_bytes,anomaly_alert,total_bytes,total_packet,paket_rate,byte_rate,Scr_packts_ratio,Des_pkts_ratio,Scr_bytes_ratio,Des_bytes_ratio,Avg_user_time,Std_user_time,Avg_nice_time,Std_nice_time,Avg_system_time,Std_system_time,Avg_iowait_time,Std_iowait_time,Avg_ideal_time,Std_ideal_time,Avg_tps,Std_tps,Avg_rtps,Std_rtps,Avg_wtps,Std_wtps,Avg_ldavg_1,Std_ldavg_1,Avg_kbmemused,Std_kbmemused,Avg_num_Proc/s,Std_num_proc/s,Avg_num_cswch/s,std_num_cswch/s,OSSEC_alert,OSSEC_alert_level,Login_attempt,Succesful_login,File_activity,Process_activity,read_write_physical.process,is_privileged,class1,class2,class3
0,9/01/2020,1578540956,192.168.2.199,49278,192.168.2.10,80,tcp,http,0.67369,13437,34924,1,0,True,True,True,True,True,False,False,105,18905,105,40392,True,107658,210,311.7160712,159803.4704,0.5,0.5,0.300414275,0.699585725,9.207,5.55584206,10.994,1.356305275,4.864,1.873004004,0.311,0.224653066,74.624,8.245611196,12.297,10.38585004,8.0,10.50714043,4.297,2.723578712,2.146,0.102781321,915852.8,2507.97563,5.1,3.238826948,2806.2,158.7493622,1,5,0,0,0,0,0,0,Scanning_vulnerability,Reconnaissance,Attack
1,13/01/2020,1578871873,10.0.1.5,39769,131.236.3.92,53,udp,dns,8.3e-05,78,0,1,0,False,False,False,True,False,False,False,2,134,0,0,False,212,2,24096.38554,2554216.867,1,0,1,0,12.326,6.674247823,2.895,1.604302029,3.778,2.099741889,2.741,5.327471164,78.263,8.869399134,6.6,5.713142743,0.0,0.0,6.6,5.713142743,0.612,0.037629775,910498.4,4044.112046,1.0,0.894427191,1561.6,247.0802299,0,0,0,0,0,0,0,0,Normal,Normal,Normal
2,9/01/2020,1578522486,172.24.1.80,59050,172.24.1.1,53,udp,dns,0.000132,38,38,1,0,False,False,False,True,False,False,False,1,66,1,66,False,208,2,15151.51515,1575757.576,0.5,0.5,0.5,0.5,6.931,6.416007248,0.706,0.408905857,1.693,0.771635277,2.423,3.829809525,88.245,7.112108337,37.4,40.19004852,30.1,39.79811553,7.3,3.1,0.55,0.02,921020.4,2139.652645,1.0,0.0,1603.3,294.1390997,0,0,0,0,0,0,0,0,Normal,Normal,Normal
3,27/02/2020,1582757640,192.168.2.196,37966,192.168.2.10,1880,tcp,websocket,9.378481,1121,484,1,0,True,True,True,True,True,False,False,8,1545,6,804,False,3954,14,1.492779055,421.6034558,0.571428571,0.428571429,0.67425392,0.32574608,10.244,7.932040343,1.965,1.514168088,3.941,1.709055002,0.509,0.425522032,83.339,10.20608686,29.0,26.57442379,23.3,26.12680616,5.7,6.229767251,0.708,0.026,915874.8,977.8851466,7.2,5.325410782,1685.3,467.8737116,0,0,1,1,1,1,1,1,Normal,Normal,Normal
4,16/12/2019,1576452612,172.24.1.80,38233,172.24.1.1,53,udp,dns,7.4e-05,-,-,1,0,False,False,False,True,False,False,False,-,-,-,-,False,-,-,-,-,-,-,-,-,5.181,5.980458929,2.365,1.284462923,2.415,1.401151312,1.803,4.478256469,88.239,8.090017862,9.8,1.326649916,0.1,0.3,9.7,1.268857754,0.536,0.030066593,918213.2,2342.93836,0.6,0.489897949,1456.5,346.8847791,0,0,0,0,0,0,0,0,Normal,Normal,Normal


In [7]:
# All columns are strings at this point, so describe() reports count/unique/top/freq rather than numeric statistics.
df_raw.describe()

Unnamed: 0,Date,Timestamp,Scr_IP,Scr_port,Des_IP,Des_port,Protocol,Service,Duration,Scr_bytes,Des_bytes,Conn_state,missed_bytes,is_syn_only,Is_SYN_ACK,is_pure_ack,is_with_payload,FIN or RST,Bad_checksum,is_SYN_with_RST,Scr_pkts,Scr_ip_bytes,Des_pkts,Des_ip_bytes,anomaly_alert,total_bytes,total_packet,paket_rate,byte_rate,Scr_packts_ratio,Des_pkts_ratio,Scr_bytes_ratio,Des_bytes_ratio,Avg_user_time,Std_user_time,Avg_nice_time,Std_nice_time,Avg_system_time,Std_system_time,Avg_iowait_time,Std_iowait_time,Avg_ideal_time,Std_ideal_time,Avg_tps,Std_tps,Avg_rtps,Std_rtps,Avg_wtps,Std_wtps,Avg_ldavg_1,Std_ldavg_1,Avg_kbmemused,Std_kbmemused,Avg_num_Proc/s,Std_num_proc/s,Avg_num_cswch/s,std_num_cswch/s,OSSEC_alert,OSSEC_alert_level,Login_attempt,Succesful_login,File_activity,Process_activity,read_write_physical.process,is_privileged,class1,class2,class3
count,820503,820537,820834,820834,820834,820834,820834,820834,820834,820834,820834,820834,820834,820834,820834,820834,820834,820834,820834,820834,820834,820834,820834,820834,820834,820834,820834,820834,820834,820834.0,820834.0,820834,820834,820834.0,820834.0,820834.0,820834.0,820834.0,820834.0,820834.0,820834.0,820834.0,820834.0,820834,820834,820834,820834,820834,820834,820834.0,820834.0,820834.0,820834.0,820834,820834,820834.0,820834.0,820834,820834,820834,820834,820834,820834,820834,820834,820834,820834,820834
unique,49,218989,36,57460,96,6444,4,17,330207,21054,51899,2,231,2,2,2,2,2,1,1,1211,32872,1556,55574,3,62464,2216,303353,339658,19995.0,19995.0,72716,72716,12635.0,51646.0,4551.0,50541.0,5174.0,50456.0,10157.0,30703.0,19705.0,51725.0,3736,21771,2254,16492,2017,11124,2814.0,3450.0,28144.0,51523.0,1061,3649,14986.0,51588.0,2,6,2,2,2,2,2,2,19,10,2
top,24/02/2020,1578542119,192.168.2.199,5353,192.168.2.10,80,tcp,http,-,-,-,1,0,TRUE,FALSE,FALSE,TRUE,TRUE,FALSE,FALSE,1,-,1,0,FALSE,-,2,-,-,0.5,0.5,-,-,6.478,6.07937176,1.472,1.403424383,2.15,2.563215949,0.077,0.117647779,87.054,7.736539537,3,0,0,0,3,0,0.825,0.02,914762.8,2614.998616,0,0,1578.8,393.7089788,0,0,0,0,0,0,0,0,Normal,Normal,Normal
freq,147621,5631,156023,2889,312684,237640,422334,237738,75761,223871,223871,699041,820351,420448,497805,497546,711816,415483,820834,820834,249056,152743,208246,174182,740250,223872,260010,223871,223871,248977.0,248977.0,223872,223872,5776.0,5707.0,6071.0,5707.0,6365.0,5707.0,12923.0,13186.0,5754.0,5707.0,11726,11484,138135,138134,33190,32601,6766.0,37463.0,5755.0,5707.0,131979,147442,5804.0,5707.0,778819,778819,749171,752926,761200,753017,529184,753151,421417,421417,421417


In [8]:
#Cleaning
# Placeholder written wherever a value is missing or cannot be parsed.
# ip_to_octets also reads this module-level name.
replacement_number = np.nan

#Drop time related features
df_cleaned = df_raw.drop(["Date","Timestamp"],axis = 1)

# Parse numeric columns; unparseable entries (e.g. "-") become replacement_number.
for col in num_cols:
    df_cleaned[col] = df_cleaned[col].apply(lambda x: convert_to_number(x,replacement_number))
# Boolean columns keep their text values; only the "-" placeholder is replaced.
for col in bol_cols:
    df_cleaned[col] = df_cleaned[col].replace({"-" : replacement_number})

# Build each row filter once and reuse it for both the report and the removal.
ipv6_rows = df_cleaned["Scr_IP"].str.contains(":",  na = False)
no_data_rows = df_cleaned["Protocol"] == "?"

print("IPV6 deleted adresses count:")
x = int(ipv6_rows.sum())
print(x)
print(x/df_raw.shape[0])
print("no data entries count:")
x = int(no_data_rows.sum())
print(x)
print(x/df_raw.shape[0])

# Remove rows with an IPv6 source address and attack entries that contain no information.
df_cleaned = df_cleaned.loc[~ipv6_rows & ~no_data_rows].copy()
# .str.upper() leaves missing values as NaN instead of raising on them.
df_cleaned["Protocol"] = df_cleaned["Protocol"].str.upper()



IPV6 deleted adresses count:
592
0.000721217688351116
no data entries count:
154
0.00018761406082106735


In [9]:
# Number of rows with at least one missing value, and its share of the raw row count.
print("nan count:")
x = int(df_cleaned.isna().any(axis = 1).sum())
print(x)
print(x/df_raw.shape[0])

nan count:
224433
0.27342069163801697


In [10]:
#Replace boolean features
# Missing flags (including the "-" placeholders that cleaning turned into NaN) are treated as False.
for col in bol_cols:
    df_cleaned[col] = df_cleaned[col].replace({np.nan : False})

In [11]:
#Reload dataframe for automatic type detection
# Round-trip through an in-memory CSV so read_csv re-infers each column's dtype.
# The context manager closes the buffer even if writing or parsing raises.
with io.StringIO() as stream:
    df_cleaned.to_csv(stream, index=False)
    stream.seek(0)  # rewind so read_csv starts at the beginning of the buffer
    df_reloaded = pd.read_csv(stream)

In [12]:
# Columns remaining after cleaning (Date and Timestamp were dropped).
df_reloaded.columns

Index(['Scr_IP', 'Scr_port', 'Des_IP', 'Des_port', 'Protocol', 'Service',
       'Duration', 'Scr_bytes', 'Des_bytes', 'Conn_state', 'missed_bytes',
       'is_syn_only', 'Is_SYN_ACK', 'is_pure_ack', 'is_with_payload',
       'FIN or RST', 'Bad_checksum', 'is_SYN_with_RST', 'Scr_pkts',
       'Scr_ip_bytes', 'Des_pkts', 'Des_ip_bytes', 'anomaly_alert',
       'total_bytes', 'total_packet', 'paket_rate', 'byte_rate',
       'Scr_packts_ratio', 'Des_pkts_ratio', 'Scr_bytes_ratio',
       'Des_bytes_ratio', 'Avg_user_time', 'Std_user_time', 'Avg_nice_time',
       'Std_nice_time', 'Avg_system_time', 'Std_system_time',
       'Avg_iowait_time', 'Std_iowait_time', 'Avg_ideal_time',
       'Std_ideal_time', 'Avg_tps', 'Std_tps', 'Avg_rtps', 'Std_rtps',
       'Avg_wtps', 'Std_wtps', 'Avg_ldavg_1', 'Std_ldavg_1', 'Avg_kbmemused',
       'Std_kbmemused', 'Avg_num_Proc/s', 'Std_num_proc/s', 'Avg_num_cswch/s',
       'std_num_cswch/s', 'OSSEC_alert', 'OSSEC_alert_level', 'Login_attempt',
       

In [13]:
# Protocol values left after upper-casing and removing the "?" rows.
df_reloaded["Protocol"].unique()

array(['TCP', 'UDP', 'ICMP'], dtype=object)

In [14]:
#Preprocessing
# Fixed seed so the random imputation below gives the same result on every run.
imputation_seed = 42

#Dropping
# Only the binary class3 label is kept; the finer-grained class1/class2 labels are removed.
df_processed = df_reloaded.drop(["class1", "class2"],axis = 1)

#Imputing
# impute_by_target_col draws from NumPy's global random state, so seed it right before the call.
np.random.seed(imputation_seed)
df_processed = impute_by_target_col(df_processed,"Protocol",features_to_keep)

#Renaming for Netflow Datasets
# NOTE(review): Duration is renamed without any unit conversion. The raw preview shows values
# such as 0.67369 and 8.3e-05, which look like seconds rather than milliseconds; confirm whether
# a conversion is needed before this column is combined with other datasets.
df_processed = df_processed.rename(columns={
    "Protocol":"PROTOCOL",
    "Duration":"FLOW_DURATION_MILLISECONDS",
    "Scr_bytes":"OUT_BYTES",
    "Des_bytes":"IN_BYTES",
    "Scr_pkts":"OUT_PKTS",
    "Des_pkts":"IN_PKTS",
    "Scr_IP":"IPV4_SRC_ADDR",
    "Des_IP":"IPV4_DST_ADDR",
    "Scr_port":"L4_SRC_PORT",
    "Des_port":"L4_DST_PORT"
})

#Converting Attack Label to 0/1 (1 = "Attack", 0 = anything else)
df_processed["class3"] = (df_processed["class3"] == "Attack").astype("int64")
df_processed = df_processed.rename(columns={"class3":"Label"})

In [15]:
# Preview the imputed frame with its renamed columns and the 0/1 Label.
df_processed.head(5)

Unnamed: 0,IPV4_SRC_ADDR,L4_SRC_PORT,IPV4_DST_ADDR,L4_DST_PORT,PROTOCOL,Service,FLOW_DURATION_MILLISECONDS,OUT_BYTES,IN_BYTES,Conn_state,missed_bytes,is_syn_only,Is_SYN_ACK,is_pure_ack,is_with_payload,FIN or RST,Bad_checksum,is_SYN_with_RST,OUT_PKTS,Scr_ip_bytes,IN_PKTS,Des_ip_bytes,anomaly_alert,total_bytes,total_packet,paket_rate,byte_rate,Scr_packts_ratio,Des_pkts_ratio,Scr_bytes_ratio,Des_bytes_ratio,Avg_user_time,Std_user_time,Avg_nice_time,Std_nice_time,Avg_system_time,Std_system_time,Avg_iowait_time,Std_iowait_time,Avg_ideal_time,Std_ideal_time,Avg_tps,Std_tps,Avg_rtps,Std_rtps,Avg_wtps,Std_wtps,Avg_ldavg_1,Std_ldavg_1,Avg_kbmemused,Std_kbmemused,Avg_num_Proc/s,Std_num_proc/s,Avg_num_cswch/s,std_num_cswch/s,OSSEC_alert,OSSEC_alert_level,Login_attempt,Succesful_login,File_activity,Process_activity,read_write_physical.process,is_privileged,Label
0,192.168.2.199,49278.0,192.168.2.10,80.0,TCP,http,0.67369,13437.0,34924.0,1,0.0,True,True,True,True,True,False,False,105.0,18905.0,105.0,40392.0,True,107658.0,210.0,311.716071,159803.4704,0.5,0.5,0.300414,0.699586,9.207,5.555842,10.994,1.356305,4.864,1.873004,0.311,0.224653,74.624,8.245611,12.297,10.38585,8.0,10.50714,4.297,2.723579,2.146,0.102781,915852.8,2507.97563,5.1,3.238827,2806.2,158.749362,1,5,0,0,0,0,0,0,1
1,192.168.2.196,37966.0,192.168.2.10,1880.0,TCP,websocket,9.378481,1121.0,484.0,1,0.0,True,True,True,True,True,False,False,8.0,1545.0,6.0,804.0,False,3954.0,14.0,1.492779,421.603456,0.571429,0.428571,0.674254,0.325746,10.244,7.93204,1.965,1.514168,3.941,1.709055,0.509,0.425522,83.339,10.206087,29.0,26.574424,23.3,26.126806,5.7,6.229767,0.708,0.026,915874.8,977.885147,7.2,5.325411,1685.3,467.873712,0,0,1,1,1,1,1,1,0
2,192.168.10.155,50502.0,192.168.2.10,80.0,TCP,http,5.153928,460.0,671.0,1,0.0,True,True,True,True,True,False,False,12.0,1560.0,10.0,1878.0,False,4569.0,22.0,4.268589,886.50831,0.545455,0.454545,0.44211,0.55789,10.579,6.866914,3.273,2.366141,2.638,1.040921,0.279,0.210829,83.233,9.579212,19.3,18.423083,8.8,10.205881,10.5,13.529597,0.524,0.028,910370.8,4006.8817,0.9,0.538516,1366.9,304.062642,0,0,0,0,0,0,1,0,0
3,172.24.1.213,3467.0,161.69.36.37,80.0,TCP,http,3.011189,0.0,0.0,1,0.0,True,False,False,False,False,False,False,2.0,104.0,0.0,0.0,False,104.0,2.0,0.664189,34.537852,1.0,0.0,1.0,0.0,11.505,9.762796,1.795,0.970775,3.669,1.54324,2.928,7.630635,80.104,14.572305,22.4,20.323386,8.9,17.13155,13.5,12.531959,0.624,0.110109,912787.6,3127.432404,1.2,0.748331,1519.4,185.391046,0,0,0,0,0,0,0,0,0
4,192.168.2.199,42465.0,192.168.2.10,1880.0,TCP,websocket,0.003863,16554.93363,-25506.464239,1,0.0,True,True,True,True,True,False,False,-441.855635,,146.869271,,False,,,,,,,,,10.855,6.155793,12.885,3.109682,8.039,2.830076,2.735,5.399112,65.484,10.616435,18.8,16.521501,7.5,8.546929,11.3,12.814445,0.913,0.021,914442.8,1957.066417,13.8,7.9975,2889.6,717.9822,0,0,0,0,0,0,0,0,1


In [16]:
# Numeric summary after imputation. Check the min row: imputed values are drawn from an
# unbounded normal distribution, so they can fall below zero.
df_processed.describe()

Unnamed: 0,L4_SRC_PORT,L4_DST_PORT,FLOW_DURATION_MILLISECONDS,OUT_BYTES,IN_BYTES,Conn_state,missed_bytes,OUT_PKTS,Scr_ip_bytes,IN_PKTS,Des_ip_bytes,total_bytes,total_packet,paket_rate,byte_rate,Scr_packts_ratio,Des_pkts_ratio,Scr_bytes_ratio,Des_bytes_ratio,Avg_user_time,Std_user_time,Avg_nice_time,Std_nice_time,Avg_system_time,Std_system_time,Avg_iowait_time,Std_iowait_time,Avg_ideal_time,Std_ideal_time,Avg_tps,Std_tps,Avg_rtps,Std_rtps,Avg_wtps,Std_wtps,Avg_ldavg_1,Std_ldavg_1,Avg_kbmemused,Std_kbmemused,Avg_num_Proc/s,Std_num_proc/s,Avg_num_cswch/s,std_num_cswch/s,OSSEC_alert,OSSEC_alert_level,Login_attempt,Succesful_login,File_activity,Process_activity,read_write_physical.process,is_privileged,Label
count,820088.0,820088.0,820088.0,820088.0,820088.0,820088.0,820088.0,820088.0,667608.0,820088.0,667609.0,596579.0,667609.0,596580.0,596579.0,667609.0,667609.0,596579.0,596579.0,819589.0,819609.0,819609.0,819609.0,819604.0,819604.0,819604.0,819604.0,819609.0,819609.0,819596.0,819596.0,819596.0,819596.0,819596.0,819596.0,819562.0,819562.0,818976.0,818976.0,819609.0,819609.0,819609.0,819609.0,820088.0,820088.0,820088.0,820088.0,820088.0,820088.0,820088.0,820088.0,820088.0
mean,45808.197022,2240.209685,9.698792,2031.048,55061.29,0.85172,35.64816,26.031087,3082.449,37.177036,53317.08,121592.4,58.346737,29993.69,3193058.0,0.643616,0.356384,0.572309,0.427691,10.112034,5.855865,2.706363,1.547597,4.576969,1.634279,2.157225,3.577942,80.447911,9.481081,15.809487,13.034756,7.095183,9.547381,8.714321,5.905881,0.995593,0.058123,914156.365189,2490.516874,2.693628,2.958004,3945.771652,623.004722,0.051232,0.26796,0.087202,0.082623,0.072532,0.082511,0.355633,0.082347,0.486854
std,12237.626323,4846.902489,114.672244,8598.969,324882.8,0.355378,3981.357,312.928431,21366.36,303.239935,321979.5,676964.1,584.087353,94885.37,11321780.0,0.224373,0.224373,0.295219,0.295219,8.964108,2.397998,2.00238,0.841908,4.264697,0.878964,3.650195,5.710527,13.569503,4.467291,15.560577,14.16277,14.950015,14.892822,4.258546,5.128661,0.646491,0.081763,12382.925578,2366.669624,580.52437,820.973276,5268.494992,863.336952,0.220471,1.178535,0.28213,0.275311,0.259368,0.275141,0.478705,0.274893,0.499827
min,3.0,0.0,-709.37665,-46050.74,-1872586.0,0.0,0.0,-1753.826502,0.0,-1837.099938,0.0,56.0,1.0,0.02337215,1.562475,0.0,0.0,0.0,0.0,0.228,0.125064,0.0,0.0,0.507,-0.198966,0.0,-0.498306,0.0,0.0,-45.5,-19.152128,0.0,0.0,-46.3,-42.037441,0.02,-0.163691,528121.2,5.6,-11.9,-24.694497,-861.4,-1918.07033,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,39134.0,53.0,0.000186,37.0,0.0,1.0,0.0,1.0,75.0,0.0,0.0,204.0,2.0,5.384286,2927.566,0.5,0.0,0.414557,0.129909,4.308,4.354195,1.695,1.045053,2.155,1.072269,0.207,0.16857,76.228,6.754729,9.1,5.325411,0.1,0.3,4.697,1.417745,0.533,0.022271,913080.8,1617.490748,0.499,0.4,1333.3,233.883817,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,48346.0,80.0,0.024902,232.0,72.0,1.0,0.0,2.0,590.0,1.0,73.0,1470.0,3.0,342.6513,157172.5,0.529412,0.470588,0.507043,0.492957,6.65,5.69158,2.166,1.387738,2.753,1.426613,0.457,0.409141,85.708,8.666752,13.8,10.336827,3.8,6.103278,9.2,5.07937,0.795,0.035,915435.6,2134.767022,0.8,0.629314,1526.7,314.956521,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,54492.0,5355.0,5.909006,1556.33,1002.0,1.0,0.0,8.40479,1180.0,6.0,591.0,3628.0,11.0,13157.89,1229050.0,1.0,0.5,0.870091,0.585443,10.905,7.1465,2.817,1.771168,5.082,1.979456,2.735,5.40637,90.043,11.478169,20.8,16.973214,11.1,13.030104,11.6,8.953212,1.276,0.061847,918393.2,2847.97868,1.4,1.2,2372.7,468.96712,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
max,65535.0,65389.0,9331.420034,1665228.0,19359680.0,1.0,1882937.0,65541.0,4194548.0,65575.0,20110090.0,39163470.0,131116.0,3000000.0,306000000.0,1.0,1.0,1.0,1.0,42.814,29.770909,21.203,19.400489,74.15,32.679285,72.654,56.073424,98.59,60.44165,1053.7,533.356588,1047.6,531.959998,60.6,85.99907,4.307,1.516748,936797.6,429301.466,303432.0,429113.4235,21007.9,7250.787108,1.0,10.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [17]:
#Export
# Write the processed frame, without its index, to a CSV named after the dataset.
export_path = df_name+"_pre_aligned.csv"
df_processed.to_csv(export_path,index=False)