In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sagemaker
import boto3
import json
import math
import os
import sys

In [2]:
"""
Set up AWS resources.
"""
# Source bucket and the object key of the raw CSV inside it.
bucket = 'cic-network-data'
file_path = 'base_df.csv'

# Execution role reported by SageMaker for this session.
role = sagemaker.get_execution_role()

# Fetch the raw CSV from S3 into the working directory as 'raw_data.csv'.
s3 = boto3.resource('s3')
s3.Bucket(bucket).download_file(
    Key=file_path,
    Filename='raw_data.csv',
)

In [3]:
def load_file(file):
    """Read a CSV into a DataFrame.

    Parameters
    ----------
    file : str, path object or file-like object
        Passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of ``file``.
    """
    return pd.read_csv(file)

In [None]:
# Read the CSV that the AWS setup cell downloaded to the working directory.
loaded_df = load_file('./raw_data.csv')

In [None]:
def clean_df(df, upload=True):
    """Clean the raw flow records and persist a label-first copy as ``abt.csv``.

    The input frame is never modified; all work happens on a copy.

    Steps, in order:

    1. Replace missing values with 0.
    2. Drop rows whose ``Protocol`` value is the literal string ``'Protocol'``
       (presumably header lines repeated inside the CSV -- confirm against
       the raw data).
    3. Drop the eight columns listed in ``no_bias``.
    4. Convert every remaining column except ``Label`` and ``Timestamp`` to a
       numeric dtype; values that cannot be parsed become NaN.
    5. Encode ``Label`` as 0 for ``'Benign'`` and 1 for anything else.
    6. Parse ``Timestamp`` day-first and replace it with ``Hour``,
       ``Minute``, ``Second``, ``Day``, ``Month`` and ``Year`` columns.
    7. Write the result, with ``Label`` moved to the first column, to
       ``abt.csv`` and, when ``upload`` is true, upload that file to S3.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw records. Must contain ``Protocol``, ``Label``, ``Timestamp`` and
        every column named in ``no_bias``.
    upload : bool, default True
        When true, ``abt.csv`` is uploaded under the key ``abt.csv`` using
        the module-level ``s3`` resource and ``bucket`` name. Pass ``False``
        to only write the local file.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame. ``Label`` keeps its working position here; only
        the copy written to ``abt.csv`` has it moved to the front.

    Notes
    -----
    NOTE(review): the zero-fill runs before parsing, so values coerced to
    NaN/NaT by the numeric or datetime conversion are still missing in the
    output. A missing ``Label`` is filled with 0 and therefore encoded as 1.
    Confirm both are intended.
    """
    no_bias = ['Fwd Pkts/b Avg', 'Fwd Blk Rate Avg', 'Bwd URG Flags', 'Bwd PSH Flags',
               'Bwd Blk Rate Avg', 'Bwd Byts/b Avg', 'Bwd Pkts/b Avg', 'Fwd Byts/b Avg']
    date_parts = ('Hour', 'Minute', 'Second', 'Day', 'Month', 'Year')

    # fillna() without inplace returns a new frame, so the caller's data is
    # left exactly as it was passed in.
    df = df.fillna(0)
    df = df[df.Protocol != 'Protocol'].copy()

    # Drop by column name; dropping first avoids converting columns that are
    # about to be discarded.
    df = df.drop(columns=no_bias)

    numeric_cols = [col for col in df.columns if col not in ('Label', 'Timestamp')]
    for col in numeric_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # Binary target: 0 for 'Benign', 1 for every other value.
    df['Label'] = (df['Label'] != 'Benign').astype('int64')

    # Parse once and derive every calendar component from the same Series.
    timestamps = pd.to_datetime(df['Timestamp'], dayfirst=True, errors='coerce')
    for part in date_parts:
        df[part] = getattr(timestamps.dt, part.lower())
    df = df.drop(columns='Timestamp')

    # Select 'Label' by name instead of by position so it ends up first
    # regardless of where it sat in the input.
    label_first = ['Label'] + [col for col in df.columns if col != 'Label']
    abt = df[label_first]
    abt.to_csv('abt.csv', index=False)
    if upload:
        s3.Bucket(bucket).upload_file('abt.csv', 'abt.csv')
    return df

In [None]:
# Clean the raw frame; clean_df also writes 'abt.csv' locally and uploads it to S3.
cleaned_df = clean_df(loaded_df)

In [None]:
# NOTE(review): clean_df already uploads 'abt.csv' to this bucket under the same
# key, so this repeats that upload -- confirm whether it is still needed.
s3.Bucket(bucket).upload_file('abt.csv', 'abt.csv')