## Load imports

In [None]:
%run Imports\&functions.ipynb

## Data Analysis

In [None]:
# Paths of the binetflow capture files to load, one entry per capture.
binetflow_list = [
    "binetflow/capture20110810.binetflow",
    "binetflow/capture20110811.binetflow",
    "binetflow/capture20110812.binetflow",
    "binetflow/capture20110815-2.binetflow",
    "binetflow/capture20110815-3.binetflow",
    "binetflow/capture20110815.binetflow",
    "binetflow/capture20110816-2.binetflow",
    "binetflow/capture20110816-3.binetflow",
    "binetflow/capture20110816.binetflow",
    "binetflow/capture20110817.binetflow",
    "binetflow/capture20110818.binetflow",
    "binetflow/capture20110818-2.binetflow",
    "binetflow/capture20110819.binetflow",
]

# Read every capture into its own DataFrame, in the order listed above.
# Keeping them in a list makes the per-frame clean-up loops below simple.
df_list = [pd.read_csv(binetflow) for binetflow in binetflow_list]

# Later cells refer to the frames by name, so bind df1...df13 explicitly
# instead of injecting them through globals(). The unpacking also fails
# loudly if the number of capture files ever stops matching the names.
(df1, df2, df3, df4, df5,
 df6, df7, df8, df9, df10,
 df11, df12, df13) = df_list

In [None]:
# Remove every row that has at least one null value, modifying each frame
# in place. Most nulls come from a single column, but dropping that column
# lost more information than dropping the affected rows.
for df in df_list:
    df.dropna(axis=0, how="any", inplace=True)

In [None]:
# Turn the string Label column into a boolean for modeling:
# True when the label text contains "Botnet", False otherwise.
for df in df_list:
    df["Label"] = df["Label"].str.contains("Botnet")

In [None]:
# StartTime and SrcAddr are not used in this analysis; drop both columns
# from every frame (in place) to save space and memory.
for df in df_list:
    df.drop(columns=['StartTime', 'SrcAddr'], inplace=True)

In [None]:
# Stack the first six captures into one frame for use in the cloud instance.
# pd.concat replaces the chained DataFrame.append calls: append was removed
# in pandas 2.0, and a single concat avoids re-copying the growing frame on
# every step. Default arguments keep the original row labels, exactly as
# append did.
total_df = pd.concat([df1, df2, df3, df4, df5, df6])

In [None]:
# Add the remaining seven captures to the combined frame.
# This step is kept in its own cell (rather than one loop over all frames)
# so that shape and errors can be checked between the two batches.
# pd.concat is used because DataFrame.append was removed in pandas 2.0;
# default arguments keep the original row labels, exactly as append did.
total_df = pd.concat([total_df, df7, df8, df9, df10, df11, df12, df13])

In [None]:
# Display the (rows, columns) size of the combined frame.
total_df.shape

In [None]:
# Display the dtype of every column in the combined frame.
total_df.dtypes

In [None]:
# Show the relative frequency of each Label value (the modeling target y)
# to see how imbalanced the classes are.
total_df["Label"].value_counts(normalize=True)

In [None]:
# Identical rows showed up once all frames were combined; keep only the
# first occurrence of each, modifying total_df in place.
total_df.drop_duplicates(subset=None, keep="first", inplace=True)

## Feature Engineering

In [None]:
# Regex matching the first two octets of a dotted IPv4 address, including
# the trailing dot (e.g. "147.32."). Written as a raw string so that \d and
# \. reach the regex engine untouched instead of triggering Python's
# invalid-escape-sequence warning; the pattern text itself is unchanged.
pattern = r"^\d{1,3}\.\d{1,3}\."
# Replace every destination address with its two-octet prefix.
# NOTE(review): findall(...)[0] raises IndexError for any value that does not
# start with two dotted octets -- confirm such rows cannot reach this cell.
total_df['DstAddr'] = total_df['DstAddr'].map(lambda x: re.findall(pattern, x)[0])

In [None]:
# One-hot encode State, Proto and Dir -- the categorical columns that do not
# have value counts over 30. drop_first=True omits the first level of each
# column from the generated dummies.
total_df = pd.get_dummies(
    data=total_df,
    columns=['State', 'Proto', 'Dir'],
    drop_first=True,
)

In [None]:
# Random sampling of the two classes of Label, the value we want to predict.
# READ THIS: n_samples is kept deliberately low (50000 per class) so the
# notebooks can run on laptops.
# NOTE(review): resample is not defined in this cell; presumably it is
# sklearn.utils.resample brought in by the imports notebook -- confirm.

# Split the flows by class: Label == True is the minority side,
# Label == False the majority side.
df_minority = total_df.loc[total_df["Label"] == True]
df_majority = total_df.loc[total_df["Label"] == False]

# Draw 50000 minority rows, sampling with replacement; the fixed seed makes
# the draw reproducible.
df_minority_upsampled = resample(
    df_minority,
    n_samples=50000,
    replace=True,
    random_state=42,
)

# Draw 50000 majority rows so both classes end up the same size. Note that
# this draw is also made with replacement, using the same fixed seed.
df_majority_downsampled = resample(
    df_majority,
    n_samples=50000,
    replace=True,
    random_state=42,
)
 

In [None]:
# Sanity check: size of the Label == True rows in the resampled minority frame.
df_minority_upsampled.loc[df_minority_upsampled["Label"] == True].shape

In [None]:
# Sanity check: size of the Label == False rows in the resampled majority frame.
df_majority_downsampled.loc[df_majority_downsampled["Label"] == False].shape

In [None]:
# Stack the resampled minority rows on top of the resampled majority rows
# to form the balanced working set.
sampled_df = pd.concat(
    objs=[df_minority_upsampled, df_majority_downsampled],
)

In [None]:
# Sanity check: size of the Label == True rows after combining both classes.
sampled_df.loc[sampled_df["Label"] == True].shape

In [None]:
# For each of the three remaining categorical columns, take the 100 most
# frequent values (value_counts sorts by descending frequency).
top_dstaddr = sampled_df["DstAddr"].value_counts().head(100).index
top_dport = sampled_df["Dport"].value_counts().head(100).index
top_sport = sampled_df["Sport"].value_counts().head(100).index

# Build the dummy columns for those values.
# NOTE(review): dum_sign is defined outside this cell (presumably in the
# imports/functions notebook); the join in the next cell suggests it returns
# a DataFrame -- confirm.
dstaddr_dummies = dum_sign(top_dstaddr)
dport_dummies = dum_sign(top_dport)
sport_dummies = dum_sign(top_sport)

In [None]:
# Attach the destination-address and destination-port dummies to sampled_df
# (index-aligned joins, applied in that order).
sampled_df = sampled_df.join(dstaddr_dummies).join(dport_dummies)
# The source-port dummies join is currently disabled:
# sampled_df= sampled_df.join(sport_dummies)

In [None]:
# Sanity check: size of the Label == True rows after joining the dummies.
sampled_df.loc[sampled_df["Label"] == True].shape

In [None]:
# The raw DstAddr, Dport and Sport columns have been dummied above; drop
# them to save memory.
sampled_df = sampled_df.drop(columns=['DstAddr', 'Dport', 'Sport'])