In [None]:
import pandas as pd
import numpy as np
from sklearn.neighbors import LocalOutlierFactor

import matplotlib.pyplot as plt
%matplotlib inline

### Load Data

In [None]:
%%time
#print('script started!',flush=True)
data = pd.read_csv('../Data/ISCX_Botnet_Labelled.csv')

In [None]:
data.shape

### Check class distribution: is it balanced?

In [None]:
# Explore BotNet_Label values
s = data['BotNet_Label'].value_counts()

In [None]:
s

In [None]:
# get a list of labels where the number of instances is > 10
labels = list(s[s > 10].index)

In [None]:
# remove BotNets where the number of instances is < 10
data = data[data['BotNet_Label'].isin(labels)]
data['BotNet_Label'].value_counts()

In [None]:
#data.columns

### Missing value imputation

In [None]:
## FlowGenerator uses ? for a missing value .. let's replace it with NaN
data.replace('?', np.NaN,inplace=True)
print('symbol ? replaced with NaN',flush=True)
# using isnull() function  
data.isnull().values.any()

In [None]:
data.isnull().sum().sum()

In [None]:
%%time
## Only run this code if your data contains NaNs
## Replace NaNs with the median of the column

#for c in data.columns:
#    if c != 'BotNet_Label':
#        data[c] = pd.to_numeric(data[c], errors='coerce')
#        data[c] = data[c].replace(np.NaN,data[c].median())


#save data so we can use it later
#data.to_csv('../Data/no_nans.csv',index=False)
#print('NaN values replaced with median in %f'%(t2-t1),flush=True)

### Plot Correlation Matrix to Check for Highly Correlated Features

In [None]:
## Flow-timing / rate features used for the correlation check
## (protocol and port columns are left out to save time)
tmp_df = data.loc[:, [
    'Flow Duration', 'Flow Bytes/s', 'Flow Packets/s',
    'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max', 'Flow IAT Min',
    'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min',
    'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min',
    'Active Mean', 'Active Std', 'Active Max', 'Active Min',
    'Idle Mean', 'Idle Std', 'Idle Max', 'Idle Min',
]]

In [None]:
# Draw the pairwise correlation of the selected features as a heat map
corr_fig = plt.figure(figsize=(19, 15))
plt.matshow(tmp_df.corr(), fignum=corr_fig.number)

# One tick per feature on both axes, labelled with the column names
tick_positions = range(tmp_df.shape[1])
plt.xticks(tick_positions, tmp_df.columns, fontsize=14, rotation=45)
plt.yticks(tick_positions, tmp_df.columns, fontsize=14)

# Colour scale legend with matching font size
colour_bar = plt.colorbar()
colour_bar.ax.tick_params(labelsize=14)

# Trailing semicolon suppresses the Text(...) repr in the cell output
plt.title('Correlation Matrix', fontsize=16);

#### Do it Programmatically

In [None]:
# Create correlation matrix (absolute values: strong negative correlation counts too)
corr_matrix = tmp_df.corr().abs()

# Select upper triangle of correlation matrix.
# k=1 excludes the diagonal, so each feature pair is inspected exactly once.
# The built-in bool is used because the np.bool alias was deprecated in
# NumPy 1.20 and removed in 1.24 (AttributeError on current releases).
upper_mask = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
upper = corr_matrix.where(upper_mask)

# Find index of feature columns with correlation greater than 0.95
# (masked cells are NaN and never compare greater than the threshold)
to_drop = [column for column in upper.columns if (upper[column] > 0.95).any()]
to_drop

#### Do you want to drop highly correlated features?

In [None]:
#remove highly correlated columns
#data.drop(to_drop,axis=1, inplace=True)

In [None]:
data.columns

In [None]:
%%time

Y = data['BotNet_Label']
data.drop('BotNet_Label', axis=1, inplace=True)

#remove single quote from column name
#rm_quote = lambda x: x.replace('\'', '')
#cols = data.columns
#data.columns = [rm_quote(x) for x in cols]

### Run Outlier Detection and Removal?

In [None]:
%%time
outliers_fraction = 0.5
rng = np.random.RandomState(42)

# fit the model
#clf = IsolationForest(contamination=outliers_fraction, random_state=rng, n_jobs=60)
#clf = EllipticEnvelope(contamination=outliers_fraction)
clf = LocalOutlierFactor(n_neighbors=25, contamination=outliers_fraction)
y_pred = clf.fit_predict(data)#only for LocalOutlierFactor


data['Outlier'] = y_pred
data['BotNet_Label'] = Y

outlier_mask = data['Outlier'].isin([-1])

print('To apply mask and removed outliters',flush=True)
data = data.loc[~outlier_mask]
data.drop('Outlier', axis=1, inplace=True)
print('data to be saved',flush=True)
#data.to_csv('../Data/no_outliers.csv',index=False)


In [None]:
data.shape

In [None]:
#save subdatasets
#labels = list(data['BotNet_Label'].unique())
#for label in labels:
#    tlbl = label.replace(" ", "_")#if label has space replace it with _
#    tdata = data[data['BotNet_Label']==label]
#    tdata.to_csv('../Data/Bot_'+tlbl+'.csv',index=False)
#    print('Done: ',label,len(tdata))

#print('all done', flush=True)

In [None]:
data['BotNet_Label'].value_counts()

In [None]:
# Explore BotNet_Label values
s = data['BotNet_Label'].value_counts()

# get a list of labels where the number of instances is > 10
labels = list(s[s > 10].index)

# remove BotNets where the number of instances is < 10
data = data[data['BotNet_Label'].isin(labels)]
data['BotNet_Label'].value_counts()

In [None]:
#Y.value_counts()

### SMOTE (Synthetic Minority Oversampling Technique) – Oversampling

* Oversampling aims to balance the class distribution by increasing the number of minority-class examples; naive random oversampling does this by simply replicating existing examples.
* SMOTE synthesises new minority instances between existing minority instances. 
* It generates the virtual training records by linear interpolation for the minority class.
* These synthetic training records are generated by randomly selecting one or more of the k-nearest neighbors for each example in the minority class. 

In [None]:
#!pip install -U imbalanced-learn

In [None]:
%%time
#from imblearn.over_sampling import SMOTE 
#sm = SMOTE(random_state = 2) 
# separate features from class variable
y = data['BotNet_Label']
X = data.drop('BotNet_Label', axis=1)

X, y = sm.fit_resample(X, y)

In [None]:
y.shape

In [None]:
#type(X)

In [None]:
y.value_counts()

In [None]:
targets = np.unique(y).tolist()
len(targets)

### Principal Component Analysis (PCA)

In [None]:
# Apply StandardScaler
from sklearn.preprocessing import StandardScaler
X = StandardScaler().fit_transform(X)

In [None]:
# Apply PCA from sklearn
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
principal_components = pca.fit_transform(X)
# create a datarame that contains the PCA components
principal_df = pd.DataFrame(data = principal_components, columns=['principal component 1','principal component 2'])

In [None]:
# add the class label to the PCA components
principal_df['BotNet_Label'] = y

In [None]:
principal_df.head()

In [None]:
principal_df['BotNet_Label'].value_counts()

In [None]:
from itertools import cycle

fig = plt.figure(figsize = (8,8))
ax = fig.add_subplot(1,1,1) 
ax.set_xlabel('Principal Component 1', fontsize = 15)
ax.set_ylabel('Principal Component 2', fontsize = 15)
ax.set_title('2 component PCA', fontsize = 20)
#targets = ['Neris', 'Normal', 'TBot', 'RBot']
#colors = ['r', 'g', 'b','c']


cycol = cycle('bgrcmk')

    
for target in targets:
    indicesToKeep = principal_df['BotNet_Label'] == target
    ax.scatter(principal_df.loc[indicesToKeep, 'principal component 1']
               , principal_df.loc[indicesToKeep, 'principal component 2']
               , c = next(cycol)
               , s = 50)
ax.legend(targets)
ax.grid()

In [None]:
targets

### Use your Favourite Classifier to Make Predictions

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# Hold out 25% of the rows for testing; stratify on y so that both parts keep
# the same class proportions, and fix the seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.25,
    random_state=0,
    stratify=y,
)

# Fit a Gaussian naive Bayes model on the training part
gnb = GaussianNB().fit(X_train, y_train)
gnb

In [None]:
%%time
#from sklearn.metrics import confusion_matrix
#import matplotlib.pyplot as plt
#import seaborn as sns

y_pred = gnb.predict(X_test)

#conf_mat = confusion_matrix(y_test, y_pred)
# Plot confusion_matrix
#fig, ax = plt.subplots(figsize=(15, 10))
#sns.heatmap(conf_mat, annot=True, cmap = "Set3", fmt ="d",
#xticklabels=targets, yticklabels=targets)
#plt.ylabel('Actual')
#plt.xlabel('Predicted')
#plt.show()


In [None]:
from sklearn.metrics import accuracy_score

print("Accuracy: " , (accuracy_score(y_test, y_pred)))

# Well Done!