**Importing Libraries**

In [None]:
import math, time, random, datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
# matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8*' and
# later releases dropped the old names, so choose whichever name this
# installation actually provides instead of hard-coding 'seaborn'.
_SEABORN_STYLE = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
plt.style.use(_SEABORN_STYLE)
import warnings
# NOTE: this silences every warning for the rest of the session.
warnings.filterwarnings('ignore')
import seaborn as sns
import missingno
import pickle
# Record the matplotlib version next to the outputs for reproducibility.
print('matplotlib: {}'.format(matplotlib.__version__))

#### Read the training and test set.

In [None]:
import os

# Show every file available under the Kaggle input mount.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# Read the two UNSW-NB15 partitions, training set first.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/unsw-nb15/UNSW_NB15_training-set.csv',
        '/kaggle/input/unsw-nb15/UNSW_NB15_testing-set.csv',
    )
)

In [None]:
# (rows, columns) of each partition, one per line.
print(train.shape, test.shape, sep='\n')

In [None]:
# Stack both partitions row-wise so every later step sees the full dataset.
# The index is not reset (ignore_index is left at its default).
data = pd.concat(objs=[train, test], axis=0)
print(data.shape)
data.dtypes

In [None]:
# Names of the string-typed (object) columns; used further down as the columns to one-hot encode.
cols_cat = data.select_dtypes(include='object').columns
# Names of the numeric columns. Uses the public select_dtypes API instead of the
# private DataFrame._get_numeric_data(); 'bool' is included because the private
# helper also treated boolean columns as numeric.
cols_numeric = data.select_dtypes(include=['number', 'bool']).columns

In [None]:
# Categorical column names first, numeric column names second.
print(cols_cat, cols_numeric, sep='\n')

In [None]:
# Frequency table of every column, printed one after another.
for _name, values in data.items():
    print(values.value_counts())
    

In [None]:
# Persist the summary statistics so they can be inspected outside the notebook.
summary = data.describe()
summary.to_csv("describe_NB15.csv")

# Missing Values:

   <ul>
        <li>
        Check for missing values.
        </li>
         <li>
        Replace those missing values.
        </li>
    </ul>

In [None]:
# Number of null entries in each column.
null_counts = data.isnull().sum()
print(null_counts)

In [None]:
# Draw missingno's matrix plot for the combined frame as a visual check for nulls.
missingno.matrix(data)

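The data contains no null (NaN) values. Placeholder entries such as '-' in the categorical columns are not counted as nulls and are handled in a later step.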

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

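Dropping the 'label' feature: `label` is the prediction target, so it is separated from the feature matrix before PCA and modelling (see the `X` / `Y` split further down).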

**PCA** 

# Insights and steps: 
   <ul>
        <li>
        Data is clean.
        </li>
         <li>
        Data still needs further processing: the categorical columns must be one-hot encoded.
                     E.g., 'service' takes several values such as ftp, http, and '-' (meaning not available / none). We treat '-' as a missing value and replace it with 'None' instead of dropping the whole column.
        </li>
            <li>
        Removing unnecessary features like 'id'.
        </li>
    </ul>

In [None]:
# List the distinct values of 'proto' to decide how the column should be treated.
data['proto'].unique() # This is definitely a categorical feature.

In [None]:
data['service'].unique()  # 'service' uses '-' as a placeholder; it is replaced below
# Swap the '-' placeholder for the explicit category 'None'.
is_placeholder = data['service'] == '-'
data['service'] = np.where(is_placeholder, 'None', data['service'])
print(data['service'].unique())

In [None]:
# List the distinct values of 'state'.
data['state'].unique() # Keep this column as is.

##### Now, let's try to automate this process.

In [None]:
def Remove_dump_values(data, cols):
    """Return a copy of ``data`` in which the '-' placeholder is replaced by 'None'.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to clean. It is not modified; the cleaned result is a new frame.
    cols : iterable of column labels
        Columns to scan for the '-' placeholder. Numeric columns are skipped,
        so passing every column of the frame is safe.

    Returns
    -------
    pandas.DataFrame
        Copy of ``data`` with '-' replaced by 'None' in the non-numeric
        columns listed in ``cols``; all other values and dtypes are unchanged.
    """
    cleaned = data.copy()
    for col in cols:
        column = cleaned[col]
        # np.where(mask, 'None', numeric_values) has to find a common dtype for a
        # string and a number, which turns every number into a string. A numeric
        # column cannot contain '-' anyway, so leave it alone.
        if pd.api.types.is_numeric_dtype(column):
            continue
        cleaned[col] = np.where(column == '-', 'None', column)
    return cleaned

In [None]:
# Run the placeholder clean-up over every column of the combined frame.
cols = data.columns
data_bin = Remove_dump_values(data=data, cols=cols)

**Removing unnecessary features:**

In [None]:
# 'id' is removed as an unnecessary feature.
data_bin = data_bin.drop(columns=['id'])

##### Categorical Features:
One Hot Encoding using cols_cat

In [None]:
# Remove 'attack_cat' from the frame in place; only 'label' is kept as the target later on.
data_bin.drop(columns=['attack_cat'], inplace=True)

In [None]:
# Keep the list of columns to encode in sync with the frame: 'attack_cat' is gone.
cols_cat = cols_cat.drop(labels=['attack_cat'])

In [None]:
# correlation = data_bin.corr()
# print(type(correlation))
# fig = plt.figure(figsize=(18,12))
# ax = plt.subplot(1,1,1)
# sns.heatmap(correlation.values,ax= ax, cmap='coolwarm')
# # ax = sns.heatmap(data_bin, hue=data_bin.to_list)
# # ax.legend(title='H')  # add a title to the legend
# # plt.title('Correlation between different fearures')
# # # sns.heatmap(data_bin.corr())

**Do one-hot encoding**

In [None]:
# Expand each column named in cols_cat into indicator columns.
data_bin_hot = pd.get_dummies(data=data_bin, columns=cols_cat)

In [None]:
# (rows, columns) of the frame after one-hot encoding.
data_bin_hot.shape

In [None]:
# Target vector first, then the feature matrix (everything except 'label').
Y = data_bin_hot['label']
X = data_bin_hot.drop(columns=['label'])


In [None]:
# Fit a full PCA and plot how much variance the leading components capture.
# NOTE(review): the features are passed in unscaled; PCA is scale-sensitive, so
# consider standardising X first (StandardScaler is already imported above).
pca = PCA().fit(X.values)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

fig, ax = plt.subplots()
ax.set_xlim(0, 10)  # only the leading components are shown
ax.plot(cumulative_variance)
ax.set_xlabel('number of components')
ax.set_ylabel('cumulative explained variance');

In [None]:
import matplotlib.pyplot as plt
from matplotlib.figure import Figure

In [None]:
# A module-level `global X` statement was dropped from this cell: `global` only
# has an effect inside a function body, so at top level it did nothing.
from sklearn import metrics
from sklearn import model_selection

In [None]:
def feature_plot(imp, feature_names=None):
    """Plot feature importances ordered from most to least important.

    Parameters
    ----------
    imp : array-like
        One importance score per feature, in the same order as the feature names.
    feature_names : sequence of str, optional
        Labels matching ``imp``. When omitted, the columns of the notebook-level
        feature matrix ``X`` are used, which is what the function did before
        this parameter existed.

    Returns
    -------
    None
        The plot is drawn on a new matplotlib figure.
    """
    if feature_names is None:
        # Backward-compatible default: read the labels from the notebook's X.
        feature_names = X.columns
    fimp = pd.DataFrame({'Feature': feature_names, 'Importance': np.round(imp, 5)})
    fimp = fimp.sort_values(by='Importance', ascending=False)
    plt.figure(figsize=(30, 15))
    plt.plot(fimp['Feature'], fimp['Importance'])
    plt.xlabel('Feature')
    plt.ylabel('Importance')
    # Feature names are long; rotate them so they do not overlap.
    plt.xticks(rotation=90)

In [None]:
def fit_algo(algo, x, y, cv):
    """Fit ``algo`` on ``(x, y)`` and report training and cross-validated accuracy.

    Parameters
    ----------
    algo : estimator
        Object exposing ``fit`` and ``score``; it is also handed to
        ``model_selection.cross_val_predict``.
    x : array-like
        Feature matrix.
    y : array-like
        Target values aligned with ``x``.
    cv : int or cross-validation splitter
        Forwarded unchanged to ``cross_val_predict``.

    Returns
    -------
    tuple
        ``(y_pred, acc, acc_cv, model)`` where ``y_pred`` holds the
        cross-validated predictions, ``acc`` is ``model.score(x, y)`` as a
        percentage (scored on the same data the model was fitted on, so it is
        optimistic), ``acc_cv`` is the accuracy of ``y_pred`` against ``y`` as
        a percentage, and ``model`` is the fitted estimator.
    """
    # Fit on the full data set.
    model = algo.fit(x, y)

    # Score on the data the model was trained on.
    acc = round(model.score(x, y) * 100, 2)

    # Out-of-fold predictions, computed in parallel on all cores.
    y_pred = model_selection.cross_val_predict(algo, x, y, cv=cv, n_jobs=-1)

    # Compare against the `y` that was passed in, not the notebook-level `Y`,
    # so the function is correct for any data set it is called with.
    acc_cv = round(metrics.accuracy_score(y, y_pred) * 100, 2)

    return y_pred, acc, acc_cv, model

In [None]:
from sklearn.ensemble import RandomForestClassifier
# A fixed random_state makes the forest, and therefore the reported scores and
# feature importances, reproducible across Restart & Run All.
pred_now, acc_rf, acc_cv_rf, rf = fit_algo(
    RandomForestClassifier(n_estimators=100, random_state=42), X, Y, 10
)

In [None]:
# Plot the random-forest importances, scaled by 100.
feature_plot(rf.feature_importances_*100)

In [None]:
# Use the fully qualified option name: the bare pattern "max_rows" matches more
# than one option on newer pandas releases and is rejected as ambiguous.
pd.set_option("display.max_rows", 30)
imp = rf.feature_importances_ * 100
fimp = pd.DataFrame({'Feature': X.columns, 'Importance': np.round(imp, 5)})
fimp = fimp.sort_values(by='Importance', ascending=False)
# Rank from 1, sized from the data instead of a hard-coded feature count.
fimp.index = np.arange(1, len(fimp) + 1)
print(fimp.head(30))

In [None]:
# Emits the text "col: 4".
print('col', '4', sep=': ')