<img src="http://imgur.com/1ZcRyrc.png" style="float: left; margin: 20px; height: 55px">

# Capstone: Network Traffic Classification

**Project notebooks:**<br>
1 - [Data Import and Cleansing](./01_Data_import_cleaning)<br>
2 - [Exploratory Data Analysis](./02_EDA)<br>
3 - Preprocessing and Modeling (current notebook)

### Data import

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import joblib
import shap

pd.set_option('display.max_columns', None)

In [None]:
# Load the train and test objects from their pickle files.
# NOTE(review): joblib.load unpickles arbitrary objects - only load files you produced yourself.
train, test = (
    joblib.load(pickle_path)
    for pickle_path in (
        '../dataset/s1_final_train.pickle',
        '../dataset/s1_final_test.pickle',
    )
)

In [None]:
# Show the (rows, columns) shape of both loaded frames.
display(train.shape, test.shape)

### Pre-processing

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from imblearn.over_sampling import SMOTE

In [None]:
# Separate the 'label' target column from the feature columns in both frames.
X, y = train.drop(columns='label'), train['label']
test_x, test_y = test.drop(columns='label'), test['label']

# Sanity check: features and target of each frame must have the same number of rows.
display(X.shape, y.shape, test_x.shape, test_y.shape)

In [None]:
# Hold out 30% of the rows for validation. Stratifying on y keeps the class
# proportions equal in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [None]:
display(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

### Baseline accuracy

In [None]:
y_test.value_counts(normalize=True)

In [None]:
# One-hot encode the categorical (object-dtype) columns.
# The encoder is fitted on the training split only, so categories that appear
# only in the validation/test data are encoded as all zeros (handle_unknown="ignore").
cat_cols = X_train.select_dtypes(include="object").columns

# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword; try the new spelling first and fall back for older installs.
try:
    enc = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
except TypeError:
    enc = OneHotEncoder(handle_unknown="ignore", sparse=False)

enc.fit(X_train[cat_cols])

# Select the same columns, in the same order, from every frame so the encoder
# always sees exactly the features it was fitted on. The original row index is
# kept so the encoded frames can be concatenated with the scaled numeric ones.
enc_X_train = pd.DataFrame(
    enc.transform(X_train[cat_cols]),
    columns=enc.get_feature_names_out(),
    index=X_train.index,
)

enc_X_test = pd.DataFrame(
    enc.transform(X_test[cat_cols]),
    columns=enc.get_feature_names_out(),
    index=X_test.index,
)

enc_test = pd.DataFrame(
    enc.transform(test_x[cat_cols]),
    columns=enc.get_feature_names_out(),
    index=test_x.index,
)

In [None]:
display(enc_X_train.shape)
display(enc_X_test.shape)
display(enc_test.shape)

In [None]:
# Min-max scale every non-object column. The scaler is fitted on the training
# split only and then applied unchanged to the validation and test frames.
MMsc = MinMaxScaler().fit(X_train.select_dtypes(exclude=object))
scaled_cols = MMsc.get_feature_names_out()

# Each scaled frame keeps its source frame's row index.
MM_X_train = pd.DataFrame(
    MMsc.transform(X_train.select_dtypes(exclude=object)),
    index=X_train.index,
    columns=scaled_cols,
)
MM_X_test = pd.DataFrame(
    MMsc.transform(X_test.select_dtypes(exclude=object)),
    index=X_test.index,
    columns=scaled_cols,
)
MM_test = pd.DataFrame(
    MMsc.transform(test_x.select_dtypes(exclude=object)),
    index=test_x.index,
    columns=scaled_cols,
)

In [None]:
# Put the one-hot encoded columns and the min-max scaled columns side by side
# (column-wise concat, rows aligned on the shared index) for each data split.
new_X_train, new_X_test, new_test = (
    pd.concat([encoded, scaled], axis=1)
    for encoded, scaled in (
        (enc_X_train, MM_X_train),
        (enc_X_test, MM_X_test),
        (enc_test, MM_test),
    )
)

In [None]:
display(new_X_train.shape)
display(new_X_test.shape)
display(new_test.shape)

### PCA for dimensionality reduction

In [None]:
# PCA is imported here because this cell runs before the model-import cell
# further down; without it a fresh "Restart & Run All" stops with a NameError.
from sklearn.decomposition import PCA

# Use PCA to reduce the dimensions and speed up the ML algorithms without
# giving up much of the explained variance. A float in (0, 1) asks PCA to keep
# as many components as are needed to explain that fraction of the variance,
# here 90%.
# NOTE(review): in the visible notebook pca_model is only fitted and inspected;
# none of the pipelines below use it.
pca_model = PCA(0.9)

In [None]:
pca_model.fit(new_X_train)

In [None]:
# to check to make sure that the explained variance is 90%
np.sum(pca_model.explained_variance_ratio_)

In [None]:
# Number of features after PCA
pca_model.n_components_ 

### Pipelines and Parameters

In [None]:
# Import libraries
from sklearn.decomposition import PCA
from imblearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report, accuracy_score, f1_score, recall_score, precision_score, roc_curve, roc_auc_score
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

### Pipeline and Hyperparameter Tuning

In [None]:
# Every stochastic step gets the same fixed seed (the value already used for the
# train/validation split) so that re-running the notebook reproduces the models.
SEED = 42

# Logistic Regression - the only pipeline that oversamples with SMOTE.
# imblearn's Pipeline applies the sampler during fit only, never at predict time.
pipe_lr = Pipeline([
    ('sm', SMOTE(random_state=SEED)),
    ('lr', LogisticRegression(solver='liblinear', random_state=SEED)),
])

# Random Forest
pipe_rf = Pipeline([('rf', RandomForestClassifier(n_jobs=-1, random_state=SEED))])

# SVM (the seed matters once probability=True is set through the parameter grid)
pipe_svm = Pipeline([('svm', SVC(random_state=SEED))])

# CatBoost
pipe_cat = Pipeline([('cat', CatBoostClassifier(verbose=False, random_seed=SEED))])

# XGBoost
pipe_xgb = Pipeline([('xgb', XGBClassifier(n_jobs=-1, random_state=SEED))])

# AdaBoost
pipe_ada = Pipeline([('ada', AdaBoostClassifier(random_state=SEED))])

# GradientBoost
pipe_gb = Pipeline([('gb', GradientBoostingClassifier(random_state=SEED))])

In [None]:
# Hyper-parameter grids for GridSearchCV. Keys follow the
# '<pipeline step name>__<parameter>' convention, so each prefix must match the
# step name used when the corresponding pipeline was built.

# Logistic Regression: 10 evenly spaced values of C between 0.1 and 5
params_lr = {'lr__C': np.linspace(0.1, 5, 10)}

# Random Forest
params_rf = {
    'rf__max_depth': [2, 4, 6],
    'rf__n_estimators': [150, 250, 350],
    #'rf__min_samples_leaf': [2, 4],
    #'rf__min_samples_split' : [1, 2]
    'rf__class_weight': ['balanced', {0: 1, 1: 200}],
}

# Support Vector Machine: 10 log-spaced values of C between 0.01 and 10
params_svm = {
    'svm__C': np.logspace(-2, 1, 10),
    'svm__gamma': ['scale', 'auto'],
    'svm__class_weight': ['balanced', {0: 1, 1: 200}],
    'svm__probability': [True],  # model_plot calls predict_proba
}

# CatBoost
params_cat = {
    'cat__depth': [2, 4, 6],
    'cat__learning_rate': [0.1, 0.05, 0.01],
    'cat__iterations': [150, 250, 350],
    'cat__auto_class_weights': ['Balanced'],
}

# AdaBoost
params_ada = {
    'ada__n_estimators': [150, 250, 350],
    'ada__learning_rate': [0.1, 0.05, 0.01],
}

# Gradient Boosting
params_gb = {
    'gb__n_estimators': [150, 250, 350],
    'gb__max_depth': [2, 4, 6],
    'gb__learning_rate': [0.15, 0.1, 0.05],
    #'gb__min_samples_split': [2, 4],
    #'gb__min_samples_leaf': [1, 2]
}

# XGBoost
params_xgb = {
    'xgb__n_estimators': [150, 250, 350],
    'xgb__eta': [0.1, 0.05, 0.01],
    'xgb__max_depth': [2, 4, 6],
    'xgb__eval_metric': ['error', 'auc'],
}

### Modelling

In [None]:
def model_fit(model_name, X_train, y_train, pipe, param):
    """Grid-search a pipeline and save the best estimator to disk.

    Runs a 3-fold GridSearchCV scored on F1 over ``param`` and writes the
    refitted best estimator to ``../model/<model_name>.pickle``.

    Parameters
    ----------
    model_name : str
        Base name of the pickle file (no directory, no extension).
    X_train, y_train
        Training features and target passed to ``GridSearchCV.fit``.
    pipe
        Pipeline to tune.
    param : dict
        Parameter grid; keys use the ``step__parameter`` naming of ``pipe``.

    Returns
    -------
    None
        The fitted model is only available through the saved file.
    """
    import os  # local import: only needed to prepare the output directory

    filename = f'../model/{model_name}.pickle'
    # Create the output directory before the (potentially very long) search so
    # the run cannot fail at the very end just because the folder is missing.
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    gs = GridSearchCV(pipe, param, cv=3, scoring='f1', n_jobs=-1)
    gs.fit(X_train, y_train)

    joblib.dump(gs.best_estimator_, filename)

In [None]:
def _pct_drop(train_score, test_score):
    """Relative drop from train to test score in percent (NaN if the train score is 0)."""
    if train_score == 0:
        return float('nan')
    return round((train_score - test_score) / train_score * 100, 3)


def _split_scores(y_true, y_pred, y_score):
    """Return the five metrics for one data split, each rounded to 3 decimals."""
    return {
        'accuracy': round(accuracy_score(y_true, y_pred), 3),
        'recall': round(recall_score(y_true, y_pred), 3),
        'precision': round(precision_score(y_true, y_pred), 3),
        'f1': round(f1_score(y_true, y_pred), 3),
        # ROC AUC is a ranking metric, so it is computed from the predicted
        # probability of class 1 rather than from the hard 0/1 predictions.
        'roc_auc': round(roc_auc_score(y_true, y_score), 3),
    }


def model_plot(df, model_name, X_train, X_test, y_train, y_test):
    """Evaluate a saved model and append its scores to a results table.

    Loads ``../model/<model_name>.pickle``, predicts on both splits, prints the
    classification report, draws the train and test confusion matrices side by
    side, and prints the F1 scores plus the test false-positive and
    false-negative rates.

    Parameters
    ----------
    df : pandas.DataFrame
        Results collected so far (may be empty); it is not modified.
    model_name : str
        Base name of the pickle written by ``model_fit``.
    X_train, X_test
        Feature matrices of the training and the held-out split.
    y_train, y_test
        Matching binary targets (0 = normal, 1 = malicious).

    Returns
    -------
    pandas.DataFrame
        ``df`` with one extra row holding the ``train_*``, ``test_*`` and
        ``gen_*`` (relative train-to-test drop in %) metrics, the loaded model
        in ``best_params``, and the test ROC curve points in ``fpr`` / ``tpr``.
    """
    #load model
    filename = f'../model/{model_name}.pickle'
    model = joblib.load(filename)

    # Hard predictions and class-1 probabilities for both splits
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)
    y_train_proba = model.predict_proba(X_train)[:, 1]
    y_test_proba = model.predict_proba(X_test)[:, 1]

    train_scores = _split_scores(y_train, y_train_pred, y_train_proba)
    test_scores = _split_scores(y_test, y_test_pred, y_test_proba)

    # Points of the test ROC curve, stored for the comparison plot
    fpr, tpr, _ = roc_curve(y_test, y_test_proba)

    # Assemble one result row. Array-like values are wrapped in a list so
    # pandas stores each of them in a single cell.
    data = {"model": model_name}
    data.update({f"train_{metric}": value for metric, value in train_scores.items()})
    data.update({f"test_{metric}": value for metric, value in test_scores.items()})
    data.update({f"gen_{metric}": _pct_drop(train_scores[metric], test_scores[metric])
                 for metric in train_scores})
    data.update({"best_params": [model], "fpr": [fpr], "tpr": [tpr]})

    #append to original df
    result = pd.concat([df, pd.DataFrame(data)], ignore_index=True)

    print(model)

    print('Classification Report')
    print(classification_report(y_test,y_test_pred))
    print('Confusion Matrix')

    #plot confusion matrices: train on the left, test on the right
    cm_train = confusion_matrix(y_train, y_train_pred)
    cm_test = confusion_matrix(y_test, y_test_pred)

    fig, ax = plt.subplots(1, 2, figsize = (8, 4))
    for axis, cm, split in zip(ax, (cm_train, cm_test), ('Train', 'Test')):
        #0 is normal, 1 is malicious
        ConfusionMatrixDisplay(confusion_matrix=cm,
                               display_labels=['normal', 'malicious']).plot(
            ax=axis, values_format='d', colorbar=False, cmap='Blues')
        axis.set_title(f'{split} - {model_name}')

    plt.tight_layout()
    plt.show()

    # scikit-learn lays the binary matrix out as [[TN, FP], [FN, TP]]
    # (rows = true class, columns = predicted class).
    tn, fp, fn, tp = cm_test.ravel()

    print(f'Train f1: {train_scores["f1"]}')
    print(f'Test f1: {test_scores["f1"]}')
    print(f'f1 generalisation: {_pct_drop(train_scores["f1"], test_scores["f1"])}')
    print(f'Test FPR: {round((fp/(fp+tn)*100),2)}%')
    print(f'Test FNR: {round((fn/(tp+fn)*100),2)}%')

    return result

### Find the best estimators

In [None]:
%time model_fit('s1_lr', new_X_train, y_train, pipe_lr, params_lr)

In [None]:
%time model_fit('s1_cat', new_X_train, y_train, pipe_cat, params_cat)

In [None]:
%time model_fit('s1_svm', new_X_train, y_train, pipe_svm, params_svm)

In [None]:
%time model_fit('s1_ada', new_X_train, y_train, pipe_ada, params_ada)

In [None]:
%time model_fit('s1_rf', new_X_train, y_train, pipe_rf, params_rf)

In [None]:
%time model_fit('s1_gb', new_X_train, y_train, pipe_gb, params_gb)

In [None]:
%time model_fit('s1_xgb', new_X_train, y_train, pipe_xgb, params_xgb)

In [None]:
# 's1_svm' was already tuned and saved by the identical model_fit call earlier
# in this section, so the (slow) SVM grid search is not repeated here.

### Predict

In [None]:
#define empty dataframe to store the results
result = pd.DataFrame()

In [None]:
#Model 1: lr
result = model_plot(result, 's1_lr', new_X_train, new_X_test, y_train, y_test)

In [None]:
#Model 2: ada
result = model_plot(result, 's1_ada', new_X_train, new_X_test, y_train, y_test)

In [None]:
#Model 3: rf
# Load 's1_rf' - the name the random-forest grid search was saved under by model_fit.
result = model_plot(result, 's1_rf', new_X_train, new_X_test, y_train, y_test)

In [None]:
#Model 4: gb
result = model_plot(result, 's1_gb', new_X_train, new_X_test, y_train, y_test)

In [None]:
#Model 5: xgb
result = model_plot(result, 's1_xgb', new_X_train, new_X_test, y_train, y_test)

In [None]:
#Model 6: cat
result = model_plot(result, 's1_cat', new_X_train, new_X_test, y_train, y_test)

In [None]:
#Model 7: svm
result = model_plot(result, 's1_svm', new_X_train, new_X_test, y_train, y_test)

In [None]:
result

In [None]:
# Reference line for the ROC plot (tpr = fpr): scoring every sample with the
# same constant gives a curve that goes straight from (0, 0) to (1, 1).
random_probs = [0] * len(y_test)
p_fpr, p_tpr, _ = roc_curve(y_test, random_probs, pos_label=1)

In [None]:
plt.figure(figsize = (12,8))

# One colour per evaluated model. The lookup below wraps around, so adding
# another model to `result` can never raise an IndexError.
color = ['green','magenta','purple','red','orange','cyan','blue']

# plot roc curves
for i in range(len(result)):
    plt.plot(result['fpr'][i], result['tpr'][i], linestyle='--',
             color=color[i % len(color)],
             label=f"{result['model'][i]}, AUC="+str(round(result['test_roc_auc'][i],2)))

#plot random classifier graph
plt.plot(p_fpr, p_tpr, linestyle='-.', color='black')

plt.title('ROC curve - all models')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')

plt.legend(loc='best')
plt.show();

In [None]:
#Precision-Recall Curve
# Local import: precision_recall_curve is not part of the metrics import cell.
from sklearn.metrics import precision_recall_curve

fig, ax = plt.subplots(figsize=(12, 8))

# One curve per evaluated model, computed from the class-1 probabilities on the
# validation split. Each model is reloaded from the pickle model_plot read.
for name in result['model']:
    fitted = joblib.load(f'../model/{name}.pickle')
    precision_scores, recall_scores, _ = precision_recall_curve(
        y_test, fitted.predict_proba(new_X_test)[:, 1])
    ax.plot(recall_scores, precision_scores, label=name)

# A classifier with no skill has precision equal to the share of positives.
baseline = len(y_test[y_test==1]) / len(y_test)
ax.plot([0, 1], [baseline, baseline], linestyle='--', label='Baseline')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.legend(loc='center left');

### Feature Importance

In [None]:
# Reload the saved CatBoost pipeline, the model treated as "best" from here on.
filename = '../model/s1_cat.pickle'
best = joblib.load(filename)

In [None]:
# Fits the explainer
# Only the pipeline's bound `predict` method is handed to SHAP, together with
# the encoded + scaled validation features.
# NOTE(review): presumably new_X_test serves as SHAP's background data here - confirm against the shap docs.
explainer = shap.Explainer(best.predict, new_X_test)
# Calculates the SHAP values - It takes some time
# The call returns the object that the shap.plots.* cells below consume.
shap_values = explainer(new_X_test)

In [None]:
# Inspect the SHAP values computed in the previous cell.
# Do not recompute them with `explainer.shap_values(X)`: X is the raw frame
# (before one-hot encoding and scaling), not the feature matrix the model was
# fitted on, and the result would overwrite the Explanation object that the
# shap.plots.* cells below need.
shap_values.values.shape

In [None]:
shap.plots.bar(shap_values[0])

In [None]:
shap.plots.waterfall(shap_values[0])

In [None]:
shap.summary_plot(shap_values)

In [None]:
shap.plots.beeswarm(shap_values)

In [None]:
shap.summary_plot(shap_values, plot_type='violin')

In [None]:
#get feature names
fea_names = new_X_train.columns
#get the value of importance from the CatBoost step of the pipeline loaded
#into `best` at the top of this section (the pipeline's only step is named 'cat')
impt = best.named_steps["cat"].feature_importances_

In [None]:
df_impt = pd.DataFrame(zip(fea_names,impt), columns = ["feature", "value"])

In [None]:
df_impt["value"] = round(df_impt["value"],2)

In [None]:
#exclude features that have no importance
key_fea = df_impt.loc[df_impt["value"] != 0.0] 

In [None]:
plt.figure(figsize = (10,6))

# Horizontal bars, most important feature at the top; each bar is annotated
# with its rounded importance value.
ax = sns.barplot(x="value", y="feature", data=key_fea.sort_values(by = "value", ascending = False))
ax.bar_label(ax.containers[0])
ax.set_xlabel("Importance")
# The rows are model input features, not words.
ax.set_ylabel("Feature")
ax.set_title('Feature Importance', fontsize=16)

plt.tight_layout()
plt.show()

In [None]:
# Reload the saved CatBoost pipeline for scoring the separate test frame.
filename = '../model/s1_cat.pickle'
best_model = joblib.load(filename)

In [None]:
predict = best_model.predict(new_test)

In [None]:
# Side-by-side table of the true labels and the model's predictions.
# The frame takes its row index from test_y.
y_check = pd.DataFrame({'Original_y': test_y})
y_check['Predict_y'] = predict

In [None]:
# 1 where the prediction matches the true label, 0 where it does not
# (vectorised comparison instead of a row-wise apply).
y_check['check'] = (y_check['Original_y'] == y_check['Predict_y']).astype('int64')

In [None]:
# Count the misclassified rows per true class.
y_check.loc[y_check['check'] == 0, 'Original_y'].value_counts()