# Setting up environment requirements
If you want to run this Jupyter Notebook on Google Colab, click on the following hyperlink: [Load on Google Colab.](https://githubtocolab.com/mjacker/MJCapstone/blob/master/0_merged_ipynb_files_for_google_colab.ipynb)

If you want to run the Jupyter Notebook locally, clone the [GitHub repository](https://github.com/mjacker/MJCapstone/tree/develop) yourself and prepare a Python venv environment in which to install the required dependencies.

Uncomment the next block to install dependencies.


## Python venv environment

In [None]:
# Installs the project's dependencies into the active Python environment.
# NOTE(review): `pip install` reads a requirements file only with `-r`, and `-y` is not
# an option of `pip install`; this assumes requirements.yml is a pip-format list — confirm.
# !python -m pip install -r requirements.yml

## Downloading the Dataset
Downloading on Google Colab (default).

In [None]:
# on linux
!apt-get install awscli
!python -m pip install requests==2.28.2
!mkdir datasets
!aws s3 ls --no-sign-request --region ap-northeast-3 "s3://cse-cic-ids2018/" --recursive --human-readable
!aws s3 cp --no-sign-request --region ap-northeast-3 "s3://cse-cic-ids2018/Processed Traffic Data for ML Algorithms/Friday-02-03-2018_TrafficForML_CICFlowMeter.csv" "./datasets/"
!aws s3 cp --no-sign-request --region ap-northeast-3 "s3://cse-cic-ids2018/Processed Traffic Data for ML Algorithms/Friday-16-02-2018_TrafficForML_CICFlowMeter.csv" "./datasets/"


Downloading on Windows.

In [None]:
# Tested on Windows 10
# with PowerShell 7.4

# Install boto3, then run the project's download script.
# NOTE(review): presumably download-cic-ids-dataset.py fetches the same CSV files
# through boto3 — the script is not visible here; confirm.
!python -m pip install boto3
!python download-cic-ids-dataset.py 


## Dataset Preparation
For this Capstone, two datasets from [CSE-CIC-IDS2018](https://www.unb.ca/cic/datasets/ids-2018.html) were selected to be processed:
- Friday 02 03 2018
- Friday 16 02 2018


In [None]:
%%time
import os
import pandas as pd
import matplotlib
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

### Loading path to dataset files.

In [None]:
# Collect the path of every file found under ./datasets/ (sub-folders included).
# os.path.join adds the separator that os.walk leaves out for nested directories,
# and sorting makes DATASET_FILES_PATH[0] refer to the same file on every run
# (os.walk does not guarantee any particular order).
DATASET_FILES_PATH = sorted(
    os.path.join(dir_path, file_name)
    for dir_path, _, file_names in os.walk("./datasets/")
    for file_name in file_names
)
DATASET_FILES_PATH

### Loading datasets into a pandas DataFrame

In [None]:
%%time

df_dataset = pd.read_csv(DATASET_FILES_PATH[0])

# For Google Colab, due to memory capacity, only can handle one day dataset.
# df_friday1 = pd.read_csv(DATASET_FILES_PATH[0])
# df_friday2 = pd.read_csv(DATASET_FILES_PATH[1])


### Concatenating datasets

In [None]:
# Concatenation of both days is disabled: Google Colab's memory can only hold one day's dataset.
# df_dataset = pd.concat([df_friday1, df_friday2], axis=0, ignore_index=True)

Because two datasets were concatenated, we need to delete the rows that contain the second dataframe's header (rows whose `Label` value is the literal text "Label").


In [None]:
# Remove rows that are a repeated CSV header, i.e. whose "Label" cell holds the text "Label".
is_header_row = df_dataset["Label"] == "Label"
header_row_index = df_dataset.index[is_header_row]
df_dataset.drop(header_row_index, inplace=True)
print(df_dataset.shape)

In [None]:
df_dataset.sample()

### Drop unrelated columns
Since the destination port, protocol and timestamp are not related to the label for the selected machine learning models, those columns will be dropped.

In [None]:
# Columns considered unrelated to the label; removed from the frame in place.
unrelated_columns = ['Dst Port', 'Protocol', 'Timestamp']
df_dataset.drop(columns=unrelated_columns, inplace=True)


In [None]:
df_dataset.head()

### Dropping rows with infinite or null values

In [None]:
print("Shape before deleting rows: ", df_dataset.shape)
# Turn +/-infinity into NaN so that dropna() below removes those rows as well.
# replace() returns a new frame unless inplace=True; without it the infinite
# values stayed in df_dataset and survived the cleanup.
df_dataset.replace([np.inf, -np.inf], np.nan, inplace=True)
df_dataset.dropna(inplace=True)
print("Shape after deteling rows:", df_dataset.shape)

### Check Label labels

In [None]:
print(df_dataset['Label'].unique())
print(df_dataset.shape)

### Changing Labels names 
To unify the labels, the malicious packages will be relabeled as ones and the normal packages as zeros.
- 0 - normal package
- 1 - malicious package

In [None]:
%%time
df_dataset.replace(to_replace=['Benign'], value=0, inplace=True)
df_dataset.replace(to_replace=["Bot", "DoS attacks-SlowHTTPTest", "DoS attacks-Hulk"], value=1, inplace=True)
df_dataset[df_dataset.columns[-1]].unique()

### Convert numeric strings to numbers

In [None]:
# astype() returns a new frame, so the result has to be assigned back; otherwise
# the conversion is only displayed and df_dataset keeps its previous dtypes.
df_dataset = df_dataset.astype('float')
# Keep the class labels as integers (0 / 1) so the saved CSV and the class labels
# seen by the models stay the same as before.
df_dataset['Label'] = df_dataset['Label'].astype('int')
df_dataset

### Dropping duplicated rows

In [None]:
print(df_dataset.shape)
df_dataset.drop_duplicates(inplace=True)
print(df_dataset.shape)


### Check columns datatypes

In [None]:
df_dataset.info()

### Label distribution after dropping rows

In [None]:
# Number of rows carrying label 0 and label 1.
label_counts = df_dataset["Label"].value_counts()
print(label_counts[[0]].sum())
print(label_counts[[1]].sum())

print(df_dataset.shape)

# Count plot of the values in the last column of the frame.
label_column = df_dataset.columns[-1]
sns.set(rc={'figure.figsize': (8, 6)})
sns.countplot(
    x=df_dataset[label_column],
    data=df_dataset,
    palette='dark:#5A9_r',
)

### Saving the Dataset as a csv file

In [None]:
df_dataset.to_csv("processed_dataset.csv", index=False)

## Exploratory Data Analysis - Columns

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Loading processed dataset to a dataframe

In [None]:
df_dataset = pd.read_csv("processed_dataset.csv")
df_dataset

In [None]:
df_dataset.describe()

In [None]:
df_dataset.info()

In [None]:
# This does not seem really necessary here: when the CSV file is read back,
# the data types appear to be detected correctly.
# Re-cast every column to its current dtype in a single call. A loop variable named
# `type` is avoided on purpose: at notebook scope it would shadow the builtin type()
# for every later cell.
df_dataset = df_dataset.astype(df_dataset.dtypes.to_dict())
df_dataset

In [None]:
df_dataset.columns

In [None]:
# plt.figure()
# plt.
# df_dataset.hist()
df_dataset[df_dataset.columns[0]].hist()
# df_dataset[df_dataset.columns[1]].hist()


In [None]:
print(df_dataset.columns[0])
df_dataset[df_dataset.columns[0]].describe()

In [None]:
df_dataset[df_dataset.columns[0]].unique()


### Plotting columns

In [None]:
# Exploratory look at single columns, selected by position.
# plt.pyplot.plot(df_dataset[df_dataset.columns[1]], df_dataset[df_dataset.columns[-1]])
# NOTE(review): the sorted values printed come from column index 7, while the histogram
# is drawn from column index 6 — confirm whether both were meant to be the same column.
print(np.sort(df_dataset[df_dataset.columns[7]]))
pd.DataFrame(np.sort(df_dataset[df_dataset.columns[6]])).plot(kind='hist')
# Disabled experiments on the second column:
# df_dataset[df_dataset.columns[1]].unique()
# df_dataset[df_dataset.columns[1]].hist()
# df_dataset[df_dataset.columns[1]].count()
# df_dataset[df_dataset.columns[1]]


## Decision Tree

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV

# Model
from sklearn.tree import DecisionTreeClassifier, export_graphviz


# For reproducible results
RANDOM_STATE_SEED = 420

In [None]:
df_dataset = pd.read_csv("processed_dataset.csv")
df_dataset





In [None]:
df_dataset

In [None]:
# Is it really necessary to filter the data again, given that the processed
# dataset supposedly should not contain infinite values?

# Print whether any NaN and whether any infinite value is present in the frame.
print(np.any(np.isnan(df_dataset)))
print(np.any(np.isinf(df_dataset)))

In [None]:
df_dataset.replace([np.inf, -np.inf], np.nan, inplace=True)
df_dataset.dropna(inplace=True)

In [None]:
print(np.any(np.isnan(df_dataset)))
print(np.any(np.isinf(df_dataset)))

In [None]:
df_dataset.info()

In [None]:
y = np.array(df_dataset.pop('Label'))
y

In [None]:
X = np.array(df_dataset)
X

In [None]:
print(df_dataset.shape)
print(X.shape)
print(y.shape)

In [None]:
pato = pd.DataFrame(X)
pato

In [None]:
df_dataset.info()

In [None]:
# I think this is where I need to add the scaling-down of the values.
# No — it has to be done after the split into X and y.

In [None]:
# TEMP
len(df_dataset.columns)

In [None]:
# print(df_X.shape)
# print(df_y.shape)

In [None]:
# train, test = train_test_split(df_dataset, test_size=0.3, random_state=RANDOM_STATE_SEED)
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.33, random_state=RANDOM_STATE_SEED)

In [None]:
print(df_dataset.shape)

print("TRAIN:")
print(X_train.shape)
print(y_train.shape)

print("TEST")
print(X_test.shape)
print(y_test.shape)

In [None]:
# Base decision tree handed to the grid search.
# random_state uses the notebook's seed constant: with random_state=None the tree
# is not guaranteed to be identical between runs, which contradicts the
# "For reproducible results" intent of RANDOM_STATE_SEED.
# The remaining arguments spell out scikit-learn's default values.
model = DecisionTreeClassifier(
    criterion='gini',
    splitter='best',
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    max_features=None,
    random_state=RANDOM_STATE_SEED,
    max_leaf_nodes=None,
    min_impurity_decrease=0.0,
    class_weight=None,
    ccp_alpha=0.0
)

In [None]:
# Candidate tree depths for the grid search: 1 through 19.
hyperparameters = {'max_depth': list(range(1, 20))}

In [None]:
# Grid search over `hyperparameters` with 5-fold cross-validation.
grid_search_options = {
    'cv': 5,
    'verbose': 1,
    'n_jobs': -1,  # Use all available CPU cores
}
clf = GridSearchCV(estimator=model, param_grid=hyperparameters, **grid_search_options)

In [None]:
%%time
clf.fit(X=X_train, y=y_train)

In [None]:
print("Accuracy score on Validation set: \n")
print(clf.best_score_ )
print("---------------")
print("Best performing hyperparameters on Validation set: ")
print(clf.best_params_)
print("---------------")
print(clf.best_estimator_)

In [None]:
model = clf.best_estimator_
model

In [None]:
predictions = model.predict(X_test)
predictions

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


In [None]:
print(accuracy_score(y_test, predictions))

In [None]:
cm = confusion_matrix(y_test, predictions)
print(cm)

In [None]:
# from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_predictions(y_test, predictions, cmap=plt.cm.Greens)

In [None]:
# import sklearn
# print('The scikit-learn version is {}.'.format(sklearn.__version__))

In [None]:
# def visualize_tree(tree, feature_names):
#     """Create tree png using graphviz.

#     Args
#     ----
#     tree -- scikit-learn DecsisionTree.
#     feature_names -- list of feature names.
#     """
#     with open("dt.dot", 'w') as f:
#         export_graphviz(tree, out_file=f,
#                         feature_names=feature_names)

#     command = ["dot", "-Tpng", "dt.dot", "-o", "dt.png"]
#     try:
#         subprocess.check_call(command)
#     except:
#         exit("Could not run dot, ie graphviz, to "
#              "produce visualization")

# df_dataset_ori = pd.read_csv("processed_dataset.csv")
# features =list(df_dataset_ori.columns)
# features

# visualize_tree(clf.best_estimator_, features)

# # import graphviz
# # graphviz.Source.from_file('algo.dot')  not working

In [None]:
# from sklearn import tree
# text_representation = tree.export_text(clf)


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, GridSearchCV

# Model
from sklearn.tree import DecisionTreeClassifier

# For reproducible results
RANDOM_STATE_SEED = 732

In [None]:
df_dataset = pd.read_csv("processed_dataset.csv")
df_dataset


In [None]:
# Is it really necessary to filter the data again, given that the processed
# dataset supposedly should not contain infinite values?

print(np.any(np.isnan(df_dataset)))
# Look for infinite values with np.isinf. np.isfinite is True for every ordinary
# number, so np.any(np.isfinite(...)) printed True whether or not an infinity existed.
print(np.any(np.isinf(df_dataset)))

# If I try to use a "where infinite" style check it usually goes badly,
# something like a memory overflow, so count the infinite cells with isin instead.
df_dataset.isin([np.inf, -np.inf]).values.sum()

In [None]:
# df_dataset.isinf()
df_dataset.replace([np.inf, -np.inf], np.nan, inplace=True)
df_dataset.dropna(inplace=True)


In [None]:
# Is it really necessary to filter the data again, given that the processed
# dataset supposedly should not contain infinite values?

# Print whether any NaN and whether any infinite value is present in the frame.
print(np.any(np.isnan(df_dataset)))
print(np.any(np.isinf(df_dataset)))

# If I try to use a "where infinite" style check it usually goes badly,
# something like a memory overflow, so count the infinite cells with isin instead.
df_dataset.isin([np.inf, -np.inf]).values.sum()

In [None]:
df_dataset.describe()
# df_label = np.array(df_dataset.pop('Label'))
# df_label

In [None]:
df_dataset.info()

In [None]:
y = np.array(df_dataset.pop('Label'))
X = np.array(df_dataset)

In [None]:
print(X.shape)
print(y.shape)

In [None]:
pd.DataFrame(X)

In [None]:
pd.DataFrame(y)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every feature with a MinMaxScaler fitted on X.
# The scaled matrix is computed once: transforming X a second time only to discard
# the result, and copying the output again through np.array, cost memory for nothing.
# NOTE(review): the scaler is fitted on all rows before the train/test split that
# follows, so test rows influence the min/max — consider fitting on the training rows only.
X_scaler = MinMaxScaler().fit(X)
X = X_scaler.transform(X)
X

In [None]:
# X, y = train_test_split(df_dataset, test_size=0.3, random_state=RANDOM_STATE_SEED)
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.33, random_state=RANDOM_STATE_SEED)

In [None]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

In [None]:
from sklearn.utils import class_weight  # For balanced class weighted classification training

# Compute one "balanced" weight per class found in the training labels.
observed_classes = np.unique(y_train)
class_weights = class_weight.compute_class_weight(
    class_weight='balanced', classes=observed_classes, y=y_train
)

print(class_weights)

# scikit-learn needs the weights as a {class_label: weight} dict.
class_weights = {label: class_weights[label] for label in (0, 1)}

print(class_weights)

In [None]:
from sklearn.ensemble import RandomForestClassifier


In [None]:

# Random forest handed to the grid search; class_weight takes the per-class
# weights dict built in the previous cell.
model = RandomForestClassifier(
    # Ensemble-level settings
    n_estimators=100,
    bootstrap=False,  # alternative tried: bootstrap=True
    oob_score=False,
    max_samples=None,
    warm_start=False,
    # Per-tree settings
    criterion='gini',
    max_depth=None,
    max_features=10,  # alternative tried: max_features='auto'
    max_leaf_nodes=None,
    min_samples_split=2,
    min_samples_leaf=1,
    min_weight_fraction_leaf=0.0,
    min_impurity_decrease=0.0,
    ccp_alpha=0.0,
    class_weight=class_weights,
    # Execution settings
    n_jobs=None,
    random_state=1,
    verbose=0,
)

# Forest sizes explored by the grid search: 50, 75, 100, 125, 150.
hyperparameters = {'n_estimators': list(range(50, 151, 25))}

In [None]:
# Grid search over `hyperparameters` with 5-fold cross-validation.
clf = GridSearchCV(
    model,
    hyperparameters,
    n_jobs=-1,  # Use all available CPU cores
    cv=5,
    verbose=1,
)

In [None]:
clf.fit(X_train, y_train)

In [None]:
print("Accuracy score on Validation set: \n")
print(clf.best_score_ )
print("---------------")
print("Best performing hyperparameters on Validation set: ")
print(clf.best_params_)
print("---------------")
print(clf.best_estimator_)

In [None]:
model = clf.best_estimator_
model

In [None]:
predictions = model.predict(X_test)
predictions

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix #, plot_confusion_matrix


In [None]:
print(accuracy_score(y_test, predictions))
cm = confusion_matrix(y_test, predictions)
print(cm)

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay
ConfusionMatrixDisplay.from_predictions(y_test, predictions, cmap=plt.cm.Greens)

In [None]:
print(classification_report(y_test, predictions, digits=5))

In [None]:
!python -m pip install joblib


In [None]:
import joblib


In [None]:
import os

# Persist the selected model to disk.
# joblib.dump does not create missing folders, so make sure the target directory
# exists first (e.g. when the notebook is opened on its own in Google Colab).
os.makedirs("trained_models", exist_ok=True)
joblib.dump(model, "trained_models/random-forest-classifier.pkl")

In [None]:
# Reload the model from the file written by the previous cell and display it.
# NOTE: joblib.load unpickles the file, so only load model files from a trusted source.
model = joblib.load("trained_models/random-forest-classifier.pkl")
model