In [None]:
import pandas as pd
import numpy as np
import sklearn.preprocessing as proc
from sklearn.feature_selection import chi2, SelectKBest

from sklearn.model_selection import train_test_split
import sklearn.metrics as skm

import random

import lightgbm as lgb
import catboost as cat
import xgboost as xgb

import time
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

# Solusi 1: menggunakan pandas

In [None]:
def randompick(filename, s):
    """Load a uniformly random sample of ``s`` data rows from a CSV file.

    The file is scanned once to count its lines, then every row that is not
    part of the sample is handed to ``pd.read_csv(skiprows=...)`` so the
    unwanted rows are never materialised in memory.

    Parameters
    ----------
    filename : str or path-like
        CSV file whose first line is the header.
    s : int
        Number of data rows to keep. Values larger than the number of rows
        in the file return the whole file.

    Returns
    -------
    pandas.DataFrame
        The header plus ``min(s, n)`` randomly chosen rows, in file order.

    Raises
    ------
    ValueError
        If ``s`` is negative.

    Notes
    -----
    Rows are counted as physical lines, so the file is assumed to hold one
    record per line (no quoted fields containing newlines).
    """
    if s < 0:
        raise ValueError("s must be non-negative")
    # Count data rows (header excluded); the context manager guarantees the
    # handle is closed instead of being left to the garbage collector.
    with open(filename) as handle:
        n = max(sum(1 for _ in handle) - 1, 0)
    # Clamp the request so n - keep can never be negative, which would make
    # random.sample raise.
    keep = min(s, n)
    # Line 0 is the header, so only lines 1..n are candidates for skipping.
    skip = sorted(random.sample(range(1, n + 1), n - keep))
    return pd.read_csv(filename, skiprows=skip)

# Draw a random subset of rows from each per-class CSV: 3,500,000 rows from
# the benign file and 1,500,000 from the DDoS file.
# NOTE(review): the paths are absolute and machine-specific; the notebook only
# runs where D:\DatasetSkripsi\Cleaned exists.
dfb=randompick(r"D:\DatasetSkripsi\Cleaned\balanced-benign.csv",3500000)
dfd=randompick(r"D:\DatasetSkripsi\Cleaned\balanced-ddos.csv",1500000)

In [None]:
# Turn the textual class names into the numeric targets 0 (benign) / 1 (ddos).
# DataFrame.replace is applied to the whole frame, so any cell equal to the
# string is rewritten, not only the label column.
dfb=dfb.replace('Benign',0)
dfd=dfd.replace('ddos',1)
# Module-level encoder that labelencode() re-fits for every text column.
le = proc.LabelEncoder()
def labelencode(df):
    """Integer-encode every text column of ``df`` in place and return ``df``.

    Each object/string column is cast to ``str`` and passed through the
    module-level encoder ``le`` (re-fitted per column), and the name of every
    encoded column is printed. Numeric columns are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with text columns replaced by integer codes.
    """
    for column in df.columns:
        values = df[column]
        # Ask pandas directly whether the column holds text. The previous
        # check compared the dtype with ``type(object)`` (which is ``type``);
        # that only matched object columns through NumPy's dtype coercion and
        # skipped columns stored with the pandas string dtype.
        if pd.api.types.is_object_dtype(values) or pd.api.types.is_string_dtype(values):
            print(column)
            df[column] = le.fit_transform(values.astype(str))
    return df
# Stack both classes first and encode afterwards. Encoding each frame on its
# own re-fits the encoder per frame, so the same text value would receive a
# different integer code in the benign rows than in the ddos rows.
# ignore_index gives the combined frame a unique 0..n-1 index instead of two
# overlapping ranges.
df = pd.concat([dfb, dfd], ignore_index=True)
# The per-class frames are no longer needed; drop them to free memory before
# the encoding step creates further temporaries.
del dfb, dfd
df = labelencode(df)

In [None]:
def clean_dataset(df):
    """Zero out missing and negative cells of ``df`` in place.

    Missing values are filled with 0 first, then every cell that compares
    below zero is overwritten with 0. The frame passed in is modified and the
    same object is returned.
    """
    df.fillna(value=0, inplace=True)
    negative_cells = df.lt(0)
    df.mask(negative_cells, 0, inplace=True)
    return df
# Replace NaNs and negative values with 0; the chi-squared feature scoring
# used later rejects negative inputs.
df=clean_dataset(df)

In [None]:
# Positional split: columns 1..83 become the feature matrix and column 84 the
# target; column 0 is left out.
# NOTE(review): presumably column 0 is an identifier/index column and column
# 84 is the label — confirm against the CSV header.
X_dfbalanced=df.iloc[:,1:84]
Y_dfbalanced=df.iloc[:,84]
# Rebind the name to drop this reference to the full frame.
df=0

In [None]:

# Rank the features with the chi-squared test and keep the 10 best.
selector = SelectKBest(chi2, k = 10)
# The selector has to be fitted before get_support() / scores_ are available;
# without this call both raise because no scores have been computed yet.
selector.fit(X_dfbalanced, Y_dfbalanced)
cols = selector.get_support(indices=True)
features_df_new = X_dfbalanced.iloc[:,cols]
# Pair every selected column name with its chi-squared score.
names = X_dfbalanced.columns.values[cols]
scores = selector.scores_[cols]
names_scores = list(zip(names, scores))
ns_df = pd.DataFrame(data = names_scores, columns=['Feat_names', 'F_Scores'])
# Highest score first; ties are broken alphabetically by feature name.
ns_df_sorted = ns_df.sort_values(['F_Scores', 'Feat_names'], ascending = [False, True])
print(ns_df_sorted)

In [None]:
# The full feature matrix is no longer needed once the selected columns have
# been extracted; rebind the name to drop this reference to it.
X_dfbalanced=0
X=features_df_new
Y=Y_dfbalanced

# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [None]:
def getTime(a,b,c,d,e,f):
    """Print how long training, prediction and cross-validation took.

    The arguments are three (start, end) timestamp pairs in seconds:
    ``a``/``b`` for training, ``c``/``d`` for prediction and ``e``/``f`` for
    cross-validation. Training and cross-validation durations are rounded to
    whole seconds; the prediction duration is printed unrounded.
    """
    training_seconds = round(b - a)
    prediction_seconds = d - c
    cross_validation_seconds = round(f - e)
    print(f"Waktu yang dibutuhkan dalam sekali training: {training_seconds} detik")
    print(f"Waktu yang dibutuhkan dalam prediksi: {prediction_seconds} detik")
    print(f"Waktu yang dibutuhkan dalam Cross Validation: {cross_validation_seconds} detik")

# XGBoost

In [None]:
# Baseline XGBoost classifier with default hyper-parameters.
model = xgb.XGBClassifier()
model.fit(X_train,y_train)
y_pred=model.predict(X_test)
predictions = [round(value) for value in y_pred]
# random_state only has an effect when shuffling is enabled, and current
# scikit-learn releases raise a ValueError for random_state without
# shuffle=True. Shuffling also matters here because the rows are ordered by
# class (benign first, ddos last), so unshuffled folds could be single-class.
kfold = KFold(n_splits=10, shuffle=True, random_state=7)
results = cross_val_score(model, X, Y, cv=kfold)
# evaluate predictions
print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))
# Report on this model's own predictions. The previous version passed
# `lgbpred`, which is only created in the LightGBM cell further down.
skm.classification_report(y_test,predictions,output_dict=True, target_names=['Benign','ddos'])

In [None]:
# Native XGBoost API: wrap the splits in DMatrix containers.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# NOTE(review): 'gpu_hist' needs a CUDA-enabled XGBoost build; newer releases
# prefer tree_method='hist' together with device='cuda' — confirm against the
# installed version.
param = {
    'tree_method': 'gpu_hist',
    'max_depth': 3,  # the maximum depth of each tree
    'eta': 0.3,  # the training step for each iteration
    'objective': 'multi:softprob',  # per-class probabilities for every row
    # The target only takes the values 0 (Benign) and 1 (ddos); the previous
    # value of 10 made XGBoost fit eight classes that never occur.
    'num_class': 2}  # the number of classes that exist in this dataset
num_round = 20

import time
# Wall-clock time of the training call only.
starttime = time.time()
bst = xgb.train(param, dtrain, num_round)
endtime = time.time()

print(endtime-starttime)

In [None]:
# Predict on the held-out set with the natively trained booster, then show
# how many predictions came back and what they look like.
# NOTE(review): with the 'multi:softprob' objective set above, each entry is
# presumably a row of per-class probabilities — confirm.
preds = bst.predict(dtest)
print(len(preds))
print(preds)

In [None]:
import numpy as np
# Pick the most probable class for every row. A single vectorised argmax over
# axis 1 replaces the per-row Python loop, which is slow on a test set of this
# size, and yields the same class indices.
best_preds = np.argmax(preds, axis=1)
print(best_preds)

In [None]:
# precision_score is imported here rather than in the top import cell; the
# LightGBM and CatBoost precision cells further down rely on this cell having
# been run first.
from sklearn.metrics import precision_score

# Macro average: the unweighted mean of the per-class precision values.
print(precision_score(y_test, best_preds, average='macro'))

# LightGBM

In [None]:
# LightGBM classifier with default hyper-parameters; every phase is timed.
lgbmodel = lgb.LGBMClassifier()

# Single training run on the training split.
a = time.time()
lgbmodel.fit(X_train,y_train)
b = time.time()

# Prediction on the held-out split.
c = time.time()
lgbpred=lgbmodel.predict(X_test)
d = time.time()

# random_state only has an effect when shuffling is enabled, and current
# scikit-learn releases raise a ValueError for random_state without
# shuffle=True. Shuffling also matters here because the rows are ordered by
# class (benign first, ddos last), so unshuffled folds could be single-class.
kfold = KFold(n_splits=10, shuffle=True, random_state=7)
e = time.time()
results = cross_val_score(lgbmodel, X, Y, cv=kfold)
f = time.time()

getTime(a,b,c,d,e,f)
# evaluate predictions
# evaluate predictions

In [None]:
# Mean and standard deviation of the cross-validation scores, as percentages.
print("Accuracy: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))
# Per-class precision/recall/F1 on the held-out split, returned as a dict and
# displayed as the cell output.
skm.classification_report(y_test,lgbpred,output_dict=True, target_names=['Benign','ddos'])

In [None]:
# Wrap the training split in LightGBM's own Dataset container for the native
# lgb.train API used below.
d_train=lgb.Dataset(X_train, label=y_train)

In [None]:
# Training parameters for the native LightGBM API.
params = {
    'learning_rate': 0.03,
    'boosting_type': 'gbdt',     # gradient boosting decision tree
    'objective': 'binary',       # binary target feature
    'metric': 'binary_logloss',  # metric for binary classification
    'max_depth': 10,
}

In [None]:
# Train a booster with the native API for 100 boosting rounds.
clf=lgb.train(params,d_train,100)

In [None]:
# Prediction on the test set.
# Note: this rebinds y_pred, which the XGBoost cell above also assigns.
y_pred=clf.predict(X_test)

In [None]:
# Round the predictions to 0/1 before scoring.
# NOTE(review): with objective='binary' the booster presumably returns
# probabilities, so .round() acts as a 0.5 threshold — confirm.
# precision_score comes from the import in the XGBoost precision cell above.
print(precision_score(y_test, y_pred.round(), average='macro'))

# CatBoost

In [None]:
# CatBoost classifier limited to 5 boosting iterations at learning rate 0.1.
# (A 'CrossEntropy' loss_function was tried earlier and is left disabled.)
# Note: this rebinds `clf`, which previously held the LightGBM booster.
clf = cat.CatBoostClassifier(iterations=5, learning_rate=0.1)

# Fit silently, using the held-out split as the evaluation set.
# NOTE(review): if CatBoost selects its best iteration from eval_set, that
# choice is being made on the test data — confirm this is intended.
clf.fit(X_train, y_train, eval_set=(X_test, y_test), verbose=False)

# Report the fitted flag and the parameters the model was built with.
print(f'CatBoost model is fitted: {clf.is_fitted()}')
print('CatBoost model parameters:')
print(clf.get_params())

In [None]:
# Class predictions of the fitted CatBoost model on the held-out split.
catpred=clf.predict(data=X_test)

In [None]:
# Macro-averaged precision of the CatBoost predictions; precision_score comes
# from the import in the XGBoost precision cell above.
print(precision_score(y_test, catpred, average='macro'))