In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.feature_selection import chi2, SelectKBest, f_classif
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import dask.array as da
import dask_ml.preprocessing as dm
import dask.dataframe as dd
import random
import vaex as vx

# Solusi 1 pake pandas

In [None]:
def randompick(filename, sample_size=3000000, seed=None):
    """Read a uniformly random subset of the data rows of a CSV file.

    The file is scanned once to count its lines, a random set of row
    numbers to skip is drawn, and the remaining rows are parsed with
    ``pd.read_csv``. The header (line 0) is never skipped.

    Parameters
    ----------
    filename : str
        Path of the CSV file. Line 0 is treated as the header.
    sample_size : int, default 3000000
        Number of data rows to keep. If the file holds no more than this
        many data rows, the whole file is returned instead of failing.
    seed : int or None, default None
        When given, a private ``random.Random(seed)`` is used so the same
        rows are picked on every run. When ``None`` the global ``random``
        module state is used, as before.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, in file order.

    Raises
    ------
    ValueError
        If ``sample_size`` is negative.
    """
    if sample_size < 0:
        raise ValueError("sample_size must be non-negative")

    # Count physical lines inside a context manager so the handle is closed.
    # NOTE(review): this assumes one record per line (no quoted newlines).
    with open(filename) as handle:
        n = sum(1 for _ in handle) - 1  # number of data rows, header excluded

    # Nothing to drop: random.sample would raise on a negative sample size.
    if n <= sample_size:
        return pd.read_csv(filename)

    rng = random if seed is None else random.Random(seed)
    # Row numbers start at 1, so the header at line 0 can never be skipped.
    skip = sorted(rng.sample(range(1, n + 1), n - sample_size))
    return pd.read_csv(filename, skiprows=skip)

# Draw a random row sample from each per-class CSV via randompick().
# NOTE(review): both paths are absolute and machine-specific; the notebook
# cannot be re-run elsewhere without editing them.
dfb=randompick(r"D:\DatasetSkripsi\Cleaned\balanced-benign.csv")
dfd=randompick(r"D:\DatasetSkripsi\Cleaned\balanced-ddos.csv")

In [None]:
# Turn the textual class markers into integers: 'Benign' -> 0, 'ddos' -> 1.
# DataFrame.replace is applied to the whole frame, not to a single column.
dfb = dfb.replace(to_replace='Benign', value=0)
dfd = dfd.replace(to_replace='ddos', value=1)

# Shared encoder instance that labelencode() refits for every text column.
le = dm.LabelEncoder()
def labelencode(df):
    """Integer-encode every text column of ``df`` in place and return ``df``.

    Each column with an object or string dtype is cast to ``str`` and run
    through the module-level encoder ``le`` with ``fit_transform``. The
    encoder is refitted for every column, so codes are only meaningful
    within a single column. The name of each encoded column is printed.

    The dtype test uses the pandas dtype predicates. The previous
    ``dtype == type(object)`` compared against ``type`` (the metaclass), not
    ``object``, and matched object columns only through NumPy's fallback
    coercion of unknown classes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose text columns are overwritten with integer codes.

    Returns
    -------
    pandas.DataFrame
        The same frame that was passed in.
    """
    for column in df.columns:
        col_dtype = df[column].dtype
        is_text = (pd.api.types.is_object_dtype(col_dtype)
                   or pd.api.types.is_string_dtype(col_dtype))
        if is_text:
            print(column)
            df[column] = le.fit_transform(df[column].astype(str))
    return df
# Stack both classes BEFORE encoding. labelencode() refits the encoder on
# whatever frame it receives, so encoding dfb and dfd separately would give
# the same string (e.g. an address or timestamp) a different integer in each
# class, making the encoded columns incomparable across the two halves.
df = pd.concat([dfb, dfd])

# Drop the references to the per-class frames before the encoding pass so
# their memory can be reclaimed.
dfb = 0
dfd = 0

df = labelencode(df)

In [None]:
def clean_dataset(df):
    """Replace missing and negative entries of ``df`` with zero, in place.

    Missing values are filled with 0 first; afterwards every cell that
    compares less than 0 is overwritten with 0. The comparison is applied to
    the whole frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to clean. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was passed in.
    """
    df.fillna(value=0, inplace=True)
    negative_cells = df.lt(0)
    df[negative_cells] = 0
    return df
# Zero out missing and negative values in the combined frame
# (clean_dataset modifies the frame in place and returns it).
df=clean_dataset(df)

In [None]:
# Positional split: columns 1..83 form the feature matrix and column 84 is
# the target; column 0 is left out.
# NOTE(review): the positions are hard-coded - confirm they match the CSV layout.
X_dfbalanced, Y_dfbalanced = df.iloc[:, 1:84], df.iloc[:, 84]

# Rebind the name so the notebook no longer holds a reference to the full frame.
df = 0

In [None]:
# Keep the 10 columns that score highest under f_classif against the label.
selector = SelectKBest(score_func=f_classif, k=10)
selector.fit(X_dfbalanced, Y_dfbalanced)

# Integer positions of the retained columns; reused for the column subset,
# the column names and the matching scores.
cols = selector.get_support(indices=True)
features_df_new = X_dfbalanced.iloc[:, cols]
names = X_dfbalanced.columns.values[cols]
scores = selector.scores_[cols]

# Tabulate (feature name, score) pairs, best score first; ties are ordered
# alphabetically by feature name.
names_scores = list(zip(names, scores))
ns_df = pd.DataFrame(names_scores, columns=['Feat_names', 'F_Scores'])
ns_df_sorted = ns_df.sort_values(by=['F_Scores', 'Feat_names'],
                                 ascending=[False, True])
print(ns_df_sorted)

In [None]:
# The full feature matrix is no longer needed once the selected columns exist.
X_dfbalanced = 0
X, Y = features_df_new, Y_dfbalanced

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

In [None]:
import xgboost as xgb

# Wrap both splits in XGBoost's DMatrix container.
dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

param = {
    'tree_method': 'gpu_hist',  # histogram tree builder on the GPU
    'max_depth': 3,  # the maximum depth of each tree
    'eta': 0.3,  # the training step for each iteration
    # 'silent' is deprecated in XGBoost; 'verbosity': 0 is the quiet setting.
    'verbosity': 0,
    'objective': 'multi:softprob',  # per-class probabilities for every row
    # Derive the class count from the labels instead of hard-coding 10.
    # Labels are integer codes starting at 0, so the count is max + 1.
    'num_class': int(y_train.max()) + 1,
}
num_round = 20  # number of boosting rounds

bst = xgb.train(param, dtrain, num_round)


In [None]:
# Score the held-out DMatrix with the trained booster and show the raw output.
preds = bst.predict(dtest)
print(preds)

In [None]:
import numpy as np

# Pick, for every row, the index of the column with the highest score.
# A single vectorised argmax along axis 1 replaces the per-row Python loop;
# it assumes preds is 2-D (rows x classes), as produced by 'multi:softprob'.
best_preds = np.argmax(preds, axis=1)
print(best_preds)

In [None]:
from sklearn.metrics import precision_score

# Macro-averaged precision of the argmax predictions on the held-out labels.
print(
    precision_score(
        y_true=y_test,
        y_pred=best_preds,
        average='macro',
    )
)

In [None]:
from xgboost import cv

# 5-fold cross-validation on the training DMatrix.
# - The parameter dict defined for training is named `param`; the previous
#   `params=params` referenced an undefined name and raised NameError.
# - "error" is the binary classification error; with the 'multi:softprob'
#   objective the multiclass counterpart "merror" is the matching metric.
xgb_cv = cv(dtrain=dtrain, params=param, nfold=5,
            num_boost_round=20, early_stopping_rounds=10,
            metrics="merror", as_pandas=True, seed=123)

In [None]:
import xgboost as xgb
from sklearn.metrics import accuracy_score

# Baseline: scikit-learn style XGBoost classifier with default settings.
model = xgb.XGBClassifier()
model.fit(X_train, y_train)

# Predict the held-out rows and round every prediction to an integer.
y_pred = model.predict(X_test)
predictions = list(map(round, y_pred))

# Share of held-out rows whose prediction equals the true label.
accuracy = accuracy_score(y_test, predictions)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

# Solusi 2 pake Dask tapi gabisa

In [None]:
# Load the combined dataset with vaex (imported as vx) instead of pandas.
# NOTE(review): the path is absolute and machine-specific.
df=vx.read_csv(r"D:\DatasetSkripsi\ddos_balanced\final_dataset.csv")

In [None]:
# Syntax of where: (condition, fill value, column to fill)
# Map the textual class markers to integers: 'Benign' -> 0, then 'ddos' -> 1.
# NOTE(review): each call mixes an integer fill value with the existing Label
# values - confirm vaex accepts the mixed types here.
df['Label'] = df.func.where(df.Label == 'Benign', 0, df.Label)
df['Label'] = df.func.where(df.Label == 'ddos', 1, df.Label)
# Encoder instance used by labelencode() below.
le = dm.LabelEncoder()
def labelencode(df):
    """Encode the matching columns of ``df`` with the shared encoder ``le``.

    For every name in ``df.columns`` the column dtype is compared with
    ``type(object)``. When they compare equal, the column name is printed
    and the column is replaced by ``le.fit_transform`` of its string cast.
    The frame is returned afterwards.

    A function with the same name is defined earlier in the notebook; this
    definition replaces it once this cell has run.

    NOTE(review): ``type(object)`` evaluates to ``type``, not ``object`` -
    confirm the comparison matches the intended columns for this frame type.
    """
    for col_name in df.columns:
        is_object_column = df[col_name].dtype == type(object)
        if not is_object_column:
            continue
        print(col_name)
        as_text = df[col_name].astype(str)
        df[col_name] = le.fit_transform(as_text)
    return df
# Run the column encoder defined above on the frame loaded with vaex.
df=labelencode(df)