In [34]:
import pandas as pd
from sklearn.model_selection import train_test_split
import category_encoders as ce
from sklearn.pipeline import make_pipeline
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
import numpy as np
from sklearn.metrics import f1_score, confusion_matrix

In [2]:
data = pd.read_csv("../data/interim/first_data_clean.csv")

In [3]:
data.head()

Unnamed: 0,URL,URL_LENGTH,NUMBER_SPECIAL_CHARACTERS,CHARSET,WHOIS_COUNTRY,TCP_CONVERSATION_EXCHANGE,DIST_REMOTE_TCP_PORT,REMOTE_IPS,APP_BYTES,SOURCE_APP_PACKETS,REMOTE_APP_PACKETS,SOURCE_APP_BYTES,REMOTE_APP_BYTES,APP_PACKETS,DNS_QUERY_TIMES,Type
0,M0_109,16,7,Iso-8859-1,Not_available,7,0,2,700,9,10,1153,832,9,2.0,1
1,B0_2314,16,6,Utf-8,Not_available,17,7,4,1230,17,19,1265,1230,17,0.0,0
2,B0_911,16,6,Us-ascii,Not_available,0,0,0,0,0,0,0,0,0,0.0,0
3,B0_113,17,6,Iso-8859-1,Us,31,22,3,3812,39,37,18784,4380,39,8.0,0
4,B0_403,17,6,Utf-8,Us,57,2,5,4278,61,62,129889,4586,61,4.0,0


In [4]:
data.shape

(1774, 16)

In [5]:
data.isna().sum()

URL                          0
URL_LENGTH                   0
NUMBER_SPECIAL_CHARACTERS    0
CHARSET                      0
WHOIS_COUNTRY                0
TCP_CONVERSATION_EXCHANGE    0
DIST_REMOTE_TCP_PORT         0
REMOTE_IPS                   0
APP_BYTES                    0
SOURCE_APP_PACKETS           0
REMOTE_APP_PACKETS           0
SOURCE_APP_BYTES             0
REMOTE_APP_BYTES             0
APP_PACKETS                  0
DNS_QUERY_TIMES              0
Type                         0
dtype: int64

In [6]:
data.dtypes

URL                           object
URL_LENGTH                     int64
NUMBER_SPECIAL_CHARACTERS      int64
CHARSET                       object
WHOIS_COUNTRY                 object
TCP_CONVERSATION_EXCHANGE      int64
DIST_REMOTE_TCP_PORT           int64
REMOTE_IPS                     int64
APP_BYTES                      int64
SOURCE_APP_PACKETS             int64
REMOTE_APP_PACKETS             int64
SOURCE_APP_BYTES               int64
REMOTE_APP_BYTES               int64
APP_PACKETS                    int64
DNS_QUERY_TIMES              float64
Type                           int64
dtype: object

In [7]:
data = data.drop(columns="URL")

In [8]:
data.shape

(1774, 15)

In [9]:
data.head()

Unnamed: 0,URL_LENGTH,NUMBER_SPECIAL_CHARACTERS,CHARSET,WHOIS_COUNTRY,TCP_CONVERSATION_EXCHANGE,DIST_REMOTE_TCP_PORT,REMOTE_IPS,APP_BYTES,SOURCE_APP_PACKETS,REMOTE_APP_PACKETS,SOURCE_APP_BYTES,REMOTE_APP_BYTES,APP_PACKETS,DNS_QUERY_TIMES,Type
0,16,7,Iso-8859-1,Not_available,7,0,2,700,9,10,1153,832,9,2.0,1
1,16,6,Utf-8,Not_available,17,7,4,1230,17,19,1265,1230,17,0.0,0
2,16,6,Us-ascii,Not_available,0,0,0,0,0,0,0,0,0,0.0,0
3,17,6,Iso-8859-1,Us,31,22,3,3812,39,37,18784,4380,39,8.0,0
4,17,6,Utf-8,Us,57,2,5,4278,61,62,129889,4586,61,4.0,0


In [10]:
targets = data.loc[:, "Type"]

In [11]:
data = data.drop(columns="Type")

In [12]:
X_train, X_test, y_train, y_test = train_test_split(
    data, targets, random_state=42, test_size=0.2
)

In [13]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(1419, 14)
(355, 14)
(1419,)
(355,)


In [14]:
y_train.value_counts()

Type
0    1246
1     173
Name: count, dtype: int64

Encoding

In [15]:
columns_to_oh_encode = ["CHARSET"]
columns_to_target_encode = ["WHOIS_COUNTRY"]

In [16]:
pipeline = make_pipeline(
    # ce.OneHotEncoder(cols=columns_to_oh_encode),
    # ce.TargetEncoder(cols=columns_to_target_encode),
    # XGBClassifier(),
)

In [17]:
oh_enc = ce.OneHotEncoder(cols=columns_to_oh_encode)
tar_enc = ce.TargetEncoder(cols=columns_to_target_encode)

In [18]:
X_Train_enc = oh_enc.fit_transform(X_train)

In [19]:
X_Train_enc_tar = tar_enc.fit_transform(X_Train_enc, y_train)

In [20]:
X_Train_enc_tar.head()

Unnamed: 0,URL_LENGTH,NUMBER_SPECIAL_CHARACTERS,CHARSET_1,CHARSET_2,CHARSET_3,CHARSET_4,WHOIS_COUNTRY,TCP_CONVERSATION_EXCHANGE,DIST_REMOTE_TCP_PORT,REMOTE_IPS,APP_BYTES,SOURCE_APP_PACKETS,REMOTE_APP_PACKETS,SOURCE_APP_BYTES,REMOTE_APP_BYTES,APP_PACKETS,DNS_QUERY_TIMES
265,34,7,1,0,0,0,0.047404,23,3,11,2042,29,29,7939,2520,29,6.0
1752,145,31,0,1,0,0,0.047404,10,3,3,1354,18,19,2517,2010,18,8.0
1040,55,8,0,1,0,0,0.214876,24,3,9,1715,26,31,22292,1867,26,2.0
1436,75,16,1,0,0,0,0.047404,33,3,8,3647,41,39,32399,4247,41,8.0
1384,71,12,0,1,0,0,0.095807,25,3,5,3428,29,23,28007,3736,29,4.0


In [21]:
X_test_enc = oh_enc.transform(X_test)
X_test_enc_tar = tar_enc.transform(X_test_enc)

In [22]:
X_test_enc_tar.head()

Unnamed: 0,URL_LENGTH,NUMBER_SPECIAL_CHARACTERS,CHARSET_1,CHARSET_2,CHARSET_3,CHARSET_4,WHOIS_COUNTRY,TCP_CONVERSATION_EXCHANGE,DIST_REMOTE_TCP_PORT,REMOTE_IPS,APP_BYTES,SOURCE_APP_PACKETS,REMOTE_APP_PACKETS,SOURCE_APP_BYTES,REMOTE_APP_BYTES,APP_PACKETS,DNS_QUERY_TIMES
999,54,11,1,0,0,0,0.047404,5,0,2,582,9,9,1097,882,9,4.0
596,42,10,1,0,0,0,0.047404,0,0,0,0,0,0,0,0,0,0.0
1132,58,8,1,0,0,0,0.047404,9,0,3,1481,13,12,1327,1773,13,4.0
270,35,9,1,0,0,0,0.214876,39,0,2,3240,45,45,35301,3722,45,6.0
414,39,11,0,1,0,0,0.214876,11,1,4,1266,15,12,960,1548,15,4.0


In [23]:
X_test_enc_tar.isna().sum()

URL_LENGTH                   0
NUMBER_SPECIAL_CHARACTERS    0
CHARSET_1                    0
CHARSET_2                    0
CHARSET_3                    0
CHARSET_4                    0
WHOIS_COUNTRY                0
TCP_CONVERSATION_EXCHANGE    0
DIST_REMOTE_TCP_PORT         0
REMOTE_IPS                   0
APP_BYTES                    0
SOURCE_APP_PACKETS           0
REMOTE_APP_PACKETS           0
SOURCE_APP_BYTES             0
REMOTE_APP_BYTES             0
APP_PACKETS                  0
DNS_QUERY_TIMES              0
dtype: int64

In [24]:
X_Train_enc_tar.dtypes

URL_LENGTH                     int64
NUMBER_SPECIAL_CHARACTERS      int64
CHARSET_1                      int64
CHARSET_2                      int64
CHARSET_3                      int64
CHARSET_4                      int64
WHOIS_COUNTRY                float64
TCP_CONVERSATION_EXCHANGE      int64
DIST_REMOTE_TCP_PORT           int64
REMOTE_IPS                     int64
APP_BYTES                      int64
SOURCE_APP_PACKETS             int64
REMOTE_APP_PACKETS             int64
SOURCE_APP_BYTES               int64
REMOTE_APP_BYTES               int64
APP_PACKETS                    int64
DNS_QUERY_TIMES              float64
dtype: object

Model

In [25]:
dc = DecisionTreeClassifier()

In [26]:
dc.fit(X_Train_enc_tar, y_train)

In [27]:
y_pred = dc.predict(X_test_enc_tar)

In [30]:
np.unique(y_pred, return_counts=True)

(array([0, 1]), array([315,  40]))

In [32]:
f1_score(y_true=y_test, y_pred=y_pred)

0.7560975609756099

In [35]:
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[304,   9],
       [ 11,  31]])