NOTE: This notebook is done in Google Colab

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


#Load Data

In [98]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.svm import SVC
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, GridSearchCV

sns.set_theme(style="darkgrid")

In [99]:
root_path='/content/drive/MyDrive/ML Project/online_shopping_analysis'
data_path=f'{root_path}/processed_data.csv'
label_path=f'{root_path}/data_label.csv'

In [100]:
raw_processed_data = pd.read_csv(data_path)
raw_processed_data.head()

Unnamed: 0,Administrative,Administrative_Duration,Informational,Informational_Duration,ProductRelated,ProductRelated_Duration,BounceRates,ExitRates,PageValues,SpecialDay,Month,OperatingSystems,Browser,Region,TrafficType,VisitorType,Weekend
0,0.0,0.0,0.0,0.0,1.0,0.0,0.2,0.2,0.0,0.0,2,1,1,1,1,1,0
1,0.0,0.0,0.0,0.0,2.0,64.0,0.0,0.1,0.0,0.0,2,2,2,1,2,1,0
2,0.0,-1.0,0.0,-1.0,1.0,-1.0,0.2,0.2,0.0,0.0,2,4,1,9,3,1,0
3,0.0,0.0,0.0,0.0,2.0,2.666667,0.05,0.14,0.0,0.0,2,3,2,2,4,1,0
4,0.0,0.0,0.0,0.0,10.0,627.5,0.02,0.05,0.0,0.0,2,3,3,1,4,1,1


In [101]:
data_label = pd.read_csv(label_path)
data_label.head()

Unnamed: 0,label
0,0
1,0
2,0
3,0
4,0


# Scale data

In [102]:
"""StandarScaler"""
standard_scaler = preprocessing.StandardScaler().fit(raw_processed_data)
standardized_data = standard_scaler.transform(raw_processed_data)
standardized_data

array([[-0.69699296, -0.45773627, -0.39636998, ..., -0.76262903,
        -0.40102509, -0.55055169],
       [-0.69699296, -0.45773627, -0.39636998, ..., -0.51418219,
        -0.40102509, -0.55055169],
       [-0.69699296, -0.46339389, -0.39636998, ..., -0.26573535,
        -0.40102509, -0.55055169],
       ...,
       [-0.69699296, -0.45773627, -0.39636998, ...,  2.21873304,
        -0.40102509,  1.81635987],
       [ 0.50722805, -0.03341489, -0.39636998, ...,  1.72183936,
        -0.40102509, -0.55055169],
       [-0.69699296, -0.45773627, -0.39636998, ..., -0.51418219,
         2.25167844,  1.81635987]])

In [27]:
standardized_data.mean(axis=0)

array([-2.99661170e-17, -9.45085228e-17,  3.22712029e-17,  2.07457733e-17,
        3.68813748e-17, -9.22034369e-18, -3.45762888e-17,  3.91864607e-17,
        1.06033952e-16, -5.53220622e-17, -2.21288249e-16,  4.61017185e-18,
        1.49830585e-17,  7.72203784e-17, -2.99661170e-17, -2.25898420e-16,
        2.30508592e-18])

In [28]:
standardized_data.std(axis=0)

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

In [33]:
"""MaxMINScaler"""
max_min_scaler= preprocessing.MinMaxScaler()
max_min_data = max_min_scaler.fit_transform(raw_processed_data)
max_min_data

array([[0.00000000e+00, 2.94139275e-04, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 2.94139275e-04, 0.00000000e+00, ...,
        5.26315789e-02, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        1.05263158e-01, 0.00000000e+00, 0.00000000e+00],
       ...,
       [0.00000000e+00, 2.94139275e-04, 0.00000000e+00, ...,
        6.31578947e-01, 0.00000000e+00, 1.00000000e+00],
       [1.48148148e-01, 2.23545849e-02, 0.00000000e+00, ...,
        5.26315789e-01, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 2.94139275e-04, 0.00000000e+00, ...,
        5.26315789e-02, 5.00000000e-01, 1.00000000e+00]])

In [34]:
"""MaxAbsScaler for sparse data"""
max_abs_scaler = preprocessing.MaxAbsScaler()
max_abs_data = max_abs_scaler.fit_transform(raw_processed_data)
max_abs_data

array([[ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         5.00000000e-02,  3.33333333e-01,  0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         1.00000000e-01,  3.33333333e-01,  0.00000000e+00],
       [ 0.00000000e+00, -2.94225818e-04,  0.00000000e+00, ...,
         1.50000000e-01,  3.33333333e-01,  0.00000000e+00],
       ...,
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         6.50000000e-01,  3.33333333e-01,  1.00000000e+00],
       [ 1.48148148e-01,  2.20669364e-02,  0.00000000e+00, ...,
         5.50000000e-01,  3.33333333e-01,  0.00000000e+00],
       [ 0.00000000e+00,  0.00000000e+00,  0.00000000e+00, ...,
         1.00000000e-01,  6.66666667e-01,  1.00000000e+00]])

In [55]:
"""PowerTransofmr, convert to gaussian distribution"""
power_scaler = preprocessing.PowerTransformer(method='yeo-johnson')
gaussian_data = power_scaler.fit_transform(raw_processed_data)

In [None]:
gaussian_data_frame = pd.DataFrame(gaussian_data,columns=raw_processed_data.columns.values)

plt.figure(figsize=(30,15))
for index, col_name in enumerate(gaussian_data_frame.columns.values):
  ax_sub=plt.subplot(3,6, index+1);
  ax_sns = sns.histplot(data=gaussian_data_frame[col_name], ax=ax_sub, kde=True, stat='density', color='green')

In [38]:
"""QuantileTransform, convert to uniform distribution"""
quantile_scaler = preprocessing.QuantileTransformer(random_state=123)
uniform_data = quantile_scaler.fit_transform(raw_processed_data)
uniform_data

array([[0.        , 0.24024024, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.24024024, 0.        , ..., 0.35735736, 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.5990991 , 0.        ,
        0.        ],
       ...,
       [0.        , 0.24024024, 0.        , ..., 0.94744745, 0.        ,
        1.        ],
       [0.77327327, 0.70770771, 0.        , ..., 0.90740741, 0.        ,
        0.        ],
       [0.        , 0.24024024, 0.        , ..., 0.35735736, 0.92442442,
        1.        ]])

In [None]:
uniform_data_frame = pd.DataFrame(uniform_data,columns=raw_processed_data.columns.values)

plt.figure(figsize=(30,15))
for index, col_name in enumerate(uniform_data_frame.columns.values):
  ax_sub=plt.subplot(3,6, index+1);
  ax_sns = sns.histplot(data=uniform_data_frame[col_name], ax=ax_sub, kde=True, stat='density', color='green')

In [60]:
"""Normalization"""
# l1 norm
l1_scaler = preprocessing.Normalizer(norm='l1')
l1_data = l1_scaler.fit_transform(raw_processed_data)
l1_data

array([[ 0.        ,  0.        ,  0.        , ...,  0.11904762,
         0.11904762,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.02628121,
         0.0131406 ,  0.        ],
       [ 0.        , -0.04098361,  0.        , ...,  0.12295082,
         0.04098361,  0.        ],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.05844798,
         0.004496  ,  0.004496  ],
       [ 0.00851026,  0.15956732,  0.        , ...,  0.02340321,
         0.00212756,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.043181  ,
         0.043181  ,  0.0215905 ]])

In [61]:
# l2 norm
l2_scaler = preprocessing.Normalizer(norm='l2')
l2_data = l2_scaler.fit_transform(raw_processed_data)
l2_data

array([[ 0.        ,  0.        ,  0.        , ...,  0.31497039,
         0.31497039,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.03116638,
         0.01558319,  0.        ],
       [ 0.        , -0.09281567,  0.        , ...,  0.27844701,
         0.09281567,  0.        ],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.07020356,
         0.00540027,  0.00540027],
       [ 0.0112758 ,  0.21142116,  0.        , ...,  0.03100844,
         0.00281895,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.08134068,
         0.08134068,  0.04067034]])

In [63]:
# max norm
max_scaler = preprocessing.Normalizer(norm='max')
max_norm_data = max_scaler.fit_transform(raw_processed_data)
max_norm_data

array([[ 0.        ,  0.        ,  0.        , ...,  0.5       ,
         0.5       ,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.03125   ,
         0.015625  ,  0.        ],
       [ 0.        , -0.11111111,  0.        , ...,  0.33333333,
         0.11111111,  0.        ],
       ...,
       [ 0.        ,  0.        ,  0.        , ...,  0.07055631,
         0.00542741,  0.00542741],
       [ 0.01156069,  0.21676301,  0.        , ...,  0.03179191,
         0.00289017,  0.        ],
       [ 0.        ,  0.        ,  0.        , ...,  0.09411765,
         0.09411765,  0.04705882]])

In [69]:
complete_data_dict ={'standardized_data':standardized_data, 
                     'max_min_data':max_min_data, 
                     'max_abs_data':max_abs_data, 
                     'gaussian_data':gaussian_data, 
                     'uniform_data':uniform_data, 
                     'l1_data':l1_data, 
                     'l2_data':l2_data, 
                     'max_norm_data':max_norm_data}

# GridSearch For Data Modeling

In [125]:
data = complete_data_dict['standardized_data']

X_train, X_test, y_train, y_test = train_test_split(data, data_label.values, train_size=0.7, random_state=123, shuffle=True)

svc = SVC(C=1.0,
          kernel='linear',
          gamma='scale', 
          shrinking=True, 
          probability=False,  
          tol=0.0001, 
          cache_size=200, 
          class_weight='balanced', 
          verbose=True, 
          max_iter=200, 
          decision_function_shape='ovr', 
          break_ties=False, 
          random_state=123)

svc.fit(X_train, y_train)

[LibSVM]

  y = column_or_1d(y, warn=True)


SVC(C=1.0, break_ties=False, cache_size=200, class_weight='balanced', coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='linear',
    max_iter=200, probability=False, random_state=123, shrinking=True,
    tol=0.0001, verbose=True)

In [124]:
"""SVM grid search for each scaled data"""
svc_grid_params={
    'C': [1,0.1,0.01,0.001,0.0001,0.00001], 
    'kernel':['linear', 'poly', 'rbf', 'sigmoid'],
    'degree':[2,3,4,5],  
}

data = complete_data_dict['standardized_data']

X_train, X_test, y_train, y_test = train_test_split(data, data_label.values, train_size=0.7, random_state=123, shuffle=True)
classifer = SVC(gamma='scale', 
                shrinking=True, 
                probability=False, 
                tol=0.0001, 
                cache_size=200, 
                class_weight='balanced', 
                verbose=True, 
                max_iter=200, 
                decision_function_shape='ovr', 
                break_ties=False, 
                random_state=123)

gd_sr = GridSearchCV(estimator=classifer, param_grid=svc_grid_params, scoring='accuracy', cv=5, n_jobs=-1, verbose=True)
gd_sr.fit(X_train, y_train.reshape(y_train.shape[0]).astype('float32'))

Fitting 5 folds for each of 96 candidates, totalling 480 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  88 tasks      | elapsed:    6.3s
[Parallel(n_jobs=-1)]: Done 388 tasks      | elapsed:   28.4s
[Parallel(n_jobs=-1)]: Done 480 out of 480 | elapsed:   35.4s finished


[LibSVM]



GridSearchCV(cv=5, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight='balanced', coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=200,
                           probability=False, random_state=123, shrinking=True,
                           tol=0.0001, verbose=True),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [1, 0.1, 0.01, 0.001, 0.0001, 1e-05],
                         'degree': [2, 3, 4, 5],
                         'kernel': ['linear', 'poly', 'rbf', 'sigmoid']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=True)

In [123]:
y_train.astype('float32')

array([[0.],
       [0.],
       [0.],
       ...,
       [0.],
       [0.],
       [1.]], dtype=float32)

In [108]:
X_train, X_test, y_train, y_test = train_test_split(complete_data_dict['max_min_data'], data_label.values, train_size=0.7, random_state=123, shuffle=True)

In [117]:
y_train.reshape(y_train.shape[0]).shape

(8631,)