# Shill Bidding Dataset Analysis

In [155]:
import pandas as pd
import numpy as np
from numpy import where
import matplotlib.pyplot as plt
import seaborn as sns
from pandas import Series, DataFrame
from scipy import stats
#from scipy.special import boxcox1p
#from scipy.stats import boxcox_normmax
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
#from xgboost import XGBClassifier
from sklearn.svm import SVC

#Common Model Helpers
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn import metrics
import random
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

#Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
from matplotlib.colors import ListedColormap
from matplotlib import pyplot


from tqdm import tqdm
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

import optuna

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler


from collections import Counter
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from sklearn.svm import SVC
from imblearn.pipeline import Pipeline

seed = 42

In [156]:
shill = pd.read_csv('Shill_Bidding_Dataset.csv')
shill.head()

Unnamed: 0,Record_ID,Auction_ID,Bidder_ID,Bidder_Tendency,Bidding_Ratio,Successive_Outbidding,Last_Bidding,Auction_Bids,Starting_Price_Average,Early_Bidding,Winning_Ratio,Auction_Duration,Class
0,1,732,_***i,0.2,0.4,0.0,2.8e-05,0.0,0.993593,2.8e-05,0.666667,5,0
1,2,732,g***r,0.02439,0.2,0.0,0.013123,0.0,0.993593,0.013123,0.944444,5,0
2,3,732,t***p,0.142857,0.2,0.0,0.003042,0.0,0.993593,0.003042,1.0,5,0
3,4,732,7***n,0.1,0.2,0.0,0.097477,0.0,0.993593,0.097477,1.0,5,0
4,5,900,z***z,0.051282,0.222222,0.0,0.001318,0.0,0.0,0.001242,0.5,7,0


In [157]:
X = shill.drop(columns=['Class', 'Record_ID', 'Auction_ID', 'Bidder_ID'])
y = shill.Class
X

Unnamed: 0,Bidder_Tendency,Bidding_Ratio,Successive_Outbidding,Last_Bidding,Auction_Bids,Starting_Price_Average,Early_Bidding,Winning_Ratio,Auction_Duration
0,0.200000,0.400000,0.0,0.000028,0.000000,0.993593,0.000028,0.666667,5
1,0.024390,0.200000,0.0,0.013123,0.000000,0.993593,0.013123,0.944444,5
2,0.142857,0.200000,0.0,0.003042,0.000000,0.993593,0.003042,1.000000,5
3,0.100000,0.200000,0.0,0.097477,0.000000,0.993593,0.097477,1.000000,5
4,0.051282,0.222222,0.0,0.001318,0.000000,0.000000,0.001242,0.500000,7
...,...,...,...,...,...,...,...,...,...
6316,0.333333,0.160000,1.0,0.738557,0.280000,0.993593,0.686358,0.888889,3
6317,0.030612,0.130435,0.0,0.005754,0.217391,0.993593,0.000010,0.878788,7
6318,0.055556,0.043478,0.0,0.015663,0.217391,0.993593,0.015663,0.000000,7
6319,0.076923,0.086957,0.0,0.068694,0.217391,0.993593,0.000415,0.000000,7


In [158]:
y.value_counts()

0    5646
1     675
Name: Class, dtype: int64

In [159]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y, random_state = seed)

scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size = 0.2, stratify = y_train, random_state = seed)

print(f"Training set: {X_train.shape[0]}")
print(f"Validation set: {X_val.shape[0]}")
print(f"Test set: {X_test.shape[0]}")

Training set: 4044
Validation set: 1012
Test set: 1265


# Class balance by Undersampling

When undersampling, we aim to remove a number of the rows of the majority class (rows where class=0) in order to match the number of rows of the minority class (rows where class=1).

In [160]:
y_train.value_counts()

0    3612
1     432
Name: Class, dtype: int64

In [161]:
rus = RandomUnderSampler(random_state=seed)
X_train_us, y_train_us = rus.fit_resample(X_train, y_train)

In [162]:
y_train_us.value_counts()

1    432
0    432
Name: Class, dtype: int64

In [163]:
X_train_us, X_val_us, y_train_us, y_val_us = train_test_split(X_train_us, y_train_us, test_size=0.2, random_state = seed)

# Class balance by oversampling with SMOTE

Now, oversampling will be performed on the training data.

In [164]:
y_train.value_counts()

0    3612
1     432
Name: Class, dtype: int64

In [165]:
smote = SMOTE(random_state=seed)

X_train_os, y_train_os = smote.fit_resample(X_train, y_train)

y_train_os.value_counts()

1    3612
0    3612
Name: Class, dtype: int64

In [166]:
X_train_os, X_val_os, y_train_os, y_val_os = train_test_split(X_train_os, y_train_os, stratify=y_train_os, test_size=0.2, random_state = seed)

# Hybrid Sampling Method

A combination of under- and oversampling method using pipeline

In [167]:
# define pipeline
over = SMOTE(sampling_strategy = "not minority")
under = RandomUnderSampler(sampling_strategy = "majority")
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)

# transform the dataset
X_h, y_h = pipeline.fit_resample(X, y)
# summarize the new class distribution
counter = Counter(y_h)
print(counter)

Counter({0: 675, 1: 675})


In [168]:
X_train_h, X_test_h, y_train_h, y_test_h = train_test_split(X_h, y_h, test_size=0.2, stratify = y_h, random_state=seed)

scaler = StandardScaler()

X_train_h = scaler.fit_transform(X_train_h)
X_test_h = scaler.transform(X_test_h)

X_train_h, X_val_h, y_train_h, y_val_h = train_test_split(X_train_h, y_train_h, test_size=0.2, stratify=y_train_h, random_state=seed)

print(f"Training set: {X_train_h.shape[0]}")
print(f"Validation set: {X_val_h.shape[0]}")
print(f"Test set: {X_test_h.shape[0]}")

Training set: 864
Validation set: 216
Test set: 270


# Applying Ensemble Learning Algorithms

Ensemble Algorithms

    1. Bagged Decision Trees
    2. Extra Trees
    3. Stochastic Gradient Boosting
    4. AdaBoost
    5. XGBoost
    6. Gradient Boosting
    
Hyperparameter Tuning
After spot-checking machine learning algorithms and imbalanced algorithms, you will have some idea of what works and what does not on your specific dataset.

The simplest approach to hyperparameter tuning is to select the top five or 10 algorithms or algorithm combinations that performed well and tune the hyperparameters for each.

There are three popular hyperparameter tuning algorithms that you may choose from:

    Random Search
    Grid Search
    Bayesian Optimization

## Bagged Decision Trees

### Imbalanced data

In [169]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score

In [170]:
kfold = KFold(n_splits = 10, random_state = seed)
cart = DecisionTreeClassifier()
scaler = StandardScaler()

X_scaled = scaler.fit_transform(X)

kfold = model_selection.KFold(n_splits=10, random_state=42)
bclf = BaggingClassifier(base_estimator = cart, n_estimators = 200, random_state = seed)

scoring = ['accuracy', 'precision', 'recall', 'f1']

results = model_selection.cross_validate(bclf, X_scaled, y, cv = kfold, scoring = scoring)

for name in results.keys():
     print('{} {:.6f}'.format(name, np.average(results[name])))

fit_time 1.389901
score_time 0.052299
test_accuracy 0.997628
test_precision 0.992358
test_recall 0.985921
test_f1 0.989020


In [171]:
bclf = BaggingClassifier(base_estimator = cart, n_estimators = tree_no, random_state = seed).fit(X_train, y_train)


In [172]:
y_pred = bclf.predict(X_test)
print(classification_report(y_test, y_pred, target_names = ['No','Yes'], digits = 6))

              precision    recall  f1-score   support

          No   0.998232  0.999115  0.998673      1130
         Yes   0.992537  0.985185  0.988848       135

    accuracy                       0.997628      1265
   macro avg   0.995384  0.992150  0.993760      1265
weighted avg   0.997624  0.997628  0.997625      1265



### Undersampled Data

In [204]:
bclf = BaggingClassifier(base_estimator = cart, n_estimators = tree_no, random_state = seed).fit(X_train_us, y_train_us)

In [205]:
scoring = ['accuracy', 'precision', 'recall', 'f1']
results = model_selection.cross_validate(bclf, X_scaled, y, cv=kfold, scoring = scoring)

for name in results.keys():
     print('{} {:.6f}'.format(name, np.average(results[name])))

fit_time 1.566698
score_time 0.062501
test_accuracy 0.997628
test_precision 0.992358
test_recall 0.985921
test_f1 0.989020


In [206]:
y_pred = bclf.predict(X_test)
print(classification_report(y_test, y_pred, target_names = ['No','Yes'], digits = 6))

              precision    recall  f1-score   support

          No   1.000000  0.976991  0.988362      1130
         Yes   0.838509  1.000000  0.912162       135

    accuracy                       0.979447      1265
   macro avg   0.919255  0.988496  0.950262      1265
weighted avg   0.982766  0.979447  0.980230      1265



### Oversampled Data

In [210]:
bclf = BaggingClassifier(base_estimator = cart, n_estimators = tree_no, random_state = seed).fit(X_train_os, y_train_os)

In [211]:
scoring = ['accuracy', 'precision', 'recall', 'f1']
results = model_selection.cross_validate(bclf, X_scaled, y, cv=kfold, scoring = scoring)

for name in results.keys():
     print('{} {:.6f}'.format(name, np.average(results[name])))

fit_time 1.629899
score_time 0.061400
test_accuracy 0.997628
test_precision 0.992358
test_recall 0.985921
test_f1 0.989020


In [212]:
y_pred = bclf.predict(X_test)
print(classification_report(y_test, y_pred, target_names = ['No','Yes'], digits = 6))

              precision    recall  f1-score   support

          No   0.996473  1.000000  0.998233      1130
         Yes   1.000000  0.970370  0.984962       135

    accuracy                       0.996838      1265
   macro avg   0.998236  0.985185  0.991598      1265
weighted avg   0.996849  0.996838  0.996817      1265



### Hybrid Sampled Data

In [213]:
bclf = BaggingClassifier(base_estimator = cart, n_estimators = tree_no, random_state = seed).fit(X_train_h, y_train_h)

In [214]:
y_pred = bclf.predict(X_test)
print(classification_report(y_test, y_pred, target_names = ['No','Yes'], digits = 6))

              precision    recall  f1-score   support

          No   1.000000  0.485841  0.653961      1130
         Yes   0.188547  1.000000  0.317274       135

    accuracy                       0.540711      1265
   macro avg   0.594274  0.742920  0.485617      1265
weighted avg   0.913402  0.540711  0.618030      1265



## Extra Trees Classifiers

### Imbalanced Data

In [180]:
from sklearn.ensemble import ExtraTreesClassifier
xtclf = ExtraTreesClassifier(n_estimators = 150, max_features = 5).fit(X_train, y_train)

In [181]:
y_pred = xtclf.predict(X_test)
print(classification_report(y_test, y_pred, target_names = ['No','Yes'], digits = 6))

              precision    recall  f1-score   support

          No   0.998232  0.999115  0.998673      1130
         Yes   0.992537  0.985185  0.988848       135

    accuracy                       0.997628      1265
   macro avg   0.995384  0.992150  0.993760      1265
weighted avg   0.997624  0.997628  0.997625      1265



In [182]:
kfold = KFold(n_splits = 10, random_state = seed)
scaler = StandardScaler()

X_scaled = scaler.fit_transform(X)

kfold = model_selection.KFold(n_splits=10, random_state=42)
xtclf = ExtraTreesClassifier(n_estimators = 150, max_features = 5)

scoring = ['accuracy', 'precision', 'recall', 'f1']

results = model_selection.cross_validate(xtclf, X_scaled, y, cv = kfold, scoring = scoring)

for name in results.keys():
     print('{} {:.6f}'.format(name, np.average(results[name])))

fit_time 0.436200
score_time 0.029200
test_accuracy 0.997627
test_precision 0.989382
test_recall 0.988100
test_f1 0.988714


### Undersampled Data

In [183]:
xtclf = ExtraTreesClassifier(n_estimators = 150, max_features = 5).fit(X_train_us, y_train_us)
y_pred = xtclf.predict(X_test)
print(classification_report(y_test, y_pred, target_names = ['No','Yes'], digits = 6))

              precision    recall  f1-score   support

          No   1.000000  0.980531  0.990170      1130
         Yes   0.859873  1.000000  0.924658       135

    accuracy                       0.982609      1265
   macro avg   0.929936  0.990265  0.957414      1265
weighted avg   0.985046  0.982609  0.983178      1265



### Oversampled Data

In [184]:
xtclf = ExtraTreesClassifier(n_estimators = 150, max_features = 5).fit(X_train_os, y_train_os)
y_pred = xtclf.predict(X_test)
print(classification_report(y_test, y_pred, target_names = ['No','Yes'], digits = 6))

              precision    recall  f1-score   support

          No   0.999113  0.997345  0.998229      1130
         Yes   0.978102  0.992593  0.985294       135

    accuracy                       0.996838      1265
   macro avg   0.988608  0.994969  0.991761      1265
weighted avg   0.996871  0.996838  0.996848      1265



### Hybrid Sampled Data

In [185]:
xtclf = ExtraTreesClassifier(n_estimators = 150, max_features = 5).fit(X_train_h, y_train_h)
y_pred = xtclf.predict(X_test)
print(classification_report(y_test, y_pred, target_names = ['No','Yes'], digits = 6))

              precision    recall  f1-score   support

          No   1.000000  0.896460  0.945404      1130
         Yes   0.535714  1.000000  0.697674       135

    accuracy                       0.907510      1265
   macro avg   0.767857  0.948230  0.821539      1265
weighted avg   0.950452  0.907510  0.918966      1265



## Stochastic Gradient Boosting

### Imbalanced Dataset

In [186]:
from sklearn.ensemble import GradientBoostingClassifier
kfold = KFold(n_splits = 10, random_state = seed)
scaler = StandardScaler()

X_scaled = scaler.fit_transform(X)

kfold = model_selection.KFold(n_splits=10, random_state=42)

gbclf = GradientBoostingClassifier(n_estimators = 50, random_state=seed)

scoring = ['accuracy', 'precision', 'recall', 'f1']

results = model_selection.cross_validate(gbclf, X_scaled, y, cv = kfold, scoring = scoring)

for name in results.keys():
     print('{} {:.6f}'.format(name, np.average(results[name])))

fit_time 0.410501
score_time 0.005999
test_accuracy 0.996520
test_precision 0.983812
test_recall 0.984454
test_f1 0.984008


In [187]:
gbclf = GradientBoostingClassifier(n_estimators = 50, random_state=seed).fit(X_train, y_train)

y_pred = gbclf.predict(X_test)
print(classification_report(y_test, y_pred, target_names = ['No','Yes'], digits = 6))

              precision    recall  f1-score   support

          No   0.998229  0.997345  0.997787      1130
         Yes   0.977941  0.985185  0.981550       135

    accuracy                       0.996047      1265
   macro avg   0.988085  0.991265  0.989668      1265
weighted avg   0.996063  0.996047  0.996054      1265



### Undersampled Dataset

In [188]:
gbclf = GradientBoostingClassifier(n_estimators = 50, random_state=seed).fit(X_train_us, y_train_us)

y_pred = gbclf.predict(X_test)
print(classification_report(y_test, y_pred, target_names = ['No','Yes'], digits = 6))

              precision    recall  f1-score   support

          No   0.999094  0.976106  0.987466      1130
         Yes   0.832298  0.992593  0.905405       135

    accuracy                       0.977866      1265
   macro avg   0.915696  0.984349  0.946436      1265
weighted avg   0.981294  0.977866  0.978709      1265



### Oversampled Dataset

In [189]:
gbclf = GradientBoostingClassifier(n_estimators = 50, random_state=seed).fit(X_train_os, y_train_os)

y_pred = gbclf.predict(X_test)
print(classification_report(y_test, y_pred, target_names = ['No','Yes'], digits = 6))

              precision    recall  f1-score   support

          No   1.000000  0.991150  0.995556      1130
         Yes   0.931034  1.000000  0.964286       135

    accuracy                       0.992095      1265
   macro avg   0.965517  0.995575  0.979921      1265
weighted avg   0.992640  0.992095  0.992218      1265



### Hybrid sampled Dataset

In [190]:
gbclf = GradientBoostingClassifier(n_estimators = 50, random_state=seed).fit(X_train_h, y_train_h)

y_pred = gbclf.predict(X_test)
print(classification_report(y_test, y_pred, target_names = ['No','Yes'], digits = 6))

              precision    recall  f1-score   support

          No   1.000000  0.431858  0.603214      1130
         Yes   0.173745  1.000000  0.296053       135

    accuracy                       0.492490      1265
   macro avg   0.586873  0.715929  0.449633      1265
weighted avg   0.911823  0.492490  0.570434      1265



## AdaBoost Classifier

### Imbalanced Dataset

In [191]:
from sklearn.ensemble import AdaBoostClassifier
kfold = KFold(n_splits = 10, random_state = seed)
scaler = StandardScaler()

X_scaled = scaler.fit_transform(X)

kfold = model_selection.KFold(n_splits = 10, random_state = 42)

abclf = AdaBoostClassifier(n_estimators = 50, random_state = seed)

scoring = ['accuracy', 'precision', 'recall', 'f1']

results = model_selection.cross_validate(abclf, X_scaled, y, cv = kfold, scoring = scoring)

for name in results.keys():
     print('{} {:.6f}'.format(name, np.average(results[name])))

fit_time 0.294702
score_time 0.020099
test_accuracy 0.994621
test_precision 0.969238
test_recall 0.979374
test_f1 0.974124


In [192]:
abclf = AdaBoostClassifier(n_estimators = 50, random_state=seed).fit(X_train, y_train)

y_pred = abclf.predict(X_test)
print(classification_report(y_test, y_pred, target_names = ['No','Yes'], digits = 6))

              precision    recall  f1-score   support

          No   1.000000  0.995575  0.997783      1130
         Yes   0.964286  1.000000  0.981818       135

    accuracy                       0.996047      1265
   macro avg   0.982143  0.997788  0.989800      1265
weighted avg   0.996189  0.996047  0.996079      1265



### Undersampled Dataset

In [193]:
abclf = AdaBoostClassifier(n_estimators = 50, random_state=seed).fit(X_train_us, y_train_us)

y_pred = abclf.predict(X_test)
print(classification_report(y_test, y_pred, target_names = ['No','Yes'], digits = 6))

              precision    recall  f1-score   support

          No   0.997300  0.980531  0.988844      1130
         Yes   0.857143  0.977778  0.913495       135

    accuracy                       0.980237      1265
   macro avg   0.927221  0.979154  0.951170      1265
weighted avg   0.982342  0.980237  0.980803      1265



### Oversampled Dataset

In [194]:
abclf = AdaBoostClassifier(n_estimators = 50, random_state=seed).fit(X_train_os, y_train_os)

y_pred = abclf.predict(X_test)
print(classification_report(y_test, y_pred, target_names = ['No','Yes'], digits = 6))

              precision    recall  f1-score   support

          No   0.999111  0.994690  0.996896      1130
         Yes   0.957143  0.992593  0.974545       135

    accuracy                       0.994466      1265
   macro avg   0.978127  0.993641  0.985721      1265
weighted avg   0.994632  0.994466  0.994511      1265



### Hybrid sampled Dataset

In [195]:
abclf = AdaBoostClassifier(n_estimators = 50, random_state=seed).fit(X_train_h, y_train_h)

y_pred = abclf.predict(X_test)
print(classification_report(y_test, y_pred, target_names = ['No','Yes'], digits = 6))

              precision    recall  f1-score   support

          No   1.000000  0.634513  0.776394      1130
         Yes   0.246350  1.000000  0.395315       135

    accuracy                       0.673518      1265
   macro avg   0.623175  0.817257  0.585854      1265
weighted avg   0.919571  0.673518  0.735726      1265



## XGBoost Classifier

### Imbalanced Dataset

In [196]:
from xgboost import XGBClassifier

kfold = KFold(n_splits = 10, random_state = seed)
scaler = StandardScaler()

X_scaled = scaler.fit_transform(X)

kfold = model_selection.KFold(n_splits = 10, random_state = 42)

xgbclf = XGBClassifier(n_estimators = 50, random_state = seed)

scoring = ['accuracy', 'precision', 'recall', 'f1']

results = model_selection.cross_validate(xgbclf, X_scaled, y, cv = kfold, scoring = scoring)

for name in results.keys():
     print('{} {:.6f}'.format(name, np.average(results[name])))





fit_time 0.250899
score_time 0.016001
test_accuracy 0.996994
test_precision 0.982427
test_recall 0.988100
test_f1 0.985143


In [197]:
xgbclf = XGBClassifier(n_estimators = 50, random_state=seed).fit(X_train, y_train)

y_pred = xgbclf.predict(X_test)
print(classification_report(y_test, y_pred, target_names = ['No','Yes'], digits = 6))

              precision    recall  f1-score   support

          No   0.997343  0.996460  0.996901      1130
         Yes   0.970588  0.977778  0.974170       135

    accuracy                       0.994466      1265
   macro avg   0.983966  0.987119  0.985536      1265
weighted avg   0.994488  0.994466  0.994475      1265



### Undersampled Dataset

In [198]:
xgbclf = XGBClassifier(n_estimators = 50, random_state=seed).fit(X_train_us, y_train_us)

y_pred = xgbclf.predict(X_test)
print(classification_report(y_test, y_pred, target_names = ['No','Yes'], digits = 6))

              precision    recall  f1-score   support

          No   0.999100  0.982301  0.990629      1130
         Yes   0.870130  0.992593  0.927336       135

    accuracy                       0.983399      1265
   macro avg   0.934615  0.987447  0.958982      1265
weighted avg   0.985336  0.983399  0.983875      1265



### Oversampled Dataset

In [199]:
xgbclf = XGBClassifier(n_estimators = 50, random_state=seed).fit(X_train_os, y_train_os)

y_pred = xgbclf.predict(X_test)
print(classification_report(y_test, y_pred, target_names = ['No','Yes'], digits = 6))

              precision    recall  f1-score   support

          No   0.997340  0.995575  0.996457      1130
         Yes   0.963504  0.977778  0.970588       135

    accuracy                       0.993676      1265
   macro avg   0.980422  0.986676  0.983523      1265
weighted avg   0.993729  0.993676  0.993696      1265



### Hybrid Sampled Dataset

In [200]:
xgbclf = XGBClassifier(n_estimators = 50, random_state=seed).fit(X_train_h, y_train_h)

y_pred = xgbclf.predict(X_test)
print(classification_report(y_test, y_pred, target_names = ['No','Yes'], digits = 6))

              precision    recall  f1-score   support

          No   1.000000  0.401770  0.573232      1130
         Yes   0.166461  1.000000  0.285412       135

    accuracy                       0.465613      1265
   macro avg   0.583231  0.700885  0.429322      1265
weighted avg   0.911045  0.465613  0.542516      1265

