# 1 Import libraries

In [2]:
import random
import numpy as np
import pandas as pd

from ucimlrepo import fetch_ucirepo # for importing datasets
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectFromModel

from sklearn.preprocessing import OneHotEncoder
import category_encoders as ce
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample
from sklearn.metrics import roc_auc_score, make_scorer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score, KFold
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.model_selection import StratifiedKFold

from imblearn.over_sampling import SMOTE, ADASYN

from scipy.stats import mannwhitneyu
from scipy.stats import wilcoxon
import pymannkendall as mk

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")

# 2 Import datasets

In [3]:
# define a function to fetch datasets from UCI machine learning repotory and make it into dataframe

def fetch_datasets(dataset_id):
    dataset = fetch_ucirepo(id = dataset_id)
    
    # data (as pandas dataframes) 
    X = dataset.data.features # pandas dataframe X
    y = dataset.data.targets # pandas dataframe Y
    
    # join X and y into a whole dataframe
    df = pd.concat([X,y], axis = 1)
    df.to_excel(f'dataset.xlsx', index=False) # export to excel file
#     df.shape # number of observations and number of variables
    globals()[f"ds{dataset_id}"] = df
    
    return globals()[f"ds{dataset_id}"] # return a dataframe

### 2.1 Import datasets - A. Size of datasets

In [6]:
# fetch datasets for size of datasets

fetch_datasets(2) # id = 2 adult income

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,<=50K.
48838,64,,321403,HS-grad,9,Widowed,,Other-relative,Black,Male,0,0,40,United-States,<=50K.
48839,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,<=50K.
48840,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,<=50K.


### 2.2 Import datasets - B. Balance of datasets

In [8]:
# fetch datasets for class balance datasets

fetch_datasets(2)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States,<=50K.
48838,64,,321403,HS-grad,9,Widowed,,Other-relative,Black,Male,0,0,40,United-States,<=50K.
48839,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States,<=50K.
48840,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States,<=50K.


### 2.3 Import datasets - C. Number of features

In [12]:
# fetch datasets for number of features datasets

for ds_id in [75, 367,722]:
    fetch_datasets(ds_id)

### 2.4 Import datasets - D. Binary or non-binary

In [13]:
# fetch datasets for binary or non-binary response variable

binaryId = [159, 222, 367, 722, 891, 350] # remove 967, use 891
ordinalId = [54, 146, 148, 78, 31, 107] # remove 1, use 31
categoricalId = [697, 602, 26, 352, 23, 59] # remove 76, use 59

binary_ds = []
ordinal_ds = []
categorical_ds = []

for i in binaryId:
    binary_ds.append(fetch_datasets(i))

for j in ordinalId:
    ordinal_ds.append(fetch_datasets(j))

for z in categoricalId:
    categorical_ds.append(fetch_datasets(z))

In [105]:
# observe characteristics of datasets
for ds in categorical_ds:
    print(ds.iloc[:,-1].unique())

['Dropout' 'Graduate' 'Enrolled']
['SEKER' 'BARBUNYA' 'BOMBAY' 'CALI' 'HOROZ' 'SIRA' 'DERMASON']
['win' 'draw' 'loss']
['United Kingdom' 'France' 'Australia' 'Netherlands' 'Germany' 'Norway'
 'EIRE' 'Switzerland' 'Spain' 'Poland' 'Portugal' 'Italy' 'Belgium'
 'Lithuania' 'Japan' 'Iceland' 'Channel Islands' 'Denmark' 'Cyprus'
 'Sweden' 'Austria' 'Israel' 'Finland' 'Bahrain' 'Greece' 'Hong Kong'
 'Singapore' 'Lebanon' 'United Arab Emirates' 'Saudi Arabia'
 'Czech Republic' 'Canada' 'Unspecified' 'Brazil' 'USA'
 'European Community' 'Malta' 'RSA']
['draw' 'zero' 'one' 'two' 'three' 'four' 'five' 'six' 'seven' 'eight'
 'nine' 'ten' 'eleven' 'twelve' 'thirteen' 'fourteen' 'fifteen' 'sixteen']
['T' 'I' 'D' 'N' 'G' 'S' 'B' 'A' 'J' 'M' 'X' 'O' 'R' 'F' 'C' 'H' 'W' 'L'
 'P' 'E' 'V' 'Y' 'Q' 'U' 'K' 'Z']


# 3 Data preprocessing

### 3.0 Define functions

In [18]:
# define a function to make data preprocessing

def data_preprocessing(df):
    # # check for missing values
    # print(df.isnull().sum())
    
    # remove rows of missing values
    df.dropna(inplace = True)

    # label-encoding
    is_categorical = any(df.dtypes == 'object')
    
    if is_categorical:  # if dataframe has catogorical data
        categorical_features = df.select_dtypes(include = ['object']).columns # extract categorical features

        label_encoder = LabelEncoder() # label encoding
        
        for col in categorical_features:
            df[col] = label_encoder.fit_transform(df[col])
    
    return df

In [20]:
# check for balance of a dataset

def check_balance(ds):
    y = ds.iloc[:,-1]
    result = sorted(Counter(y).items())
    return result

In [80]:
# create a function to resample based on the given weight (use for module2 checking balanced or imbalanced)

def resample_dataset(ds, whole_counts, proportion):
    
    category = ds.iloc[:,-1].unique() # extract the categories in original dataset
    
    sub_datasets = {} # create a dictionary to store all sub datasets based on the category
    
    # extract sub datasets based on category
    for i, cat in enumerate(category):
        sub_datasets[f"sub_datasets{i+1}"] = ds[ds.iloc[:,-1] == cat]
    
    for name, dataset in sub_datasets.items():
        print(f"The number of rows before resampling in {name} is {len(dataset)}.")
    
    # resample each sub dataset
    size1 = int(whole_counts * proportion) # number of rows from category 1 based on the given weight
    size2 = whole_counts - size1 # number of rows given to cateogry 2
    
    resample1 = sub_datasets['sub_datasets1'].sample(n = size1, replace = True)
    resample2 = sub_datasets['sub_datasets2'].sample(n = size2, replace = True)
    print(f"Rows after resampling are {len(resample1)} and {len(resample2)} respectively.")

    # combine resample1 and resample2 into a new dataframe
    resample_ds = pd.concat([resample1, resample2])

    # shuffle the combined dataset
    resample_shuffle_ds = resample_ds.sample(frac=1).reset_index(drop=True)


    return resample_shuffle_ds


In [54]:
# define a function to process feature selection

def feature_selection(ds, n):
    X = ds.iloc[:, :-1]
    y = ds.iloc[:, -1]
    selector = SelectKBest(mutual_info_classif, k = n)
    X_new = selector.fit_transform(X, y)

    selected_features = X.columns[selector.get_support()]

    df_selection = pd.DataFrame(X_new, columns = selected_features)
    df_selection['target'] = y  
    
    return df_selection

In [56]:
# define a function to resample and return a balanced dataframe

def resample_to_balance(ds):
    X = ds.iloc[:, :-1]
    y = ds.iloc[:, -1]
    X_resampled, y_resampled = ADASYN().fit_resample(X, y)
    ds_resampled = pd.concat([X_resampled, y_resampled], axis = 1)
    
    return ds_resampled

### 3.1 Data preprocessing - A. Size of datasets

In [58]:
# check for response value and remove noise

ds2['income'].unique() # contains noise in column

array(['<=50K', '>50K', '<=50K.', '>50K.'], dtype=object)

In [60]:
ds2['income'] = ds2['income'].replace({'<=50K.':'<=50K', '>50K.':'>50K'})

ds2['income'].unique()

array(['<=50K', '>50K'], dtype=object)

In [62]:
data_preprocessing(ds2)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,7,77516,9,13,4,1,1,4,1,2174,0,40,39,0
1,50,6,83311,9,13,2,4,0,4,1,0,0,13,39,0
2,38,4,215646,11,9,0,6,1,4,1,0,0,40,39,0
3,53,4,234721,1,7,2,6,0,2,1,0,0,40,39,0
4,28,4,338409,9,13,2,10,5,2,0,0,0,40,5,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48836,33,4,245211,9,13,4,10,3,4,1,0,0,40,39,0
48837,39,4,215419,9,13,0,10,1,4,0,0,0,36,39,0
48839,38,4,374983,9,13,2,10,0,4,1,0,0,50,39,0
48840,44,4,83891,9,13,0,1,3,1,1,5455,0,40,39,0


In [68]:
# define different size of datasets

large_datasets = [47621,45000,40000,38000,35000,30000,25000,20000,15000,10000]
small_datasets = [50,100,200,300,400,500,600,700,800,900]

### 3.2 Data preprocessing - B. Balance of datasets

#### 3.2.1 Check for balance

In [72]:
# check for the balance of datasets

print(ds2.shape)

# check for the counts
category_counts = ds2['income'].value_counts()
print(category_counts)
category_proportion = ds2['income'].value_counts(normalize = True)
print(category_proportion) # before resampling

(47621, 15)
income
0    36080
1    11541
Name: count, dtype: int64
income
0    0.757649
1    0.242351
Name: proportion, dtype: float64


#### 3.2.2 Resample to datasets with different level of balance

In [84]:
# create different weight of imbalanced datasets and balanced datasets

imbalanced_datasets = [ds2] # create a new list to store all the imbalanced datasets, including the original dataset which is imbalanced too
imbalanced_proportions = [0.98, 0.95, 0.9, 0.85, 0.8, 0.75, 0.7, 0.65, 0.6] # given imbalanced rate
balanced_datasets = []
balanced_proportions = [0.46, 0.47, 0.48, 0.49, 0.5, 0.51, 0.52, 0.53, 0.54, 0.55]

# create imbalanced datasets based on the imbalanced weight
for i in imbalanced_proportions:
    imbalanced_datasets.append(resample_dataset(ds2, whole_counts, i))

# create balanced datasets based on the balanced weight\
for j in balanced_proportions:
    balanced_datasets.append(resample_dataset(ds2, whole_counts, j))

The number of rows before resampling in sub_datasets1 is 36080.
The number of rows before resampling in sub_datasets2 is 11541.
Rows after resampling are 46668 and 953 respectively.
The number of rows before resampling in sub_datasets1 is 36080.
The number of rows before resampling in sub_datasets2 is 11541.
Rows after resampling are 45239 and 2382 respectively.
The number of rows before resampling in sub_datasets1 is 36080.
The number of rows before resampling in sub_datasets2 is 11541.
Rows after resampling are 42858 and 4763 respectively.
The number of rows before resampling in sub_datasets1 is 36080.
The number of rows before resampling in sub_datasets2 is 11541.
Rows after resampling are 40477 and 7144 respectively.
The number of rows before resampling in sub_datasets1 is 36080.
The number of rows before resampling in sub_datasets2 is 11541.
Rows after resampling are 38096 and 9525 respectively.
The number of rows before resampling in sub_datasets1 is 36080.
The number of rows bef

### 3.3 Data preprocessing - C. Number of features

#### 3.3.1 Data preprocessing

In [88]:
features_preprocessed = []
for ds in [ds75, ds367, ds722]:
    features_preprocessed.append(data_preprocessing(ds))

#### 3.3.2 Create new datasets with feature selection

In [90]:
# feauture selection with ds75

features_ds75 = []
for n in [135, 110, 95, 75, 60, 45, 30, 15, 5]:
    a = feature_selection(features_preprocessed[0], n)
    features_ds75.append(a)

In [91]:
# feature selection with ds367

features_ds367 = []
for n in [100, 88, 76, 64, 52, 40, 28, 16, 4]:
    a = feature_selection(features_preprocessed[1], n)
    features_ds367.append(a)

In [94]:
# feature selection with ds722

features_ds722 = []
for n in [78, 69, 60, 51, 42, 33, 24, 15, 6]:
    a = feature_selection(features_preprocessed[2], n)
    features_ds722.append(a)

#### 3.3.2 Check for balance

In [193]:
# check balance of the datasets

print(check_balance(features_ds75[0]))
print(check_balance(features_ds367[0]))
print(check_balance(features_ds722[0]))

[(0.0, 5581), (1.0, 1017)]
[(-1, 48660), (1, 54284)]
[(0, 14632), (1, 14700)]
[(0, 100945), (1, 134850)]


### 3.4 Data preprocessing - D. Binary or non-binary

#### 3.4.1 Preprocessing

In [107]:
# binary and ordinal
binary_ds_preprocessed = []
ordinal_ds_preprocessed = []
categorical_ds_preprocessed = []

for i in binary_ds:
    binary_ds_preprocessed.append(data_preprocessing(i))
for j in ordinal_ds:
    ordinal_ds_preprocessed.append(data_preprocessing(j))
for z in categorical_ds:
    categorical_ds_preprocessed.append(data_preprocessing(z))

#### 3.4.2 Check for balance and dimension

In [109]:
# binary check
index = 0
for i in binary_ds_preprocessed:
    print(index)
    print(check_balance(i)) # check for balance
    print(i.shape) # check for dimension
    index += 1

# need resample: [0, 1, 4 5]
# features: 23

0
[(0, 12332), (1, 6688)]
(19020, 11)
1
[(0, 6056), (1, 1786)]
(7842, 17)
2
[(-1, 48660), (1, 54284)]
(102944, 116)
3
[(0, 14632), (1, 14700)]
(29332, 87)
4
[(0, 218334), (1, 35346)]
(253680, 22)
5
[(0, 23364), (1, 6636)]
(30000, 24)


In [111]:
# ordinal check
index = 0
for i in ordinal_ds_preprocessed:
    print(index)
    print(check_balance(i))
    print(i.shape)
    print("\n")
    index += 1
    
# need resample: [1, 2, 3, 4]
# features: 17

0
[(1.0, 300), (2.0, 300), (3.0, 300), (4.0, 300), (5.0, 300), (6.0, 298), (7.0, 300), (8.0, 300), (9.0, 300), (10.0, 300), (11.0, 300), (12.0, 300), (13.0, 299), (14.0, 300), (15.0, 300), (16.0, 300), (17.0, 300), (18.0, 300), (19.0, 300), (20.0, 300), (21.0, 300), (22.0, 300), (23.0, 300), (24.0, 300), (25.0, 300), (26.0, 300)]
(7797, 618)


1
[(1, 1533), (2, 703), (3, 1358), (4, 626), (5, 707), (7, 1508)]
(6435, 37)


2
[(1, 45586), (2, 50), (3, 171), (4, 8903), (5, 3267), (6, 10), (7, 13)]
(58000, 8)


3
[(1, 4913), (2, 329), (3, 28), (4, 88), (5, 115)]
(5473, 11)


4
[(1, 211840), (2, 283301), (3, 35754), (4, 2747), (5, 9493), (6, 17367), (7, 20510)]
(581012, 55)


5
[(0, 1657), (1, 1647), (2, 1696)]
(5000, 22)




In [113]:
# categorical check
index = 0
for i in categorical_ds_preprocessed:
    print(index)
    print(check_balance(i))
    print(i.shape)
    print("\n")
    index += 1

# need resample: [0, 1, 2, 3, 4]
# features: 17

0
[(0, 1421), (1, 794), (2, 2209)]
(4424, 37)


1
[(0, 1322), (1, 522), (2, 1630), (3, 3546), (4, 1928), (5, 2027), (6, 2636)]
(13611, 17)


2
[(0, 6449), (1, 16635), (2, 44473)]
(67557, 43)


3
[(0, 1259), (1, 401), (2, 17), (3, 2069), (4, 32), (5, 151), (6, 758), (7, 622), (8, 30), (9, 389), (10, 7485), (11, 61), (12, 695), (13, 8491), (14, 9495), (15, 146), (16, 182), (17, 250), (18, 803), (19, 358), (20, 45), (21, 35), (22, 127), (23, 2371), (24, 1086), (25, 341), (26, 1480), (27, 58), (28, 10), (29, 229), (30, 2533), (31, 462), (32, 1877), (33, 291), (34, 68), (35, 361878), (36, 244)]
(406829, 6)


4
[(0, 2796), (1, 1433), (2, 2854), (3, 2166), (4, 471), (5, 198), (6, 4553), (7, 1712), (8, 78), (9, 683), (10, 592), (11, 390), (12, 1985), (13, 4194), (14, 81), (15, 3597), (16, 246), (17, 27)]
(28056, 7)


5
[(0, 789), (1, 766), (2, 736), (3, 805), (4, 768), (5, 775), (6, 773), (7, 734), (8, 755), (9, 747), (10, 739), (11, 761), (12, 792), (13, 783), (14, 753), (15, 803), (16, 783),

#### 3.4.3 Resample

#### 3.4.3.1 for binary datasets

In [115]:
print("Balance of binary datasets")
for i in binary_ds_preprocessed:
    print(check_balance(i))

Balance of binary datasets
[(0, 12332), (1, 6688)]
[(0, 6056), (1, 1786)]
[(-1, 48660), (1, 54284)]
[(0, 14632), (1, 14700)]
[(0, 218334), (1, 35346)]
[(0, 23364), (1, 6636)]


In [117]:
# binary need resample: [0, 1, 4, 5]
binary_ds_resample = []

for i in [0, 1, 4, 5]:
    a = binary_ds_preprocessed[i]
    result = resample_to_balance(a)
    binary_ds_resample.append(result)

for j in [2, 3]:
    b = binary_ds_preprocessed[j]
    binary_ds_resample.append(b)

In [118]:
print("Balance of binary datasets after resampling:")
for i in binary_ds_resample:
    print(check_balance(i))

Balance of binary datasets after resampling:
[(0, 12332), (1, 11953)]
[(0, 6056), (1, 5865)]
[(0, 218334), (1, 214093)]
[(0, 23364), (1, 23941)]
[(-1, 48660), (1, 54284)]
[(0, 14632), (1, 14700)]


#### 3.4.3.2 for ordinal datasets

In [120]:
print("Balance of ordinal datasets:")
for i in ordinal_ds_preprocessed:
    print(check_balance(i))


Balance of ordinal datasets:
[(1.0, 300), (2.0, 300), (3.0, 300), (4.0, 300), (5.0, 300), (6.0, 298), (7.0, 300), (8.0, 300), (9.0, 300), (10.0, 300), (11.0, 300), (12.0, 300), (13.0, 299), (14.0, 300), (15.0, 300), (16.0, 300), (17.0, 300), (18.0, 300), (19.0, 300), (20.0, 300), (21.0, 300), (22.0, 300), (23.0, 300), (24.0, 300), (25.0, 300), (26.0, 300)]
[(1, 1533), (2, 703), (3, 1358), (4, 626), (5, 707), (7, 1508)]
[(1, 45586), (2, 50), (3, 171), (4, 8903), (5, 3267), (6, 10), (7, 13)]
[(1, 4913), (2, 329), (3, 28), (4, 88), (5, 115)]
[(1, 211840), (2, 283301), (3, 35754), (4, 2747), (5, 9493), (6, 17367), (7, 20510)]
[(0, 1657), (1, 1647), (2, 1696)]


In [121]:
# ordinal need resample: [2, 3, 4]

ordinal_ds_resample = []

for i in [2, 3, 4]:
    a = ordinal_ds_preprocessed[i]
    result = resample_to_balance(a)
    ordinal_ds_resample.append(result)

for j in [0, 1, 5]:
    b = ordinal_ds_preprocessed[j]
    ordinal_ds_resample.append(b)

In [122]:
print("Balance of ordinal datasets after resampling:")
for i in ordinal_ds_resample:
    print(check_balance(i))

Balance of ordinal datasets after resampling:
[(1, 45586), (2, 45591), (3, 45567), (4, 45578), (5, 45579), (6, 45586), (7, 45587)]
[(1, 4913), (2, 4905), (3, 4913), (4, 4914), (5, 4919)]
[(1, 277115), (2, 283301), (3, 282296), (4, 283284), (5, 283455), (6, 282678), (7, 282914)]
[(1.0, 300), (2.0, 300), (3.0, 300), (4.0, 300), (5.0, 300), (6.0, 298), (7.0, 300), (8.0, 300), (9.0, 300), (10.0, 300), (11.0, 300), (12.0, 300), (13.0, 299), (14.0, 300), (15.0, 300), (16.0, 300), (17.0, 300), (18.0, 300), (19.0, 300), (20.0, 300), (21.0, 300), (22.0, 300), (23.0, 300), (24.0, 300), (25.0, 300), (26.0, 300)]
[(1, 1533), (2, 703), (3, 1358), (4, 626), (5, 707), (7, 1508)]
[(0, 1657), (1, 1647), (2, 1696)]


#### 3.4.3.3 for categorical datasets

In [123]:
print("Balance of categorical datasets:")
for i in categorical_ds_preprocessed:
    print(check_balance(i))

Balance of categorical datasets:
[(0, 1421), (1, 794), (2, 2209)]
[(0, 1322), (1, 522), (2, 1630), (3, 3546), (4, 1928), (5, 2027), (6, 2636)]
[(0, 6449), (1, 16635), (2, 44473)]
[(0, 1259), (1, 401), (2, 17), (3, 2069), (4, 32), (5, 151), (6, 758), (7, 622), (8, 30), (9, 389), (10, 7485), (11, 61), (12, 695), (13, 8491), (14, 9495), (15, 146), (16, 182), (17, 250), (18, 803), (19, 358), (20, 45), (21, 35), (22, 127), (23, 2371), (24, 1086), (25, 341), (26, 1480), (27, 58), (28, 10), (29, 229), (30, 2533), (31, 462), (32, 1877), (33, 291), (34, 68), (35, 361878), (36, 244)]
[(0, 2796), (1, 1433), (2, 2854), (3, 2166), (4, 471), (5, 198), (6, 4553), (7, 1712), (8, 78), (9, 683), (10, 592), (11, 390), (12, 1985), (13, 4194), (14, 81), (15, 3597), (16, 246), (17, 27)]
[(0, 789), (1, 766), (2, 736), (3, 805), (4, 768), (5, 775), (6, 773), (7, 734), (8, 755), (9, 747), (10, 739), (11, 761), (12, 792), (13, 783), (14, 753), (15, 803), (16, 783), (17, 758), (18, 748), (19, 796), (20, 813), (2

In [124]:
# categorical need resample: [0, 1, 2, 3]
categorical_ds_resample = []

for i in [0, 1, 2, 3]:
    a = categorical_ds_preprocessed[i]
    result = resample_to_balance(a)
    categorical_ds_resample.append(result)

for j in [4, 5]:
    b = categorical_ds_preprocessed[j]
    categorical_ds_resample.append(b)

In [125]:
print("Balance of categorical datasets after resampling:")
for i in categorical_ds_resample:
    print(check_balance(i))

Balance of categorical datasets after resampling:
[(0, 2187), (1, 2027), (2, 2209)]
[(0, 3466), (1, 3546), (2, 3679), (3, 3546), (4, 3310), (5, 3320), (6, 3320)]
[(0, 43553), (1, 46031), (2, 44473)]
[(0, 361879), (1, 361894), (2, 361878), (3, 361873), (4, 361875), (5, 361877), (6, 361835), (7, 361867), (8, 361878), (9, 361886), (10, 362711), (11, 361877), (12, 361909), (13, 361483), (14, 362255), (15, 361852), (16, 361872), (17, 361880), (18, 361908), (19, 361830), (20, 361867), (21, 361877), (22, 361867), (23, 361818), (24, 362036), (25, 361918), (26, 361805), (27, 361881), (28, 361879), (29, 361849), (30, 362007), (31, 361882), (32, 361760), (33, 361860), (34, 361881), (35, 361878), (36, 361889)]
[(0, 2796), (1, 1433), (2, 2854), (3, 2166), (4, 471), (5, 198), (6, 4553), (7, 1712), (8, 78), (9, 683), (10, 592), (11, 390), (12, 1985), (13, 4194), (14, 81), (15, 3597), (16, 246), (17, 27)]
[(0, 789), (1, 766), (2, 736), (3, 805), (4, 768), (5, 775), (6, 773), (7, 734), (8, 755), (9, 74

#### 3.4.4 Feature selection

In [134]:
# check for dimension
for i in binary_ds_resample:
    print(i.shape)

(24285, 11)
(11921, 17)
(432427, 22)
(47305, 24)
(102944, 116)
(29332, 87)


In [136]:
# binary datasets
# need feature selection: [3, 4, 5]
# features: 23

binary_features = 23
binary_ds_feature_selection = []

for i in [3, 4, 5]:
    ds = binary_ds_resample[i]
    a = feature_selection(ds, binary_features) # feature selection
    binary_ds_feature_selection.append(a)

for j in [0, 1, 2]:
    b = binary_ds_resample[j]
    binary_ds_feature_selection.append(b)

In [137]:
# check for dimension
for i in ordinal_ds_resample:
    print(i.shape)

(319074, 8)
(24564, 11)
(1975043, 55)
(7797, 618)
(6435, 37)
(5000, 22)


In [138]:
# ordinal datasets
# need feature selection: [3, 4, 5]
# features: 17

ordinal_features = 17
ordinal_ds_feature_selection = []

for i in [3, 4, 5]:
    ds = ordinal_ds_resample[i]
    a = feature_selection(ds, ordinal_features) # feature selection
    ordinal_ds_feature_selection.append(a)

for j in [0, 1, 2]:
    b = ordinal_ds_resample[j]
    ordinal_ds_feature_selection.append(b)

# check for dimension after feature selection
for z in ordinal_ds_feature_selection:
    print(z.shape)

(7797, 18)
(6435, 18)
(5000, 18)
(319074, 8)
(24564, 11)
(1975043, 55)


In [139]:
# check for dimension
for i in categorical_ds_resample:
    print(i.shape)

(6423, 37)
(24187, 17)
(134057, 43)
(13390273, 6)
(28056, 7)
(20000, 17)


In [140]:
# categorical datasets
# need feature selection: [0, 2]
# features: 17

categorical_features = 17
categorical_ds_feature_selection = []

for i in [0, 2]:
    ds = categorical_ds_resample[i]
    a = feature_selection(ds, categorical_features) # feature selection
    categorical_ds_feature_selection.append(a)

for j in [1, 3, 4, 5]:
    b = categorical_ds_resample[j]
    categorical_ds_feature_selection.append(b)

# check for dimension after feature selection
for z in categorical_ds_feature_selection:
    print(z.shape)

(6423, 18)
(134057, 18)
(24187, 17)
(13390273, 6)
(28056, 7)
(20000, 17)


In [146]:
# check balance
for i in categorical_ds_feature_selection:
    print(check_balance(i))

[(0, 2187), (1, 2027), (2, 2209)]
[(0, 43553), (1, 46031), (2, 44473)]
[(0, 3466), (1, 3546), (2, 3679), (3, 3546), (4, 3310), (5, 3320), (6, 3320)]
[(0, 361879), (1, 361894), (2, 361878), (3, 361873), (4, 361875), (5, 361877), (6, 361835), (7, 361867), (8, 361878), (9, 361886), (10, 362711), (11, 361877), (12, 361909), (13, 361483), (14, 362255), (15, 361852), (16, 361872), (17, 361880), (18, 361908), (19, 361830), (20, 361867), (21, 361877), (22, 361867), (23, 361818), (24, 362036), (25, 361918), (26, 361805), (27, 361881), (28, 361879), (29, 361849), (30, 362007), (31, 361882), (32, 361760), (33, 361860), (34, 361881), (35, 361878), (36, 361889)]
[(0, 2796), (1, 1433), (2, 2854), (3, 2166), (4, 471), (5, 198), (6, 4553), (7, 1712), (8, 78), (9, 683), (10, 592), (11, 390), (12, 1985), (13, 4194), (14, 81), (15, 3597), (16, 246), (17, 27)]
[(0, 789), (1, 766), (2, 736), (3, 805), (4, 768), (5, 775), (6, 773), (7, 734), (8, 755), (9, 747), (10, 739), (11, 761), (12, 792), (13, 783), (1

In [148]:
# for categorical_ds_feature_selection[4], remove category[8, 14, 17]

df = categorical_ds_feature_selection[4]
response_name = df.columns[-1]
specific_value = [8, 14, 17]
df_filtered = df[~df[response_name].isin(specific_value)]

categorical_ds_feature_selection[4] = df_filtered

for i in categorical_ds_feature_selection:
    print(check_balance(i))

[(0, 2187), (1, 2027), (2, 2209)]
[(0, 43553), (1, 46031), (2, 44473)]
[(0, 3466), (1, 3546), (2, 3679), (3, 3546), (4, 3310), (5, 3320), (6, 3320)]
[(0, 361879), (1, 361894), (2, 361878), (3, 361873), (4, 361875), (5, 361877), (6, 361835), (7, 361867), (8, 361878), (9, 361886), (10, 362711), (11, 361877), (12, 361909), (13, 361483), (14, 362255), (15, 361852), (16, 361872), (17, 361880), (18, 361908), (19, 361830), (20, 361867), (21, 361877), (22, 361867), (23, 361818), (24, 362036), (25, 361918), (26, 361805), (27, 361881), (28, 361879), (29, 361849), (30, 362007), (31, 361882), (32, 361760), (33, 361860), (34, 361881), (35, 361878), (36, 361889)]
[(0, 2796), (1, 1433), (2, 2854), (3, 2166), (4, 471), (5, 198), (6, 4553), (7, 1712), (9, 683), (10, 592), (11, 390), (12, 1985), (13, 4194), (15, 3597), (16, 246)]
[(0, 789), (1, 766), (2, 736), (3, 805), (4, 768), (5, 775), (6, 773), (7, 734), (8, 755), (9, 747), (10, 739), (11, 761), (12, 792), (13, 783), (14, 753), (15, 803), (16, 783)

#### 3.4.5 Adjusting size of datasets

In [156]:
# check for size
for i in categorical_ds_feature_selection:
    print(i.shape[0])

6423
134057
24187
13390273
27870
20000


In [158]:
sizelist = []

for datasets in [binary_ds_feature_selection, ordinal_ds_feature_selection, categorical_ds_feature_selection]:
    for ds in datasets:
        size = ds.shape[0]
        sizelist.append(size)
        
minsize = min(sizelist)
print(f"The minimum size of all the datasets is {minsize}!")   

The minimum size of all the datasets is 5000!


In [160]:
# sampling in accordance to the same size
binary_sample = []
ordinal_sample = []
categorical_sample = []

for a in binary_ds_feature_selection:
    a_sample = a.sample(n = minsize, random_state = 42)
    binary_sample.append(a_sample)
for b in ordinal_ds_feature_selection:
    b_sample = b.sample(n = minsize, random_state = 42)
    ordinal_sample.append(b_sample)
for c in categorical_ds_feature_selection:
    c_sample = c.sample(n = minsize, random_state = 42)
    categorical_sample.append(c_sample)


#### 3.4.6 One-hot encoding for categorical data

In [162]:
one_hot_encoder = OneHotEncoder(sparse = False)

categorical_one_hot = []

for ds in categorical_sample:
    response_name = ds.columns[-1]
    ds_one_hot = pd.get_dummies(ds, columns = [response_name], drop_first = True)
    ds_one_hot_int = ds_one_hot.astype(int)
    categorical_one_hot.append(ds_one_hot_int)
   

In [164]:
binary_datasets = binary_sample
ordinal_datasets = ordinal_sample
categorical_datasets = categorical_one_hot

# 4 Define models

In [166]:
# Create a dictionary that store all models
models = {
    'Logistic Regression': make_pipeline(LogisticRegression(random_state=42)),
    'Random Forest': make_pipeline(RandomForestClassifier(random_state=42)),
    'KNN': make_pipeline(MinMaxScaler(), KNeighborsClassifier(n_neighbors = 5)), # need normalization
    'SVM': make_pipeline(MinMaxScaler(), SVC(probability=True, random_state=42)) # need normalization
}

# 5 AUC score matrix

### 5.0 Define functions for calculating AUC scores

In [353]:
# define functions for calculating AUC score of singe classifier

def AUC_single_binary(dataframe): # for binary response datasets
    m = 4 # number of models
    X = dataframe.iloc[:, :-1] # take first 14 columns as predictor variables
    y = dataframe.iloc[:, -1] # take the last column as response variables
    # split the dataset into training and test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42, stratify = y)
    
    auc_single = np.zeros(m)
    kf = KFold(n_splits = 5, shuffle = True, random_state = 42) # define k fold cross validation
    # run all models
    count = 0
    for name, model in models.items():
        auc_scores = cross_val_score(model, X_train, y_train, cv = kf, scoring = 'roc_auc') # introduce cross validation
        auc_single[count] = auc_scores.mean()
        count += 1
        
    return auc_single # auc score of all classifiers in one dataset (binary response)

def AUC_single_multiclass(dataframe): # for multiclass response datasets
    m = 4 # number of models
    X = dataframe.iloc[:, :-1] # take first 14 columns as predictor variables
    y = dataframe.iloc[:, -1] # take the last column as response variables
    # split the dataset into training and test set
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42, stratify = y) # stratified sampling
    X_model = X_train
    y_model = y_train

    auc_single = np.zeros(m) # create an empty auc matrix for single model
    skf = StratifiedKFold(n_splits = 5, random_state = 42, shuffle = True) # introduce stratified K fold
    count = 0
    for name, model in models.items():
        roc_auc_scorer = make_scorer(roc_auc_score, needs_proba = True, multi_class = 'ovr')  # multiclass AUC score
        auc_scores = cross_val_score(model, X_model, y_model, cv = skf, scoring = roc_auc_scorer)
        auc_single[count] = auc_scores.mean()
        # model.fit(X_train, y_train)
        # y_pred_proba = model.predict_proba(X_test)[:, 1]
        # auc = roc_auc_score(y_test, y_pred_proba)
        # auc_single[count] = auc # auc score for one model
        count += 1
        
    return auc_single # auc score of all classifiers in one dataset (multiclass response)  

In [355]:
# define functions for generating the AUC matrix

def AUC_Score_Matrix(type_datasets):
    m = 4 # number of models
    auc_all = np.zeros((len(type_datasets), m)) # create a new auc matrix that store all models

    for i in range(len(type_datasets)): # for every dataset in the list
    # module 1 : dataset size
        if type_datasets is large_datasets or type_datasets is small_datasets:
            # sample the dataframe
            dataframe = ds2.sample(n = type_datasets[i], random_state = 42) # sample the dataset according to the predefined size
            auc_all[i] = AUC_single_binary(dataframe)
            
    # module 2 & 3 : dataset balance & feature dimensionality
        elif type_datasets is balanced_datasets or type_datasets is imbalanced_datasets or type_datasets is features_ds367 or type_datasets is features_ds722 or type_datasets is binary_datasets: 
            dataframe = type_datasets[i]
            auc_all[i] = AUC_single_binary(dataframe)
        
        elif type_datasets is features_ds75: # because ds75 is imbalanced, it need resample
            dataframe = type_datasets[i]
            auc_all[i] = AUC_single_binary(dataframe)     
            
    # module 4: binary or non-binary datasets        
        elif type_datasets is ordinal_datasets or type_datasets is categorical_datasets: # ordinal or categorical datasets
            dataframe = type_datasets[i]
            auc_all[i] = AUC_single_multiclass(dataframe)
         
    return auc_all

### 5.1 AUC score matrix - A. Size of datasets

In [357]:
# AUC score matrix for size of datasets

auc_matrix_large = AUC_Score_Matrix(large_datasets)
auc_matrix_small = AUC_Score_Matrix(small_datasets)

print("The AUC score matrix of all models for large datasets is shown below:\n")
print(auc_matrix_large)
print("\n")
print("The AUC score matrix of all models for small datasets is shown below:\n")
print(auc_matrix_small)

The AUC score matrix of all models for large datasets is shown below:

[[0.69539226 0.90727513 0.84426688 0.89256684]
 [0.72397659 0.90581826 0.84539995 0.89302327]
 [0.68974023 0.9047379  0.8427893  0.89286139]
 [0.65431596 0.90486471 0.84415209 0.8930976 ]
 [0.72452882 0.90510334 0.84092227 0.89143033]
 [0.69938951 0.90285384 0.84008703 0.88953524]
 [0.69052365 0.90553258 0.84017649 0.89426709]
 [0.6958603  0.90267355 0.83609486 0.89122538]
 [0.68538123 0.89995713 0.83246108 0.88769292]
 [0.69268814 0.89707672 0.8287755  0.88467313]]


The AUC score matrix of all models for small datasets is shown below:

[[0.47333333 0.77333333 0.85666667 0.81333333]
 [0.65008936 0.84768939 0.81885101 0.88244949]
 [0.58648099 0.81791279 0.72824097 0.80607519]
 [0.56291808 0.8072782  0.72604592 0.78991497]
 [0.64128982 0.80940102 0.7172785  0.76121492]
 [0.68150623 0.83685872 0.74190029 0.81739368]
 [0.52023997 0.85267849 0.80691398 0.83827936]
 [0.60486937 0.84857353 0.77887418 0.82590504]
 [0.64320

In [178]:
# output matrix to csv file

# classifier name
classifiers = ['Logistic Regression', 'Random Forest', 'K-Nearest Neighbors', 'Support Vector Machine']

# convert matrix to pandas DataFrame
df_auc_large = pd.DataFrame(auc_matrix_large, columns = classifiers)
df_auc_small = pd.DataFrame(auc_matrix_small, columns = classifiers)

# output dataframe to excel
df_auc_large.to_csv('pythonAUC_large_datasets.csv', index = False)
df_auc_small.to_csv('pythonAUC_small_datasets.csv', index = False) 

### 5.2 AUC score matrix - B. Balance of datasets

In [206]:
# AUC score matrix for balance of datasets

auc_matrix_imbalance = AUC_Score_Matrix(imbalanced_datasets)
auc_matrix_balance = AUC_Score_Matrix(balanced_datasets)

print("The AUC score matrix of all models for imbalanced datasets is shown below:\n")
print(auc_matrix_imbalance)

print("\n")

print("The AUC score matrix of all models for balanced datasets is shown below:\n")
print(auc_matrix_balance)

The AUC score matrix of all models for imbalanced datasets is shown below:

[[0.6211682  0.90570501 0.84363928 0.89490902]
 [0.55416446 0.88916787 0.68779502 0.84864036]
 [0.58399394 0.92580355 0.76864926 0.87488341]
 [0.6263118  0.94215407 0.82554887 0.88786047]
 [0.68880805 0.9491471  0.853429   0.89214803]
 [0.68150246 0.95545521 0.86936132 0.895783  ]
 [0.72926288 0.95930096 0.8756459  0.89529451]
 [0.73464785 0.96357295 0.88634457 0.89965432]
 [0.69353141 0.96643149 0.88771649 0.8996106 ]
 [0.72760884 0.96878944 0.89316298 0.90015819]]


The AUC score matrix of all models for balanced datasets is shown below:

[[0.75632494 0.97118978 0.8930501  0.89862011]
 [0.75861695 0.97152751 0.891235   0.89652483]
 [0.7593268  0.97146553 0.8908188  0.89631971]
 [0.68102486 0.97234931 0.89288063 0.89840103]
 [0.68596378 0.97098602 0.89658144 0.90173901]
 [0.75913533 0.97118249 0.89641191 0.89970636]
 [0.7600777  0.97198592 0.89379502 0.90087058]
 [0.75070462 0.97071539 0.89398639 0.89810796]
 

In [208]:
# output matrix to csv file

# classifier name
classifiers = ['Logistic Regression', 'Random Forest', 'K-Nearest Neighbors', 'Support Vector Machine']

# convert matrix to pandas DataFrame
df_auc_imbalance = pd.DataFrame(auc_matrix_imbalance, columns = classifiers)
df_auc_balance = pd.DataFrame(auc_matrix_balance, columns = classifiers)

# output dataframe to excel
df_auc_imbalance.to_csv('pythonAUC_imbalanced_datasets.csv', index = False)
df_auc_balance.to_csv('pythonAUC_balanced_datasets.csv', index = False) 

### 5.3 AUC score matrix - C. Number of features

In [212]:
# AUC score matrix of number of features - ds75

auc_matrix_features_ds75 = AUC_Score_Matrix(features_ds75)
print(auc_matrix_features_ds75)

[[0.96356265 0.98767048 0.96796982 0.98477041]
 [0.95865422 0.98858624 0.96180403 0.97719382]
 [0.95227353 0.9860978  0.96336822 0.97895566]
 [0.93970242 0.9856204  0.95393315 0.97702892]
 [0.940801   0.983419   0.94843281 0.96962831]
 [0.92374383 0.98403034 0.9479038  0.96055243]
 [0.91923198 0.98162876 0.93917775 0.95622085]
 [0.89566397 0.97141951 0.92423056 0.93838743]
 [0.77874233 0.95172647 0.90444976 0.8128372 ]]


In [214]:
# output matrix to csv file

# classifier name
classifiers = ['Logistic Regression', 'Random Forest', 'K-Nearest Neighbors', 'Support Vector Machine']

# convert matrix to pandas DataFrame
df_auc_ds75 = pd.DataFrame(auc_matrix_features_ds75, columns = classifiers)

# output dataframe to excel
df_auc_ds75.to_csv('pythonAUC_feratures_ds75.csv', index = False) 

In [216]:
# AUC score matrix of number of features - ds367

auc_matrix_features_ds367 = AUC_Score_Matrix(features_ds367)
print(auc_matrix_features_ds367)

[[0.62317714 0.58736983 0.52568898 0.60818623]
 [0.61561493 0.57803666 0.52275834 0.60104791]
 [0.61383079 0.57356113 0.52095043 0.59726097]
 [0.60600964 0.56178836 0.51842288 0.59102936]
 [0.59004378 0.54699183 0.51987069 0.57661412]
 [0.59097643 0.53297904 0.52044847 0.57798509]
 [0.58806349 0.53433266 0.52336219 0.57347874]
 [0.56142889 0.5297608  0.51348229 0.54027196]
 [0.5071057  0.50349046 0.50262378 0.50089198]]


In [226]:
# output matrix to csv file

# classifier name
classifiers = ['Logistic Regression', 'Random Forest', 'K-Nearest Neighbors', 'Support Vector Machine']

# convert matrix to pandas DataFrame
df_auc_ds367 = pd.DataFrame(auc_matrix_features_ds367, columns = classifiers)

# output dataframe to excel
df_auc_ds367.to_csv('pythonAUC_feratures_ds367.csv', index = False) 

In [217]:
# AUC score matrix of number of features - ds722

auc_matrix_features_ds722 = AUC_Score_Matrix(features_ds722)
print(auc_matrix_features_ds722)

[[0.98862278 0.99251354 0.98260478 0.99051347]
 [0.98673006 0.99236434 0.98160905 0.9894059 ]
 [0.9873352  0.9916819  0.9817589  0.98946139]
 [0.98578217 0.99135431 0.9792632  0.98866355]
 [0.98219865 0.99003243 0.97939536 0.98695832]
 [0.98100975 0.98912696 0.97712601 0.98458984]
 [0.98095494 0.98882522 0.97721374 0.98542686]
 [0.97470299 0.98467944 0.97419442 0.97514318]
 [0.96916639 0.9720296  0.95652958 0.93192881]]


In [228]:
# output matrix to csv file

# classifier name
classifiers = ['Logistic Regression', 'Random Forest', 'K-Nearest Neighbors', 'Support Vector Machine']

# convert matrix to pandas DataFrame
df_auc_ds722 = pd.DataFrame(auc_matrix_features_ds722, columns = classifiers)

# output dataframe to excel
df_auc_ds722.to_csv('pythonAUC_feratures_ds722.csv', index = False) 

### 5.4 AUC score matrix - D. Binary or non-binary

In [230]:
# AUC score matrix of binary datasets

auc_matrix_binary = AUC_Score_Matrix(binary_datasets)
print("The AUC score matrix of all models for binary datasets is shown below:\n")
print(auc_matrix_binary)

The AUC score matrix of all models for binary datasets is shown below:

[[0.61529356 0.82908794 0.73888883 0.78353723]
 [0.56436366 0.50433269 0.50166074 0.54165069]
 [0.981404   0.98505123 0.97258917 0.98230585]
 [0.78737719 0.90775457 0.84128088 0.88208039]
 [0.84565902 0.93692565 0.85609032 0.8995241 ]
 [0.78456777 0.79648749 0.72053529 0.7891202 ]]


In [236]:
# output matrix to csv file

# classifier name
classifiers = ['Logistic Regression', 'Random Forest', 'K-Nearest Neighbors', 'Support Vector Machine']

# convert matrix to pandas DataFrame
df_auc_binary = pd.DataFrame(auc_matrix_binary, columns = classifiers)

# output dataframe to excel
df_auc_binary.to_csv('pythonAUC_binary_datasets.csv', index = False) 

In [231]:
# AUC score matrix of ordinal datasets

auc_matrix_ordinal = AUC_Score_Matrix(ordinal_datasets)
print("The AUC score matrix of all models for ordinal datasets is shown below:\n")
print(auc_matrix_ordinal)

The AUC score matrix of all models for ordinal datasets is shown below:

[[0.94926494 0.9584784  0.88448015 0.95944052]
 [0.94297964 0.98511828 0.96863441 0.98028775]
 [0.96852317 0.9644391  0.94627901 0.96782058]
 [0.89691896 0.99813767 0.98862382 0.96060396]
 [0.97728336 0.99842006 0.98092447 0.97900958]
 [0.8355242  0.95993619 0.89806101 0.92693791]]


In [238]:
# output matrix to csv file

# classifier name
classifiers = ['Logistic Regression', 'Random Forest', 'K-Nearest Neighbors', 'Support Vector Machine']

# convert matrix to pandas DataFrame
df_auc_ordinal = pd.DataFrame(auc_matrix_ordinal, columns = classifiers)

# output dataframe to excel
df_auc_ordinal.to_csv('pythonAUC_ordinal_datasets.csv', index = False) 

In [242]:
# AUC score matrix of categorical datasets

auc_matrix_categorical = AUC_Score_Matrix(categorical_datasets)
print("The AUC score matrix of all models for categorical datasets is shown below:\n")
print(auc_matrix_categorical)

The AUC score matrix of all models for categorical datasets is shown below:

[[0.89067837 0.97927366 0.95378335 0.97201097]
 [0.85535156 0.89586345 0.84213093 0.88196876]
 [0.72292079 0.99845492 0.99643296 0.99770076]
 [0.69297817 0.99987781 0.99880254 0.99848485]
 [0.99467637 0.99949505 0.98556658 1.        ]
 [0.99457239 0.99797856 0.9998644  0.99990138]]


In [240]:
# output matrix to csv file

# classifier name
classifiers = ['Logistic Regression', 'Random Forest', 'K-Nearest Neighbors', 'Support Vector Machine']

# convert matrix to pandas DataFrame
df_auc_categorical = pd.DataFrame(auc_matrix_categorical, columns = classifiers)

# output dataframe to excel
df_auc_categorical.to_csv('pythonAUC_categorical_datasets.csv', index = False) 

# 6 Comparison

### 6.0 Define model names and comparison function

In [244]:
model_names = ['Logistic Regression','Random Forests','K-Nearest Neighbors','Support Vector Machine']

In [252]:
def Sub_AUC_Vector(auc1, auc2):
    auc_diff = auc1 - auc2
    auc_diff_dict = {}

    # create sub vector from the difference matrix
    for i in range(auc_diff.shape[1]): # take every column as datasets of each model
        col = auc_diff[:, i]
        auc_diff_dict[f"{model_names[i]}"] = col
    
    return auc_diff_dict

### 6.1 Comparison - A. Size of datasets

In [272]:
# create difference vector based on models

difference_large_small = Sub_AUC_Vector(auc_matrix_large, auc_matrix_small)
difference_large_small

{'Logistic Regression': array([ 0.22205892,  0.07388724,  0.10325924,  0.09139788,  0.08323899,
         0.01788327,  0.17028367,  0.09099092,  0.04217973, -0.02216264]),
 'Random Forests': array([0.1339418 , 0.05812887, 0.08682511, 0.0975865 , 0.09570233,
        0.06599511, 0.05285409, 0.05410002, 0.03855669, 0.0374456 ]),
 'K-Nearest Neighbors': array([-0.01239979,  0.02654894,  0.11454833,  0.11810617,  0.12364377,
         0.09818674,  0.03326251,  0.05722068,  0.07348575,  0.09254304]),
 'Support Vector Machine': array([0.07923351, 0.01057378, 0.08678621, 0.10318263, 0.13021541,
        0.07214157, 0.05598773, 0.06532034, 0.06637794, 0.05488215])}

### 6.2 Comparison - B. Balance of datasets

In [274]:
# calcualte the difference score between balanced and imbalanced datasets

difference_balance_imbalance = Sub_AUC_Vector(auc_matrix_balance, auc_matrix_imbalance)
difference_balance_imbalance

{'Logistic Regression': array([ 0.13515674,  0.20445249,  0.17533286,  0.05471307, -0.00284428,
         0.07763287,  0.03081482,  0.01605677,  0.03672587, -0.09842309]),
 'Random Forests': array([0.06548477, 0.08235964, 0.04566198, 0.03019524, 0.02183893,
        0.01572728, 0.01268497, 0.00714244, 0.00306748, 0.00076476]),
 'K-Nearest Neighbors': array([0.04941083, 0.20343998, 0.12216954, 0.06733175, 0.04315244,
        0.0270506 , 0.01814911, 0.00764183, 0.00392914, 0.00179484]),
 'Support Vector Machine': array([ 0.00371109,  0.04788447,  0.0214363 ,  0.01054056,  0.00959098,
         0.00392336,  0.00557607, -0.00154636,  0.00190175,  0.00269645])}

### 6.3 Comparison - D. Binary or non-binary

#### 6.3.1 binary vs. ordinal

In [276]:
# calculate the difference between ordinal and binay datatsets

difference_ordinal_binary = Sub_AUC_Vector(auc_matrix_ordinal, auc_matrix_binary)
difference_ordinal_binary

{'Logistic Regression': array([ 0.33397138,  0.37861598, -0.01288083,  0.10954177,  0.13162434,
         0.05095643]),
 'Random Forests': array([ 0.12939046,  0.48078558, -0.02061213,  0.0903831 ,  0.06149441,
         0.1634487 ]),
 'K-Nearest Neighbors': array([ 0.14559132,  0.46697367, -0.02631016,  0.14734294,  0.12483414,
         0.17752572]),
 'Support Vector Machine': array([ 0.17590329,  0.43863705, -0.01448527,  0.07852357,  0.07948548,
         0.1378177 ])}

#### 6.3.2 binary vs. categorical

In [278]:
# calculate the difference between categorical and binary datasets

difference_categorical_binary = Sub_AUC_Vector(auc_matrix_categorical, auc_matrix_binary)
difference_categorical_binary

{'Logistic Regression': array([ 0.27538481,  0.29098791, -0.25848321, -0.09439902,  0.14901735,
         0.21000462]),
 'Random Forests': array([0.15018572, 0.39153076, 0.0134037 , 0.09212324, 0.06256941,
        0.20149107]),
 'K-Nearest Neighbors': array([0.21489453, 0.34047019, 0.02384379, 0.15752166, 0.12947626,
        0.2793291 ]),
 'Support Vector Machine': array([0.18847374, 0.34031807, 0.01539492, 0.11640446, 0.1004759 ,
        0.21078118])}

# 7 Hypothesis Tests

### 7.0 Define functions for hypothesis tests

In [282]:
# define the function of wilcoxon signed rank test for every classifier

def Wilcoxon_single_model(difference_dict):
    for name, vector in difference_dict.items():
        stat, p_value = wilcoxon(vector)
        print(f"Hypothesis test for {name} model:\n U Statistic: {stat}, p-Value: {p_value}")
        if p_value < 0.05:
            print(f"{name} performs significantly different on above two kinds of datasets!\n")

In [314]:
# define the function of Mann-Kendall trend test for every classifier

def MannKendall_trend_test(auc_matrix):
    n_column = matrix.shape[1]
    trend_result = []
    
    for i in range(n_column):
        column = auc_matrix[:, i]
        result = mk.original_test(column)
        trend_result.append(result)
        
    for index, res in enumerate(trend_result):
        if index == 0:
            print(f"Logistic Regression: Trend = {res.trend}, S statistic: {res.s}, P-value = {res.p}")
        elif index == 1:
            print(f"Random Forest: Trend = {res.trend}, S statistic: {res.s}, P-value = {res.p}")
        elif index == 2:
            print(f"K-Nearest Neighbors: Trend = {res.trend}, S statistic: {res.s}, P-value = {res.p}")
        elif index == 3:
            print(f"Support Vector Machine: Trend = {res.trend}, S statistic: {res.s}, P-value = {res.p}")  


In [324]:
# define the function of wilcoxon signed rank test for pariwise classifiers

def Wilcoxon_pairwise_model(difference_dict):
    for i in range(len(model_names)):
        for j in range(i+1, len(model_names)):
            name1 = model_names[i]
            name2 = model_names[j]
            vector1 = difference_dict[name1]
            vector2 = difference_dict[name2]

            # check whether they are significantly different
            stat_1, p_value_1 = wilcoxon(vector1, vector2)
            print(f"{name1} vs. {name2}\n U Statistic: {stat_1}, p-Value: {p_value_1}")

            # further check which one is better
            if p_value_1 < 0.05:
                print(f"The degree of differences between {name1} and {name2} is statistically significant!")

            # compare signed rank
                more = 0
                less = 0
                for p in vector1:
                    for q in vector2:
                        if p - q > 0:
                            more += 1
                        elif p - q <0:
                            less += 1

                if more > less:
                    print(f"The degree of differences of {name1} in size of datasets is significantly larger than {name2}!")
                elif more < less:
                    print(f"The degree of differences of {name2} in size of datasets is significantly larger than {name1}!")
                print("\n")
                
            else:
                print(f"The degree of differences between {name1} and {name2} is not statistically significant!")
                print("\n")
                continue

### 7.1 Single Classifier Hypothesis Test

#### 7.1.1 Wilcoxon-Signed Rank Test - A. Size of datasets

In [284]:
# wilcoxon signed rank test for every classifier between large and small datasets

Wilcoxon_single_model(difference_large_small)

Hypothesis test for Logistic Regression model:
 U Statistic: 2.0, p-Value: 0.005859375
Logistic Regression performs significantly different on above two kinds of datasets!

Hypothesis test for Random Forests model:
 U Statistic: 0.0, p-Value: 0.001953125
Random Forests performs significantly different on above two kinds of datasets!

Hypothesis test for K-Nearest Neighbors model:
 U Statistic: 1.0, p-Value: 0.00390625
K-Nearest Neighbors performs significantly different on above two kinds of datasets!

Hypothesis test for Support Vector Machine model:
 U Statistic: 0.0, p-Value: 0.001953125
Support Vector Machine performs significantly different on above two kinds of datasets!



#### 7.1.2 Wilcoxon-Signed Rank Test - B. Balance of datasets

In [286]:
# wilcoxon signed rank test for every classifier between balanced and imbalanced datasets

Wilcoxon_single_model(difference_balance_imbalance)

Hypothesis test for Logistic Regression model:
 U Statistic: 8.0, p-Value: 0.048828125
Logistic Regression performs significantly different on above two kinds of datasets!

Hypothesis test for Random Forests model:
 U Statistic: 0.0, p-Value: 0.001953125
Random Forests performs significantly different on above two kinds of datasets!

Hypothesis test for K-Nearest Neighbors model:
 U Statistic: 0.0, p-Value: 0.001953125
K-Nearest Neighbors performs significantly different on above two kinds of datasets!

Hypothesis test for Support Vector Machine model:
 U Statistic: 1.0, p-Value: 0.00390625
Support Vector Machine performs significantly different on above two kinds of datasets!



#### 7.1.3 Mann-Kendall Trend test - C. Number of features

In [308]:
# Mann-Kendall trend test in feature dimensionality for ds75

MannKendall_trend_test(auc_matrix_features_ds75)

Logistic Regression: Trend = decreasing, S statistic: -34.0, P-value = 0.0005806665459491267
Random Forest: Trend = decreasing, S statistic: -32.0, P-value = 0.0012293849189448647
K-Nearest Neighbors: Trend = decreasing, S statistic: -34.0, P-value = 0.0005806665459491267
Support Vector Machine: Trend = decreasing, S statistic: -34.0, P-value = 0.0005806665459491267


In [310]:
# Mann-Kendall trend test in feature dimensionality for ds367

MannKendall_trend_test(auc_matrix_features_ds367)

Logistic Regression: Trend = decreasing, S statistic: -34.0, P-value = 0.0005806665459491267
Random Forest: Trend = decreasing, S statistic: -34.0, P-value = 0.0005806665459491267
K-Nearest Neighbors: Trend = decreasing, S statistic: -20.0, P-value = 0.04760395472787149
Support Vector Machine: Trend = decreasing, S statistic: -34.0, P-value = 0.0005806665459491267


In [312]:
# Mann-Kendall trend test in feature dimensionality for ds722

MannKendall_trend_test(auc_matrix_features_ds722)

Logistic Regression: Trend = decreasing, S statistic: -34.0, P-value = 0.0005806665459491267
Random Forest: Trend = decreasing, S statistic: -36.0, P-value = 0.00026326080270355767
K-Nearest Neighbors: Trend = decreasing, S statistic: -30.0, P-value = 0.0024990288576112185
Support Vector Machine: Trend = decreasing, S statistic: -32.0, P-value = 0.0012293849189448647


#### 7.1.4 Wilcoxon-Signed Rank Test - D. Binary or non-binary

In [316]:
# wilcoxon signed rank test: between binary and ordinal datasets

Wilcoxon_single_model(difference_ordinal_binary)

Hypothesis test for Logistic Regression model:
 U Statistic: 1.0, p-Value: 0.0625
Hypothesis test for Random Forests model:
 U Statistic: 1.0, p-Value: 0.0625
Hypothesis test for K-Nearest Neighbors model:
 U Statistic: 1.0, p-Value: 0.0625
Hypothesis test for Support Vector Machine model:
 U Statistic: 1.0, p-Value: 0.0625


In [320]:
# wilcoxon signed rank test: between binary and categorical datasets

Wilcoxon_single_model(difference_categorical_binary)

Hypothesis test for Logistic Regression model:
 U Statistic: 5.0, p-Value: 0.3125
Hypothesis test for Random Forests model:
 U Statistic: 0.0, p-Value: 0.03125
Random Forests performs significantly different on above two kinds of datasets!

Hypothesis test for K-Nearest Neighbors model:
 U Statistic: 0.0, p-Value: 0.03125
K-Nearest Neighbors performs significantly different on above two kinds of datasets!

Hypothesis test for Support Vector Machine model:
 U Statistic: 0.0, p-Value: 0.03125
Support Vector Machine performs significantly different on above two kinds of datasets!



### 7.2 Pairwise Classifiers Hypothesis Test

#### 7.2.1 Wilcoxon-Signed Rank Test - A. Size of datasets

In [361]:
# wilcoxon pairwise test: large vs. small

Wilcoxon_pairwise_model(difference_large_small)

Logistic Regression vs. Random Forests
 U Statistic: 20.0, p-Value: 0.4921875
The degree of differences between Logistic Regression and Random Forests is not statistically significant!


Logistic Regression vs. K-Nearest Neighbors
 U Statistic: 26.0, p-Value: 0.921875
The degree of differences between Logistic Regression and K-Nearest Neighbors is not statistically significant!


Logistic Regression vs. Support Vector Machine
 U Statistic: 23.0, p-Value: 0.6953125
The degree of differences between Logistic Regression and Support Vector Machine is not statistically significant!


Random Forests vs. K-Nearest Neighbors
 U Statistic: 18.0, p-Value: 0.375
The degree of differences between Random Forests and K-Nearest Neighbors is not statistically significant!


Random Forests vs. Support Vector Machine
 U Statistic: 20.0, p-Value: 0.4921875
The degree of differences between Random Forests and Support Vector Machine is not statistically significant!


K-Nearest Neighbors vs. Support Vector

#### 7.2.2 Wilcoxon-Signed Rank Test - B. Balance of datasets

In [340]:
# wilcoxon pairwise test: balance vs. imbalance

Wilcoxon_pairwise_model(difference_balance_imbalance)

Logistic Regression vs. Random Forests
 U Statistic: 12.0, p-Value: 0.130859375
The degree of differences between Logistic Regression and Random Forests is not statistically significant!


Logistic Regression vs. K-Nearest Neighbors
 U Statistic: 19.0, p-Value: 0.431640625
The degree of differences between Logistic Regression and K-Nearest Neighbors is not statistically significant!


Logistic Regression vs. Support Vector Machine
 U Statistic: 8.0, p-Value: 0.048828125
The degree of differences between Logistic Regression and Support Vector Machine is statistically significant!
The degree of differences of Logistic Regression in size of datasets is significantly larger than Support Vector Machine!


Random Forests vs. K-Nearest Neighbors
 U Statistic: 6.0, p-Value: 0.02734375
The degree of differences between Random Forests and K-Nearest Neighbors is statistically significant!
The degree of differences of K-Nearest Neighbors in size of datasets is significantly larger than Random Fore

#### 7.2.3 Wilcoxon-Signed Rank Test - D. Binary or non-binary

In [365]:
# wilcoxon pairwise test: ordinal vs. binary

Wilcoxon_pairwise_model(difference_ordinal_binary)

Logistic Regression vs. Random Forests
 U Statistic: 9.0, p-Value: 0.84375
The degree of differences between Logistic Regression and Random Forests is not statistically significant!


Logistic Regression vs. K-Nearest Neighbors
 U Statistic: 9.0, p-Value: 0.84375
The degree of differences between Logistic Regression and K-Nearest Neighbors is not statistically significant!


Logistic Regression vs. Support Vector Machine
 U Statistic: 9.0, p-Value: 0.84375
The degree of differences between Logistic Regression and Support Vector Machine is not statistically significant!


Random Forests vs. K-Nearest Neighbors
 U Statistic: 3.0, p-Value: 0.15625
The degree of differences between Random Forests and K-Nearest Neighbors is not statistically significant!


Random Forests vs. Support Vector Machine
 U Statistic: 10.0, p-Value: 1.0
The degree of differences between Random Forests and Support Vector Machine is not statistically significant!


K-Nearest Neighbors vs. Support Vector Machine
 U S

In [344]:
# wilcoxon pairwise test: categorical vs. binary

Wilcoxon_pairwise_model(difference_categorical_binary)

Logistic Regression vs. Random Forests
 U Statistic: 7.0, p-Value: 0.5625
The degree of differences between Logistic Regression and Random Forests is not statistically significant!


Logistic Regression vs. K-Nearest Neighbors
 U Statistic: 4.0, p-Value: 0.21875
The degree of differences between Logistic Regression and K-Nearest Neighbors is not statistically significant!


Logistic Regression vs. Support Vector Machine
 U Statistic: 6.0, p-Value: 0.4375
The degree of differences between Logistic Regression and Support Vector Machine is not statistically significant!


Random Forests vs. K-Nearest Neighbors
 U Statistic: 2.0, p-Value: 0.09375
The degree of differences between Random Forests and K-Nearest Neighbors is not statistically significant!


Random Forests vs. Support Vector Machine
 U Statistic: 6.0, p-Value: 0.4375
The degree of differences between Random Forests and Support Vector Machine is not statistically significant!


K-Nearest Neighbors vs. Support Vector Machine
 U S