# Import libaries

In [1]:
import openml
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from progressbar import ProgressBar
from sklearn.model_selection import GridSearchCV,StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score, make_scorer
import time
import json

# Import all datatsets

In [2]:
datasets_df = openml.datasets.list_datasets(output_format="dataframe")
datasets_df.head(10)

Unnamed: 0,did,name,version,uploader,status,format,MajorityClassSize,MaxNominalAttDistinctValues,MinorityClassSize,NumberOfClasses,NumberOfFeatures,NumberOfInstances,NumberOfInstancesWithMissingValues,NumberOfMissingValues,NumberOfNumericFeatures,NumberOfSymbolicFeatures
2,2,anneal,1,1,active,ARFF,684.0,7.0,8.0,5.0,39.0,898.0,898.0,22175.0,6.0,33.0
3,3,kr-vs-kp,1,1,active,ARFF,1669.0,3.0,1527.0,2.0,37.0,3196.0,0.0,0.0,0.0,37.0
4,4,labor,1,1,active,ARFF,37.0,3.0,20.0,2.0,17.0,57.0,56.0,326.0,8.0,9.0
5,5,arrhythmia,1,1,active,ARFF,245.0,13.0,2.0,13.0,280.0,452.0,384.0,408.0,206.0,74.0
6,6,letter,1,1,active,ARFF,813.0,26.0,734.0,26.0,17.0,20000.0,0.0,0.0,16.0,1.0
7,7,audiology,1,1,active,ARFF,57.0,24.0,1.0,24.0,70.0,226.0,222.0,317.0,0.0,70.0
8,8,liver-disorders,1,1,active,ARFF,,,,0.0,6.0,345.0,0.0,0.0,6.0,0.0
9,9,autos,1,1,active,ARFF,67.0,22.0,3.0,6.0,26.0,205.0,46.0,59.0,15.0,11.0
10,10,lymph,1,1,active,ARFF,81.0,8.0,2.0,4.0,19.0,148.0,0.0,0.0,3.0,16.0
11,11,balance-scale,1,1,active,ARFF,288.0,3.0,49.0,3.0,5.0,625.0,0.0,0.0,4.0,1.0


# Extract Dataset

In [101]:
dataset = openml.datasets.get_dataset(43)

# Print description about dataset

In [102]:
# Print a summary
print(
    f"This is dataset '{dataset.name}', the target feature is "
    f"'{dataset.default_target_attribute}'"
)
print(f"URL: {dataset.url}")
print(dataset.description[:])

This is dataset 'haberman', the target feature is 'Survival_status'
URL: https://www.openml.org/data/v1/download/43/haberman.arff
**Author**:   

**Source**: Unknown -   

**Please cite**:   



1. Title: Haberman's Survival Data

 

 2. Sources:

    (a) Donor:   Tjen-Sien Lim (limt@stat.wisc.edu)

    (b) Date:    March 4, 1999

 

 3. Past Usage:

    1. Haberman, S. J. (1976). Generalized Residuals for Log-Linear

       Models, Proceedings of the 9th International Biometrics

       Conference, Boston, pp. 104-122.

    2. Landwehr, J. M., Pregibon, D., and Shoemaker, A. C. (1984),

       Graphical Models for Assessing Logistic Regression Models (with

       discussion), Journal of the American Statistical Association 79:

       61-83.

    3. Lo, W.-D. (1993). Logistic Regression Trees, PhD thesis,

       Department of Statistics, University of Wisconsin, Madison, WI.

 

 4. Relevant Information:

    The dataset contains cases from a study that was conducted between

    19

# Get depent and indepent variables

In [103]:
x, y, categorical_indicator, attribute_names = dataset.get_data(
    dataset_format="dataframe", target=dataset.default_target_attribute
)

# Data types

In [104]:
x.dtypes

Age_of_patient_at_time_of_operation              uint8
Patients_year_of_operation                    category
Number_of_positive_axillary_nodes_detected       uint8
dtype: object

In [105]:
cat_columns = x.select_dtypes(['category']).columns
x[cat_columns] = x[cat_columns].apply(lambda x: pd.factorize(x)[0])

# cat_columns1 = x.select_dtypes(['uint8']).columns
# x[cat_columns1] = x[cat_columns1].apply(lambda x: pd.factorize(x)[0])

In [106]:
x.dtypes

Age_of_patient_at_time_of_operation           uint8
Patients_year_of_operation                    int64
Number_of_positive_axillary_nodes_detected    uint8
dtype: object

# Change response variable to int

In [107]:
y

0      1
1      1
2      1
3      1
4      1
      ..
301    1
302    1
303    1
304    2
305    2
Name: Survival_status, Length: 306, dtype: category
Categories (2, object): ['1' < '2']

In [108]:
y=y.map({'1' :0,'2':1})

In [109]:
y.astype(int)

0      0
1      0
2      0
3      0
4      0
      ..
301    0
302    0
303    0
304    1
305    1
Name: Survival_status, Length: 306, dtype: int32

In [110]:
df_min_max_scaled = x.copy()
  
# apply normalization techniques
for column in df_min_max_scaled.columns:
    df_min_max_scaled[column] = (df_min_max_scaled[column] - df_min_max_scaled[column].min()) / (df_min_max_scaled[column].max() - df_min_max_scaled[column].min())    
  
# view normalized data
print(df_min_max_scaled)

     Age_of_patient_at_time_of_operation  Patients_year_of_operation  \
0                               0.000000                    0.000000   
1                               0.000000                    0.090909   
2                               0.000000                    0.181818   
3                               0.018868                    0.272727   
4                               0.018868                    0.181818   
..                                   ...                         ...   
301                             0.849057                    0.090909   
302                             0.867925                    0.727273   
303                             0.886792                    0.181818   
304                             0.905660                    0.181818   
305                             1.000000                    0.363636   

     Number_of_positive_axillary_nodes_detected  
0                                      0.019231  
1                                  

In [111]:
pbar = ProgressBar()
x=df_min_max_scaled.to_numpy()

In [12]:
x

array([[0.2976295 , 0.30977444, 0.77951002, ..., 0.31040892, 0.        ,
        0.        ],
       [0.23178227, 0.21503759, 0.78396437, ..., 0.28810409, 0.        ,
        0.        ],
       [0.2976295 , 0.37293233, 0.77505568, ..., 0.27881041, 0.        ,
        0.        ],
       ...,
       [0.21861282, 0.47969925, 0.78396437, ..., 0.23327138, 0.        ,
        0.2745098 ],
       [0.25197542, 0.29172932, 0.64142539, ..., 0.28903346, 0.        ,
        0.        ],
       [0.32352941, 0.50526316, 0.48775056, ..., 0.36152416, 0.        ,
        0.        ]])

In [13]:
k='Glass.json'

# k-Nearest Neighbors

In [14]:
from sklearn.neighbors import KNeighborsClassifier
knn=KNeighborsClassifier()
f1 = make_scorer(f1_score , average='macro')

In [15]:
from sklearn.model_selection import cross_val_score
start = time.time()
cross_val_score_knn=cross_val_score(knn,x,y,cv=5,scoring=f1).mean()
end = time.time()
knn_time=end-start

In [16]:
cross_validation = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
model = KNeighborsClassifier()

n_neighbors = range(1, 21)
weights = ['uniform', 'distance']
metric = ['euclidean', 'manhattan', 'minkowski']
grid = dict(n_neighbors=n_neighbors,weights=weights,metric=metric)

grid_search = GridSearchCV(estimator=model, param_grid=grid, cv=cross_validation, scoring=f1)
grid_result = grid_search.fit(x,y)



# Best score using KNN parameter

In [17]:
Knn_Para=grid_result.best_params_
Knn_Para

{'metric': 'manhattan', 'n_neighbors': 1, 'weights': 'uniform'}

# Best score using KNN score

In [18]:
Knn_f1_score=grid_result.best_score_
Knn_f1_score

0.6948215051575267

In [19]:
details = {
    'name': 'KNN',
    'time': knn_time,
    'f1_score':Knn_f1_score,
    'Best_Parameter':Knn_Para,
    'Cross validation score':cross_val_score_knn,
    
}
with open(k, 'a') as json_file:
    json.dump(details, json_file)

# Linear Discriminant Analysis

In [20]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
clf = LinearDiscriminantAnalysis()

In [21]:
from sklearn.model_selection import cross_val_score
start = time.time()
cross_val_score_linear=cross_val_score(knn,x,y,cv=5,scoring=f1).mean()
end = time.time()
Linear_dis_time=end-start

In [22]:
cross_validation = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
model = LinearDiscriminantAnalysis()

solver = ['svd', 'lsqr', 'eigen']
grid = dict(solver=solver)

grid_search = GridSearchCV(estimator=model, param_grid=grid, cv=cross_validation, scoring=f1)
grid_result = grid_search.fit(x,y)

# Best score using linear discriminent parameter

In [23]:
Linear_Para=grid_result.best_params_
Linear_Para

{'solver': 'svd'}

# Best score using discriminent score

In [24]:
Linear_f1_score=grid_result.best_score_
Linear_f1_score

0.4943306388355354

In [25]:
details = {
    'name': 'Linear_Dis',
    'time': Linear_dis_time,
    'f1_score':Linear_f1_score,
    'Best_Parameter':Linear_Para,
    'Cross validation score':cross_val_score_linear,
    
}
with open(k, 'a') as json_file:
    json.dump(details, json_file)

# Naive Bayes

In [26]:
from sklearn.naive_bayes import GaussianNB
nb=GaussianNB()
f1 = make_scorer(f1_score , average='macro')

In [27]:
nb.priors

In [28]:
from sklearn.model_selection import cross_val_score
start = time.time()
cross_val_score_naive=cross_val_score(nb,x,y,cv=5,scoring=f1).mean()
end = time.time()
Naive_time=end-start

In [29]:
cross_validation = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
model = GaussianNB()


var_smoothing = [1e-09,2e-09,3e-09,4e-09,5e-09,6e-09,7e-09,8e-09,9e-09,10e-09]

grid = dict(var_smoothing=var_smoothing)

grid_search = GridSearchCV(estimator=model, param_grid=grid, cv=cross_validation, scoring=f1)
grid_result = grid_search.fit(x,y)



# Best score using Naive parameter

In [30]:
naiv_Para=grid_result.best_params_
naiv_Para

{'var_smoothing': 1e-09}

# Best score using Naive score

In [31]:
naive_f1_score=grid_result.best_score_
naive_f1_score

0.46021406505240126

In [32]:
details = {
    'name': 'Naive Bayes',
    'time': Naive_time,
    'f1_score':naive_f1_score,
    'Best_Parameter':naiv_Para,
    'Cross validation score':cross_val_score_naive,
    
}
with open(k, 'a') as json_file:
    json.dump(details, json_file)

# Support Vector Machine

In [33]:
from sklearn.svm import SVC
svm=SVC()
f1 = make_scorer(f1_score , average='macro')

In [34]:
from sklearn.model_selection import cross_val_score
start = time.time()
cross_val_score_support=cross_val_score(svm,x,y,cv=5,scoring=f1).mean()
end = time.time()
Support_time=end-start

In [35]:
cross_validation = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
model = SVC()

c = range(1, 20, 1)
kernel = ['poly', 'rbf', 'sigmoid']
gamma = ['scale', 'auto']
grid = dict(C=c,kernel=kernel,gamma=gamma)

grid_search = GridSearchCV(estimator=model, param_grid=grid, cv=cross_validation, scoring=f1)
grid_result = grid_search.fit(x,y)



# Best score using support parameter

In [36]:
support_Para=grid_result.best_params_
support_Para

{'C': 19, 'gamma': 'scale', 'kernel': 'poly'}

# Best score using support score

In [37]:
support_f1_score=grid_result.best_score_
support_f1_score

0.6978232649400613

In [38]:
details = {
    'name': 'Support_Vector',
    'time': Support_time,
    'f1_score':support_f1_score,
    'Best_Parameter':support_Para,
    'Cross validation score':cross_val_score_support,
    
}
with open(k, 'a') as json_file:
    json.dump(details, json_file)

# Logistic regression

In [39]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=1)
f1 = make_scorer(f1_score , average='macro')

In [40]:
from sklearn.model_selection import cross_val_score
start = time.time()
cross_val_score_Logistic=cross_val_score(svm,x,y,cv=5,scoring=f1).mean()

end = time.time()
logistic_time=end-start

In [41]:
cross_validation = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
Logistic = LogisticRegression()

# dual = ['bool','False']
penalty = [ 'l2']
# class_weight = ['dict', 'balanced']
grid = dict(penalty=penalty)

grid_search = GridSearchCV(estimator=Logistic, param_grid=grid, cv=cross_validation, scoring=f1)
grid_result = grid_search.fit(x,y)


# Best score using Logistic parameter

In [42]:
logistic_Para=grid_result.best_params_
logistic_Para

{'penalty': 'l2'}

# Best score using Logistic score

In [43]:
logistic_f1_score=grid_result.best_score_
logistic_f1_score

0.3665027853775585

In [44]:
details = {
    'name': 'Logistic',
    'time': logistic_time,
    'f1_score':logistic_f1_score,
    'Best_Parameter':logistic_Para,
    'Cross validation score':cross_val_score_Logistic,
    
}
with open(k, 'a') as json_file:
    json.dump(details, json_file)

# Random Forest

In [45]:
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier()
f1 = make_scorer(f1_score , average='macro')

In [46]:
from sklearn.model_selection import cross_val_score
start = time.time()
cross_val_score_random=cross_val_score(rf,x,y,cv=5,scoring=f1).mean()

end = time.time()
random_time=end-start

In [47]:
cross_validation = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
rf=RandomForestClassifier()

n_estimators = range(1, 20, 1)
criterion = ['gini', 'entropy']
grid = dict(n_estimators=n_estimators,criterion=criterion)

grid_search = GridSearchCV(estimator=rf, param_grid=grid, cv=cross_validation, scoring=f1)
grid_result = grid_search.fit(x,y)


# Best score using random parameter

In [48]:
random_Para=grid_result.best_params_
random_Para

{'criterion': 'gini', 'n_estimators': 18}

# Best score using random score

In [49]:
random_f1_score=grid_result.best_score_
random_f1_score

0.7919171818685167

In [50]:
details = {
    'name': 'Random',
    'time': random_time,
    'f1_score':random_f1_score,
    'Best_Parameter':random_Para,
    'Cross validation score':cross_val_score_random,
    
}
with open(k, 'a') as json_file:
    json.dump(details, json_file)

# Ada boost

In [51]:
from sklearn.ensemble import AdaBoostClassifier
clf = AdaBoostClassifier(n_estimators=100, random_state=0)
f1 = make_scorer(f1_score , average='macro')

In [52]:
from sklearn.model_selection import cross_val_score
start = time.time()
cross_val_score_Adaboost=cross_val_score(clf,x,y,cv=5,scoring=f1).mean()
end = time.time()
Adaboost_time=end-start

In [53]:
cross_validation = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
clf = AdaBoostClassifier()

n_estimators = range(1, 100, 1)
algorithm = ['SAMME', 'SAMME.R']
grid = dict(n_estimators=n_estimators,algorithm=algorithm)

grid_search = GridSearchCV(estimator=clf, param_grid=grid, cv=cross_validation, scoring=f1)
grid_result = grid_search.fit(x,y)



# Best score using Adaboost parameter

In [54]:
Adaboost_Para=grid_result.best_params_
Adaboost_Para

{'algorithm': 'SAMME', 'n_estimators': 97}

# Best score using Adaboost score

In [55]:
Adaboost_f1_score=grid_result.best_score_
Adaboost_f1_score

0.4977531590345568

In [56]:
Adaboost_time

2.2499382495880127

In [57]:
details = {
    'name': 'Adaboost',
    'time': Adaboost_time,
    'f1_score':Adaboost_f1_score,
    'Best_Parameter':Adaboost_Para,
    'Cross validation score':cross_val_score_Adaboost
    
}
with open(k, 'a') as json_file:
    json.dump(details, json_file)

# Gradient boost

In [58]:
from sklearn.ensemble import GradientBoostingClassifier
gb_clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=2, random_state=0)
f1 = make_scorer(f1_score , average='macro')   

In [59]:
from sklearn.model_selection import cross_val_score
start = time.time()
cross_val_score_gradient=cross_val_score(clf,x,y,cv=5,scoring=f1).mean()
end = time.time()
gradient_time=end-start

In [62]:
cross_validation = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
gb_clf=GradientBoostingClassifier()

max_depth=range(1, 10, 1)
n_estimators=[100]
# learning_rate =np.arange(0.05, 0.2, 0.05)
learning_rate=[0.01]
grid = dict(n_estimators=n_estimators,max_depth=max_depth,learning_rate=learning_rate)

grid_search = GridSearchCV(estimator=gb_clf, param_grid=grid, cv=cross_validation, scoring=f1)
grid_result = grid_search.fit(x,y)



# Best score using Gradient boost parameter

In [63]:
Gradient_Para=grid_result.best_params_
Gradient_Para

{'learning_rate': 0.01, 'max_depth': 6, 'n_estimators': 100}

# Best score using Gradient boost score

In [64]:
Gradient_f1_score=grid_result.best_score_
Gradient_f1_score

0.6296959320223696

In [65]:
gradient_time

1.2191698551177979

In [66]:
details = {
    'name': 'Gradient',
    'time': gradient_time,
    'f1_score':Gradient_f1_score,
    'Best_Parameter':Gradient_Para,
    'Cross validation score':cross_val_score_gradient
    
}
with open(k, 'a') as json_file:
    json.dump(details, json_file)

# XGBoost

In [67]:
import xgboost as xgb
gb_clf = xgb.XGBClassifier(eta=0.3,gamma=0,max_depth=6)
f1 = make_scorer(f1_score , average='macro')  

In [68]:
from sklearn.model_selection import cross_val_score
start = time.time()
cross_val_score_xgboost=cross_val_score(gb_clf,x,y,cv=5,scoring=f1).mean()
end = time.time()
xgboost_time=end-start





















In [69]:
cross_validation = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
gb_clf=xgb.XGBClassifier()

gamma=range(1, 10, 1)
eta=[0.001,0.01,0.05]
grid = dict(eta=eta,gamma=gamma)

grid_search = GridSearchCV(estimator=gb_clf, param_grid=grid, cv=cross_validation, scoring=f1)
grid_result = grid_search.fit(x,y)

































































































































































































































































































































































































































































































































































# Best score using XG boost parameter

In [70]:
XG_Para=grid_result.best_params_
XG_Para

{'eta': 0.05, 'gamma': 2}

# Best score using Gradient boost score

In [71]:
XG_f1_score=grid_result.best_score_
XG_f1_score

0.6935561421851745

In [72]:
xgboost_time

3.612431287765503

In [73]:
details = {
    'name': 'XGboost',
    'time': xgboost_time,
    'f1_score':XG_f1_score,
    'Best_Parameter':XG_Para,
    'Cross validation score':cross_val_score_xgboost
    
}
with open(k, 'a') as json_file:
    json.dump(details, json_file)