# Import libaries

In [1]:
import openml
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from progressbar import ProgressBar
from sklearn.model_selection import GridSearchCV,StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score, make_scorer
import time
import json

# Import all datatsets

In [2]:
datasets_df = openml.datasets.list_datasets(output_format="dataframe")
datasets_df.head(10)

Unnamed: 0,did,name,version,uploader,status,format,MajorityClassSize,MaxNominalAttDistinctValues,MinorityClassSize,NumberOfClasses,NumberOfFeatures,NumberOfInstances,NumberOfInstancesWithMissingValues,NumberOfMissingValues,NumberOfNumericFeatures,NumberOfSymbolicFeatures
2,2,anneal,1,1,active,ARFF,684.0,7.0,8.0,5.0,39.0,898.0,898.0,22175.0,6.0,33.0
3,3,kr-vs-kp,1,1,active,ARFF,1669.0,3.0,1527.0,2.0,37.0,3196.0,0.0,0.0,0.0,37.0
4,4,labor,1,1,active,ARFF,37.0,3.0,20.0,2.0,17.0,57.0,56.0,326.0,8.0,9.0
5,5,arrhythmia,1,1,active,ARFF,245.0,13.0,2.0,13.0,280.0,452.0,384.0,408.0,206.0,74.0
6,6,letter,1,1,active,ARFF,813.0,26.0,734.0,26.0,17.0,20000.0,0.0,0.0,16.0,1.0
7,7,audiology,1,1,active,ARFF,57.0,24.0,1.0,24.0,70.0,226.0,222.0,317.0,0.0,70.0
8,8,liver-disorders,1,1,active,ARFF,,,,0.0,6.0,345.0,0.0,0.0,6.0,0.0
9,9,autos,1,1,active,ARFF,67.0,22.0,3.0,6.0,26.0,205.0,46.0,59.0,15.0,11.0
10,10,lymph,1,1,active,ARFF,81.0,8.0,2.0,4.0,19.0,148.0,0.0,0.0,3.0,16.0
11,11,balance-scale,1,1,active,ARFF,288.0,3.0,49.0,3.0,5.0,625.0,0.0,0.0,4.0,1.0


# Extract Dataset

In [3]:
dataset = openml.datasets.get_dataset(26)

# Print description about dataset

In [4]:
# Print a summary
print(
    f"This is dataset '{dataset.name}', the target feature is "
    f"'{dataset.default_target_attribute}'"
)
print(f"URL: {dataset.url}")
print(dataset.description[:])

This is dataset 'nursery', the target feature is 'class'
URL: https://www.openml.org/data/v1/download/26/nursery.arff
**Author**:   
**Source**: Unknown -   
**Please cite**:   

1. Title: Nursery Database
 
 2. Sources:
    (a) Creator: Vladislav Rajkovic et al. (13 experts)
    (b) Donors: Marko Bohanec   (marko.bohanec@ijs.si)
                Blaz Zupan      (blaz.zupan@ijs.si)
    (c) Date: June, 1997
 
 3. Past Usage:
 
    The hierarchical decision model, from which this dataset is
    derived, was first presented in 
 
    M. Olave, V. Rajkovic, M. Bohanec: An application for admission in
    public school systems. In (I. Th. M. Snellen and W. B. H. J. van de
    Donk and J.-P. Baquiast, editors) Expert Systems in Public
    Administration, pages 145-160. Elsevier Science Publishers (North
    Holland)}, 1989.
 
    Within machine-learning, this dataset was used for the evaluation
    of HINT (Hierarchy INduction Tool), which was proved to be able to
    completely reconstruct t

# Get depent and indepent variables

In [5]:
x, y, categorical_indicator, attribute_names = dataset.get_data(
    dataset_format="dataframe", target=dataset.default_target_attribute
)

# Data types

In [6]:
x.dtypes

parents     category
has_nurs    category
form        category
children    category
housing     category
finance     category
social      category
health      category
dtype: object

In [7]:
cat_columns = x.select_dtypes(['category']).columns
x[cat_columns] = x[cat_columns].apply(lambda x: pd.factorize(x)[0])

In [8]:
x.dtypes

parents     int64
has_nurs    int64
form        int64
children    int64
housing     int64
finance     int64
social      int64
health      int64
dtype: object

# Change response variable to int

In [9]:
y

0         recommend
1          priority
2         not_recom
3         recommend
4          priority
            ...    
12955    spec_prior
12956     not_recom
12957    spec_prior
12958    spec_prior
12959     not_recom
Name: class, Length: 12960, dtype: category
Categories (5, object): ['not_recom' < 'recommend' < 'very_recom' < 'priority' < 'spec_prior']

In [10]:
y=y.map({'not_recom' :0,'recommend':1,'very_recom' :2,'priority':3,'spec_prior':4})

In [11]:
y.astype(int)

0        1
1        3
2        0
3        1
4        3
        ..
12955    4
12956    0
12957    4
12958    4
12959    0
Name: class, Length: 12960, dtype: int32

In [12]:
df_min_max_scaled = x.copy()
  
# apply normalization techniques
for column in df_min_max_scaled.columns:
    df_min_max_scaled[column] = (df_min_max_scaled[column] - df_min_max_scaled[column].min()) / (df_min_max_scaled[column].max() - df_min_max_scaled[column].min())    
  
# view normalized data
print(df_min_max_scaled)

       parents  has_nurs  form  children  housing  finance  social  health
0          0.0       0.0   0.0       0.0      0.0      0.0     0.0     0.0
1          0.0       0.0   0.0       0.0      0.0      0.0     0.0     0.5
2          0.0       0.0   0.0       0.0      0.0      0.0     0.0     1.0
3          0.0       0.0   0.0       0.0      0.0      0.0     0.5     0.0
4          0.0       0.0   0.0       0.0      0.0      0.0     0.5     0.5
...        ...       ...   ...       ...      ...      ...     ...     ...
12955      1.0       1.0   1.0       1.0      1.0      1.0     0.5     0.5
12956      1.0       1.0   1.0       1.0      1.0      1.0     0.5     1.0
12957      1.0       1.0   1.0       1.0      1.0      1.0     1.0     0.0
12958      1.0       1.0   1.0       1.0      1.0      1.0     1.0     0.5
12959      1.0       1.0   1.0       1.0      1.0      1.0     1.0     1.0

[12960 rows x 8 columns]


In [13]:
pbar = ProgressBar()
x=df_min_max_scaled.to_numpy()

In [14]:
x

array([[0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0.5],
       [0. , 0. , 0. , ..., 0. , 0. , 1. ],
       ...,
       [1. , 1. , 1. , ..., 1. , 1. , 0. ],
       [1. , 1. , 1. , ..., 1. , 1. , 0.5],
       [1. , 1. , 1. , ..., 1. , 1. , 1. ]])

In [15]:
k='Nursery.json'

# k-Nearest Neighbors

In [16]:
from sklearn.neighbors import KNeighborsClassifier
knn=KNeighborsClassifier()
f1 = make_scorer(f1_score , average='macro')

In [17]:
from sklearn.model_selection import cross_val_score
start = time.time()
cross_val_score_knn=cross_val_score(knn,x,y,cv=5,scoring=f1).mean()
end = time.time()
knn_time=end-start



In [18]:
cross_validation = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
model = KNeighborsClassifier()

n_neighbors = range(1, 21)
weights = ['uniform', 'distance']
metric = ['euclidean', 'manhattan', 'minkowski']
grid = dict(n_neighbors=n_neighbors,weights=weights,metric=metric)

grid_search = GridSearchCV(estimator=model, param_grid=grid, cv=cross_validation, scoring=f1,n_jobs=8)
grid_result = grid_search.fit(x,y)





# Best score using KNN parameter

In [19]:
Knn_Para=grid_result.best_params_
Knn_Para

{'metric': 'manhattan', 'n_neighbors': 6, 'weights': 'distance'}

# Best score using KNN score

In [20]:
Knn_f1_score=grid_result.best_score_
Knn_f1_score

0.8903990915239495

In [21]:
details = {
    'name': 'KNN',
    'time': knn_time,
    'f1_score':Knn_f1_score,
    'Best_Parameter':Knn_Para,
    'Cross validation score':cross_val_score_knn,
    
}
with open(k, 'a') as json_file:
    json.dump(details, json_file)

# Linear Discriminant Analysis

In [22]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
clf = LinearDiscriminantAnalysis()

In [23]:
from sklearn.model_selection import cross_val_score
start = time.time()
cross_val_score_linear=cross_val_score(knn,x,y,cv=5,scoring=f1).mean()
end = time.time()
Linear_dis_time=end-start



In [24]:
cross_validation = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
model = LinearDiscriminantAnalysis()

solver = ['svd', 'lsqr', 'eigen']
grid = dict(solver=solver)

grid_search = GridSearchCV(estimator=model, param_grid=grid, cv=cross_validation, scoring=f1,n_jobs=8)
grid_result = grid_search.fit(x,y)



# Best score using linear discriminent parameter

In [25]:
Linear_Para=grid_result.best_params_
Linear_Para

{'solver': 'svd'}

# Best score using discriminent score

In [26]:
Linear_f1_score=grid_result.best_score_
Linear_f1_score

0.7032401130376225

In [27]:
details = {
    'name': 'Linear_Dis',
    'time': Linear_dis_time,
    'f1_score':Linear_f1_score,
    'Best_Parameter':Linear_Para,
    'Cross validation score':cross_val_score_linear,
    
}
with open(k, 'a') as json_file:
    json.dump(details, json_file)

# Naive Bayes

In [28]:
from sklearn.naive_bayes import GaussianNB
nb=GaussianNB()
f1 = make_scorer(f1_score , average='macro')

In [29]:
nb.priors

In [30]:
from sklearn.model_selection import cross_val_score
start = time.time()
cross_val_score_naive=cross_val_score(nb,x,y,cv=5,scoring=f1).mean()
end = time.time()
Naive_time=end-start



In [31]:
cross_validation = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
model = GaussianNB()


var_smoothing = [1e-09,2e-09,3e-09,4e-09,5e-09,6e-09,7e-09,8e-09,9e-09,10e-09]

grid = dict(var_smoothing=var_smoothing)

grid_search = GridSearchCV(estimator=model, param_grid=grid, cv=cross_validation, scoring=f1,n_jobs=8)
grid_result = grid_search.fit(x,y)





# Best score using Naive parameter

In [32]:
naiv_Para=grid_result.best_params_
naiv_Para

{'var_smoothing': 1e-08}

# Best score using Naive score

In [33]:
naive_f1_score=grid_result.best_score_
naive_f1_score

0.5785478207451185

In [34]:
details = {
    'name': 'Naive Bayes',
    'time': Naive_time,
    'f1_score':naive_f1_score,
    'Best_Parameter':naiv_Para,
    'Cross validation score':cross_val_score_naive,
    
}
with open(k, 'a') as json_file:
    json.dump(details, json_file)

# Support Vector Machine

In [35]:
from sklearn.svm import SVC
svm=SVC()
f1 = make_scorer(f1_score , average='macro')

In [36]:
from sklearn.model_selection import cross_val_score
start = time.time()
cross_val_score_support=cross_val_score(svm,x,y,cv=5,scoring=f1).mean()
end = time.time()
Support_time=end-start



In [37]:
cross_validation = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
model = SVC()

c = range(1, 20, 1)
kernel = ['poly', 'rbf', 'sigmoid']
gamma = ['scale', 'auto']
grid = dict(C=c,kernel=kernel,gamma=gamma)

grid_search = GridSearchCV(estimator=model, param_grid=grid, cv=cross_validation, scoring=f1,n_jobs=8)
grid_result = grid_search.fit(x,y)





# Best score using support parameter

In [38]:
support_Para=grid_result.best_params_
support_Para

{'C': 16, 'gamma': 'scale', 'kernel': 'rbf'}

# Best score using support score

In [39]:
support_f1_score=grid_result.best_score_
support_f1_score

0.9139576282873618

In [40]:
details = {
    'name': 'Support_Vector',
    'time': Support_time,
    'f1_score':support_f1_score,
    'Best_Parameter':support_Para,
    'Cross validation score':cross_val_score_support,
    
}
with open(k, 'a') as json_file:
    json.dump(details, json_file)

# Logistic regression

In [41]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=1)
f1 = make_scorer(f1_score , average='macro')

In [42]:
from sklearn.model_selection import cross_val_score
start = time.time()
cross_val_score_Logistic=cross_val_score(svm,x,y,cv=5,scoring=f1).mean()

end = time.time()
logistic_time=end-start



In [43]:
cross_validation = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
Logistic = LogisticRegression()

# dual = ['bool','False']
penalty = [ 'l2']
# class_weight = ['dict', 'balanced']
grid = dict(penalty=penalty)

grid_search = GridSearchCV(estimator=Logistic, param_grid=grid, cv=cross_validation, scoring=f1,n_jobs=8)
grid_result = grid_search.fit(x,y)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


# Best score using Logistic parameter

In [44]:
logistic_Para=grid_result.best_params_
logistic_Para

{'penalty': 'l2'}

# Best score using Logistic score

In [45]:
logistic_f1_score=grid_result.best_score_
logistic_f1_score

0.7393078940835973

In [46]:
details = {
    'name': 'Logistic',
    'time': logistic_time,
    'f1_score':logistic_f1_score,
    'Best_Parameter':logistic_Para,
    'Cross validation score':cross_val_score_Logistic,
    
}
with open(k, 'a') as json_file:
    json.dump(details, json_file)

# Random Forest

In [47]:
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier()
f1 = make_scorer(f1_score , average='macro')

In [48]:
from sklearn.model_selection import cross_val_score
start = time.time()
cross_val_score_random=cross_val_score(rf,x,y,cv=5,scoring=f1).mean()

end = time.time()
random_time=end-start



In [49]:
cross_validation = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
rf=RandomForestClassifier()

n_estimators = range(1, 20, 1)
criterion = ['gini', 'entropy']
grid = dict(n_estimators=n_estimators,criterion=criterion)

grid_search = GridSearchCV(estimator=rf, param_grid=grid, cv=cross_validation, scoring=f1,n_jobs=8)
grid_result = grid_search.fit(x,y)




# Best score using random parameter

In [50]:
random_Para=grid_result.best_params_
random_Para

{'criterion': 'gini', 'n_estimators': 18}

# Best score using random score

In [51]:
random_f1_score=grid_result.best_score_
random_f1_score

0.9108349030790832

In [52]:
details = {
    'name': 'Random',
    'time': random_time,
    'f1_score':random_f1_score,
    'Best_Parameter':random_Para,
    'Cross validation score':cross_val_score_random,
    
}
with open(k, 'a') as json_file:
    json.dump(details, json_file)

# Ada boost

In [53]:
from sklearn.ensemble import AdaBoostClassifier
clf = AdaBoostClassifier(n_estimators=100, random_state=0)
f1 = make_scorer(f1_score , average='macro')

In [54]:
from sklearn.model_selection import cross_val_score
start = time.time()
cross_val_score_Adaboost=cross_val_score(clf,x,y,cv=5,scoring=f1).mean()
end = time.time()
Adaboost_time=end-start



In [55]:
cross_validation = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
clf = AdaBoostClassifier()

n_estimators = range(1, 100, 1)
algorithm = ['SAMME', 'SAMME.R']
grid = dict(n_estimators=n_estimators,algorithm=algorithm)

grid_search = GridSearchCV(estimator=clf, param_grid=grid, cv=cross_validation, scoring=f1,n_jobs=8)
grid_result = grid_search.fit(x,y)





# Best score using Adaboost parameter

In [56]:
Adaboost_Para=grid_result.best_params_
Adaboost_Para

{'algorithm': 'SAMME', 'n_estimators': 23}

# Best score using Adaboost score

In [57]:
Adaboost_f1_score=grid_result.best_score_
Adaboost_f1_score

0.6080903624267606

In [58]:
Adaboost_time

8.573315143585205

In [59]:
details = {
    'name': 'Adaboost',
    'time': Adaboost_time,
    'f1_score':Adaboost_f1_score,
    'Best_Parameter':Adaboost_Para,
    'Cross validation score':cross_val_score_Adaboost
    
}
with open(k, 'a') as json_file:
    json.dump(details, json_file)

# Gradient boost

In [60]:
from sklearn.ensemble import GradientBoostingClassifier
gb_clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=2, random_state=0)
f1 = make_scorer(f1_score , average='macro')   

In [61]:
from sklearn.model_selection import cross_val_score
start = time.time()
cross_val_score_gradient=cross_val_score(clf,x,y,cv=5,scoring=f1).mean()
end = time.time()
gradient_time=end-start



In [62]:
cross_validation = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
gb_clf=GradientBoostingClassifier()

max_depth=range(1, 10, 1)
n_estimators=[100]
# learning_rate =np.arange(0.05, 0.2, 0.05)
learning_rate=[0.01]
grid = dict(n_estimators=n_estimators,max_depth=max_depth,learning_rate=learning_rate)

grid_search = GridSearchCV(estimator=gb_clf, param_grid=grid, cv=cross_validation, scoring=f1,n_jobs=8)
grid_result = grid_search.fit(x,y)





# Best score using Gradient boost parameter

In [63]:
Gradient_Para=grid_result.best_params_
Gradient_Para

{'learning_rate': 0.01, 'max_depth': 9, 'n_estimators': 100}

# Best score using Gradient boost score

In [64]:
Gradient_f1_score=grid_result.best_score_
Gradient_f1_score

0.9125029872210744

In [65]:
gradient_time

4.438530683517456

In [66]:
details = {
    'name': 'Gradient',
    'time': gradient_time,
    'f1_score':Gradient_f1_score,
    'Best_Parameter':Gradient_Para,
    'Cross validation score':cross_val_score_gradient
    
}
with open(k, 'a') as json_file:
    json.dump(details, json_file)

# XGBoost

In [67]:
import xgboost as xgb
gb_clf = xgb.XGBClassifier(eta=0.3,gamma=0,max_depth=6)
f1 = make_scorer(f1_score , average='macro')  

In [68]:
from sklearn.model_selection import cross_val_score
start = time.time()
cross_val_score_xgboost=cross_val_score(gb_clf,x,y,cv=5,scoring=f1).mean()
end = time.time()
xgboost_time=end-start





















In [69]:
cross_validation = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
gb_clf=xgb.XGBClassifier()

gamma=range(1, 10, 1)
eta=[0.001,0.01,0.05]
grid = dict(eta=eta,gamma=gamma)

grid_search = GridSearchCV(estimator=gb_clf, param_grid=grid, cv=cross_validation, scoring=f1,n_jobs=8)
grid_result = grid_search.fit(x,y)





# Best score using XG boost parameter

In [70]:
XG_Para=grid_result.best_params_
XG_Para

{'eta': 0.05, 'gamma': 2}

# Best score using Gradient boost score

In [71]:
XG_f1_score=grid_result.best_score_
XG_f1_score

0.9007025701402759

In [72]:
xgboost_time

16.46075677871704

In [73]:
details = {
    'name': 'XGboost',
    'time': xgboost_time,
    'f1_score':XG_f1_score,
    'Best_Parameter':XG_Para,
    'Cross validation score':cross_val_score_xgboost
    
}
with open(k, 'a') as json_file:
    json.dump(details, json_file)