# Import libraries

In [4]:
import openml
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from progressbar import ProgressBar
from sklearn.model_selection import GridSearchCV,StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score, make_scorer
import time
import json

# Import all datasets

In [5]:
# Fetch the OpenML dataset listing as a DataFrame and preview its first 10 rows.
datasets_df = openml.datasets.list_datasets(output_format="dataframe")
datasets_df.head(10)

Unnamed: 0,did,name,version,uploader,status,format,MajorityClassSize,MaxNominalAttDistinctValues,MinorityClassSize,NumberOfClasses,NumberOfFeatures,NumberOfInstances,NumberOfInstancesWithMissingValues,NumberOfMissingValues,NumberOfNumericFeatures,NumberOfSymbolicFeatures
2,2,anneal,1,1,active,ARFF,684.0,7.0,8.0,5.0,39.0,898.0,898.0,22175.0,6.0,33.0
3,3,kr-vs-kp,1,1,active,ARFF,1669.0,3.0,1527.0,2.0,37.0,3196.0,0.0,0.0,0.0,37.0
4,4,labor,1,1,active,ARFF,37.0,3.0,20.0,2.0,17.0,57.0,56.0,326.0,8.0,9.0
5,5,arrhythmia,1,1,active,ARFF,245.0,13.0,2.0,13.0,280.0,452.0,384.0,408.0,206.0,74.0
6,6,letter,1,1,active,ARFF,813.0,26.0,734.0,26.0,17.0,20000.0,0.0,0.0,16.0,1.0
7,7,audiology,1,1,active,ARFF,57.0,24.0,1.0,24.0,70.0,226.0,222.0,317.0,0.0,70.0
8,8,liver-disorders,1,1,active,ARFF,,,,0.0,6.0,345.0,0.0,0.0,6.0,0.0
9,9,autos,1,1,active,ARFF,67.0,22.0,3.0,6.0,26.0,205.0,46.0,59.0,15.0,11.0
10,10,lymph,1,1,active,ARFF,81.0,8.0,2.0,4.0,19.0,148.0,0.0,0.0,3.0,16.0
11,11,balance-scale,1,1,active,ARFF,288.0,3.0,49.0,3.0,5.0,625.0,0.0,0.0,4.0,1.0


# Extract Dataset

In [6]:
# Dataset id 61; the summary printed in the next cell reports its name as 'iris'.
dataset = openml.datasets.get_dataset(61)

# Print description about dataset

In [7]:
# Show the dataset name, its target attribute, the download URL and the description.
summary_line = (
    f"This is dataset '{dataset.name}', the target feature is "
    f"'{dataset.default_target_attribute}'"
)
print(summary_line)
url_line = f"URL: {dataset.url}"
print(url_line)
print(dataset.description[:])

This is dataset 'iris', the target feature is 'class'
URL: https://www.openml.org/data/v1/download/61/iris.arff
**Author**: R.A. Fisher  

**Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/Iris) - 1936 - Donated by Michael Marshall  

**Please cite**:   



**Iris Plants Database**  

This is perhaps the best known database to be found in the pattern recognition literature.  Fisher's paper is a classic in the field and is referenced frequently to this day.  (See Duda & Hart, for example.)  The data set contains 3 classes of 50 instances each, where each class refers to a type of iris plant.  One class is     linearly separable from the other 2; the latter are NOT linearly separable from each other.



Predicted attribute: class of iris plant.  

This is an exceedingly simple domain.  

 

### Attribute Information:

    1. sepal length in cm

    2. sepal width in cm

    3. petal length in cm

    4. petal width in cm

    5. class: 

       -- Iris Setosa

       -- Iris Vers

# Get dependent and independent variables

In [8]:
# Load the data as DataFrame-style objects, split on the dataset's default target:
# `x` holds the feature columns and `y` the target column (see the outputs below).
# The call also returns `categorical_indicator` and `attribute_names`, which are not
# used by the cells shown in this notebook.
x, y, categorical_indicator, attribute_names = dataset.get_data(
    dataset_format="dataframe", target=dataset.default_target_attribute
)

# Data types

In [9]:
x.dtypes

sepallength    float64
sepalwidth     float64
petallength    float64
petalwidth     float64
dtype: object

In [10]:
# Replace every category-typed feature column with its integer factor codes.
# For this dataset the selection is empty (all dtypes above are float64).
cat_columns = x.select_dtypes(include=['category']).columns
x[cat_columns] = x[cat_columns].apply(lambda col: pd.factorize(col)[0])

In [11]:
x.dtypes

sepallength    float64
sepalwidth     float64
petallength    float64
petalwidth     float64
dtype: object

# Change response variable to int

In [12]:
y

0         Iris-setosa
1         Iris-setosa
2         Iris-setosa
3         Iris-setosa
4         Iris-setosa
            ...      
145    Iris-virginica
146    Iris-virginica
147    Iris-virginica
148    Iris-virginica
149    Iris-virginica
Name: class, Length: 150, dtype: category
Categories (3, object): ['Iris-setosa' < 'Iris-versicolor' < 'Iris-virginica']

In [13]:
# Encode the three class labels as 0/1/2.
# Not idempotent: re-running this cell maps the already-encoded values to NaN,
# because 0/1/2 are not keys of the mapping dict.
y=y.map({'Iris-setosa' :0,'Iris-versicolor':1,'Iris-virginica' :2})

In [14]:
# Store the integer-typed target. The original evaluated `y.astype(int)` without
# assigning it, so `y` kept its categorical dtype for the rest of the notebook.
y = y.astype(int)
y

0      0
1      0
2      0
3      0
4      0
      ..
145    2
146    2
147    2
148    2
149    2
Name: class, Length: 150, dtype: int32

In [15]:
# Min-max normalise every feature column in one vectorised step:
# (value - column minimum) / (column maximum - column minimum).
col_min = x.min()
col_range = x.max() - col_min
df_min_max_scaled = (x - col_min) / col_range

# view normalized data
print(df_min_max_scaled)

     sepallength  sepalwidth  petallength  petalwidth
0       0.222222    0.625000     0.067797    0.041667
1       0.166667    0.416667     0.067797    0.041667
2       0.111111    0.500000     0.050847    0.041667
3       0.083333    0.458333     0.084746    0.041667
4       0.194444    0.666667     0.067797    0.041667
..           ...         ...          ...         ...
145     0.666667    0.416667     0.711864    0.916667
146     0.555556    0.208333     0.677966    0.750000
147     0.611111    0.416667     0.711864    0.791667
148     0.527778    0.583333     0.745763    0.916667
149     0.444444    0.416667     0.694915    0.708333

[150 rows x 4 columns]


In [16]:
# NOTE(review): `pbar` is not used by any later cell shown in this notebook.
pbar = ProgressBar()
# Rebind `x` from the feature DataFrame to the scaled values as a NumPy array;
# the model cells below are fed this array.
x=df_min_max_scaled.to_numpy()

In [17]:
x

array([[0.22222222, 0.625     , 0.06779661, 0.04166667],
       [0.16666667, 0.41666667, 0.06779661, 0.04166667],
       [0.11111111, 0.5       , 0.05084746, 0.04166667],
       [0.08333333, 0.45833333, 0.08474576, 0.04166667],
       [0.19444444, 0.66666667, 0.06779661, 0.04166667],
       [0.30555556, 0.79166667, 0.11864407, 0.125     ],
       [0.08333333, 0.58333333, 0.06779661, 0.08333333],
       [0.19444444, 0.58333333, 0.08474576, 0.04166667],
       [0.02777778, 0.375     , 0.06779661, 0.04166667],
       [0.16666667, 0.45833333, 0.08474576, 0.        ],
       [0.30555556, 0.70833333, 0.08474576, 0.04166667],
       [0.13888889, 0.58333333, 0.10169492, 0.04166667],
       [0.13888889, 0.41666667, 0.06779661, 0.        ],
       [0.        , 0.41666667, 0.01694915, 0.        ],
       [0.41666667, 0.83333333, 0.03389831, 0.04166667],
       [0.38888889, 1.        , 0.08474576, 0.125     ],
       [0.30555556, 0.79166667, 0.05084746, 0.125     ],
       [0.22222222, 0.625     ,

In [18]:
# Path of the results file that is passed to gridfun, which appends its JSON records to it.
DatasetName='Iris_Main.json'

# Grid search cv function

In [114]:
def gridfun(fName,x,y,estimator,param_grid,DatasetName):
    """Run a grid search under ten differently seeded CV splits and log each run.

    For every random state from 1 to 10 a shuffled 5-fold ``StratifiedKFold`` is
    built, ``GridSearchCV`` is fitted on ``(x, y)`` with macro-averaged F1 as the
    score, and one JSON object describing the run is appended to ``DatasetName``
    (one object per line).

    Parameters
    ----------
    fName : str
        Label stored under the ``'name'`` key of every record.
    x, y
        Features and target, passed unchanged to ``GridSearchCV.fit``.
    estimator
        Estimator to tune.
    param_grid : dict
        Parameter grid forwarded to ``GridSearchCV``.
    DatasetName : str
        Path of the file the records are appended to.

    Returns
    -------
    None
    """
    # Build the scorer locally so the function does not depend on a notebook-level
    # `f1` variable that is only assigned in a later cell.
    macro_f1 = make_scorer(f1_score, average='macro')

    for random_state in range(1, 11):
        cross_validation = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)
        grid_search = GridSearchCV(estimator=estimator, param_grid=param_grid,
                                   cv=cross_validation, scoring=macro_f1, n_jobs=8)

        # Time only the fit, which runs the whole search.
        start = time.time()
        grid_result = grid_search.fit(x, y)
        elapsed = time.time() - start

        details = {
            'Random State': random_state,
            'name': fName,
            'time': elapsed,
            'f1_score': grid_result.best_score_,
            'Best_Parameter': grid_result.best_params_,
        }

        # Fixed: the original called the non-existent `json.dzzzzzzzump`, which raised
        # AttributeError on the first iteration so nothing was ever written.
        with open(DatasetName, 'a') as json_file:
            json.dump(details, json_file)
            json_file.write('\n')
     
        
        
    

In [115]:
# increment ,function random state, time, best parameter

# k-Nearest Neighbors

In [116]:
from sklearn.neighbors import KNeighborsClassifier
# Default-parameter KNN used for the baseline cross-validation score.
knn=KNeighborsClassifier()
# Macro-averaged F1 scorer; the cross_val_score and GridSearchCV cells below pass
# this `f1` as their `scoring` argument.
f1 = make_scorer(f1_score , average='macro')

In [117]:
from sklearn.model_selection import cross_val_score
# Baseline: mean macro-F1 of the default KNN over 5 folds, plus the wall-clock time taken.
start = time.time()
cross_val_score_knn=cross_val_score(knn,x,y,cv=5,scoring=f1).mean()
end = time.time()
knn_time=end-start

In [118]:
# NOTE(review): this splitter is not passed to gridfun, which builds its own
# StratifiedKFold for each random state.
cross_validation = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
model = KNeighborsClassifier()

# 20 x 2 x 3 = 120 parameter combinations.
n_neighbors = range(1, 21)
weights = ['uniform', 'distance']
metric = ['euclidean', 'manhattan', 'minkowski']
grid = dict(n_neighbors=n_neighbors,weights=weights,metric=metric)

# Runs the search for random states 1-10 and appends one record per run to DatasetName.
gridfun("KNN",x,y,model,grid,DatasetName)



# Best score using KNN parameter

In [None]:
# NOTE(review): no earlier cell assigns a notebook-level `grid_result` (gridfun keeps
# its own local variable), so on a fresh kernel this line raises NameError.
Knn_Para=grid_result.best_params_
Knn_Para

# Best score using KNN score

In [None]:
# NOTE(review): relies on a notebook-level `grid_result`, which no earlier cell assigns
# (gridfun keeps its own local variable) — raises NameError on a fresh kernel.
Knn_f1_score=grid_result.best_score_
Knn_f1_score

In [None]:
# Summary record for KNN: baseline cross-validation time and score plus the tuned result.
details = {
    'name': 'KNN',
    'time': knn_time,
    'f1_score': Knn_f1_score,
    'Best_Parameter': Knn_Para,
    'Cross validation score': cross_val_score_knn,
}
# Fixed: the original opened `k`, a name that is never defined in this notebook.
# DatasetName is the results file that gridfun also appends to.
with open(DatasetName, 'a') as json_file:
    json.dump(details, json_file)
    json_file.write('\n')  # one JSON object per line, so records do not run together

# Linear Discriminant Analysis

In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
clf = LinearDiscriminantAnalysis()

In [None]:
from sklearn.model_selection import cross_val_score
# Baseline: mean macro-F1 of the default LDA over 5 folds, plus the wall-clock time taken.
# Fixed: the original scored `knn` here, so the "linear" baseline was really the KNN one;
# `clf` is the LinearDiscriminantAnalysis instance created in the previous cell.
start = time.time()
cross_val_score_linear=cross_val_score(clf,x,y,cv=5,scoring=f1).mean()
end = time.time()
Linear_dis_time=end-start

In [None]:
# NOTE(review): this splitter is not passed to gridfun, which builds its own
# StratifiedKFold for each random state.
cross_validation = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
model = LinearDiscriminantAnalysis()

# Three candidate solvers.
solver = ['svd', 'lsqr', 'eigen']
grid = dict(solver=solver)

# Fixed: the original labelled these runs "KNN", so the LDA records in the results file
# could not be told apart from the KNN ones. "Linear_Dis" matches the name used in the
# LDA summary record below.
gridfun("Linear_Dis",x,y,model,grid,DatasetName)

# Best score using linear discriminant parameter

In [None]:
# NOTE(review): no earlier cell assigns a notebook-level `grid_result` (gridfun keeps
# its own local variable), so on a fresh kernel this line raises NameError.
Linear_Para=grid_result.best_params_
Linear_Para

# Best score using linear discriminant score

In [None]:
# NOTE(review): relies on a notebook-level `grid_result`, which no earlier cell assigns
# (gridfun keeps its own local variable) — raises NameError on a fresh kernel.
Linear_f1_score=grid_result.best_score_
Linear_f1_score

In [None]:
# Summary record for LDA: baseline cross-validation time and score plus the tuned result.
details = {
    'name': 'Linear_Dis',
    'time': Linear_dis_time,
    'f1_score': Linear_f1_score,
    'Best_Parameter': Linear_Para,
    'Cross validation score': cross_val_score_linear,
}
# Fixed: the original opened `k`, a name that is never defined in this notebook.
# DatasetName is the results file that gridfun also appends to.
with open(DatasetName, 'a') as json_file:
    json.dump(details, json_file)
    json_file.write('\n')  # one JSON object per line, so records do not run together

# Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
nb=GaussianNB()
f1 = make_scorer(f1_score , average='macro')

In [None]:
nb.priors

In [None]:
from sklearn.model_selection import cross_val_score
start = time.time()
cross_val_score_naive=cross_val_score(nb,x,y,cv=5,scoring=f1).mean()
end = time.time()
Naive_time=end-start

In [None]:
# Grid search for Gaussian Naive Bayes on a fixed, shuffled 5-fold stratified split.
cross_validation = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
model = GaussianNB()


# Ten candidate smoothing values, 1e-09 through 10e-09.
var_smoothing = [1e-09,2e-09,3e-09,4e-09,5e-09,6e-09,7e-09,8e-09,9e-09,10e-09]

grid = dict(var_smoothing=var_smoothing)

# `grid_result` is read by the following cells for the best parameters and score.
grid_search = GridSearchCV(estimator=model, param_grid=grid, cv=cross_validation, scoring=f1,n_jobs=8)
grid_result = grid_search.fit(x,y)



# Best score using Naive parameter

In [None]:
naiv_Para=grid_result.best_params_
naiv_Para

# Best score using Naive score

In [None]:
naive_f1_score=grid_result.best_score_
naive_f1_score

In [None]:
# Summary record for Naive Bayes: baseline cross-validation time and score plus the tuned result.
details = {
    'name': 'Naive Bayes',
    'time': Naive_time,
    'f1_score': naive_f1_score,
    'Best_Parameter': naiv_Para,
    'Cross validation score': cross_val_score_naive,
}
# Fixed: the original opened `k`, a name that is never defined in this notebook.
# DatasetName is the results file that gridfun also appends to.
with open(DatasetName, 'a') as json_file:
    json.dump(details, json_file)
    json_file.write('\n')  # one JSON object per line, so records do not run together

# Support Vector Machine

In [None]:
from sklearn.svm import SVC
svm=SVC()
f1 = make_scorer(f1_score , average='macro')

In [None]:
from sklearn.model_selection import cross_val_score
start = time.time()
cross_val_score_support=cross_val_score(svm,x,y,cv=5,scoring=f1).mean()
end = time.time()
Support_time=end-start

In [None]:
# Grid search for the SVM on a fixed, shuffled 5-fold stratified split.
cross_validation = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
model = SVC()

# 19 x 3 x 2 = 114 parameter combinations.
c = range(1, 20, 1)
kernel = ['poly', 'rbf', 'sigmoid']
gamma = ['scale', 'auto']
grid = dict(C=c,kernel=kernel,gamma=gamma)

# `grid_result` is read by the following cells for the best parameters and score.
grid_search = GridSearchCV(estimator=model, param_grid=grid, cv=cross_validation, scoring=f1,n_jobs=8)
grid_result = grid_search.fit(x,y)



# Best score using support parameter

In [None]:
support_Para=grid_result.best_params_
support_Para

# Best score using support score

In [None]:
support_f1_score=grid_result.best_score_
support_f1_score

In [None]:
# Summary record for the SVM: baseline cross-validation time and score plus the tuned result.
details = {
    'name': 'Support_Vector',
    'time': Support_time,
    'f1_score': support_f1_score,
    'Best_Parameter': support_Para,
    'Cross validation score': cross_val_score_support,
}
# Fixed: the original opened `k`, a name that is never defined in this notebook.
# DatasetName is the results file that gridfun also appends to.
with open(DatasetName, 'a') as json_file:
    json.dump(details, json_file)
    json_file.write('\n')  # one JSON object per line, so records do not run together

# Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=1)
f1 = make_scorer(f1_score , average='macro')

In [None]:
from sklearn.model_selection import cross_val_score
# Baseline: mean macro-F1 of logistic regression over 5 folds, plus the wall-clock time taken.
# Fixed: the original scored `svm` here, so the "Logistic" baseline was really the SVM one;
# `clf` is the LogisticRegression instance created in the previous cell.
start = time.time()
cross_val_score_Logistic=cross_val_score(clf,x,y,cv=5,scoring=f1).mean()

end = time.time()
logistic_time=end-start

In [None]:
# Grid search for logistic regression on a fixed, shuffled 5-fold stratified split.
# With the other options commented out, the grid holds a single candidate (penalty='l2').
cross_validation = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
Logistic = LogisticRegression()

# dual = ['bool','False']
penalty = [ 'l2']
# class_weight = ['dict', 'balanced']
grid = dict(penalty=penalty)

# `grid_result` is read by the following cells for the best parameters and score.
grid_search = GridSearchCV(estimator=Logistic, param_grid=grid, cv=cross_validation, scoring=f1,n_jobs=8)
grid_result = grid_search.fit(x,y)


# Best score using Logistic parameter

In [None]:
logistic_Para=grid_result.best_params_
logistic_Para

# Best score using Logistic score

In [None]:
logistic_f1_score=grid_result.best_score_
logistic_f1_score

In [None]:
# Summary record for logistic regression: baseline cross-validation time and score plus the tuned result.
details = {
    'name': 'Logistic',
    'time': logistic_time,
    'f1_score': logistic_f1_score,
    'Best_Parameter': logistic_Para,
    'Cross validation score': cross_val_score_Logistic,
}
# Fixed: the original opened `k`, a name that is never defined in this notebook.
# DatasetName is the results file that gridfun also appends to.
with open(DatasetName, 'a') as json_file:
    json.dump(details, json_file)
    json_file.write('\n')  # one JSON object per line, so records do not run together

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf=RandomForestClassifier()
f1 = make_scorer(f1_score , average='macro')

In [None]:
from sklearn.model_selection import cross_val_score
start = time.time()
cross_val_score_random=cross_val_score(rf,x,y,cv=5,scoring=f1).mean()

end = time.time()
random_time=end-start

In [None]:
# Grid search for the random forest on a fixed, shuffled 5-fold stratified split.
# NOTE(review): the forest itself is created without a random_state, so results may
# differ between runs.
cross_validation = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
rf=RandomForestClassifier()

# 19 x 2 = 38 parameter combinations.
n_estimators = range(1, 20, 1)
criterion = ['gini', 'entropy']
grid = dict(n_estimators=n_estimators,criterion=criterion)

# `grid_result` is read by the following cells for the best parameters and score.
grid_search = GridSearchCV(estimator=rf, param_grid=grid, cv=cross_validation, scoring=f1,n_jobs=8)
grid_result = grid_search.fit(x,y)


# Best score using random parameter

In [None]:
random_Para=grid_result.best_params_
random_Para

# Best score using random score

In [None]:
random_f1_score=grid_result.best_score_
random_f1_score

In [None]:
# Summary record for the random forest: baseline cross-validation time and score plus the tuned result.
details = {
    'name': 'Random',
    'time': random_time,
    'f1_score': random_f1_score,
    'Best_Parameter': random_Para,
    'Cross validation score': cross_val_score_random,
}
# Fixed: the original opened `k`, a name that is never defined in this notebook.
# DatasetName is the results file that gridfun also appends to.
with open(DatasetName, 'a') as json_file:
    json.dump(details, json_file)
    json_file.write('\n')  # one JSON object per line, so records do not run together

# Ada boost

In [None]:
from sklearn.ensemble import AdaBoostClassifier
clf = AdaBoostClassifier(n_estimators=100, random_state=0)
f1 = make_scorer(f1_score , average='macro')

In [None]:
from sklearn.model_selection import cross_val_score
start = time.time()
cross_val_score_Adaboost=cross_val_score(clf,x,y,cv=5,scoring=f1).mean()
end = time.time()
Adaboost_time=end-start

In [None]:
# Grid search for AdaBoost on a fixed, shuffled 5-fold stratified split.
cross_validation = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
clf = AdaBoostClassifier()

# 99 x 2 = 198 parameter combinations.
# NOTE(review): newer scikit-learn releases deprecate 'SAMME.R' — confirm against the
# installed version.
n_estimators = range(1, 100, 1)
algorithm = ['SAMME', 'SAMME.R']
grid = dict(n_estimators=n_estimators,algorithm=algorithm)

# `grid_result` is read by the following cells for the best parameters and score.
grid_search = GridSearchCV(estimator=clf, param_grid=grid, cv=cross_validation, scoring=f1,n_jobs=8)
grid_result = grid_search.fit(x,y)



# Best score using Adaboost parameter

In [None]:
Adaboost_Para=grid_result.best_params_
Adaboost_Para

# Best score using Adaboost score

In [None]:
Adaboost_f1_score=grid_result.best_score_
Adaboost_f1_score

In [None]:
Adaboost_time

In [None]:
# Summary record for AdaBoost: baseline cross-validation time and score plus the tuned result.
details = {
    'name': 'Adaboost',
    'time': Adaboost_time,
    'f1_score': Adaboost_f1_score,
    'Best_Parameter': Adaboost_Para,
    'Cross validation score': cross_val_score_Adaboost,
}
# Fixed: the original opened `k`, a name that is never defined in this notebook.
# DatasetName is the results file that gridfun also appends to.
with open(DatasetName, 'a') as json_file:
    json.dump(details, json_file)
    json_file.write('\n')  # one JSON object per line, so records do not run together

# Gradient boost

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
gb_clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=2, random_state=0)
f1 = make_scorer(f1_score , average='macro')   

In [None]:
from sklearn.model_selection import cross_val_score
# Baseline: mean macro-F1 of gradient boosting over 5 folds, plus the wall-clock time taken.
# Fixed: the original scored `clf`, which still holds the AdaBoost classifier from the
# previous section; `gb_clf` is the GradientBoostingClassifier created in the previous cell.
start = time.time()
cross_val_score_gradient=cross_val_score(gb_clf,x,y,cv=5,scoring=f1).mean()
end = time.time()
gradient_time=end-start

In [None]:
# Grid search for gradient boosting on a fixed, shuffled 5-fold stratified split.
cross_validation = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
gb_clf=GradientBoostingClassifier()

# Only max_depth varies (9 candidates); n_estimators and learning_rate are fixed.
max_depth=range(1, 10, 1)
n_estimators=[100]
# learning_rate =np.arange(0.05, 0.2, 0.05)
learning_rate=[0.01]
grid = dict(n_estimators=n_estimators,max_depth=max_depth,learning_rate=learning_rate)

# `grid_result` is read by the following cells for the best parameters and score.
grid_search = GridSearchCV(estimator=gb_clf, param_grid=grid, cv=cross_validation, scoring=f1,n_jobs=8)
grid_result = grid_search.fit(x,y)



# Best score using Gradient boost parameter

In [None]:
Gradient_Para=grid_result.best_params_
Gradient_Para

# Best score using Gradient boost score

In [None]:
Gradient_f1_score=grid_result.best_score_
Gradient_f1_score

In [None]:
gradient_time

In [None]:
# Summary record for gradient boosting: baseline cross-validation time and score plus the tuned result.
details = {
    'name': 'Gradient',
    'time': gradient_time,
    'f1_score': Gradient_f1_score,
    'Best_Parameter': Gradient_Para,
    'Cross validation score': cross_val_score_gradient,
}
# Fixed: the original opened `k`, a name that is never defined in this notebook.
# DatasetName is the results file that gridfun also appends to.
with open(DatasetName, 'a') as json_file:
    json.dump(details, json_file)
    json_file.write('\n')  # one JSON object per line, so records do not run together

# XGBoost

In [None]:
import xgboost as xgb
gb_clf = xgb.XGBClassifier(eta=0.3,gamma=0,max_depth=6)
f1 = make_scorer(f1_score , average='macro')  

In [None]:
from sklearn.model_selection import cross_val_score
start = time.time()
cross_val_score_xgboost=cross_val_score(gb_clf,x,y,cv=5,scoring=f1).mean()
end = time.time()
xgboost_time=end-start

In [None]:
# Grid search for XGBoost on a fixed, shuffled 5-fold stratified split.
cross_validation = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)
gb_clf=xgb.XGBClassifier()

# 3 x 9 = 27 parameter combinations.
gamma=range(1, 10, 1)
eta=[0.001,0.01,0.05]
grid = dict(eta=eta,gamma=gamma)

# `grid_result` is read by the following cells for the best parameters and score.
grid_search = GridSearchCV(estimator=gb_clf, param_grid=grid, cv=cross_validation, scoring=f1,n_jobs=8)
grid_result = grid_search.fit(x,y)

# Best score using XG boost parameter

In [None]:
XG_Para=grid_result.best_params_
XG_Para

# Best score using XG boost score

In [None]:
XG_f1_score=grid_result.best_score_
XG_f1_score

In [None]:
xgboost_time

In [None]:
# Summary record for XGBoost: baseline cross-validation time and score plus the tuned result.
details = {
    'name': 'XGboost',
    'time': xgboost_time,
    'f1_score': XG_f1_score,
    'Best_Parameter': XG_Para,
    'Cross validation score': cross_val_score_xgboost,
}
# Fixed: the original opened `k`, a name that is never defined in this notebook.
# DatasetName is the results file that gridfun also appends to.
with open(DatasetName, 'a') as json_file:
    json.dump(details, json_file)
    json_file.write('\n')  # one JSON object per line, so records do not run together