In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load


import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# visualization
import matplotlib.pyplot as plt
import seaborn as sns

# scaling, preprocessing
from sklearn import preprocessing

# tuning parameter
from sklearn.model_selection import train_test_split,GridSearchCV,RandomizedSearchCV,cross_val_score

# model
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from xgboost import XGBClassifier

# metric
from sklearn import model_selection
from sklearn.metrics import confusion_matrix, classification_report, plot_confusion_matrix,accuracy_score

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
# skip warning
from sklearn import preprocessing
import warnings
warnings.filterwarnings( action= 'ignore')

import os
# Print the full path of every file found under the Kaggle input directory.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root_dir, name) for name in file_names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

This notebook trains several classification models, first with default parameters and then with tuned parameters, to find the best model and the best parameters for predicting gender.

# IMPORT DATASET

In [None]:
# Read the survey CSV into a DataFrame, then show its schema followed by its rows.
csv_path = "/kaggle/input/gender-classification/Transformed Data Set - Sheet1.csv"
df = pd.read_csv(csv_path)
df.info()
df

# PREPROCESSING DATA

In [None]:
# Shorten the survey headers to snake_case identifiers; 'Gender' keeps its name.
snake_case_names = {
    'Favorite Color': 'favorite_color',
    'Favorite Music Genre': 'favorite_music_genre',
    'Favorite Beverage': 'favorite_beverage',
    'Favorite Soft Drink': 'favorite_soft_drink',
}
df = df.rename(columns=snake_case_names)

In [None]:
# Print the set of distinct values found in each column, one line per column.
for column in ['favorite_color', 'favorite_music_genre', 'favorite_beverage',
               'favorite_soft_drink', 'Gender']:
    print(f'unique column: {column} =', set(df[column].tolist()))

In [None]:
# Replace every categorical answer with an integer code.
# The mapping is laid out as {column: {original label: code}}.
category_codes = {
    'favorite_color': {'Warm': 0, 'Cool': 1, 'Neutral': 2},
    'favorite_music_genre': {
        'Folk/Traditional': 0,
        'Hip hop': 1,
        'Jazz/Blues': 2,
        'Rock': 3,
        'R&B and soul': 4,
        'Pop': 5,
        'Electronic': 6,
    },
    'favorite_beverage': {
        "Doesn't drink": 0,
        'Whiskey': 1,
        'Wine': 2,
        'Vodka': 3,
        'Other': 4,
        'Beer': 5,
    },
    'favorite_soft_drink': {'Fanta': 0, '7UP/Sprite': 1, 'Coca Cola/Pepsi': 2, 'Other': 3},
    'Gender': {'F': 0, 'M': 1},
}
df = df.replace(category_codes)

# Show the schema after encoding, then the encoded rows.
df.info()
df

# Exploratory Data Analysis

In [None]:
# Class-balance check: histogram of the encoded Gender column.
plt.figure(figsize=(4, 6))
df.loc[:, "Gender"].hist()

In [None]:
# Draw one box plot per column (each in its own subplot) to look for outliers.
df.plot.box(subplots=True, figsize=(20, 40), layout=(20, 4))
plt.show()

The box plots show no outliers, and the numbers of male and female respondents are equal.

# Train test split

In [None]:
# Build the feature matrix X (four preference columns) and the target vector y (Gender)
# as NumPy arrays, then print both shapes.
features = [
    'favorite_color',
    'favorite_music_genre',
    'favorite_beverage',
    'favorite_soft_drink',
]
X = df[features].to_numpy()
y = df['Gender'].to_numpy()
print(X.shape, y.shape)

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=100
)
# Display the shapes of the four resulting arrays.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

In [None]:
# Standardise the features with a StandardScaler.
# The scaler is fitted on the training split only; the test split is then
# transformed with those same training statistics. Refitting on the test split
# (fit_transform) would leak test information and put the two splits on
# different scales.
standard_scaler = preprocessing.StandardScaler()
X_train_scalar = standard_scaler.fit_transform(X_train)
X_test_scalar = standard_scaler.transform(X_test)
print(X_train_scalar.shape,X_test_scalar.shape)

# SVM

In [None]:
# SVC with library-default parameters.
svm_model = SVC()

# 10-fold cross-validation on the scaled training split; report mean and std of the fold scores.
kfold = model_selection.KFold(n_splits=10)
cv = cross_val_score(svm_model, X_train_scalar, y_train, cv=kfold)
cv_mean, cv_std = cv.mean(), cv.std()
print('mean =', cv_mean)
print('std =', cv_std)

# Fit on the whole training split and evaluate on the held-out test split.
y_pred_model = svm_model.fit(X_train_scalar, y_train).predict(X_test_scalar)
print('accuracy_score = ', accuracy_score(y_test, y_pred_model))
print(classification_report(y_test, y_pred_model))
plot_confusion_matrix(svm_model, X_test_scalar, y_test)
plt.show()

In [None]:
# Grid search over the SVC hyperparameters that shape the decision boundary.
#
# The previous grid varied `verbose`, `probability`, `break_ties` and
# `decision_function_shape`. Per the scikit-learn SVC documentation these are
# logging / output-format switches (decision_function_shape is ignored for
# binary problems), and break_ties=True is rejected together with 'ovo', so
# they cannot improve the score of this two-class model. `verbose=True` also
# floods the cell output. They are left at their defaults here.
svm_model = SVC()
svm_parameter = {
    'C': [0.1, 1, 10],                               # regularisation strength
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],  # kernel family
    'gamma': ['scale','auto'],                       # kernel coefficient
    'shrinking':[True,False],                        # shrinking heuristic on/off
}

# GridSearchCV is used with its default cross-validation and scoring.
clf = GridSearchCV(svm_model, svm_parameter)
clf.fit(X_train_scalar, y_train)
print("\nbest_score = ",clf.best_score_)
print("best_params = ",clf.best_params_)

In [None]:
# SVC with hand-picked parameters (presumably copied from the grid-search output; confirm).
svm_model = SVC(
    break_ties=True,
    decision_function_shape='ovr',
    gamma='scale',
    probability=True,
    shrinking=True,
    verbose=True,
)

# 10-fold cross-validation on the scaled training split.
kfold = model_selection.KFold(n_splits=10)
cv = cross_val_score(svm_model, X_train_scalar, y_train, cv=kfold)
cv_mean, cv_std = cv.mean(), cv.std()
print('mean =', cv_mean)
print('std =', cv_std)

# Fit on the whole training split and evaluate on the held-out test split.
y_pred_model = svm_model.fit(X_train_scalar, y_train).predict(X_test_scalar)
print('accuracy_score = ', accuracy_score(y_test, y_pred_model))
print(classification_report(y_test, y_pred_model))
plot_confusion_matrix(svm_model, X_test_scalar, y_test)
plt.show()

# LogisticRegression

In [None]:
# LogisticRegression with library-default parameters.
logistic_regrssion_model = LogisticRegression()

# 10-fold cross-validation on the scaled training split; report mean and std of the fold scores.
kfold = model_selection.KFold(n_splits=10)
cv = model_selection.cross_val_score(logistic_regrssion_model, X_train_scalar, y_train, cv=kfold)
cv_mean = cv.mean()
cv_std = cv.std()
print('mean =',cv_mean)
print('std =',cv_std)

# Fit on the whole training split and evaluate on the held-out test split.
# Prediction and the confusion matrix must use the logistic-regression model
# fitted here, not the SVM left over from an earlier cell.
logistic_regrssion_model.fit(X_train_scalar, y_train)
y_pred_model = logistic_regrssion_model.predict(X_test_scalar)
print('accuracy_score = ',accuracy_score(y_test, y_pred_model))
print(classification_report(y_test, y_pred_model))
plot_confusion_matrix(logistic_regrssion_model, X_test_scalar, y_test)
plt.show()

In [None]:
# Grid search over LogisticRegression settings.
# NOTE(review): several penalty / solver / dual combinations in this grid look
# incompatible with each other; confirm how GridSearchCV scores them.
logistic_regression_model = LogisticRegression()
logistic_regression_parameter = dict(
    penalty=['l1', 'l2', 'elasticnet', 'none'],
    dual=[True, False],
    fit_intercept=[True, False],
    solver=['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    max_iter=[100, 200, 400, 800],
    multi_class=['auto', 'ovr', 'multinomial'],
)

# Run the search on the scaled training split and report the winner.
clf = GridSearchCV(
    estimator=logistic_regression_model,
    param_grid=logistic_regression_parameter,
).fit(X_train_scalar, y_train)
print("\nbest_score = ", clf.best_score_)
print("best_params = ", clf.best_params_)

In [None]:
# LogisticRegression with hand-picked parameters (presumably copied from the grid-search output; confirm).
logistic_regrssion_model = LogisticRegression(dual= False, fit_intercept= True, max_iter= 100, multi_class= 'auto', penalty= 'l1', solver= 'saga')

# 10-fold cross-validation on the scaled training split; report mean and std of the fold scores.
kfold = model_selection.KFold(n_splits=10)
cv = model_selection.cross_val_score(logistic_regrssion_model, X_train_scalar, y_train, cv=kfold)
cv_mean = cv.mean()
cv_std = cv.std()
print('mean =',cv_mean)
print('std =',cv_std)

# Fit on the whole training split and evaluate on the held-out test split.
# Prediction and the confusion matrix must use the logistic-regression model
# fitted here, not the SVM left over from an earlier cell.
logistic_regrssion_model.fit(X_train_scalar, y_train)
y_pred_model = logistic_regrssion_model.predict(X_test_scalar)
print('accuracy_score = ',accuracy_score(y_test, y_pred_model))
print(classification_report(y_test, y_pred_model))
plot_confusion_matrix(logistic_regrssion_model, X_test_scalar, y_test)
plt.show()

# Decision Tree

In [None]:
# DecisionTreeClassifier with library-default parameters.
decision_tree_classifier_model = DecisionTreeClassifier()

# 10-fold cross-validation on the scaled training split.
kfold = model_selection.KFold(n_splits=10)
cv = cross_val_score(decision_tree_classifier_model, X_train_scalar, y_train, cv=kfold)
cv_mean, cv_std = cv.mean(), cv.std()
print('mean =', cv_mean)
print('std =', cv_std)

# Fit on the whole training split and evaluate on the held-out test split.
y_pred_model = decision_tree_classifier_model.fit(X_train_scalar, y_train).predict(X_test_scalar)
print('accuracy_score = ', accuracy_score(y_test, y_pred_model))
print(classification_report(y_test, y_pred_model))
plot_confusion_matrix(decision_tree_classifier_model, X_test_scalar, y_test)
plt.show()

In [None]:
# Grid search over DecisionTreeClassifier settings.
decision_tree_classifier_model = DecisionTreeClassifier()
decision_tree_classifier_parameter = dict(
    criterion=["gini", "entropy"],
    splitter=["best", "random"],
    max_depth=[3, 5, None],
    min_samples_split=[2, 4, 8],
    min_samples_leaf=[1, 2, 4],
    max_features=['auto', 'sqrt', 'log2'],
)

# Run the search on the scaled training split and report the winner.
clf = GridSearchCV(
    estimator=decision_tree_classifier_model,
    param_grid=decision_tree_classifier_parameter,
).fit(X_train_scalar, y_train)
print("\nbest_score = ", clf.best_score_)
print("best_params = ", clf.best_params_)

In [None]:
# DecisionTreeClassifier with hand-picked parameters (presumably copied from the grid-search output; confirm).
decision_tree_classifier_model = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=None,
    max_features='sqrt',
    min_samples_leaf=1,
    min_samples_split=2,
    splitter='random',
)

# 10-fold cross-validation on the scaled training split.
kfold = model_selection.KFold(n_splits=10)
cv = cross_val_score(decision_tree_classifier_model, X_train_scalar, y_train, cv=kfold)
cv_mean, cv_std = cv.mean(), cv.std()
print('mean =', cv_mean)
print('std =', cv_std)

# Fit on the whole training split and evaluate on the held-out test split.
y_pred_model = decision_tree_classifier_model.fit(X_train_scalar, y_train).predict(X_test_scalar)
print('accuracy_score = ', accuracy_score(y_test, y_pred_model))
print(classification_report(y_test, y_pred_model))
plot_confusion_matrix(decision_tree_classifier_model, X_test_scalar, y_test)
plt.show()

# KNeighborsClassifier

In [None]:
# KNeighborsClassifier with library-default parameters.
KNeighbors_classifier_model = KNeighborsClassifier()

# 10-fold cross-validation on the scaled training split.
kfold = model_selection.KFold(n_splits=10)
cv = cross_val_score(KNeighbors_classifier_model, X_train_scalar, y_train, cv=kfold)
cv_mean, cv_std = cv.mean(), cv.std()
print('mean =', cv_mean)
print('std =', cv_std)

# Fit on the whole training split and evaluate on the held-out test split.
y_pred_model = KNeighbors_classifier_model.fit(X_train_scalar, y_train).predict(X_test_scalar)
print('accuracy_score = ', accuracy_score(y_test, y_pred_model))
print(classification_report(y_test, y_pred_model))
plot_confusion_matrix(KNeighbors_classifier_model, X_test_scalar, y_test)
plt.show()

In [None]:
# Grid search over KNeighborsClassifier settings.
KNeighbors_classifier_model = KNeighborsClassifier()
KNeighbors_classifier_parameter = dict(
    n_neighbors=[5, 7, 9],
    weights=['uniform', 'distance'],
    algorithm=['auto', 'ball_tree', 'kd_tree', 'brute'],
    p=[1, 2],
)

# Run the search on the scaled training split and report the winner.
clf = GridSearchCV(
    estimator=KNeighbors_classifier_model,
    param_grid=KNeighbors_classifier_parameter,
).fit(X_train_scalar, y_train)
print("\nbest_score = ", clf.best_score_)
print("best_params = ", clf.best_params_)

In [None]:
# KNeighborsClassifier with hand-picked parameters (presumably copied from the grid-search output; confirm).
KNeighbors_classifier_model = KNeighborsClassifier(
    algorithm='auto',
    n_neighbors=7,
    p=1,
    weights='distance',
)

# 10-fold cross-validation on the scaled training split.
kfold = model_selection.KFold(n_splits=10)
cv = cross_val_score(KNeighbors_classifier_model, X_train_scalar, y_train, cv=kfold)
cv_mean, cv_std = cv.mean(), cv.std()
print('mean =', cv_mean)
print('std =', cv_std)

# Fit on the whole training split and evaluate on the held-out test split.
y_pred_model = KNeighbors_classifier_model.fit(X_train_scalar, y_train).predict(X_test_scalar)
print('accuracy_score = ', accuracy_score(y_test, y_pred_model))
print(classification_report(y_test, y_pred_model))
plot_confusion_matrix(KNeighbors_classifier_model, X_test_scalar, y_test)
plt.show()

# GaussianNB

In [None]:
# GaussianNB with library-default parameters.
gaussian_nb_classifier_model = GaussianNB()

# 10-fold cross-validation on the scaled training split.
kfold = model_selection.KFold(n_splits=10)
cv = cross_val_score(gaussian_nb_classifier_model, X_train_scalar, y_train, cv=kfold)
cv_mean, cv_std = cv.mean(), cv.std()
print('mean =', cv_mean)
print('std =', cv_std)

# Fit on the whole training split and evaluate on the held-out test split.
y_pred_model = gaussian_nb_classifier_model.fit(X_train_scalar, y_train).predict(X_test_scalar)
print('accuracy_score = ', accuracy_score(y_test, y_pred_model))
print(classification_report(y_test, y_pred_model))
plot_confusion_matrix(gaussian_nb_classifier_model, X_test_scalar, y_test)
plt.show()

In [None]:
# Grid search over the class priors of GaussianNB.
#
# `priors` needs one probability per class (summing to 1), not a single scalar.
# Each candidate below is [P(Gender=0), P(Gender=1)], i.e. [P(F), P(M)] under
# the encoding applied earlier in this notebook. None lets the model estimate
# the priors from the training data.
gaussian_nb_classifier_model = GaussianNB()
gaussian_nb_classifier_parameter = {
    'priors': [[0.1, 0.9], [0.5, 0.5], [0.9, 0.1], None]
}

# Run the search on the scaled training split and report the winner.
clf = GridSearchCV(gaussian_nb_classifier_model, gaussian_nb_classifier_parameter)
clf.fit(X_train_scalar, y_train)
print("\nbest_score = ",clf.best_score_)
print("best_params = ",clf.best_params_)

In [None]:
# GaussianNB with priors=None, the value hard-coded as the tuned setting.
gaussian_nb_classifier_model = GaussianNB(priors=None)

# 10-fold cross-validation on the scaled training split.
kfold = model_selection.KFold(n_splits=10)
cv = cross_val_score(gaussian_nb_classifier_model, X_train_scalar, y_train, cv=kfold)
cv_mean, cv_std = cv.mean(), cv.std()
print('mean =', cv_mean)
print('std =', cv_std)

# Fit on the whole training split and evaluate on the held-out test split.
y_pred_model = gaussian_nb_classifier_model.fit(X_train_scalar, y_train).predict(X_test_scalar)
print('accuracy_score = ', accuracy_score(y_test, y_pred_model))
print(classification_report(y_test, y_pred_model))
plot_confusion_matrix(gaussian_nb_classifier_model, X_test_scalar, y_test)
plt.show()

# SGDClassifier

In [None]:
# SGDClassifier with library-default parameters.
sgd_classifier_model = SGDClassifier()

# 10-fold cross-validation on the scaled training split.
kfold = model_selection.KFold(n_splits=10)
cv = cross_val_score(sgd_classifier_model, X_train_scalar, y_train, cv=kfold)
cv_mean, cv_std = cv.mean(), cv.std()
print('mean =', cv_mean)
print('std =', cv_std)

# Fit on the whole training split and evaluate on the held-out test split.
y_pred_model = sgd_classifier_model.fit(X_train_scalar, y_train).predict(X_test_scalar)
print('accuracy_score = ', accuracy_score(y_test, y_pred_model))
print(classification_report(y_test, y_pred_model))
plot_confusion_matrix(sgd_classifier_model, X_test_scalar, y_test)
plt.show()

In [None]:
# Grid search over SGDClassifier settings.
#
# Per the scikit-learn SGDClassifier documentation, the 'constant',
# 'invscaling' and 'adaptive' learning-rate schedules need an initial rate
# eta0 > 0, while the default eta0 is 0.0. Without `eta0` in the grid those
# three schedules cannot be fitted, so only 'optimal' was really compared.
# `eta0` is therefore searched as well ('optimal' does not use it).
sgd_classifier_model = SGDClassifier()
sgd_classifier_parameter = {
    'penalty': ['l2', 'l1', 'elasticnet'],
    'l1_ratio': [0.15,0.50,0.85],
    'fit_intercept': [True,False],
    'max_iter': [1000,2000,4000],
    'shuffle': [True,False],
    'learning_rate': ['constant','optimal','invscaling','adaptive'],
    'eta0': [0.01, 0.1],
}

# Run the search on the scaled training split and report the winner.
clf = GridSearchCV(sgd_classifier_model, sgd_classifier_parameter)
clf.fit(X_train_scalar, y_train)
print("\nbest_score = ",clf.best_score_)
print("best_params = ",clf.best_params_)

In [None]:
# SGDClassifier with hand-picked parameters (presumably copied from the grid-search output; confirm).
sgd_classifier_model = SGDClassifier(
    fit_intercept=False,
    l1_ratio=0.85,
    learning_rate='optimal',
    max_iter=1000,
    penalty='l2',
    shuffle=True,
)

# 10-fold cross-validation on the scaled training split.
kfold = model_selection.KFold(n_splits=10)
cv = cross_val_score(sgd_classifier_model, X_train_scalar, y_train, cv=kfold)
cv_mean, cv_std = cv.mean(), cv.std()
print('mean =', cv_mean)
print('std =', cv_std)

# Fit on the whole training split and evaluate on the held-out test split.
y_pred_model = sgd_classifier_model.fit(X_train_scalar, y_train).predict(X_test_scalar)
print('accuracy_score = ', accuracy_score(y_test, y_pred_model))
print(classification_report(y_test, y_pred_model))
plot_confusion_matrix(sgd_classifier_model, X_test_scalar, y_test)
plt.show()

# Random Forest Classifier

In [None]:
# RandomForestClassifier with library-default parameters.
random_forest_classifier_model = RandomForestClassifier()

# 10-fold cross-validation on the scaled training split.
kfold = model_selection.KFold(n_splits=10)
cv = cross_val_score(random_forest_classifier_model, X_train_scalar, y_train, cv=kfold)
cv_mean, cv_std = cv.mean(), cv.std()
print('mean =', cv_mean)
print('std =', cv_std)

# Fit on the whole training split and evaluate on the held-out test split.
y_pred_model = random_forest_classifier_model.fit(X_train_scalar, y_train).predict(X_test_scalar)
print('accuracy_score = ', accuracy_score(y_test, y_pred_model))
print(classification_report(y_test, y_pred_model))
plot_confusion_matrix(random_forest_classifier_model, X_test_scalar, y_test)
plt.show()

In [None]:
# Grid search over RandomForestClassifier settings.
random_forest_classifier_model = RandomForestClassifier()
random_forest_classifier_parameter = dict(
    n_estimators=[100, 400, 700, 1000],
    max_features=["auto", "sqrt"],
    max_depth=[3, 5, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

# Run the search on the scaled training split and report the winner.
clf = GridSearchCV(
    estimator=random_forest_classifier_model,
    param_grid=random_forest_classifier_parameter,
).fit(X_train_scalar, y_train)
print("\nbest_score = ", clf.best_score_)
print("best_params = ", clf.best_params_)

In [None]:
# RandomForestClassifier with hand-picked parameters (presumably copied from the grid-search output; confirm).
random_forest_classifier_model = RandomForestClassifier(
    n_estimators=700,
    max_features="auto",
    max_depth=None,
    min_samples_leaf=1,
    bootstrap=True,
    min_samples_split=2,
)

# 10-fold cross-validation on the scaled training split.
kfold = model_selection.KFold(n_splits=10)
cv = cross_val_score(random_forest_classifier_model, X_train_scalar, y_train, cv=kfold)
cv_mean, cv_std = cv.mean(), cv.std()
print('mean =', cv_mean)
print('std =', cv_std)

# Fit on the whole training split and evaluate on the held-out test split.
y_pred_model = random_forest_classifier_model.fit(X_train_scalar, y_train).predict(X_test_scalar)
print('accuracy_score = ', accuracy_score(y_test, y_pred_model))
print(classification_report(y_test, y_pred_model))
plot_confusion_matrix(random_forest_classifier_model, X_test_scalar, y_test)
plt.show()

# XGBClassifier

In [None]:
# XGBClassifier with library-default parameters.
xgb_classifier_model = XGBClassifier()

# 10-fold cross-validation on the scaled training split.
kfold = model_selection.KFold(n_splits=10)
cv = cross_val_score(xgb_classifier_model, X_train_scalar, y_train, cv=kfold)
cv_mean, cv_std = cv.mean(), cv.std()
print('mean =', cv_mean)
print('std =', cv_std)

# Fit on the whole training split and evaluate on the held-out test split.
y_pred_model = xgb_classifier_model.fit(X_train_scalar, y_train).predict(X_test_scalar)
print('accuracy_score = ', accuracy_score(y_test, y_pred_model))
print(classification_report(y_test, y_pred_model))
plot_confusion_matrix(xgb_classifier_model, X_test_scalar, y_test)
plt.show()

In [None]:
# Grid search over XGBClassifier settings; only max_depth is searched.
xgb_classifier_model = XGBClassifier()
xgb_classifier_parameter = {
    'max_depth': [3, 4, 5, 6, 8, 10, 12, 15],
    # Further candidates, currently disabled:
    #     'learning_rate': [ 0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ],
    #     'n_estimators': [ 50, 75, 100, 125, 150 ],
    #     'min_child_weight': [ 1, 3, 5, 7 ],
    #     'gamma': [ 0.0, 0.1, 0.2, 0.3, 0.4 ],
    #     'colsample_bytree': [ 0.3, 0.4, 0.5, 0.7 ]
}

# Run the search on the scaled training split and report the winner.
clf = GridSearchCV(
    estimator=xgb_classifier_model,
    param_grid=xgb_classifier_parameter,
).fit(X_train_scalar, y_train)
print("\nbest_score = ", clf.best_score_)
print("best_params = ", clf.best_params_)

In [None]:
# XGBClassifier with max_depth=3 (presumably copied from the grid-search output; confirm).
xgb_classifier_model = XGBClassifier(max_depth=3)

# 10-fold cross-validation on the scaled training split.
kfold = model_selection.KFold(n_splits=10)
cv = cross_val_score(xgb_classifier_model, X_train_scalar, y_train, cv=kfold)
cv_mean, cv_std = cv.mean(), cv.std()
print('mean =', cv_mean)
print('std =', cv_std)

# Fit on the whole training split and evaluate on the held-out test split.
y_pred_model = xgb_classifier_model.fit(X_train_scalar, y_train).predict(X_test_scalar)
print('accuracy_score = ', accuracy_score(y_test, y_pred_model))
print(classification_report(y_test, y_pred_model))
plot_confusion_matrix(xgb_classifier_model, X_test_scalar, y_test)
plt.show()

# Multi layer perceptron

In [None]:
# MLPClassifier with library-default parameters.
mlp_classifier_model = MLPClassifier()

# 10-fold cross-validation on the scaled training split.
kfold = model_selection.KFold(n_splits=10)
cv = cross_val_score(mlp_classifier_model, X_train_scalar, y_train, cv=kfold)
cv_mean, cv_std = cv.mean(), cv.std()
print('mean =', cv_mean)
print('std =', cv_std)

# Fit on the whole training split and evaluate on the held-out test split.
y_pred_model = mlp_classifier_model.fit(X_train_scalar, y_train).predict(X_test_scalar)
print('accuracy_score = ', accuracy_score(y_test, y_pred_model))
print(classification_report(y_test, y_pred_model))
plot_confusion_matrix(mlp_classifier_model, X_test_scalar, y_test)
plt.show()

In [None]:
# Grid search over MLPClassifier settings: activation, solver and max_iter.
mlp_classifier_model = MLPClassifier()
mlp_classifier_parameter = {
    'activation': ['identity', 'logistic', 'tanh', 'relu'],
    'solver': ['lbfgs', 'sgd', 'adam'],
    'max_iter': [200, 800, 1600],
    # Further candidates, currently disabled:
    #     'power_t': [0.1,0.5,0.8],
    #     'shuffle': [True,False],
    #     'verbose': [True,False],
    #     'warm_start': [True,False],
    #     'nesterovs_momentum': [True,False],
    #     'early_stopping': [True,False],
    #     'validation_fraction': [0.1,0.5,0.8]
}

# Run the search on the scaled training split and report the winner.
clf = GridSearchCV(
    estimator=mlp_classifier_model,
    param_grid=mlp_classifier_parameter,
).fit(X_train_scalar, y_train)
print("\nbest_score = ", clf.best_score_)
print("best_params = ", clf.best_params_)

In [None]:
# MLPClassifier with hand-picked parameters (presumably copied from the grid-search output; confirm).
mlp_classifier_model = MLPClassifier(
    activation='relu',
    solver='adam',
    max_iter=200,
)

# 10-fold cross-validation on the scaled training split.
kfold = model_selection.KFold(n_splits=10)
cv = cross_val_score(mlp_classifier_model, X_train_scalar, y_train, cv=kfold)
cv_mean, cv_std = cv.mean(), cv.std()
print('mean =', cv_mean)
print('std =', cv_std)

# Fit on the whole training split and evaluate on the held-out test split.
y_pred_model = mlp_classifier_model.fit(X_train_scalar, y_train).predict(X_test_scalar)
print('accuracy_score = ', accuracy_score(y_test, y_pred_model))
print(classification_report(y_test, y_pred_model))
plot_confusion_matrix(mlp_classifier_model, X_test_scalar, y_test)
plt.show()

In [None]:
# Accuracy per model with default parameters.
# NOTE(review): the values are hard-coded, presumably transcribed by hand from
# the evaluation cells; confirm they still match after any re-run.
default_accuracy_by_model = {
    'svm': 0.7,
    'LogisticRegression': 0.7,
    'DecisionTreeClassifier': 0.65,
    'KNeighborsClassifier': 0.45,
    'GaussianNB': 0.4,
    'SGDClassifier': 0.4,
    'RandomForestClassifier': 0.6,
    'XGBClassifier': 0.65,
    'MLPClassifier': 0.75,
}
df_default_parameter = pd.DataFrame({
    'model': list(default_accuracy_by_model.keys()),
    'accuracy': list(default_accuracy_by_model.values()),
})
df_default_parameter

In [None]:
# Bar chart comparing the accuracy values listed in df_default_parameter (default parameters).
plt.figure(figsize=(40, 20))
sns.set_theme(style="whitegrid")
ax = sns.barplot(data=df_default_parameter, x="model", y="accuracy")

In [None]:
# Accuracy per model with tuned parameters.
# NOTE(review): the values are hard-coded, presumably transcribed by hand from
# the evaluation cells; confirm they still match after any re-run.
tuned_accuracy_by_model = {
    'svm': 0.7,
    'LogisticRegression': 0.7,
    'DecisionTreeClassifier': 0.75,
    'KNeighborsClassifier': 0.6,
    'GaussianNB': 0.4,
    'SGDClassifier': 0.6,
    'RandomForestClassifier': 0.7,
    'XGBClassifier': 0.65,
    'MLPClassifier': 0.75,
}
df_tuning_parameter = pd.DataFrame({
    'model': list(tuned_accuracy_by_model.keys()),
    'accuracy': list(tuned_accuracy_by_model.values()),
})
df_tuning_parameter

In [None]:
# Bar chart comparing the accuracy values listed in df_tuning_parameter (tuned parameters).
plt.figure(figsize=(40, 20))
sns.set_theme(style="whitegrid")
ax = sns.barplot(data=df_tuning_parameter, x="model", y="accuracy")

# Conclusion

# limitation
Accuracy does not exceed 80 percent, and the predictions are volatile, because the dataset is very small.
# advantage
1. The data is relatively clean and has relatively few columns, so it is easy to get started with.
2. The relatively small amount of data means less training time.

Finally!!!!