## Hyperparameter Optimization

### General Libraries

In [166]:
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings("ignore")

### Constructing the classifier

In [167]:
# read the data
df = pd.read_csv("./data/Latest_Data_Science_Salaries.csv",sep=";")

# check data frame shape
df.shape

(3300, 11)

In [168]:
def preprocessing(df):
    """ 
        Cette methode permet de faire le preprocessing d'un dataset
    """
    # colsReturns = getColumns(df,df.columns)

    # newData = df[colsReturns]

    newData = pd.get_dummies(df)

    return newData

In [169]:
df = preprocessing(df)

In [170]:
from sklearn.model_selection import train_test_split

# splitting data
X = df.drop(["Salary in USD","Salary","Year"], 1)
y = df[["Salary in USD"]]

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20,shuffle=False)

In [171]:
y

Unnamed: 0,Salary in USD
0,210000
1,165000
2,185900
3,129300
4,140000
...,...
3295,412000
3296,151000
3297,105000
3298,100000


In [172]:
y.value_counts()

Salary in USD
100000           56
200000           50
150000           50
120000           46
160000           36
                 ..
113366            1
113476            1
113600            1
114047            1
450000            1
Length: 1315, dtype: int64

In [173]:
from sklearn.model_selection import train_test_split

# splitting data
X = df.drop(["Salary in USD","Year","Salary"], 1)
y = df[["Salary in USD"]]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20,shuffle=False)

In [174]:
from sklearn.preprocessing import StandardScaler,RobustScaler,MinMaxScaler
scaler = MinMaxScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

### Hyperparameter Optimization

#### Defining hyperparameter space

In [152]:
hyperparameters = {
    'max_depth': [5, 10, 15,100,200,300,400,500,1000],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'splitter': ['best', 'random']
}

#### Random search

In [145]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeRegressor

# perform random search
clf_rs = RandomizedSearchCV(DecisionTreeRegressor(), hyperparameters, cv = 3, scoring = "f1", n_iter = 20)
random_search = clf_rs.fit(X_train, y_train)

# identify best parameters from random search
random_search.best_params_

{'splitter': 'best',
 'min_samples_split': 5,
 'min_samples_leaf': 4,
 'max_depth': 10}

In [146]:
model_arbre = DecisionTreeRegressor(max_depth=random_search.best_params_["max_depth"],
                                     min_samples_leaf=random_search.best_params_["min_samples_leaf"],
                                     min_samples_split=2,splitter=random_search.best_params_["splitter"],
                                     min_weight_fraction_leaf=0.0)
model_arbre.fit(X_train,y_train)
model_arbre.score(X_test,y_test)

0.3554797491828071

#### Grid search

In [139]:
from sklearn.model_selection import GridSearchCV

# perform grid search
clf_gs = GridSearchCV(DecisionTreeRegressor(), hyperparameters, cv = 3, scoring = "f1")
grid_search = clf_gs.fit(X_train, y_train)

# identify best parameters from grid search
grid_search.best_params_

{'max_depth': 5,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'splitter': 'best'}

In [140]:
model_arbre1 = DecisionTreeRegressor(max_depth=grid_search.best_params_["max_depth"],
                                     min_samples_leaf=grid_search.best_params_["min_samples_leaf"],
                                     min_samples_split=2,splitter=grid_search.best_params_["splitter"],
                                     min_weight_fraction_leaf=0.0)
model_arbre1.fit(X_train,y_train)
model_arbre1.score(X_test,y_test)

0.304381312871428

#### Bayesian Optimization

In [141]:
from skopt import BayesSearchCV

# perform bayesian optimization
clf_bo = BayesSearchCV(DecisionTreeRegressor(), hyperparameters, cv =3,  n_iter = 20)
bayes_search = clf_bo.fit(X_train, y_train)

# identify best parameters from bayesian optimization
bayes_search.best_params_

OrderedDict([('max_depth', 400),
             ('min_samples_leaf', 4),
             ('min_samples_split', 2),
             ('splitter', 'random')])

In [142]:
model_arbre2 = DecisionTreeRegressor(max_depth=bayes_search.best_params_["max_depth"],
                                     min_samples_leaf=bayes_search.best_params_["min_samples_leaf"],
                                     min_samples_split=2,splitter=bayes_search.best_params_["splitter"],
                                     min_weight_fraction_leaf=0.0)
model_arbre2.fit(X_train,y_train)
model_arbre2.score(X_test,y_test)

0.3552607280723453

In [175]:
model_arbre = DecisionTreeRegressor(max_depth=500,min_samples_leaf=11,min_samples_split=2,splitter='best',min_weight_fraction_leaf=0.0)
model_arbre.fit(X_train,y_train)
model_arbre.score(X_test,y_test)

0.45083356175149125