## Hyperparameter Optimization

### General Libraries

In [47]:
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings("ignore")

### Constructing the classifier

In [48]:
# read the data
df = pd.read_csv("./data/Latest_Data_Science_Salaries.csv",sep=";")

# check data frame shape
df.shape

(3300, 11)

In [50]:
df.columns

Index(['Job Title', 'Employment Type', 'Experience Level', 'Expertise Level',
       'Salary', 'Salary Currency', 'Company Location', 'Salary in USD',
       'Employee Residence', 'Company Size', 'Year'],
      dtype='object')

In [51]:
# remplace chaque modalité dont le nombre d'occurences est inferieur à 10 par une nouvelle modalité 

imbalanceCol = ["Company Location", "Job Title"]

for col in imbalanceCol:
    counts = df[col].value_counts()
    modalities = counts[counts < 10].index.tolist()
    df[col] = df[col].replace(modalities, f"rare_{col}")

In [52]:
imbalanceCol = ["Salary Currency"]

for col in imbalanceCol:
    counts = df[col].value_counts()
    modalities = counts[counts < 45].index.tolist()
    df[col] = df[col].replace(modalities, f"rare_{col}")

In [53]:
def preprocessing(df):
    """ 
        Cette methode permet de faire le preprocessing d'un dataset
    """
    # colsReturns = getColumns(df,df.columns)

    # newData = df[colsReturns]

    newData = pd.get_dummies(df)

    return newData

In [54]:
df = preprocessing(df)

In [55]:
df.shape

(3300, 155)

In [56]:
from sklearn.model_selection import train_test_split

# splitting data
X = df.drop(["Salary in USD","Salary"], 1)
y = df[["Salary in USD"]]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20,shuffle=False)

In [57]:
from sklearn.preprocessing import StandardScaler,RobustScaler,MinMaxScaler
scaler = MinMaxScaler()

X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

### Hyperparameter Optimization

#### Defining hyperparameter space

In [58]:
hyperparameters = {
    'n_estimators': [100, 200, 500],                # Nombre d'arbres dans la forêt
    'max_depth': [10, 20, 50,100,500,1000],                # Profondeur maximale des arbres
    'min_samples_split': [2, 5, 10],                # Nombre minimum d'échantillons requis pour scinder un nœud interne
    'min_samples_leaf': [1, 2, 4,10]                  # Nombre minimum d'échantillons requis dans une feuille

}

#### Random search

In [59]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor

# perform random search
clf_rs = RandomizedSearchCV(RandomForestRegressor(), hyperparameters, cv = 3, n_iter = 20)
random_search = clf_rs.fit(X_train, y_train)

# identify best parameters from random search
random_search.best_params_

{'n_estimators': 200,
 'min_samples_split': 10,
 'min_samples_leaf': 2,
 'max_depth': 100}

In [60]:
model_arbre = RandomForestRegressor(
                                     max_depth=random_search.best_params_["max_depth"],
                                     min_samples_leaf=random_search.best_params_["min_samples_leaf"],
                                     min_samples_split=random_search.best_params_["min_samples_split"],
                                     min_weight_fraction_leaf=0.0,
                                    #  n_estimators=random_search.best_params_["n_estimators"],
                                    #  bootstrap=random_search.best_params_["bootstrap"],
                                    #  max_features=random_search.best_params_["max_features"]

                                     
                                     )
model_arbre.fit(X_train,y_train)
model_arbre.score(X_test,y_test)

0.4495586046253258

#### Grid search

In [61]:
# from sklearn.model_selection import GridSearchCV

# # perform grid search
# clf_gs = GridSearchCV(RandomForestRegressor(), hyperparameters, cv = 3)
# grid_search = clf_gs.fit(X_train, y_train)

# # identify best parameters from grid search
# grid_search.best_params_

In [62]:
# model_arbre1 = RandomForestRegressor(
#                                      max_depth=grid_search.best_params_["max_depth"],
#                                      min_samples_leaf=grid_search.best_params_["min_samples_leaf"],
#                                      min_samples_split=grid_search.best_params_["min_samples_split"],
#                                      min_weight_fraction_leaf=0.0,
#                                     #  n_estimators=grid_search.best_params_["n_estimators"],
#                                     #  bootstrap=grid_search.best_params_["bootstrap"],
#                                     #  max_features=grid_search.best_params_["max_features"]

                                     
#                                      )
# model_arbre1.fit(X_train,y_train)
# model_arbre1.score(X_test,y_test)

#### Bayesian Optimization

In [63]:
from skopt import BayesSearchCV

# perform bayesian optimization
clf_bo = BayesSearchCV(RandomForestRegressor(), hyperparameters, cv =3,  n_iter = 20)
bayes_search = clf_bo.fit(X_train, y_train)

# identify best parameters from bayesian optimization
bayes_search.best_params_

OrderedDict([('max_depth', 1000),
             ('min_samples_leaf', 2),
             ('min_samples_split', 10),
             ('n_estimators', 500)])

In [64]:
model_arbre2 = RandomForestRegressor(
                                    max_depth=bayes_search.best_params_["max_depth"],
                                     min_samples_leaf=bayes_search.best_params_["min_samples_leaf"],
                                     min_samples_split=bayes_search.best_params_["min_samples_split"],
                                     min_weight_fraction_leaf=0.0,
                                     n_estimators=bayes_search.best_params_["n_estimators"],
                                    #  bootstrap=grid_search.best_params_["bootstrap"],
                                    #  max_features=grid_search.best_params_["max_features"]
                                    )
model_arbre2.fit(X_train,y_train)
model_arbre2.score(X_test,y_test)

0.45478108676043194