In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn import metrics
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline

from skopt import BayesSearchCV

import datetime

In [None]:
# Read the cleaned salary dataset (path is relative to the notebook) and
# preview its first rows.
df = pd.read_csv(filepath_or_buffer='../Data/salary_cleaned.csv')
df.head(n=5)

In [None]:
# Print the frame's column summary to check dtypes before any conversion.
df.info()

In [None]:
# Cast the calendar columns to strings so that they are no longer numeric
# dtypes, then re-check the column summary to confirm the change.
for calendar_col in ('year', 'month'):
    df[calendar_col] = df[calendar_col].astype(str)

df.info()

In [None]:
# Proportion of rows per location (normalize=True yields fractions, not counts).
df['location'].value_counts(normalize=True)

In [None]:
# Create the candidate feature lists and the modelling inputs.
# features_all is the wider candidate set; features_short is the subset that is
# actually used to build X below.
features_all = [
    'company', 'title', 'location',
    'yearsofexperience', 'yearsatcompany',
    'year', 'month', 'year_month',
    'state_short',
    'inflation_rate', 'inflation_rate_3mos',
    'employment_rate', 'employment_rate_3mos',
]
features_short = [
    'company', 'title',
    'yearsofexperience', 'yearsatcompany',
    'year', 'month',
    'state',
    'inflation_rate', 'inflation_rate_3mos',
    'employment_rate', 'employment_rate_3mos',
]

# Target: total yearly compensation; predictors: the short feature list.
y = df['totalyearlycompensation']
X = df[features_short]
(X.shape, y.shape)

In [None]:
# Hold out a test split. No test_size is passed, so scikit-learn's default
# proportion applies; random_state pins the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)
(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

In [None]:
# Preview the first rows of the training features.
X_train.head()

### Transform data: standardization and one-hot encoding

In [None]:
# Preprocess features: standardize the numeric columns and one-hot encode the
# categorical ones.
#
# The categorical columns are referenced by name rather than by position so the
# selection keeps pointing at the same columns if the feature list is ever
# reordered. 'company', 'title' and 'state' are the columns found at positions
# 0, 1 and 6 of features_short.
#
# NOTE(review): 'year' and 'month' are cast to str earlier in the notebook, so
# the numeric selector does not pick them up and they are not listed below;
# with ColumnTransformer's default remainder='drop' they are excluded from the
# model input. Add them to categorical_cols if they are meant to be encoded —
# confirm intent.
categorical_cols = ['company', 'title', 'state']

ct = ColumnTransformer([
    ('sc', StandardScaler(), make_column_selector(dtype_include=np.number)),
    # handle_unknown='ignore' lets categories unseen during fit pass through
    # transform() without raising.
    ('ohe', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
])

# Fit on the training split only, then apply the fitted transformer to the test
# split.
X_train_ct = ct.fit_transform(X_train)
X_test_ct = ct.transform(X_test)

In [None]:
# Dimensions of the transformed training matrix.
X_train_ct.shape

In [None]:
# Display the untransformed training features for comparison.
X_train

In [None]:
# Output column names produced by the fitted ColumnTransformer.
ct.get_feature_names_out()

### Gradient Boosting Regressor

#### RandomizedSearchCV
[**Documentation on RandomizedSearchCV**](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html)

In [None]:
# Randomized hyperparameter search for a gradient-boosted regressor.
gbr = GradientBoostingRegressor(random_state=42)

# Search space: 4 x 4 x 4 = 64 combinations, of which n_iter=50 are sampled.
# Not searched at the moment (left at the estimator defaults):
#   learning_rate      e.g. [0.01, 0.1]
#   min_samples_split  e.g. [5, 7, 10]
#   min_samples_leaf   e.g. [2, 3, 5]
# NOTE(review): the integer max_features candidates go up to 1000 — confirm
# that the encoded matrix (X_train_ct.shape[1]) has at least that many columns.
gbr_params = dict(
    n_estimators=range(400, 1001, 200),
    max_depth=range(2, 6),
    max_features=range(400, 1001, 200),
)

rs_gbr = RandomizedSearchCV(
    estimator=gbr,
    param_distributions=gbr_params,
    n_iter=50,
    cv=5,
    random_state=42,
)

# Train the search; print a timestamp before and after the fit so the elapsed
# wall-clock time can be read off the output.
fit_started_at = datetime.datetime.now()
print(fit_started_at)

rs_gbr.fit(X_train_ct, y_train)

fit_finished_at = datetime.datetime.now()
print(fit_finished_at)