# Modeling

## Hans

## Annie: KNN and gradient boosting classifier

In [49]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn import metrics
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline

from skopt import BayesSearchCV

import datetime

ModuleNotFoundError: No module named 'skopt'

In [2]:
df = pd.read_csv('../Data/salary_cleaned.csv')
df.head()

Unnamed: 0,timestamp,company,level,title,totalyearlycompensation,location,yearsofexperience,yearsatcompany,tag,year,month,year_month,timestamp_3mos,year_month_3mos,state_short,inflation_rate,inflation_rate_3mos,state,employment_rate,employment_rate_3mos
0,2018-06-03 13:58:20,Yahoo,IC2,Software Engineer,160.5,"Sunnyvale, CA",0.58,0.58,Full Stack,2018,6,2018-06,2018-03-03 13:58:20,2018-03,CA,0.029,0.024,California,0.95766,0.956797
1,2018-06-04 20:28:22,Facebook,E3,Software Engineer,165.0,"Seattle, WA",1.0,1.0,Full Stack,2018,6,2018-06,2018-03-04 20:28:22,2018-03,WA,0.029,0.024,Washington,0.955998,0.954978
2,2018-06-05 00:56:33,VmWare,Senior MTS,Software Engineer,218.0,"Palo Alto, CA",8.0,1.0,Distributed Systems (Back-End),2018,6,2018-06,2018-03-05 00:56:33,2018-03,CA,0.029,0.024,California,0.95766,0.956797
3,2018-06-05 01:19:05,Uber,L4,Software Engineer,240.0,"San Francisco, CA",3.0,0.0,Web Development (Front-End),2018,6,2018-06,2018-03-05 01:19:05,2018-03,CA,0.029,0.024,California,0.95766,0.956797
4,2018-06-05 07:13:17,Capital One,Master Software Engineer,Software Engineer,196.0,"New York, NY",8.0,2.0,iOS,2018,6,2018-06,2018-03-05 07:13:17,2018-03,NY,0.029,0.024,New York,0.959053,0.955962


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24496 entries, 0 to 24495
Data columns (total 20 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   timestamp                24496 non-null  object 
 1   company                  24496 non-null  object 
 2   level                    24496 non-null  object 
 3   title                    24496 non-null  object 
 4   totalyearlycompensation  24496 non-null  float64
 5   location                 24496 non-null  object 
 6   yearsofexperience        24496 non-null  float64
 7   yearsatcompany           24496 non-null  float64
 8   tag                      24496 non-null  object 
 9   year                     24496 non-null  int64  
 10  month                    24496 non-null  int64  
 11  year_month               24496 non-null  object 
 12  timestamp_3mos           24496 non-null  object 
 13  year_month_3mos          24496 non-null  object 
 14  state_short           

In [4]:
# convert year and month from integer to string
df['year'] = df['year'].astype(str)
df['month'] = df['month'].astype(str)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24496 entries, 0 to 24495
Data columns (total 20 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   timestamp                24496 non-null  object 
 1   company                  24496 non-null  object 
 2   level                    24496 non-null  object 
 3   title                    24496 non-null  object 
 4   totalyearlycompensation  24496 non-null  float64
 5   location                 24496 non-null  object 
 6   yearsofexperience        24496 non-null  float64
 7   yearsatcompany           24496 non-null  float64
 8   tag                      24496 non-null  object 
 9   year                     24496 non-null  object 
 10  month                    24496 non-null  object 
 11  year_month               24496 non-null  object 
 12  timestamp_3mos           24496 non-null  object 
 13  year_month_3mos          24496 non-null  object 
 14  state_short           

In [5]:
df['location'].value_counts(normalize=True)

Seattle, WA          0.161169
San Francisco, CA    0.136920
New York, NY         0.078829
Redmond, WA          0.065072
Sunnyvale, CA        0.049110
                       ...   
Schenectady, NY      0.000041
Allentown, PA        0.000041
Portsmouth, RI       0.000041
Fullerton, CA        0.000041
Elk Grove, CA        0.000041
Name: location, Length: 447, dtype: float64

In [6]:
# create 
features_all = ['company', 'title', 'location', 'yearsofexperience', 'yearsatcompany', 'year', 'month', 'year_month', 'state_short', 'inflation_rate', 'inflation_rate_3mos', 'employment_rate', 'employment_rate_3mos']
features_short = ['company', 'title', 'yearsofexperience', 'yearsatcompany', 'year', 'month', 'state', 'inflation_rate', 'inflation_rate_3mos', 'employment_rate', 'employment_rate_3mos']

X = df[features_short]
y = df['totalyearlycompensation']
X.shape, y.shape

((24496, 11), (24496,))

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((18372, 11), (18372,), (6124, 11), (6124,))

In [8]:
X_train.head()

Unnamed: 0,company,title,yearsofexperience,yearsatcompany,year,month,state,inflation_rate,inflation_rate_3mos,employment_rate,employment_rate_3mos
19069,Google,Product Manager,15.0,1.0,2020,7,Washington,0.01,0.003,0.898081,0.83723
3156,Amazon,Software Engineer,5.0,5.0,2019,3,Washington,0.019,0.019,0.955238,0.954989
14183,Twilio,Software Engineer,4.0,1.0,2020,4,California,0.003,0.025,0.836228,0.96139
13104,Intel,Software Engineer,9.0,9.0,2020,3,Arizona,0.015,0.023,0.938761,0.95457
20679,Cisco,Software Engineer,0.0,0.0,2020,8,California,0.013,0.001,0.888158,0.835571


### Transform data: standardize and one hot encoding

In [26]:
ct = ColumnTransformer([
    ('sc', StandardScaler(), make_column_selector(dtype_include=np.number)),
    #('ohe', OneHotEncoder(handle_unknown='ignore'), make_column_selector(dtype_include=object)),
    #('ohe', OneHotEncoder(handle_unknown='ignore'), ['company','title','year','month','state']),
    ('ohe', OneHotEncoder(handle_unknown='ignore'), [0,1,6])
    ])

X_train_ct = ct.fit_transform(X_train)
X_test_ct = ct.transform(X_test)

In [42]:
X_train_ct.shape

(18372, 1219)

In [25]:
X_train

Unnamed: 0,company,title,yearsofexperience,yearsatcompany,year,month,state,inflation_rate,inflation_rate_3mos,employment_rate,employment_rate_3mos
19069,Google,Product Manager,15.0,1.0,2020,7,Washington,0.010,0.003,0.898081,0.837230
3156,Amazon,Software Engineer,5.0,5.0,2019,3,Washington,0.019,0.019,0.955238,0.954989
14183,Twilio,Software Engineer,4.0,1.0,2020,4,California,0.003,0.025,0.836228,0.961390
13104,Intel,Software Engineer,9.0,9.0,2020,3,Arizona,0.015,0.023,0.938761,0.954570
20679,Cisco,Software Engineer,0.0,0.0,2020,8,California,0.013,0.001,0.888158,0.835571
...,...,...,...,...,...,...,...,...,...,...,...
21575,BlackRock,Product Manager,8.0,2.0,2020,8,California,0.013,0.001,0.888158,0.835571
5390,Amazon,Software Engineer,2.0,0.0,2019,7,Washington,0.018,0.020,0.957501,0.955663
860,Splunk,Product Manager,9.0,1.0,2018,9,Washington,0.023,0.029,0.956046,0.955998
15795,Facebook,Software Engineer,5.0,1.0,2020,5,California,0.001,0.023,0.835571,0.961092


In [22]:
ct.get_feature_names_out()

array(['sc__yearsofexperience', 'sc__yearsatcompany',
       'sc__inflation_rate', ..., 'ohe__state_Washington',
       'ohe__state_West Virginia', 'ohe__state_Wisconsin'], dtype=object)

### KNN 

In [38]:
knn = KNeighborsRegressor(n_jobs=-1)

knn_params = {
    'n_neighbors': [3,5,7,9,11,13,15,17,19]
    ,'weights': ['uniform', 'distance']
    ,'p': [1,2]   # this one makes the training time much longer, and didn't improve R2 significantly
}

gs_knn = GridSearchCV(estimator = knn,
                     param_grid = knn_params,
                     cv = 5)

# train the model

print(datetime.datetime.now())

gs_knn.fit(X_train_ct, y_train)

print(datetime.datetime.now())
print()

2022-01-04 12:09:34.704417
2022-01-04 12:31:22.106470



In [None]:
# predict
knn_pred_train = gs_knn.predict(X_train_ct)
knn_pred_test = gs_knn.predict(X_test_ct)

# evaluate
gs_knn_R2_train = gs_knn.score(X_train_ct, y_train)
gs_knn_R2_test = gs_knn.score(X_test_ct, y_test)

print(f'gs_knn_R2_train: {round(gs_knn_R2_train, 4)}')
print(f'gs_knn_R2_test: {round(gs_knn_R2_test, 4)}')

In [None]:
gs_knn_mse_train = metrics.mean_squared_error(y_train, knn_pred_train)
gs_knn_mse_test = metrics.mean_squared_error(y_test, knn_pred_test)

print(f'gs_knn_mse_train: {round(gs_knn_mse_train, 4)}')
print(f'gs_knn_mse_test: {round(gs_knn_mse_test, 4)}')

In [None]:
perf_dict = {
    'R2':[gs_knn_R2_train, gs_knn_R2_test],
    'mse':[gs_knn_mse_train, gs_knn_mse_test]
}

In [None]:
perf_df = pd.DataFrame(perf_dict, index=['Train','Test'])


In [40]:
gs_knn.best_estimator_

KNeighborsRegressor(n_jobs=-1, n_neighbors=13, p=1, weights='distance')

In [None]:
gs_knn.cv_results_

In [None]:
pd.DataFrame(gs_knn.cv_results_)

**Observation**: 
1. Comparing the R2 score and mse between the training and testing sets, the model is very overfit.
2. The best model from grid search CV uses 13 nearest neighbors, p=1, and distance as weights.

### Graident Boosting Classifer

#### No grid search

In [54]:
# fit the model
gbr = GradientBoostingRegressor(n_estimators=400,
                                max_depth=3,
                                max_features=800,
                                random_state=42)

print(datetime.datetime.now())

gbr.fit(X_train_ct, y_train)

print(datetime.datetime.now())

2022-01-05 15:52:09.839168
2022-01-05 15:54:41.358522


In [58]:
# predict
gbr_pred_train = gbr.predict(X_train_ct)
gbr_pred_test = gbr.predict(X_test_ct)

# evaluate
gbr_R2_train = gbr.score(X_train_ct, y_train)
gbr_R2_test = gbr.score(X_test_ct, y_test)

gbr_mse_train = metrics.mean_squared_error(y_train, gbr_pred_train)
gbr_mse_test = metrics.mean_squared_error(y_test, gbr_pred_test)

gbr_perf_dict = {
    'R2': [gbr_R2_train, gbr_R2_test],
    'MSE': [gbr_mse_train, gbr_mse_test]
}

gbr_perf_df = pd.DataFrame(gbr_perf_dict, index=['Train','Test'])
gbr_perf_df.T

Unnamed: 0,Train,Test
R2,0.597249,0.531825
MSE,6834.117462,8198.404325


#### GridSearchCV

##### GridSearch 0 
Model No. 0: killed after running for 19 hours

##### GridSearch 1

In [69]:
# build the grid search for hyperparameters 

gbr = GradientBoostingRegressor(random_state=42)

gbr_params = {
    'n_estimators': [300,400,500],
    #'learning_rate': [0.01, 0.1],
    'max_depth': [5,6,7,8],
    #'min_samples_split': [5,7,10],
    #'min_samples_leaf': [2,3,5],
    'max_features': [100,150,200]
}

gs_gbr = GridSearchCV(gbr,
                     gbr_params,
                     cv=5)

# train

print(datetime.datetime.now())

gs_gbr.fit(X_train_ct, y_train)

print(datetime.datetime.now())

2022-01-05 21:12:19.908880
2022-01-05 23:36:23.269022


In [78]:
# predict

gs_gbr_pred_train = gs_gbr.predict(X_train_ct)
gs_gbr_pred_test = gs_gbr.predict(X_test_ct)

# evaluate

gs_gbr_R2_train = gs_gbr.score(X_train_ct, y_train)
gs_gbr_R2_test = gs_gbr.score(X_test_ct, y_test)

print(f'gs_gbr_R2_train: {round(gs_gbr_R2_train, 4)}')
print(f'gs_gbr_R2_test: {round(gs_gbr_R2_test, 4)}')
print()

gs_gbr_mse_train = metrics.mean_squared_error(y_train, gs_gbr_pred_train)
gs_gbr_mse_test = metrics.mean_squared_error(y_test, gs_gbr_pred_test)

print(f'gs_gbr_mse_train: {round(gs_gbr_mse_train, 4)}')
print(f'gs_gbr_mse_test: {round(gs_gbr_mse_test, 4)}')

gs_gbr_perf_dict = {
    'R2': [gs_gbr_R2_train, gs_gbr_R2_test],
    'MSE': [gs_gbr_mse_train, gs_gbr_mse_test]
}

gs_gbr_perf_df = pd.DataFrame(gs_gbr_perf_dict, index=['Train','Test'])
gs_gbr_perf_df.T

gs_gbr_R2_train: 0.7683
gs_gbr_R2_test: 0.5496

gs_gbr_mse_train: 3932.287
gs_gbr_mse_test: 7887.0258


Unnamed: 0,Train,Test
R2,0.768261,0.549606
MSE,3932.28703,7887.02579


In [79]:
gs_gbr.best_estimator_

GradientBoostingRegressor(max_depth=7, max_features=200, n_estimators=500,
                          random_state=42)

In [80]:
gs_gbr.best_score_

0.5460109618022845

In [81]:
# pd.DataFrame(gs_gbr.cv_results_)

##### GridSearch 2

In [66]:
# build the grid search for hyperparameters 

gbr2 = GradientBoostingRegressor(random_state=42)

gbr_params = {
    'n_estimators': [400],
    #'learning_rate': [0.01, 0.1],
    'max_depth': [3,4,5,6],
    #'min_samples_split': [5,7,10],
    #'min_samples_leaf': [2,3,5],
    'max_features': [200, 300, 400]
}

gs_gbr2 = GridSearchCV(gbr,
                     gbr_params,
                     cv=5)

# train

print(datetime.datetime.now())

gs_gbr2.fit(X_train_ct, y_train)

print(datetime.datetime.now())

2022-01-05 18:54:09.653189
2022-01-05 19:58:42.168533


In [67]:
# predict

gs_gbr2_pred_train = gs_gbr2.predict(X_train_ct)
gs_gbr2_pred_test = gs_gbr2.predict(X_test_ct)

# evaluate

gs_gbr2_R2_train = gs_gbr2.score(X_train_ct, y_train)
gs_gbr2_R2_test = gs_gbr2.score(X_test_ct, y_test)

print(f'gs_gbr2_R2_train: {round(gs_gbr2_R2_train), 4}')
print(f'gs_gbr2_R2_test: {round(gs_gbr2_R2_test), 4}')
print()

gs_gbr2_mse_train = metrics.mean_squared_error(y_train, gs_gbr2_pred_train)
gs_gbr2_mse_test = metrics.mean_squared_error(y_test, gs_gbr2_pred_test)

print(f'gs_gbr2_mse_train: {round(gs_gbr2_mse_train), 4}')
print(f'gs_gbr2_mse_test: {round(gs_gbr2_mse_test), 4}')

gs_gbr2_perf_dict = {
    'R2': [gs_gbr2_R2_train, gs_gbr2_R2_test],
    'MSE': [gs_gbr2_mse_train, gs_gbr2_mse_test]
}

gs_gbr2_perf_df = pd.DataFrame(gs_gbr2_perf_dict, index=['Train','Test'])
gs_gbr2_perf_df.T

gs_gbr2_R2_train: (1.0, 4)
gs_gbr2_R2_test: (1.0, 4)

gs_gbr2_mse_train: (4778.0, 4)
gs_gbr2_mse_test: (7860.0, 4)


Unnamed: 0,Train,Test
R2,0.7184,0.551165
MSE,4778.349247,7859.727603


In [68]:
gs_gbr2.best_estimator_

GradientBoostingRegressor(max_depth=6, max_features=200, n_estimators=400,
                          random_state=42)

In [None]:
pd.DataFrame(gs_gbr2.cv_results_)

#### RandomizedSearchCV
[**Documentation on RandomizedSearchCV**](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html)

In [82]:
# build a randomized search for hyperparameters - killed after running 5 hours

'''
gbr = GradientBoostingRegressor(random_state=42)

gbr_params = {
    'n_estimators': range(300, 501, 100),
    #'learning_rate':[0.01, 0.1],
    'max_depth': range(2, 5, 1),
    #'min_samples_split': [5,7,10],
    #'min_samples_leaf': [2,3,5],
    'max_features': range(200, 501, 100)
}

rs_gbr = RandomizedSearchCV(gbr,
                            gbr_params,
                            n_iter=50,
                            random_state=42,
                            cv=5)

# train the model

print(datetime.datetime.now())

rs_gbr.fit(X_train_ct, y_train)

print(datetime.datetime.now())
'''

"\ngbr = GradientBoostingRegressor(random_state=42)\n\ngbr_params = {\n    'n_estimators': range(300, 501, 100),\n    #'learning_rate':[0.01, 0.1],\n    'max_depth': range(2, 5, 1),\n    #'min_samples_split': [5,7,10],\n    #'min_samples_leaf': [2,3,5],\n    'max_features': range(200, 501, 100)\n}\n\nrs_gbr = RandomizedSearchCV(gbr,\n                            gbr_params,\n                            n_iter=50,\n                            random_state=42,\n                            cv=5)\n\n# train the model\n\nprint(datetime.datetime.now())\n\nrs_gbr.fit(X_train_ct, y_train)\n\nprint(datetime.datetime.now())\n"

In [None]:
# predict
rs_gbr_pred_train = rs_gbr.predict(X_train_ct)
rs_gbr_pred_test = rs.gbr.predict(X_test_ct)


# evaluate
rs_gbr_R2_train = rs_gbr.score(X_train_ct, y_train)
rs_gbr_R2_test = rs.gbr.score(X_test_ct, y_test)

print(f'rs_gbr_R2_train: {round(rs_gbr_R2_train), 4}')
print(f'rs_gbr_R2_test: {round(rs_gbr_R2_test), 4}')

rs_gbr_mse_train = metrics.mean_squared_error(y_train, rs_gbr_pred_train)
rs_gbr_mse_test = metrics.mean_squared_error(y_test, rs_gbr_pred_test)

rs_gbr_perf_dict = {
    'R2':  [rs_gbr_R2_train,  rs_gbr_R2_test],
    'MSE': [rs_gbr_mse_train, rs_gbr_mse_test]
}

rs_gbr_perf_df = pd.DataFrame(rs_gbr_perf_dict, index=['Train','Test'])
rs_gbr_perf_df.T

In [None]:
rs_gbr.best_estimator_

In [None]:
pd.DataFrame(rs_gbr.cv_results_)

#### BayesSearchCV
[**Documentation on skopt.BayesSearchCV**](https://scikit-optimize.github.io/stable/modules/generated/skopt.BayesSearchCV.html)

In [None]:
# build the parameter search CV - didn't get to run due to time limit

gbr = GradientBoostingRegressor(random_state=42)

gbr_params = {
    'n_estimators': [400, 600, 800],
    'learning_rate': [0.01, 0.1],
    'max_depth': [2,3,5,7],
    'min_samples_split': [5,7,10],
    'min_samples_leaf': [2,3,5],
    'max_features': [400, 600, 800, 1000] 
}

bs_gbr = BayesSearchCV(gbr,
                       gbr_params,
                       n_iter=32,
                       random_state=42)

# train
print(datetime.datetime.now())

bs_gbr.fit(X_train_ct, y_train)

print(datetime.datetime.now())

In [None]:
# predict
bs_gbr_pred_train = bs_gbr.predict(X_train_ct)
bs_gbr_pred_test = bs_gbr.predict(X_teset_ct)

# evaluate
bs_gbr_R2_train = bs_gbr.score(X_train_ct, y_train)
bs_gbr_R2_test = bs_gbr.score(X_test_ct, y_test)

print(f'bs_gbr_R2_train: {round(bs_gbr_R2_train), 4}')
print(f'bs_gbr_R2_test: {round(bs_gbr_R2_test), 4}')
print()

bs_gbr_mse_train = metrics.mean_squared_error(X_train_ct, y_train)
bs_gbr_mse_test = metrics.mean_squared_error(X_test_ct, y_test)

print(f'bs_gbr_mse_train, {round(bs_gbr_mse_train), 4}')
print(f'bs_gbr_mse_test, {round(bs_gbr_mse_test), 4}')

In [None]:
bs_gbr.best_estimator_

In [None]:
pd.DataFrame(bs_gbr.cv_results_)

## Sileshi

## Mason