# Modeling

In [1]:
# Imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, Lasso, LassoCV, ElasticNet, ElasticNetCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, RandomizedSearchCV, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, classification_report, mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import ColumnTransformer, make_column_selector, make_column_transformer
from sklearn.svm import SVR
from sklearn import metrics
from skopt import BayesSearchCV
# pip install scikit-optimize

import pickle
import datetime

## Hans

### SVM Model

In [2]:
df = pd.read_csv('../Data/salary_cleaned.csv')
df.head()

Unnamed: 0,timestamp,company,level,title,totalyearlycompensation,location,yearsofexperience,yearsatcompany,tag,year,month,year_month,timestamp_3mos,year_month_3mos,state_short,inflation_rate,inflation_rate_3mos,state,employment_rate,employment_rate_3mos
0,2018-06-03 13:58:20,Yahoo,IC2,Software Engineer,160.5,"Sunnyvale, CA",0.58,0.58,Full Stack,2018,6,2018-06,2018-03-03 13:58:20,2018-03,CA,0.029,0.024,California,0.95766,0.956797
1,2018-06-04 20:28:22,Facebook,E3,Software Engineer,165.0,"Seattle, WA",1.0,1.0,Full Stack,2018,6,2018-06,2018-03-04 20:28:22,2018-03,WA,0.029,0.024,Washington,0.955998,0.954978
2,2018-06-05 00:56:33,VmWare,Senior MTS,Software Engineer,218.0,"Palo Alto, CA",8.0,1.0,Distributed Systems (Back-End),2018,6,2018-06,2018-03-05 00:56:33,2018-03,CA,0.029,0.024,California,0.95766,0.956797
3,2018-06-05 01:19:05,Uber,L4,Software Engineer,240.0,"San Francisco, CA",3.0,0.0,Web Development (Front-End),2018,6,2018-06,2018-03-05 01:19:05,2018-03,CA,0.029,0.024,California,0.95766,0.956797
4,2018-06-05 07:13:17,Capital One,Master Software Engineer,Software Engineer,196.0,"New York, NY",8.0,2.0,iOS,2018,6,2018-06,2018-03-05 07:13:17,2018-03,NY,0.029,0.024,New York,0.959053,0.955962


In [3]:
# Discard the columns that are not used as model inputs in this section.
df = df.drop(columns=['level', 'timestamp', 'location', 'timestamp_3mos', 'state'])

In [4]:
df.columns

Index(['company', 'title', 'totalyearlycompensation', 'yearsofexperience',
       'yearsatcompany', 'tag', 'year', 'month', 'year_month',
       'year_month_3mos', 'state_short', 'inflation_rate',
       'inflation_rate_3mos', 'employment_rate', 'employment_rate_3mos'],
      dtype='object')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24496 entries, 0 to 24495
Data columns (total 15 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   company                  24496 non-null  object 
 1   title                    24496 non-null  object 
 2   totalyearlycompensation  24496 non-null  float64
 3   yearsofexperience        24496 non-null  float64
 4   yearsatcompany           24496 non-null  float64
 5   tag                      24496 non-null  object 
 6   year                     24496 non-null  int64  
 7   month                    24496 non-null  int64  
 8   year_month               24496 non-null  object 
 9   year_month_3mos          24496 non-null  object 
 10  state_short              24496 non-null  object 
 11  inflation_rate           24496 non-null  float64
 12  inflation_rate_3mos      24496 non-null  float64
 13  employment_rate          24496 non-null  float64
 14  employment_rate_3mos  

In [6]:
# Model inputs: raw categorical and numeric predictors, copied out of df.
X = df[['company', 'title', 'yearsofexperience', 'yearsatcompany', 'year', 'month', 'state_short', 'inflation_rate',
            'inflation_rate_3mos', 'employment_rate', 'employment_rate_3mos']].copy()
# One-hot encode the categorical columns; drop_first omits the first level of each.
X_dummy = pd.get_dummies(X, columns = ['company', 'title', 'state_short'], drop_first = True)

# Target is kept as a single-column DataFrame (2-D), not a Series.
y = df[['totalyearlycompensation']].copy()

X_dummy.head()

Unnamed: 0,yearsofexperience,yearsatcompany,year,month,inflation_rate,inflation_rate_3mos,employment_rate,employment_rate_3mos,company_ Credit Karma,company_23andMe,...,state_short_RI,state_short_SC,state_short_TN,state_short_TX,state_short_UT,state_short_VA,state_short_VT,state_short_WA,state_short_WI,state_short_WV
0,0.58,0.58,2018,6,0.029,0.024,0.95766,0.956797,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1.0,1.0,2018,6,0.029,0.024,0.955998,0.954978,0,0,...,0,0,0,0,0,0,0,1,0,0
2,8.0,1.0,2018,6,0.029,0.024,0.95766,0.956797,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3.0,0.0,2018,6,0.029,0.024,0.95766,0.956797,0,0,...,0,0,0,0,0,0,0,0,0,0
4,8.0,2.0,2018,6,0.029,0.024,0.959053,0.955962,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
X_train, X_test, y_train, y_test = train_test_split(X_dummy, y, test_size = 0.2, random_state = 42)

In [8]:
# Feature scaling: the scaler is fit on the training split only and the same
# fitted transformation is then applied to the test split.

sc = StandardScaler()

X_train_sc = sc.fit_transform(X_train)
X_test_sc = sc.transform(X_test)

In [None]:
# Fit an SVR with default hyperparameters on the standardized training features.
# y_train is a single-column DataFrame; it is flattened to 1-D because SVR
# expects a target of shape (n_samples,) and otherwise warns about a
# column-vector y.

svr = SVR()
svr.fit(X_train_sc, y_train.values.ravel())

  return f(*args, **kwargs)


In [None]:
# Predictions on the held-out split. The model was fit on standardized
# features, so it must be given the standardized test matrix (X_test_sc),
# not the raw X_test.
y_preds = svr.predict(X_test_sc)

# SVR.score returns R^2 (coefficient of determination), not accuracy.
print('train score:', svr.score(X_train_sc, y_train))
print('test score:', svr.score(X_test_sc, y_test))

In [None]:
# Grid search over the RBF-kernel SVR hyperparameters C and gamma.

params = {
    'C': [0.1, 1, 10, 100],
    'kernel': ['rbf'],
    'gamma': [0.001, 0.0001]
}

model = SVR()

# Exhaustive search using GridSearchCV's default cross-validation, in parallel.
gs = GridSearchCV(model, params, n_jobs=-1, verbose=1)

# Fit on the standardized training data. The single-column DataFrame target is
# flattened to 1-D so that every CV fit receives y with shape (n_samples,).
gs.fit(X_train_sc, y_train.values.ravel());

In [None]:
gs.best_params_

In [None]:
# R^2 of the refit best estimator on each split. GridSearchCV.score delegates
# to the estimator's own score method, which for SVR is R^2, so the values are
# labelled R2 rather than accuracy.
print(f'Training R2: {gs.score(X_train_sc, y_train)}')
print(f'Testing R2: {gs.score(X_test_sc, y_test)}')

In [None]:
# Creating Predictions
#train_preds = gs.predict(X_train_sc)
#test_preds = gs.predict(X_test_sc)

In [None]:
# DataFrame with column for predicted values.
#results = pd.DataFrame(gs.predict(X_test), columns=['predicted'])

# Column for observed values.
#results['actual'] = y_test

#results.head()

### Neural Networks

In [None]:
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

# Feed-forward regression network (17 -> 10 -> 1 units) trained with early
# stopping on the validation loss.
model_es = Sequential()

n_input = X_train_sc.shape[1]

model_es.add(Dense(17, input_dim=n_input, activation='relu'))
model_es.add(Dense(10, activation='relu'))
model_es.add(Dense(1))  # single linear output for the continuous target

# Only MAE is tracked next to the MSE loss: classification accuracy is not
# meaningful for a continuous target.
model_es.compile(loss='mse', optimizer='adam', metrics=['mae'])

# Stop once val_loss has failed to improve for 5 consecutive epochs.
early_stop = EarlyStopping(monitor='val_loss', min_delta=0, patience=5, verbose=1, mode='auto')

# Train on the standardized features so that the training inputs are on the
# same scale as the standardized validation inputs.
history_es = model_es.fit(
    X_train_sc,
    y_train,
    validation_data=(X_test_sc, y_test),
    epochs=100,
    batch_size=None,
    callbacks=[early_stop]
)

In [None]:
# Predict with the early-stopping network. The global name `model` is the
# unfitted SVR created for the grid search, not this Keras model.
model_es.predict(X_test_sc)

In [None]:
# Loss and compiled metrics on the test split for the early-stopping network
# (the sklearn object bound to `model` has no .evaluate method).
model_es.evaluate(X_test_sc, y_test) # kind of like .score() in sklearn

In [None]:
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.regularizers import l2

# Same 17 -> 10 -> 1 architecture, with an L2 penalty (0.001) on the kernel
# weights of every layer.
model_l2 = Sequential()

n_input = X_train.shape[1]

model_l2.add(Dense(
    17,
    input_dim=n_input,
    activation='relu',
    kernel_regularizer=l2(0.001)
))

model_l2.add(Dense(
    10,
    activation='relu',
    kernel_regularizer=l2(0.001)
))

model_l2.add(Dense(
    1,
    kernel_regularizer=l2(0.001)
))

# Only MAE is tracked next to the MSE loss: classification accuracy is not
# meaningful for a continuous target.
model_l2.compile(
    loss='mse',
    optimizer='adam',
    metrics=['mae']
)

history_l2 = model_l2.fit(
    X_train_sc,
    y_train,
    validation_data=(X_test_sc, y_test),
    epochs=100,
    batch_size=None
)

In [None]:
# Predict with the L2-regularized network. The global name `model` is the
# unfitted SVR created for the grid search, not this Keras model.
model_l2.predict(X_test_sc)

In [None]:
# Loss and compiled metrics on the test split for the L2-regularized network
# (the sklearn object bound to `model` has no .evaluate method).
model_l2.evaluate(X_test_sc, y_test) # kind of like .score() in sklearn

In [None]:
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.models import Sequential

# Network with a wide first hidden layer (one unit per input feature) and 20%
# dropout after each hidden layer.
model_dropout = Sequential()

n_input = X_train.shape[1]
n_hidden = n_input

model_dropout.add(Dense(n_hidden, input_dim=n_input, activation='relu'))
model_dropout.add(Dropout(0.2)) # refers to nodes in the first hidden layer
model_dropout.add(Dense(10, activation='relu'))
model_dropout.add(Dropout(0.2))
model_dropout.add(Dense(1))

# Only MAE is tracked next to the MSE loss: classification accuracy is not
# meaningful for a continuous target.
model_dropout.compile(loss='mse', optimizer='adam', metrics=['mae'])

history_dropout = model_dropout.fit(
    X_train_sc,
    y_train,
    validation_data=(X_test_sc, y_test),
    epochs=100,
    batch_size=None
)

In [None]:
# Predict with the dropout network. The global name `model` is the unfitted
# SVR created for the grid search, not this Keras model.
model_dropout.predict(X_test_sc)

In [None]:
# Loss and compiled metrics on the test split for the dropout network
# (the sklearn object bound to `model` has no .evaluate method).
model_dropout.evaluate(X_test_sc, y_test) # kind of like .score() in sklearn

## Annie

### KNN and Gradient Boosting Regressor

In [None]:
df = pd.read_csv('../Data/salary_cleaned.csv')
df.head()

In [None]:
df.info()

In [None]:
# Cast year and month from integers to strings in a single astype call,
# then re-inspect the column dtypes.
df = df.astype({'year': str, 'month': str})
df.info()

In [None]:
df['location'].value_counts(normalize=True)

In [None]:
# Candidate feature lists. features_all additionally carries 'location',
# 'year_month' and 'state_short'; features_short uses 'state' instead and is
# the list actually used to build X below.
features_all = ['company', 'title', 'location', 'yearsofexperience', 'yearsatcompany', 'year', 'month', 'year_month', 'state_short', 'inflation_rate', 'inflation_rate_3mos', 'employment_rate', 'employment_rate_3mos']
features_short = ['company', 'title', 'yearsofexperience', 'yearsatcompany', 'year', 'month', 'state', 'inflation_rate', 'inflation_rate_3mos', 'employment_rate', 'employment_rate_3mos']

# Predictors and (1-D Series) target for this section.
X = df[features_short]
y = df['totalyearlycompensation']
X.shape, y.shape

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
X_train.shape, y_train.shape, X_test.shape, y_test.shape

In [None]:
X_train.head()

#### Transform Data: Standardize and OneHotEncoding

In [None]:
# Standardize the numeric predictors and one-hot encode the categorical ones.
# The encoded columns are selected by name instead of by position ([0, 1, 6]),
# so the transformer keeps targeting company/title/state if the feature order
# ever changes.
# NOTE(review): 'year' and 'month' were cast to str earlier in this section, so
# they match neither selector and are dropped (ColumnTransformer's default
# remainder='drop'). Confirm that excluding them is intended.
ct = ColumnTransformer([
    ('sc', StandardScaler(), make_column_selector(dtype_include=np.number)),
    ('ohe', OneHotEncoder(handle_unknown='ignore'), ['company', 'title', 'state'])
    ])

# Fit on the training split only; reuse the fitted transformer on the test split.
X_train_ct = ct.fit_transform(X_train)
X_test_ct = ct.transform(X_test)

In [None]:
X_train_ct.shape

In [None]:
X_train

In [None]:
ct.get_feature_names_out()

### KNN

In [None]:
# KNN regressor tuned with a 5-fold grid search over the neighbor count,
# the weighting scheme and the Minkowski power p.
knn = KNeighborsRegressor(n_jobs=-1)

knn_params = {
    'n_neighbors': [3,5,7,9,11,13,15,17,19]
    ,'weights': ['uniform', 'distance']
    ,'p': [1,2]   # this one makes the training time much longer, and didn't improve R2 significantly
}

gs_knn = GridSearchCV(estimator = knn,
                     param_grid = knn_params,
                     cv = 5)

# train the model; the timestamps printed before and after bracket the fit time

print(datetime.datetime.now())

gs_knn.fit(X_train_ct, y_train)

print(datetime.datetime.now())
print()

In [None]:
# predict with the refit best estimator on both splits
knn_pred_train = gs_knn.predict(X_train_ct)
knn_pred_test = gs_knn.predict(X_test_ct)

# evaluate: score() of the search object on each split (reported as R2 below)
gs_knn_R2_train = gs_knn.score(X_train_ct, y_train)
gs_knn_R2_test = gs_knn.score(X_test_ct, y_test)

print(f'gs_knn_R2_train: {round(gs_knn_R2_train, 4)}')
print(f'gs_knn_R2_test: {round(gs_knn_R2_test, 4)}')

In [None]:
# Mean squared error of the KNN predictions on each split.
gs_knn_mse_train = metrics.mean_squared_error(y_train, knn_pred_train)
gs_knn_mse_test = metrics.mean_squared_error(y_test, knn_pred_test)

print(f'gs_knn_mse_train: {round(gs_knn_mse_train, 4)}')
print(f'gs_knn_mse_test: {round(gs_knn_mse_test, 4)}')

In [None]:
perf_dict = {
    'R2':[gs_knn_R2_train, gs_knn_R2_test],
    'mse':[gs_knn_mse_train, gs_knn_mse_test]
}

In [None]:
# Collect the KNN metrics into one table and display it (metrics as rows,
# splits as columns), the same way the gradient-boosting tables are shown.
perf_df = pd.DataFrame(perf_dict, index=['Train','Test'])
perf_df.T


In [None]:
gs_knn.best_estimator_

#### Pickling and Saving the model

In [None]:
# Persist the whole fitted grid-search object (not just the best estimator).
with open('../Models/totalcomp_gs_knn.pkl', 'wb') as f:
    pickle.dump(gs_knn, f)

In [None]:
'''
with open('../Models/totalcomp_gs_knn.pkl', 'rb') as f:
    gs_knn = pickle.load(f)
'''

In [None]:
#gs_knn.cv_results_

In [None]:
#pd.DataFrame(gs_knn.cv_results_)

**Observation**: 
1. Comparing the R2 score and mse between the training and testing sets, the model is very overfit.
2. The best model from grid search CV uses 13 nearest neighbors, p=1, and distance as weights.

### Gradient Boosting Regressor

#### No Gridsearch

In [None]:
# fit the model
# NOTE(review): max_features=800 presumably requires the transformed matrix to
# have at least 800 columns — confirm against X_train_ct.shape.
gbr = GradientBoostingRegressor(n_estimators=400,
                                max_depth=3,
                                max_features=800,
                                random_state=42)

# timestamps printed before and after bracket the fit time
print(datetime.datetime.now())

gbr.fit(X_train_ct, y_train)

print(datetime.datetime.now())

In [None]:
# predict on both splits
gbr_pred_train = gbr.predict(X_train_ct)
gbr_pred_test = gbr.predict(X_test_ct)

# evaluate: R^2 via the regressor's score(), MSE from the predictions
gbr_R2_train = gbr.score(X_train_ct, y_train)
gbr_R2_test = gbr.score(X_test_ct, y_test)

gbr_mse_train = metrics.mean_squared_error(y_train, gbr_pred_train)
gbr_mse_test = metrics.mean_squared_error(y_test, gbr_pred_test)

gbr_perf_dict = {
    'R2': [gbr_R2_train, gbr_R2_test],
    'MSE': [gbr_mse_train, gbr_mse_test]
}

# One row per split, transposed for display so metrics are rows.
gbr_perf_df = pd.DataFrame(gbr_perf_dict, index=['Train','Test'])
gbr_perf_df.T

#### GridSearchCV

##### GridSearch 0 
Model No. 0: killed after running for 19 hours

##### GridSearch 1

In [None]:
# build the grid search for hyperparameters 
# Note: this rebinds `gbr`, replacing the regressor fitted in the
# "No Gridsearch" section.

gbr = GradientBoostingRegressor(random_state=42)

# 3 x 4 x 3 = 36 combinations; the commented-out entries are not searched.
gbr_params = {
    'n_estimators': [300,400,500],
    #'learning_rate': [0.01, 0.1],
    'max_depth': [5,6,7,8],
    #'min_samples_split': [5,7,10],
    #'min_samples_leaf': [2,3,5],
    'max_features': [100,150,200]
}

gs_gbr = GridSearchCV(gbr,
                     gbr_params,
                     cv=5)

# train; timestamps printed before and after bracket the fit time

print(datetime.datetime.now())

gs_gbr.fit(X_train_ct, y_train)

print(datetime.datetime.now())

In [None]:
# predict with the refit best estimator on both splits

gs_gbr_pred_train = gs_gbr.predict(X_train_ct)
gs_gbr_pred_test = gs_gbr.predict(X_test_ct)

# evaluate: score() of the search object (reported as R2), then MSE

gs_gbr_R2_train = gs_gbr.score(X_train_ct, y_train)
gs_gbr_R2_test = gs_gbr.score(X_test_ct, y_test)

print(f'gs_gbr_R2_train: {round(gs_gbr_R2_train, 4)}')
print(f'gs_gbr_R2_test: {round(gs_gbr_R2_test, 4)}')
print()

gs_gbr_mse_train = metrics.mean_squared_error(y_train, gs_gbr_pred_train)
gs_gbr_mse_test = metrics.mean_squared_error(y_test, gs_gbr_pred_test)

print(f'gs_gbr_mse_train: {round(gs_gbr_mse_train, 4)}')
print(f'gs_gbr_mse_test: {round(gs_gbr_mse_test, 4)}')

gs_gbr_perf_dict = {
    'R2': [gs_gbr_R2_train, gs_gbr_R2_test],
    'MSE': [gs_gbr_mse_train, gs_gbr_mse_test]
}

# One row per split, transposed for display so metrics are rows.
gs_gbr_perf_df = pd.DataFrame(gs_gbr_perf_dict, index=['Train','Test'])
gs_gbr_perf_df.T

In [None]:
gs_gbr.best_estimator_

In [None]:
gs_gbr.best_score_

In [None]:
# pd.DataFrame(gs_gbr.cv_results_)

##### GridSearch 2

In [None]:
# build the grid search for hyperparameters 

gbr2 = GradientBoostingRegressor(random_state=42)

# 1 x 4 x 2 = 8 combinations; the commented-out entries are not searched.
gbr2_params = {
    'n_estimators': [400],
    #'learning_rate': [0.01, 0.1],
    'max_depth': [3,4,5,6],
    #'min_samples_split': [5,7,10],
    #'min_samples_leaf': [2,3,5],
    'max_features': [200, 300]
}

gs_gbr2 = GridSearchCV(gbr2,
                     gbr2_params,
                     cv=5)

# train; timestamps printed before and after bracket the fit time

print(datetime.datetime.now())

gs_gbr2.fit(X_train_ct, y_train)

print(datetime.datetime.now())

In [None]:
# predict with the refit best estimator on both splits

gs_gbr2_pred_train = gs_gbr2.predict(X_train_ct)
gs_gbr2_pred_test = gs_gbr2.predict(X_test_ct)

# evaluate: score() of the search object (reported as R2), then MSE

gs_gbr2_R2_train = gs_gbr2.score(X_train_ct, y_train)
gs_gbr2_R2_test = gs_gbr2.score(X_test_ct, y_test)

print(f'gs_gbr2_R2_train: {round(gs_gbr2_R2_train, 4)}')
print(f'gs_gbr2_R2_test: {round(gs_gbr2_R2_test, 4)}')
print()

gs_gbr2_mse_train = metrics.mean_squared_error(y_train, gs_gbr2_pred_train)
gs_gbr2_mse_test = metrics.mean_squared_error(y_test, gs_gbr2_pred_test)

print(f'gs_gbr2_mse_train: {round(gs_gbr2_mse_train, 4)}')
print(f'gs_gbr2_mse_test: {round(gs_gbr2_mse_test, 4)}')

gs_gbr2_perf_dict = {
    'R2': [gs_gbr2_R2_train, gs_gbr2_R2_test],
    'MSE': [gs_gbr2_mse_train, gs_gbr2_mse_test]
}

# One row per split, transposed for display so metrics are rows.
gs_gbr2_perf_df = pd.DataFrame(gs_gbr2_perf_dict, index=['Train','Test'])
gs_gbr2_perf_df.T

In [None]:
gs_gbr2.best_estimator_

In [None]:
# pd.DataFrame(gs_gbr2.cv_results_)

##### Pickling and Saving the Model

In [None]:
# Persist the whole fitted grid-search object (not just the best estimator).
with open('../Models/totalcomp_gs_gbr2.pkl', 'wb') as f:
    pickle.dump(gs_gbr2, f)

In [None]:
# Reload the grid-search object written by the previous cell.
# NOTE: pickle.load can execute arbitrary code; only load files you created yourself.
with open('../Models/totalcomp_gs_gbr2.pkl', 'rb') as f:
    gs_gbr2 = pickle.load(f)

In [None]:
# A single hypothetical observation to score with the saved model.
# Numeric features are given as numbers rather than strings, so the frame has
# numeric dtypes like the training data. 'year' and 'month' stay strings,
# matching the str cast applied to those columns earlier in this section.
# NOTE(review): the inflation_rate values visible in the data preview are
# around 0.02-0.03; 0.3 / 0.2 here look an order of magnitude larger — confirm.
new_data = {
    'company': 'Google',
    'title': 'Data Scientist',
    'yearsofexperience': 1.0,
    'yearsatcompany': 0.1,
    'year': '2020',
    'month': '9',
    'state': 'California',
    'inflation_rate': 0.3,
    'inflation_rate_3mos': 0.2,
    'employment_rate': 0.99,
    'employment_rate_3mos': 0.98
}

# One-row frame; index=[0] is needed because every value is a scalar.
new_data_df = pd.DataFrame(new_data, index=[0])
new_data_df

In [None]:
# Apply the already-fitted ColumnTransformer (transform only, no refit) so the
# new row is encoded into the feature space the model was trained on.
new_data_df_ct = ct.transform(new_data_df)
print(new_data_df_ct.shape)

# Predicted total yearly compensation for the new observation.
pred_gs_gbr2_new_data = gs_gbr2.predict(new_data_df_ct)
pred_gs_gbr2_new_data

## Sileshi

In [None]:
# Loading dataset. The path uses the same '../Data/' location as every other
# section of this notebook; './Data/' resolves to a different directory when
# the notebook runs from a single working directory.
df = pd.read_csv('../Data/salary_cleaned.csv')
df.head()

In [None]:
# Discard the columns that are not used as model inputs in this section.
df = df.drop(columns=['timestamp', 'location', 'level', 'timestamp_3mos', 'state'])

In [None]:
# Model inputs: raw categorical and numeric predictors, copied out of df.
X = df[['company', 'title', 'yearsofexperience', 'yearsatcompany', 'year', 'month', 'state_short', 'inflation_rate',
            'inflation_rate_3mos', 'employment_rate', 'employment_rate_3mos']].copy()
# One-hot encode the categorical columns; drop_first omits the first level of each.
X_dummy = pd.get_dummies(X, columns = ['company', 'title', 'state_short'], drop_first = True)

# Target is kept as a single-column DataFrame (2-D), not a Series.
y = df[['totalyearlycompensation']].copy()

X_dummy.head()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_dummy, y, test_size = 0.3, random_state = 42)

In [None]:
# Feature scaling: fit on the training split only, then transform the test split.
# The scaled arrays are named *_ct here although they come from a plain
# StandardScaler; these names overwrite the X_train_ct / X_test_ct produced by
# the ColumnTransformer in the previous section.

sc = StandardScaler()

X_train_ct = sc.fit_transform(X_train)
X_test_ct = sc.transform(X_test)


### Linear Regression

In [None]:
lr = LinearRegression()
lr.fit(X_train_ct,y_train)

In [None]:
lr.score(X_train_ct,y_train), lr.score(X_test_ct,y_test)


In [None]:
def display_R2_scores(model, X_train_ct, y_train, X_test_ct, y_test):
    """Print the cross-validation, training and testing scores of a model.

    Parameters
    ----------
    model : estimator exposing ``score(X, y)``; it is also handed to
        ``cross_val_score`` together with the training data.
    X_train_ct, y_train : training features and target.
    X_test_ct, y_test : held-out features and target.

    Returns
    -------
    None. Three lines are printed, each value rounded to 4 decimals.
    """
    # Mean of the per-fold scores returned by cross_val_score on the training data.
    cv_mean = cross_val_score(model, X_train_ct, y_train).mean()
    print(f'The mean cross validation score: {round(cv_mean,4)}.')

    train_score = model.score(X_train_ct, y_train)
    print(f'Training score: {round(train_score, 4)}.')

    test_score = model.score(X_test_ct, y_test)
    print(f'Testing score: {round(test_score,4)}.')

In [None]:
display_R2_scores(lr, X_train_ct, y_train, X_test_ct, y_test)

In [None]:
# Pair every fitted coefficient with its one-hot-expanded feature name and show
# the five most negative ones. lr was fit on the scaled version of X_train
# (built from X_dummy), so the names come from X_train.columns rather than from
# the pre-encoding X. coef_ is flattened because fitting on a 2-D target makes
# LinearRegression store the coefficients with shape (1, n_features).
pd.DataFrame({'features': X_train.columns, 'coefficients': np.ravel(lr.coef_)}).sort_values(by='coefficients', ascending=False).tail()

In [None]:
intercept = lr.intercept_
intercept

In [None]:
lr_preds = lr.predict(X_test_ct)

In [None]:
# training data mse
lr_predstr = lr.predict(X_train_ct)

# testing data mse
lr_predstst = lr.predict(X_test_ct)

In [None]:
print(round(metrics.mean_squared_error(y_test,lr_predstst), 4))

### Lasso

In [None]:
# 100 candidate alphas, log-spaced between 10**-1 and 10**0.
l_alphas = np.logspace(-1, 0,100)

# Lasso with alpha chosen by 5-fold cross-validation over that grid.
lasso_cv = LassoCV(alphas=l_alphas, cv=5, n_jobs=-1)

lasso_cv.fit(X_train_ct, y_train);

In [None]:
display_R2_scores(lasso_cv, X_train_ct, y_train, X_test_ct, y_test)

In [None]:
# training data mse
lasso_predstr = lasso_cv.predict(X_train_ct)

# testing data mse
lasso_predstst = lasso_cv.predict(X_test_ct)

### Ridge

In [None]:
# Ridge

# 100 candidate alphas, log-spaced between 10**0 and 10**5.
r_alphas = np.logspace(0,5,100)

# Ridge with alpha chosen by 5-fold cross-validation, scored by R^2.
ridge_cv = RidgeCV(alphas = r_alphas, scoring='r2', cv=5)

ridge_cv.fit(X_train_ct, y_train);

In [None]:
display_R2_scores(ridge_cv, X_train_ct, y_train, X_test_ct, y_test)

In [None]:
# training mse: predictions on the training split and their mean squared error
ridge_predstr = ridge_cv.predict(X_train_ct)
print(f'\nRidge train data MSE: {metrics.mean_squared_error(y_train,ridge_predstr)}.')

# testing data mse: same for the held-out split
ridge_predstst = ridge_cv.predict(X_test_ct)
print(f'Ridge test data MSE: {metrics.mean_squared_error(y_test,ridge_predstst)}.')

### Elastic Net

In [None]:
# 100 candidate alphas, log-spaced between 10**0 and 10**5.
e_alphas = np.logspace(0,5,100)

# Elastic net with alpha chosen by 5-fold cross-validation over that grid
# (l1_ratio is left at the estimator's default).
elastic_cv = ElasticNetCV(alphas = e_alphas, cv=5)

elastic_cv.fit(X_train_ct, y_train);

In [None]:
display_R2_scores(elastic_cv, X_train_ct, y_train, X_test_ct, y_test)

In [None]:
# training data mse
elastic_predstr = elastic_cv.predict(X_train_ct)

# testing data mse
elastic_predstst = elastic_cv.predict(X_test_ct)

### MSE

In [None]:
# training/testing data mse
# One line per linear model with its train/test mean squared error (not RMSE),
# rounded to 4 decimals, under a centered "MSE" banner.
print("MSE".center(30, "="))
print(f'\nLinear Regression train/test data MSE: {round(metrics.mean_squared_error(y_train,lr_predstr),4)}/{round(metrics.mean_squared_error(y_test,lr_predstst), 4)}, respectively.')
print(f'Lasso train/test data MSE: {round(metrics.mean_squared_error(y_train,lasso_predstr),4)}/{round(metrics.mean_squared_error(y_test,lasso_predstst), 4)}, respectively.')
print(f'Ridge train/test data MSE: {round(metrics.mean_squared_error(y_train,ridge_predstr),4)}/{round(metrics.mean_squared_error(y_test,ridge_predstst), 4)}, respectively.')
print(f'Elastic Net train/test data MSE: {round(metrics.mean_squared_error(y_train,elastic_predstr),4)}/{round(metrics.mean_squared_error(y_test,elastic_predstst), 4)}, respectively.')


### Final Scores

In [None]:
# Print the cross-validation, training and testing scores of each linear model
# under a centered banner, with a blank line between consecutive models.
final_models = [
    ("Linear Regression", lr),
    ("LASSO Regularization", lasso_cv),
    ("Ridge Regularization", ridge_cv),
    ("Elastic Net Regularization", elastic_cv),
]

for position, (banner, fitted) in enumerate(final_models):
    if position > 0:
        print()
    print(banner.center(30, "="))
    display_R2_scores(fitted, X_train_ct, y_train, X_test_ct, y_test)


## Final Score

|**Model**|**Training R^2 Score**|**Testing R^2 Score**|**MSE (Train/Test)**|
|--|--|--|--|
|**Linear Regression**|0.5193|-0.7293 × 10^23|8286.35 / 1224.82 × 10^29|
|**Lasso**|0.5182|0.5143|8305.30/8157.45|
|**Ridge**|0.52|0.5097|8274.20/8234.28|
|**Elastic Net**|0.4483|0.4499|9511.19/9238.88|

In [None]:
intercept = ridge_cv.intercept_
intercept

## Mason

In [None]:
# Loading dataset
df = pd.read_csv('../Data/salary_cleaned.csv')
df.head()

In [None]:
# Cast year and month to strings in a single astype call, then re-inspect
# the column dtypes.
df = df.astype({'year': str, 'month': str})
df.info()

In [None]:
# Features and target variable
# 'year' and 'month' are strings at this point (cast above); they are among
# the columns the ColumnTransformer below one-hot encodes.
features = ['company', 'title', 'yearsofexperience', 'yearsatcompany', 'year', 'month', 'state_short', 'inflation_rate', 'inflation_rate_3mos', 'employment_rate', 'employment_rate_3mos']

X = df[features]
y = df['totalyearlycompensation']

In [None]:
# Train/Test Split

X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=42)

In [None]:
# ColumnTransformer
# Standardizes every non-object column and one-hot encodes the five listed
# categorical columns (categories unseen at fit time are ignored at transform).
# Columns matched by neither transformer are passed through unchanged.
ct = ColumnTransformer([
    ('sc', StandardScaler(), make_column_selector(dtype_exclude=object)),
    ('encoder', OneHotEncoder(handle_unknown='ignore'), ['company', 'title', 'state_short', 'year', 'month'])],
    n_jobs=-1,
    remainder='passthrough')

In [None]:
X_train_ct = ct.fit_transform(X_train)
X_test_ct = ct.transform(X_test)

In [None]:
X_train_ct.shape

In [None]:
X_test_ct.shape

#### RandomForest Regression

In [None]:
# Instantiate RandomForestRegression
rf = RandomForestRegressor()

# Parameters
# 'squared_error' is the current name of the MSE split criterion; the old
# 'mse' alias was deprecated and later removed from scikit-learn.
# random_state is fixed through the grid so every candidate is reproducible.
rf_params={
    'n_estimators':[100, 200],
    'criterion': ['squared_error'],
    'max_depth': [2, 5],
    'min_samples_split': [2,4],
    'min_samples_leaf': [1,3],
    'random_state': [42]
}

# Instantiating RFR Gridsearch (5-fold CV, all cores)
rf_gs = GridSearchCV(rf, rf_params, cv=5, verbose=1, n_jobs=-1)

# Fitting GridSearch to the data
rf_gs.fit(X_train_ct, y_train)

In [None]:
# Scores of the tuned random forest. best_score_ is the mean cross-validated
# score of the best parameter combination; score() evaluates the refit best
# estimator on the held-out split.
print(f'Best Train Score: {rf_gs.best_score_}')
print(f'Best Test Score: {rf_gs.score(X_test_ct, y_test)}')

# Winning hyperparameter combination, then the estimator refit with it.
print(f'Best parameters: {rf_gs.best_params_}')
print(f'Best Estimators: {rf_gs.best_estimator_}')

#### AdaBoost Regression

In [None]:
# Instantiate AdaBoostRegressor
abr = AdaBoostRegressor()

# Parameters
# The base learner is left at the estimator's default instead of searching over
# 'base_estimator': [None]. None already was the default, and that parameter
# name was deprecated and later removed from scikit-learn, so keeping it in the
# grid would fail on newer versions.
abr_params ={
    'n_estimators': [50, 100, 150, 200],
    'learning_rate': [1, 3],
    'loss': ['linear', 'square', 'exponential'],
    'random_state': [42]
}

# Instantiate ABR GridSearch (5-fold CV, all cores)
abr_gs = GridSearchCV(abr, abr_params, cv=5, verbose=1, n_jobs=-1)

# Fitting Gridsearch to the data
abr_gs.fit(X_train_ct, y_train)

In [None]:
# Train/Test Scores of the AdaBoost grid search. This cell reports abr_gs, the
# AdaBoost search fitted above, not rf_gs (the random-forest search).
# best_score_ is the mean cross-validated score of the best combination.
print(f'Best Train Score: {abr_gs.best_score_}')
print(f'Best Test Score: {abr_gs.score(X_test_ct, y_test)}')

# Best Parameters
print(f'Best parameters: {abr_gs.best_params_}')

# Best Estimators
print(f'Best Estimators: {abr_gs.best_estimator_}')