# Modeling

## Hans

## Annie

## Sileshi

## Mason

In [1]:
# imports
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn import metrics 
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer, make_column_selector, make_column_transformer

In [2]:
# Loading dataset
df = pd.read_csv('../Data/salary_cleaned.csv')
df.head()

Unnamed: 0,timestamp,company,level,title,totalyearlycompensation,location,yearsofexperience,yearsatcompany,tag,year,month,year_month,timestamp_3mos,year_month_3mos,state_short,inflation_rate,inflation_rate_3mos,state,employment_rate,employment_rate_3mos
0,2018-06-03 13:58:20,Yahoo,IC2,Software Engineer,160.5,"Sunnyvale, CA",0.58,0.58,Full Stack,2018,6,2018-06,2018-03-03 13:58:20,2018-03,CA,0.029,0.024,California,0.95766,0.956797
1,2018-06-04 20:28:22,Facebook,E3,Software Engineer,165.0,"Seattle, WA",1.0,1.0,Full Stack,2018,6,2018-06,2018-03-04 20:28:22,2018-03,WA,0.029,0.024,Washington,0.955998,0.954978
2,2018-06-05 00:56:33,VmWare,Senior MTS,Software Engineer,218.0,"Palo Alto, CA",8.0,1.0,Distributed Systems (Back-End),2018,6,2018-06,2018-03-05 00:56:33,2018-03,CA,0.029,0.024,California,0.95766,0.956797
3,2018-06-05 01:19:05,Uber,L4,Software Engineer,240.0,"San Francisco, CA",3.0,0.0,Web Development (Front-End),2018,6,2018-06,2018-03-05 01:19:05,2018-03,CA,0.029,0.024,California,0.95766,0.956797
4,2018-06-05 07:13:17,Capital One,Master Software Engineer,Software Engineer,196.0,"New York, NY",8.0,2.0,iOS,2018,6,2018-06,2018-03-05 07:13:17,2018-03,NY,0.029,0.024,New York,0.959053,0.955962


In [3]:
# Converting Year and Month to string 
df['year'] = df['year'].astype(str)
df['month'] = df['month'].astype(str)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24496 entries, 0 to 24495
Data columns (total 20 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   timestamp                24496 non-null  object 
 1   company                  24496 non-null  object 
 2   level                    24496 non-null  object 
 3   title                    24496 non-null  object 
 4   totalyearlycompensation  24496 non-null  float64
 5   location                 24496 non-null  object 
 6   yearsofexperience        24496 non-null  float64
 7   yearsatcompany           24496 non-null  float64
 8   tag                      24496 non-null  object 
 9   year                     24496 non-null  object 
 10  month                    24496 non-null  object 
 11  year_month               24496 non-null  object 
 12  timestamp_3mos           24496 non-null  object 
 13  year_month_3mos          24496 non-null  object 
 14  state_short           

In [4]:
# Features and target variable
features = ['company', 'title', 'yearsofexperience', 'yearsatcompany', 'year', 'month', 'state_short', 'inflation_rate', 'inflation_rate_3mos', 'employment_rate', 'employment_rate_3mos']

X = df[features]
y = df['totalyearlycompensation']

In [5]:
# Train/Test Split

X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=42)

In [6]:
# ColumnTransformer
ct = ColumnTransformer([
    ('sc', StandardScaler(), make_column_selector(dtype_exclude=object)),
    ('encoder', OneHotEncoder(handle_unknown='ignore'), ['company', 'title', 'state_short', 'year', 'month'])],
    n_jobs=-1,
    remainder='passthrough')

In [7]:
X_train_ct = ct.fit_transform(X_train)
X_test_ct = ct.transform(X_test)

In [8]:
X_train_ct.shape

(18372, 1234)

In [9]:
X_test_ct.shape

(6124, 1234)

#### RandomForest Regression

In [25]:
# Instantiate RandomForestRegression
rf = RandomForestRegressor(random_state=42)

# Parameters
rf_params={
    'n_estimators':[100, 200, 300, 400, 500],
    'criterion': ['mse'],
    'max_depth': [2, 4, 6, 8, 10, 12],
    'min_samples_split': [2, 3, 4, 5, 6],
    'min_samples_leaf': [1, 2, 3, 4, 5],
}

# Instantiating RFR Gridsearch
rf_gs = GridSearchCV(rf, rf_params, cv=5, verbose=1, n_jobs=-1)

# Fitting GridSearch to the data
rf_gs.fit(X_train_ct, y_train)

Fitting 5 folds for each of 750 candidates, totalling 3750 fits


KeyboardInterrupt: 

In [24]:
# Train/Test Scores
print(f'Best Score: {rf_gs.best_score_}')
print(f'Train Score: {rf_gs.score(X_train_ct, y_train)}')
print(f'Test Score: {rf_gs.score(X_test_ct, y_test)}')

# Best Parameters
print(f'Best parameters: {rf_gs.best_params_}')

# Best Estimators
print(f'Best Estimators: {rf_gs.best_estimator_}')

Best Train Score: 0.41540868532825537
Test Score: 0.41070684312493977
Best parameters: {'criterion': 'mse', 'max_depth': 6, 'min_samples_leaf': 3, 'min_samples_split': 2, 'n_estimators': 200}
Best Estimators: RandomForestRegressor(max_depth=6, min_samples_leaf=3, n_estimators=200,
                      random_state=42)


In [17]:
# Predictions
rf_preds_train = rf_gs.predict(X_train_ct)
rf_preds_test = rf_gs.predict(X_test_ct)

In [18]:
# Evaluation
rf_score_train = rf_gs.score(X_train_ct, y_train)
rf_score_test = rf_gs.score(X_test_ct, y_test)

rf_mse_train = metrics.mean_squared_error(y_train, rf_preds_train)
rf_mse_test = metrics.mean_squared_error(y_test, rf_preds_test)

In [19]:
# create the perf evaluation output df
perf_dict = {
    'R2': [rf_score_train, rf_score_test],
    'mse': [rf_mse_train, rf_mse_test]
}

perf_df = pd.DataFrame(perf_dict, index=['train','test']).T

# perf_df.to_csv('./randomforest_perf.csv')
# Data was obtained via AWS

In [20]:
# create the prediction output df
data_test_copy = X_test.copy()

In [21]:
# Predictions
data_test_copy['totalcomp'] = y_test
data_test_copy['predictions'] = rf_preds_test

# data_test_copy.to_csv('./randomforest_pred.csv', index=False)
# Data was obtained via AWS


In [22]:
# CV results
cv_result = pd.DataFrame(rf_gs.cv_results_)

# cv_result.to_csv('./randomforest_CV_Result.csv', index=False)
# Data was obtained via AWS

#### AdaBoost Regression

In [10]:
# Instantiate AdaBoostRegressor
abr = AdaBoostRegressor(random_state=42)

# Parameters
abr_params ={
    'base_estimator': [None],
    'n_estimators': [50, 100, 150, 200],
    'learning_rate': [1, 3],
    'loss': ['linear', 'square', 'exponential'],
}

# Instantiate ABR GridSearch
abr_gs = GridSearchCV(abr, abr_params, cv=5, verbose=1, n_jobs=-1)

# Fitting Gridsearch to the data
abr_gs.fit(X_train_ct, y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


GridSearchCV(cv=5, estimator=AdaBoostRegressor(random_state=42), n_jobs=-1,
             param_grid={'base_estimator': [None], 'learning_rate': [1, 3],
                         'loss': ['linear', 'square', 'exponential'],
                         'n_estimators': [50, 100, 150, 200]},
             verbose=1)

In [11]:
# Predictions
abr_preds_train = abr_gs.predict(X_train_ct)
abr_preds_test = abr_gs.predict(X_test_ct)

# Evaluation
abr_score_train = abr_gs.score(X_train_ct, y_train)
abr_score_test = abr_gs.score(X_test_ct, y_test)

abr_mse_train = metrics.mean_squared_error(y_train, abr_preds_train)
abr_mse_test = metrics.mean_squared_error(y_test, abr_preds_test)

In [13]:
# create the perf evaluation output df
abr_perf_dict = {
    'R2': [abr_score_train, abr_score_test],
    'mse': [abr_mse_train, abr_mse_test]
}

abr_perf_df = pd.DataFrame(abr_perf_dict, index=['train','test']).T

abr_perf_df.to_csv('../Data/adaboost_perf.csv')


In [14]:
# create the prediction output df
abrdata_test_copy = X_test.copy()

# Predictions
abrdata_test_copy['totalcomp'] = y_test
abrdata_test_copy['predictions'] = abr_preds_test

abrdata_test_copy.to_csv('../Data/adaboost_pred.csv', index=False)

In [15]:
# CV results
cv_result = pd.DataFrame(abr_gs.cv_results_)

cv_result.to_csv('../Data/adaboost_CV_Result.csv', index=False)