### After the preprocessing step we will be eastablishing the baseline for our model. Also, we know that our problem can be best solved by Regression methods.

In [38]:
# importing necessary packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import cross_val_score, KFold, train_test_split
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.preprocessing import LabelEncoder, PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings("ignore")

#__author__ = Monish Khambhati
#__email__ = monish.khambhati@gmail.com

In [2]:
# importing the train data
train_df = pd.read_csv('data/train_features.csv')

In [3]:
# importing the target features data
target_df = pd.read_csv('data/train_salaries.csv')

In [4]:
# importing the test features data
test_df = pd.read_csv('data/test_features.csv')

In [5]:
# Merging the train and target dataframe
train_df = pd.merge(left = train_df,right = target_df,how='inner', on='jobId')

From the EDA conclusion we know that **jobId** is not related to the target variables. So, I will start by dropping those fetaures from train and test data. And also choosing the salaries which are greater than 0 in **train_df**.

In [6]:
# function to clean the dataframe
def clean_data(raw_df):
    '''remove rows that contain salary <= 0 or duplicate job IDs'''
    clean_df = raw_df.drop_duplicates(subset='jobId')
    clean_df = clean_df[clean_df.salary>0]
    return clean_df

In [7]:
train_df = clean_data(train_df)

In [8]:
test_df = test_df.drop('jobId', axis=1)

Before feeding data to our model we will be writing a function to perform **One Hot Encoding**.
 - One hot encoding is a process by which categorical variables are converted into a form that could be provided to ML algorithms to do a better job in prediction.

In [9]:
def one_hot_encode_df(df, cat_vars=None, num_vars=None):
    '''performs one-hot encoding on all categorical variables and combines result with continous variables'''
    cat_df = pd.get_dummies(df[cat_vars])
    num_df = df[num_vars].apply(pd.to_numeric)
    return pd.concat([cat_df, num_df], axis=1)

In [10]:
# defining categorical variable and numerical variables
categorical_vars = ['companyId','jobType', 'degree', 'major', 'industry']
numerical_vars = ['yearsExperience', 'milesFromMetropolis']
target_var = 'salary'

In [11]:
train_df = one_hot_encode_df(train_df,cat_vars=categorical_vars, num_vars=numerical_vars)

In [12]:
train_df.head()

Unnamed: 0,companyId_COMP0,companyId_COMP1,companyId_COMP10,companyId_COMP11,companyId_COMP12,companyId_COMP13,companyId_COMP14,companyId_COMP15,companyId_COMP16,companyId_COMP17,...,major_PHYSICS,industry_AUTO,industry_EDUCATION,industry_FINANCE,industry_HEALTH,industry_OIL,industry_SERVICE,industry_WEB,yearsExperience,milesFromMetropolis
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,10,83
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,3,73
2,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,1,0,0,0,10,38
3,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,8,17
4,0,0,0,0,0,0,0,0,0,0,...,1,0,0,1,0,0,0,0,8,16


## Splitting data into training and validation set and establishing the baseline
 - I will be using sklearn to split the data into training and validation set into 80:20 feataure
 - Also, the metric that we will choose in this case is Mean Squared Error(MSE). Mean squared error measures the average of the squares of errors, i.e, the difference between actual value (y) and the estimated value (ŷ).
 - Baseline model would be a simple linear regession model and we will be using it to hypothesize solutions based on the results of the baseline.

In [13]:
# Extracting target variable from train_salaries which are greater than 0
target_df = clean_data(target_df)

In [14]:
target_df = target_df.drop('jobId',axis = 1)

In [15]:
# Splitting the data into train and validation set
X_train_data, X_test_data, y_train_data, y_test_data = train_test_split(train_df, target_df, test_size=0.2, random_state=1)

In [16]:
test_df = one_hot_encode_df(test_df,cat_vars=categorical_vars, num_vars=numerical_vars)

### Linear Regression

In [17]:
# Creating a linear regression object
lr = LinearRegression()

# Fitting the model with the train data
lr.fit(X_train_data, y_train_data)

# Predicting the model on the validation data
y_predict = lr.predict(X_test_data)
print("The first 5 predictied salaries: ", y_predict[0:5])

The first 5 predictied salaries:  [[157.76965332]
 [ 62.42608643]
 [108.85063171]
 [112.38508606]
 [ 92.39764404]]


In [18]:
# Evaluating our model using validation set by calculating mean squared error
mse = mean_squared_error(y_test_data, y_predict)

In [19]:
print(mse)

384.8722111900365


In [20]:
#Prediction accuracy using k fold cross validation
Rcross = cross_val_score(lr, y_test_data, y_predict, cv = 5)

print("The k-cross validation accuracy is: ", (Rcross.mean(), Rcross.std()))

The k-cross validation accuracy is:  (0.7444241030428751, 0.002713188048878918)


## Hypothesize a solution
#### On the baseline simple regression model the MSE is 384.87. We would try other techniques and also do feature engineering and hyperparameter tuning to imrove model on test set. 
 - The models we will be using to achieve better accuracy are:
     - Random Forest Regressor
     - Gradient Boosting Regressor

### Random Forest Regressor

In [35]:
rf = RandomForestRegressor(n_estimators = 60, max_depth = 25, 
                           min_samples_split = 20, n_jobs = 2, 
                           max_features = 30)

In [37]:
#Fitting the object to training data
rf.fit(X_train_data, y_train_data)

  


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=25,
           max_features=30, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=20, min_weight_fraction_leaf=0.0,
           n_estimators=60, n_jobs=2, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [42]:
# Checking the model accuracy on test set
rf.score(X_test_data, y_test_data)

0.7535437432666752

In [48]:
# Making predictions on test data
y_predict_rf = rf.predict(X_test_data)
print("First 5 Predictions on test set: ", y_predict_rf[0:5])

First 5 Predictions on test set:  [155.15944532  61.24348629  96.32195964 102.75388373  93.19703172]


In [51]:
# Calculating mean squared error on test set
mean_squared_error(y_test_data, y_predict_rf)

371.17114569262134

### Gradient Boosting Regressor

In [50]:
#Creating Gradient Boosting Regressor object
gd = GradientBoostingRegressor(n_estimators = 60, max_depth = 5,loss = 'ls', verbose = 5)

In [52]:
#Fitting object to data
gd.fit(X_train_data, y_train_data)

      Iter       Train Loss   Remaining Time 
         1        1349.8934           12.17m
         2        1228.8101           10.40m
         3        1128.7405            9.54m
         4        1044.5412            9.30m
         5         973.8131            9.03m
         6         912.9784            8.82m
         7         860.9562            8.46m
         8         813.5527            8.22m
         9         771.8185            8.03m
        10         735.6256            7.83m
        11         702.8820            7.65m
        12         675.2100            7.44m
        13         647.1429            7.24m
        14         622.9292            7.07m
        15         602.7250            6.89m
        16         582.9249            6.73m
        17         564.0264            6.55m
        18         547.9513            6.38m
        19         533.2990            6.21m
        20         520.3717            6.05m
        21         509.0961            5.89m
        2

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=5, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=60, presort='auto', random_state=None,
             subsample=1.0, verbose=5, warm_start=False)

In [54]:
y_predict_gd = gd.predict(X_test_data)

In [55]:
mean_squared_error(y_test_data, y_predict_gd)

371.70423609623913

In [None]:
def consolidate_data(df1, df2, key=None, left_index=False, right_index=False):
    '''perform inner join to return only records that are present in both dataframes'''
    return pd.merge(left=df1, right=df2, how='inner', on=key, left_index=left_index, right_index=right_index)

def get_target_df(df, target):
    '''returns target dataframe'''
    return df[target]

def train_model(model, feature_df, target_df, num_procs, mean_mse, cv_std):
    neg_mse = cross_val_score(model, feature_df, target_df, cv=2, n_jobs=num_procs, scoring='neg_mean_squared_error')
    mean_mse[model] = -1.0*np.mean(neg_mse)
    cv_std[model] = np.std(neg_mse)

def print_summary(model, mean_mse, cv_std):
    print('\nModel:\n', model)
    print('Average MSE:\n', mean_mse[model])
    print('Standard deviation during CV:\n', cv_std[model])

def save_results(model, mean_mse, predictions, feature_importances):
    '''saves model, model summary, feature importances, and predictions'''
    with open('model.txt', 'w') as file:
        file.write(str(model))
    feature_importances.to_csv('feature_importances.csv') 
    np.savetxt('predictions.csv', predictions, delimiter=',')

In [None]:
# initialize model lists and scripts
models = []
mean_mse = {}
cv_std = {}
res= {}

# number of processes to run in parellel
num_procs = 2

#shared model paramaters
verbose_lvl = 0

In [None]:
#create models -- hyperparameter tuning already done by hand for each mode

lr_std_pca = make_pipeline(StandardScaler(), PCA(), LinearRegression())
rf = RandomForestRegressor(n_estimators=150, n_jobs=num_procs, max_depth=25, min_samples_split=60, \
                           max_features=30, verbose=verbose_lvl)
gbm = GradientBoostingRegressor(n_estimators=150, max_depth=5, loss='ls', verbose=verbose_lvl)
                      
models.extend([lr, lr_std_pca, rf, gbm])

#parallel cross-validate models, using MSE as evaluation metric, and print summaries
print("Beginning cross validation")
for model in models:
    train_model(model, train_df, target_df, num_procs, mean_mse, cv_std)
    print_summary(model, mean_mse, cv_std)