In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score,mean_absolute_error

# Read in files and print the information
train_feature_df = pd.read_csv('data/train_features.csv')
train_target_df = pd.read_csv('data/train_salaries.csv')
test_feature_df = pd.read_csv('data/test_features.csv')

# Merge the features and salaries on jobId, delete original file to save memory
train_df = pd.merge(train_feature_df, train_target_df, on='jobId')

# Remove data with zero salaries
train_df = train_df[train_df.salary > 8.5]
train_df = train_df.drop(columns =['jobId', 'companyId'])

# define a function to encode the categorical variables 
def Encode_Data(df):   
    for col in df.columns:        
        if df[col].dtype.name == "category" or "object":
            le = LabelEncoder()
            df[col]=le.fit_transform(df[col])
    return df

#check encoded data
df = Encode_Data(train_df)
df = df.astype('float32')

target = df['salary']
features = df.drop('salary',axis=1)

x = features
y = target
#splitting datasets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25, random_state = 0)


LR = LinearRegression()
RF = RandomForestRegressor(n_estimators=200, n_jobs=4, max_depth=30, 
                           min_samples_split=60, max_features='auto', verbose=0)
GB = GradientBoostingRegressor(n_estimators=160, max_depth=6, loss='ls',verbose=0)

#initialize model list and dicts
models = []
mean_mse = {}
cv_std = {}

def training_model(model,x_train,y_train, mean_mse, cv_std):
    print('\033[1m'+ 'Model Name: \n' + '\033[0m',model)
    neg_mse= cross_val_score(model,x_test,y_test,cv=5,scoring='neg_mean_squared_error')
    mean_mse[model] = np.mean(neg_mse)
    cv_std[model] = np.std(neg_mse)
    print('\033[1m' + 'Negative Mean Squared Error:\n' + '\033[0m', mean_mse[model])
    print('\033[1m' + 'Standard Deviation:\n' + '\033[0m', cv_std[model])
    
#Evaluate models with 5 fold cross validation
models.extend([LR,RF,GB])
for model in models: 
    training_model(model, x_train, y_train, mean_mse,cv_std)
    
#select the model with the lowest error as your "prodcuction" model
#use max function because using negative MSE and therefore want it as big as possible
bestmodel = max(mean_mse, key=mean_mse.get) 
print('\nBest model is: \n', bestmodel)

bs = bestmodel.fit(x,y)

import pickle

# Saving model using pickle
pickle.dump(bs, open('model.pkl','wb'))

[1mModel Name: 
[0m LinearRegression()
[1mNegative Mean Squared Error:
[0m -922.7896240234375
[1mStandard Deviation:
[0m 5.278781464679644
[1mModel Name: 
[0m RandomForestRegressor(max_depth=30, min_samples_split=60, n_estimators=200,
                      n_jobs=4)
[1mNegative Mean Squared Error:
[0m -387.08948677179615
[1mStandard Deviation:
[0m 3.3636857908144613
[1mModel Name: 
[0m GradientBoostingRegressor(max_depth=6, n_estimators=160)
[1mNegative Mean Squared Error:
[0m -358.63140227428664
[1mStandard Deviation:
[0m 2.599476938043089

Best model is: 
 GradientBoostingRegressor(max_depth=6, n_estimators=160)
