#### In this notebook we will automate the entire process that we carried out in the data modeling notebook. We will be writing the code in the object-oriented paradigm. The three algorithms that we tested on our data in the data modeling phase were:
 - Linear Regression
 - Random Forest Regressor
 - Gradient Boosting Regressor
#### We will later automate the pipeline and also deploy the solution after selecting the model that performs better than the other two.

In [3]:
# importing necessary packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import cross_val_score
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

In [4]:
# Function to read the csv file and store it into the dataframe
def read_csv(file):
    """Load the CSV found at ``file`` into a pandas DataFrame and return it."""
    frame = pd.read_csv(file)
    return frame

In [5]:
# Function to clean the data. This is the same function that we used in the data modeling notebook
def clean_df(raw_df):
    """Return ``raw_df`` without repeated job postings or non-positive salaries.

    Rows are first de-duplicated on ``jobId`` (pandas keeps the first
    occurrence by default); afterwards only rows whose ``salary`` is strictly
    greater than zero are retained. Row index labels are left untouched.
    """
    unique_jobs = raw_df.drop_duplicates(subset='jobId')
    has_positive_salary = unique_jobs['salary'] > 0
    return unique_jobs[has_positive_salary]

# Function to perform an inner join of two dataframes on a given key or on their indexes

def consolidate_data(df1, df2, key=None, left_index=False, right_index=False):
    """Inner-join ``df1`` (left) with ``df2`` (right) and return the result.

    ``key`` is forwarded as the ``on`` argument of ``pd.merge``;
    ``left_index`` / ``right_index`` are forwarded unchanged so the join can
    be performed on the frames' indexes instead of on a column.
    """
    join_options = {
        'how': 'inner',
        'on': key,
        'left_index': left_index,
        'right_index': right_index,
    }
    return pd.merge(left=df1, right=df2, **join_options)

In [6]:
# Function to perform one hot encoding on the dataframe
def one_hot_encode_df(df, cat_vars=None, num_vars=None):
    """Build a model-ready feature frame from ``df``.

    Args:
        df: Source DataFrame.
        cat_vars: Column label(s) to one-hot encode with ``pd.get_dummies``.
            ``None`` (the default) contributes no columns.
        num_vars: Column label(s) to coerce with ``pd.to_numeric``.
            ``None`` (the default) contributes no columns.

    Returns:
        A DataFrame holding the encoded categorical columns followed by the
        numeric columns. When neither group is supplied, an empty frame that
        shares ``df``'s index is returned.
    """
    pieces = []
    # The defaults are None, and indexing df[None] raises a KeyError, so each
    # group is only processed when the caller actually supplied it.
    if cat_vars is not None:
        pieces.append(pd.get_dummies(df[cat_vars]))
    if num_vars is not None:
        pieces.append(df[num_vars].apply(pd.to_numeric))
    if not pieces:
        return pd.DataFrame(index=df.index)
    return pd.concat(pieces, axis=1)

In [7]:
# Function to return the target dataframe
def get_target_df(df, target):
    """Return whatever ``df[target]`` selects (the target column of ``df``)."""
    target_values = df[target]
    return target_values

In [8]:
# Function to train the models that we specified above
def train_models(model, feature_df, target_df, num_procs, mean_mse, cv_std, cv=2):
    """Cross-validate ``model`` and record its error statistics.

    Args:
        model: Estimator handed to ``cross_val_score``; it is also the key
            under which the results are stored.
        feature_df: Feature matrix passed to ``cross_val_score``.
        target_df: Target values passed to ``cross_val_score``.
        num_procs: Forwarded to ``cross_val_score`` as ``n_jobs``.
        mean_mse: Dict updated in place with the mean MSE over the folds.
        cv_std: Dict updated in place with the standard deviation of the
            fold scores.
        cv: Cross-validation setting forwarded to ``cross_val_score``.
            Defaults to 2, the value that used to be hard-coded.

    Returns:
        None. Results are written into ``mean_mse`` and ``cv_std``.
    """
    neg_mse = cross_val_score(model, feature_df, target_df, cv=cv, n_jobs=num_procs,
                              scoring='neg_mean_squared_error')
    # The scorer yields negated MSE values, so flip the sign to report a
    # positive error.
    mean_mse[model] = -1.0 * np.mean(neg_mse)
    # The standard deviation is unaffected by the sign of the scores.
    cv_std[model] = np.std(neg_mse)

In [9]:
# Function to print the summary of model
def print_summary_models(model, mean_mse, cv_std):
    """Print ``model`` followed by its stored average MSE and CV standard deviation.

    Both statistics are looked up with ``model`` as the dictionary key.
    """
    print('\nModel:\n', model)
    labelled_scores = (
        ('Average MSE:\n', mean_mse),
        ('Standard deviation during CV:\n', cv_std),
    )
    # Look each value up right before printing it, so output appears in the
    # same order even if a later lookup fails.
    for heading, scores in labelled_scores:
        print(heading, scores[model])

In [10]:
# Function to save the model description, the feature importances and the predictions to disk
def save_results(model, mean_mse, predictions, feature_importances):
    """Persist the model description, feature importances and predictions.

    Three files are written to the current working directory:
    ``model.txt`` (``str(model)``), ``feature_importances.csv`` (via the
    object's ``to_csv`` method) and ``predictions.csv`` (comma-delimited, via
    ``np.savetxt``). ``mean_mse`` is accepted but not used.
    """
    with open('model.txt', 'w') as model_file:
        model_file.write(str(model))

    feature_importances.to_csv('feature_importances.csv')

    np.savetxt('predictions.csv', predictions, delimiter=',')

In [12]:
# Defining the input CSV locations (paths are relative to the working directory):
# training features, training salaries and test features
train_feature_file = 'data/train_features.csv'
train_target_file = 'data/train_salaries.csv'
test_feature_file = 'data/test_features.csv'

In [13]:
# Defining categorical, numerical and target variables
# (column names used later for one-hot encoding, numeric conversion and target selection)
categorical_vars = ['companyId', 'jobType', 'degree', 'major', 'industry']
numeric_vars = ['yearsExperience', 'milesFromMetropolis']
target_var = 'salary'

In [14]:
# Loading the train and test data
print("Reading data")
feature_df = read_csv(train_feature_file)
target_df = read_csv(train_target_file)
test_df = read_csv(test_feature_file)

# Merging the train features with their salaries (inner join on jobId)
train_df = consolidate_data(feature_df, target_df, key='jobId')

# Cleaning and shuffling the data. drop=True discards the pre-shuffle index
# instead of inserting it into the frame as a spurious 'index' column.
clean_train_df = shuffle(clean_df(train_df)).reset_index(drop=True)

# Performing one hot encoding on the training and test data
print("Encoding data")
feature_df = one_hot_encode_df(clean_train_df, cat_vars=categorical_vars, num_vars=numeric_vars)
test_df = one_hot_encode_df(test_df, cat_vars=categorical_vars, num_vars=numeric_vars)

# Train and test are encoded independently, so a category present in only one
# of them would give the two frames different dummy columns. Align the test
# frame to the training layout (missing dummies become 0, unseen ones are
# dropped) so a model fitted on feature_df can score it.
test_df = test_df.reindex(columns=feature_df.columns, fill_value=0)

# Get target df
target_df = get_target_df(clean_train_df, target_var)

Reading data
Encoding data


In [15]:
# Initializing the model list and the dictionaries for mean MSE and cross-validation standard deviation
models = []
mean_mse = {}  # filled by train_models, keyed by model object
cv_std = {}  # filled by train_models, keyed by model object
res = {}

# Define number of processes to run in parallel
num_procs = 2

# Define shared model parameter (verbosity level passed to the estimators)
verbose_lvl = 0

#### We already did the hyperparameter tuning of each model in the Data Modeling notebook, so we will simply create the model objects and train them.
 - We will store the mean_mse in dictionary
 - We will perform cross-validation using k-fold cross validation 
 - We will store the result in dictionary

In [16]:
# Create model objects (hyperparameters come from the earlier tuning step)
lr = LinearRegression()
rf = RandomForestRegressor(n_estimators=180, n_jobs=num_procs, max_depth=30, min_samples_split=60,
                           max_features=30, verbose=verbose_lvl)
# loss is left at its default, which is least squares - the same objective the
# old 'ls' alias selected. That alias was deprecated in scikit-learn 1.0 and
# removed in 1.2 (renamed 'squared_error'), so omitting it keeps this cell
# working on both older and newer releases.
gbm = GradientBoostingRegressor(n_estimators=180, max_depth=10, verbose=verbose_lvl)


In [17]:
# Training the Linear Regression model with cross-validation (results are stored
# in mean_mse / cv_std) and printing its summary
train_models(lr, feature_df, target_df, num_procs, mean_mse, cv_std)
print_summary_models(lr, mean_mse, cv_std)


Model:
 LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
Average MSE:
 384.45520244056524
Standard deviation during CV:
 0.23037442568974598


In [18]:
# Making a pipeline that standardizes the features, applies PCA with its default
# settings and then fits a Linear Regression; it is only constructed here, not trained
lr_std_pca = make_pipeline(StandardScaler(), PCA(), LinearRegression())

In [None]:
# Training the Random Forest model with cross-validation (results are stored
# in mean_mse / cv_std) and printing its summary
train_models(rf, feature_df, target_df, num_procs, mean_mse, cv_std)
print_summary_models(rf, mean_mse, cv_std)

In [None]:
# Training the Gradient Boosting model with cross-validation (results are stored
# in mean_mse / cv_std) and printing its summary
train_models(gbm, feature_df, target_df, num_procs, mean_mse, cv_std)
print_summary_models(gbm, mean_mse, cv_std)

In [None]:
# Polynomial feature model: build degree-2 polynomial features for train and test
p = PolynomialFeatures(2)

# Expand the encoded feature matrix, not clean_train_df: the cleaned frame still
# contains the salary target (which would leak into the inputs) as well as the
# un-encoded categorical columns.
x_train_p = p.fit_transform(feature_df)
# Reuse the transformer fitted on the training data instead of re-fitting it on
# the test set. test_df must therefore have the same columns as feature_df.
x_test_p = p.transform(test_df)

In [None]:
# Fit a Linear Regression on the polynomial-expanded training features
poly = LinearRegression()
poly.fit(x_train_p, target_df)