In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
df = pd.read_csv('glassdoor_data_eda.csv')

In [3]:
# Selection Columns that are correlated to the Salary to the salary
df = df[[
        'Rating', 'Size', 'Type of ownership', 'Industry', 'Sector',
       'Revenue', 'avg_salary', 'job_type', 'seniority', 'state',
       ]]

In [4]:
# Dropping rows with null numeric column
df = df.dropna(subset=['Rating'])

In [5]:
# Encoding categorical variables
df = pd.get_dummies(df)

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import MinMaxScaler

In [7]:
scaler = MinMaxScaler()
df['Rating'] = scaler.fit_transform(df['Rating'].values.reshape(-1, 1))

In [8]:
X = df.drop('avg_salary', axis=1)
y = df['avg_salary'].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print('Train size:', X_train.shape, '\nTest size:', X_test.shape  )

Train size: (682, 182) 
Test size: (171, 182)


In [9]:
# Linear Regression
from sklearn.linear_model import LinearRegression
lr = LinearRegression()
lr.fit(X_train, y_train)
print('Train error: ', cross_val_score(lr, X_train, y_train, cv=3, scoring='neg_mean_absolute_error'), \
     '\nTest error: ', mean_absolute_error(y_test, lr.predict(X_test)))

Train error:  [-2.18917951e+13 -4.04509032e+11 -6.39781682e+11] 
Test error:  739002846080.7931


In [10]:
# KN
from sklearn.neighbors import KNeighborsRegressor
knn = KNeighborsRegressor()
knn.fit(X_train, y_train)
print('Train error: ', cross_val_score(knn, X_train, y_train, cv=3, scoring='neg_mean_absolute_error'), \
     '\nTest error: ', mean_absolute_error(y_test, knn.predict(X_test)))

Train error:  [-21.5495614  -22.87092511 -21.58590308] 
Test error:  21.639181286549707


In [11]:
# Decision Tree
from sklearn.tree import DecisionTreeRegressor
dtree = DecisionTreeRegressor(random_state=1)
dtree.fit(X_train, y_train)
print('Train error: ', cross_val_score(dtree, X_train, y_train, cv=3, scoring='neg_mean_absolute_error'), \
     '\nTest error: ', mean_absolute_error(y_test, dtree.predict(X_test)))

Train error:  [-20.73135965 -23.40323054 -22.55983847] 
Test error:  22.168615984405456


In [12]:
# Random Forest
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor(random_state=1)
rf.fit(X_train, y_train)
print('Train error: ', cross_val_score(rf, X_train, y_train, cv=3, scoring='neg_mean_absolute_error'), \
     '\nTest error: ', mean_absolute_error(y_test, rf.predict(X_test)))

Train error:  [-19.03094636 -18.0960739  -18.33562385] 
Test error:  16.927400051134697


In [13]:
# Random Searching for RandomForest
from sklearn.model_selection import RandomizedSearchCV
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
max_features = ['auto', 'sqrt']
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
min_samples_split = [2, 5, 10]
min_samples_leaf = [1, 2, 4]
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
                'max_features': max_features,
                'max_depth': max_depth,
                'min_samples_split': min_samples_split,
                'min_samples_leaf': min_samples_leaf,
                'bootstrap': bootstrap}


In [14]:
rf = RandomForestRegressor()
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 200, 
                               cv = 3, verbose=2, random_state=42, n_jobs = -1, scoring='neg_mean_absolute_error')
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 200 candidates, totalling 600 fits
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  5.2min
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed: 11.8min
[Parallel(n_jobs=-1)]: Done 600 out of 600 | elapsed: 21.6min finished


RandomizedSearchCV(cv=3, estimator=RandomForestRegressor(), n_iter=200,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [10, 20, 30, 40, 50, 60,
                                                      70, 80, 90, 100, 110,
                                                      None],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=42, scoring='neg_mean_absolute_error',
                   verbose=2)

In [15]:
mean_absolute_error(y_test, rf_random.best_estimator_.predict(X_test))

15.567009955444155

In [16]:
# Saving the model 
import pickle
with open('random_forest_model.pkl', 'wb') as f:
    pickle.dump(rf_random.best_estimator_, f)