# Web Scraping for Indeed.com & Predicting Salaries: Preprocessing and Modeling

In [34]:
# Import necessary libraries
from random import seed
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score
from sklearn.feature_extraction.text import CountVectorizer

from randomfunctions import print_cols

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

seed(42)

In [2]:
jobs = pd.read_csv('./csvs/cleaned_data/clean_data.csv').drop('Unnamed: 0', axis=1)
mean_sal = jobs['mean salary'].mean()

In [3]:
for i in ['median', '60_perc', '70_perc', '75_perc', '90_perc']:
    exec(f'_{i}_target = jobs[\'{i}_bool\']')
    jobs.drop(f'{i}_bool', axis=1, inplace=True)

jobs.drop(['Salary', 'lower_sal_val', 'upper_sal_val'], axis=1, inplace=True)

In [4]:
jobs = pd.get_dummies(jobs, columns=['state', 'Search Term', 'Company'])
salaries = jobs['mean salary']
jobs.drop(['Location', 'mean salary'], axis=1, inplace=True)

X_train, X_test, y_train, y_test = train_test_split(jobs, _median_target, test_size=0.2)
orig_train_rows = X_train.shape[0]
orig_test_rows = X_test.shape[0]

cvec = CountVectorizer(stop_words='english')
cvec.fit(X_train['Job Title'])

def use_cvec(data):
    cvec_data = pd.DataFrame(cvec.transform(data['Job Title']).todense(),
                           columns=cvec.get_feature_names(), index=data.index)

    data = pd.concat([data.drop('Job Title', axis=1), cvec_data], axis=1)
    return data
    
X_train = use_cvec(X_train)
X_test = use_cvec(X_test)

# Check that concatenation worked as expected
assert orig_train_rows == X_train.shape[0]
assert orig_test_rows == X_test.shape[0]

In [5]:
params = {'n_estimators':range(50,501, 50),
          'max_depth':range(50,301, 50),
          'min_samples_split':range(2,20,2)
         }    

clf = RandomForestClassifier(warm_start=True)

grid = GridSearchCV(clf, param_grid=params, cv=10, verbose=2, n_jobs=-1)
#grid.fit(X_train, y_train)
#pickle.dump(grid, open('grid_rf.pickle', 'wb'))

In [7]:
grid.best_params_

{'max_depth': 300, 'min_samples_split': 2, 'n_estimators': 300}

In [10]:
test_preds = grid.predict(X_test)

accuracy = accuracy_score(y_test, test_preds)
recall = recall_score(y_test, test_preds)
precision = precision_score(y_test, test_preds)

print(f'Accuracy: {accuracy}')
print(f'Recall: {recall}')
print(f'Precision: {precision}')

Accuracy: 0.8538461538461538
Recall: 0.8450704225352113
Precision: 0.8823529411764706


Due to the slightly lower accuracy score on the test set, our model may be slightly overfit.

In [12]:
feature_importances = pd.DataFrame(grid.best_estimator_.feature_importances_,
                                   index = X_train.columns,
                                    columns=['importance']).sort_values('importance',
                                                                        ascending=False)
feature_importances.head()

Unnamed: 0,importance
data,0.075333
research,0.036172
analyst,0.029542
engineer,0.028646
scientist,0.027984


While it's good that we can see the most important words, we cannot see in which direction these words push the salary (ie. we don't know whether word data indicates a job is likely to have a higher or lower salary). In order to get some interpretive value from this model, I decided to take the fifty most important words and look at whether the median value of listings with that word in them were above the median of all the job listings.

In [33]:
important_features = feature_importances.head(50).index.tolist()

important_features_df = pd.DataFrame(feature_importances.head(50))

imp2 = []
for i in important_features:
    imp2.append(str(i))

# Create column with boolean values, representing whether the boolean
# or dummy variable was above 0.
for i in imp2:
    X_train[i + ' bool'] = X_train[i] > 0

word_medians = []
word_means = []
for i in imp2:
    word_medians.append(np.median(jobs2.iloc[X_train.index][X_train[i + ' bool'] == True]['mean salary']))
    word_means.append(np.mean(jobs2.iloc[X_train.index][X_train[i + ' bool'] == True]['mean salary']))

important_features_df['word medians'] = word_medians
important_features_df['word means'] = word_means

median_effect = []
for i in important_features_df['word medians']:
    if i > 108607.5:
        median_effect.append('+')
    else:
        median_effect.append('-')

important_features_df['word effect'] = median_effect
important_features_df.to_csv('./csvs/cleaned_data/important_features.csv')

important_features_df.head(20)

Unnamed: 0,importance,word medians,word means,word effect
data,0.075333,132500.0,136377.136598,+
research,0.036172,59669.5,71170.413265,-
analyst,0.029542,75000.0,86242.512821,-
engineer,0.028646,135000.0,139112.802632,+
scientist,0.027984,125000.0,126089.18617,+
machine,0.025557,150000.0,149555.555556,+
learning,0.025476,150000.0,150652.173913,+
Company_ Jobspring Partners,0.02426,157500.0,154880.952381,+
senior,0.023179,135000.0,137449.118421,+
state_'CA',0.013878,145000.0,137827.848739,+


One of the most important things we see here is that data is the most important feature in our model. This suggests not only that several non-data-related job listings come up when searching for data scientist positions on Indeed, but also that these positions are significantly lower paying than data and machine learning-related ones. Unsurprisingly, both 'machine' and 'learning' are also words that indicate high-paying jobs. We'll perform some hypothesis tests soon to confirm these hypotheses.