In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn import cross_validation
from sklearn import metrics
from sklearn.grid_search import GridSearchCV

In [2]:
# import dataset
jobs = pd.read_csv('/Users/teresaborcuch/jobs_df.csv')

In [3]:
del jobs['Unnamed: 0']

In [4]:
jobs.head()

Unnamed: 0,id,Python,over_90k,has_phd,years,has_startup,title,company,scientist_title,analyst_title
0,jl_f43cd8061406b3d7,1.0,1.0,0.0,,0.0,Director of Data Science and Analysis,Fidelity Investments,0,0
1,jl_90603c7f1f0af480,1.0,1.0,0.0,0.0,0.0,Analytics Engineer (Boston),QuantumBlack,0,0
2,jl_50022587c7a4a8d9,1.0,1.0,0.0,0.0,0.0,Data Scientist Intern: Pricing & Profitability...,Wayfair,1,0
3,jl_44da2bd2b0b7e145,1.0,1.0,1.0,0.0,0.0,Machine Learning Scientist,Amazon Corporate LLC,1,0
4,jl_f5945f64ec7013e3,1.0,1.0,0.0,0.0,0.0,Data Scientist,MIT,1,0


In [5]:
jobs.dropna(subset = ['over_90k'], inplace = True)
jobs['years'].fillna(0, inplace = True)
jobs['years'] = jobs['years'].astype('int')
jobs['Python'] = jobs['Python'].astype('category')
jobs['has_phd'] = jobs['has_phd'].astype('category')
jobs['has_startup'] = jobs['has_startup'].astype('category')
jobs['scientist_title'] = jobs['scientist_title'].astype('category')
jobs['analyst_title'] = jobs['analyst_title'].astype('category')
jobs['over_90k'] = jobs['over_90k'].astype('category')

In [6]:
len(jobs['id'].unique())

627

In [7]:
# Now we have 627 jobs ready to go 
jobs.shape

(627, 10)

In [8]:
jobs.dtypes

id                   object
Python             category
over_90k           category
has_phd            category
years                 int64
has_startup        category
title                object
company              object
scientist_title    category
analyst_title      category
dtype: object

I'll build a logistic regression model that considers particular terms in a job's title and description and uses these to predict whether the job pays over 90k or not. This analysis will identify which factors are the best predictors of high salary, construct and optimize a model using cross-validation, and evaluate its performance on a novel test subset of data.

In [10]:
# Create matrices of predictors and targets
X = jobs[['Python','has_phd','years','has_startup','scientist_title','analyst_title']]
y = jobs.over_90k

In [39]:
# split for training and testing
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size = 0.2)

In [40]:
X_train

Unnamed: 0,Python,has_phd,years,has_startup,scientist_title,analyst_title
438,0.0,0.0,0,0.0,0,0
14,1.0,1.0,2,0.0,1,0
24,1.0,1.0,0,0.0,0,0
501,0.0,0.0,0,0.0,0,0
476,0.0,1.0,0,0.0,1,0
209,0.0,0.0,0,0.0,0,0
202,0.0,0.0,0,0.0,0,1
253,0.0,1.0,0,0.0,0,0
568,0.0,0.0,0,0.0,0,0
141,1.0,0.0,0,0.0,1,0


In [45]:
# use gridsearch to find best value of C and regularization type to optimize model
logreg = linear_model.LogisticRegression()
C_vals = [0.0001, 0.001, 0.01, 0.1, 0.5, 0.75, 1.0, 2.5, 5.0, 10.0, 100.0, 1000.0]
penalties = ['l1','l2']

gs = GridSearchCV(logreg, {'penalty':penalties, 'C':C_vals}, verbose = True, cv = 5, scoring = 'f1_macro')
gs.fit(X, y)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=1)]: Done  49 tasks       | elapsed:    0.3s
[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:    0.7s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'penalty': ['l1', 'l2'], 'C': [0.0001, 0.001, 0.01, 0.1, 0.5, 0.75, 1.0, 2.5, 5.0, 10.0, 100.0, 1000.0]},
       pre_dispatch='2*n_jobs', refit=True, scoring='f1_macro',
       verbose=True)

In [46]:
gs.best_params_

{'C': 100.0, 'penalty': 'l1'}

The best model has C = 100.0 and uses L1 regularization penalty.

In [49]:
# Fit a model with these hyperparameters
gs_logreg = linear_model.LogisticRegression(C = 100.0, penalty = 'l1', solver = 'liblinear')
gs_logreg.fit(X,y)

LogisticRegression(C=100.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [50]:
predictions = gs_logreg.predict(X)

In [56]:
# Make confusion matrix
conmat= metrics.confusion_matrix(y, predictions, labels=gs_logreg.classes_)
conmat= pd.DataFrame(conmat, columns=gs_logreg.classes_, index=gs_logreg.classes_)
print conmat

     0.0  1.0
0.0  206  105
1.0   99  217


In [59]:
# Evaluate model
print metrics.classification_report(y, predictions)

             precision    recall  f1-score   support

        0.0       0.68      0.66      0.67       311
        1.0       0.67      0.69      0.68       316

avg / total       0.67      0.67      0.67       627



In [60]:
gs_logreg.coef_

array([[ 1.26590898,  0.81567117,  0.20195585,  1.54177996, -0.18713206,
        -0.82614071]])