# Import Libraries

In [380]:
import pandas as pd
import numpy as np
from sklearn import linear_model, metrics
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import *
import operator

# Define functions

In [381]:
# define a new function that allows for a range with floats
def xfrange(start, stop, step):
    i = 0
    while start + i * step < stop:
        yield start + i * step
        i += 1
        
# function to look at the coefficients of the given model and df
def examine_coefficients(model, df):
    return pd.DataFrame(
        { 'Coefficient' : model.coef_[0] , 'Feature' : df.columns}
    ).sort_values(by='Coefficient')

# Import DataFrames and Clean Data
### Describe the data:
* id_and_python.csv - 309 rows, details if python was found in the job description, pulled from indeed.com
* id_salary.csv - 1359 rows, details if the salary listed was over 90k, pulled from indeed.com
* id_years.csv - 999 rows, details if the # of years required was was listed job description, values of 0, 1, 2, 3, and 4+ years, pulled from indeed.com
* phd.csv - 719 rows, details if phd was found in the job description, pulled from indeed.com
* startup_df.csv - 2489 rows, details if startup was found in the job description, pulled from indeed.com


In [383]:
# Read in the 5 separate dataframes pulled from our indeed.com web scraping
id_python = pd.read_csv('../../DSI-BOS-students/timote_hogan/Project-04/id_and_Python.csv')
id_salary = pd.read_csv('../../DSI-BOS-students/timote_hogan/Project-04/id_salary.csv')
id_years = pd.read_csv('../../DSI-BOS-students/timote_hogan/Project-04/id_years.csv')
id_phd = pd.read_csv('../../DSI-BOS-students/timote_hogan/Project-04/phd_df.csv')
id_startup = pd.read_csv('../../DSI-BOS-students/timote_hogan/Project-04/startup_df.csv')

id_phd=id_phd.rename(columns = {'job_id':'id'}) # renames the job_id column to id

# run through the dataframes and list how many rows we removed
df_list = [id_python,id_salary,id_years, id_phd, id_startup]
for i,df in enumerate(df_list):
    start_length = len(df)
    df = df.drop_duplicates(subset = 'id')
    end_length = len(df)
    print 'start', start_length, 'end', end_length, ':', start_length - end_length, "entries removed"

print '\n'

# remove duplicate id's
id_phd = id_phd.drop_duplicates(subset='id')
id_salary = id_salary.drop_duplicates(subset='id')
id_phd = id_phd.drop_duplicates(subset='id')
id_startup = id_startup.drop_duplicates(subset='id')
id_years = id_years.drop_duplicates(subset='id')

# split out the years column to dummy variables
id_years_dummies = pd.get_dummies(id_years.years,prefix='ys')
id_years = pd.concat([id_years, id_years_dummies],axis=1) # combine the two dataframes back together
# print id_years.head()

# Using left merges on the id_salary df so we keep the data with salary info
salary_df = pd.merge(id_salary, id_python, how='left', on='id')
salary_df = pd.merge(salary_df, id_years, how='left', on='id')
salary_df = pd.merge(salary_df, id_phd, how='left', on='id')
salary_df = pd.merge(salary_df, id_startup, how='left', on='id')

# drop the following lists from the dataframe that are not needed
drop_list = [u'Unnamed: 0_x', u'Unnamed: 0_y', u'Unnamed: 0_x', u'years', u'Unnamed: 0_y', u'Unnamed: 0', u'title']
salary_df = salary_df.drop(drop_list, axis=1)

# fill the NaN values with 0. Since all the Dataframes we merged were 1 or nothing we can safely fill these values
salary_df = salary_df.fillna(0)

# shortens the columns and lowercases
salary_df = salary_df.rename(columns= {'Python':'python', 'has_phd':'phd','has_startup':'startup'})

print salary_df.head(), '\n'

# export to CSV for data analysis
salary_df.to_csv('/Users/gmaclenn/DSI-BOS-students/gmaclenn/source_files/salary_df.csv')

start 305 end 305 : 0 entries removed
start 1125 end 1125 : 0 entries removed
start 1000 end 768 : 232 entries removed
start 720 end 573 : 147 entries removed
start 2490 end 364 : 2126 entries removed


                    id  over_90k  python  ys_0  ys_1  ys_2  ys_3  ys_4  phd  \
0  jl_85c4e91c561780aa         0     1.0   0.0   1.0   0.0   0.0   0.0  0.0   
1  jl_e632343c455d80f9         0     1.0   1.0   0.0   0.0   0.0   0.0  0.0   
2  jl_1b5a168dfc7b2712         0     0.0   0.0   0.0   0.0   1.0   0.0  0.0   
3  jl_05654b2739edb3dc         0     0.0   1.0   0.0   0.0   0.0   0.0  0.0   
4  jl_1a7766c45b1abbeb         0     0.0   0.0   0.0   0.0   1.0   0.0  1.0   

   startup  
0      0.0  
1      0.0  
2      0.0  
3      1.0  
4      0.0   



# Create Logistic Regression Model

In [384]:
predictor_cols = ['python', 'phd', 'startup', 'ys_0', 'ys_1', 'ys_2', 'ys_3', 'ys_4']
target_cols = ['over_90k'] 

X = salary_df[predictor_cols]
y = np.ravel(salary_df[target_cols]) # need a 1-d array (use ravel)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35)

print X_train.shape, y_train.shape # confirm the train test split gives us the right dimensions
print X_test.shape, y_test.shape


(731, 8) (731,)
(394, 8) (394,)


In [385]:
logit = linear_model.LogisticRegression()
job_model = logit.fit(X_train, y_train)
predictions = logit.predict(X_test)

conmat = np.array(confusion_matrix(y_test, predictions, labels=[1,0]))

confusion_df = pd.DataFrame(conmat, index=['Over 90k', 'Under 90k'], columns=['Predicted Over 90k', 'Predicted Under 90k'])

print confusion_df
# print '\n', classification_report(y_test, predictions)

           Predicted Over 90k  Predicted Under 90k
Over 90k                  112                   98
Under 90k                  59                  125


# Examine Model Performance and Coefficients

In [386]:
TP = float(confusion_df.iloc[0][0])
TN = float(confusion_df.iloc[1][1])
FP = float(confusion_df.iloc[1][0])
FN = float(confusion_df.iloc[0][1])

Accuracy = (TP + TN) / (TP + TN + FP + FN)
Precision = TP / (TP + FP)
Recall = TP / (TP + FN)

print "Accuracy: \t", Accuracy
print "Precision: \t", Precision
print "Recall: \t", Recall, '\n'

print examine_coefficients(job_model, X)

Accuracy: 	0.60152284264
Precision: 	0.654970760234
Recall: 	0.533333333333 

   Coefficient  Feature
4    -1.077536     ys_1
3    -0.345652     ys_0
5    -0.197909     ys_2
6     0.200939     ys_3
1     0.683471      phd
7     0.858624     ys_4
0     1.170492   python
2     1.571412  startup


# Using Grid Search to optimize for f1_score
f1_score is essentially the weighted average of precision and recall. Optimizing for these two metrics will give us a good balanced model without skewing one or the other too heavily.

In [387]:
C_vals = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
penalties = ['l1','l2']

gs_1 = GridSearchCV(logit, {'penalty':penalties, 'C':C_vals}, cv=5, scoring='f1_macro', verbose=1)
gs_1.fit(X,y)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=1)]: Done  49 tasks       | elapsed:    0.2s
[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:    0.3s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'penalty': ['l1', 'l2'], 'C': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]},
       pre_dispatch='2*n_jobs', refit=True, scoring='f1_macro', verbose=1)

In [388]:
print gs_1.best_params_

{'penalty': 'l2', 'C': 1.0}


Since the default settings for the Logistic Regression model are C=1.0 and penalty = l2, we do not need to modify the previous model at all.

# Modify the thresholds to play with accuracy and recall
The function below iterates over the range 0.5 (standard cutoff value) to 0.75 and creates a dictionary of the probability cutoff as a key and the accuracy, precision and recall of the model as values. We then return the max Precision value and it's associated cutoff value to demonstrate a cutoff value that would maximize precision.

In [393]:
prob_df = pd.DataFrame(logit.predict_proba(X_test), columns = ['prob0', 'prob1'])

# Modify the threshold to maximize Precision

# create a dictionary
max_precision_dict = {}

for prob_cutoff in xfrange(0.5,0.75,0.01):
    prob_df['higher_threshold'] = [1 if x >= prob_cutoff else 0 for x in prob_df.prob1.values]

    # Create the confusion matrix
    conmat2 = np.array(confusion_matrix(y_test, prob_df.higher_threshold.values, labels=[1,0]))

    confusion_df2 = pd.DataFrame(conmat2, index=['Over 90k', 'Under 90k'], columns=['Predicted Over 90k', 'Predicted Under 90k'])

    # assign variables for measuring statistics
    TP = float(confusion_df2.iloc[0][0])
    TN = float(confusion_df2.iloc[1][1])
    FP = float(confusion_df2.iloc[1][0])
    FN = float(confusion_df2.iloc[0][1])

    Accuracy = (TP + TN) / (TP + TN + FP + FN)
    Precision = TP / (TP + FP)
    Recall = TP / (TP + FN)
    
    # creates dictionary with the probability cutoff as the key and
    # the metrics precision, accuracy & recall as values
    max_precision_dict[prob_cutoff] = [Precision, Accuracy, Recall]
    
# finds the maximum value from the newly created dictionary
import operator
max_precision = max(max_precision_dict.iteritems(), key=operator.itemgetter(1))

print "Optimal Probability Cutoff: \t", max_precision[0]
print "Maximized Precision:\t\t", max_precision[1][0]
print "Accuracy: \t\t\t", max_precision[1][1]
print "Recall: \t\t\t", max_precision[1][2]

Optimal Probability Cutoff: 	0.7
Maximized Precision:		0.782608695652
Accuracy: 			0.565989847716
Recall: 			0.257142857143


In [395]:
prob_df = pd.DataFrame(logit.predict_proba(X_test), columns = ['prob0', 'prob1'])

# print prob_df
prob_df['higher_threshold'] = [1 if x >= 0.7 else 0 for x in prob_df.prob1.values]
# print prob_df.head(), '\n'

# Create the confusion matrix
conmat3 = np.array(confusion_matrix(y_test, prob_df.higher_threshold.values, labels=[1,0]))

confusion_df3 = pd.DataFrame(conmat3, index=['Over 90k', 'Under 90k'], columns=['Predicted Over 90k', 'Predicted Under 90k'])
print confusion_df3, '\n'

# assign variables for measuring statistics
TP = float(confusion_df2.iloc[0][0])
TN = float(confusion_df2.iloc[1][1])
FP = float(confusion_df2.iloc[1][0])
FN = float(confusion_df2.iloc[0][1])

Accuracy = (TP + TN) / (TP + TN + FP + FN)
Precision = TP / (TP + FP)
Recall = TP / (TP + FN)

print "Original Accuracy: \t", metrics.accuracy_score(y_test, predictions)
print "Original Precision: \t", metrics.precision_score(y_test, predictions)
print "Original Recall: \t", metrics.recall_score(y_test, predictions), '\n'

print "Accuracy: ", '\t\t', Accuracy,
print "Precision: ", '\t\t', Precision
print "Recall: ", '\t\t', Recall

           Predicted Over 90k  Predicted Under 90k
Over 90k                   54                  156
Under 90k                  15                  169 

Original Accuracy: 	0.60152284264
Original Precision: 	0.654970760234
Original Recall: 	0.533333333333 

Accuracy:  		0.565989847716
Precision:  		0.782608695652
Recall:  		0.257142857143


##### We can see that modifying the probability cutoff range allows us to play with the Precision & Recall. In an instance where we wanted to minimize the instance of false positives as much as possible, to increase Precision, the Recall value takes a big hit. This is an instance of overfitting.

# Use Gridsearch to optimize the precision

In [389]:
C_vals = [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]
penalties = ['l1','l2']

gs_2 = GridSearchCV(logit, {'penalty':penalties, 'C':C_vals}, cv=5, scoring='precision', verbose=1)
gs_2.fit(X,y)

Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=1)]: Done  49 tasks       | elapsed:    0.2s
[Parallel(n_jobs=1)]: Done  80 out of  80 | elapsed:    0.3s finished


GridSearchCV(cv=5, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'penalty': ['l1', 'l2'], 'C': [0.0001, 0.001, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]},
       pre_dispatch='2*n_jobs', refit=True, scoring='precision', verbose=1)

In [390]:
print gs_2.best_params_

{'penalty': 'l1', 'C': 10.0}


In [392]:
logit_final = linear_model.LogisticRegression(C=10, penalty='l1')
logit_final.fit(X,y)

y_pred_final = logit_final.predict(X_test)
print classification_report(y_test, y_pred_final)

conmat4 = np.array(confusion_matrix(y_test, y_pred_final, labels=[1,0]))

confusion_df4 = pd.DataFrame(conmat4, index=['Over 90k', 'Under 90k'], columns=['Predicted Over 90k', 'Predicted Under 90k'])
print confusion_df4, '\n'


print "Original Accuracy: \t", metrics.accuracy_score(y_test, predictions)
print "Original Precision: \t", metrics.precision_score(y_test, predictions)
print "Original Recall: \t", metrics.recall_score(y_test, predictions), '\n'

print "Accuracy: \t", metrics.accuracy_score(y_test, y_pred_final)
print "Precision: \t", metrics.precision_score(y_test, y_pred_final)
print "Recall: \t", metrics.recall_score(y_test, y_pred_final), '\n'


             precision    recall  f1-score   support

          0       0.57      0.69      0.62       184
          1       0.66      0.54      0.59       210

avg / total       0.62      0.61      0.61       394

           Predicted Over 90k  Predicted Under 90k
Over 90k                  113                   97
Under 90k                  57                  127 

Original Accuracy: 	0.60152284264
Original Precision: 	0.654970760234
Original Recall: 	0.533333333333 

Accuracy: 	0.609137055838
Precision: 	0.664705882353
Recall: 	0.538095238095 



Looking at the modifications we made to the penalty and C values. The model hardly changed at all with the updated parameters.