# Web Scraping for Indeed.com & Predicting Salaries

### Problem Statement:
#### Can salaries for Data Scientist job listings be predicted as high or low, in comparison to the median salary posted, using various features of the posting? What will best predict whether it's higher or lower than the median?

#### Functions to grab the job title, company, location, salary, and brief description

### Analysis for Principal

https://docs.google.com/document/d/16qF3YIrw4OdYrzfc9DpSkoTCKnOisrbD1_UR5Wr4BJw/edit?usp=sharing

In [1]:
## use various IDs, classes to identify the features
## if they cannot be found, an error will be thrown,
## and we will simply pass that attribute and move on

def get_job(webpage):
    tag = webpage.find('a', title=True, attrs={'data-tn-element':'jobTitle'})
    try:
        return tag['title']
    except:
        pass

def get_company(webpage):
    tag = webpage.find('span', attrs={'class':'company'})
    try:
        return tag.text.strip('\n')
    except:
        pass
def get_location(webpage):
    tag = webpage.find('span', attrs={'class':'location'})
    try:
        return tag.text
    except:
        pass    

def get_salary(webpage):
    try:
        return webpage.find('table').tr.td.nobr.renderContents() ## for regular listings
    except:
        try:
            return webpage.find('div').div.text ## for sponsored listings
        except:
            pass

def get_description(webpage):
    description = webpage.find('span', attrs={'itemprop':"description"})
    try:
        return description.text.strip('\n')
    except:
        pass

In [2]:
###############
###############
### I chose to write a function to combine some of the steps
### I also chose not to clean the salaries each time, but instead
### will just clean them all at once, when I import the data from the csvs
### pps I am not limiting my search to particular cities
###############
###############


## first, define two functions that will be used in 
## the main scraping function: str_number_to_number, compile_files


# given a string of a number with commas, convert to float
def str_number_to_number(string):
    import locale 
    string = string.strip('$')
    locale.setlocale(locale.LC_ALL, 'en_US.UTF-8') #for american comma notation
    # if european comma notation needed, change 2nd parameter to 'fr_FR'
    num = locale.atof(string)
    return float(num)


## import the results that have been previously exported into df
def compile_files():
    import glob
    import pandas as pd
    import numpy as np
    indeed_csvs = '/Users/jennydoyle/Desktop/dsi/indeed/'
    files = glob.glob(indeed_csvs + '*.csv') # get a list of the csv files
    indeed_final = pd.DataFrame(columns=['job','company','location','salary','description'])
    for f in files: # read each csv file in
        f = pd.read_csv(f, names=['job','company','location','salary','description'],low_memory=False)
        indeed_final = indeed_final.append(f)
    indeed_final.drop_duplicates(inplace=True)
    return indeed_final


######################################################
######################################################
################    SCRAPE TIME!!    #################
######################################################
######################################################


def scrape_indeed():
    
    import requests
    from bs4 import BeautifulSoup
    import datetime
    import time
    import re
    import numpy as np
 
    
    ## compile previously scraped results to see if there are new jobs to add
    indeed = compile_files()    
    base = len(indeed)
    ## record start time to calculate elapsed time
    start = datetime.datetime.now()
    
    print 'Start time: ',start.strftime("%Y-%m-%d %H:%M:%S")
    print 'Base file has ', base, ' records'

    ## add '&fromage=last' to the url to get newly added jobs that might be skipped over otherwise
    ## end the url at start= so we can dynamically flip through all pages of listings
    url = "http://www.indeed.com/jobs?q=data+scientist+%2420%2C000&fromage=last&start="

    ## x will indicate the number of the first listing on a particular page 
    ## each page lists 10 posts (and 5 sponsored posts)
    x = 0
    url_start = url+str(x)
    page = requests.get(url_start).content
    soup = BeautifulSoup(page,'lxml')
    
    print 'Page scraped & souped'
    
    ## take the full line that says 'Jobs x to y of z' and turn into a list
    ## use to end the page loop -- function will stop looking 
    ## to the next page once it hits the last results
    for results in soup.find('div', attrs={'id':'searchCount'}):
        count = str(results).split()    
        total = count[len(count)-1]          # set total to z, the total number o[f results
        total = str_number_to_number(total)  # since there are commas if the number > 999, this function will deal with that and convert to int

        
    while x <= total-9:
        ## go to new page of listings
        url_new_page = url + str(x)
        page = requests.get(url_new_page).content
        soup = BeautifulSoup(page)
        
        ## record which number of listings we're at
        ## for process notifications
        for num_listings in  soup.find('div', attrs={'id':'searchCount'}) :
            num_listings = num_listings.split()[3]
        
        main = soup.find('td',{'id':'resultsCol'})   # limit our searching to solely the results portion of the page
        results = main.find_all('div', {'class': re.compile("result$")}) # create a list consisting only of the 15 results

        ## results has a list of the 10 listings on the page, plus the 5 sponsored
        ## loop through each listing and get the job, company, location, salary, and decription info
        for i in range(len(results)):
            job = get_job(results[i])
            company = get_company(results[i]) 
            location = get_location(results[i])
            salary = get_salary(results[i])
            description = get_description(results[i])

            add_job = [job, company, location, salary, description]
            indeed.append(add_job) ## add to main df

        ## move to next page of results
        x+=10
        new = len(indeed) - base
        elapsed = datetime.datetime.now() - start
        remaining = total - x
        est_pages = remaining/10
        
        ## print update after each page bc impatient
        print 'Added ', new, ' jobs-- scraped ',num_listings,' of ', total, ' listings in ', elapsed, '; ', est_pages, ' pages remaining'
        
        time.sleep(0.5) ## wait a little to request the next page
            
    finish = datetime.datetime.now()
    now = finish.strftime("%Y-%m-%d %H:%M:%S")
    print 'Finish time: ',now

    elapsed = finish-start
    print 'Elapsed: ',elapsed
    indeed = pd.DataFrame(indeed)
    
    ## send results to csv file
    indeed.to_csv('/Users/jennydoyle/Desktop/dsi/indeed/'+now+'.csv',sep=',', encoding='utf-8',header=False,index=False)
    return indeed


## Predicting salaries using Random Forests + Another Classifier

#### Load in the the data of scraped salaries

In [76]:
## YOUR CODE HERE

indeed = compile_files()
indeed.reset_index(drop=True)
indeed.head()

Unnamed: 0,job,company,location,salary,description
0.0,Data Scientist,Novetta,"Crystal City, VA",,
1.0,Data Scientist,"Syntelli Solutions, Inc","Charlotte, NC 28277",,
2.0,Software Engineer (Data and Analytics),The Advisory Board Company,"Richmond, VA",,
3.0,Data Scientist,TechStratium Inc.,"McLean, VA",,TechStratium is hiring Data Scientists to join...
4.0,Advanced Analytics Data Scientist,IBM,"Springfield, VA",,"As an Advanced Analytics Data Scientist, you'l..."


In [94]:
###
### CLEAN UP SALARIES
###


import numpy as np

## create a sub-df consisting only of jobs with annual salaries
df=indeed[indeed.salary.notnull()&indeed.salary.str.contains('year')]
df.salary = df.salary.astype(str)

## turn the salary into a list so we can grab the high and low ends, then average
df['salary_list'] = df.salary.str.split()

mask = df.salary.str.contains('-')
df['low_end'], df['high_end'], df['salary_clean'] = np.NaN, np.NaN, np.NaN
df['low_end'][mask] = map(lambda x: x[0],df.salary_list.loc[mask])
df['high_end'][mask] = map(lambda x: x[2],df.salary_list.loc[mask])

df.low_end[df.high_end==1] = np.NaN
df.high_end[df.high_end==1] = np.NaN

df.salary_clean[df.salary.notnull()]= [x[0] for x in df.salary_list[df.salary_list.notnull()]]
df.salary_clean[df.low_end.notnull()&df.high_end.notnull()] = np.NaN

for col in ['salary_clean','low_end','high_end']:
    df[col][df[col].notnull()] = [str_number_to_number(x) for x in df[col][df[col].notnull()]]

# average out ranges
df.salary_clean[df.salary_clean.isnull()] = (df.low_end + df.high_end) / 2

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/

#### We want to predict a binary variable - whether the salary was low or high. Compute the median salary and create a new binary variable that is true when the salary is high (above the median)

In [95]:
###
### BINARY TARGET FEATURE -- above (1) median or below (0) 
###


df = df[df.salary_clean.notnull()]
median_salary = np.median(df.salary_clean)
df['high_salary'] = 1
df['high_salary'][df.salary_clean <= median_salary] = 0 

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [113]:
median_salary

90981.0

#### Thought experiment: What is the baseline accuracy for this model?

In [96]:
###
### LOGISTIC REGRESSION MODEL
###


import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

dummies = pd.get_dummies(df[['job','company','location']])

X = pd.concat([dummies, df['salary_clean']], axis=1)
y = df.high_salary

cross_val_score(LogisticRegression(), X, y)

array([ 0.49206349,  0.5       ,  0.49180328])

#### Create a Random Forest model to predict High/Low salary using Sklearn. Start by ONLY using the location as a feature. 

In [97]:
###
### CLEAN UP LOCATIONS -- parse to city, state
###


## remove areas in parentheses
df.location = df.location.str.replace('\((.*?)\)','')
df.location = df.location.str.strip()

## remove zip codes
df.location = df.location.str.replace(r'(\d{5}(\-\d{4})?)$','')
df.location = df.location.str.strip()

## create feature with states
df['state'] = df.location.str.findall('\,\s(\D{2})$')

## remove state from location
df.location = df.location.str.replace('(\,\s\D{2})$','')

## take the states out of the list they were for some reason placed in
df.state = df.state.astype(str)
df.state = df.state.str.replace('(\[)','')
df.state = df.state.str.replace('(\])','')
df.state = df.state.str.replace('(\')','')

df.company = df.company.str.strip()
df.company = df.company.str.upper()

In [98]:
###
### DUM DUMS!!!!
###


X = pd.get_dummies(df[['location','state']])

In [99]:
###
### RANDOM FOREST MODEL
###


from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold

def classify(Classifier, X, y, weight):
    name = str(Classifier)
    cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=41)
    if weight != '':
        dt = Classifier(class_weight=weight)
    else:
        dt = Classifier()
    s = cross_val_score(dt, X, y, cv=cv, n_jobs=-1)
    print "{} Score:\t{:0.3} ± {:0.3}".format(name, s.mean().round(3), s.std().round(3))

## output baseline score
classify(RandomForestClassifier, X, y, 'balanced')

<class 'sklearn.ensemble.forest.RandomForestClassifier'> Score:	0.651 ± 0.036


#### Create a few new variables in your dataframe to represent interesting features of a job title.
- For example, create a feature that represents whether 'Senior' is in the title 
- or whether 'Manager' is in the title. 
- Then build a new Random Forest with these features. Do they add any value? 


In [100]:
###
### JOB TITLE VARIABLES
###


df.job = df.job.str.upper()
df['analyst'] = 0
df['analyst'][df.job.str.contains('ANALY')] = 1

# df['statistician'] = 0
# df['statistician'][df.job.str.contains('STATISTIC')] = 1

df['machine_learning'] = 0
df['machine_learning'][df.job.str.contains('MACHINE')] = 1

# df['research'] = 0
# df['research'][df.job.str.contains('RESEARCH')] = 1

# df['science'] = 0
# df['science'][df.job.str.contains('SCIEN')] = 1

df['engineer'] = 0
df['engineer'][df.job.str.contains('ENGIN')] = 1

df['entry_level'] = 0
df['entry_level'][df.job.str.contains('\WI\W')] = 1
df['entry_level'][df.job.str.contains('\WI$')] = 1
df['entry_level'][df.job.str.contains('ENTRY_LEVEL')] = 1
df['entry_level'][df.job.str.contains('1')] = 1

df['mid_level'] = 0
df['mid_level'][df.job.str.contains('MANAGER')] = 1
df['mid_level'][df.job.str.contains('MID_LEVEL')] = 1
df['mid_level'][df.job.str.contains('\WII\W')] = 1
df['mid_level'][df.job.str.contains('\WII$')] = 1
df['mid_level'][df.job.str.contains('2')] = 1
df['mid_level'][df.job.str.contains('ASSISTANT')] = 1

df['senior_level'] = 0
df['senior_level'][df.job.str.contains('\WIII\W')] = 1
df['senior_level'][df.job.str.contains('\WIII$')] = 1
df['senior_level'][df.job.str.contains('3')] = 1
df['senior_level'][df.job.str.contains('SR\W')] = 1
df['senior_level'][df.job.str.contains('SENIOR')] = 1
df['senior_level'][df.job.str.contains('LEAD')] = 1
df['senior_level'][df.job.str.contains('PRINCIPAL')] = 1
df['senior_level'][df.job.str.contains('DIRECTOR')] = 1


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a

#### Rebuild this model with the new variables
- You can either create the dummy features manually or use the `dmatrix` function from `patsy`
- Remember to scale the feature variables as well!


In [102]:
from sklearn.model_selection import train_test_split
df.job.drop_duplicates(inplace=True)
# df.reset_index(inplace=True)

dummies = pd.get_dummies(df.location)
df_final = pd.concat([dummies, df[['job','company','description','location','high_salary','salary_clean','analyst','engineer','machine_learning','mid_level','entry_level','senior_level']]], axis=1)

df_final.job.drop_duplicates(inplace=True)
df_final.reset_index(inplace=True)


X = pd.concat([dummies, df[['analyst','engineer','machine_learning','mid_level','entry_level','senior_level']]], axis=1)
features = X.columns
y = list(df.high_salary.values)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)


model = RandomForestClassifier().fit(X_train,y_train)
model.score(X_train,y_train)

0.91935483870967738

#### Use cross-validation in scikit-learn to evaluate the model above. 
- Evaluate the accuracy of the model. 

In [103]:
cross_val_score(RandomForestClassifier(), X, y)

array([ 0.71428571,  0.62903226,  0.55737705])

#### Continue to incorporate other text features from the title or summary that you believe will predict the salary and examine their coefficients

In [104]:
feature_importance = pd.DataFrame(sorted(zip(model.feature_importances_,features), key=lambda pair: pair[0], reverse=True))
feature_importance = feature_importance[0:30]
feature_importance


Unnamed: 0,0,1
0,0.07177,analyst
1,0.070448,New York
2,0.055167,senior_level
3,0.041572,machine_learning
4,0.041544,Reston
5,0.025591,Washington
6,0.024921,Stamford
7,0.023929,Chicago
8,0.022562,engineer
9,0.022035,Sunnyvale


In [None]:
## Keywords: research, analyst, statistician, engineer, machine learning
## locations: LA, New York (generally)

In [107]:
predictions = model.predict(X_train)

results = pd.DataFrame(sorted(zip(df_final.analyst, df_final.Reston, df_final['Los Angeles'], df_final.Queens, df_final.engineer, df_final['New York'],df_final.machine_learning,df_final.high_salary,predictions), key=lambda pair: pair[0], reverse=True),columns=['analyst', 'Reston', 'Los Angeles', 'Queens', 'engineer', 'New York','machine_learning','SALARY_HIGH','PREDICTION'])
results['CORRECT']='Yes'
results['CORRECT'][results.SALARY_HIGH!=results.PREDICTION]='No'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [108]:
print 'ANALYST'
results[['CORRECT','PREDICTION']][results.analyst==1].groupby(['CORRECT']).count()


ANALYST


Unnamed: 0_level_0,PREDICTION
CORRECT,Unnamed: 1_level_1
No,22
Yes,24


In [110]:
print 'ENGINEER'
results[['CORRECT','PREDICTION']][results.engineer==1].groupby(['CORRECT']).count()


ENGINEER


Unnamed: 0_level_0,PREDICTION
CORRECT,Unnamed: 1_level_1
No,8
Yes,5


In [111]:
print 'MACHINE LEARNING'
results[['CORRECT','PREDICTION']][results.machine_learning==1].groupby(['CORRECT']).count()


MACHINE LEARNING


Unnamed: 0_level_0,PREDICTION
CORRECT,Unnamed: 1_level_1
No,2
Yes,5


#### Take ~100 scraped entries with salaries. Convert them to use with your model and predict the salary - which entries have the highest predicted salaries?

In [204]:
import numpy as np

random_salaries = df_final.sample(n=120)
# X_keep = df_final.features
X_random = random_salaries[features]
y_random = random_salaries.high_salary

model.score(X_random,y_random)

0.8666666666666667

In [205]:
predictions = model.predict(X_random)
predict_proba = model.predict_proba(X_random)[:,1]

In [206]:
results = pd.DataFrame(sorted(zip(df_final.job,df_final.company,df_final.salary_clean,df_final.high_salary,predictions,predict_proba), key=lambda pair: pair[0], reverse=True),columns=['JOB','COMPANIES','SALARY','ACTUALLY_HIGH','PREDICTED_HIGH','PREDICTION_PROBA'])
results['CORRECT']='No'
results['CORRECT'][results.ACTUALLY_HIGH==results.PREDICTED_HIGH]='Yes'

results.head(3)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


Unnamed: 0,JOB,COMPANIES,SALARY,ACTUALLY_HIGH,PREDICTED_HIGH,PREDICTION_PROBA,CORRECT
0,SUPERVISORY HEALTH SCIENTIST,CENTERS FOR DISEASE CONTROL AND PREVENTION,143516.0,1,0,0.0,No
1,SUPERVISORY HEALTH SCIENTIST,CENTERS FOR DISEASE CONTROL AND PREVENTION,143516.0,1,1,0.866667,Yes
2,STATISTICIAN III - DIVISION OF PLANNING AND PR...,STATE OF SOUTH CAROLINA,56947.5,0,1,0.747143,No


In [207]:
print len(results[results.CORRECT=='Yes'])
print len(results[results.CORRECT=='No'])
print len(results[results.CORRECT=='Yes']) / float(len(results))

65
55
0.541666666667


In [None]:
## So, model score ended up being pretty good,
## but it looks like only half were predicted correctly -_____-

### BONUS 

#### Bonus: Use Count Vectorizer from scikit-learn to create features from the text summaries. 
- Examine using count or binary features in the model
- Re-evaluate your models using these. Does this improve the model performance? 
- What text features are the most valuable? 

In [19]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

df_final = df_final[df_final.description.notnull()]
df_final.fillna(0,inplace=True)

cvec = CountVectorizer(stop_words='english')
cvec.fit(df_final['description'])


cvec_table  = pd.DataFrame(cvec.transform(df_final['description']).todense(),
             columns=cvec.get_feature_names())
cvec_table = cvec_table.transpose().transpose()
cvec_table.reset_index(drop=True)

Unnamed: 0,100,11,180k,200,2017,220k,605,700,ability,absorption,...,windows,work,working,works,world,writing,wuermli,year,years,york
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
dummies = pd.get_dummies(df_final[['location']])
df = pd.concat([dummies, cvec_table,  df_final[['job','company','description','location','high_salary','salary_clean','analyst','statistician','engineer','machine_learning','research','mid_level','entry_level','senior_level']]], axis=1)
X = pd.concat([dummies, cvec_table, df_final[['analyst','statistician','engineer','machine_learning','research','mid_level','entry_level','senior_level']]], axis=1)
X.drop(X.index[177],inplace=True)
X.fillna(0,inplace=True)
y = list(df_final.high_salary.values)
features = X.columns

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

model = RandomForestClassifier().fit(X_train,y_train)
model.score(X_train,y_train)

0.97540983606557374

In [23]:
model.score(X_test,y_test)

0.70491803278688525

In [24]:
model.score(X,y)

0.88524590163934425

In [25]:
## Create a df of the features and their scores, descending
feature_importances = sorted(zip(model.feature_importances_,features), key=lambda pair: pair[0], reverse=True)
feature_importances = pd.DataFrame(feature_importances,columns=['importance','feature'])
feature_importances

Unnamed: 0,importance,feature
0,0.075781,analyst
1,0.046352,location_New York
2,0.026553,looking
3,0.024617,development
4,0.020836,scientist
5,0.020542,analytics
6,0.018536,learning
7,0.017325,research
8,0.016607,data
9,0.014140,consulting


In [26]:
## I don't want to use all of the features, just the important ones
## even though most have low scores ... 
features = list(feature_importances.feature[0:45])

X = X[features]
y = list(df_final.high_salary.values)

features = X.columns

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

model = RandomForestClassifier().fit(X_train,y_train)
model.score(X_train,y_train)

0.98360655737704916

In [27]:
model.score(X_test,y_test)

0.78688524590163933

In [28]:
model.score(X,y)

0.91803278688524592

In [None]:
## Interesting that the score went up for the full dataset from the test set