# Get Salary Information from the Indeed Dataset

I want to analyse whether companies that advertise jobs with a salary have a smaller gender pay gap than companies that don't list a salary (for example by saying that it is "negotiable" or "dependent on experience").

First we need to get the salary listing information. This is not provided in the Indeed dataset but is contained on their website. So we need to do some web scraping.

## Import the Indeed Data

In [None]:
import requests
from bs4 import BeautifulSoup
import time

In [None]:
# To get txt data in dataframe
import json
from pandas.io.json import json_normalize #package for flattening json in pandas df

def txt_to_df(txt_file_loc):
    """Load a JSON text file and flatten its ``results`` entry into a DataFrame.

    Args:
        txt_file_loc: Path of a text file holding a JSON object that has a
            top-level ``"results"`` key.

    Returns:
        The DataFrame produced by ``json_normalize`` for the ``"results"``
        value.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file content is not valid JSON.
        KeyError: If the parsed object has no ``"results"`` key.
    """
    # The context manager closes the handle even when parsing fails; the
    # previous open(...).read() left it to the garbage collector.
    with open(txt_file_loc, "r") as txt_file:
        parsed = json.load(txt_file)
    return json_normalize(parsed['results'])

# Load the full-time and part-time job ads returned by the Indeed job search API.
fulltime_df = txt_to_df("indeed_job_search_api/job_ad_fulltime_with_summary.txt")
parttime_df = txt_to_df("indeed_job_search_api/job_ad_parttime_with_summary.txt")
# Keep the earlier (misspelled) name bound so any cell that still uses it works.
partime_df = parttime_df

## Look up URLs of job listings (later we'll extract the salary information)

In [None]:
def getSoup(df, timeout=30):
    """Download the page behind every row's ``url`` and parse it.

    Args:
        df: DataFrame whose rows expose a ``url`` attribute.
        timeout: Seconds to wait for each HTTP response before giving up
            on that URL.

    Returns:
        dict mapping the row index to the parsed ``BeautifulSoup`` page, for
        every row whose request came back with status code 200. Rows whose
        request failed or returned another status are reported on stdout
        and left out.
    """
    salarylisting = {}
    for i, j in df.iterrows():
        print(i, j.url)
        try:
            jobPage = requests.get(j.url, timeout=timeout)
        except requests.RequestException as err:
            # One unreachable URL must not throw away the pages already scraped.
            print("Eek! request failed for url", j.url, ":", err)
        else:
            if jobPage.status_code == 200:
                salarylisting[i] = BeautifulSoup(jobPage.text, 'lxml')
            else:
                print("Eek! status_code=", jobPage.status_code, " for url", j.url)
        # Pause after every request, successful or not, to go easy on the server.
        time.sleep(1)
    return salarylisting
        
# Fetch and parse the page of every full-time and part-time listing.
# getSoup sleeps one second after each request, so each call takes at least
# one second per row.
fullSoup = getSoup(fulltime_df)
partSoup = getSoup(parttime_df)

## Filter for salary information

In [None]:
def stripSoupForSalary(listing):
    """Pull the salary text out of each parsed job page.

    Args:
        listing: dict mapping a key to a parsed ``BeautifulSoup`` page.

    Returns:
        dict mapping the same keys to the stripped text of the page's
        ``<span class="no-wrap">`` element. Pages that do not contain exactly
        one such span are left out.
    """
    selectSalaryListing = {}
    for i, soup in listing.items():
        # find_all is the current name of the deprecated findAll alias.
        salaryInfo = soup.find_all("span", {"class": "no-wrap"})
        # Zero or several matches are ambiguous, so only a single match is kept.
        if len(salaryInfo) == 1:
            selectSalaryListing[i] = salaryInfo[0].text.strip()
    return selectSalaryListing

# Reduce each scraped page to its salary text; pages without exactly one
# "no-wrap" span do not appear in the result.
fullSalaryListing = stripSoupForSalary(fullSoup)
partSalaryListing = stripSoupForSalary(partSoup)

## Save to file

In [None]:
# Serialise each salary lookup to its own JSON file in the working directory.
with open('selectSalaryListingFullTime.json', 'w') as fp:
    fp.write(json.dumps(fullSalaryListing))

with open('selectSalaryListingPartTime.json', 'w') as fp:
    fp.write(json.dumps(partSalaryListing))