# EXPORTS JOBS POSTED ON [web.byui.edu/StudentEmployment/](https://web.byui.edu/StudentEmployment/) INTO A CSV FILE

In [1]:
import http.client
import json
import pandas as pd
from bs4 import BeautifulSoup

## REQUEST DATA FROM API ENDPOINT

In [10]:
# Fetch the job postings as JSON from the Student Employment API and load
# them into a DataFrame.
# The timeout keeps the cell from hanging forever on an unresponsive server.
conn = http.client.HTTPSConnection("web.byui.edu", timeout=30)

try:
    conn.request("GET", "/studentemployment/api/jobs")

    res = conn.getresponse()

    # Fail here with the HTTP status rather than with an opaque JSON parse
    # error further down when the server answers with an error page.
    if res.status != 200:
        raise RuntimeError(
            f"GET /studentemployment/api/jobs failed: {res.status} {res.reason}"
        )

    # Raw response body (bytes); must be read before the connection is closed.
    data = res.read()
finally:
    # Always release the socket, even when the request or the read fails.
    conn.close()

info = data.decode("utf-8")

responseObject = json.loads(info)

data_jobs = pd.DataFrame(responseObject)

In [None]:
# Bare expression: in a notebook this renders the DataFrame built from the API response.
data_jobs

In [None]:
# Inspect the type of `data`, the raw body returned by `res.read()` (before decoding).
type(data)

## DISPLAY DATA RECEIVED

In [None]:
# Lift the column limit so that every column is shown when the DataFrame
# is rendered below.
pd.set_option('display.max_columns', None)

data_jobs

## SIMPLE OVERVIEW

In [None]:
# Headline figures for the postings: number of distinct job IDs, the title
# of the first row carrying the highest pay rate, and how many titles
# mention 'Online'.
job_count = data_jobs.jobID.nunique()
total_rows = data_jobs.shape[0]

# Compute the maximum once and reuse it for both the lookup and the message.
highest_rate = data_jobs.payRate.max()
highest_paid_title = data_jobs[data_jobs.payRate == highest_rate]['title'].iloc[0]

# na=False counts a missing title as "not online" instead of producing a
# NaN mask entry, which boolean indexing would reject.
online_count = int(data_jobs["title"].str.contains('Online', na=False).sum())

print(f'''
    THERE ARE {job_count} JOBS.
    HIGHEST PAY JOB:\'{highest_paid_title}\' WITH {highest_rate} DOLLARS AN HOUR.
    ONLINE JOBS: {online_count} OUT OF {total_rows}.
''')

## CLEAN THE DATA

In [None]:
# Clean the raw postings in place: parse the date columns, drop the columns
# that are not used afterwards, and turn each HTML description into a list
# of paragraph texts.
# NOTE: this cell mutates `data_jobs` (in-place drop), so running it a second
# time raises a KeyError for the columns that are already gone.
col_dates = ['dateUpdated','startDate','endDate','beginningDate','recruitingStartDate']

data_jobs[col_dates] = data_jobs[col_dates].astype('datetime64[ns]')

columns_to_drop = [
    'jobID', # not needed for EDA
    # 'description', # not needed for EDA
    # 'summary', # not needed for EDA
    'displayJob', # single boolean
    'dateUpdated','startDate','endDate', # not needed for display
    'approximateHoursPerWeek', # not consistent
    'positionsAllocated', # not relevant
    'positionsAvailble', # not relevant
    # 'workSchedule', # not needed for EDA
    'requireResume', # not relevant
    'limitApplicants', # not relevant
    'limitNumber', # not relevant
    'applicants', # empty
    'jobQuestions', # empty
    'isOnline', # not accurate
    'allowOnline', # not accurate
    'jobMajors' # not relevant
    ]

data_jobs.drop(columns_to_drop, axis=1, inplace=True)


def _description_paragraphs(html):
    """Return the stripped, non-empty text of every <p> element in *html*.

    Values that are not strings (e.g. a missing description) yield an
    empty list instead of raising inside the parser.
    """
    if not isinstance(html, str):
        return []
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed and emits a warning, so results could
    # differ between environments.
    soup = BeautifulSoup(html, 'html.parser')
    stripped = (p.text.strip() for p in soup.find_all('p'))
    return [text for text in stripped if text]


data_jobs['description'] = data_jobs['description'].apply(_description_paragraphs)

## OPTIONAL: Save as a CSV

In [None]:
# data_jobs.to_csv('StudentEmployment.csv')

## FILTER OUT THE JOBS THAT HAVE 'Online','Custodian', and 'TA' IN THEIR TITLE

In [None]:
# Filtered jobs: keep only postings whose title mentions none of the terms
# below (not online, not custodian, not TA), sorted by pay rate, highest first.

remove = ['Online','Custodian','TA']

# The terms are joined into one regex alternation; the match is a
# case-sensitive substring search.
# NOTE(review): 'TA' therefore also matches inside longer upper-case words
# (e.g. 'STAFF') -- confirm whether a whole-word match is intended.
# na=False treats a missing title as "no match" so the mask stays boolean.
unwanted = data_jobs["title"].str.contains('|'.join(remove), na=False)

# reset_index(drop=True) renumbers the rows without first materialising the
# old index as a column that then has to be dropped again.
data_filtered = (
    data_jobs[~unwanted]
    .sort_values('payRate', ascending=False)
    .reset_index(drop=True)
)

print(f'{data_filtered.shape[0]} JOBS OUT OF {data_jobs.shape[0]} POSTED JOBS.')

# Show every row when the filtered table is rendered below.
pd.set_option('display.max_rows', None)

data_filtered

## OPTIONAL: Save as a CSV

In [None]:
# data_filtered.to_csv('StudentEmployment_filtered.csv')