# Clean Dataset

In [1]:
import pandas as pd

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


## Read Data

In [2]:
# Load the raw job postings; every later cell works on this frame.
df_jobs = pd.read_csv(filepath_or_buffer='data/jobs.csv')

# Show the loaded frame (pandas truncates the display of large frames).
df_jobs

Unnamed: 0,job_id,company_name,title,description,max_salary,pay_period,location,company_id,med_salary,min_salary,...,compensation_type,normalized_salary,zip_code,company_description,company_state,company_country,company_city,company_zip_code,company_industries,company_employee_count
0,921716,Corcoran Sawyer Smith,Marketing Coordinator,Job descriptionA leading real estate firm in N...,20.0,HOURLY,"Princeton, NJ",2774458.0,,17.0,...,BASE_SALARY,38480.0,8540.0,With years of experience helping local buyers ...,NJ,US,Jersey City,07302,Real Estate,402.0
1,1829192,,Mental Health Therapist/Counselor,"At Aspen Therapy and Wellness , we are committ...",50.0,HOURLY,"Fort Collins, CO",,,30.0,...,BASE_SALARY,83200.0,80521.0,,,,,,,
2,10998357,The National Exemplar,Assitant Restaurant Manager,The National Exemplar is accepting application...,65000.0,YEARLY,"Cincinnati, OH",64896719.0,,45000.0,...,BASE_SALARY,55000.0,45202.0,"In April of 1983, The National Exemplar began ...",Ohio,US,Mariemont,45227,Restaurants,15.0
3,23221523,"Abrams Fensterman, LLP",Senior Elder Law / Trusts and Estates Associat...,Senior Associate Attorney - Elder Law / Trusts...,175000.0,YEARLY,"New Hyde Park, NY",766262.0,,140000.0,...,BASE_SALARY,157500.0,11040.0,"Abrams Fensterman, LLP is a full-service law f...",New York,US,Lake Success,11042,Law Practice,222.0
4,35982263,,Service Technician,Looking for HVAC service tech with experience ...,80000.0,YEARLY,"Burlington, IA",,,60000.0,...,BASE_SALARY,70000.0,52601.0,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123844,3906267117,Lozano Smith,Title IX/Investigations Attorney,Our Walnut Creek office is currently seeking a...,195000.0,YEARLY,"Walnut Creek, CA",56120.0,,120000.0,...,BASE_SALARY,157500.0,94595.0,"For more than 30 years, Lozano Smith has serve...",California,US,Fresno,93720,Law Practice,185.0
123845,3906267126,Pinterest,"Staff Software Engineer, ML Serving Platform",About Pinterest:\n\nMillions of people across ...,,,United States,1124131.0,,,...,,,,Pinterest's mission is to bring everyone the i...,California,US,San Francisco,0,Software Development,8667.0
123846,3906267131,EPS Learning,"Account Executive, Oregon/Washington",Company Overview\n\nEPS Learning is a leading ...,,,"Spokane, WA",90552133.0,,,...,,,99201.0,EPS Learning has partnered with educators for ...,MD,US,Bethesda,20814,Education Administration Programs,127.0
123847,3906267195,Trelleborg Applied Technologies,Business Development Manager,The Business Development Manager is a 'hunter'...,,,"Texas, United States",2793699.0,,,...,,,,Trelleborg Applied Technologies manufactures a...,Lancashire,GB,Rochdale,OL11 1TQ,Industrial Machinery Manufacturing,47.0


## Clean Data

In [3]:
'''
"formatted_experience_level" contains missing values.
Replace every missing entry with the literal label "Unknown".
'''
experience_level = df_jobs['formatted_experience_level']
df_jobs['formatted_experience_level'] = experience_level.fillna(value="Unknown")

In [4]:
'''
"remote_allowed" is either 1 or missing.
We assume a missing value means remote work is not allowed, i.e. 0.
'''
remote_flag = df_jobs['remote_allowed']
df_jobs['remote_allowed'] = remote_flag.fillna(value=0)

In [5]:
'''
For the location / company descriptor columns below, replace missing values
with the literal label "Unknown".

Note: "zip_code" is displayed as a number in the loaded frame, so after this
step it holds a mix of numbers and the string "Unknown".
'''
for column in [
    "zip_code",
    "company_country",
    "company_state",
    "company_city",
    "company_zip_code",
    "company_industries",
]:
    # Same per-column fill as before, just expressed once.
    df_jobs[column] = df_jobs[column].fillna("Unknown")

In [6]:
'''
Based on our exploratory data analysis, we unlabel (blank out) salary data for:
- Jobs below minimum wage, as they are bad data. <10K USD yearly is below the
  US minimum wage (see https://www.dol.gov/agencies/whd/minimum-wage).
- Jobs with a salary above 1M, as they are likely to be bad data.
- Jobs whose relative salary spread, (max - min) / min, exceeds 4, as they are
  likely to be highly inaccurate data.

Rows where a value needed for a check is missing are not flagged by that check,
because comparisons against NaN evaluate to False.
'''
min_plausible_salary = 1e4   # yearly USD lower bound
max_plausible_salary = 1e6   # yearly USD upper bound
max_relative_spread = 4      # upper bound on (max - min) / min

salaries = df_jobs['normalized_salary']
salary_spread_range = (
    df_jobs['max_salary']
    .sub(df_jobs['min_salary'])
    .div(df_jobs['min_salary'])
)

too_low = salaries < min_plausible_salary
too_high = salaries > max_plausible_salary
too_wide = salary_spread_range > max_relative_spread
bad_salaries = too_low | too_high | too_wide

# Blank out all four salary columns on the flagged rows.
salary_columns = ['normalized_salary', 'med_salary', 'max_salary', 'min_salary']
df_jobs.loc[bad_salaries, salary_columns] = None
print('Jobs with salaries we unlabelled:', bad_salaries.sum())

Jobs with salaries we unlabelled: 610


## Transform Columns

In [7]:
'''
Populate "location_state" from "location".

The candidate state is the text after the last comma in "location"
(e.g. "Princeton, NJ" -> "NJ"). It is kept only when it is one of the
two-letter codes in `us_states`; everything else becomes "Unknown":
- locations without a comma (e.g. "United States"),
- locations whose last token is not a listed code (e.g. "Texas, United States"),
- missing locations (previously these raised a TypeError inside the lambda).
'''

us_states = [
    'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'FL', 'GA',
    'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD',
    'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ',
    'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC',
    'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY'
]

location = df_jobs['location']

# Only comma-separated locations can yield a state; na=False makes missing
# locations count as "no comma" instead of propagating NaN into the mask.
has_comma = location.str.contains(',', regex=False, na=False)

# Text after the last comma, with surrounding whitespace removed.
last_token = location.str.rsplit(',', n=1).str[-1].str.strip()

# Keep the token only where it is a recognised state code.
is_us_state = has_comma & last_token.isin(us_states)
df_jobs['location_state'] = last_token.where(is_us_state, "Unknown")

df_jobs['location_state'].value_counts(dropna=False)

location_state
Unknown    19753
CA         11484
TX         10271
NY          6044
FL          5907
NC          4927
IL          4480
PA          4133
VA          3660
MA          3489
OH          3421
GA          3420
NJ          3286
MI          2857
WA          2708
AZ          2507
CO          2318
MD          1974
MO          1922
TN          1885
WI          1849
MN          1849
IN          1808
SC          1539
CT          1191
KY          1179
OR          1177
LA          1106
AL          1004
IA           995
UT           968
KS           931
NV           907
OK           794
AR           665
NE           591
NH           559
NM           499
HI           425
WV           416
ID           413
MS           387
ME           377
DE           320
RI           306
MT           236
ND           235
AK           206
VT           181
SD           165
WY           125
Name: count, dtype: int64

## Filter Data

In [None]:
'''
Column selection for the salary-prediction problem.

Raw features (X) we will try to use to predict salaries:
 1. job_id (ID column; kept separately from FEATURE_COLUMNS)
 2. title
 3. location
 4. location_state
 5. description
 6. formatted_work_type
 7. formatted_experience_level
 8. remote_allowed
 9. company_industries
10. company_country
11. company_state
12. company_city
13. company_description
14. company_employee_count

Target variables (Y) we want to predict:
1. normalized_salary (main)
2. min_salary
3. max_salary
4. med_salary
5. pay_period
'''
FEATURE_COLUMNS = [
    # posting-level features
    'title',
    'location',
    'location_state',
    'description',
    'formatted_work_type',
    'formatted_experience_level',
    'remote_allowed',
    # company-level features
    'company_industries',
    'company_country',
    'company_state',
    'company_city',
    'company_description',
    'company_employee_count',
]

TARGET_COLUMNS = [
    'normalized_salary',
    'min_salary',
    'max_salary',
    'med_salary',
    'pay_period',
]

'''
As discovered during EDA, we filter OUT:
- duplicated jobs (based on feature columns, excluding job_id)
- jobs with non-USD currency

We keep jobs without salary data because we can potentially label them.
'''

# True for the first occurrence of each distinct combination of feature values.
non_duplicated = ~df_jobs.duplicated(subset=FEATURE_COLUMNS, keep='first')
# Rows with no currency recorded are kept alongside rows explicitly in USD.
only_usd = df_jobs['currency'].isna() | (df_jobs['currency'] == 'USD')

# For each filter, print out number of jobs filtered.
# The loop variable is deliberately not named `filter`: in a notebook that
# would shadow the builtin for every cell executed afterwards.
keep_masks = {"non_duplicated": non_duplicated, "only_usd": only_usd}
for name, keep_mask in keep_masks.items():
    print(f"NOT {name} jobs: {len(keep_mask) - keep_mask.sum()}")

# Apply both filters and keep only the ID, feature and target columns,
# selecting rows and columns in a single .loc call.
df_jobs_clean = df_jobs.loc[
    non_duplicated & only_usd,
    ['job_id'] + FEATURE_COLUMNS + TARGET_COLUMNS,
]

# Breakdown unlabeled VS labelled jobs
unlabeled = df_jobs_clean['normalized_salary'].isna()
print(f"Unlabelled jobs: {unlabeled.sum()}")
print(f"Labelled jobs: {len(df_jobs_clean) - unlabeled.sum()}")

df_jobs_clean

NOT non_duplicated jobs: 4613
NOT only_usd jobs: 15
Unlabelled jobs: 84517
Labelled jobs: 34705


Unnamed: 0,job_id,title,location,location_state,description,formatted_work_type,formatted_experience_level,remote_allowed,company_industries,company_country,company_state,company_city,company_description,company_employee_count,normalized_salary,min_salary,max_salary,med_salary,pay_period
0,921716,Marketing Coordinator,"Princeton, NJ",NJ,Job descriptionA leading real estate firm in N...,Full-time,Unknown,0.0,Real Estate,US,NJ,Jersey City,With years of experience helping local buyers ...,402.0,38480.0,17.0,20.0,,HOURLY
1,1829192,Mental Health Therapist/Counselor,"Fort Collins, CO",CO,"At Aspen Therapy and Wellness , we are committ...",Full-time,Unknown,0.0,Unknown,Unknown,Unknown,Unknown,,,83200.0,30.0,50.0,,HOURLY
2,10998357,Assitant Restaurant Manager,"Cincinnati, OH",OH,The National Exemplar is accepting application...,Full-time,Unknown,0.0,Restaurants,US,Ohio,Mariemont,"In April of 1983, The National Exemplar began ...",15.0,55000.0,45000.0,65000.0,,YEARLY
3,23221523,Senior Elder Law / Trusts and Estates Associat...,"New Hyde Park, NY",NY,Senior Associate Attorney - Elder Law / Trusts...,Full-time,Unknown,0.0,Law Practice,US,New York,Lake Success,"Abrams Fensterman, LLP is a full-service law f...",222.0,157500.0,140000.0,175000.0,,YEARLY
4,35982263,Service Technician,"Burlington, IA",IA,Looking for HVAC service tech with experience ...,Full-time,Unknown,0.0,Unknown,Unknown,Unknown,Unknown,,,70000.0,60000.0,80000.0,,YEARLY
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123844,3906267117,Title IX/Investigations Attorney,"Walnut Creek, CA",CA,Our Walnut Creek office is currently seeking a...,Full-time,Mid-Senior level,0.0,Law Practice,US,California,Fresno,"For more than 30 years, Lozano Smith has serve...",185.0,157500.0,120000.0,195000.0,,YEARLY
123845,3906267126,"Staff Software Engineer, ML Serving Platform",United States,Unknown,About Pinterest:\n\nMillions of people across ...,Full-time,Mid-Senior level,1.0,Software Development,US,California,San Francisco,Pinterest's mission is to bring everyone the i...,8667.0,,,,,
123846,3906267131,"Account Executive, Oregon/Washington","Spokane, WA",WA,Company Overview\n\nEPS Learning is a leading ...,Full-time,Mid-Senior level,1.0,Education Administration Programs,US,MD,Bethesda,EPS Learning has partnered with educators for ...,127.0,,,,,
123847,3906267195,Business Development Manager,"Texas, United States",Unknown,The Business Development Manager is a 'hunter'...,Full-time,Unknown,1.0,Industrial Machinery Manufacturing,GB,Lancashire,Rochdale,Trelleborg Applied Technologies manufactures a...,47.0,,,,,


In [9]:
# Persist the cleaned frame; index=False keeps the row index out of the file.
df_jobs_clean.to_csv(path_or_buf='data/jobs_clean.csv', index=False)