## __Data cleaning__


In [14]:
# Cleaning steps covered in this notebook:
#   1. Salary Estimate parsing
#   2. Company name parsing
#   3. State name parsing
#   4. Age of company
#   5. Job description (keyword) parsing

In [1]:
import pandas as pd 

In [2]:
# Load the Glassdoor job postings; the path is relative to the notebook's
# working directory.
df = pd.read_csv(filepath_or_buffer="dataset/glassdoor_jobs.csv")

#### **Salary Estimate parsing**

In [3]:
# Keep only the postings that carry a salary estimate ("-1" is the placeholder
# for a missing one). `.copy()` turns the filtered result into an independent
# frame, so the column assignments below do not write into a slice of the
# original frame (the pattern that raises pandas' SettingWithCopyWarning).
df = df[df["Salary Estimate"] != "-1"].copy()

# Reduce the raw text (e.g. "$53K-$91K (Glassdoor est.)") to a bare "min-max"
# range such as "53-91".
salary = (
    df["Salary Estimate"]
    .str.split("(").str[0]                                       # drop the "(...)" suffix
    .str.replace("K", "", regex=False)                           # thousand marker
    .str.replace("$", "", regex=False)                           # currency sign
    .str.lower()
    .str.replace("employer provided salary:", "", regex=False)   # leading label
    .str.replace("per hour", "", regex=False)                    # trailing unit
    .str.strip()
)

# Split the range on "-" and expose both ends, plus their midpoint, as features.
salary_bounds = salary.str.split("-", expand=True)
df["min_salary"] = salary_bounds[0].str.strip().astype("int64")
df["max_salary"] = salary_bounds[1].str.strip().astype("int64")
df["avg_salary"] = (df["min_salary"] + df["max_salary"]) / 2

# NOTE(review): "per hour" estimates have only their unit text removed, so
# those rows hold raw hourly figures while the rest hold the number that
# preceded "K". Nothing in this cell puts the two on a common scale - confirm
# that a later step does.

In [4]:
# Preview the first rows to check the new min/max/avg salary columns.
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Revenue,Competitors,min_salary,max_salary,avg_salary
0,0,Data Scientist,$53K-$91K (Glassdoor est.),"Data Scientist\nLocation: Albuquerque, NM\nEdu...",3.8,Tecolote Research\n3.8,"Albuquerque, NM","Goleta, CA",501 to 1000 employees,1973,Company - Private,Aerospace & Defense,Aerospace & Defense,$50 to $100 million (USD),-1,53,91,72.0
1,1,Healthcare Data Scientist,$63K-$112K (Glassdoor est.),What You Will Do:\n\nI. General Summary\n\nThe...,3.4,University of Maryland Medical System\n3.4,"Linthicum, MD","Baltimore, MD",10000+ employees,1984,Other Organization,Health Care Services & Hospitals,Health Care,$2 to $5 billion (USD),-1,63,112,87.5
2,2,Data Scientist,$80K-$90K (Glassdoor est.),"KnowBe4, Inc. is a high growth information sec...",4.8,KnowBe4\n4.8,"Clearwater, FL","Clearwater, FL",501 to 1000 employees,2010,Company - Private,Security Services,Business Services,$100 to $500 million (USD),-1,80,90,85.0
3,3,Data Scientist,$56K-$97K (Glassdoor est.),*Organization and Job ID**\nJob ID: 310709\n\n...,3.8,PNNL\n3.8,"Richland, WA","Richland, WA",1001 to 5000 employees,1965,Government,Energy,"Oil, Gas, Energy & Utilities",$500 million to $1 billion (USD),"Oak Ridge National Laboratory, National Renewa...",56,97,76.5
4,4,Data Scientist,$86K-$143K (Glassdoor est.),Data Scientist\nAffinity Solutions / Marketing...,2.9,Affinity Solutions\n2.9,"New York, NY","New York, NY",51 to 200 employees,1998,Company - Private,Advertising & Marketing,Business Services,Unknown / Non-Applicable,"Commerce Signals, Cardlytics, Yodlee",86,143,114.5


In [5]:
# As shown above, a salary estimate may be quoted "Per Hour", be labelled
# "Employer Provided Salary:", or simply be a per-annum figure. Record the
# first two cases as 0/1 indicator columns:
#   hourly            -> 1 if the text contains "per hour", 0 otherwise
#   employer_provided -> 1 if the text contains "employer provided salary:", 0 otherwise
# Matching is done on the lower-cased text, so it is case-insensitive.
salary_text = df["Salary Estimate"].str.lower()
df["hourly"] = salary_text.str.contains("per hour", regex=False).astype("int64")
df["employer_provided"] = (
    salary_text.str.contains("employer provided salary:", regex=False).astype("int64")
)
df["employer_provided"].unique()

array([0, 1])

In [31]:
# df.head()

#### **Company Name parsing**

In [6]:
# A negative "Rating" marks a company without a rating; count those rows
# (11 in the current dataset). Summing the boolean mask counts the True values
# directly, and as the cell's only expression the count is what gets displayed.
(df["Rating"] < 0).sum()

11

In [7]:
# "Company Name" holds the name followed by a newline and further text
# (e.g. "Tecolote Research\n3.8"). Keep only the part before the first newline;
# rows with a negative rating are copied through untouched.
df["company_name"] = [
    raw_name if rating < 0 else raw_name.split("\n")[0]
    for raw_name, rating in zip(df["Company Name"], df["Rating"])
]

#### **State field parsing**

In [8]:
# The state is the last comma-separated piece of "City, ST". Strip the blank
# that follows the comma so values read "NM" rather than " NM". Taking the last
# piece (instead of index 1) also avoids an IndexError on a location without a
# comma and still returns the trailing piece when there are several commas.
df["state_name"] = df["Location"].str.split(",").str[-1].str.strip()

# 1 when the job location string is identical to the headquarters string,
# 0 otherwise.
# NOTE(review): despite the column name this compares the full
# "Location" text (city and state), not just the state - confirm that an
# exact-location match is what downstream code expects.
df["same_state_as_hq"] = (df["Location"] == df["Headquarters"]).astype("int64")
df["same_state_as_hq"].value_counts()

1    414
0    328
Name: same_state_as_hq, dtype: int64

#### **Age of company parsing**

In [9]:
# Company age in years, measured against the fixed reference year 2020.
# Negative "Founded" values are kept unchanged instead of being turned into
# an age (they surface as -1 in the counts below).
founded = df["Founded"]
df["age"] = founded.where(founded < 0, 2020 - founded)
df["age"].value_counts()

-1      50
 10     32
 12     31
 24     27
 14     24
        ..
 121     1
 118     1
 106     1
 103     1
 276     1
Name: age, Length: 102, dtype: int64

#### **Parsing keywords in Job Description** 

In [None]:
# Flag (1/0) whether each job description mentions a given tool.
# Each output column maps to a case-insensitive regular expression:
#   * "excel" and "aws" are matched as whole words, so "excellent" or "laws"
#     no longer count as a hit;
#   * R Studio is matched as "r studio", "r-studio" or "rstudio", with a word
#     boundary in front so text such as "our studio" is not a hit;
#   * the remaining keywords stay plain substring matches, so variants such as
#     "pyspark", "mysql" or "python3" are still caught.
KEYWORD_PATTERNS = {
    "python_yn": r"python",
    "rstudio_yn": r"\br[\s-]?studio\b",
    "spark_yn": r"spark",
    "aws_yn": r"\baws\b",
    "excel_yn": r"\bexcel\b",
    "sql_yn": r"sql",
    "tableau_yn": r"tableau",
}

job_descriptions = df["Job Description"]
for flag_column, pattern in KEYWORD_PATTERNS.items():
    # na=False: a missing description counts as "keyword not present".
    df[flag_column] = (
        job_descriptions
        .str.contains(pattern, case=False, regex=True, na=False)
        .astype("int64")
    )