# Data Cleanup
Now let's begin the process of cleaning up the string data we scraped from Glassdoor.

In [120]:
# libraries needed
import pandas as pd
import numpy as np
from datetime import datetime

# Show up to 100 rows when a DataFrame or Series is displayed in the notebook.
pd.set_option('display.max_rows', 100)
# Turn off pandas' chained-assignment (SettingWithCopy) warning for this notebook.
pd.options.mode.chained_assignment = None

In [None]:
# get some information on the saved data
# Load the scraped CSV into a DataFrame and preview its first 10 rows.
file_name = r"" # enter the filepath between the quotes
data = pd.read_csv(file_name)
data.head(10)

In [119]:
# We see some NaN values, so let's confirm they are recognized as nulls.
# info() prints each column's non-null count and dtype.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 700 entries, 0 to 699
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Job Title          700 non-null    object 
 1   Salary Minimum     630 non-null    object 
 2   Salary Maximum     630 non-null    object 
 3   Salary Average     630 non-null    object 
 4   Rating             657 non-null    float64
 5   Company Name       700 non-null    object 
 6   Location           700 non-null    object 
 7   Size               670 non-null    object 
 8   Founded            564 non-null    float64
 9   Type of ownership  670 non-null    object 
 10  Industry           638 non-null    object 
 11  Sector             638 non-null    object 
 12  Revenue            670 non-null    object 
dtypes: float64(2), object(11)
memory usage: 71.2+ KB


Above, we can see that some columns are fully populated, such as 'Job Title', but others have nulls. I do see some blank values where there should be NaNs, so I will run through the document and replace empty cells with NaN. I will want to convert the year founded into years existing. I have to clean up some duplicates I see, convert the salary to a float and remove non-numeric characters, and clean up some trailing characters at the end of the company names (/n#). It would be useful to have the location split into city and state. I may want to clean up the string for type of ownership into just Private vs Public, but I'll run through it to confirm. Finally, I will have to review the revenue data and convert the Unknown/NA values into nulls, then determine if the information can be used. **A significant and fun list!**

In [None]:
# Replace empty cells with NaN.
# r'' = raw string, ^ = start of the string, \s* = zero or more whitespace characters, $ = end of the string,
# so the pattern only matches cells that are empty or contain nothing but whitespace.
data = data.replace(r'^\s*$', np.nan, regex=True) # regex=True makes replace() treat the first argument as a pattern matched against each cell
data.head(10)

In [None]:
# Count the null values in each column.
data.isnull().sum()

In [None]:
# Remove rows where any salary field is null, since a posting without salary
# data doesn't help us. 'Salary Maximum' is part of the subset because the
# string clean-up applied to it later calls str methods on every value and
# would fail on a NaN.
salary_columns = ['Salary Minimum', 'Salary Maximum', 'Salary Average']
data_cleaned = data.dropna(axis=0, subset=salary_columns)
data_cleaned.isnull().sum()

In [None]:
# separate hourly rows from salary rows
# Take an explicit copy so the columns added below land on an independent
# frame (wrapping a DataFrame in pd.DataFrame() does not copy its data).
data_cleaned = data_cleaned.copy()
# Flag rows whose average salary text contains '/hr' (case-insensitive):
# 1 = hourly rate, 0 = anything else.
is_hourly = data_cleaned["Salary Average"].str.contains('/hr', case=False, regex=False)
data_cleaned['Average Hourly Rate'] = is_hourly.astype(int)
data_cleaned = data_cleaned.reset_index(drop=True)
data_cleaned.head(20)

We can see that there appears to be average hourly rate data in here. Let's split those rows out so we can compare the hourly rates to the salary rates given when we analyze the data.

In [None]:
# Clean up the Salary min/max/average strings so only the numeric text is left.
# Removed: '$', ',', the 'K' suffix, the '/hr' marker and everything after the
# '/' in the average (the original note mentions a '/yr (est.)' style suffix).
# The values stay strings here; the hourly-to-yearly conversion reads them as text.

# First, the average salary: keep the text before the first '/', then drop
# the currency symbol and thousands separators.
salary_avg = data_cleaned['Salary Average'].str.split('/').str[0]
for token in ('$', ','):
    salary_avg = salary_avg.str.replace(token, '', regex=False)
data_cleaned['Salary Average'] = salary_avg.str.strip()

# Now the minimum and maximum salary. Both columns get the same treatment
# so an hourly '/hr' marker is removed from the maximum as well as the minimum.
for column in ('Salary Minimum', 'Salary Maximum'):
    cleaned = data_cleaned[column]
    for token in ('$', 'K', '/hr'):
        cleaned = cleaned.str.replace(token, '', regex=False)
    data_cleaned[column] = cleaned.str.strip()
data_cleaned.tail(10)

Now let's convert the hourly values to yearly salaries.

In [None]:
# convert hourly salary to yearly salary
# convert $/hr to $/year and replace cells with the yearly estimate
def hr_to_year(i, hours_per_week=40, weeks_per_year=52):
    """Convert an hourly rate to an estimated yearly salary.

    Args:
        i: The hourly rate as a number or numeric string (e.g. "25.50").
        hours_per_week: Working hours assumed per week.
        weeks_per_year: Working weeks assumed per year.

    Returns:
        The yearly salary, rounded to the nearest whole number, as an int.

    Raises:
        ValueError: If ``i`` cannot be parsed as a number.
    """
    # Keep the cents while multiplying; truncating the rate to a whole
    # number first would understate the yearly figure (25.50 -> 25).
    hourly_rate = float(i)
    return int(round(hourly_rate * hours_per_week * weeks_per_year))

# Convert only the rows flagged as hourly in 'Average Hourly Rate'; relying on
# a '.' in the text would miss whole-number hourly rates and could hit yearly
# figures that happen to contain a decimal point. Other rows are kept unchanged.
data_cleaned["Converted Salary"] = [
    hr_to_year(average) if hourly_flag == 1 else average
    for average, hourly_flag in zip(data_cleaned["Salary Average"], data_cleaned["Average Hourly Rate"])
]
data_cleaned[["Salary Average","Converted Salary"]].head(10)


With the hourly rate converted to yearly salary and a column marking which rows had an hourly value, we can now move on to some simpler tasks, such as converting data types and cleaning up some strings.

In [None]:
# Keep only the part of each company name that comes before the first newline;
# names without a newline are left as they are.
company_names = data_cleaned["Company Name"]
data_cleaned['Company Name'] = company_names.apply(lambda name: name.partition('\n')[0])
data_cleaned['Company Name'].head(15)


In [123]:
# split location to city and state


#str(string_check.iloc[0]).split(',')[1]
#a = 0
#for i in DA_data_cleaned["Location"]:
#    if ',' in str(DA_data_cleaned["Location"].iloc[i]):
#        DA_data_cleaned["City"] = str(DA_data_cleaned["Location"].iloc[i]).split(',')[0]
#        DA_data_cleaned["State"] = str(DA_data_cleaned["Location"].iloc[i]).split(',')[1]
#        a += 1
#    else:
#        DA_data_cleaned["City"] = "Remote"
#        DA_data_cleaned["State"] = "Remote"
#        a +=1

locations = data_cleaned["Location"]
# City: the text before the first ', '.
data_cleaned["City"] = locations.apply(lambda x: x.split(', ')[0])
# State: the text after the last comma, trimmed. A location without a comma
# (e.g. "Remote") is kept whole, so no placeholder value is needed.
states = locations.apply(lambda x: x.split(',')[-1].strip())
# 'Manhattan' shows up in the state position for some rows; map it to NY.
data_cleaned["State"] = states.apply(lambda x: 'NY' if x.lower() == 'manhattan' else x)
data_cleaned['State'].value_counts()


Remote    87
NY        74
CO        71
IL        54
CA        45
TX        38
PA        32
VA        32
GA        29
HI        25
FL        16
MO        15
OH        15
NJ        14
DC        13
MN        11
TN         9
NC         9
MA         8
MI         7
AZ         6
RI         6
NE         5
IN         3
UT         2
SC         2
CT         1
MD         1
Name: State, dtype: int64

In [None]:
# convert year founded to years in existence
currentyear = datetime.now().year
# NaN never compares equal to anything (not even itself), so a missing
# founding year cannot be detected with ==. Subtract first, then give the
# rows that are still NaN the -1 "unknown" sentinel.
company_age = currentyear - data_cleaned["Founded"]
data_cleaned['Company Age (years)'] = company_age.fillna(-1)
data_cleaned

In [None]:
# Group jobs under archetypes (junior vs. senior, analyst vs. business analyst)
data_cleaned["Job Title"].value_counts() # count how many times each job title occurs

We can see from above that there are already some redundancies due to small changes in the titles (e.g., Sr. Data Analyst). Let's group the jobs together with a function that searches the titles and combines everything under similar banners (manager, analyst, specialist, etc.). Ken Jee created some nice functions that will serve us well, so if you wish to see more, here is the link: https://youtu.be/QWgg4w1SpJ8.

In [None]:
#define functions to bin jobs into groups
def title_condencer(title):
    """Bin a job title into a coarse group by keyword.

    The title is lower-cased and checked against an ordered list of
    keywords; the label of the first keyword found is returned.

    Args:
        title: The job title string.

    Returns:
        One of 'data scientist', 'data engineer', 'machine learning',
        'analyst', 'director', 'specialist', 'business-based', or
        'Unbinned' when no keyword matches.
    """
    lowered = title.lower()
    # Order matters: earlier entries win when a title contains several
    # keywords. 'scientist' already covers 'data scientist', so no separate
    # check for the longer phrase is needed.
    # NOTE(review): 'manager' maps to the label 'director' - confirm this is
    # intended before relying on the label downstream.
    keyword_bins = (
        ('scientist', 'data scientist'),
        ('data engineer', 'data engineer'),
        ('machine learning', 'machine learning'),
        ('analyst', 'analyst'),
        ('manager', 'director'),
        ('specialist', 'specialist'),
        ('business', 'business-based'),
    )
    for keyword, label in keyword_bins:
        if keyword in lowered:
            return label
    return 'Unbinned'

#identify if there is a seniority or level flag
def seniority(title):
    """Classify a job title as 'senior', 'junior' or 'na'.

    The title is lower-cased and searched for seniority markers as plain
    substrings; senior markers are checked before junior ones.

    Args:
        title: The job title string.

    Returns:
        'senior' if any senior marker is found, otherwise 'junior' if any
        junior marker is found, otherwise 'na'.
    """
    lowered = title.lower()
    # 'sr' and 'jr' also cover the dotted forms 'sr.' and 'jr.'.
    senior_markers = ('sr', 'senior', 'lead', 'principal', 'iii')
    junior_markers = ('jr', 'junior')
    # Substring matching: a marker inside a longer word also counts.
    if any(marker in lowered for marker in senior_markers):
        return 'senior'
    if any(marker in lowered for marker in junior_markers):
        return 'junior'
    return 'na'

In [None]:
# Bin every job title and check how many postings land in each group.
# The titles are cast to str first so the binning function always gets text.
title_strings = data_cleaned['Job Title'].to_numpy().astype(str)
data_cleaned['Job Title'] = title_strings
data_cleaned['Title Grouping'] = data_cleaned['Job Title'].map(title_condencer)
data_cleaned['Title Grouping'].value_counts()

In [None]:
# Tag each posting with the seniority level found in its job title,
# then count the postings per level.
seniority_levels = data_cleaned['Job Title'].map(seniority)
data_cleaned['Seniority Level'] = seniority_levels
data_cleaned['Seniority Level'].value_counts()

In [None]:
# input the filename you wish to save the information as
file = r"" #place the filepath between the quotes

# Write the cleaned data to a CSV file; index=False leaves the row index out of the file.
data_cleaned.to_csv(file, index = False)

In [None]:
# check for string answers for Ownership column - to do

In [None]:
# Replace string "unknown/Non-applicable" in revenue with NaN - to do