# Data Cleanup
Now let's begin the process of cleaning up the string data we scraped from Glassdoor.

In [9]:
# libraries needed
import pandas as pd
import numpy as np
from datetime import datetime

pd.set_option('display.max_rows', 100)
pd.options.mode.chained_assignment = None

In [10]:
# get some information on the saved data
file_name = r"C:\Users\Tineash\Projects\Glassdoor_webscraper\Data\Data_analyst_dataset.csv" # enter the filepath between the quotes
data = pd.read_csv(file_name)
data.head(10)

Unnamed: 0,Job Title,Salary Minimum,Salary Maximum,Salary Average,Rating,Company Name,Location,Size,Founded,Type of ownership,Industry,Sector,Revenue
0,Data Analyst + Apprentice (Entry-Level),$35K,$45K,"$40,000 /yr (est.)",3.3,New Apprenticeship\n3.3,"Raleigh, NC",1 to 50 Employees,,Company - Private,,,Unknown / Non-Applicable
1,Data Analyst,$97K,$135K,"$116,000 /yr (est.)",3.6,KEYENCE\n3.6,"Itasca, IL",5001 to 10000 Employees,1974.0,Company - Public,Machinery Manufacturing,Manufacturing,$5 to $10 million (USD)
2,Data Analyst,$100K,$110K,"$105,000 /yr (est.)",3.7,National Association of Boards of Pharmacy\n3.7,"Mount Prospect, IL",51 to 200 Employees,1904.0,Nonprofit Organization,Membership Organizations,Management & Consulting,$25 to $50 million (USD)
3,"Remote - Test Data Analyst (SQL queries, ALM, ...",$50.00 /hr,$55.00,$52.50 /hr (est.),,Compest Solutions Inc,Remote,,,,,,
4,Data Analyst,$65K,$80K,"$72,500 /yr (est.)",4.1,Meritize\n4.1,Remote,51 to 200 Employees,2016.0,Company - Private,Banking & Lending,Financial Services,Unknown / Non-Applicable
5,Data Analyst,$40K,$40K,"$40,000 /yr (est.)",3.7,Apollo Retail\n3.7,Remote,1001 to 5000 Employees,1992.0,Company - Private,Other Retail Stores,Retail & Wholesale,$50 to $100 million (USD)
6,SQL Data Analyst – Part Time,$18.00 /hr,$18.00,$18.00 /hr (est.),,Sackett Financial Group,"Brea, CA",,,,,,
7,Data Analyst,,,,,LENDING USA,Remote,,,,,,
8,Data Analyst II,$90K,$115K,"$102,500 /yr (est.)",2.3,Holman\n2.3,"Mount Laurel, NJ",51 to 200 Employees,1964.0,Company - Private,Advertising & Public Relations,Media & Communication,$5 to $10 million (USD)
9,Jr. Data Analyst,$48K,$98K,"$68,420 /yr (est.)",3.7,Vdart Inc\n3.7,"Plano, TX",1001 to 5000 Employees,2007.0,Company - Private,Information Technology Support Services,Information Technology,$50 to $100 million (USD)


In [11]:
#We see some NaN values, so let's confirm they are recognized as nulls
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Job Title          1200 non-null   object 
 1   Salary Minimum     1085 non-null   object 
 2   Salary Maximum     1085 non-null   object 
 3   Salary Average     1085 non-null   object 
 4   Rating             908 non-null    float64
 5   Company Name       1200 non-null   object 
 6   Location           1200 non-null   object 
 7   Size               983 non-null    object 
 8   Founded            761 non-null    float64
 9   Type of ownership  983 non-null    object 
 10  Industry           791 non-null    object 
 11  Sector             791 non-null    object 
 12  Revenue            983 non-null    object 
dtypes: float64(2), object(11)
memory usage: 122.0+ KB


Above, we can see that some columns are fully populated, such as 'Job Title', but others have nulls. I do see some black values where there should be NaNs, so I will run through the document and replace empty cells with NaN. I will want to convert the year founded into years existing. I have to clean up some duplicates I see, convert the salary to a float and remove non numeric characters, and clean up some trailing characters at the end of the company names (/n#). It would be useful to have the location be split into city and states. I may want to clean up the string for type of ownership into just Private vs Public, but I'll run through it to confirm. Finally, I will have to review the revenue data and convert the Unknown/NA into nulls, then determine if the information can be used. **A significant and fun list!**

In [12]:
#Replace empty cells with NaN
#r = raw string. ^ = start of line $ = end of line \s* = any length of string (accounts for whitespace)
data = data.replace(r'^\s*$', np.nan, regex=True) # we use regex to check the cell expression and see if it matches the input
data.head(10)

Unnamed: 0,Job Title,Salary Minimum,Salary Maximum,Salary Average,Rating,Company Name,Location,Size,Founded,Type of ownership,Industry,Sector,Revenue
0,Data Analyst + Apprentice (Entry-Level),$35K,$45K,"$40,000 /yr (est.)",3.3,New Apprenticeship\n3.3,"Raleigh, NC",1 to 50 Employees,,Company - Private,,,Unknown / Non-Applicable
1,Data Analyst,$97K,$135K,"$116,000 /yr (est.)",3.6,KEYENCE\n3.6,"Itasca, IL",5001 to 10000 Employees,1974.0,Company - Public,Machinery Manufacturing,Manufacturing,$5 to $10 million (USD)
2,Data Analyst,$100K,$110K,"$105,000 /yr (est.)",3.7,National Association of Boards of Pharmacy\n3.7,"Mount Prospect, IL",51 to 200 Employees,1904.0,Nonprofit Organization,Membership Organizations,Management & Consulting,$25 to $50 million (USD)
3,"Remote - Test Data Analyst (SQL queries, ALM, ...",$50.00 /hr,$55.00,$52.50 /hr (est.),,Compest Solutions Inc,Remote,,,,,,
4,Data Analyst,$65K,$80K,"$72,500 /yr (est.)",4.1,Meritize\n4.1,Remote,51 to 200 Employees,2016.0,Company - Private,Banking & Lending,Financial Services,Unknown / Non-Applicable
5,Data Analyst,$40K,$40K,"$40,000 /yr (est.)",3.7,Apollo Retail\n3.7,Remote,1001 to 5000 Employees,1992.0,Company - Private,Other Retail Stores,Retail & Wholesale,$50 to $100 million (USD)
6,SQL Data Analyst – Part Time,$18.00 /hr,$18.00,$18.00 /hr (est.),,Sackett Financial Group,"Brea, CA",,,,,,
7,Data Analyst,,,,,LENDING USA,Remote,,,,,,
8,Data Analyst II,$90K,$115K,"$102,500 /yr (est.)",2.3,Holman\n2.3,"Mount Laurel, NJ",51 to 200 Employees,1964.0,Company - Private,Advertising & Public Relations,Media & Communication,$5 to $10 million (USD)
9,Jr. Data Analyst,$48K,$98K,"$68,420 /yr (est.)",3.7,Vdart Inc\n3.7,"Plano, TX",1001 to 5000 Employees,2007.0,Company - Private,Information Technology Support Services,Information Technology,$50 to $100 million (USD)


In [13]:
# checking columns for nulls
data.isnull().sum()

Job Title              0
Salary Minimum       115
Salary Maximum       115
Salary Average       115
Rating               292
Company Name           0
Location               0
Size                 217
Founded              439
Type of ownership    217
Industry             409
Sector               409
Revenue              217
dtype: int64

In [15]:
#Remove features where the salary is null,since that doesnt help us
data_cleaned = data.dropna(axis=0,subset=['Salary Average', 'Salary Minimum'])
data_cleaned.isnull().sum()

Job Title              0
Salary Minimum         0
Salary Maximum         0
Salary Average         0
Rating               282
Company Name           0
Location               0
Size                 209
Founded              413
Type of ownership    209
Industry             389
Sector               389
Revenue              209
dtype: int64

In [25]:
# separate hourly rows from salary rows
data_cleaned = pd.DataFrame(data = data_cleaned) # convert the slice to a pandas dataframe to work with it
data_cleaned['Average Hourly Rate'] = data_cleaned["Salary Average"].apply(lambda x: 1 if '/hr' in x.lower() else 0)
data_cleaned = data_cleaned.reset_index(drop=True)
data_cleaned.head(20)

Unnamed: 0,Job Title,Salary Minimum,Salary Maximum,Salary Average,Rating,Company Name,Location,Size,Founded,Type of ownership,Industry,Sector,Revenue,Converted Salary,City,State,Company Age (years),Title Grouping,Seniority Level,Average Hourly Rate
0,Data Analyst + Apprentice (Entry-Level),35.0,45.0,40000.0,3.3,New Apprenticeship,"Raleigh, NC",1 to 50 Employees,,Company - Private,,,Unknown / Non-Applicable,40000,Raleigh,NC,,analyst,na,0
1,Data Analyst,97.0,135.0,116000.0,3.6,KEYENCE,"Itasca, IL",5001 to 10000 Employees,1974.0,Company - Public,Machinery Manufacturing,Manufacturing,$5 to $10 million (USD),116000,Itasca,IL,48.0,analyst,na,0
2,Data Analyst,100.0,110.0,105000.0,3.7,National Association of Boards of Pharmacy,"Mount Prospect, IL",51 to 200 Employees,1904.0,Nonprofit Organization,Membership Organizations,Management & Consulting,$25 to $50 million (USD),105000,Mount Prospect,IL,118.0,analyst,na,0
3,"Remote - Test Data Analyst (SQL queries, ALM, ...",50.0,55.0,52.5,,Compest Solutions Inc,Remote,,,,,,,108160,Remote,Remote,,analyst,na,0
4,Data Analyst,65.0,80.0,72500.0,4.1,Meritize,Remote,51 to 200 Employees,2016.0,Company - Private,Banking & Lending,Financial Services,Unknown / Non-Applicable,72500,Remote,Remote,6.0,analyst,na,0
5,Data Analyst,40.0,40.0,40000.0,3.7,Apollo Retail,Remote,1001 to 5000 Employees,1992.0,Company - Private,Other Retail Stores,Retail & Wholesale,$50 to $100 million (USD),40000,Remote,Remote,30.0,analyst,na,0
6,SQL Data Analyst – Part Time,18.0,18.0,18.0,,Sackett Financial Group,"Brea, CA",,,,,,,37440,Brea,CA,,analyst,na,0
7,Data Analyst II,90.0,115.0,102500.0,2.3,Holman,"Mount Laurel, NJ",51 to 200 Employees,1964.0,Company - Private,Advertising & Public Relations,Media & Communication,$5 to $10 million (USD),102500,Mount Laurel,NJ,58.0,analyst,na,0
8,Jr. Data Analyst,48.0,98.0,68420.0,3.7,Vdart Inc,"Plano, TX",1001 to 5000 Employees,2007.0,Company - Private,Information Technology Support Services,Information Technology,$50 to $100 million (USD),68420,Plano,TX,15.0,analyst,junior,0
9,Data Analyst,48.0,90.0,66214.0,3.5,Change Healthcare,"Los Angeles, CA",10000+ Employees,2007.0,Company - Public,Information Technology Support Services,Information Technology,Unknown / Non-Applicable,66214,Los Angeles,CA,15.0,analyst,na,0


We can see that there appears to be average hourly rate data in here. Let's split those out so we can compare the hourly rates to the salary rates givin when we analyze the data.

In [26]:
#clean up Salary min/max/average strings to only have numeric strings, then convert to float
# remove $, ',',(/yr (est.)

#First, lets clean up the Average Salary
salary_avg = data_cleaned['Salary Average'].apply(lambda x: x.split("/")[0])
salary_avg = salary_avg.apply(lambda x: x.replace('$', '').replace(',',''))
data_cleaned['Salary Average'] = salary_avg

#Now the minimum salary
salary_min = data_cleaned['Salary Minimum'].apply(lambda x:x.replace('$', '').replace('K','').replace('/hr', ''))
data_cleaned['Salary Minimum']=salary_min


#Now the maximum salary
salary_max = data_cleaned['Salary Maximum'].apply(lambda x:x.replace('$', '').replace('K',''))
data_cleaned['Salary Maximum']=salary_max
data_cleaned.tail(10)

Unnamed: 0,Job Title,Salary Minimum,Salary Maximum,Salary Average,Rating,Company Name,Location,Size,Founded,Type of ownership,Industry,Sector,Revenue,Converted Salary,City,State,Company Age (years),Title Grouping,Seniority Level,Average Hourly Rate
1075,Junior ESG Data Analyst,38.0,78.0,58159.0,4.1,Glass Lewis,Georgia,51 to 200 Employees,2003.0,Company - Private,Investment & Asset Management,Financial Services,$10 to $25 million (USD),58159,Georgia,Georgia,19.0,analyst,junior,0
1076,Data Analyst,80.0,85.0,82500.0,3.9,Tatum by Randstad,Remote,10000+ Employees,1960.0,Company - Public,HR Consulting,Human Resources & Staffing,$10+ billion (USD),82500,Remote,Remote,62.0,analyst,na,0
1077,Data Analyst - 100% REMOTE,40.0,46.0,43.0,5.0,SGA,Remote,Unknown,,Company - Public,,,Unknown / Non-Applicable,89440,Remote,Remote,,analyst,na,0
1078,Data Analyst - SQL/Excel,68.0,139.0,97144.0,4.8,"Ursus, Inc.","Menlo Park, CA",51 to 200 Employees,2015.0,Company - Private,Information Technology Support Services,Information Technology,$25 to $50 million (USD),97144,Menlo Park,CA,7.0,analyst,na,0
1079,Data Analyst,50.0,60.0,55.0,,Insight Global,"San Francisco, CA",,,,,,,114400,San Francisco,CA,,analyst,na,0
1080,Data Analyst,75.0,115.0,95000.0,,Delta Hire,Remote,Unknown,,Company - Private,,,Unknown / Non-Applicable,95000,Remote,Remote,,analyst,na,0
1081,Data Analyst needed! Start ASAP!,55.0,65.0,60000.0,4.1,Noor Staffing Group,"New York, NY",51 to 200 Employees,2015.0,Company - Private,HR Consulting,Human Resources & Staffing,Unknown / Non-Applicable,60000,New York,NY,7.0,analyst,na,0
1082,Data Analyst,50.0,60.0,55.0,,Insight Global,"San Francisco, CA",,,,,,,114400,San Francisco,CA,,analyst,na,0
1083,Data Analyst,50.0,60.0,55.0,,Insight Global,"San Francisco, CA",,,,,,,114400,San Francisco,CA,,analyst,na,0
1084,Software Data Analyst,66.0,125.0,90542.0,3.7,Pearson,"Boston, MA",10000+ Employees,1844.0,Company - Public,Publishing,Media & Communication,$5 to $10 billion (USD),90542,Boston,MA,178.0,analyst,na,0


Now lets convert the hourly values to salary

In [27]:
# convert hourly salary to yearly salary
# convert $/hr to $/year and replace cells with the yearly estimate
def hr_to_year(i):
    i = int(float(i)) # convert the string to a float
    i = i*40*52
    #print("A rate of $", i,"/hr will be a salary of $", int(salary),"/yr.")
    return i

data_cleaned["Converted Salary"] = data_cleaned["Salary Average"].apply(lambda x: hr_to_year(x) if x.find('.') != -1 else x)
data_cleaned[["Salary Average","Converted Salary"]].head(10)


Unnamed: 0,Salary Average,Converted Salary
0,40000.0,40000
1,116000.0,116000
2,105000.0,105000
3,52.5,108160
4,72500.0,72500
5,40000.0,40000
6,18.0,37440
7,102500.0,102500
8,68420.0,68420
9,66214.0,66214


With the hourly rate converted to yearly salary and a column marking which rows had an hourly value, we can now move on to some simplier tasks, such as converting data types and cleaning up some strings.

In [28]:
# remove trailing 5 characters from Company Name strings
data_cleaned['Company Name'] = data_cleaned["Company Name"].apply(lambda x: x.split('\n', 1)[0] if x.find('\n') != -1 else x)
data_cleaned['Company Name'].head(15)


0                             New Apprenticeship
1                                        KEYENCE
2     National Association of Boards of Pharmacy
3                          Compest Solutions Inc
4                                       Meritize
5                                  Apollo Retail
6                        Sackett Financial Group
7                                         Holman
8                                      Vdart Inc
9                              Change Healthcare
10                                The Home Depot
11                                     eTeam Inc
12                               iTalent Digital
13                                  The HT Group
14      Talent Nexus for Lorien US (Impellam US)
Name: Company Name, dtype: object

In [29]:
# split location to city and state


#str(string_check.iloc[0]).split(',')[1]
#a = 0
#for i in DA_data_cleaned["Location"]:
#    if ',' in str(DA_data_cleaned["Location"].iloc[i]):
#        DA_data_cleaned["City"] = str(DA_data_cleaned["Location"].iloc[i]).split(',')[0]
#        DA_data_cleaned["State"] = str(DA_data_cleaned["Location"].iloc[i]).split(',')[1]
#        a += 1
#    else:
#        DA_data_cleaned["City"] = "Remote"
#        DA_data_cleaned["State"] = "Remote"
#        a +=1

data_cleaned["City"] = data_cleaned["Location"].apply(lambda x: x.split(', ')[0])
data_cleaned["State"] = data_cleaned["Location"].apply(lambda x: x.split(',')[-1] if x.find(',') != 1 else "BAR") 
data_cleaned["State"] = data_cleaned["State"].apply(lambda x: x.strip() if x.strip().lower() != 'manhattan' else 'NY')
data_cleaned['State'].value_counts()


Remote      441
CA          220
OH           83
NC           81
GA           43
MA           42
IL           41
NJ           29
VA           28
NY           21
TX           21
MD            7
Georgia       7
Colorado      6
PA            4
DC            3
WA            3
PR            3
CO            2
Name: State, dtype: int64

In [30]:
# convert year founded to years in existance
currentyear = datetime.now().year
data_cleaned['Company Age (years)'] = data_cleaned["Founded"].apply(lambda x:-1 if x==float(np.nan) else currentyear - x)
data_cleaned

Unnamed: 0,Job Title,Salary Minimum,Salary Maximum,Salary Average,Rating,Company Name,Location,Size,Founded,Type of ownership,Industry,Sector,Revenue,Converted Salary,City,State,Company Age (years),Title Grouping,Seniority Level,Average Hourly Rate
0,Data Analyst + Apprentice (Entry-Level),35,45,40000,3.3,New Apprenticeship,"Raleigh, NC",1 to 50 Employees,,Company - Private,,,Unknown / Non-Applicable,40000,Raleigh,NC,,analyst,na,0
1,Data Analyst,97,135,116000,3.6,KEYENCE,"Itasca, IL",5001 to 10000 Employees,1974.0,Company - Public,Machinery Manufacturing,Manufacturing,$5 to $10 million (USD),116000,Itasca,IL,48.0,analyst,na,0
2,Data Analyst,100,110,105000,3.7,National Association of Boards of Pharmacy,"Mount Prospect, IL",51 to 200 Employees,1904.0,Nonprofit Organization,Membership Organizations,Management & Consulting,$25 to $50 million (USD),105000,Mount Prospect,IL,118.0,analyst,na,0
3,"Remote - Test Data Analyst (SQL queries, ALM, ...",50.00,55.00,52.50,,Compest Solutions Inc,Remote,,,,,,,108160,Remote,Remote,,analyst,na,0
4,Data Analyst,65,80,72500,4.1,Meritize,Remote,51 to 200 Employees,2016.0,Company - Private,Banking & Lending,Financial Services,Unknown / Non-Applicable,72500,Remote,Remote,6.0,analyst,na,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1080,Data Analyst,75,115,95000,,Delta Hire,Remote,Unknown,,Company - Private,,,Unknown / Non-Applicable,95000,Remote,Remote,,analyst,na,0
1081,Data Analyst needed! Start ASAP!,55,65,60000,4.1,Noor Staffing Group,"New York, NY",51 to 200 Employees,2015.0,Company - Private,HR Consulting,Human Resources & Staffing,Unknown / Non-Applicable,60000,New York,NY,7.0,analyst,na,0
1082,Data Analyst,50.00,60.00,55.00,,Insight Global,"San Francisco, CA",,,,,,,114400,San Francisco,CA,,analyst,na,0
1083,Data Analyst,50.00,60.00,55.00,,Insight Global,"San Francisco, CA",,,,,,,114400,San Francisco,CA,,analyst,na,0


In [31]:
# Group jobs under archetypes (Junior v senior, analyst v business analyst)
data_cleaned["Job Title"].value_counts() # count instances of job titles occuring

Data Analyst                                                      268
Data Analyst - 100% REMOTE                                         43
REMOTE Data Analyst                                                42
Remote - Test Data Analyst (SQL queries, ALM, Excel)               42
Data analyst Level 1                                               42
Data Analyst + Apprentice (Entry-Level)                            41
Administrative Assistant / Data Analyst Supporting the DEA         41
Saybrus Partners- Data Reporting Analyst                           41
Staff Cybersecurity Analyst (Remote)                               41
Data Analyst - SQL/Excel                                           39
Customs Compliance Analyst                                         38
Data Reporting Analyst                                             37
Retail Floor Plan Analyst Planograms                               36
Data Consultant                                                    36
Strategy and Data An

We can see from above that there are already some redundancies due to small changes in the titles (eg Sr. Data Analyst ). Let's group the jobs together with a function that searches the titles and combines everything under similar banners (manager, analyst, specialist, etc). Ken Jee created some nice functions that will serve us well, so if you wish to see more here is the link: https://youtu.be/QWgg4w1SpJ8.

In [32]:
#define functions to bin jobs into groups
def title_condencer(title):
    if 'scientist' in title.lower():
        return 'data scientist'
    elif 'data engineer' in title.lower():
        return 'data engineer'
    elif 'machine learning' in title.lower():
        return 'machine learning'
    elif 'data scientist' in title.lower():
        return 'data scientist'
    elif 'analyst' in title.lower():
        return 'analyst'
    elif 'manager' in title.lower():
        return 'director'
    elif 'specialist' in title.lower():
        return 'specialist'
    elif 'business' in title.lower():
        return 'business-based'
    else:
        return 'Unbinned'

#identify if there is a seniority or level flag
def seniority(title):
    if 'sr' in title.lower() or 'senior' in title.lower() or 'sr.' in title.lower() or 'lead' in title.lower() or 'prinicpal' in title.lower() or 'iii' in title.lower():
        return 'senior'
    elif 'jr' in title.lower() or 'jr.' in title.lower() or 'junior' in title.lower():
        return 'junior'
    else:
        return 'na'

In [33]:
#Check binning
data_cleaned['Job Title'] = data_cleaned['Job Title'].values.astype(str)
data_cleaned['Title Grouping'] = data_cleaned['Job Title'].apply(title_condencer)
data_cleaned['Title Grouping'].value_counts()

analyst       1032
Unbinned        42
specialist      11
Name: Title Grouping, dtype: int64

In [34]:
#check seniority level
data_cleaned['Seniority Level'] = data_cleaned['Job Title'].apply(seniority)
data_cleaned['Seniority Level'].value_counts()

na        1059
junior      22
senior       4
Name: Seniority Level, dtype: int64

In [35]:
#input the filename you wish to save the information as
file = r"C:\Users\Tineash\Projects\Glassdoor_webscraper\Data\DA_data_cleaned.csv" #place the filepath between the quotes

data_cleaned.to_csv(file, index = False)

In [None]:
# check for string answers for Ownership column - to do

In [None]:
# Replace string "unknown/Non-applicable" in revenue with NaN - to do

In [None]:
# remove texts from revenue and convert revenue range to an average revenue as an int/float