### Exploratory Data Analysis: Ask A Manager Salary 2019 Dataset 

##### Load Libraries & Data

In [1]:
import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd
import re

In [2]:
# Read the already pre-processed survey extract from the intermediate data folder
# (not the raw file).
intermediate_csv_path = '../data/02_intermediate/aam_sal_inter.csv'
aam_salary_int = pd.read_csv(intermediate_csv_path)

##### Explore the shape of the dataset

In [3]:
# Print the frame summary (row count, per-column non-null counts and dtypes)
# to see which columns have missing values.
aam_salary_int.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30995 entries, 0 to 30994
Data columns (total 10 columns):
timestamp         30995 non-null object
age               30995 non-null object
industry          30171 non-null object
job_title         30993 non-null object
annual_salary     30977 non-null float64
currency          30995 non-null object
location          29500 non-null object
work_years        30995 non-null object
notes             7004 non-null object
other_currency    236 non-null object
dtypes: float64(1), object(9)
memory usage: 2.4+ MB


In [4]:
# "other_currency" column mostly null values. Drop column. 
# Rebind rather than mutate in place, and ignore a missing column, so that
# re-running this cell does not raise a KeyError once the column is gone.
aam_salary_int = aam_salary_int.drop(columns=['other_currency'], errors='ignore')

In [5]:
# Responses per currency: USD has roughly 27K entries, so the analysis below
# focuses on the USD subset only.
aam_salary_int['currency'].value_counts()

USD        27326
CAD         1474
GBP          887
AUD/NZD      617
EUR          429
Other        148
JPY           33
CHF           32
SEK           25
ZAR           12
HKD           12
Name: currency, dtype: int64

In [6]:
# Keep only the responses whose currency is USD.
usd_rows = aam_salary_int['currency'] == 'USD'
aam_salary_usd = aam_salary_int.loc[usd_rows]

In [7]:
# Ordering by salary exposes the problem rows: zeros, very high values and NaN.
aam_salary_usd.sort_values('annual_salary')

Unnamed: 0,timestamp,age,industry,job_title,annual_salary,currency,location,work_years,notes
16447,2019-04-26 19:01:54,25-34,Industrial,Owner,0.0,USD,Youngstown/OHIO/USA,8 - 10 years,Sales
24316,2019-04-30 11:28:25,under 18,stay at home industry,full time daughter,0.0,USD,,1 year or less,
24727,2019-04-30 15:33:53,55-64,,Unemployed,0.0,USD,"Brunswick, GA, USA",21 - 30 years,
25546,2019-05-01 07:59:55,55-64,,Retired,0.0,USD,"St. Louiz, MO",31 - 40 years,
15287,2019-04-26 16:17:19,35-44,Homeschool mom,Owner,0.0,USD,,8 - 10 years,
20782,2019-04-29 01:30:38,35-44,Parenting,Mom,0.0,USD,Sequim Washington United States,1 year or less,
20725,2019-04-28 23:31:52,25-34,Care giver,Mom,0.0,USD,"Denver, colorado",2 - 4 years,
27209,2019-05-10 13:08:08,45-54,IT Hardware Sales,Sales Represntative,0.0,USD,Charlotte NC,21 - 30 years,
27529,2019-05-10 13:54:20,35-44,,Stay at home mom,0.0,USD,NJ,8 - 10 years,
27534,2019-05-10 13:54:55,35-44,,Stay at home mom,0.0,USD,NJ,8 - 10 years,


In [8]:
# How many USD rows are missing annual_salary?
aam_salary_usd['annual_salary'].isna().sum()

16

In [10]:
# Count and display the USD rows with annual_salary less than or equal to 10000.
# The mask is built from aam_salary_usd itself (not aam_salary_int) so the filter
# does not depend on implicit index alignment between two different frames, and
# the selection is computed once and reused for both the count and the display.
low_salary_usd = aam_salary_usd.loc[aam_salary_usd['annual_salary'] <= 10000]
print(len(low_salary_usd))
low_salary_usd

541


Unnamed: 0,timestamp,age,industry,job_title,annual_salary,currency,location,work_years,notes
111,2019-04-24 11:45:10,45-54,Social Work,Lead Facilitator,35.0,USD,"Denison, TX USA",21 - 30 years,
118,2019-04-24 11:45:16,35-44,Civil Service,Human Service Specialist 4,59.0,USD,Jersey City/New Jersey/USA,11 - 20 years,"Supervisor, first level of manangement."
136,2019-04-24 11:45:33,35-44,Veterinary medicine,Veterinary technician,18.0,USD,"Pittsburgh, Pa, USA",11 - 20 years,Specialty medicine within a referral hospital
157,2019-04-24 11:46:04,25-34,Manufacturing,Human Resources Generalist/Payroll Admin,60.0,USD,"Tampa, FL",5-7 years,"Additional hats as required for safety, minist..."
191,2019-04-24 11:46:42,55-64,Consulting,Vice President,175.0,USD,"Philadelphia, PA",31 - 40 years,
226,2019-04-24 11:47:20,25-34,public library,adult services manager,6500.0,USD,"chicago, il",8 - 10 years,
239,2019-04-24 11:47:40,65 or over,IT,Web developer,68.0,USD,Los Angeles CA,41 years or more,
244,2019-04-24 11:47:46,35-44,Museum Exhibit Design,Dir Exhibit Experience,90.0,USD,"Queens, NY",11 - 20 years,
258,2019-04-24 11:47:54,25-34,Non profit legal services,Social worker,65.0,USD,Nyc,8 - 10 years,
302,2019-04-24 11:48:55,55-64,Publishing,Designer,40.0,USD,"Oklahoma City, OK",31 - 40 years,


In [11]:
# Drop rows where annual_salary <= 10000 (the threshold inspected in the previous
# cell). Rows with NaN salary are not matched by the comparison and are kept here.
low_salary_index = aam_salary_usd.index[aam_salary_usd['annual_salary'] <= 10000]
aam_salary_usd = aam_salary_usd.drop(index=low_salary_index)

In [12]:
# Number of USD salaries greater than or equal to 1 million.
# The mask comes from aam_salary_usd itself (not aam_salary_int), so the count
# does not rely on index alignment with the unfiltered frame.
print(len(aam_salary_usd.loc[aam_salary_usd['annual_salary'] >= 1000000]))

42


In [13]:
# Remove the rows whose annual salary is 1 million or more.
million_plus_index = aam_salary_usd.index[aam_salary_usd['annual_salary'] >= 1000000]
aam_salary_usd = aam_salary_usd.drop(million_plus_index)

In [24]:
# Discard rows with a missing annual_salary, then report how many rows remain.
aam_salary_usd = aam_salary_usd.dropna(subset=['annual_salary'])
len(aam_salary_usd)

26727

In [25]:
# Re-inspect the cleaned USD frame ordered by salary.
aam_salary_usd.sort_values('annual_salary')

Unnamed: 0,timestamp,age,industry,job_title,annual_salary,currency,location,work_years,notes
7271,2019-04-24 14:36:02,55-64,Ecommerce,Senior Marketing Manager,10300.0,USD,"Los Angeles, CA USA",11 - 20 years,
21195,2019-04-29 09:09:48,25-34,Transportation,Senior Project Engineer,10400.0,USD,"Fairfax, Virginia",8 - 10 years,
19740,2019-04-27 19:37:58,25-34,Tech,Engineer,10500.0,USD,"New York, ny, usa",5-7 years,
14519,2019-04-26 14:23:18,45-54,Sales distribution,Sales rep,10500.0,USD,Kansas City/MO/USA,21 - 30 years,
2881,2019-04-24 12:56:58,25-34,Public Library,Library Assistant,10600.0,USD,Alabama,2 - 4 years,Part-time
10566,2019-04-24 22:33:10,45-54,Telecom,Sr Manager Project Management,10700.0,USD,Richardson TX,11 - 20 years,
14096,2019-04-26 13:44:07,45-54,Automotive Manufacturer,Senior District Sales Manager,10700.0,USD,Florida,11 - 20 years,
15447,2019-04-26 16:38:56,45-54,Healthcare,Director,10800.0,USD,"Fort Lauderdale, Florida, USA",11 - 20 years,
13367,2019-04-26 12:36:33,45-54,Education,Adjunct instructor,10800.0,USD,Philadelphia,8 - 10 years,
19713,2019-04-27 19:15:40,25-34,Public Libraries,Library Assistant,11000.0,USD,"Raleigh, NC, USA",8 - 10 years,Part-time
