### Exploratory Data Analysis: Ask A Manager Salary 2019 Dataset 

##### Load Libraries & Data

In [11]:
import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd
import re

In [12]:
# Read the pre-processed (intermediate) survey extract; the raw file is not used here.
aam_salary_int = pd.read_csv(
    filepath_or_buffer='../data/02_intermediate/aam_sal_inter.csv',
)

##### Explore the shape of the dataset

In [13]:
# Print column names, non-null counts, dtypes and the memory footprint of the loaded frame.
aam_salary_int.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30995 entries, 0 to 30994
Data columns (total 10 columns):
timestamp         30995 non-null object
age               30995 non-null object
industry          30171 non-null object
job_title         30993 non-null object
annual_salary     30977 non-null float64
currency          30995 non-null object
location          29500 non-null object
work_years        30995 non-null object
notes             7004 non-null object
other_currency    236 non-null object
dtypes: float64(1), object(9)
memory usage: 2.4+ MB


In [14]:
# "other_currency" column mostly null values. Drop column. 
# Reassign instead of mutating with inplace=True, and tolerate the column already
# being absent, so re-running this cell is a no-op rather than a KeyError.
aam_salary_int = aam_salary_int.drop(columns=['other_currency'], errors='ignore')

In [15]:
# Count responses per currency. USD dominates (~27K rows), so the rest of the
# analysis is restricted to the USD subset.
aam_salary_int['currency'].value_counts()

USD        27326
CAD         1474
GBP          887
AUD/NZD      617
EUR          429
Other        148
JPY           33
CHF           32
SEK           25
HKD           12
ZAR           12
Name: currency, dtype: int64

In [16]:
# Keep only the responses whose salary was reported in USD.
aam_salary_usd = aam_salary_int[aam_salary_int.currency.eq('USD')]

In [17]:
# Order the USD rows by salary (ascending) to inspect both extremes:
# several 0 entries and some very high values show up.
aam_salary_usd.sort_values('annual_salary')

Unnamed: 0,timestamp,age,industry,job_title,annual_salary,currency,location,work_years,notes
16447,2019-04-26 19:01:54,25-34,Industrial,Owner,0.0,USD,Youngstown/OHIO/USA,8 - 10 years,Sales
24316,2019-04-30 11:28:25,under 18,stay at home industry,full time daughter,0.0,USD,,1 year or less,
24727,2019-04-30 15:33:53,55-64,,Unemployed,0.0,USD,"Brunswick, GA, USA",21 - 30 years,
25546,2019-05-01 07:59:55,55-64,,Retired,0.0,USD,"St. Louiz, MO",31 - 40 years,
15287,2019-04-26 16:17:19,35-44,Homeschool mom,Owner,0.0,USD,,8 - 10 years,
20782,2019-04-29 01:30:38,35-44,Parenting,Mom,0.0,USD,Sequim Washington United States,1 year or less,
20725,2019-04-28 23:31:52,25-34,Care giver,Mom,0.0,USD,"Denver, colorado",2 - 4 years,
27209,2019-05-10 13:08:08,45-54,IT Hardware Sales,Sales Represntative,0.0,USD,Charlotte NC,21 - 30 years,
27529,2019-05-10 13:54:20,35-44,,Stay at home mom,0.0,USD,NJ,8 - 10 years,
27534,2019-05-10 13:54:55,35-44,,Stay at home mom,0.0,USD,NJ,8 - 10 years,


In [18]:
# How many USD rows have no annual_salary at all (NaN)?
aam_salary_usd['annual_salary'].isna().sum()

16

In [19]:
# Count and display the USD rows whose annual_salary is less than or equal to 1.
# The mask is built from aam_salary_usd itself (not from aam_salary_int), so the
# filter never depends on aligning the index of a different DataFrame, and it is
# computed once and reused for both the count and the display.
low_salary_usd = aam_salary_usd.loc[aam_salary_usd['annual_salary'] <= 1]
print(len(low_salary_usd))
low_salary_usd

38


Unnamed: 0,timestamp,age,industry,job_title,annual_salary,currency,location,work_years,notes
1357,2019-04-24 12:11:42,35-44,none,none,0.0,USD,,1 year or less,
1739,2019-04-24 12:20:43,25-34,Student,Student,0.0,USD,Pasadena California,2 - 4 years,
12146,2019-04-25 12:47:10,25-34,Unemployed,Unemployed,0.0,USD,USA,5-7 years,
12313,2019-04-25 14:13:54,45-54,IT,Program Manager,0.0,USD,,31 - 40 years,
14216,2019-04-26 13:55:30,45-54,Work,Pleeb,1.0,USD,,1 year or less,
15287,2019-04-26 16:17:19,35-44,Homeschool mom,Owner,0.0,USD,,8 - 10 years,
15669,2019-04-26 17:10:08,35-44,Professional gangsta,Pimp,1.0,USD,USA,11 - 20 years,Swagger
15674,2019-04-26 17:10:34,35-44,Unemployed,Stay at home parent,0.0,USD,"San Rafael, CA",11 - 20 years,
16135,2019-04-26 18:11:53,45-54,,none,0.0,USD,,21 - 30 years,
16447,2019-04-26 19:01:54,25-34,Industrial,Owner,0.0,USD,Youngstown/OHIO/USA,8 - 10 years,Sales


In [28]:
# Remove rows with annual_salary <= 1 by keeping everything that does not match.
# Rows with a missing salary do not match the comparison and are therefore kept.
aam_salary_usd = aam_salary_usd.loc[~aam_salary_usd['annual_salary'].le(1)]

In [23]:
# number of salaries greater than or equal to 1 million
# NOTE(review): the code below is disabled. Its mask is built from aam_salary_int
# rather than aam_salary_usd; build it from aam_salary_usd if this cell is re-enabled.
# print(len(aam_salary_usd.loc[aam_salary_int['annual_salary']>=1000000]))
# aam_salary_usd.loc[aam_salary_int['annual_salary']>=1000000]