## Final Project - NYC Citywide Payroll Data

### Meaghan Burke - Data 608

#### Data Source: https://data.cityofnewyork.us/City-Government/Citywide-Payroll-Data-Fiscal-Year-/k397-673e

### May 11th, 2019


### Data Cleaning 
Steps:

1. Read in the raw_csv from the NYC Open Data website (download via link above)
2. Run the cleaning script (see the link below to access the ipynb file), which applies the following transformations:

    - Filtered for 'MANHATTAN', 'QUEENS', 'BRONX', 'BROOKLYN', & 'RICHMOND' boroughs

    - Filtered for full time, active employees 

    - Filtered for full year salaried employees

    - Removed null job titles

    - Removed 2014 as it is incomplete 

    - Converted all string values to uppercase & removed whitespace 

    - Output the data to charting_dataset.csv (from consolidated_table) & scatter_dataset.csv (from scatter_table). These are the base datasets for the Dash application


In [17]:
import pandas as pd
import numpy as np
import warnings
# Silence every warning category for the rest of the notebook.
warnings.simplefilter("ignore")

# Render floats as dollar amounts with thousands separators and two decimals.
# The option is global, so it applies to every float column that is displayed
# (hour columns included), not only to pay columns.
pd.set_option('display.float_format', '${:,.2f}'.format)

In [2]:
# Load the raw payroll extract, normalise its text and keep only the rows used
# by the app:
#   - per annum pay basis
#   - active leave status
#   - non-null job title
#   - any fiscal year except 2014
#   - work location in one of the five boroughs listed in borough_keep
# The raw frame is deleted afterwards to free memory.
borough_keep = ['MANHATTAN', 'QUEENS', 'BRONX', 'BROOKLYN', 'RICHMOND']
pay_dataset = pd.read_csv("Citywide_Payroll_Data__Fiscal_Year_.csv", low_memory=False)

# Total Pay is the row-wise sum of the three pay components.
pay_dataset['Total Pay'] = pay_dataset[['Regular Gross Paid', 'Total OT Paid', 'Total Other Pay']].sum(axis=1)


def _upper_strip(value):
    """Upper-case and strip a string; return any non-string value unchanged."""
    return value.upper().strip() if isinstance(value, str) else value


# Normalise text column by column. Numeric columns cannot hold strings, so they
# are passed through untouched instead of being visited element by element.
# (DataFrame.applymap, used previously, is deprecated in recent pandas.)
pay_dataset = pay_dataset.apply(
    lambda col: col if pd.api.types.is_numeric_dtype(col) else col.map(_upper_strip)
)

keep_rows = (
    (pay_dataset['Pay Basis'] == 'PER ANNUM')
    & (pay_dataset['Leave Status as of June 30'] == 'ACTIVE')
    & pay_dataset['Title Description'].notnull()
    & (pay_dataset['Fiscal Year'] != 2014)
    & pay_dataset['Work Location Borough'].isin(borough_keep)
)

# .copy() makes filtered_pay an independent frame, so later cells can add and
# drop columns without writing into a slice of pay_dataset.
# Rows with a missing borough never pass the isin() test above, so there is
# nothing left to relabel as 'UNKNOWN' after the filter.
filtered_pay = pay_dataset[keep_rows].copy()

del pay_dataset, keep_rows

In [None]:
# Payroll Number has too many NAs to identify an employee, so a surrogate id is
# rebuilt from the columns below (the author checked unique counts and reports
# that this combination is unique per employee).
# Anonymisation approach taken from:
# https://stackoverflow.com/questions/48008334/anonymize-specific-columns-with-pii-in-pandas-dataframe-python
cols = ['Agency Name', 'Last Name', 'First Name', 'Mid Init', 'Agency Start Date', 'Pay Basis']


def _employee_key(row):
    """Join the stringified values of one row with underscores."""
    return '_'.join(row.values.astype(str))


# Each distinct key string becomes one integer category code.
filtered_pay['Employee_Id'] = (
    filtered_pay[cols]
    .apply(_employee_key, axis=1)
    .astype('category')
    .cat.codes
)

# Drop the name columns, Payroll Number and the leave-status column in place.
dropped_columns = ['Last Name', 'Payroll Number', 'First Name', 'Mid Init', 'Leave Status as of June 30']
filtered_pay.drop(columns=dropped_columns, inplace=True)

In [11]:
# One row per (Fiscal Year, Agency Name): distinct boroughs, employees and
# titles, total regular/OT hours, and mean pay components.
# NOTE(review): hours are summed here but averaged in the per-agency scatter
# table, and the consolidated_table output recorded further down shows
# per-employee-sized hour values - confirm which aggregation the app expects.
calcs = {
    'Work Location Borough': ['nunique'],
    'Employee_Id': ['nunique'],
    'Title Description': ['nunique'],
    'Regular Hours': ['sum'],
    'Regular Gross Paid': ['mean'],
    'OT Hours': ['sum'],
    'Total OT Paid': ['mean'],
    'Total Other Pay': ['mean'],
    'Total Pay': ['mean'],
}
consolidated_table = (
    filtered_pay
    .groupby(['Fiscal Year', 'Agency Name'])
    .agg(calcs)
    .reset_index()
)
# Keep only the source-column level of the (column, function) header.
consolidated_table.columns = consolidated_table.columns.get_level_values(0)
# Agencies with the most distinct titles come first.
consolidated_table = consolidated_table.sort_values(by='Title Description', ascending=False)

In [15]:
# Feeds the main EDA section of the app; the row index is written as the first column.
consolidated_table.to_csv("charting_dataset.csv", index=True)

In [13]:
# One row per Agency Name across all fiscal years: distinct boroughs, employees
# and titles, plus the mean of every hours and pay column.
calcs = {
    'Work Location Borough': ['nunique'],
    'Employee_Id': ['nunique'],
    'Title Description': ['nunique'],
    'Regular Hours': ['mean'],
    'Regular Gross Paid': ['mean'],
    'OT Hours': ['mean'],
    'Total OT Paid': ['mean'],
    'Total Other Pay': ['mean'],
    'Total Pay': ['mean'],
}

scatter_table = (
    filtered_pay
    .groupby(['Agency Name'])
    .agg(calcs)
    .reset_index()
)
# Keep only the source-column level of the (column, function) header.
scatter_table.columns = scatter_table.columns.get_level_values(0)
# Agencies with the most distinct titles come first.
scatter_table = scatter_table.sort_values(by='Title Description', ascending=False)

In [16]:
# Feeds the scatter plots section of the app; the row index is written as the first column.
scatter_table.to_csv("scatter_dataset.csv", index=True)

In [4]:
# Not consumed by the app any more, but kept as a row-level extract for data validation.
filtered_pay.to_csv("filtered_nyc_payset.csv", index=True)

### Data EDA

In [87]:
# (row count, column count) of the filtered payroll frame
filtered_pay.shape

(1113051, 14)

In [18]:
# Descriptive tables shown in the dash application.

# Summary statistics (count, mean, std, quartiles, min/max) of filtered_pay.
decribe_table = filtered_pay.describe()

# Number of distinct values in each column, per fiscal year.
unique_values = filtered_pay.groupby(by=['Fiscal Year']).nunique()

unique_values

Unnamed: 0_level_0,Fiscal Year,Agency Name,Agency Start Date,Work Location Borough,Title Description,Base Salary,Pay Basis,Regular Hours,Regular Gross Paid,OT Hours,Total OT Paid,Total Other Pay,Total Pay,Employee_Id
Fiscal Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2015,1,144,10448,5,1204,23503,1,11195,142723,25454,113948,110680,224996,264153
2016,1,144,10583,5,1206,23997,1,11749,129511,26704,114971,115618,213897,274049
2017,1,147,10685,5,1218,24413,1,13528,131364,25970,118924,116986,221903,285158
2018,1,148,10753,5,1236,23458,1,12790,131422,25127,119840,111806,227269,289435


In [40]:
# summary statistics produced by filtered_pay.describe()
decribe_table

Unnamed: 0,Fiscal Year,Base Salary,Regular Hours,Regular Gross Paid,OT Hours,Total OT Paid,Total Other Pay,Total Pay,Employee_Id
count,1113051.0,1113051.0,1113051.0,1113051.0,1113051.0,1113051.0,1113051.0,1113051.0,1113051.0
mean,2016.539,69366.83,1128.044,66847.68,105.9016,5790.611,3732.348,76370.63,183221.8
std,1.11509,27168.1,942.2682,29067.34,186.2982,11132.22,6205.931,36325.75,104169.9
min,2015.0,1.0,-235.25,-187.1,-2.5,-26493.88,-205816.5,-24226.35,0.0
25%,2016.0,46520.0,0.0,43929.21,0.0,0.0,0.0,49999.88,93159.0
50%,2017.0,68000.0,1790.0,65196.98,0.0,0.0,444.83,71648.06,182961.0
75%,2018.0,85292.0,2045.72,85387.05,159.0,6858.325,5215.79,96968.92,273974.0
max,2018.0,350000.0,4060.92,672308.9,2736.67,161290.2,95960.96,672731.3,361401.0


In [19]:
# per (Fiscal Year, Agency Name) aggregates, sorted by distinct title count, descending
consolidated_table

Unnamed: 0,Fiscal Year,Agency Name,Work Location Borough,Employee_Id,Title Description,Regular Hours,Regular Gross Paid,OT Hours,Total OT Paid,Total Other Pay,Total Pay
407,2017,POLICE DEPARTMENT,5,50086,197,"$1,948.52","$74,786.05",$225.95,"$14,596.83","$10,450.52","$99,833.40"
250,2016,NYC HOUSING AUTHORITY,5,8037,196,"$1,891.26","$53,301.71",$122.07,"$4,434.85","$1,852.54","$59,589.10"
555,2018,POLICE DEPARTMENT,5,50821,196,"$1,952.28","$70,258.92",$221.43,"$13,471.15","$10,011.87","$93,741.94"
358,2017,DEPT OF HEALTH/MENTAL HYGIENE,5,5058,195,"$1,672.91","$65,008.52",$39.45,"$1,617.07","$2,559.10","$69,184.69"
505,2018,DEPT OF HEALTH/MENTAL HYGIENE,5,5326,194,"$1,699.66","$66,243.00",$33.65,"$1,390.74","$2,544.47","$70,178.21"
260,2016,POLICE DEPARTMENT,5,49174,192,"$1,994.98","$68,615.07",$241.55,"$13,816.83","$9,709.84","$92,141.74"
116,2015,POLICE DEPARTMENT,5,48037,190,"$2,005.76","$66,589.71",$261.24,"$14,103.69","$9,573.57","$90,266.97"
545,2018,NYC HOUSING AUTHORITY,5,7800,186,"$1,859.74","$52,151.27",$144.99,"$5,133.00","$1,849.45","$59,133.71"
397,2017,NYC HOUSING AUTHORITY,5,8084,186,"$1,835.24","$51,400.65",$133.05,"$4,636.52","$1,815.13","$57,852.29"
69,2015,DEPT OF HEALTH/MENTAL HYGIENE,5,4233,182,"$1,751.47","$63,326.94",$55.20,"$2,000.61","$3,806.99","$69,134.54"
