# Dancer's business - 1. Data preparation

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression

# DataFrame to save

In [2]:
# Dataframe to merge everything
# (placeholder only; it is first filled with the CPI table and then
# extended by one merge per dataset in the cells below)
df = None

# Scale/adjustment factor data
Before we begin, let's import some useful economic index data and census data.
Even though these data won't be used for my modeling,
I will use them when we explore the statistics to see business trends.

## Consumer Price Index (CPI)
CPI can be used to account for inflation. I downloaded numbers of interesting areas from the website below.
- Source: [U.S. Bureau of Labor Statistics (link is one of the example page)](https://www.bls.gov/regions/new-york-new-jersey/data/xg-tables/ro2xgcpiny1967.htm)


In [3]:
# Load the yearly CPI table (one row per year, one column per area) and show it
cpi = pd.read_csv('data/CPI.csv')
display(cpi)

Unnamed: 0,year,NY,Chicago,Seattle,LA,SanFran,All
0,2006,220.741667,198.325,207.983333,210.4,209.533333,85.053674
1,2007,226.940083,204.817583,216.058667,217.338,216.387833,87.479977
2,2008,235.782417,212.535583,224.872,225.008,222.771667,90.838421
3,2009,236.824583,209.995083,226.153833,223.219,224.633833,90.515448
4,2010,240.864167,212.870417,226.745667,225.894,227.611833,91.999941
5,2011,247.717583,218.684167,233.097333,231.928,233.669,94.904233
6,2012,252.588333,222.00475,238.796,236.648,239.868,96.868122
7,2013,256.833083,224.545167,241.691667,239.207,245.281167,98.287078
8,2014,260.229583,228.46775,246.186167,242.434,252.2595,99.881513
9,2015,260.558167,227.792333,249.593667,244.632,258.812333,100.0


In [4]:
# Merge to df
# Reshape CPI from wide (one column per area) to long (one row per
# year/area); this long table is the starting point of df.
area_columns = list(cpi.columns[1:])  # every column after 'year'
df = cpi.melt(id_vars=['year'], value_vars=area_columns,
              var_name='area', value_name='cpi')

row_count = len(df)
print(row_count) # should be 16 years x 6 area = 96 rows
display(df.sample(5))

96


Unnamed: 0,year,area,cpi
34,2008,Seattle,224.872
8,2014,NY,260.229583
32,2006,Seattle,207.983333
21,2011,Chicago,218.684167
65,2007,SanFran,216.387833


## Cost of Living Index (CLI)
CPI may not reflect the actual cost of living.
The CLI is closer to actual living expenses.
When we judge income levels, we always use the CLI as a scale.

Unlike the CPI, the CLI was difficult to find.
I found CLI data for the year 2010 on Census.gov.
I will <font color=red>assume that the relative CLI between cities stays the same over the years.</font>

Also, it is <font color=red>not clear whether this is calculated for the metropolitan statistical area (broader) or only for the city (smaller area). I'll assume the former, which is the same area division as the wage statistics.</font>

In [5]:
# Load the cost-of-living table, keeping only the rows whose area
# is not 'Percent'
cli = pd.read_csv('data/CLI.csv').loc[lambda table: table.area != 'Percent']
display(cli)

Unnamed: 0,area,cli,Grocery,Housing,Utilities,Transportation,HealthCare,Etc
0,NY,216.7,154.3,386.7,169.6,120.3,130.2,145.7
1,Chicago,116.9,111.2,134.8,117.3,116.5,108.5,104.4
2,Seattle,121.4,115.1,140.3,85.7,118.8,119.9,119.1
3,LA,136.4,106.0,207.1,101.7,113.6,109.1,107.0
4,SanFran,164.0,111.9,281.0,94.5,113.0,117.0,124.3


In [6]:
# Merge to df
# Left join (like every other merge into df) so df keeps exactly its own
# rows and row order; areas without a CLI value (the national 'All' series)
# get NaN. validate='m:1' fails loudly if an area appears more than once in
# cli, which would otherwise duplicate df rows.
df = df.merge(cli[['area','cli']], how='left', on='area', validate='m:1')

print(len(df)) # should be 16 years x 6 area = 96 rows
display(df.sample(5))

96


Unnamed: 0,year,area,cpi,cli
32,2006,Seattle,207.983333,121.4
41,2015,Seattle,249.593667,121.4
9,2015,NY,260.558167,216.7
45,2019,Seattle,278.1815,121.4
12,2018,NY,273.6405,216.7


## Census - U.S. and Metropolitan area population

- Source
    - National population: [Census.gov, example for year 2010](https://data.census.gov/cedsci/table?q=Age%20and%20Sex&tid=ACSDP1Y2010.DP05)
    - Metropolitan statistical area population: [Census.gov](https://www.census.gov/data/tables/time-series/demo/popest/2010s-total-metro-and-micro-statistical-areas.html)

### Metropolitan statistical area population data

In [7]:
# Check metropolitan area census dataset
# (load the raw table and list its columns, dtypes and non-null counts)
census = pd.read_csv('data/census.csv')
census.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2797 entries, 0 to 2796
Data columns (total 17 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   CBSA               2797 non-null   int64  
 1   MDIV               141 non-null    float64
 2   STCOU              1840 non-null   float64
 3   NAME               2797 non-null   object 
 4   LSAD               2797 non-null   object 
 5   CENSUS2010POP      2797 non-null   int64  
 6   ESTIMATESBASE2010  2797 non-null   int64  
 7   POPESTIMATE2010    2797 non-null   int64  
 8   POPESTIMATE2011    2797 non-null   int64  
 9   POPESTIMATE2012    2797 non-null   int64  
 10  POPESTIMATE2013    2797 non-null   int64  
 11  POPESTIMATE2014    2797 non-null   int64  
 12  POPESTIMATE2015    2797 non-null   int64  
 13  POPESTIMATE2016    2797 non-null   int64  
 14  POPESTIMATE2017    2797 non-null   int64  
 15  POPESTIMATE2018    2797 non-null   int64  
 16  POPESTIMATE2019    2797 

In [8]:
# Select only interesting area

# Area code -> short area name used throughout this notebook.
# (The variable says "zipcode", but the keys are compared with census.CBSA.)
# LA has two codes because it has changed over years
zipcode_area = {31100:'LA',31080:'LA',41860:'SanFran',16980:'Chicago',35620:'NY',42660:'Seattle'}

# Keep one record per area: the 'Metropolitan Statistical Area' row of each
# code. Code 31100 is skipped, so LA is looked up by 31080 only.
is_metro_row = census.LSAD=='Metropolitan Statistical Area'
census = pd.concat([census.loc[(census.CBSA==code) & is_metro_row]
                    for code in zipcode_area if code!=31100])

display(census)

Unnamed: 0,CBSA,MDIV,STCOU,NAME,LSAD,CENSUS2010POP,ESTIMATESBASE2010,POPESTIMATE2010,POPESTIMATE2011,POPESTIMATE2012,POPESTIMATE2013,POPESTIMATE2014,POPESTIMATE2015,POPESTIMATE2016,POPESTIMATE2017,POPESTIMATE2018,POPESTIMATE2019
850,31080,,,"Los Angeles-Long Beach-Anaheim, CA",Metropolitan Statistical Area,12828837,12828957,12838417,12925753,13013443,13097434,13166609,13234696,13270694,13278000,13249879,13214799
1307,41860,,,"San Francisco-Oakland-Berkeley, CA",Metropolitan Statistical Area,4335391,4335593,4343634,4395725,4455473,4519636,4584981,4647924,4688198,4712421,4726314,4731803
291,16980,,,"Chicago-Naperville-Elgin, IL-IN-WI",Metropolitan Statistical Area,9461105,9461537,9470634,9500870,9528090,9550194,9560430,9552554,9533662,9514113,9484158,9458539
1017,35620,,,"New York-Newark-Jersey City, NY-NJ-PA",Metropolitan Statistical Area,18897109,18896277,18923407,19052774,19149689,19226449,19280929,19320968,19334778,19322607,19276644,19216182
1337,42660,,,"Seattle-Tacoma-Bellevue, WA",Metropolitan Statistical Area,3439809,3439808,3449241,3503891,3558829,3612347,3675160,3739654,3816355,3885579,3935179,3979845


In [9]:
# Reshape the metropolitan census table to one row per year and one column
# per area.

# Label each row with its area name looked up from its CBSA code, instead of
# assigning a hard-coded list of names that silently depends on row order.
census = census.assign(area=census['CBSA'].map(zipcode_area)).set_index('area')

# Keep only the yearly population estimates and rename them to integer years
year_columns = {'POPESTIMATE{0}'.format(year): year for year in range(2010, 2020)}
census = census[list(year_columns)].rename(columns=year_columns)

# Transpose: years become the index, area names become the columns
census = census.T
census.columns.name = None  # drop the leftover 'area' axis label

display(census)

Unnamed: 0,LA,SanFran,Chicago,NY,Seattle
2010,12838417,4343634,9470634,18923407,3449241
2011,12925753,4395725,9500870,19052774,3503891
2012,13013443,4455473,9528090,19149689,3558829
2013,13097434,4519636,9550194,19226449,3612347
2014,13166609,4584981,9560430,19280929,3675160
2015,13234696,4647924,9552554,19320968,3739654
2016,13270694,4688198,9533662,19334778,3816355
2017,13278000,4712421,9514113,19322607,3885579
2018,13249879,4726314,9484158,19276644,3935179
2019,13214799,4731803,9458539,19216182,3979845


### National population data

In [10]:
# Check one example file of national population
# (only the 2015 file is inspected here, to see the layout of the table)
demo = pd.read_csv('data/C2015.csv')

demo.info()
display(demo.head(5))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89 entries, 0 to 88
Data columns (total 5 columns):
 #   Column                                  Non-Null Count  Dtype 
---  ------                                  --------------  ----- 
 0   Label (Grouping)                        89 non-null     object
 1   United States!!Estimate                 84 non-null     object
 2   United States!!Margin of Error          84 non-null     object
 3   United States!!Percent                  84 non-null     object
 4   United States!!Percent Margin of Error  84 non-null     object
dtypes: object(5)
memory usage: 3.6+ KB


Unnamed: 0,Label (Grouping),United States!!Estimate,United States!!Margin of Error,United States!!Percent,United States!!Percent Margin of Error
0,SEX AND AGE,,,,
1,Total population,321418821.0,*****,321418821,(X)
2,Male,158167834.0,"±31,499",49.2%,±0.1
3,Female,163250987.0,"±31,500",50.8%,±0.1
4,Under 5 years,19793807.0,"±16,520",6.2%,±0.1


In [11]:
%%script false --no-raise-error
# If you already have data/usDemo.csv, this block can be skipped.

# Read the national population file of every year from 2010 to 2019,
# tagging each table with the year it belongs to
yearly_tables = [pd.read_csv('data/C{0}.csv'.format(year)).assign(year=year)
                 for year in range(2010,2020)]

# Stack all years and save them as a single csv file
df_save = pd.concat(yearly_tables)

# Columns are renamed by position, so the stacked table must have exactly
# these seven columns in this order
df_save.columns = ['label','estimate','estimate_err','pct','pct_err','year','estimate_err2']
df_save.to_csv('data/usDemo.csv', index=False)

In [12]:
# Load the combined national population table (all years stacked, with a 'year' column)
census_national = pd.read_csv('data/usDemo.csv')
display(census_national)

Unnamed: 0,label,estimate,estimate_err,pct,pct_err,year,estimate_err2
0,SEX AND AGE,,,,,2010,
1,Total population,309349689,*****,309349689,(X),2010,
2,Male,152089450,"±27,325",49.2%,±0.1,2010,
3,Female,157260239,"±27,325",50.8%,±0.1,2010,
4,Under 5 years,20133943,"±20,568",6.5%,±0.1,2010,
...,...,...,...,...,...,...,...
880,Total housing units,139686209,,(X),(X),2019,"±6,973"
881,"CITIZEN, VOTING AGE POPULATION",,,,,2019,
882,"Citizen, 18 and over population",235418734,,235418734,(X),2019,"±159,764"
883,Male,114206194,,48.5%,±0.1,2019,"±98,225"


In [13]:
# Leave data to use only
# Change format to merge with metropolitan census dataframe

years = list(range(2010,2020))

# Rows whose label contains 'Total population'; the first such row of each
# year holds the estimate that is used
is_total_row = census_national.label.str.contains('Total population')

population = []
for yr in years:
    estimate_text = census_national[is_total_row & (census_national.year==yr)].iloc[0].estimate
    # the estimate is text with thousands separators, e.g. '309,349,689'
    population.append(int(estimate_text.replace(',', '')))

# One row per year with the national population in column 'All'
census_national = pd.DataFrame({'year':years,'All':population})

display(census_national)

Unnamed: 0,year,All
0,2010,309349689
1,2011,311591919
2,2012,313914040
3,2013,316128839
4,2014,318857056
5,2015,321418821
6,2016,323127515
7,2017,325719178
8,2018,327167439
9,2019,328239523


### Merge national population to metropolitan population

In [14]:
# Attach the metropolitan populations (indexed by year) to the national
# population table, then change data type to numeric for every column
census = (census_national
          .merge(census, left_on='year', right_index=True)
          .apply(pd.to_numeric))

census.info()
display(census)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10 entries, 0 to 9
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   year     10 non-null     int64
 1   All      10 non-null     int64
 2   LA       10 non-null     int64
 3   SanFran  10 non-null     int64
 4   Chicago  10 non-null     int64
 5   NY       10 non-null     int64
 6   Seattle  10 non-null     int64
dtypes: int64(7)
memory usage: 640.0 bytes


Unnamed: 0,year,All,LA,SanFran,Chicago,NY,Seattle
0,2010,309349689,12838417,4343634,9470634,18923407,3449241
1,2011,311591919,12925753,4395725,9500870,19052774,3503891
2,2012,313914040,13013443,4455473,9528090,19149689,3558829
3,2013,316128839,13097434,4519636,9550194,19226449,3612347
4,2014,318857056,13166609,4584981,9560430,19280929,3675160
5,2015,321418821,13234696,4647924,9552554,19320968,3739654
6,2016,323127515,13270694,4688198,9533662,19334778,3816355
7,2017,325719178,13278000,4712421,9514113,19322607,3885579
8,2018,327167439,13249879,4726314,9484158,19276644,3935179
9,2019,328239523,13214799,4731803,9458539,19216182,3979845


In [15]:
# Merge to df
# Reshape the population table to long form (one row per year/area).
# Every non-'year' column of census itself is melted, so this step no longer
# relies on the CPI table having exactly the same area columns.
census = pd.melt(census, id_vars=['year'],
        var_name='area', value_name='population')

# Left join keeps all rows of df; area/year pairs without census data get NaN.
# validate='m:1' guards against duplicated (area, year) rows in census,
# which would otherwise multiply df rows.
df = df.merge(census, how='left', on=['area','year'], validate='m:1')

print(len(df)) # should be 16 years x 6 area = 96 rows
display(df.sample(5))

96


Unnamed: 0,year,area,cpi,cli,population
55,2013,LA,239.207,136.4,13097434.0
42,2016,Seattle,255.254,121.4,3816355.0
25,2015,Chicago,227.792333,116.9,9552554.0
45,2019,Seattle,278.1815,121.4,3979845.0
80,2006,All,85.053674,,


# U.S. market data

I created the "data/rev.csv" file by combining U.S. market statistics from multiple sources.
Data sources are
- U.S. dance studio 
    - market size: [Statista](https://www.statista.com/statistics/1175824/dance-studio-industry-market-size-us/)
    - number of businesses: [IBISWorld](https://www.ibisworld.com/industry-statistics/number-of-businesses/dance-studios-united-states/)
    - number of employees: [IBISWorld](https://www.ibisworld.com/industry-statistics/employment/dance-studios-united-states/)
    - wages: [IBISWorld](https://www.ibisworld.com/industry-statistics/wages/dance-studios-united-states/)
- U.S. fitness and recreational sports centers 
    - revenue: [FRED, cited U.S. Bureau of Labor Statistics](https://fred.stlouisfed.org/series/REVEF71394ALLEST).

In [16]:
# Read data file
rev = pd.read_csv('data/rev.csv')

# Set every money scale to billion dollars:
# these two columns are divided by 1000, the other columns are left as read
rev[['fitness','studio_wage']] = rev[['fitness','studio_wage']]/1000

display(rev)
# Column guide
# fitness: U.S. fitness and recreational sports center revenue
# dance_studio: U.S. dance studio revenue
# studio_num: number of businesses of U.S. dance studio
# studio_emp: number of employees of U.S. dance studio
# studio_wage: total wage of U.S. dance studio

Unnamed: 0,year,fitness,dance_studio,studio_num,studio_emp,studio_wage
0,2006,19.447,,,,
1,2007,21.416,,,,
2,2008,22.339,,,,
3,2009,21.842,,,,
4,2010,22.311,,,,
5,2011,23.191,3.04,,,
6,2012,24.051,3.22,47269.0,90668.0,0.9026
7,2013,25.803,3.28,48399.0,93420.0,0.9029
8,2014,27.001,3.42,52942.0,99696.0,0.9504
9,2015,28.838,3.59,55523.0,104321.0,1.0148


In [17]:
# Merge to df
# The market table has one row per year, so it is joined on year only and
# its values are repeated for every area of that year
df = df.merge(rev, on='year', how='left')

row_count = len(df)
print(row_count) # should be 16 years x 6 area = 96 rows
display(df.sample(5))

96


Unnamed: 0,year,area,cpi,cli,population,fitness,dance_studio,studio_num,studio_emp,studio_wage
67,2009,SanFran,224.633833,164.0,,21.842,,,,
43,2017,Seattle,263.109167,121.4,3885579.0,33.042,3.87,58515.0,114075.0,1.1448
45,2019,Seattle,278.1815,121.4,3979845.0,35.889,4.2,65723.0,126288.0,1.2295
20,2010,Chicago,212.870417,116.9,9470634.0,22.311,,,,
30,2020,Chicago,243.87275,116.9,,24.361,3.43,62808.0,112485.0,1.0768


# Employee statistics data
- Source: [Occupational Employment and Wage Statistics provided by U.S. Bureau of Labor Statistics](https://www.bls.gov/oes/tables.htm).

This website provides a table of employment statistics (wage, number of employees, etc.) for different areas for each year.
I downloaded each year's file; they will be cleaned and concatenated.

## Explanation of fields

Here are the definitions of each field. Fields not explained here are not used in this analysis.

### Area identifier
- area: area code 
- area_name(title):	Area name 

### Job identifier
- occ_code: The 6-digit Standard Occupational Classification (SOC) code or OEWS-specific code for the occupation 
- occ_title: SOC title or OEWS-specific title for the occupation

### Number of employee
- tot_emp: Estimated total employment rounded to the nearest 10 (excludes self-employed).
- emp_prse:	Percent relative standard error (PRSE) for the employment estimate. PRSE is a measure of sampling error, expressed as a percentage of the corresponding estimate. Sampling error occurs when values for a population are estimated from a sample survey of the population, rather than calculated from data for all members of the population. Estimates with lower PRSEs are typically more precise in the presence of sampling error.

### Wage
- h_mean: Mean hourly wage
- a_mean: Mean annual wage 

- mean_prse: Percent relative standard error (PRSE) for the mean wage estimate.
- h_pct10: Hourly 10th percentile wage
- h_pct25: Hourly 25th percentile wage
- h_median: Hourly median wage (or the 50th percentile)
- h_pct75: Hourly 75th percentile wage
- h_pct90: Hourly 90th percentile wage

- a_pct10: Annual 10th percentile wage
- a_pct25: Annual 25th percentile wage
- a_median: Annual median wage (or the 50th percentile)
- a_pct75: Annual 75th percentile wage
- a_pct90: Annual 90th percentile wage

In [18]:
# Let's check how each file looks like
# (the 2010 file is used as the example; two random rows are shown)
sample = pd.read_excel('data/2010.xls')

sample.info()
display(sample.sample(2))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7940 entries, 0 to 7939
Data columns (total 25 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   PRIM_STATE    7940 non-null   object
 1   AREA          7940 non-null   int64 
 2   AREA_NAME     7940 non-null   object
 3   OCC_CODE      7940 non-null   object
 4   OCC_TITLE     7940 non-null   object
 5   GROUP         253 non-null    object
 6   TOT_EMP       7940 non-null   object
 7   EMP_PRSE      7940 non-null   object
 8   JOBS_1000     7940 non-null   object
 9   LOC QUOTIENT  7940 non-null   object
 10  H_MEAN        7940 non-null   object
 11  A_MEAN        7940 non-null   object
 12  MEAN_PRSE     7940 non-null   object
 13  H_PCT10       7940 non-null   object
 14  H_PCT25       7940 non-null   object
 15  H_MEDIAN      7940 non-null   object
 16  H_PCT75       7940 non-null   object
 17  H_PCT90       7940 non-null   object
 18  A_PCT10       7940 non-null   object
 19  A_PCT2

Unnamed: 0,PRIM_STATE,AREA,AREA_NAME,OCC_CODE,OCC_TITLE,GROUP,TOT_EMP,EMP_PRSE,JOBS_1000,LOC QUOTIENT,...,H_MEDIAN,H_PCT75,H_PCT90,A_PCT10,A_PCT25,A_MEDIAN,A_PCT75,A_PCT90,ANNUAL,HOURLY
1813,DC,47900,"Washington-Arlington-Alexandria, DC-VA-MD-WV",33-1012,First-Line Supervisors of Police and Detectives,,2550,4.6,0.898,1.117,...,52.71,67.45,73.77,76280,88300,109630,140300,153430,,
4950,MI,19820,"Detroit-Warren-Livonia, MI",51-8099,"Plant and System Operators, All Other",,370,26.0,0.221,2.584,...,23.21,28.34,31.88,28830,37490,48280,58950,66310,,


In [19]:
# Jobs in interest: occupation code -> short label used in this notebook
#   dancer: dancer
#   choreo: choreographer
#   fit_trainer: fitness trainer/instructor
#   rec_worker: recreational worker
#   all_jobs: all jobs sum/mean
code_job = {
    '27-2031': 'dancer',
    '27-2032': 'choreo',
    '39-9031': 'fit_trainer',
    '39-9032': 'rec_worker',
    '00-0000': 'all_jobs',
}

In [20]:
%%script false --no-raise-error
# This block combines multiple wage data files, then generates a single csv file.
# If you already have data/emp.csv, this block can be skipped. It takes time to run.

# Columns kept from both the metropolitan and the national tables
KEEP_COLUMNS = ['area', 'area_name', 'occ_code', 'occ_title',
                'tot_emp', 'emp_prse', 'h_mean', 'a_mean', 'mean_prse', 'h_pct10',
                'h_pct25', 'h_median', 'h_pct75', 'h_pct90', 'a_pct10', 'a_pct25',
                'a_median', 'a_pct75', 'a_pct90']


def _read_wage_table(stem):
    """Read data/<stem>.xls, falling back to data/<stem>.xlsx if it is missing.

    Column names are stripped and lower-cased so every year uses the same
    names. Only a missing file triggers the fallback; any other error
    (unreadable file, missing Excel engine, ...) is raised instead of being
    silently swallowed.
    """
    try:
        table = pd.read_excel('data/' + stem + '.xls')
    except FileNotFoundError:
        table = pd.read_excel('data/' + stem + '.xlsx')
    table.columns = table.columns.str.strip().str.lower()
    return table


df_save = []
for year in range(2006,2022):
    print(year)
    metro = _read_wage_table(str(year))           # metropolitan area statistics data
    national = _read_wage_table(str(year)+'nat')  # national statistics data

    # unify feature names in all years
    metro = metro.rename(columns={'area_title':'area_name'})

    # LA area code changed: 31100 up to 2014, 31080 afterwards
    area_la = 31080 if year>2014 else 31100

    # Select metropolitan areas and occupations in interest.
    # The occupation codes are the keys of code_job, so the two cannot drift
    # apart. .copy() gives independent frames, so the column assignments
    # below do not write into a slice of the raw table.
    occ_codes = list(code_job)
    metro = metro.loc[metro.area.isin([area_la, 41860, 16980, 35620, 42660])
                      & metro.occ_code.isin(occ_codes)].copy()
    national = national.loc[national.occ_code.isin(occ_codes)].copy()

    # Change area code to the unique area names
    metro['area'] = metro['area'].map(zipcode_area)

    # To match columns with metropolitan dataframe
    national['area'] = 'All'
    national['area_name'] = 'U.S. all'

    # Combine national data to metropolitan data, keeping only columns to use
    metro = pd.concat([national[KEEP_COLUMNS], metro[KEEP_COLUMNS]], ignore_index=True)

    # add year
    metro['year'] = year

    # add the unique occupation name
    metro['occ'] = metro['occ_code'].map(code_job)

    # Cleaning: turn the '*' and '**' markers into NaN
    # (presumably placeholders for unavailable estimates - confirm against
    # the field notes of the source files)
    metro = metro.replace(['**', '*'], np.nan)

    # Append to a list to save
    df_save.append(metro)

# Save a csv file
df_save = pd.concat(df_save)
df_save.to_csv('data/emp.csv', index=False)

2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021


In [21]:
# Check data is prepared as intended
emp = pd.read_csv('data/emp.csv')

emp.info()

# Confirm if city and occupation labels are correct.
# zipcode_area maps two codes to 'LA', so its values are de-duplicated
# (first-seen order kept) to print each area only once.
print("Check area names are correctly marked ------ ")
for x in dict.fromkeys(zipcode_area.values()):
    print(x,emp[emp.area==x].area_name.unique())

print("\n Check occupation names are correctly marked ------ ")
for x in code_job.values():
    print(x,emp[emp.occ==x].occ_title.unique())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458 entries, 0 to 457
Data columns (total 21 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   area       458 non-null    object 
 1   area_name  458 non-null    object 
 2   occ_code   458 non-null    object 
 3   occ_title  458 non-null    object 
 4   tot_emp    426 non-null    float64
 5   emp_prse   426 non-null    float64
 6   h_mean     452 non-null    float64
 7   a_mean     369 non-null    float64
 8   mean_prse  452 non-null    float64
 9   h_pct10    452 non-null    float64
 10  h_pct25    452 non-null    float64
 11  h_median   452 non-null    float64
 12  h_pct75    452 non-null    float64
 13  h_pct90    452 non-null    float64
 14  a_pct10    369 non-null    float64
 15  a_pct25    369 non-null    float64
 16  a_median   369 non-null    float64
 17  a_pct75    369 non-null    float64
 18  a_pct90    369 non-null    float64
 19  year       458 non-null    int64  
 20  occ       

In [22]:
# The short area and occupation labels were checked against these
# descriptive columns, so the descriptive columns can be removed now
emp = emp.drop(columns=['area_name','occ_code','occ_title'])

row_count = len(emp)
print(row_count) # 16 years x 6 area x 5 jobs = 480 rows, if no missing record
display(emp.sample(5))

458


Unnamed: 0,area,tot_emp,emp_prse,h_mean,a_mean,mean_prse,h_pct10,h_pct25,h_median,h_pct75,h_pct90,a_pct10,a_pct25,a_median,a_pct75,a_pct90,year,occ
45,Chicago,490.0,38.6,12.93,,14.7,7.21,7.69,10.33,14.26,26.16,,,,,,2007,dancer
195,NY,8305030.0,0.3,27.38,56940.0,0.5,9.02,12.38,20.51,34.61,53.39,18770.0,25760.0,42660.0,71980.0,111060.0,2012,all_jobs
85,Seattle,320.0,41.7,18.52,,17.9,8.21,11.27,13.06,28.22,30.89,,,,,,2008,dancer
441,LA,70.0,48.6,30.41,63250.0,11.7,19.78,19.78,36.86,36.86,36.88,41140.0,41140.0,76660.0,76660.0,76700.0,2021,choreo
408,All,248070.0,1.5,21.95,45650.0,0.7,10.4,13.38,19.48,27.64,36.81,21640.0,27840.0,40510.0,57490.0,76550.0,2020,fit_trainer


In [23]:
# Merge to df
# Each (area, year) row of df is repeated once per matching row of emp,
# i.e. once per occupation recorded for that area and year
df = df.merge(emp, on=['area','year'], how='left')

row_count = len(df)
print(row_count) # 16 years x 6 area x 5 jobs = 480 rows, if no missing record
display(df.sample(5))

458


Unnamed: 0,year,area,cpi,cli,population,fitness,dance_studio,studio_num,studio_emp,studio_wage,...,h_pct25,h_median,h_pct75,h_pct90,a_pct10,a_pct25,a_median,a_pct75,a_pct90,occ
378,2006,All,85.053674,,,19.447,,,,,...,9.75,14.61,22.99,35.08,15500.0,20270.0,30400.0,47820.0,72960.0,all_jobs
212,2020,Seattle,282.7455,121.4,,24.361,3.43,62808.0,112485.0,1.0768,...,18.49,28.04,45.65,68.3,31170.0,38460.0,58330.0,94960.0,142070.0,all_jobs
96,2009,Chicago,209.995083,116.9,,21.842,,,,,...,8.65,10.0,13.3,18.89,16730.0,18000.0,20810.0,27660.0,39300.0,rec_worker
284,2018,LA,265.962,136.4,13249879.0,33.971,4.1,63363.0,120456.0,1.1787,...,11.93,13.88,16.64,19.24,23080.0,24820.0,28870.0,34610.0,40010.0,rec_worker
294,2020,LA,278.567,136.4,,24.361,3.43,62808.0,112485.0,1.0768,...,14.25,16.2,19.39,23.44,27550.0,29650.0,33690.0,40330.0,48760.0,rec_worker


In [24]:
# It was expected to have 6(area) x 5(jobs) x 16(years) = 480 records
# But we have 458 rows.
# Which records are missing?

# For every year/area pair that does not have all 5 occupations, print the
# pair together with the occupations that have no record
for year in range(2006,2022):
    for area in df.area.unique():
        occ_present = df.loc[(df.area==area)&(df.year==year)].occ
        if occ_present.nunique()==5:
            continue
        print(year,area, set(df.occ.unique()) - set(occ_present.unique()))

2006 NY {'dancer'}
2009 Seattle {'choreo'}
2010 Seattle {'dancer'}
2011 Seattle {'dancer', 'choreo'}
2012 Seattle {'dancer', 'choreo'}
2013 Seattle {'dancer'}
2014 Seattle {'dancer'}
2015 Seattle {'dancer'}
2016 Seattle {'dancer'}
2018 Seattle {'choreo'}
2019 NY {'choreo'}
2019 Seattle {'choreo'}
2019 SanFran {'choreo'}
2020 NY {'choreo'}
2020 Chicago {'choreo'}
2020 Seattle {'choreo'}
2020 SanFran {'choreo'}
2021 Chicago {'dancer', 'choreo'}
2021 Seattle {'dancer'}


Some records of dancers' income data are missing in multiple cities and years. 
Of course, dancers are hard to track!
Let's not bother handling the missing data for now.
If dancers' statistics are irregular over the years, we can't impute the empty records anyway.

# Save organized dataset
Now, we have all data prepared. 
Let's save it for next steps.

In [25]:
%%script false --no-raise-error
# If you already have data/dance.csv, this block can be skipped.
# Writes the fully merged dataframe for the next steps.
# NOTE(review): unlike the other to_csv calls in this notebook, this one does
# not pass index=False, so the row index is written as an extra first column -
# confirm that whatever reads data/dance.csv expects that.
df.to_csv('data/dance.csv')