# Data source
All data comes from the Bureau of Labor Statistics OES tables: https://www.bls.gov/oes/tables.htm

In [3]:
import pandas as pd

In [4]:
def null_cleaning(df, threshold=0.2):
    """Report the missing values in ``df`` and preview what dropping them would cost.

    A column counts as a "problem column" when its share of nulls, rounded to
    three decimals, is at least ``threshold``. The function prints the problem
    columns, the share of columns left once they are dropped, and the share of
    rows left after additionally dropping every row that still holds a null.
    ``df`` itself is never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    threshold : float, default 0.2
        Minimum (rounded) null share at which a column is flagged.

    Returns
    -------
    list
        One ``[column label, null count, null share]`` entry per column that
        contains at least one null, in column order.
    """
    def kept_share(kept, total):
        # An empty axis has nothing to lose; report NaN instead of dividing by zero.
        return kept / total if total else float("nan")

    n_rows, n_cols = df.shape

    # Count the nulls once for the whole frame and walk the result by position,
    # so neither duplicate nor integer column labels can confuse the lookup.
    null_counts = df.isnull().sum().tolist()

    null_cols = []
    problem_cols = []
    for col_name, num_nulls in zip(df.columns, null_counts):
        if num_nulls == 0:
            continue
        # num_nulls > 0 implies n_rows > 0, so this division is safe.
        percent_of_data = round(num_nulls / n_rows, 3)
        null_cols.append([col_name, num_nulls, percent_of_data])
        # The comparison uses the rounded share, as the report shows it.
        if percent_of_data >= threshold:
            problem_cols.append(col_name)

    # what happens if we drop the problem columns?
    df_no_prob_cols = df.drop(columns=problem_cols)
    prob_cols_percent = kept_share(df_no_prob_cols.shape[1], n_cols)

    # what happens if we drop the problem rows too?
    df_no_nulls = df_no_prob_cols.dropna()
    prob_rows_percent = kept_share(df_no_nulls.shape[0], n_rows)

    print("The problem columns are: ", problem_cols)
    # The shares are fractions; format them as real percentages.
    print("After dropping the problem columns, you are left with {:.1%} of your columns."
          .format(prob_cols_percent))
    print("After dropping the problem columns, and then dropping all rows containing nulls,\n"
          " you are left with {:.1%} of your rows.".format(prob_rows_percent))
    return null_cols


# 1997

In [120]:
df_97 = pd.read_excel('../data/state_1997_dl.xls')

In [121]:
df_97.shape

(34473, 23)

In [122]:
df_97.head(47)

Unnamed: 0.1,Unnamed: 0,1997 State OES Estimates,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22
0,,,,,,,,,,,...,,,,,,,,,,
1,,Occupational Employment Statistics (OES) Survey,,,,,,,,,...,,,,,,,,,,
2,,"Bureau of Labor Statistics, Department of Labor",,,,,,,,,...,,,,,,,,,,
3,,website: http://stats.bls.gov/oes/,,,,,,,,,...,,,,,,,,,,
4,,phone: 202-691-6569,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
6,,Column labels:,,,,,,,,,...,,,,,,,,,,
7,,area - fips code of the state,,,,,,,,,...,,,,,,,,,,
8,,st - the state abbreviation,,,,,,,,,...,,,,,,,,,,
9,,state - the state name,,,,,,,,,...,,,,,,,,,,


In [123]:
# Drop the preamble rows (index labels 0 through 38) that sit above the real header row.
df_97 = df_97.drop(index=range(39))

In [124]:
df_97.head()

Unnamed: 0.1,Unnamed: 0,1997 State OES Estimates,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22
39,area,st,state,occ_code,occ_titl,group,tot_emp,emp_prse,h_mean,a_mean,...,h_median,h_wpct75,h_wpct90,a_wpct10,a_wpct25,a_median,a_wpct75,a_wpct90,annual,year
40,01,AL,Alabama,10000,Managerial and Administrative Occupations,div,,,,,...,,,,,,,,,,
41,01,AL,Alabama,13000,Staff and Administrative Specialty Managerial ...,maj,**,**,18.04,37530,...,16.25,22.07,32.97,20758,25875,33800,45906,68578,,1997
42,01,AL,Alabama,13002,Financial Managers,,8140,3.3,24.23,50400,...,20.97,33.97,46.32,24336,32261,43617.6,70658,96346,,1997
43,01,AL,Alabama,13005,"Personnel, Training, and Labor Relations Managers",,2760,2.9,20.84,43350,...,19.29,28.17,38.99,21840,28018,40123.2,58594,81099,,1997


In [125]:
# Promote the first remaining row (the real column labels) to be the header.
new_header = df_97.iloc[0]
df_97 = df_97.iloc[1:].copy()
# Assign plain labels rather than the Series itself: the Series carries its row
# label (39) as its name, which would otherwise become the columns' name and
# show up as a stray "39" in every later display of the frame.
df_97.columns = new_header.tolist()

In [126]:
df_97.head()

39,area,st,state,occ_code,occ_titl,group,tot_emp,emp_prse,h_mean,a_mean,...,h_median,h_wpct75,h_wpct90,a_wpct10,a_wpct25,a_median,a_wpct75,a_wpct90,annual,year
40,1,AL,Alabama,10000,Managerial and Administrative Occupations,div,,,,,...,,,,,,,,,,
41,1,AL,Alabama,13000,Staff and Administrative Specialty Managerial ...,maj,**,**,18.04,37530.0,...,16.25,22.07,32.97,20758.0,25875.0,33800.0,45906.0,68578.0,,1997.0
42,1,AL,Alabama,13002,Financial Managers,,8140,3.3,24.23,50400.0,...,20.97,33.97,46.32,24336.0,32261.0,43617.6,70658.0,96346.0,,1997.0
43,1,AL,Alabama,13005,"Personnel, Training, and Labor Relations Managers",,2760,2.9,20.84,43350.0,...,19.29,28.17,38.99,21840.0,28018.0,40123.2,58594.0,81099.0,,1997.0
44,1,AL,Alabama,13008,Purchasing Managers,,2370,4.7,19.07,39660.0,...,15.99,23.59,39.15,17451.0,23442.0,33259.2,49067.0,81432.0,,1997.0


In [127]:
df_97.isna().sum().sum()

70574

In [128]:
null_cleaning(df_97)

The problem columns are:  ['group', 'annual']
After dropping the problem columns, you are left with  0.9130434782608695 % of your columns.
After dropping the problem columns, and then dropping all rows containing nulls,
 you are left with  0.9890221589754015 % of your rows.


[['group', 31764, 0.922],
 ['tot_emp', 378, 0.011],
 ['emp_prse', 378, 0.011],
 ['h_mean', 378, 0.011],
 ['a_mean', 378, 0.011],
 ['mean_prse', 378, 0.011],
 ['h_wpct10', 378, 0.011],
 ['h_wpct25', 378, 0.011],
 ['h_median', 378, 0.011],
 ['h_wpct75', 378, 0.011],
 ['h_wpct90', 378, 0.011],
 ['a_wpct10', 378, 0.011],
 ['a_wpct25', 378, 0.011],
 ['a_median', 378, 0.011],
 ['a_wpct75', 378, 0.011],
 ['a_wpct90', 378, 0.011],
 ['annual', 32762, 0.951],
 ['year', 378, 0.011]]

In [129]:
# Keep the state, the occupation identifiers, employment, the mean wages and the year.
features = ['st', 'occ_code', 'occ_titl', 'tot_emp', 'h_mean', 'a_mean', 'year']
df_97 = df_97.loc[:, features].copy()

In [130]:
df_97.head()

39,st,occ_code,occ_titl,tot_emp,h_mean,a_mean,year
40,AL,10000,Managerial and Administrative Occupations,,,,
41,AL,13000,Staff and Administrative Specialty Managerial ...,**,18.04,37530.0,1997.0
42,AL,13002,Financial Managers,8140,24.23,50400.0,1997.0
43,AL,13005,"Personnel, Training, and Labor Relations Managers",2760,20.84,43350.0,1997.0
44,AL,13008,Purchasing Managers,2370,19.07,39660.0,1997.0


In [131]:
null_cleaning(df_97)

The problem columns are:  []
After dropping the problem columns, you are left with  1.0 % of your columns.
After dropping the problem columns, and then dropping all rows containing nulls,
 you are left with  0.9890221589754015 % of your rows.


[['tot_emp', 378, 0.011],
 ['h_mean', 378, 0.011],
 ['a_mean', 378, 0.011],
 ['year', 378, 0.011]]

In [132]:
df_97[df_97['tot_emp'].isnull()]['occ_titl'].value_counts()

Professional, Paraprofessional, and Technical Occupations                              54
Managerial and Administrative Occupations                                              54
Clerical and Administrative Support Occupations                                        54
Agricultural, Forestry, Fishing, and Related Occupations                               54
Sales and Related Occupations                                                          54
Production, Construction, Operating, Maintenance, and Material Handling Occupations    54
Service Occupations                                                                    54
Name: occ_titl, dtype: int64

In [133]:
df_97.shape

(34433, 7)

In [134]:
df_97.dtypes

39
st          object
occ_code    object
occ_titl    object
tot_emp     object
h_mean      object
a_mean      object
year        object
dtype: object

In [135]:
# Use the 'occ_title' spelling so the column name matches the files that already spell it out.
df_97 = df_97.rename(columns={'occ_titl': 'occ_title'})

In [136]:
df_97.head()

39,st,occ_code,occ_title,tot_emp,h_mean,a_mean,year
40,AL,10000,Managerial and Administrative Occupations,,,,
41,AL,13000,Staff and Administrative Specialty Managerial ...,**,18.04,37530.0,1997.0
42,AL,13002,Financial Managers,8140,24.23,50400.0,1997.0
43,AL,13005,"Personnel, Training, and Labor Relations Managers",2760,20.84,43350.0,1997.0
44,AL,13008,Purchasing Managers,2370,19.07,39660.0,1997.0


In [157]:
# Write the cleaned 1997 table to the current working directory.
# NOTE(review): with the default index=True the row labels are written as an
# unnamed first column — confirm that is wanted by whatever reads this file.
df_97.to_csv('./1997_jobs.csv')

# 1998

In [137]:
df_98 = pd.read_excel('../data/state_1998_dl.xls')

In [138]:
# Drop the first 39 rows of preamble (index labels 0 through 38).
df_98 = df_98.drop(index=range(39))

In [139]:
df_98.head()

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22
39,,,,,,,,,,,...,,,,,,,,,,
40,area,st,state,occ_code,occ_title,group,tot_emp,emp_prse,h_mean,a_mean,...,h_median,h_wpct75,h_wpct90,a_wpct10,a_wpct25,a_median,a_wpct75,a_wpct90,annual,year
41,01,AL,Alabama,10000,Managerial and Administrative Occupations,div,,,,,...,,,,,,,,,,
42,01,AL,Alabama,13000,Staff and Administrative Specialty Managerial ...,maj,38650,1.8,18.71,38930,...,17,22.98,34.74,20530,26460,35350,47800,72260,,1998
43,01,AL,Alabama,13002,Financial Managers,,8140,2.5,24.43,50800,...,21.51,34.28,46.16,23740,32070,44740,71300,96020,,1998


In [140]:
# NOTE: in this file the first remaining row (index 39) is entirely blank, so this
# pass installs an all-NaN header; the real labels sit in the row below and are
# promoted by repeating the same three steps in the next cell.
new_header = df_98.iloc[0] #grab the first remaining row (the blank row 39 here)
df_98 = df_98[1:].copy() #keep everything below that row
df_98.columns = new_header #install that row as the (temporary) header

In [141]:
df_98.head()

39,NaN,NaN.1,NaN.2,NaN.3,NaN.4,NaN.5,NaN.6,NaN.7,NaN.8,NaN.9,...,NaN.10,NaN.11,NaN.12,NaN.13,NaN.14,NaN.15,NaN.16,NaN.17,NaN.18,NaN.19
40,area,st,state,occ_code,occ_title,group,tot_emp,emp_prse,h_mean,a_mean,...,h_median,h_wpct75,h_wpct90,a_wpct10,a_wpct25,a_median,a_wpct75,a_wpct90,annual,year
41,01,AL,Alabama,10000,Managerial and Administrative Occupations,div,,,,,...,,,,,,,,,,
42,01,AL,Alabama,13000,Staff and Administrative Specialty Managerial ...,maj,38650,1.8,18.71,38930,...,17,22.98,34.74,20530,26460,35350,47800,72260,,1998
43,01,AL,Alabama,13002,Financial Managers,,8140,2.5,24.43,50800,...,21.51,34.28,46.16,23740,32070,44740,71300,96020,,1998
44,01,AL,Alabama,13005,"Personnel, Training, and Labor Relations Managers",,2910,2.4,21.53,44780,...,19.79,30.27,40.04,21730,28430,41150,62970,83280,,1998


In [142]:
# Second pass: the first remaining row now holds the real column labels.
new_header = df_98.iloc[0]
df_98 = df_98.iloc[1:].copy()
# Assign plain labels rather than the Series itself: the Series carries its row
# label (40) as its name, which would otherwise become the columns' name and
# show up as a stray "40" in every later display of the frame.
df_98.columns = new_header.tolist()

In [143]:
df_98.head()

40,area,st,state,occ_code,occ_title,group,tot_emp,emp_prse,h_mean,a_mean,...,h_median,h_wpct75,h_wpct90,a_wpct10,a_wpct25,a_median,a_wpct75,a_wpct90,annual,year
41,1,AL,Alabama,10000,Managerial and Administrative Occupations,div,,,,,...,,,,,,,,,,
42,1,AL,Alabama,13000,Staff and Administrative Specialty Managerial ...,maj,38650.0,1.8,18.71,38930.0,...,17.0,22.98,34.74,20530.0,26460.0,35350.0,47800.0,72260.0,,1998.0
43,1,AL,Alabama,13002,Financial Managers,,8140.0,2.5,24.43,50800.0,...,21.51,34.28,46.16,23740.0,32070.0,44740.0,71300.0,96020.0,,1998.0
44,1,AL,Alabama,13005,"Personnel, Training, and Labor Relations Managers",,2910.0,2.4,21.53,44780.0,...,19.79,30.27,40.04,21730.0,28430.0,41150.0,62970.0,83280.0,,1998.0
45,1,AL,Alabama,13008,Purchasing Managers,,2490.0,9.6,18.06,37560.0,...,15.75,22.77,37.06,15400.0,20810.0,32750.0,47360.0,77090.0,,1998.0


In [144]:
# Keep the state, the occupation identifiers, employment, the mean wages and the year.
# The last label really is 'year ' with a trailing space in this file.
features = ['st', 'occ_code', 'occ_title', 'tot_emp', 'h_mean', 'a_mean', 'year ']
df_98 = df_98.loc[:, features].copy()

In [145]:
# Strip the trailing space from the 'year ' label.
df_98 = df_98.rename(columns={'year ': 'year'})

In [146]:
null_cleaning(df_98)

The problem columns are:  []
After dropping the problem columns, you are left with  1.0 % of your columns.
After dropping the problem columns, and then dropping all rows containing nulls,
 you are left with  0.9894117647058823 % of your rows.


[['tot_emp', 378, 0.011],
 ['h_mean', 378, 0.011],
 ['a_mean', 378, 0.011],
 ['year', 378, 0.011]]

In [147]:
df_98[df_98['tot_emp'].isnull()]['occ_title'].value_counts()

Professional, Paraprofessional, and Technical Occupations                              54
Managerial and Administrative Occupations                                              54
Clerical and Administrative Support Occupations                                        54
Agricultural, Forestry, Fishing, and Related Occupations                               54
Sales and Related Occupations                                                          54
Production, Construction, Operating, Maintenance, and Material Handling Occupations    54
Service Occupations                                                                    54
Name: occ_title, dtype: int64

In [148]:
df_98.dtypes

40
st           object
occ_code     object
occ_title    object
tot_emp      object
h_mean       object
a_mean       object
year         object
dtype: object

# 1999

In [149]:
df_99 = pd.read_excel('../data/state_1999_dl.xls')

In [150]:
df_99.head(45)

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24
0,,,,,1999 State OES Estimates,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,Occupational Employment Statistics (OES) Survey,,,,,,,,,...,,,,,,,,,,
3,,"Bureau of Labor Statistics, Department of Labor",,,,,,,,,...,,,,,,,,,,
4,,website: http://stats.bls.gov/oes/,,,,,,,,,...,,,,,,,,,,
5,,phone: 202-691-6569,,,,,,,,,...,,,,,,,,,,
6,,,,,,,,,,,...,,,,,,,,,,
7,,Column labels:,,,,,,,,,...,,,,,,,,,,
8,,area - fips code of the state,,,,,,,,,...,,,,,,,,,,
9,,st - the state abbreviation,,,,,,,,,...,,,,,,,,,,


In [151]:
# Drop the preamble rows (index labels 0 through 41) that sit above the real header row.
df_99 = df_99.drop(index=range(42))

In [152]:
df_99.head()

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24
42,area,st,state,occ_code,occ_titl,group,tot_emp,emp_prse,h_mean,a_mean,...,h_wpct90,a_wpct10,a_wpct25,a_median,a_wpct75,a_wpct90,one or three,annual,release,year
43,01,AL,Alabama,11-0000,Management Occupations,major,133700,1.9,25.61,53270,...,47.58,20630,29920,46020,69490,98970,1,,,1999
44,01,AL,Alabama,11-1011,Chief Executives,,9030,4.4,44.21,91960,...,#,37230,59640,93130,142280,#,1,,,1999
45,01,AL,Alabama,11-1021,General and Operations Managers,,44460,2.7,26.46,55040,...,54.02,20720,29480,44660,73270,112370,1,,,1999
46,01,AL,Alabama,11-1031,Legislators,,2060,21.4,7.16,14900,...,9.16,11140,11780,12850,13930,19060,1,,,1999


In [153]:
# Promote the first remaining row (the real column labels) to be the header.
new_header = df_99.iloc[0]
df_99 = df_99.iloc[1:].copy()
# Assign plain labels rather than the Series itself: the Series carries its row
# label (42) as its name, which would otherwise become the columns' name and
# show up as a stray "42" in every later display of the frame.
df_99.columns = new_header.tolist()


In [154]:
df_99.head()

42,area,st,state,occ_code,occ_titl,group,tot_emp,emp_prse,h_mean,a_mean,...,h_wpct90,a_wpct10,a_wpct25,a_median,a_wpct75,a_wpct90,one or three,annual,release,year
43,1,AL,Alabama,11-0000,Management Occupations,major,133700,1.9,25.61,53270,...,47.58,20630,29920,46020,69490,98970,1,,,1999
44,1,AL,Alabama,11-1011,Chief Executives,,9030,4.4,44.21,91960,...,#,37230,59640,93130,142280,#,1,,,1999
45,1,AL,Alabama,11-1021,General and Operations Managers,,44460,2.7,26.46,55040,...,54.02,20720,29480,44660,73270,112370,1,,,1999
46,1,AL,Alabama,11-1031,Legislators,,2060,21.4,7.16,14900,...,9.16,11140,11780,12850,13930,19060,1,,,1999
47,1,AL,Alabama,11-2011,Advertising and Promotions Managers,,980,11.5,18.13,37710,...,31.81,18210,23030,32170,47600,66150,1,,,1999


In [155]:
# Keep the state, the occupation identifiers, employment, the mean wages and the year.
features = ['st', 'occ_code', 'occ_titl', 'tot_emp', 'h_mean', 'a_mean', 'year']
df_99 = df_99.loc[:, features].copy()

In [156]:
# Use the 'occ_title' spelling so the column name matches the files that already spell it out.
df_99 = df_99.rename(columns={'occ_titl': 'occ_title'})

In [157]:
df_99.head()

42,st,occ_code,occ_title,tot_emp,h_mean,a_mean,year
43,AL,11-0000,Management Occupations,133700,25.61,53270,1999
44,AL,11-1011,Chief Executives,9030,44.21,91960,1999
45,AL,11-1021,General and Operations Managers,44460,26.46,55040,1999
46,AL,11-1031,Legislators,2060,7.16,14900,1999
47,AL,11-2011,Advertising and Promotions Managers,980,18.13,37710,1999


# 2000

In [158]:
df_00 = pd.read_excel('../data/state_2000_dl.xls')

In [159]:
df_00.head(45)

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23
0,,2000 State OES Estimates,,,,,,,,,...,,,,,,,,,,
1,,,,,,,,,,,...,,,,,,,,,,
2,,Occupational Employment Statistics (OES) Survey,,,,,,,,,...,,,,,,,,,,
3,,"Bureau of Labor Statistics, Department of Labor",,,,,,,,,...,,,,,,,,,,
4,,website: http://stats.bls.gov/oes/,,,,,,,,,...,,,,,,,,,,
5,,phone: 202-691-6569,,,,,,,,,...,,,,,,,,,,
6,,,,,,,,,,,...,,,,,,,,,,
7,,Column labels:,,,,,,,,,...,,,,,,,,,,
8,,area - fips code of the state,,,,,,,,,...,,,,,,,,,,
9,,st - the state abbreviation,,,,,,,,,...,,,,,,,,,,


In [160]:
# Drop the preamble rows (index labels 0 through 40) that sit above the real header row.
df_00 = df_00.drop(index=range(41))

In [161]:
df_00.head()

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,...,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23
41,area,st,state,occ_code,occ_titl,group,tot_emp,emp_prse,h_mean,a_mean,...,h_wpct75,h_wpct90,a_wpct10,a_wpct25,a_median,a_wpct75,a_wpct90,annual,release,year
42,01,AL,Alabama,11-0000,Management Occupations,major,120380,1.7,27.67,57560,...,36.38,50.76,22530,33310,51620,75680,105590,,,2000
43,01,AL,Alabama,11-1011,Chief Executives,,6090,3.7,48.26,100380,...,#,#,43650,67190,107050,#,#,,,2000
44,01,AL,Alabama,11-1021,General and Operations Managers,,38020,2.6,28.29,58840,...,37.94,55.36,22230,32540,50400,78920,115160,,,2000
45,01,AL,Alabama,11-1031,Legislators,,1220,20.8,8.48,17640,...,8.31,14.32,11520,12340,13720,17280,29790,,,2000


In [162]:
# Promote the first remaining row (the real column labels) to be the header.
new_header = df_00.iloc[0]
df_00 = df_00.iloc[1:].copy()
# Assign plain labels rather than the Series itself: the Series carries its row
# label (41) as its name, which would otherwise become the columns' name and
# show up as a stray "41" in every later display of the frame.
df_00.columns = new_header.tolist()

In [163]:
df_00.head()

41,area,st,state,occ_code,occ_titl,group,tot_emp,emp_prse,h_mean,a_mean,...,h_wpct75,h_wpct90,a_wpct10,a_wpct25,a_median,a_wpct75,a_wpct90,annual,release,year
42,1,AL,Alabama,11-0000,Management Occupations,major,120380,1.7,27.67,57560,...,36.38,50.76,22530,33310,51620,75680,105590,,,2000
43,1,AL,Alabama,11-1011,Chief Executives,,6090,3.7,48.26,100380,...,#,#,43650,67190,107050,#,#,,,2000
44,1,AL,Alabama,11-1021,General and Operations Managers,,38020,2.6,28.29,58840,...,37.94,55.36,22230,32540,50400,78920,115160,,,2000
45,1,AL,Alabama,11-1031,Legislators,,1220,20.8,8.48,17640,...,8.31,14.32,11520,12340,13720,17280,29790,,,2000
46,1,AL,Alabama,11-2011,Advertising and Promotions Managers,,800,6.3,20.7,43050,...,25.98,38.33,19630,26060,35510,54030,79720,,,2000


In [164]:
# Keep the state, the occupation identifiers, employment, the mean wages and the year.
features = ['st', 'occ_code', 'occ_titl', 'tot_emp', 'h_mean', 'a_mean', 'year']
df_00 = df_00.loc[:, features].copy()

In [165]:
df_00.head()

41,st,occ_code,occ_titl,tot_emp,h_mean,a_mean,year
42,AL,11-0000,Management Occupations,120380,27.67,57560,2000
43,AL,11-1011,Chief Executives,6090,48.26,100380,2000
44,AL,11-1021,General and Operations Managers,38020,28.29,58840,2000
45,AL,11-1031,Legislators,1220,8.48,17640,2000
46,AL,11-2011,Advertising and Promotions Managers,800,20.7,43050,2000


In [166]:
# Use the 'occ_title' spelling so the column name matches the files that already spell it out.
df_00 = df_00.rename(columns={'occ_titl': 'occ_title'})

# 2001

In [167]:
df_01 = pd.read_excel('../data/state_2001_dl.xls')

In [168]:
df_01.head()


Unnamed: 0,area,st,state,occ_code,occ_title,group,tot_emp,emp_prse,h_mean,a_mean,...,h_median,h_wpct75,h_wpct90,a_wpct10,a_wpct25,a_median,a_wpct75,a_wpct90,annual,year
0,1,AL,Alabama,00-0000,All Occupations,,1827960,0.5,14.2,29530,...,11.02,17.52,26.04,12950,15950,22910,36450,54160,,2001
1,1,AL,Alabama,11-0000,Management Occupations,major,107460,1.3,29.33,61000,...,26.44,38.52,53.51,24690,36400,55000,80110,111300,,2001
2,1,AL,Alabama,11-1011,Chief Executives,,5290,2.9,49.69,103350,...,53.72,#,#,46510,71520,111740,#,#,,2001
3,1,AL,Alabama,11-1021,General and Operations Managers,,33420,2.1,30.85,64170,...,26.76,41.49,60.86,25430,36400,55660,86300,126580,,2001
4,1,AL,Alabama,11-1031,Legislators,,1740,21.2,7.58,15760,...,6.44,7.25,10.01,11510,12220,13400,15070,20820,,2001


In [169]:
# Keep the state, the occupation identifiers, employment, the mean wages and the year.
features = ['st', 'occ_code', 'occ_title', 'tot_emp', 'h_mean', 'a_mean', 'year']
df_01 = df_01.loc[:, features].copy()

In [170]:
df_01.head()

Unnamed: 0,st,occ_code,occ_title,tot_emp,h_mean,a_mean,year
0,AL,00-0000,All Occupations,1827960,14.2,29530,2001
1,AL,11-0000,Management Occupations,107460,29.33,61000,2001
2,AL,11-1011,Chief Executives,5290,49.69,103350,2001
3,AL,11-1021,General and Operations Managers,33420,30.85,64170,2001
4,AL,11-1031,Legislators,1740,7.58,15760,2001


# 2002

In [171]:
df_02 = pd.read_excel('../data/state_2002_dl.xls')

In [172]:
df_02.head()

Unnamed: 0,area,st,state,occ_code,occ_title,group,tot_emp,emp_prse,h_mean,a_mean,...,h_wpct25,h_median,h_wpct75,h_wpct90,a_wpct10,a_wpct25,a_median,a_wpct75,a_wpct90,annual
0,1,AL,Alabama,00-0000,All Occupations,major,1819390,0.5,14.88,30940,...,7.89,11.37,18.13,26.94,13250,16420,23640,37710,56040,
1,1,AL,Alabama,11-0000,Management occupations,major,103720,1.2,32.19,66960,...,18.48,27.93,40.33,56.54,26200,38450,58100,83880,117600,
2,1,AL,Alabama,11-1011,Chief executives,,5340,2.8,60.22,125260,...,35.21,56.3,#,#,44940,73230,117110,#,#,
3,1,AL,Alabama,11-1021,General and operations managers,,31660,1.8,34.63,72040,...,18.8,28.53,43.53,65.57,27110,39100,59350,90540,136390,
4,1,AL,Alabama,11-1031,Legislators,,1730,20.7,7.71,16030,...,6.05,6.64,7.51,9.73,11820,12590,13820,15620,20240,


In [173]:
# Keep the state, the occupation identifiers, employment and the mean wages.
features = ['st', 'occ_code', 'occ_title', 'tot_emp', 'h_mean', 'a_mean']
df_02 = df_02.loc[:, features].copy()

In [174]:
# No year field was selected from this file, so stamp the survey year on every
# row; pandas broadcasts the scalar, no per-row list is needed.
df_02['year'] = 2002

In [27]:
df_02.head()

Unnamed: 0,st,occ_code,occ_title,tot_emp,h_mean,a_mean,year
0,AL,00-0000,All Occupations,1819390,14.88,30940,2002
1,AL,11-0000,Management occupations,103720,32.19,66960,2002
2,AL,11-1011,Chief executives,5340,60.22,125260,2002
3,AL,11-1021,General and operations managers,31660,34.63,72040,2002
4,AL,11-1031,Legislators,1730,7.71,16030,2002


# 2003

In [29]:
df_03 = pd.read_excel('../data/state_may2003_dl.xls')

In [30]:
df_03.head()

Unnamed: 0,AREA,ST,STATE,OCC_CODE,OCC_TITLE,GROUP,TOT_EMP,EMP_PRSE,H_MEAN,A_MEAN,...,H_PCT25,H_MEDIAN,H_PCT75,H_PCT90,A_PCT10,A_PCT25,A_MEDIAN,A_PCT75,A_PCT90,ANNUAL
0,1,AL,Alabama,00-0000,All Occupations,,1820170,0.5,15.06,31330,...,7.97,11.52,18.33,27.23,13250,16580,23960,38140,56650,
1,1,AL,Alabama,11-0000,Management occupations,major,91920,1.2,34.58,71920,...,20.59,30.27,42.96,60.24,29230,42820,62960,89360,125290,
2,1,AL,Alabama,11-1011,Chief executives,,4460,3.5,65.17,135560,...,40.84,63.15,#,#,53720,84950,131360,#,#,
3,1,AL,Alabama,11-1021,General and operations managers,,27570,1.9,37.87,78770,...,21.52,31.79,46.90,#,31750,44770,66130,97550,#,
4,1,AL,Alabama,11-1031,Legislators,,1530,22.8,7.95,16550,...,6.08,6.68,7.67,9.57,11900,12650,13880,15960,19900,


In [31]:
# This file's headers are upper-case; build their lower-case equivalents.
new_cols = [label.lower() for label in df_03.columns]

In [33]:
# Install the lower-cased labels so the column names match the lower-case ones used for earlier years.
df_03.columns = new_cols

In [35]:
# Keep the state, the occupation identifiers, employment and the mean wages.
features = ['st', 'occ_code', 'occ_title', 'tot_emp', 'h_mean', 'a_mean']
df_03 = df_03.loc[:, features].copy()

In [36]:
# No year field was selected from this file, so stamp the survey year on every
# row; pandas broadcasts the scalar, no per-row list is needed.
df_03['year'] = 2003

In [37]:
df_03.head()

Unnamed: 0,st,occ_code,occ_title,tot_emp,h_mean,a_mean,year
0,AL,00-0000,All Occupations,1820170,15.06,31330,2003
1,AL,11-0000,Management occupations,91920,34.58,71920,2003
2,AL,11-1011,Chief executives,4460,65.17,135560,2003
3,AL,11-1021,General and operations managers,27570,37.87,78770,2003
4,AL,11-1031,Legislators,1530,7.95,16550,2003


# 2004

In [39]:
df_04 = pd.read_excel('../data/state_may2004_dl.xls')

In [40]:
df_04.head()

Unnamed: 0,AREA,ST,STATE,OCC_CODE,OCC_TITLE,GROUP,TOT_EMP,EMP_PRSE,H_MEAN,A_MEAN,...,H_MEDIAN,H_PCT75,H_PCT90,A_PCT10,A_PCT25,A_MEDIAN,A_PCT75,A_PCT90,ANNUAL,HOURLY
0,1,AL,Alabama,00-0000,All Occupations,,1830360,0.6,15.19,31590,...,11.76,18.48,27.20,13320,16910,24470,38430,56570,,
1,1,AL,Alabama,11-0000,Management occupations,major,81040,1.3,36.58,76100,...,31.73,44.65,65.81,32510,46480,66000,92860,136890,,
2,1,AL,Alabama,11-1011,Chief executives,,3880,3.8,64.87,134930,...,64.69,#,#,50150,87420,134560,#,#,,
3,1,AL,Alabama,11-1021,General and operations managers,,25990,2.0,41.31,85920,...,34.29,52.51,#,35570,48260,71330,109210,#,,
4,1,AL,Alabama,11-1031,Legislators,,1590,23.1,*,16600,...,*,*,*,11880,12610,13840,16200,22250,True,


In [41]:
# This file's headers are upper-case; lower-case them to match the earlier years.
new_cols = [label.lower() for label in df_04.columns]
df_04.columns = new_cols

# Keep the shared fields and stamp the survey year on every row. assign()
# broadcasts the scalar and returns a fresh frame, so no per-row list is built.
features = ['st', 'occ_code', 'occ_title', 'tot_emp', 'h_mean', 'a_mean']
df_04 = df_04[features].assign(year=2004)

In [42]:
df_04.head()

Unnamed: 0,st,occ_code,occ_title,tot_emp,h_mean,a_mean,year
0,AL,00-0000,All Occupations,1830360,15.19,31590,2004
1,AL,11-0000,Management occupations,81040,36.58,76100,2004
2,AL,11-1011,Chief executives,3880,64.87,134930,2004
3,AL,11-1021,General and operations managers,25990,41.31,85920,2004
4,AL,11-1031,Legislators,1590,*,16600,2004


# 2005

In [43]:
df_05 = pd.read_excel('../data/state_may2005_dl.xls')

In [44]:
df_05.head()

Unnamed: 0,AREA,ST,STATE,OCC_CODE,OCC_TITLE,GROUP,TOT_EMP,EMP_PRSE,H_MEAN,A_MEAN,...,H_MEDIAN,H_PCT75,H_PCT90,A_PCT10,A_PCT25,A_MEDIAN,A_PCT75,A_PCT90,ANNUAL,HOURLY
0,1,AL,Alabama,00-0000,All Occupations,,1872600,0.6,15.54,32310,...,12.07,18.91,27.76,13400,17250,25110,39330,57740,,
1,1,AL,Alabama,11-0000,Management occupations,major,79730,1.2,37.12,77210,...,32.39,45.48,66.80,33340,47420,67370,94600,138950,,
2,1,AL,Alabama,11-1011,Chief executives,,3300,4.0,63.63,132350,...,64.39,#,#,47060,83380,133940,#,#,,
3,1,AL,Alabama,11-1021,General and operations managers,,27450,1.4,41.57,86460,...,34.44,53.07,#,36100,48620,71630,110380,#,,
4,1,AL,Alabama,11-1031,Legislators,,1730,15.7,*,14700,...,*,*,*,11330,11950,12980,14010,20430,True,


In [45]:
# This file's headers are upper-case; lower-case them to match the earlier years.
new_cols = [label.lower() for label in df_05.columns]
df_05.columns = new_cols

# Keep the shared fields and stamp the survey year on every row. assign()
# broadcasts the scalar and returns a fresh frame, so no per-row list is built.
features = ['st', 'occ_code', 'occ_title', 'tot_emp', 'h_mean', 'a_mean']
df_05 = df_05[features].assign(year=2005)

In [46]:
df_05.head()

Unnamed: 0,st,occ_code,occ_title,tot_emp,h_mean,a_mean,year
0,AL,00-0000,All Occupations,1872600,15.54,32310,2005
1,AL,11-0000,Management occupations,79730,37.12,77210,2005
2,AL,11-1011,Chief executives,3300,63.63,132350,2005
3,AL,11-1021,General and operations managers,27450,41.57,86460,2005
4,AL,11-1031,Legislators,1730,*,14700,2005


# 2006

In [48]:
df_06 = pd.read_excel('../data/state_may2006_dl.xls')

In [49]:
df_06.head()

Unnamed: 0,AREA,ST,STATE,OCC_CODE,OCC_TITLE,GROUP,TOT_EMP,EMP_PRSE,H_MEAN,A_MEAN,...,H_MEDIAN,H_PCT75,H_PCT90,A_PCT10,A_PCT25,A_MEDIAN,A_PCT75,A_PCT90,ANNUAL,HOURLY
0,1,AL,Alabama,00-0000,All Occupations,,1912220,0.6,16.08,33440,...,12.49,19.52,29.02,13730,17800,25980,40610,60350,,
1,1,AL,Alabama,11-0000,Management occupations,major,77880,1.4,38.78,80660,...,33.80,47.75,#,35100,49590,70310,99320,#,,
2,1,AL,Alabama,11-1011,Chief executives,,2700,3.8,68.18,141810,...,68.35,#,#,61630,91420,142170,#,#,,
3,1,AL,Alabama,11-1021,General and operations managers,,28780,1.4,43.00,89440,...,36.03,54.41,#,38390,51510,74940,113170,#,,
4,1,AL,Alabama,11-1031,Legislators,,2010,15.8,*,15200,...,*,*,*,11490,12200,13370,14600,21300,True,


In [50]:
# This file's headers are upper-case; lower-case them to match the earlier years.
new_cols = [label.lower() for label in df_06.columns]
df_06.columns = new_cols

# Keep the shared fields and stamp the survey year on every row. assign()
# broadcasts the scalar and returns a fresh frame, so no per-row list is built.
features = ['st', 'occ_code', 'occ_title', 'tot_emp', 'h_mean', 'a_mean']
df_06 = df_06[features].assign(year=2006)

In [51]:
df_06.head()

Unnamed: 0,st,occ_code,occ_title,tot_emp,h_mean,a_mean,year
0,AL,00-0000,All Occupations,1912220,16.08,33440,2006
1,AL,11-0000,Management occupations,77880,38.78,80660,2006
2,AL,11-1011,Chief executives,2700,68.18,141810,2006
3,AL,11-1021,General and operations managers,28780,43.00,89440,2006
4,AL,11-1031,Legislators,2010,*,15200,2006


# 2007

In [52]:
df_07 = pd.read_excel('../data/state_may2007_dl.xls')

In [53]:
df_07.head()

Unnamed: 0,AREA,ST,STATE,OCC_CODE,OCC_TITLE,GROUP,TOT_EMP,EMP_PRSE,H_MEAN,A_MEAN,...,H_MEDIAN,H_PCT75,H_PCT90,A_PCT10,A_PCT25,A_MEDIAN,A_PCT75,A_PCT90,ANNUAL,HOURLY
0,1,AL,Alabama,00-0000,All Occupations,,1931970,0.5,16.80,34950,...,13.11,20.44,30.27,14010,18430,27260,42520,62950,,
1,1,AL,Alabama,11-0000,Management occupations,major,78560,1.1,40.71,84680,...,35.43,50.48,#,36570,51960,73700,104990,#,,
2,1,AL,Alabama,11-1011,Chief executives,,3470,3.2,71.00,147680,...,68.15,#,#,64710,93170,141750,#,#,,
3,1,AL,Alabama,11-1021,General and operations managers,,30850,1.4,43.61,90720,...,36.37,55.01,#,39620,53340,75660,114410,#,,
4,1,AL,Alabama,11-1031,Legislators,,1950,12.3,*,15630,...,*,*,*,11740,12500,13770,15090,21220,True,


In [55]:
# This file's headers are upper-case; lower-case them to match the earlier years.
new_cols = [label.lower() for label in df_07.columns]
df_07.columns = new_cols

# Keep the shared fields and stamp the survey year on every row. assign()
# broadcasts the scalar and returns a fresh frame, so no per-row list is built.
features = ['st', 'occ_code', 'occ_title', 'tot_emp', 'h_mean', 'a_mean']
df_07 = df_07[features].assign(year=2007)

In [56]:
df_07.head()

Unnamed: 0,st,occ_code,occ_title,tot_emp,h_mean,a_mean,year
0,AL,00-0000,All Occupations,1931970,16.80,34950,2007
1,AL,11-0000,Management occupations,78560,40.71,84680,2007
2,AL,11-1011,Chief executives,3470,71.00,147680,2007
3,AL,11-1021,General and operations managers,30850,43.61,90720,2007
4,AL,11-1031,Legislators,1950,*,15630,2007


# 2008

In [59]:
df_08 = pd.read_excel('../data/state__M2008_dl.xls')

In [60]:
df_08.head()

Unnamed: 0,AREA,ST,STATE,OCC_CODE,OCC_TITLE,GROUP,TOT_EMP,EMP_PRSE,H_MEAN,A_MEAN,...,H_MEDIAN,H_PCT75,H_PCT90,A_PCT10,A_PCT25,A_MEDIAN,A_PCT75,A_PCT90,ANNUAL,HOURLY
0,1,AL,Alabama,00-0000,All Occupations,,1945300,0.4,17.39,36170,...,13.53,21.19,31.23,14820,18970,28140,44070,64950,,
1,1,AL,Alabama,11-0000,Management occupations,major,79400,0.9,42.41,88210,...,36.75,52.02,75.47,38690,54430,76450,108210,156970,,
2,1,AL,Alabama,11-1011,Chief executives,,3630,3.7,75.62,157290,...,73.07,#,#,65680,100680,151990,#,#,,
3,1,AL,Alabama,11-1021,General and operations managers,,33500,1.3,44.39,92330,...,37.03,54.55,#,41090,54960,77010,113470,#,,
4,1,AL,Alabama,11-1031,Legislators,,1580,6.2,*,17180,...,*,*,*,12900,13640,14880,16240,23080,True,


In [62]:
# This file's headers are upper-case; lower-case them to match the earlier years.
new_cols = [label.lower() for label in df_08.columns]
df_08.columns = new_cols

# Keep the shared fields and stamp the survey year on every row. assign()
# broadcasts the scalar and returns a fresh frame, so no per-row list is built.
features = ['st', 'occ_code', 'occ_title', 'tot_emp', 'h_mean', 'a_mean']
df_08 = df_08[features].assign(year=2008)

In [63]:
df_08.head()

Unnamed: 0,st,occ_code,occ_title,tot_emp,h_mean,a_mean,year
0,AL,00-0000,All Occupations,1945300,17.39,36170,2008
1,AL,11-0000,Management occupations,79400,42.41,88210,2008
2,AL,11-1011,Chief executives,3630,75.62,157290,2008
3,AL,11-1021,General and operations managers,33500,44.39,92330,2008
4,AL,11-1031,Legislators,1580,*,17180,2008


# 2009

In [64]:
df_09 = pd.read_excel('../data/state_dl.xls')

In [66]:
df_09.head()

Unnamed: 0,AREA,ST,STATE,OCC_CODE,OCC_TITLE,GROUP,TOT_EMP,EMP_PRSE,JOBS_1000,H_MEAN,...,H_MEDIAN,H_PCT75,H_PCT90,A_PCT10,A_PCT25,A_MEDIAN,A_PCT75,A_PCT90,ANNUAL,HOURLY
0,1,AL,Alabama,00-0000,All Occupations,total,1863620,0.4,1000.0,18.03,...,13.95,22.01,32.20,15550,19410,29020,45770,66980,,
1,1,AL,Alabama,11-0000,Management occupations,major,75100,1.0,40.298,44.06,...,37.92,53.94,77.24,40540,56100,78870,112190,160660,,
2,1,AL,Alabama,11-1011,Chief executives,,2960,3.5,1.589,79.98,...,77.32,#,#,70060,105360,160830,#,#,,
3,1,AL,Alabama,11-1021,General and operations managers,,33390,1.4,17.916,46.09,...,38.03,56.43,#,42470,56280,79090,117360,#,,
4,1,AL,Alabama,11-1031,Legislators,,1350,5.1,0.725,*,...,*,*,*,14090,14720,15780,18590,26680,True,


In [67]:
# This file's headers are upper-case; lower-case them to match the earlier years.
new_cols = [label.lower() for label in df_09.columns]
df_09.columns = new_cols

# Keep the shared fields and stamp the survey year on every row. assign()
# broadcasts the scalar and returns a fresh frame, so no per-row list is built.
features = ['st', 'occ_code', 'occ_title', 'tot_emp', 'h_mean', 'a_mean']
df_09 = df_09[features].assign(year=2009)

In [68]:
df_09.head()

Unnamed: 0,st,occ_code,occ_title,tot_emp,h_mean,a_mean,year
0,AL,00-0000,All Occupations,1863620,18.03,37500,2009
1,AL,11-0000,Management occupations,75100,44.06,91650,2009
2,AL,11-1011,Chief executives,2960,79.98,166350,2009
3,AL,11-1021,General and operations managers,33390,46.09,95880,2009
4,AL,11-1031,Legislators,1350,*,18840,2009


# 2010

In [69]:
# Load the raw state-level sheet for 2010.
df_10 = pd.read_excel('../data/state_M2010_dl.xls')

In [70]:
# Peek at the raw 2010 sheet: this year adds a 'LOC QUOTIENT' column (note the space in its name).
df_10.head()

Unnamed: 0,AREA,ST,STATE,OCC_CODE,OCC_TITLE,GROUP,TOT_EMP,EMP_PRSE,JOBS_1000,LOC QUOTIENT,...,H_MEDIAN,H_PCT75,H_PCT90,A_PCT10,A_PCT25,A_MEDIAN,A_PCT75,A_PCT90,ANNUAL,HOURLY
0,1,AL,Alabama,00-0000,All Occupations,total,1807480,0.4,1000.0,1.0,...,14.21,22.57,33.5,16470,19630,29570,46940,69670,,
1,1,AL,Alabama,11-0000,Management Occupations,major,69180,1.0,38.274,0.808,...,40.04,56.44,#,44140,60540,83290,117390,#,,
2,1,AL,Alabama,11-1011,Chief Executives,,1750,4.8,0.971,0.451,...,#,#,#,85270,120370,#,#,#,,
3,1,AL,Alabama,11-1021,General and Operations Managers,,30870,1.3,17.08,1.271,...,40.93,60.51,#,46240,60700,85130,125860,#,,
4,1,AL,Alabama,11-1031,Legislators,,1200,4.4,0.663,1.282,...,*,*,*,15570,15930,17150,19060,24540,True,


In [71]:
# Lower-case the 2010 headers so they match the names in `features`, keep only
# those columns, and tag every row with the survey year.
features = ['st','occ_code','occ_title','tot_emp','h_mean','a_mean']
# rename / [] / assign each return a new frame, so the result is an independent
# copy; the scalar year broadcasts to all rows without building a throwaway list.
df_10 = df_10.rename(columns=str.lower)[features].assign(year=2010)

In [72]:
# Sanity-check the trimmed 2010 frame; suppressed wages still show up as '*'.
df_10.head()

Unnamed: 0,st,occ_code,occ_title,tot_emp,h_mean,a_mean,year
0,AL,00-0000,All Occupations,1807480,18.55,38590,2010
1,AL,11-0000,Management Occupations,69180,46.39,96480,2010
2,AL,11-1011,Chief Executives,1750,89.14,185420,2010
3,AL,11-1021,General and Operations Managers,30870,49.37,102690,2010
4,AL,11-1031,Legislators,1200,*,19450,2010


# 2011

In [73]:
# Load the raw state-level sheet for 2011.
df_11 = pd.read_excel('../data/state_M2011_dl.xls')

In [74]:
# Peek at the raw 2011 sheet: the location-quotient column is now spelled LOC_Q.
df_11.head()

Unnamed: 0,AREA,ST,STATE,OCC_CODE,OCC_TITLE,GROUP,TOT_EMP,EMP_PRSE,JOBS_1000,LOC_Q,...,H_MEDIAN,H_PCT75,H_PCT90,A_PCT10,A_PCT25,A_MEDIAN,A_PCT75,A_PCT90,ANNUAL,HOURLY
0,1,AL,Alabama,00-0000,All Occupations,total,1809420,0.4,1000.0,1.0,...,14.35,22.91,34.18,17110,19910,29850,47650,71100,,
1,1,AL,Alabama,11-0000,Management Occupations,major,66050,1.0,36.502,0.76,...,42.28,58.36,81.81,48550,64680,87940,121400,170170,,
2,1,AL,Alabama,11-1011,Chief Executives,,1530,8.9,0.847,0.41,...,79.13,#,#,85410,112460,164590,#,#,,
3,1,AL,Alabama,11-1021,General and Operations Managers,,28220,1.4,15.596,1.11,...,44.5,65.23,#,51490,66670,92570,135670,#,,
4,1,AL,Alabama,11-1031,Legislators,,1140,5.4,0.63,1.3,...,*,*,*,15960,16760,18100,19470,27760,True,


In [75]:
# Lower-case the 2011 headers so they match the names in `features`, keep only
# those columns, and tag every row with the survey year.
features = ['st','occ_code','occ_title','tot_emp','h_mean','a_mean']
# rename / [] / assign each return a new frame, so the result is an independent
# copy; the scalar year broadcasts to all rows without building a throwaway list.
df_11 = df_11.rename(columns=str.lower)[features].assign(year=2011)

In [76]:
# Sanity-check the trimmed 2011 frame; suppressed wages still show up as '*'.
df_11.head()

Unnamed: 0,st,occ_code,occ_title,tot_emp,h_mean,a_mean,year
0,AL,00-0000,All Occupations,1809420,18.84,39180,2011
1,AL,11-0000,Management Occupations,66050,48.32,100500,2011
2,AL,11-1011,Chief Executives,1530,85.87,178610,2011
3,AL,11-1021,General and Operations Managers,28220,52.82,109870,2011
4,AL,11-1031,Legislators,1140,*,20340,2011


# 2012

In [77]:
# Load the raw state-level sheet for 2012.
df_12 = pd.read_excel('../data/state_M2012_dl.xls')

In [78]:
# Peek at the raw 2012 sheet: the group column is now OCC_GROUP and detailed rows are labelled 'detailed'.
df_12.head()

Unnamed: 0,AREA,ST,STATE,OCC_CODE,OCC_TITLE,OCC_GROUP,TOT_EMP,EMP_PRSE,JOBS_1000,LOC_Q,...,H_MEDIAN,H_PCT75,H_PCT90,A_PCT10,A_PCT25,A_MEDIAN,A_PCT75,A_PCT90,ANNUAL,HOURLY
0,1,AL,Alabama,00-0000,All Occupations,total,1824400,0.4,1000.0,1.0,...,14.4,23.12,34.73,17150,19850,29950,48090,72230,,
1,1,AL,Alabama,11-0000,Management Occupations,major,66790,1.0,36.608,0.75,...,43.21,59.64,83.37,49870,66520,89870,124060,173420,,
2,1,AL,Alabama,11-1011,Chief Executives,detailed,1330,10.0,0.727,0.37,...,80.98,#,#,86540,113020,168450,#,#,,
3,1,AL,Alabama,11-1021,General and Operations Managers,detailed,27040,1.5,14.822,1.02,...,47.09,69.04,#,54870,71140,97950,143600,#,,
4,1,AL,Alabama,11-1031,Legislators,detailed,1070,6.5,0.584,1.34,...,*,*,*,16010,16860,18270,19850,26680,True,


In [79]:
# Lower-case the 2012 headers so they match the names in `features`, keep only
# those columns, and tag every row with the survey year.
features = ['st','occ_code','occ_title','tot_emp','h_mean','a_mean']
# rename / [] / assign each return a new frame, so the result is an independent
# copy; the scalar year broadcasts to all rows without building a throwaway list.
df_12 = df_12.rename(columns=str.lower)[features].assign(year=2012)

In [80]:
# Sanity-check the trimmed 2012 frame; suppressed wages still show up as '*'.
df_12.head()

Unnamed: 0,st,occ_code,occ_title,tot_emp,h_mean,a_mean,year
0,AL,00-0000,All Occupations,1824400,19.01,39550,2012
1,AL,11-0000,Management Occupations,66790,49.24,102420,2012
2,AL,11-1011,Chief Executives,1330,86.38,179680,2012
3,AL,11-1021,General and Operations Managers,27040,55.58,115610,2012
4,AL,11-1031,Legislators,1070,*,19860,2012


# 2013

In [81]:
# Load the raw state-level sheet for 2013.
df_13 = pd.read_excel('../data/state_M2013_dl.xls')

In [82]:
# Peek at the raw 2013 sheet to confirm the header names before trimming.
df_13.head()

Unnamed: 0,AREA,ST,STATE,OCC_CODE,OCC_TITLE,OCC_GROUP,TOT_EMP,EMP_PRSE,JOBS_1000,LOC_Q,...,H_MEDIAN,H_PCT75,H_PCT90,A_PCT10,A_PCT25,A_MEDIAN,A_PCT75,A_PCT90,ANNUAL,HOURLY
0,1,AL,Alabama,00-0000,All Occupations,total,1844080,0.4,1000.0,1.0,...,14.61,23.64,35.4,17200,20010,30390,49180,73620,,
1,1,AL,Alabama,11-0000,Management Occupations,major,66820,1.1,36.233,0.73,...,44.53,61.52,86.05,51190,68650,92610,127970,178980,,
2,1,AL,Alabama,11-1011,Chief Executives,detailed,1300,11.1,0.705,0.38,...,83.98,#,#,87940,117790,174690,#,#,,
3,1,AL,Alabama,11-1021,General and Operations Managers,detailed,26490,1.7,14.364,0.96,...,49.06,71.18,#,58470,74340,102050,148050,#,,
4,1,AL,Alabama,11-1031,Legislators,detailed,1400,9.8,0.76,1.81,...,*,*,*,16110,17010,18510,21340,36190,True,


In [83]:
# Lower-case the 2013 headers so they match the names in `features`, keep only
# those columns, and tag every row with the survey year.
features = ['st','occ_code','occ_title','tot_emp','h_mean','a_mean']
# rename / [] / assign each return a new frame, so the result is an independent
# copy; the scalar year broadcasts to all rows without building a throwaway list.
df_13 = df_13.rename(columns=str.lower)[features].assign(year=2013)

In [84]:
# Sanity-check the trimmed 2013 frame; suppressed wages still show up as '*'.
df_13.head()

Unnamed: 0,st,occ_code,occ_title,tot_emp,h_mean,a_mean,year
0,AL,00-0000,All Occupations,1844080,19.35,40240,2013
1,AL,11-0000,Management Occupations,66820,50.77,105600,2013
2,AL,11-1011,Chief Executives,1300,88.34,183750,2013
3,AL,11-1021,General and Operations Managers,26490,57.62,119850,2013
4,AL,11-1031,Legislators,1400,*,22660,2013


# 2014

In [86]:
# Load the raw state-level sheet for 2014 (from this year on the files are .xlsx).
df_14 = pd.read_excel('../data/state_M2014_dl.xlsx')

In [87]:
# Peek at the raw 2014 sheet to confirm the header names before trimming.
df_14.head()

Unnamed: 0,AREA,ST,STATE,OCC_CODE,OCC_TITLE,OCC_GROUP,TOT_EMP,EMP_PRSE,JOBS_1000,LOC_Q,...,H_MEDIAN,H_PCT75,H_PCT90,A_PCT10,A_PCT25,A_MEDIAN,A_PCT75,A_PCT90,ANNUAL,HOURLY
0,1,AL,Alabama,00-0000,All Occupations,total,1857530,0.4,1000.0,1.0,...,14.83,23.95,36.04,17260,20220,30850,49810,74950,,
1,1,AL,Alabama,11-0000,Management Occupations,major,67500,1.1,36.338,0.73,...,44.98,62.09,88.43,51050,68830,93550,129150,183940,,
2,1,AL,Alabama,11-1011,Chief Executives,detailed,1080,4.8,0.58,0.32,...,#,#,#,108270,140570,#,#,#,,
3,1,AL,Alabama,11-1021,General and Operations Managers,detailed,26480,1.5,14.258,0.94,...,49,71.44,#,57510,74390,101930,148590,#,,
4,1,AL,Alabama,11-1031,Legislators,detailed,1470,8.7,0.79,1.94,...,*,*,*,16120,17000,18450,20670,32820,True,


In [88]:
# Lower-case the 2014 headers so they match the names in `features`, keep only
# those columns, and tag every row with the survey year.
features = ['st','occ_code','occ_title','tot_emp','h_mean','a_mean']
# rename / [] / assign each return a new frame, so the result is an independent
# copy; the scalar year broadcasts to all rows without building a throwaway list.
df_14 = df_14.rename(columns=str.lower)[features].assign(year=2014)

In [89]:
# Sanity-check the trimmed 2014 frame; suppressed wages still show up as '*'.
df_14.head()

Unnamed: 0,st,occ_code,occ_title,tot_emp,h_mean,a_mean,year
0,AL,00-0000,All Occupations,1857530,19.66,40890,2014
1,AL,11-0000,Management Occupations,67500,51.48,107080,2014
2,AL,11-1011,Chief Executives,1080,97.67,203150,2014
3,AL,11-1021,General and Operations Managers,26480,58,120640,2014
4,AL,11-1031,Legislators,1470,*,21920,2014


# 2015

In [91]:
# Load the raw state-level sheet for 2015.
df_15 = pd.read_excel('../data/state_M2015_dl.xlsx')

In [92]:
# Peek at the raw 2015 sheet to confirm the header names before trimming.
df_15.head()


Unnamed: 0,AREA,ST,STATE,OCC_CODE,OCC_TITLE,OCC_GROUP,TOT_EMP,EMP_PRSE,JOBS_1000,LOC_Q,...,H_MEDIAN,H_PCT75,H_PCT90,A_PCT10,A_PCT25,A_MEDIAN,A_PCT75,A_PCT90,ANNUAL,HOURLY
0,1,AL,Alabama,00-0000,All Occupations,total,1883310,0.4,1000.0,1.0,...,15.17,24.39,36.88,17380,20650,31550,50730,76720,,
1,1,AL,Alabama,11-0000,Management Occupations,major,69100,1.0,36.689,0.73,...,45.9,63.15,#,52320,70120,95470,131360,#,,
2,1,AL,Alabama,11-1011,Chief Executives,detailed,1100,4.1,0.587,0.34,...,#,#,#,109410,145150,#,#,#,,
3,1,AL,Alabama,11-1021,General and Operations Managers,detailed,27240,1.4,14.466,0.93,...,49.2,71.98,#,57800,74950,102330,149720,#,,
4,1,AL,Alabama,11-1031,Legislators,detailed,1430,8.7,0.76,1.88,...,*,*,*,16110,16960,18380,20200,32120,True,


In [93]:
# Lower-case the 2015 headers so they match the names in `features`, keep only
# those columns, and tag every row with the survey year.
features = ['st','occ_code','occ_title','tot_emp','h_mean','a_mean']
# rename / [] / assign each return a new frame, so the result is an independent
# copy; the scalar year broadcasts to all rows without building a throwaway list.
df_15 = df_15.rename(columns=str.lower)[features].assign(year=2015)

In [94]:
# Sanity-check the trimmed 2015 frame; suppressed wages still show up as '*'.
df_15.head()

Unnamed: 0,st,occ_code,occ_title,tot_emp,h_mean,a_mean,year
0,AL,00-0000,All Occupations,1883310,20.15,41920,2015
1,AL,11-0000,Management Occupations,69100,52.79,109800,2015
2,AL,11-1011,Chief Executives,1100,101.21,210530,2015
3,AL,11-1021,General and Operations Managers,27240,58.85,122410,2015
4,AL,11-1031,Legislators,1430,*,22690,2015


# 2016

In [95]:
# Load the raw state-level sheet for 2016.
df_16 = pd.read_excel('../data/state_M2016_dl.xlsx')

In [96]:
# Peek at the raw 2016 sheet to confirm the header names before trimming.
df_16.head()

Unnamed: 0,AREA,ST,STATE,OCC_CODE,OCC_TITLE,OCC_GROUP,TOT_EMP,EMP_PRSE,JOBS_1000,LOC_Q,...,H_MEDIAN,H_PCT75,H_PCT90,A_PCT10,A_PCT25,A_MEDIAN,A_PCT75,A_PCT90,ANNUAL,HOURLY
0,1,AL,Alabama,00-0000,All Occupations,total,1912990,0.4,1000.0,1.0,...,15.43,24.7,37.4,17580,21200,32100,51370,77780,,
1,1,AL,Alabama,11-0000,Management Occupations,major,70090,1.0,36.639,0.73,...,46.09,63.18,90.25,52470,70040,95870,131410,187720,,
2,1,AL,Alabama,11-1011,Chief Executives,detailed,1030,5.5,0.538,0.34,...,94.59,#,#,98310,137140,196750,#,#,,
3,1,AL,Alabama,11-1021,General and Operations Managers,detailed,26930,1.4,14.077,0.9,...,49.53,71.49,#,55770,74270,103030,148710,#,,
4,1,AL,Alabama,11-1031,Legislators,detailed,1210,8.9,0.635,1.66,...,*,*,*,16180,17070,18560,21590,37320,True,


In [97]:
# Lower-case the 2016 headers so they match the names in `features`, keep only
# those columns, and tag every row with the survey year.
features = ['st','occ_code','occ_title','tot_emp','h_mean','a_mean']
# rename / [] / assign each return a new frame, so the result is an independent
# copy; the scalar year broadcasts to all rows without building a throwaway list.
df_16 = df_16.rename(columns=str.lower)[features].assign(year=2016)

In [98]:
# Sanity-check the trimmed 2016 frame; suppressed wages still show up as '*'.
df_16.head()

Unnamed: 0,st,occ_code,occ_title,tot_emp,h_mean,a_mean,year
0,AL,00-0000,All Occupations,1912990,20.44,42510,2016
1,AL,11-0000,Management Occupations,70090,52.98,110210,2016
2,AL,11-1011,Chief Executives,1030,100.63,209300,2016
3,AL,11-1021,General and Operations Managers,26930,58.86,122420,2016
4,AL,11-1031,Legislators,1210,*,23390,2016


# 2017

In [107]:
# Load the raw state-level sheet for 2017.
df_17 = pd.read_excel('../data/state_M2017_dl.xlsx')

In [108]:
# Peek at the raw 2017 sheet to confirm the header names before trimming.
df_17.head()

Unnamed: 0,AREA,ST,STATE,OCC_CODE,OCC_TITLE,OCC_GROUP,TOT_EMP,EMP_PRSE,JOBS_1000,LOC_Q,...,H_MEDIAN,H_PCT75,H_PCT90,A_PCT10,A_PCT25,A_MEDIAN,A_PCT75,A_PCT90,ANNUAL,HOURLY
0,1,AL,Alabama,00-0000,All Occupations,total,1922570,0.4,1000.0,1.0,...,15.77,25.01,37.83,17770,21740,32800,52020,78690,,
1,1,AL,Alabama,11-0000,Management Occupations,major,69950,1.0,36.385,0.71,...,46.63,64.12,90.8,52130,70130,96980,133360,188860,,
2,1,AL,Alabama,11-1011,Chief Executives,detailed,1120,6.0,0.585,0.4,...,94.45,#,#,84520,125290,196460,#,#,,
3,1,AL,Alabama,11-1021,General and Operations Managers,detailed,27150,1.6,14.123,0.91,...,49.19,71.09,#,54040,72610,102320,147860,#,,
4,1,AL,Alabama,11-1031,Legislators,detailed,1100,12.0,0.572,1.59,...,*,*,*,16310,17280,18910,24630,47510,True,


In [109]:
# Lower-case the 2017 headers so they match the names in `features`, keep only
# those columns, and tag every row with the survey year.
features = ['st','occ_code','occ_title','tot_emp','h_mean','a_mean']
# rename / [] / assign each return a new frame, so the result is an independent
# copy; the scalar year broadcasts to all rows without building a throwaway list.
df_17 = df_17.rename(columns=str.lower)[features].assign(year=2017)

In [110]:
# Sanity-check the trimmed 2017 frame; suppressed wages still show up as '*'.
df_17.head()

Unnamed: 0,st,occ_code,occ_title,tot_emp,h_mean,a_mean,year
0,AL,00-0000,All Occupations,1922570,20.76,43170,2017
1,AL,11-0000,Management Occupations,69950,53.44,111150,2017
2,AL,11-1011,Chief Executives,1120,99.56,207090,2017
3,AL,11-1021,General and Operations Managers,27150,58.04,120730,2017
4,AL,11-1031,Legislators,1100,*,25410,2017


# 2018

In [105]:
# Load the raw state-level sheet for 2018.
df_18 = pd.read_excel('../data/state_M2018_dl.xlsx')

In [106]:
# Peek at the raw 2018 sheet to confirm the header names before trimming.
df_18.head()

Unnamed: 0,AREA,ST,STATE,OCC_CODE,OCC_TITLE,OCC_GROUP,TOT_EMP,EMP_PRSE,JOBS_1000,LOC_Q,...,H_MEDIAN,H_PCT75,H_PCT90,A_PCT10,A_PCT25,A_MEDIAN,A_PCT75,A_PCT90,ANNUAL,HOURLY
0,1,AL,Alabama,00-0000,All Occupations,total,1943760,0.4,1000.0,1.0,...,16.22,25.63,38.28,18030,22400,33740,53310,79630,,
1,1,AL,Alabama,11-0000,Management Occupations,major,73860,1.1,38.001,0.72,...,46.01,63.87,90.06,48650,68030,95710,132860,187330,,
2,1,AL,Alabama,11-1011,Chief Executives,detailed,1390,5.6,0.716,0.53,...,91.31,#,#,79530,125570,189920,#,#,,
3,1,AL,Alabama,11-1021,General and Operations Managers,detailed,28600,1.8,14.712,0.93,...,48.55,70.79,#,50180,68990,100980,147250,#,,
4,1,AL,Alabama,11-1031,Legislators,detailed,970,6.4,0.498,1.43,...,*,*,*,16310,17390,19210,32290,63390,True,


In [111]:
# Lower-case the 2018 headers so they match the names in `features`, keep only
# those columns, and tag every row with the survey year.
features = ['st','occ_code','occ_title','tot_emp','h_mean','a_mean']
# rename / [] / assign each return a new frame, so the result is an independent
# copy; the scalar year broadcasts to all rows without building a throwaway list.
df_18 = df_18.rename(columns=str.lower)[features].assign(year=2018)

In [112]:
# Sanity-check the trimmed 2018 frame; suppressed wages still show up as '*'.
df_18.head()

Unnamed: 0,st,occ_code,occ_title,tot_emp,h_mean,a_mean,year
0,AL,00-0000,All Occupations,1943760,21.05,43790,2018
1,AL,11-0000,Management Occupations,73860,52.73,109680,2018
2,AL,11-1011,Chief Executives,1390,98.36,204590,2018
3,AL,11-1021,General and Operations Managers,28600,56.94,118440,2018
4,AL,11-1031,Legislators,970,*,31760,2018


# 2019

In [114]:
# Load the raw state-level sheet for 2019.
df_19 = pd.read_excel('../data/state_M2019_dl.xlsx')

In [115]:
# Peek at the raw 2019 sheet: the layout changed this year — headers are already
# lower-case and the state appears as a full name in `area_title` rather than ST/STATE.
df_19.head()

Unnamed: 0,area,area_title,area_type,naics,naics_title,i_group,own_code,occ_code,occ_title,o_group,...,h_median,h_pct75,h_pct90,a_pct10,a_pct25,a_median,a_pct75,a_pct90,annual,hourly
0,1,Alabama,2,0,Cross-industry,cross-industry,1235,00-0000,All Occupations,total,...,16.73,26.34,39.26,18270,23000,34800,54790,81660,,
1,1,Alabama,2,0,Cross-industry,cross-industry,1235,11-0000,Management Occupations,major,...,45.03,63.07,90.16,47250,66140,93660,131180,187530,,
2,1,Alabama,2,0,Cross-industry,cross-industry,1235,11-1011,Chief Executives,detailed,...,86.91,#,#,68630,115110,180780,#,#,,
3,1,Alabama,2,0,Cross-industry,cross-industry,1235,11-1021,General and Operations Managers,detailed,...,47.42,70.48,#,48240,66370,98630,146610,#,,
4,1,Alabama,2,0,Cross-industry,cross-industry,1235,11-1031,Legislators,detailed,...,*,*,*,16200,17210,18880,28660,55220,True,


In [117]:
# The 2019 sheet already has lower-case headers and names its state column
# `area_title`, so no header normalisation is needed before selecting.
features = ['area_title','occ_code','occ_title','tot_emp','h_mean','a_mean']
# [] and assign each return a new frame, so the result is an independent copy;
# the scalar year broadcasts to all rows without building a throwaway list.
df_19 = df_19[features].assign(year=2019)

In [118]:
# Sanity-check the trimmed 2019 frame: `area_title` holds full state names
# ('Alabama') and suppressed wages still show up as '*'.
df_19.head()

Unnamed: 0,area_title,occ_code,occ_title,tot_emp,h_mean,a_mean,year
0,Alabama,00-0000,All Occupations,1974170,21.6,44930,2019
1,Alabama,11-0000,Management Occupations,83760,51.86,107860,2019
2,Alabama,11-1011,Chief Executives,1320,92.84,193110,2019
3,Alabama,11-1021,General and Operations Managers,30790,56.41,117340,2019
4,Alabama,11-1031,Legislators,1110,*,29130,2019


In [119]:
# The 2019 sheet identifies states by full name ('Alabama') in `area_title`, while
# the other years shown in this notebook carry the two-letter postal code ('AL')
# in `st`.  Renaming alone would leave `st` with two different encodings after the
# merge, so translate the names to postal codes as well.
STATE_ABBREVIATIONS = {
    'Alabama': 'AL', 'Alaska': 'AK', 'Arizona': 'AZ', 'Arkansas': 'AR',
    'California': 'CA', 'Colorado': 'CO', 'Connecticut': 'CT', 'Delaware': 'DE',
    'District of Columbia': 'DC', 'Florida': 'FL', 'Georgia': 'GA', 'Guam': 'GU',
    'Hawaii': 'HI', 'Idaho': 'ID', 'Illinois': 'IL', 'Indiana': 'IN',
    'Iowa': 'IA', 'Kansas': 'KS', 'Kentucky': 'KY', 'Louisiana': 'LA',
    'Maine': 'ME', 'Maryland': 'MD', 'Massachusetts': 'MA', 'Michigan': 'MI',
    'Minnesota': 'MN', 'Mississippi': 'MS', 'Missouri': 'MO', 'Montana': 'MT',
    'Nebraska': 'NE', 'Nevada': 'NV', 'New Hampshire': 'NH', 'New Jersey': 'NJ',
    'New Mexico': 'NM', 'New York': 'NY', 'North Carolina': 'NC',
    'North Dakota': 'ND', 'Ohio': 'OH', 'Oklahoma': 'OK', 'Oregon': 'OR',
    'Pennsylvania': 'PA', 'Puerto Rico': 'PR', 'Rhode Island': 'RI',
    'South Carolina': 'SC', 'South Dakota': 'SD', 'Tennessee': 'TN',
    'Texas': 'TX', 'Utah': 'UT', 'Vermont': 'VT', 'Virgin Islands': 'VI',
    'Virginia': 'VA', 'Washington': 'WA', 'West Virginia': 'WV',
    'Wisconsin': 'WI', 'Wyoming': 'WY',
}
# Assignment instead of inplace=True: rename silently ignores a missing
# `area_title`, so re-running this cell is harmless.
df_19 = df_19.rename(columns = {'area_title':'st'})
# Values with no entry in the mapping (including codes produced by an earlier run
# of this cell) are kept unchanged instead of becoming NaN.
df_19['st'] = df_19['st'].map(STATE_ABBREVIATIONS).fillna(df_19['st'])

# merging everyone!

In [199]:
# Stack every year's trimmed frame into one long table (one row per
# state / occupation / year).
# ignore_index=True gives the result a fresh 0..n-1 index.  Without it each frame
# keeps its own row labels, so the merged index contains the same label once per
# year and label-based lookups such as df.loc[40] return many rows.  The CSV
# export is unaffected because it is written with index=False.
df = pd.concat([df_97,
                df_98,
                df_99,
                df_00,
                df_01,
                df_02,
                df_03,
                df_04,
                df_05,
                df_06,
                df_07,
                df_08,
                df_09,
                df_10,
                df_11,
                df_12,
                df_13,
                df_14,
                df_15,
                df_16,
                df_17,
                df_18,
                df_19
               ], ignore_index=True)

In [200]:
# Preview the merged table.
# NOTE(review): the stored output shows a 1997 row whose tot_emp / h_mean / a_mean /
# year are all missing, and year rendered as a float — presumably a group-header
# row from the 1997 sheet; confirm the 1997 cleaning handles it.
df.head()

Unnamed: 0,st,occ_code,occ_title,tot_emp,h_mean,a_mean,year
40,AL,10000,Managerial and Administrative Occupations,,,,
41,AL,13000,Staff and Administrative Specialty Managerial ...,**,18.04,37530.0,1997.0
42,AL,13002,Financial Managers,8140,24.23,50400.0,1997.0
43,AL,13005,"Personnel, Training, and Labor Relations Managers",2760,20.84,43350.0,1997.0
44,AL,13008,Purchasing Managers,2370,19.07,39660.0,1997.0


In [202]:
# Row count per year — a quick check that every year made it into the merge
# (the stored output lists 23 years, 1997 through 2019).
df['year'].value_counts()

2013    37725
2014    37717
2015    37642
2012    37634
2016    37561
2017    36992
2018    36897
2007    36822
2008    36765
2009    36728
2011    36637
2010    36566
2019    36382
2005    36240
2004    35859
2006    35797
1998    35322
1997    34055
2000    33192
2001    33000
2002    32406
2003    32355
1999    30447
Name: year, dtype: int64

In [204]:
# Write the merged table to disk without the index column.
# NOTE(review): the file name says "99_to_19", but the year counts above run from
# 1997 to 2019; consider renaming once anything that reads this path can be updated.
df.to_csv('../data/occupation_99_to_19.csv', index = False)