In [2]:
import pandas as pd

In [3]:
# Load the raw race-demographics CSV; the next cell selects and renames its columns.
df = pd.read_csv(filepath_or_buffer='./source_data/2010-2020-race-demographics.csv')

In [30]:
# Keep only the five columns used below and give them short snake_case names.
data = df[['Yearly July 1st Estimates', 'States', 'Race', 'Ethnicity', 'Population']].rename(
    columns={
        'Yearly July 1st Estimates': 'year',
        'States': 'state',
        'Race': 'race',
        'Ethnicity': 'ethnicity',
        'Population': 'population',
    }
)

# Summed population per (year, state), stored in a single 'total' column.
# Used as the denominator when converting counts into shares.
all_totals = (
    data.groupby(['year', 'state'])[['population']]
        .sum()
        .rename(columns={'population': 'total'})
)

# Summed population per (year, state, race), spread into one column per race label
# and indexed by (year, state).
race_totals = (
    data.groupby(['year', 'state', 'race'])[['population']]
        .sum()
        .reset_index()
        .pivot(index=['year', 'state'], columns='race', values='population')
)

# Short names are assigned by POSITION to the pivoted race columns.
# NOTE(review): this assumes exactly six race labels whose column order matches
# the list below -- verify against the source data.
race_totals.columns = ['american_indian', 'asian', 'black', 'more_than_one', 'pacific_islander', 'white']

# Convert counts to shares of the (year, state) total; rows align on the index.
race_totals = race_totals.div(all_totals['total'], axis=0)

# Share of each (year, state) total coming from rows whose ethnicity label
# does NOT contain the substring 'Not' (presumably the Hispanic rows -- verify labels).
hispanic_totals = (
    data.loc[~data['ethnicity'].str.contains('Not')]
        .groupby(['year', 'state'])[['population']]
        .sum()
        .rename(columns={'population': 'hispanic'})
        .div(all_totals['total'], axis=0)
)

# Share of each (year, state) total that is White AND non-Hispanic.
#
# The ethnicity test is intentionally NOT negated: rows whose ethnicity label
# contains 'Not' are the non-Hispanic ones. (The negated form selects the
# Hispanic rows and is what `hispanic_totals` uses; applying it here would
# publish White-Hispanic shares under the 'white_non_hispanic' column.)
#
# Rows are summed per (year, state) like the other share tables, so the result
# has a unique index even if the source holds several matching rows per group.
white_non_h = (
    data.loc[(data['race'] == 'White') & data['ethnicity'].str.contains('Not')]
        .groupby(['year', 'state'])[['population']]
        .sum()
        .rename(columns={'population': 'white_non_hispanic'})
        .div(all_totals['total'], axis=0)
)
                    

# Place the share tables side by side, aligned on their (year, state) index,
# then turn year/state back into ordinary columns.
all_data = pd.concat(
    [race_totals, hispanic_totals, white_non_h],
    axis='columns',
).reset_index()

# Two-letter postal codes, listed in the alphabetical order of the corresponding
# full state names (plus DC). They are paired by position with the sorted unique
# values of all_data['state'].
# NOTE: 'AZ' must come before 'AR' because "Arizona" sorts before "Arkansas".
# NOTE(review): this assumes the 'state' column holds full state names -- confirm.
state_pos = ['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'DC', 'FL', 'GA', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME',
             'MD', 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY', 'NC', 'ND', 'OH', 'OK',
             'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY']

states = sorted(all_data['state'].unique())

# Positional pairing is only meaningful when both lists have the same length;
# a missing or extra state would silently shift every code after it.
if len(states) != len(state_pos):
    raise ValueError(
        f"expected {len(state_pos)} distinct states, found {len(states)}; "
        "cannot pair state names with postal codes by position"
    )

# One dictionary lookup per row instead of a linear list search per row.
state_to_code = dict(zip(states, state_pos))
all_data.loc[:, 'state'] = all_data['state'].map(state_to_code)

# Write the combined table with a header row and without the row index.
all_data.to_csv('./processed_data/2010-2020-race-demographics.csv', index=False)