<a href="https://colab.research.google.com/github/oimartin/Older-and-Wiser/blob/main/gender_data_US_Census.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Comparing different population groups population over 10 years: 2009 - 2019
<br>
Source: U.S. Census Bureau, Current Population Survey, Annual Social and Economic Supplement, 2009 - 2019
<br>
NOTE: The 2014 CPS ASEC included redesigned questions for income and health insurance coverage

# Digest Data

In [1]:
# import dependencies
import pandas as pd
import plotly.express as px

In [2]:
# import data from github
# data originally as xlsx Excel file, but converted to csv to be hosted in github
yr2009 = pd.read_csv('https://raw.githubusercontent.com/oimartin/Older-and-Wiser/main/data/2009gender_table1.csv')
yr2010 = pd.read_csv('https://raw.githubusercontent.com/oimartin/Older-and-Wiser/main/data/2010gender_table1.csv')
yr2011 = pd.read_csv('https://raw.githubusercontent.com/oimartin/Older-and-Wiser/main/data/2011gender_table1.csv')
yr2012 = pd.read_csv('https://raw.githubusercontent.com/oimartin/Older-and-Wiser/main/data/2012gender_table1.csv')
yr2013 = pd.read_csv('https://raw.githubusercontent.com/oimartin/Older-and-Wiser/main/data/2013gender_table1.csv')
yr2014 = pd.read_csv('https://raw.githubusercontent.com/oimartin/Older-and-Wiser/main/data/2014gender_table1.csv')
yr2015 = pd.read_csv('https://raw.githubusercontent.com/oimartin/Older-and-Wiser/main/data/2015gender_table1.csv')
yr2016 = pd.read_csv('https://raw.githubusercontent.com/oimartin/Older-and-Wiser/main/data/2016gender_table1.csv')
yr2017 = pd.read_csv('https://raw.githubusercontent.com/oimartin/Older-and-Wiser/main/data/2017gender_table1.csv')
yr2018 = pd.read_csv('https://raw.githubusercontent.com/oimartin/Older-and-Wiser/main/data/2018gender_table1.csv')
yr2019 = pd.read_csv('https://raw.githubusercontent.com/oimartin/Older-and-Wiser/main/data/2019gender_table1.csv')

In [3]:
# Save what the original file looked like
# with notes
original_yr2009 = yr2009
original_yr2014 = yr2014

In [4]:
# Check shape of different years
years = [yr2009, yr2010, yr2011, yr2012, yr2013, yr2014,
         yr2015, yr2016, yr2017, yr2018, yr2019]

yrs = ['2009', '2010', '2011', '2012', '2013', '2014',
       '2015', '2016', '2017', '2018', '2019']
for year in years:
  display(year.shape)

(40, 8)

(40, 8)

(39, 8)

(39, 8)

(41, 8)

(49, 8)

(39, 8)

(41, 8)

(41, 8)

(41, 8)

(42, 8)

In [5]:
# clean up each year of data
for year in range(0,11):

  # Remove column with NaN
  years[year].drop(columns=['Unnamed: 7'], inplace=True)

  # Rename columns
  years[year].rename(columns={years[year].columns[0]:'Age',
                              years[year].columns[1]:'Both_Sexes_No',
                              years[year].columns[2]:'Both_Sexes_Perc.',
                              years[year].columns[3]:'Male_No',
                              years[year].columns[4]:'Male_Perc.',
                              years[year].columns[5]:'Fem_No',
                              years[year].columns[6]:'Fem_Perc.'}, inplace=True)
  # Remove NaN, (X), commas, white space, periods
  years[year].dropna(axis=0, inplace=True)
  years[year].replace('(X)', 0, regex=True, inplace=True)
  years[year].replace(',', '', regex=True, inplace=True)
  years[year]['Age'] = years[year]['Age'].str.lstrip()
  years[year]['Age'] = years[year]['Age'].str.lstrip('.')

  # Convert strings to floats
  for column in list(years[year].columns)[1:-1]:
      years[year][column] = years[year][column].astype(float)

  # Create new age - 20 years and under
  under20 =  years[year].iloc[19, 0:].copy() +  years[year].iloc[20, 0:].copy() +  years[year].iloc[21, 0:].copy()
  under20['Age'] = '20 years and under'
  under20df = pd.DataFrame(under20).T
  years[year] = pd.concat([years[year], under20df]).copy()

  # Add year to df
  years[year][yrs[year]] = yrs[year]
  years[year].rename(columns={years[year].columns[7]:'Year'}, inplace=True)

  # Create index of both_sexes, male, and female populations from 2009
  years[year]['Both_Sexes_No_Index'] = (years[year]['Both_Sexes_No']/ years[0]['Both_Sexes_No'])*100
  years[year]['Male_No_Index'] = (years[year]['Male_No']/ years[0]['Male_No'])*100
  years[year]['Fem_No_Index'] = (years[year]['Fem_No']/ years[0]['Fem_No'])*100
  

In [6]:
 # confirm changes to each year of data
for year in years:
  display(year.shape)

(27, 11)

(27, 11)

(27, 11)

(27, 11)

(27, 11)

(27, 11)

(27, 11)

(27, 11)

(27, 11)

(27, 11)

(27, 11)

## Separate out Median Age from other Ages

In [7]:
# combine all of the years of data
all_09_19 = pd.concat(years)

In [8]:
# seperate out general groups 
gen_groups = all_09_19.loc[(all_09_19['Age'] == '45 to 64 years') | 
              (all_09_19['Age'] == '21 to 44 years') |
              (all_09_19['Age'] == '20 years and under') |
              (all_09_19['Age'] == '65 years and over')].copy()

In [9]:
gen_groups.head()

Unnamed: 0,Age,Both_Sexes_No,Both_Sexes_Perc.,Male_No,Male_Perc.,Fem_No,Fem_Perc.,Year,Both_Sexes_No_Index,Male_No_Index,Fem_No_Index
28,21 to 44 years,98420.0,32.6,49207.0,33.2,49214.0,32.1,2009,100.0,100.0,100.0
29,45 to 64 years,78655.0,26.1,38279.0,25.8,40377.0,26.3,2009,100.0,100.0,100.0
30,65 years and over,37788.0,12.5,16308.0,11.0,21480.0,14.0,2009,100.0,100.0,100.0
0,20 years and under,86620.0,28.7,44301.0,29.9,42318.0,19.54.23.8,2009,100.0,100.0,100.0
28,21 to 44 years,98303.0,32.3,49236.0,32.9,49068.0,31.7,2010,99.881122,100.058935,99.703336


In [14]:
gen_groups.loc[:,'Age_order'] = gen_groups.loc[:,'Age'].replace({'20 years and under': 1,
                                                                  '21 to 44 years': 2,
                                                                  '45 to 64 years': 3,
                                                                  '65 years and over': 4})
gen_groups.sort_values(by=['Year', 'Age_order'], inplace=True)

# Compare different age groups change in population from 2009-2019

## Final Graphs

In [19]:
fig = px.area(gen_groups, x='Year', y='Both_Sexes_No',
              color_discrete_map={'45 to 64 years': '#8154b7',
                                  '21 to 44 years': '#9fbb07',
                                  '20 years and under': '#538ca9',
                                  '65 years and over': '#ab6845'},
              color='Age', markers=True,
              title="Total US Population From 2009 to 2019")
fig.update_layout(yaxis_title='Population (in thousands)', showlegend=False,
                  template="plotly_white")
fig.show()

In [17]:
fig = px.line(gen_groups, x='Year', y='Both_Sexes_No_Index', color='Age',
              color_discrete_map={'45 to 64 years': '#8154b7',
                                  '21 to 44 years': '#9fbb07',
                                  '20 years and under': '#538ca9',
                                  '65 years and over': '#ab6845'},
              markers=True, title='US Population Age Groups Change since 2009')
fig.update_layout(yaxis_title='Index of Population Compared to 2009',
                  template="plotly_white",)
fig.show()