In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the two raw population tables: the 2010-2019 file and the 2020-2021 file.
df_old, df_new = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/data_raw/pop-2010-2019.csv',
        './data/data_raw/pop-2020-2021.csv',
    )
)

# Preview the first rows of the 2010-2019 table.
df_old.head()

Unnamed: 0,Geographic Area Name (NAME),Date Description (DATE_DESC),Population (POP)
0,Alabama,4/1/2010 Census population,4779736
1,Alabama,4/1/2010 population estimates base,4780125
2,Alabama,7/1/2010 population estimate,4785437
3,Alabama,7/1/2011 population estimate,4799069
4,Alabama,7/1/2012 population estimate,4815588


In [3]:
# Swap the verbose source headers for the short names used in the following cells.
df_old = df_old.rename(
    columns={
        'Geographic Area Name (NAME)': 'state',
        'Date Description (DATE_DESC)': 'date',
        'Population (POP)': 'pop',
    }
)

In [4]:
# Strip the descriptive text from the date column so only the m/d/YYYY part is
# left, then parse it. The order of the replacements matters: a label such as
# "4/1/2010 population estimates base" first becomes "4/1/2010s base", and the
# second replacement removes the leftover "s base".
df_old['date'] = pd.to_datetime(
    df_old['date']
    .str.replace(' population estimate', '')
    .str.replace('s base', '')
    .str.replace(' Census population', '')
)

# Drop the thousands separators so the population strings can be cast to int.
df_old['pop'] = df_old['pop'].str.replace(',', '').astype(int)

# Some (state, date) pairs occur more than once (e.g. the 4/1/2010 census count
# and the 4/1/2010 estimates base); collapse them to their mean and index the
# result by date.
df_old = (
    df_old.groupby(by=['state', 'date'])['pop']
    .mean()
    .reset_index()
    .set_index('date')
)

In [5]:
def clean_df_old(df):
    """Reshape a long population table into one column per state.

    The wide layout (dates as rows, states as columns) is the layout df_new is
    brought into, so the two tables can be concatenated afterwards.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format table with a ``state`` column, a ``pop`` column and the
        observation date as its index. Each (date, state) pair may appear at
        most once.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by the dates found in ``df`` (index name preserved), with
        one column per state in order of first appearance in ``df['state']``.
        A date that is missing for some state yields NaN in that state's column
        instead of being dropped.

    Raises
    ------
    ValueError
        Raised by ``DataFrame.pivot`` if a (date, state) pair is duplicated.
    """
    # Work only from the frame that was passed in; the previous version read the
    # notebook-level `df_old` here, which tied the function to that global.
    state_order = df['state'].unique()

    # pivot() uses the existing index as the row labels when `index` is omitted.
    wide = df.pivot(columns='state', values='pop')

    # pivot() sorts the columns; restore first-appearance order and drop the
    # 'state' label from the column axis so the header is a plain list of names.
    wide = wide.reindex(columns=state_order)
    wide.columns.name = None

    return wide

# Reshape the cleaned 2010-2019 table to wide format: dates as rows, one column per state.
df_old1 = clean_df_old(df_old)

In [6]:
# Lower-case the state names so the headers line up with df_new's lower-cased headers.
df_old1 = df_old1.rename(columns=str.lower)
df_old1

Unnamed: 0_level_0,alabama,alaska,arizona,arkansas,california,colorado,connecticut,delaware,district of columbia,florida,...,south dakota,tennessee,texas,utah,vermont,virginia,washington,west virginia,wisconsin,wyoming
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-04-01,4779930.5,710240.0,6392152.5,2915974.5,37254237.5,5029257.5,3574122.0,897935.5,601745.0,18802937.0,...,814189.0,6346190.5,25145826.0,2763888.0,625739.0,8001036.5,6724540.0,1853006.0,5687135.5,563700.5
2010-07-01,4785437.0,713910.0,6407172.0,2921964.0,37319502.0,5047349.0,3579114.0,899593.0,605226.0,18845537.0,...,816166.0,6355311.0,25241971.0,2775332.0,625879.0,8023699.0,6742830.0,1854239.0,5690475.0,564487.0
2011-07-01,4799069.0,722128.0,6472643.0,2940667.0,37638369.0,5121108.0,3588283.0,907381.0,619800.0,19053237.0,...,823579.0,6399291.0,25645629.0,2814384.0,627049.0,8101155.0,6826627.0,1856301.0,5705288.0,567299.0
2012-07-01,4815588.0,730443.0,6554978.0,2952164.0,37948800.0,5192647.0,3594547.0,915179.0,634924.0,19297822.0,...,833566.0,6453898.0,26084481.0,2853375.0,626090.0,8185080.0,6897058.0,1856872.0,5719960.0,576305.0
2013-07-01,4830081.0,737068.0,6632764.0,2959400.0,38260787.0,5269035.0,3594841.0,923576.0,650581.0,19545621.0,...,842316.0,6494340.0,26480266.0,2897640.0,626210.0,8252427.0,6963985.0,1853914.0,5736754.0,582122.0
2014-07-01,4841799.0,736283.0,6730413.0,2967392.0,38596972.0,5350101.0,3594524.0,932487.0,662328.0,19845911.0,...,849129.0,6541223.0,26964333.0,2936879.0,625214.0,8310993.0,7054655.0,1849489.0,5751525.0,582531.0
2015-07-01,4852347.0,737498.0,6829676.0,2978048.0,38918045.0,5450623.0,3587122.0,941252.0,675400.0,20209042.0,...,853988.0,6591170.0,27470056.0,2981835.0,625216.0,8361808.0,7163657.0,1842050.0,5760940.0,585613.0
2016-07-01,4863525.0,741456.0,6941072.0,2989918.0,39167117.0,5539215.0,3578141.0,948921.0,685815.0,20613477.0,...,862996.0,6646010.0,27914410.0,3041868.0,623657.0,8410106.0,7294771.0,1831023.0,5772628.0,584215.0
2017-07-01,4874486.0,739700.0,7044008.0,3001345.0,39358497.0,5611885.0,3573297.0,956823.0,694906.0,20963613.0,...,872868.0,6708799.0,28295273.0,3101042.0,624344.0,8463587.0,7423362.0,1817004.0,5790186.0,578931.0
2018-07-01,4887681.0,735139.0,7158024.0,3009733.0,39461588.0,5691287.0,3571520.0,965479.0,701547.0,21244317.0,...,878698.0,6771631.0,28628666.0,3153550.0,624358.0,8501286.0,7523869.0,1804291.0,5807406.0,577601.0


In [7]:
# Give the 2020-2021 table short headers (the population columns are labelled
# with the date they refer to), then transpose so that rows are dates and
# columns are states, the same layout as df_old1.
df_new = df_new.rename(
    columns={
        'Geographic Area Name (NAME)': 'state',
        'Estimates Base Population, April 1, 2020 (POP_BASE2020)': '2020-04-01',
        'Population Estimate, July 1, 2020 (POP_2020)': '2020-07-01',
        'Population Estimate, July 1, 2021 (POP_2021)': '2021-07-01',
    }
).T

# After the transpose the first row holds the former first column, which is
# used as the state names: promote it to the header and remove it from the data.
# NOTE(review): this relies on 'state' being the first column of the raw file.
df_new.columns = df_new.iloc[0]
df_new = df_new.iloc[1:]

# Every column is one state whose values are strings with thousands
# separators; strip the commas and cast each column to int.
df_new = df_new.apply(lambda counts: counts.str.replace(',', '').astype(int))

# Lower-case the state names, matching the headers of df_old1.
df_new.columns = df_new.columns.str.lower()

In [8]:
# Stack the 2010-2019 rows on top of the 2020-2021 rows; both frames use
# lower-cased state names as columns.
df_m = pd.concat([df_old1, df_new], axis=0)

# df_new's row labels are still date strings, so parse the combined index into
# datetimes.
df_m = df_m.set_index(pd.to_datetime(df_m.index))
df_m

Unnamed: 0,alabama,alaska,arizona,arkansas,california,colorado,connecticut,delaware,district of columbia,florida,...,south dakota,tennessee,texas,utah,vermont,virginia,washington,west virginia,wisconsin,wyoming
2010-04-01,4779930.5,710240.0,6392152.5,2915974.5,37254237.5,5029257.5,3574122.0,897935.5,601745.0,18802937.0,...,814189.0,6346190.5,25145826.0,2763888.0,625739.0,8001036.5,6724540.0,1853006.0,5687135.5,563700.5
2010-07-01,4785437.0,713910.0,6407172.0,2921964.0,37319502.0,5047349.0,3579114.0,899593.0,605226.0,18845537.0,...,816166.0,6355311.0,25241971.0,2775332.0,625879.0,8023699.0,6742830.0,1854239.0,5690475.0,564487.0
2011-07-01,4799069.0,722128.0,6472643.0,2940667.0,37638369.0,5121108.0,3588283.0,907381.0,619800.0,19053237.0,...,823579.0,6399291.0,25645629.0,2814384.0,627049.0,8101155.0,6826627.0,1856301.0,5705288.0,567299.0
2012-07-01,4815588.0,730443.0,6554978.0,2952164.0,37948800.0,5192647.0,3594547.0,915179.0,634924.0,19297822.0,...,833566.0,6453898.0,26084481.0,2853375.0,626090.0,8185080.0,6897058.0,1856872.0,5719960.0,576305.0
2013-07-01,4830081.0,737068.0,6632764.0,2959400.0,38260787.0,5269035.0,3594841.0,923576.0,650581.0,19545621.0,...,842316.0,6494340.0,26480266.0,2897640.0,626210.0,8252427.0,6963985.0,1853914.0,5736754.0,582122.0
2014-07-01,4841799.0,736283.0,6730413.0,2967392.0,38596972.0,5350101.0,3594524.0,932487.0,662328.0,19845911.0,...,849129.0,6541223.0,26964333.0,2936879.0,625214.0,8310993.0,7054655.0,1849489.0,5751525.0,582531.0
2015-07-01,4852347.0,737498.0,6829676.0,2978048.0,38918045.0,5450623.0,3587122.0,941252.0,675400.0,20209042.0,...,853988.0,6591170.0,27470056.0,2981835.0,625216.0,8361808.0,7163657.0,1842050.0,5760940.0,585613.0
2016-07-01,4863525.0,741456.0,6941072.0,2989918.0,39167117.0,5539215.0,3578141.0,948921.0,685815.0,20613477.0,...,862996.0,6646010.0,27914410.0,3041868.0,623657.0,8410106.0,7294771.0,1831023.0,5772628.0,584215.0
2017-07-01,4874486.0,739700.0,7044008.0,3001345.0,39358497.0,5611885.0,3573297.0,956823.0,694906.0,20963613.0,...,872868.0,6708799.0,28295273.0,3101042.0,624344.0,8463587.0,7423362.0,1817004.0,5790186.0,578931.0
2018-07-01,4887681.0,735139.0,7158024.0,3009733.0,39461588.0,5691287.0,3571520.0,965479.0,701547.0,21244317.0,...,878698.0,6771631.0,28628666.0,3153550.0,624358.0,8501286.0,7523869.0,1804291.0,5807406.0,577601.0


In [9]:
# Save the merged table; to_csv writes the datetime index as the first column by default.
df_m.to_csv('./data/pop_df_merged.csv')