# Combine Census PEP characteristics data for Ohio counties

This script will retrieve the County Population by Characteristics datasets for several decades from the Census Population Estimates Program website, merge them, and standarize them as a long-format CSV file. 

## Setup

### Import required packages

In [196]:
import os
import pandas as pd
import re
import json
import sys
sys.path.append(os.path.normpath("../morpc-common"))
import morpc

### Parameters

In [197]:
# Define county groups
CONST_REGIONS = {
    "15-County Region": ["Delaware", "Fairfield", "Fayette", "Franklin", "Hocking", "Knox", "Licking", "Logan", "Madison", "Marion", "Morrow", "Perry", "Pickaway", "Ross", "Union"],
    "10-County Region": ["Delaware", "Fairfield", "Franklin", "Knox", "Licking", "Madison", "Marion", "Morrow", "Pickaway", "Union"],
    "CORPO Region": ["Fairfield", "Knox", "Madison", "Marion", "Morrow", "Pickaway", "Union"]
}

# Define input and output directories
INPUT_DIR = "./input_data"
OUTPUT_DIR = "./output_data"

### Define inputs

#### Census PEP data 2000+

In [198]:
CENSUSPEP_COUNTY_WIDE_2000_TABLE_FILENAME = "censusPep_county_char.csv"
CENSUSPEP_COUNTY_WIDE_2000_TABLE_PATH = os.path.join(INPUT_DIR, CENSUSPEP_COUNTY_WIDE_2000_TABLE_FILENAME)
CENSUSPEP_COUNTY_WIDE_2000_TABLE_SCHEMA_FILENAME = CENSUSPEP_COUNTY_WIDE_2000_TABLE_FILENAME.replace(".csv","_schema.json")
CENSUSPEP_COUNTY_WIDE_2000_TABLE_SCHEMA_PATH = os.path.join(INPUT_DIR, CENSUSPEP_COUNTY_WIDE_2000_TABLE_SCHEMA_FILENAME)

#### Census PEP data 1980-1989

In [199]:
CENSUSPEP_COUNTY_WIDE_1980_1989_TABLE_FILENAME = "pe-02.csv"
CENSUSPEP_COUNTY_WIDE_1980_1989_TABLE_PATH = os.path.join(INPUT_DIR, CENSUSPEP_COUNTY_WIDE_1980_1989_TABLE_FILENAME)
CENSUSPEP_COUNTY_WIDE_1980_1989_TABLE_SCHEMA_FILENAME = CENSUSPEP_COUNTY_WIDE_1980_1989_TABLE_FILENAME.replace(".csv","_schema.json")
CENSUSPEP_COUNTY_WIDE_1980_1989_TABLE_SCHEMA_PATH = os.path.join(INPUT_DIR, CENSUSPEP_COUNTY_WIDE_1980_1989_TABLE_SCHEMA_FILENAME)

#### Census PEP data 1990-1999

In [200]:
CENSUSPEP_COUNTY_WIDE_1990_1999_TABLE_FILENAME = "stch-icen1990.txt"
CENSUSPEP_COUNTY_WIDE_1990_1999_TABLE_PATH = os.path.join(INPUT_DIR, CENSUSPEP_COUNTY_WIDE_1990_1999_TABLE_FILENAME)
CENSUSPEP_COUNTY_WIDE_1990_1999_CSV_TABLE_FILENAME = "stch-icen1990.csv"
CENSUSPEP_COUNTY_WIDE_1990_1999_CSV_TABLE_PATH = os.path.join(INPUT_DIR, CENSUSPEP_COUNTY_WIDE_1990_1999_CSV_TABLE_FILENAME)
CENSUSPEP_COUNTY_WIDE_1990_1999_TABLE_SCHEMA_FILENAME = CENSUSPEP_COUNTY_WIDE_1990_1999_TABLE_FILENAME.replace(".txt","_schema.json")
CENSUSPEP_COUNTY_WIDE_1990_1999_TABLE_SCHEMA_PATH = os.path.join(INPUT_DIR, CENSUSPEP_COUNTY_WIDE_1990_1999_TABLE_SCHEMA_FILENAME)

### Define output

In [201]:
OUTPUT_TABLE_FILENAME = "output_data.csv"
OUTPUT_TABLE_PATH = os.path.join(OUTPUT_DIR, OUTPUT_TABLE_FILENAME)
OUTPUT_TABLE1_FILENAME = "output_data1.csv"
OUTPUT_TABLE1_PATH = os.path.join(OUTPUT_DIR, OUTPUT_TABLE1_FILENAME)

## Getting 2000+ data

In [202]:
# Adjust the path to where the .csv table and .json schema are located
table_path = "../morpc-censuspep-fetch/output_data/censusPep_county_char.csv"

# Read the CSV file into a DataFrame
data = pd.read_csv(table_path, index_col=0)

# Filter out rows where CTYNAME is not in the "10-County Region"
filtered_df = data[data['CTYNAME'].isin(CONST_REGIONS["10-County Region"])]

# Filter rows where "Variable" is equal to "TOT_POP"
filtered_df = filtered_df[filtered_df['Variable'] == 'TOT_POP']

# Save the census data to the input directory
filtered_df.to_csv(CENSUSPEP_COUNTY_WIDE_2000_TABLE_PATH)

In [203]:
grouped_sum = filtered_df.groupby(['YEAR', 'AGEGRP'])['Value'].sum().reset_index()
aggregated_df = pd.DataFrame(columns=['YEAR', 'AGEGRP', 'Value'])
aggregated_df = pd.concat([aggregated_df, grouped_sum], ignore_index=True)

# Show the aggregated DataFrame
aggregated_df.head()

aggregated_df.to_csv(CENSUSPEP_COUNTY_WIDE_2000_TABLE_PATH)

In [204]:
# Group by "YEAR" and sum "Value" for each group
yearly_sum = aggregated_df.groupby('YEAR')['Value'].sum().reset_index()

# Add a new column for "AGEGRP" with the value "Total" for these summary rows
yearly_sum['AGEGRP'] = 'Total'

# Rearrange columns to match the original DataFrame's order
yearly_sum = yearly_sum[['YEAR', 'AGEGRP', 'Value']]

# Append these new summary rows to the original DataFrame
updated_df = pd.concat([aggregated_df, yearly_sum], ignore_index=True)

# Convert AGEGRP to a type that sorts numerically first, then by string
updated_df['AGEGRP'] = pd.Categorical(updated_df['AGEGRP'], ordered=True, 
                                      categories=sorted(updated_df['AGEGRP'].unique(), key=str))

# Sort the DataFrame by 'YEAR' and then by 'AGEGRP'
sorted_df = updated_df.sort_values(by=['YEAR', 'AGEGRP'])

yearly_sum = yearly_sum[yearly_sum['YEAR'].isin([2000, 2010, 2020])]

# Filter the DataFrame to keep only rows where YEAR is 2000, 2010, or 2020
filtered_df = sorted_df[sorted_df['YEAR'].isin([2000, 2010, 2020])]

totals_df = filtered_df[filtered_df['AGEGRP'] == 'total']

In [205]:
yearly_sum.head()

Unnamed: 0,YEAR,AGEGRP,Value
0,2000,Total,1740458
10,2010,Total,3936535
20,2020,Total,6614062


### Splitting age ranges

In [206]:
# Filter the rows for the 15-19 age group
age_15_19 = filtered_df[filtered_df['AGEGRP'] == '15 to 19']

# Calculate new values assuming equal proportions for the 15-17 and 18-19 sub-ranges
# Assuming each year within 15-19 has an equal proportion
proportion = 3/5  # 3 years (15-17) out of 5 (15-19)
age_15_17_value = age_15_19['Value'] * proportion
age_18_19_value = age_15_19['Value'] * (2/5)  # Remaining proportion for 2 years (18-19)

# Create new rows for the 15-17 and 18-19 sub-ranges
age_15_17_rows = age_15_19.copy()
age_18_19_rows = age_15_19.copy()

age_15_17_rows['AGEGRP'] = '15 to 17'
age_15_17_rows['Value'] = age_15_17_value

age_18_19_rows['AGEGRP'] = '18 to 19'
age_18_19_rows['Value'] = age_18_19_value

# Append the new rows to the original DataFrame
filtered_df = pd.concat([filtered_df, age_15_17_rows, age_18_19_rows], ignore_index=True)

# Remove the original 15-19 age group rows
filtered_df = filtered_df[filtered_df['AGEGRP'] != '15 to 19']

In [207]:
AGEUNDER18 = ['0 to 4', '5 to 9', '10 to 14', '15 to 17']

# Filter the DataFrame for the specified age groups
under_18_df = filtered_df[filtered_df['AGEGRP'].isin(AGEUNDER18)]

# Group by 'YEAR' and sum 'Value' for each group, creating a new 'AGEGRP' for "UNDER 18"
under_18_df = under_18_df.groupby('YEAR')['Value'].sum().reset_index()
under_18_df['AGEGRP'] = 'UNDER 18'

under_18_df.head()

Unnamed: 0,YEAR,Value,AGEGRP
0,2000,446730.8,UNDER 18
1,2010,973021.4,UNDER 18
2,2020,1548935.6,UNDER 18


In [208]:
# Define the age groups for 35 to 64
AGE18TO64 = ['18 to 19','20 to 24' '25 to 29', '30 to 34', '35 to 39', '40 to 44', '45 to 49','50 to 54','55 to 59','60 to 64']

# Filter the DataFrame for the specified age groups
filtered_df_18to64 = filtered_df[filtered_df['AGEGRP'].isin(AGE18TO64)]

# Group by 'YEAR' and sum 'Value' for each group, creating a new 'AGEGRP' for "35 TO 64"
age18to64_df = filtered_df_18to64.groupby('YEAR')['Value'].sum().reset_index()
age18to64_df['AGEGRP'] = '35 TO 64'

age18to64_df.head()

Unnamed: 0,YEAR,Value,AGEGRP
0,2000,848493.2,35 TO 64
1,2010,1949592.6,35 TO 64
2,2020,3214781.4,35 TO 64


In [209]:
# Define the age groups for 65+
AGE65PLUS = ['65 to 69', '70 to 74', '75 to 79', '80 to 84', '85 or older'] 

# Filter the DataFrame for the specified age groups
over_64_df = filtered_df[filtered_df['AGEGRP'].isin(AGE65PLUS)]

# Group by 'YEAR' and sum 'Value' for each group, creating a new 'AGEGRP' for "65 AND OLDER"
over_64_df = over_64_df.groupby('YEAR')['Value'].sum().reset_index()
over_64_df['AGEGRP'] = '65 AND OLDER'

over_64_df.head()

Unnamed: 0,YEAR,Value,AGEGRP
0,2000,178877,65 AND OLDER
1,2010,428157,65 AND OLDER
2,2020,920987,65 AND OLDER


In [210]:
combined_2000_df = pd.concat([under_18_df, age18to64_df, over_64_df, yearly_sum], ignore_index=True)
combined_2000_df.head()

Unnamed: 0,YEAR,Value,AGEGRP
0,2000,446730.8,UNDER 18
1,2010,973021.4,UNDER 18
2,2020,1548935.6,UNDER 18
3,2000,848493.2,35 TO 64
4,2010,1949592.6,35 TO 64


## Adding 1990

In [211]:
# Read the file using read_csv with a regex pattern as the delimiter for multiple spaces
df = pd.read_csv(CENSUSPEP_COUNTY_WIDE_1990_1999_TABLE_PATH, sep='\s+', header=None)

df.columns = ['YEAR', 'FIPSCOUNTY', 'AGEGRP', 'RACE-SEX', 'LATIN', 'Value']

df.to_csv(CENSUSPEP_COUNTY_WIDE_1990_1999_CSV_TABLE_PATH)

df.head()

  df = pd.read_csv(CENSUSPEP_COUNTY_WIDE_1990_1999_TABLE_PATH, sep='\s+', header=None)


Unnamed: 0,YEAR,FIPSCOUNTY,AGEGRP,RACE-SEX,LATIN,Value
0,90,1001,0,1,1,239
1,90,1001,0,2,1,203
2,90,1001,1,1,1,821
3,90,1001,1,2,1,769
4,90,1001,2,1,1,1089


In [212]:
fips_codes_to_keep= [39041, 39045, 39049, 39083, 39089, 39097, 39101, 39117, 39129, 39159]

# Filter for 10-County region
filtered_df = df[df['FIPSCOUNTY'].isin(fips_codes_to_keep)]

# Keep only the 'YEAR', 'AGEGRP', and 'VALUE' columns
filtered_df = filtered_df[['YEAR', 'AGEGRP', 'Value']]

# Filter the DataFrame to keep only rows where YEAR equals 90
filtered_df = filtered_df[filtered_df['YEAR'] == 90]

print(filtered_df)

        YEAR  AGEGRP  Value
626240    90       0    482
626241    90       0    451
626242    90       1   2016
626243    90       1   1915
626244    90       2   2667
...      ...     ...    ...
644475    90      16      0
644476    90      17      0
644477    90      17      0
644478    90      18      0
644479    90      18      0

[3040 rows x 3 columns]


In [213]:
# Group by 'YEAR' and 'AGEGRP', and sum 'VALUE' for each group
added_1990_df = filtered_df.groupby(['YEAR', 'AGEGRP'], as_index=False)['Value'].sum()

# Print the combined DataFrame
print(added_1990_df)

# Sum all values in the "VALUE" column
total_value_sum = added_1990_df['Value'].sum()

    YEAR  AGEGRP   Value
0     90       0   23969
1     90       1   89499
2     90       2  107569
3     90       3  101009
4     90       4  111660
5     90       5  134734
6     90       6  141905
7     90       7  143236
8     90       8  126129
9     90       9  112327
10    90      10   83453
11    90      11   69314
12    90      12   61711
13    90      13   60062
14    90      14   52851
15    90      15   39541
16    90      16   29359
17    90      17   19221
18    90      18   15450


### Splitting age ranges

In [214]:
# Filter the rows for the 15-19 age group
age_15_19 = added_1990_df[added_1990_df['AGEGRP'] == 4]

# Calculate new values assuming equal proportions for the 15-17 and 18-19 sub-ranges
# Assuming each year within 15-19 has an equal proportion
proportion = 3/5  # 3 years (15-17) out of 5 (15-19)
age_15_17_value = age_15_19['Value'] * proportion
age_18_19_value = age_15_19['Value'] * (2/5)  # Remaining proportion for 2 years (18-19)

# Create new rows for the 15-17 and 18-19 sub-ranges
age_15_17_rows = age_15_19.copy()
age_18_19_rows = age_15_19.copy()

age_15_17_rows['AGEGRP'] = '15 to 17'
age_15_17_rows['Value'] = age_15_17_value

age_18_19_rows['AGEGRP'] = '18 to 19'
age_18_19_rows['Value'] = age_18_19_value

# Append the new rows to the original DataFrame
added_1990_df = pd.concat([filtered_df, age_15_17_rows, age_18_19_rows], ignore_index=True)

# Remove the original 15-19 age group rows
added_1990_df = added_1990_df[added_1990_df['AGEGRP'] != 4]

In [215]:
# Define the age groups for 65+
AGE65PLUS = [14, 15, 16, 17, 18] 

# Filter the DataFrame for the specified age groups
over_64_df = added_1990_df[added_1990_df['AGEGRP'].isin(AGE65PLUS)]

# Group by 'YEAR' and sum 'Value' for each group, creating a new 'AGEGRP' for "35 TO 64"
over_64_df = over_64_df.groupby('YEAR')['Value'].sum().reset_index()
over_64_df['AGEGRP'] = '65 AND OLDER'

over_64_df.head()

Unnamed: 0,YEAR,Value,AGEGRP
0,90,156422.0,65 AND OLDER


In [216]:
# Define the age groups for 18 to 64
AGE18TO64 = ['18 to 19',5,6,7,8, 9, 10,11,12,13]

# Filter the DataFrame for the specified age groups
filtered_df_18to64 = added_1990_df[added_1990_df['AGEGRP'].isin(AGE18TO64)]

# Group by 'YEAR' and sum 'Value' for each group, creating a new 'AGEGRP' for "35 TO 64"
age18to64_df = filtered_df_18to64.groupby('YEAR')['Value'].sum().reset_index()
age18to64_df['AGEGRP'] = '18 TO 64'

age18to64_df.head()

Unnamed: 0,YEAR,Value,AGEGRP
0,90,977535.0,18 TO 64


In [217]:
# Define the age groups for under 18
AGEUNDER18 = [0,1,2,3,'15 to 17']

# Filter the DataFrame for the specified age groups
under_18_df = added_1990_df[added_1990_df['AGEGRP'].isin(AGEUNDER18)]

# Group by 'YEAR' and sum 'Value' for each group, creating a new 'AGEGRP' for "UNDER 35"
under_18_df = under_18_df.groupby('YEAR')['Value'].sum().reset_index()
under_18_df['AGEGRP'] = 'UNDER 18'

under_18_df.head()

Unnamed: 0,YEAR,Value,AGEGRP
0,90,389042.0,UNDER 18


In [218]:
# Create a new DataFrame for the row to be added
new_row_df = pd.DataFrame({'YEAR': [90], 'AGEGRP': ['Total'], 'Value': [total_value_sum]})

combined_1990_df = pd.concat([under_18_df, age18to64_df, over_64_df, new_row_df], ignore_index=True)

# Change rows where YEAR="90" to YEAR="1990"
combined_1990_df.loc[combined_1990_df['YEAR'] == 90, 'YEAR'] = 1990

combined_1990_df.head()

Unnamed: 0,YEAR,Value,AGEGRP
0,1990,389042.0,UNDER 18
1,1990,977535.0,18 TO 64
2,1990,156422.0,65 AND OLDER
3,1990,1522999.0,Total


## Adding 1980

In [219]:
# Read the CSV file into a DataFrame
data = pd.read_csv(CENSUSPEP_COUNTY_WIDE_1980_1989_TABLE_PATH, index_col=0,low_memory=False)

# Remove the first four rows
df = data.iloc[4:]

# Use the first row to set the column names
df.columns = df.iloc[0]

# Drop the first row
df = df.drop(df.index[0])

# Reset the index
df.reset_index(drop=True, inplace=True)

# Keep only the first 18847 rows (years after 1980) of the DataFrame
df_filtered = df.iloc[:18847]


In [220]:
fips_codes_to_keep= ["39041", "39045", "39049", "39083", "39089", "39097", "39101", "39117", "39129", "39159"]

# Filter for 10-County region
filtered_df = df_filtered[df_filtered['FIPS State and County Codes'].isin(fips_codes_to_keep)]

filtered_df = filtered_df[['Under 5 years', '5 to 9 years', '10 to 14 years','15 to 19 years','20 to 24 years','25 to 29 years','30 to 34 years','35 to 39 years', '40 to 44 years','45 to 49 years','50 to 54 years','55 to 59 years','60 to 64 years','65 to 69 years','70 to 74 years','75 to 79 years','80 to 84 years','85 years and over']]

filtered_df.head()

Year of Estimate,Under 5 years,5 to 9 years,10 to 14 years,15 to 19 years,20 to 24 years,25 to 29 years,30 to 34 years,35 to 39 years,40 to 44 years,45 to 49 years,50 to 54 years,55 to 59 years,60 to 64 years,65 to 69 years,70 to 74 years,75 to 79 years,80 to 84 years,85 years and over
12355,1972,2101,2319,2978,2417,2094,2113,1820,1576,1419,1352,1163,884,690,522,317,175,123
12356,1818,1966,2186,2763,2351,2138,2181,1937,1550,1341,1296,1206,979,857,722,552,378,347
12357,52,52,54,97,69,51,45,33,34,26,26,26,20,12,12,9,7,2
12358,50,38,66,118,62,43,41,23,31,27,26,28,23,25,16,8,7,6
12359,9,6,11,12,21,14,8,8,7,3,2,1,1,4,2,1,1,0


In [221]:
# Convert all columns in the DataFrame to integer type
filtered_df = filtered_df.astype(int)

# Sum all columns individually
column_sums = filtered_df.sum()

# Sum the first 3.6 column sums
UNDER18SUM = (column_sums.iloc[:3].sum() + (column_sums.iloc[3] *3/5))

# Sum the next 6 column sums (from the 8th to the 13th column)
AGE18TO64SUM = (column_sums.iloc[3] *2/5)+ column_sums.iloc[4:13].sum()

# Sum the following 5 column sums after the first 13 (starting from the 14th column)
OVER64SUM = column_sums.iloc[13:18].sum()

TOTALSUM=(column_sums.sum())

# Create a new DataFrame for the row to be added
new_row_1980_1 = pd.DataFrame({'YEAR': [1980], 'AGEGRP': ['UNDER 18'], 'Value': [UNDER18SUM]})
# Create a new DataFrame for the row to be added
new_row_1980_2 = pd.DataFrame({'YEAR': [1980], 'AGEGRP': ['18 TO 64'], 'Value': [AGE18TO64SUM]})
# Create a new DataFrame for the row to be added
new_row_1980_3 = pd.DataFrame({'YEAR': [1980], 'AGEGRP': ['65 AND OLDER'], 'Value': [OVER64SUM]})
# Create a new DataFrame for the row to be added
new_row_1980_4 = pd.DataFrame({'YEAR': [1980], 'AGEGRP': ['Total'], 'Value': [TOTALSUM]})

combined_1980_df= pd.concat([new_row_1980_1, new_row_1980_2, new_row_1980_3, new_row_1980_4], ignore_index=True)

combined_1980_df.head()

Unnamed: 0,YEAR,AGEGRP,Value
0,1980,UNDER 18,396449.4
1,1980,18 TO 64,864221.6
2,1980,65 AND OLDER,127812.0
3,1980,Total,1388483.0


## Combining Census data

In [222]:
combined_all_df= pd.concat([combined_1980_df, combined_1990_df, combined_2000_df], ignore_index=True)

# Sort the DataFrame by 'YEAR' and then by 'AGEGRP'
combined_all_df = combined_all_df.sort_values(by=['YEAR', 'AGEGRP'])

combined_all_df.to_csv(OUTPUT_TABLE_PATH)

## Getting County Projection data

In [223]:
# Path to the .xlsx file
excel_path = "../morpc-county-controls/deliverables/CountyControls_WEB.xlsx"

# Use ExcelFile to open the Excel file and iterate through sheets
with pd.ExcelFile(excel_path) as xls:
    for sheet_name in xls.sheet_names:
        # Read each sheet to a pandas DataFrame
        df = pd.read_excel(xls, sheet_name=sheet_name)
        
        # Generate CSV file name based on sheet name
        csv_file = f'./input_data/{sheet_name}.csv'
        
        # Save the DataFrame as a CSV file
        df.to_csv(csv_file, index=False)

        print(f'Saved {sheet_name} to {csv_file}')

Saved Total Population to ./input_data/Total Population.csv
Saved Population Charts to ./input_data/Population Charts.csv
Saved Population by Age to ./input_data/Population by Age.csv
Saved Group Quarters and Households to ./input_data/Group Quarters and Households.csv
Saved Household Population by Age to ./input_data/Household Population by Age.csv
Saved Households and Housing Units to ./input_data/Households and Housing Units.csv
Saved Residential Labor Force to ./input_data/Residential Labor Force.csv
Saved Jobs to ./input_data/Jobs.csv
Saved All Data (Unformatted) to ./input_data/All Data (Unformatted).csv
Saved Variable Dictionary to ./input_data/Variable Dictionary.csv
Saved Population Chart Data to ./input_data/Population Chart Data.csv
Saved Revision History to ./input_data/Revision History.csv


In [224]:
data = pd.read_csv("./input_data/Population by Age.csv", index_col=0,low_memory=False)
# Skip the first two rows and reset the index
df_modified = data.iloc[2:].reset_index(drop=True)

# Use the first row (previously the third row of the original df) as column headers
new_header = df_modified.iloc[0] # Capture the third row's values to be the new header
df_modified = df_modified[1:] # Remove the first row from data
df_modified.columns = new_header # Set the new header as the df columns

# Reset the index again to account for the removed row
df_modified.reset_index(drop=True, inplace=True)


In [225]:
# Keep only the first 8 columns
df_filtered = df_modified.iloc[:, :8].copy()

# Columns to keep
columns_to_keep = [2030.0, 2040.0, 2050.0]

# Filter the DataFrame to keep only the specified columns
df_filtered = df_filtered[columns_to_keep]

In [226]:
# Take the first 16 rows of the DataFrame
df_16_filtered = df_filtered.iloc[:16].copy()
df_16_filtered = df_16_filtered.astype(int)

# Take the 20-36 rows of the DataFrame
df_20_36_filtered = df_filtered.iloc[20:36].copy()
df_20_36_filtered = df_20_36_filtered.astype(int)

# Take the first 41-56 rows of the DataFrame
df_41_56_filtered = df_filtered.iloc[40:56].copy()
df_41_56_filtered = df_41_56_filtered.astype(int)

In [227]:
# Drop specified rows for 10_County region
rows_to_remove = [2, 4, 7, 11, 13, 15]  # Adjusting indices by -1 for zero-based indexing
under_18_filtered = df_16_filtered.drop(rows_to_remove)
rows_to_remove = [22, 24, 27, 31, 33, 35]
age_18_64_filtered = df_20_36_filtered.drop(rows_to_remove)
rows_to_remove = [42, 44, 47, 51, 53, 55]
over_64_filtered = df_41_56_filtered.drop(rows_to_remove)

In [228]:
print(age_18_64_filtered)

0   2030.0  2040.0   2050.0
20  162958  195085   226432
21  102543  114186   126510
23  897086  985153  1051047
25   35251   36799    38702
26  114735  130275   140179
28   29783   32561    35517
29   37481   38725    39029
30   21385   22481    23961
32   40177   44691    48981
34   43147   49981    58482


In [229]:
# Calculate the sum of each column
column_over_64_sums = over_64_filtered.sum()

# Create a new DataFrame from the sums
sums_over_64_df = pd.DataFrame({'Column': column_over_64_sums.index, 'Value': column_over_64_sums.values})


# Calculate the sum of each column
column_under_18_sums = under_18_filtered.sum()

# Create a new DataFrame from the sums
sums_under_18_df = pd.DataFrame({'Column': column_under_18_sums.index, 'Value': column_under_18_sums.values})


# Calculate the sum of each column
column_18_64_sums = age_18_64_filtered.sum()

# Create a new DataFrame from the sums
sums_18_64_df = pd.DataFrame({'Column': column_18_64_sums.index, 'Value': column_18_64_sums.values})



In [230]:
# Create a new DataFrame for the transformation
done_over_64_df = pd.DataFrame()

# Assign 'Column' to 'YEAR', keep 'Value' as is, and set 'AGEGRP' to "UNDER 18"
done_over_64_df['YEAR'] = sums_over_64_df['Column'].astype(int).astype(str)  # Converting to int then to string if you want YEAR as a string
done_over_64_df['AGEGRP'] = "65 AND OLDER"
done_over_64_df['Value'] = sums_over_64_df['Value']


# Create a new DataFrame for the transformation
done_under_18_df = pd.DataFrame()
# Assign 'Column' to 'YEAR', keep 'Value' as is, and set 'AGEGRP' to "UNDER 18"
done_under_18_df['YEAR'] = sums_under_18_df['Column'].astype(int).astype(str)  # Converting to int then to string if you want YEAR as a string
done_under_18_df['AGEGRP'] = "UNDER 18"
done_under_18_df['Value'] = sums_under_18_df['Value']

# Create a new DataFrame for the transformation
done_18_64_df = pd.DataFrame()
# Assign 'Column' to 'YEAR', keep 'Value' as is, and set 'AGEGRP' to "UNDER 18"
done_18_64_df['YEAR'] = sums_18_64_df['Column'].astype(int).astype(str)  # Converting to int then to string if you want YEAR as a string
done_18_64_df['AGEGRP'] = "18 TO 64"
done_18_64_df['Value'] = sums_18_64_df['Value']

almost_done_df= pd.concat([done_under_18_df, done_18_64_df, done_over_64_df, combined_all_df], ignore_index=True)


# Filter rows for the year 2030
year_2030_rows = almost_done_df[almost_done_df['YEAR'] == '2030']

# Calculate the sum of 'Value' for these rows
total_value_2030 = year_2030_rows['Value'].sum()

# Create a new row with YEAR="2030", AGEGRP='Total', and the calculated sum
new_row = pd.DataFrame({'YEAR': ['2030'], 'AGEGRP': ['Total'], 'Value': [total_value_2030]})

# Append the new row to the DataFrame
almost_done_df = pd.concat([almost_done_df, new_row], ignore_index=True)

# Filter rows for the year 2030
year_2040_rows = almost_done_df[almost_done_df['YEAR'] == '2040']

# Calculate the sum of 'Value' for these rows
total_value_2040 = year_2040_rows['Value'].sum()

# Create a new row with YEAR="2030", AGEGRP='Total', and the calculated sum
new_row = pd.DataFrame({'YEAR': ['2040'], 'AGEGRP': ['Total'], 'Value': [total_value_2040]})

# Append the new row to the DataFrame
almost_done_df = pd.concat([almost_done_df, new_row], ignore_index=True)

# Filter rows for the year 2030
year_2050_rows = almost_done_df[almost_done_df['YEAR'] == '2050']

# Calculate the sum of 'Value' for these rows
total_value_2050 = year_2050_rows['Value'].sum()

# Create a new row with YEAR="2030", AGEGRP='Total', and the calculated sum
new_row = pd.DataFrame({'YEAR': ['2050'], 'AGEGRP': ['Total'], 'Value': [total_value_2050]})

# Append the new row to the DataFrame
almost_done_df = pd.concat([almost_done_df, new_row], ignore_index=True)

almost_done_df['Value'] = almost_done_df['Value'].round()

# Sort the DataFrame by 'YEAR' and then by 'AGEGRP'
almost_done_df = almost_done_df.sort_values(by=['YEAR', 'AGEGRP'])

print(almost_done_df)

    YEAR        AGEGRP      Value
9   1980      18 TO 64   864221.6
10  1980  65 AND OLDER   127812.0
11  1980         Total  1388483.0
12  1980      UNDER 18   396449.4
13  1990      18 TO 64   977535.0
14  1990  65 AND OLDER   156422.0
15  1990         Total  1522999.0
16  1990      UNDER 18   389042.0
17  2000      35 TO 64   848493.2
18  2000  65 AND OLDER     178877
19  2000         Total    1740458
20  2000      UNDER 18   446730.8
21  2010      35 TO 64  1949592.6
22  2010  65 AND OLDER     428157
23  2010         Total    3936535
24  2010      UNDER 18   973021.4
25  2020      35 TO 64  3214781.4
26  2020  65 AND OLDER     920987
27  2020         Total    6614062
28  2020      UNDER 18  1548935.6
3   2030      18 TO 64    1484546
6   2030  65 AND OLDER     410514
29  2030         Total    2459127
0   2030      UNDER 18     564067
4   2040      18 TO 64    1649937
7   2040  65 AND OLDER     458806
30  2040         Total    2724853
1   2040      UNDER 18     616110
5   2050      

In [231]:
almost_done_df.to_csv(OUTPUT_TABLE_PATH)