# Census Data for San Antonio Districts

In [1]:
import pandas as pd
import numpy as np
import os
import re
import math

## District Data

This sheet maps each census tract to the council district(s) it falls in and gives the percentage of the tract that lies within each district. District 0 denotes census tracts outside of San Antonio.

In [51]:
# Tract-to-district crosswalk: keep only the three columns used below, give
# them short names, and normalise the values in a single chain.
dist = (
    pd.read_excel('CMO_CDTractsPrecincts_220406.xlsx')
    .loc[:, ['Council District',
             'Tract 2020 ID\n(Use with 2020 Decennial Census)',
             'Percentage of Tract 2020 in Council District']]
    .set_axis(['council', 'tract', 'percent'], axis=1)
    # The first data row is discarded (presumably a sub-header/notes row in
    # the workbook -- TODO confirm against the spreadsheet).
    .iloc[1:]
    .reset_index(drop=True)
    .assign(
        # Tract ids are compared as strings when joined to the census data.
        tract=lambda frame: frame['tract'].astype(str),
        # Tracts flagged 'Outside CoSA' are recoded as district 0.
        council=lambda frame: np.where(frame['council'] == 'Outside CoSA',
                                       0,
                                       frame['council']),
    )
)
dist.head()

Unnamed: 0,council,tract,percent
0,1,110100,0.999534
1,1,110300,0.499153
2,1,110500,0.00015
3,1,110600,0.002611
4,1,110700,0.953113


## Demographic Data

This is the full demographic table pulled from the Census Bureau (file `ACSDP5Y2021.DP05-Data.csv`), with one row per census tract.

In [52]:
# Raw census export with lower-cased column names.
dataall = (
    pd.read_csv('ACSDP5Y2021.DP05-Data.csv')
    .rename(columns=str.lower)
    # The first data row is discarded (presumably the descriptive-label row
    # that Census exports place under the header -- TODO confirm).
    .iloc[1:]
    .reset_index(drop=True)
)

# NOTE: placeholder markers such as '(X)', '-' and '**' are NOT converted to
# NaN here, and all-empty columns are kept; the frame stays as loaded.

# Keep only the last six characters of the geography id so it can be matched
# against `dist.tract`; stored as a string.
dataall['geo_id'] = dataall['geo_id'].str[-6:].astype(str)

dataall.head()

Unnamed: 0,geo_id,name,dp05_0001e,dp05_0001ea,dp05_0001m,dp05_0001ma,dp05_0002e,dp05_0002m,dp05_0002ma,dp05_0002ea,...,dp05_0087pea,dp05_0088pe,dp05_0088pm,dp05_0088pma,dp05_0088pea,dp05_0089pe,dp05_0089pm,dp05_0089pma,dp05_0089pea,unnamed: 714
0,110100,"Census Tract 1101, Bexar County, Texas",2934,,565,,1698,365,,,...,,53.2,8.7,,,46.8,8.7,,,
1,110300,"Census Tract 1103, Bexar County, Texas",2930,,652,,1444,456,,,...,,49.8,6.8,,,50.2,6.8,,,
2,110500,"Census Tract 1105, Bexar County, Texas",2201,,309,,1030,198,,,...,,39.2,6.8,,,60.8,6.8,,,
3,110600,"Census Tract 1106, Bexar County, Texas",5384,,1620,,4117,1585,,,...,,77.6,8.5,,,22.4,8.5,,,
4,110700,"Census Tract 1107, Bexar County, Texas",982,,246,,525,148,,,...,,52.8,6.9,,,47.2,6.9,,,


## Labels

These are the human-readable labels that describe each column code in the census data.

In [58]:
# Column metadata: one row per census column code with its descriptive label.
labels = pd.read_csv('ACSDP5Y2021.DP05-Column-Metadata.csv')

# Normalise the header to snake_case. `regex=True` must be explicit: since
# pandas 2.0 `str.replace` treats the pattern as a literal string by default,
# which would leave 'column name' untouched and break the attribute access
# below. The raw string avoids the invalid '\W' escape warning.
labels.columns = labels.columns.str.lower().str.replace(r'\W+', '_', regex=True)
labels['column_name'] = labels['column_name'].str.lower()

# Keep estimate columns only and drop ratio measures. `na=False` treats a
# missing label as "no match" instead of producing an un-maskable NaN.
is_estimate = labels['label'].str.startswith('Estimate', na=False)
is_ratio = labels['label'].str.contains('ratio', na=False)
labels = labels[is_estimate & ~is_ratio]

## Combine it all together

In [59]:
def cal_counts(label, labels_df=None, data=None, districts=None):
    """Apportion one census indicator to council districts.

    Each tract's estimate and margin of error are scaled by the share of the
    tract that lies in a district, rounded to whole numbers, and then
    aggregated per district: counts are summed, margins of error are combined
    as the square root of the sum of squares.

    Parameters
    ----------
    label : str
        Census column stem without the trailing ``'e'`` / ``'m'`` suffix
        (e.g. ``'dp05_0001'``).
    labels_df : pandas.DataFrame, optional
        Metadata with ``column_name`` and ``label`` columns. Defaults to the
        notebook-level ``labels`` frame.
    data : pandas.DataFrame, optional
        Census table with ``geo_id`` plus the estimate/margin columns.
        Defaults to the notebook-level ``dataall`` frame.
    districts : pandas.DataFrame, optional
        Crosswalk with ``council``, ``tract`` and ``percent`` columns.
        Defaults to the notebook-level ``dist`` frame.

    Returns
    -------
    pandas.DataFrame
        One row per council district, sorted ascending. The index is named
        after the indicator title (the last ``!!``-separated part of the
        census label); columns are integer ``counts`` and ``moe``.

    Raises
    ------
    IndexError
        If no metadata row matches ``label + 'e'``.
    """
    # Fall back to the notebook-level frames when none are passed explicitly.
    labels_df = labels if labels_df is None else labels_df
    data = dataall if data is None else data
    districts = dist if districts is None else districts

    # Estimate and margin-of-error column names for this indicator.
    name_est = label + 'e'
    name_moa = label + 'm'

    # Title = last '!!'-separated component of the census label. Splitting
    # (rather than matching only word characters) also handles titles that
    # contain punctuation, e.g. 'Median age (years)'.
    matches = labels_df.loc[labels_df.column_name == name_est, 'label']
    if matches.empty:
        raise IndexError(f'no metadata label found for column {name_est!r}')
    label_title = matches.iloc[0].split('!!')[-1]

    # Isolate just this indicator and attach it to the tract/district rows.
    indicator = data[['geo_id', name_est, name_moa]].copy()
    df = districts.merge(indicator, how='inner', right_on='geo_id', left_on='tract')
    df = df.replace(np.nan, 0)

    # Tract values weighted by the share of the tract inside the district,
    # rounded per tract before aggregation.
    df['tru_count'] = (df[name_est].astype(float) * df.percent).round().astype(int)
    df['tru_error'] = (df[name_moa].astype(float) * df.percent).round().astype(int)

    # Per district: sum the counts, combine the margins as root-sum-of-squares.
    # int(round(...)) is used instead of round(...).astype(int) because
    # round() on a NumPy float returns a plain Python int on newer NumPy
    # releases, which has no .astype method.
    council_counts = []
    for council, tracts in df.groupby('council', sort=False):
        total = int(tracts.tru_count.sum())
        error = int(round(float((tracts.tru_error ** 2).sum()) ** 0.5))
        council_counts.append([council, total, error])

    # Convert to a frame (explicit columns so an empty result is still
    # well-formed), order by district, then label the columns.
    dff = (
        pd.DataFrame(council_counts, columns=[0, 1, 2])
        .astype({1: 'int64', 2: 'int64'})
        .sort_values(0)
    )
    dff.columns = [label_title, 'counts', 'moe']
    dff = dff.set_index(label_title, drop=True)

    return dff

In [60]:
grouping = 'SEX AND AGE'

# Number of trailing rows of the grouping left out of the roll-up
# (presumably summary rows that follow the age bands -- TODO confirm which).
N_TRAILING_LABELS_SKIPPED = 13

# Work on a separate frame instead of overwriting `labels`: re-running this
# cell (or changing `grouping`) then always starts from the full label table
# rather than trimming another 13 rows off an already-trimmed one.
# `cal_counts` looks labels up by exact column name, so it is unaffected by
# `labels` staying unfiltered. `regex=False` matches the grouping name
# literally, so names containing regex metacharacters also work.
labels_group = labels[labels.label.str.contains(grouping, regex=False)]
labels_group = labels_group.iloc[:-N_TRAILING_LABELS_SKIPPED]

# Column stems without the trailing 'e', as expected by `cal_counts`.
labels_check = labels_group.column_name.str[:-1]

for label in labels_group.label:
    print(label)

Estimate!!SEX AND AGE!!Total population
Estimate!!SEX AND AGE!!Total population!!Male
Estimate!!SEX AND AGE!!Total population!!Female
Estimate!!SEX AND AGE!!Total population!!Under 5 years
Estimate!!SEX AND AGE!!Total population!!5 to 9 years
Estimate!!SEX AND AGE!!Total population!!10 to 14 years
Estimate!!SEX AND AGE!!Total population!!15 to 19 years
Estimate!!SEX AND AGE!!Total population!!20 to 24 years
Estimate!!SEX AND AGE!!Total population!!25 to 34 years
Estimate!!SEX AND AGE!!Total population!!35 to 44 years
Estimate!!SEX AND AGE!!Total population!!45 to 54 years
Estimate!!SEX AND AGE!!Total population!!55 to 59 years
Estimate!!SEX AND AGE!!Total population!!60 to 64 years
Estimate!!SEX AND AGE!!Total population!!65 to 74 years
Estimate!!SEX AND AGE!!Total population!!75 to 84 years
Estimate!!SEX AND AGE!!Total population!!85 years and over


In [61]:
# Minimum number of rows in the per-district table (one per row position
# 0..10); used only as a scaffold so the table always has these rows.
N_DISTRICT_ROWS = 11

totals = []            # one [title, count, moe, share] record per label
district_frames = []   # one two-column frame per label, concatenated once below
total_count = None     # denominator for shares; set from the first label

for label in labels_check:
    dff = cal_counts(label)
    title = dff.index.name
    # To leave out the first row (presumably district 0, i.e. tracts outside
    # San Antonio), slice here with dff.iloc[1:].

    # Figures across all districts: counts add up, margins of error combine
    # as the square root of the sum of squares.
    count = dff.counts.sum()
    moe = round(math.sqrt((dff.moe ** 2).sum()))

    if total_count is None:
        # The first label processed supplies the denominator for every share.
        total_count, total_moa = count, moe

    totals.append([title, count, moe, round(count / total_count, 3)])

    # Rows are aligned by position (the district index is dropped), so row i
    # of every label lines up in the combined table.
    district_frames.append(
        dff.set_axis([title + ' counts', title + ' moe'], axis=1)
           .reset_index(drop=True)
    )

df_total = pd.DataFrame(totals, columns=['title','count','moe','percent'])

# Concatenate once instead of growing the frame inside the loop; the scaffold
# column 0 only guarantees the row count and is dropped afterwards.
district_totals = pd.concat(
    [pd.DataFrame(np.arange(N_DISTRICT_ROWS)), *district_frames],
    axis=1,
).drop(columns=0)

In [62]:
# Summary across all districts: one row per label with its summed count,
# combined margin of error, and share of the first label's count.
df_total

Unnamed: 0,title,count,moe,percent
0,Total population,1990520,16649,1.0
1,Male,987301,10018,0.496
2,Female,1003215,9721,0.504
3,Under 5 years,136261,3872,0.068
4,5 to 9 years,140769,3659,0.071
5,10 to 14 years,145909,3717,0.073
6,15 to 19 years,144841,3870,0.073
7,20 to 24 years,143891,4160,0.072
8,25 to 34 years,312827,5795,0.157
9,35 to 44 years,276262,5073,0.139


In [1]:
# district_totals.to_excel('CCD_test.xlsx')