# Create Racial Distributions of Demographic Data
### Author: Lane Hartwig 
### Date created: 4/10/2023
### Last updated: 4/16/2023

## Define functions

In [None]:
from decimal import Decimal

# Income bracket columns as the original column selection spelled them: several
# brackets carry a literal backslash before '$'. Raw strings keep those
# backslashes without relying on invalid escape sequences.
_INCOME_SOURCE_COLUMNS = [
    'Total', 'Less than $10,000',
    r'$10,000 to \$14,999', r'\$15,000 to \$19,999', r'$20,000 to \$24,999',
    r'$25,000 to \$29,999', r'$30,000 to \$34,999', r'$35,000 to \$39,999',
    r'$40,000 to \$44,999', r'$45,000 to \$49,999', r'$50,000 to \$59,999',
    r'$60,000 to \$74,999', r'$75,000 to \$99,999', r'$100,000 to \$124,999',
    r'$125,000 to \$149,999', r'$150,000 to \$199,999', '$200,000 or more',
]

# Aggregated education columns: (output label, detailed columns that are summed).
_EDUCATION_GROUPS = [
    ('Less than High School', [
        'Nursery school', 'Kindergarten', '1st grade', '2nd grade',
        '3rd grade', '4th grade', '5th grade', '6th grade', '7th grade',
        '8th grade',
    ]),
    ('High School, No Diploma', [
        '9th grade', '10th grade', '11th grade', '12th grade, no diploma',
    ]),
    ('High School/GED or alternative credential', [
        'Regular high school diploma', 'GED or alternative credential',
    ]),
    ('Some College, No Degree', [
        'Some college, less than 1 year',
        'Some college, 1 or more years, no degree',
    ]),
    ('Undergraduate Degree', ["Associate's degree", "Bachelor's degree"]),
    ('Graduate Degree', [
        "Master's degree", 'Professional school degree', 'Doctorate degree',
    ]),
]


def _area_names(frame):
    """Return one 'Block <block group>, Census Tract <tract>' label per row.

    The tract is passed through ``float`` and the ``g`` format so that
    numeric tracts and numeric strings produce the same label.
    """
    block_groups = frame['block group'].tolist()
    tracts = frame['tract'].tolist()
    return [
        'Block {}, Census Tract {}'.format(
            block_group, '{0:g}'.format(float(tract)))
        for block_group, tract in zip(block_groups, tracts)
    ]


def _racial_shares(counts):
    """Turn one ``racial_population_list`` entry into three rounded shares.

    Element 0 is the denominator; elements 1-3 are divided by it and rounded
    to two decimals (via ``'%.2f'``). A zero denominator yields all zeros.
    """
    values = [int(value) for value in counts]
    total = values[0]
    if total == 0:
        return [0.0, 0.0, 0.0]
    return [float('%.2f' % (value / total)) for value in values[1:4]]


def _attach_racial_shares(racial_pop, table):
    """Inner-join the per-area racial shares and geometry onto *table*.

    Rows are read positionally, so the result does not depend on the index
    labels of *racial_pop*, and *racial_pop* itself is never modified.
    """
    populations = pd.DataFrame({
        'racial_distribution': [
            _racial_shares(counts)
            for counts in racial_pop['racial_population_list']
        ],
        'area_name': _area_names(racial_pop),
        'geometry': list(racial_pop['geometry']),
    })
    return pd.merge(populations, table, how='inner', on=['area_name'])


def _geoid_lookup(bloc_bound):
    """Return an ``area_name`` -> ``geoid`` table built from *bloc_bound*."""
    return pd.DataFrame({
        'area_name': _area_names(bloc_bound),
        'geoid': bloc_bound['geoid'].tolist(),
    })


def _scale_by_shares(shares, counts):
    """Multiply each row's count by each of that row's racial shares."""
    return [
        [share * count for share in row_shares]
        for row_shares, count in zip(shares, counts)
    ]


def _sum_share_lists(share_lists):
    """Add several per-race lists element by element, left to right."""
    first, *rest = share_lists
    totals = list(first)
    for values in rest:
        totals = [running + value for running, value in zip(totals, values)]
    return totals


def _save_distribution(frame, path, stem, year):
    """Write *frame* to ``<path><stem><year>.csv`` and ``.pickle``."""
    frame.to_csv(path + '{}{}.csv'.format(stem, year), index=False)
    with open(path + '{}{}.pickle'.format(stem, year), 'wb') as file:
        pickle.dump(frame, file)


def createIncomeDistributions(racial_pop, house_income, bloc_bound, path, year):
    """Split household income counts across racial groups for each area.

    Parameters:
        racial_pop: frame with 'racial_population_list', 'block group',
            'tract' and 'geometry' columns.
        house_income: frame with an 'area_name' column plus the income
            bracket columns listed in ``_INCOME_SOURCE_COLUMNS``.
        bloc_bound: frame with 'block group', 'tract' and 'geoid' columns.
        path: string prefix the output file names are appended to.
        year: value inserted into the output file names.

    Returns:
        A pandas DataFrame with 'geoid', 'area_name', 'geometry' and one
        column per income bracket (backslashes removed from the names). Each
        bracket cell is a list of three floats: the bracket count multiplied
        by each racial share.

    Side effects:
        Writes ``incomeDistribution<year>.csv`` and
        ``incomeDistribution<year>.pickle`` under *path*.

    Raises:
        KeyError: if a required column is missing from an input frame.
    """
    merged = _attach_racial_shares(racial_pop, house_income)
    shares = merged['racial_distribution'].tolist()

    columns = {
        'area_name': merged['area_name'].tolist(),
        'geometry': merged['geometry'].tolist(),
    }
    output_names = []
    for source_name in _INCOME_SOURCE_COLUMNS:
        # the output drops the backslashes present in the source names
        output_name = source_name.replace('\\', '')
        output_names.append(output_name)
        columns[output_name] = _scale_by_shares(
            shares, merged[source_name].tolist())

    incomeDistribution = pd.merge(
        pd.DataFrame(columns), _geoid_lookup(bloc_bound),
        how='inner', on=['area_name'])
    incomeDistribution = incomeDistribution[
        ['geoid', 'area_name', 'geometry'] + output_names]

    _save_distribution(incomeDistribution, path, 'incomeDistribution', year)
    return incomeDistribution


def createEducationDistributions(racial_pop, education, bloc_bound, path, year):
    """Split educational attainment counts across racial groups for each area.

    Parameters:
        racial_pop: frame with 'racial_population_list', 'block group',
            'tract' and 'geometry' columns.
        education: frame with an 'area_name' column, 'No schooling completed'
            and the detailed columns listed in ``_EDUCATION_GROUPS``.
        bloc_bound: frame with 'block group', 'tract' and 'geoid' columns.
        path: string prefix the output file names are appended to.
        year: value inserted into the output file names.

    Returns:
        A pandas DataFrame with 'geoid', 'area_name', 'geometry',
        'No schooling completed' and the six aggregated attainment columns.
        Each attainment cell is a list of three floats (one per racial share).

    Side effects:
        Writes ``educationDistribution<year>.csv`` and
        ``educationDistribution<year>.pickle`` under *path*.

    Raises:
        KeyError: if a required column is missing from an input frame.
    """
    merged = _attach_racial_shares(racial_pop, education)
    shares = merged['racial_distribution'].tolist()

    def scaled(column):
        return _scale_by_shares(shares, merged[column].tolist())

    columns = {
        'area_name': merged['area_name'].tolist(),
        'geometry': merged['geometry'].tolist(),
        'No schooling completed': scaled('No schooling completed'),
    }
    group_labels = []
    for label, members in _EDUCATION_GROUPS:
        group_labels.append(label)
        member_lists = [scaled(member) for member in members]
        # Scale every detailed column first and add afterwards (in listed
        # order) so the floating-point results match the per-column sums.
        columns[label] = [
            _sum_share_lists(row_lists) for row_lists in zip(*member_lists)
        ]

    educationDistribution = pd.merge(
        pd.DataFrame(columns), _geoid_lookup(bloc_bound),
        how='inner', on=['area_name'])
    educationDistribution = educationDistribution[
        ['geoid', 'area_name', 'geometry', 'No schooling completed']
        + group_labels]

    _save_distribution(
        educationDistribution, path, 'educationDistribution', year)
    return educationDistribution

def splitEducationDistributions(educationDistribution, year, path=None):
    """Save one .csv/.pickle pair per educational attainment level.

    Each output holds 'geoid', 'area_name' and a single attainment column,
    and is named ``educationDistribution<letter><year>`` with letters A-G.

    Parameters:
        educationDistribution: frame containing 'geoid', 'area_name' and the
            seven attainment columns listed below.
        year: value inserted into the output file names.
        path: string prefix the output file names are appended to. When
            omitted, the module-level ``path`` variable is used, which is
            what this function relied on before the parameter existed.

    Raises:
        NameError: if *path* is omitted and no module-level ``path`` exists.
        KeyError: if a required column is missing.
    """
    if path is None:
        try:
            path = globals()['path']
        except KeyError:
            raise NameError("name 'path' is not defined") from None

    # file-name letter -> attainment column stored in that file
    levels = [
        ('A', 'No schooling completed'),
        ('B', 'Less than High School'),
        ('C', 'High School, No Diploma'),
        ('D', 'High School/GED or alternative credential'),
        ('E', 'Some College, No Degree'),
        ('F', 'Undergraduate Degree'),
        ('G', 'Graduate Degree'),
    ]
    subsets = [
        (letter, educationDistribution[['geoid', 'area_name', column]])
        for letter, column in levels
    ]

    # save to .csv
    for letter, subset in subsets:
        subset.to_csv(
            path + 'educationDistribution{}{}.csv'.format(letter, year),
            index=False)

    # save to .pickle
    for letter, subset in subsets:
        target = path + 'educationDistribution{}{}.pickle'.format(letter, year)
        with open(target, 'wb') as file:
            pickle.dump(subset, file)


## Household Income 

In [None]:
# load the cleaned 2022 household income table
householdIncome2022 = pd.read_csv(path + 'cleanHouseholdIncome2022.csv')

# split the income counts by racial share with createIncomeDistributions
# (the call also writes the .csv and .pickle outputs under `path`)
incomeDistribution2022 = createIncomeDistributions(
    racial_pop=racial_population_with_geometry,
    house_income=householdIncome2022,
    bloc_bound=Blockgroup_boundary,
    path=path,
    year='2022',
)

# show the first three rows
incomeDistribution2022.head(n=3)

In [None]:
# load the cleaned 2021 household income table
householdIncome2021 = pd.read_csv(path + 'cleanHouseholdIncome2021.csv')

# split the income counts by racial share with createIncomeDistributions
# (the call also writes the .csv and .pickle outputs under `path`)
incomeDistribution2021 = createIncomeDistributions(
    racial_pop=racial_population_with_geometry,
    house_income=householdIncome2021,
    bloc_bound=Blockgroup_boundary,
    path=path,
    year='2021',
)

# show the first three rows
incomeDistribution2021.head(n=3)

In [None]:
# load the cleaned 2020 household income table
householdIncome2020 = pd.read_csv(path + 'cleanHouseholdIncome2020.csv')

# split the income counts by racial share with createIncomeDistributions
# (the call also writes the .csv and .pickle outputs under `path`)
incomeDistribution2020 = createIncomeDistributions(
    racial_pop=racial_population_with_geometry,
    house_income=householdIncome2020,
    bloc_bound=Blockgroup_boundary,
    path=path,
    year='2020',
)

# show the first three rows
incomeDistribution2020.head(n=3)

In [None]:
# load the cleaned 2019 household income table
householdIncome2019 = pd.read_csv(path + 'cleanHouseholdIncome2019.csv')

# split the income counts by racial share with createIncomeDistributions
# (the call also writes the .csv and .pickle outputs under `path`)
incomeDistribution2019 = createIncomeDistributions(
    racial_pop=racial_population_with_geometry,
    house_income=householdIncome2019,
    bloc_bound=Blockgroup_boundary,
    path=path,
    year='2019',
)

# show the first three rows
incomeDistribution2019.head(n=3)

In [None]:
# load the cleaned 2018 household income table
householdIncome2018 = pd.read_csv(path + 'cleanHouseholdIncome2018.csv')

# split the income counts by racial share with createIncomeDistributions
# (the call also writes the .csv and .pickle outputs under `path`)
incomeDistribution2018 = createIncomeDistributions(
    racial_pop=racial_population_with_geometry,
    house_income=householdIncome2018,
    bloc_bound=Blockgroup_boundary,
    path=path,
    year='2018',
)

# show the first three rows
incomeDistribution2018.head(n=3)

In [None]:
# load the cleaned 2017 household income table
householdIncome2017 = pd.read_csv(path + 'cleanHouseholdIncome2017.csv')

# split the income counts by racial share with createIncomeDistributions
# (the call also writes the .csv and .pickle outputs under `path`)
incomeDistribution2017 = createIncomeDistributions(
    racial_pop=racial_population_with_geometry,
    house_income=householdIncome2017,
    bloc_bound=Blockgroup_boundary,
    path=path,
    year='2017',
)

# show the first three rows
incomeDistribution2017.head(n=3)

In [None]:
# load the cleaned 2016 household income table
householdIncome2016 = pd.read_csv(path + 'cleanHouseholdIncome2016.csv')

# split the income counts by racial share with createIncomeDistributions
# (the call also writes the .csv and .pickle outputs under `path`)
incomeDistribution2016 = createIncomeDistributions(
    racial_pop=racial_population_with_geometry,
    house_income=householdIncome2016,
    bloc_bound=Blockgroup_boundary,
    path=path,
    year='2016',
)

# show the first three rows
incomeDistribution2016.head(n=3)

## Educational Attainment

In [None]:
# load the cleaned 2022 educational attainment table
education2022 = pd.read_csv(path + 'cleanEducation2022.csv')

# split the attainment counts by racial share with createEducationDistributions
# (the call also writes the combined .csv and .pickle outputs under `path`)
educationDistribution2022 = createEducationDistributions(
    racial_pop=racial_population_with_geometry,
    education=education2022,
    bloc_bound=Blockgroup_boundary,
    path=path,
    year='2022',
)

# write one .csv/.pickle pair per attainment level
splitEducationDistributions(educationDistribution=educationDistribution2022, year='2022')

# show the first three rows
educationDistribution2022.head(n=3)

In [None]:
# load the cleaned 2021 educational attainment table
education2021 = pd.read_csv(path + 'cleanEducation2021.csv')

# split the attainment counts by racial share with createEducationDistributions
# (the call also writes the combined .csv and .pickle outputs under `path`)
educationDistribution2021 = createEducationDistributions(
    racial_pop=racial_population_with_geometry,
    education=education2021,
    bloc_bound=Blockgroup_boundary,
    path=path,
    year='2021',
)

# write one .csv/.pickle pair per attainment level
splitEducationDistributions(educationDistribution=educationDistribution2021, year='2021')

# show the first three rows
educationDistribution2021.head(n=3)

In [None]:
# load the cleaned 2020 educational attainment table
education2020 = pd.read_csv(path + 'cleanEducation2020.csv')

# split the attainment counts by racial share with createEducationDistributions
# (the call also writes the combined .csv and .pickle outputs under `path`)
educationDistribution2020 = createEducationDistributions(
    racial_pop=racial_population_with_geometry,
    education=education2020,
    bloc_bound=Blockgroup_boundary,
    path=path,
    year='2020',
)

# write one .csv/.pickle pair per attainment level
splitEducationDistributions(educationDistribution=educationDistribution2020, year='2020')

# show the first three rows
educationDistribution2020.head(n=3)

In [None]:
# load the cleaned 2019 educational attainment table
education2019 = pd.read_csv(path + 'cleanEducation2019.csv')

# split the attainment counts by racial share with createEducationDistributions
# (the call also writes the combined .csv and .pickle outputs under `path`)
educationDistribution2019 = createEducationDistributions(
    racial_pop=racial_population_with_geometry,
    education=education2019,
    bloc_bound=Blockgroup_boundary,
    path=path,
    year='2019',
)

# write one .csv/.pickle pair per attainment level
splitEducationDistributions(educationDistribution=educationDistribution2019, year='2019')

# show the first three rows
educationDistribution2019.head(n=3)

In [None]:
# load the cleaned 2018 educational attainment table
education2018 = pd.read_csv(path + 'cleanEducation2018.csv')

# split the attainment counts by racial share with createEducationDistributions
# (the call also writes the combined .csv and .pickle outputs under `path`)
educationDistribution2018 = createEducationDistributions(
    racial_pop=racial_population_with_geometry,
    education=education2018,
    bloc_bound=Blockgroup_boundary,
    path=path,
    year='2018',
)

# write one .csv/.pickle pair per attainment level
splitEducationDistributions(educationDistribution=educationDistribution2018, year='2018')

# show the first three rows
educationDistribution2018.head(n=3)

In [None]:
# load the cleaned 2017 educational attainment table
education2017 = pd.read_csv(path + 'cleanEducation2017.csv')

# split the attainment counts by racial share with createEducationDistributions
# (the call also writes the combined .csv and .pickle outputs under `path`)
educationDistribution2017 = createEducationDistributions(
    racial_pop=racial_population_with_geometry,
    education=education2017,
    bloc_bound=Blockgroup_boundary,
    path=path,
    year='2017',
)

# write one .csv/.pickle pair per attainment level
splitEducationDistributions(educationDistribution=educationDistribution2017, year='2017')

# show the first three rows
educationDistribution2017.head(n=3)

In [None]:
# load the cleaned 2016 educational attainment table
education2016 = pd.read_csv(path + 'cleanEducation2016.csv')

# split the attainment counts by racial share with createEducationDistributions
# (the call also writes the combined .csv and .pickle outputs under `path`)
educationDistribution2016 = createEducationDistributions(
    racial_pop=racial_population_with_geometry,
    education=education2016,
    bloc_bound=Blockgroup_boundary,
    path=path,
    year='2016',
)

# write one .csv/.pickle pair per attainment level
splitEducationDistributions(educationDistribution=educationDistribution2016, year='2016')

# show the first three rows
educationDistribution2016.head(n=3)