# Census Data Preparation

**Tasks:**
* Maybe delete total population from education and final dataframe

In [None]:
import os
import zipfile

# Unzip all files
path = "drive/MyDrive/US Elections/Census Data"

for folder in os.listdir(path):
  if folder.endswith("2020"):
    folder_path = f"{path}/{folder}"
    # Pick the archives by extension instead of taking the first directory entry:
    # os.listdir() returns entries in arbitrary order, and once a folder has been
    # extracted its first entry may be a CSV, which makes a re-run of this cell fail.
    for file_name in sorted(os.listdir(folder_path)):
      if file_name.lower().endswith(".zip"):
        with zipfile.ZipFile(f"{folder_path}/{file_name}", "r") as zip_ref:
          zip_ref.extractall(folder_path)

## Educational Attainment USA

The end goal is to have a dataframe which shows the percentage of people having a certain educational attainment within a given district.

Steps:
1. Clean redundant columns✅
2. Summarize education into categories *(below highschool, highschool, college or associate and bachelor or higher)*
3. Include state column to better join them later with other dataframes✅
4. Clean district names so that they match the ones of my target variable dataframe✅
5. Calculate percentage values✅
6. Drop all districts which are not available in the target variable dataframe✅

In [None]:
import pandas as pd
import numpy as np

In [None]:
# The first data row of the export holds text labels while the rows below hold numbers.
# low_memory=False lets pandas infer each column's type from the whole file at once
# instead of chunk by chunk, which avoids the mixed-type DtypeWarning on import.
education = pd.read_csv("drive/MyDrive/US Elections/Census Data/Educational Attainment 2020/educational_attainment_2020.csv",
                        low_memory=False)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
education.head()

Unnamed: 0,GEO_ID,NAME,S1501_C01_001E,S1501_C01_001M,S1501_C01_002E,S1501_C01_002M,S1501_C01_003E,S1501_C01_003M,S1501_C01_004E,S1501_C01_004M,...,S1501_C06_060E,S1501_C06_060M,S1501_C06_061E,S1501_C06_061M,S1501_C06_062E,S1501_C06_062M,S1501_C06_063E,S1501_C06_063M,S1501_C06_064E,S1501_C06_064M
0,id,Geographic Area Name,Estimate!!Total!!AGE BY EDUCATIONAL ATTAINMENT...,Margin of Error!!Total!!AGE BY EDUCATIONAL ATT...,Estimate!!Total!!AGE BY EDUCATIONAL ATTAINMENT...,Margin of Error!!Total!!AGE BY EDUCATIONAL ATT...,Estimate!!Total!!AGE BY EDUCATIONAL ATTAINMENT...,Margin of Error!!Total!!AGE BY EDUCATIONAL ATT...,Estimate!!Total!!AGE BY EDUCATIONAL ATTAINMENT...,Margin of Error!!Total!!AGE BY EDUCATIONAL ATT...,...,Estimate!!Percent Female!!MEDIAN EARNINGS IN T...,Margin of Error!!Percent Female!!MEDIAN EARNIN...,Estimate!!Percent Female!!MEDIAN EARNINGS IN T...,Margin of Error!!Percent Female!!MEDIAN EARNIN...,Estimate!!Percent Female!!MEDIAN EARNINGS IN T...,Margin of Error!!Percent Female!!MEDIAN EARNIN...,Estimate!!Percent Female!!MEDIAN EARNINGS IN T...,Margin of Error!!Percent Female!!MEDIAN EARNIN...,Estimate!!Percent Female!!MEDIAN EARNINGS IN T...,Margin of Error!!Percent Female!!MEDIAN EARNIN...
1,620L600US02001,"State House District 1 (2018), Alaska",1873,355,146,76,745,215,789,237,...,(X),(X),(X),(X),(X),(X),(X),(X),(X),(X)
2,620L600US02002,"State House District 2 (2018), Alaska",4260,594,71,60,2149,438,1912,381,...,(X),(X),(X),(X),(X),(X),(X),(X),(X),(X)
3,620L600US02003,"State House District 3 (2018), Alaska",1656,388,293,141,647,284,631,240,...,(X),(X),(X),(X),(X),(X),(X),(X),(X),(X)
4,620L600US02004,"State House District 4 (2018), Alaska",1506,471,113,97,623,363,667,234,...,(X),(X),(X),(X),(X),(X),(X),(X),(X),(X)


In [None]:
education.shape

(4054, 770)

#### First Cleaning

In [None]:
def first_clean(df):
  """
  Reduce a raw Census Bureau DataFrame to its essential information.

  The DataFrame is modified in place and the same object is returned:
  the "GEO_ID" column is removed, the first row (descriptive labels) becomes the header,
  "Geographic Area Name" is renamed to "district", all margin-of-error columns are removed
  and the column names are lowercased.
  Further cleaning (e.g. selecting columns) has to be done by hand afterwards!
  """
  # The id column is not needed
  df.drop(columns="GEO_ID", inplace=True)

  # The row with index label 0 carries the descriptive labels: use it as header, then remove it
  header = df.iloc[0, :]
  df.columns = header
  df.drop(index=0, inplace=True)

  # The geographic name is the district column
  df.rename(columns={"Geographic Area Name": "district"}, inplace=True)

  # Remove every column which shows a margin of error
  margin_columns = list(filter(lambda label: "Margin" in label, df.columns))
  df.drop(columns=margin_columns, inplace=True)

  # Lowercase all column names
  df.columns = list(map(str.lower, df.columns))
  return df

In [None]:
education = first_clean(education)
education.shape

(4053, 385)

#### DataFrame dependent cleaning

In [None]:
def delete_male_female(df):
  """
  Remove every column whose name contains "male" or "female".

  The DataFrame is modified in place and the same object is returned.
  """
  gender_columns = df.columns[df.columns.str.contains("male|female")]
  df.drop(columns=gender_columns, inplace=True)
  return df

In [None]:
education = delete_male_female(education)
education.shape

(4053, 129)

In [None]:
# Just need columns with 18-24 and 25 or higher -> no more cohorts needed
# .copy() turns the selection into an independent DataFrame, so the column
# assignments in the following cells do not raise SettingWithCopyWarning
education = education.iloc[:, :14].copy()

In [None]:
education.head()

Unnamed: 0,district,estimate!!total!!age by educational attainment!!population 18 to 24 years,estimate!!total!!age by educational attainment!!population 18 to 24 years!!less than high school graduate,estimate!!total!!age by educational attainment!!population 18 to 24 years!!high school graduate (includes equivalency),estimate!!total!!age by educational attainment!!population 18 to 24 years!!some college or associate's degree,estimate!!total!!age by educational attainment!!population 18 to 24 years!!bachelor's degree or higher,estimate!!total!!age by educational attainment!!population 25 years and over,estimate!!total!!age by educational attainment!!population 25 years and over!!less than 9th grade,"estimate!!total!!age by educational attainment!!population 25 years and over!!9th to 12th grade, no diploma",estimate!!total!!age by educational attainment!!population 25 years and over!!high school graduate (includes equivalency),"estimate!!total!!age by educational attainment!!population 25 years and over!!some college, no degree",estimate!!total!!age by educational attainment!!population 25 years and over!!associate's degree,estimate!!total!!age by educational attainment!!population 25 years and over!!bachelor's degree,estimate!!total!!age by educational attainment!!population 25 years and over!!graduate or professional degree
1,"State House District 1 (2018), Alaska",1873,146,745,789,193,11068,403,510,3217,3283,860,1803,992
2,"State House District 2 (2018), Alaska",4260,71,2149,1912,128,9157,222,208,2760,2325,1336,1476,830
3,"State House District 3 (2018), Alaska",1656,293,647,631,85,12242,246,597,2862,3543,1790,1937,1267
4,"State House District 4 (2018), Alaska",1506,113,623,667,103,12927,362,234,2336,3415,834,3224,2522
5,"State House District 5 (2018), Alaska",2481,39,622,1587,233,10768,87,408,2334,3328,591,2008,2012


In [None]:
# Assign the census columns to the four attainment categories by keyword matching on the column name
less_than_highschool = [
  col for col in education.columns
  if any(key in col for key in ("less than high school graduate", "less than 9th grade", "9th to 12th grade"))
]
highschool = [col for col in education.columns if "high school graduate (includes equivalency)" in col]
college_associate = [
  col for col in education.columns
  if any(key in col for key in ("some college", "associate's degree"))
]
bachelor_higher = [
  col for col in education.columns
  if any(key in col for key in ("bachelor's degree", "graduate or professional degree"))
]

In [None]:
# Every column after "district" holds counts -> convert them to integers
count_columns = education.columns[1:]
education[count_columns] = education[count_columns].astype("int")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [None]:
education.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4053 entries, 1 to 4053
Data columns (total 14 columns):
 #   Column                                                                                                                     Non-Null Count  Dtype 
---  ------                                                                                                                     --------------  ----- 
 0   district                                                                                                                   4053 non-null   object
 1   estimate!!total!!age by educational attainment!!population 18 to 24 years                                                  4053 non-null   int64 
 2   estimate!!total!!age by educational attainment!!population 18 to 24 years!!less than high school graduate                  4053 non-null   int64 
 3   estimate!!total!!age by educational attainment!!population 18 to 24 years!!high school graduate (includes equivalency)     4053 non-n

In [None]:
# Sum the detailed census columns of both age cohorts into the four summary categories
education_groups = {
  "less than highschool": less_than_highschool,
  "highschool": highschool,
  "college or associate": college_associate,
  "bachelor or higher": bachelor_higher,
}
for group_name, group_columns in education_groups.items():
  education[group_name] = education[group_columns].sum(axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using

In [None]:
education.head()

Unnamed: 0,district,estimate!!total!!age by educational attainment!!population 18 to 24 years,estimate!!total!!age by educational attainment!!population 18 to 24 years!!less than high school graduate,estimate!!total!!age by educational attainment!!population 18 to 24 years!!high school graduate (includes equivalency),estimate!!total!!age by educational attainment!!population 18 to 24 years!!some college or associate's degree,estimate!!total!!age by educational attainment!!population 18 to 24 years!!bachelor's degree or higher,estimate!!total!!age by educational attainment!!population 25 years and over,estimate!!total!!age by educational attainment!!population 25 years and over!!less than 9th grade,"estimate!!total!!age by educational attainment!!population 25 years and over!!9th to 12th grade, no diploma",estimate!!total!!age by educational attainment!!population 25 years and over!!high school graduate (includes equivalency),"estimate!!total!!age by educational attainment!!population 25 years and over!!some college, no degree",estimate!!total!!age by educational attainment!!population 25 years and over!!associate's degree,estimate!!total!!age by educational attainment!!population 25 years and over!!bachelor's degree,estimate!!total!!age by educational attainment!!population 25 years and over!!graduate or professional degree,less than highschool,highschool,college or associate,bachelor or higher
1,"State House District 1 (2018), Alaska",1873,146,745,789,193,11068,403,510,3217,3283,860,1803,992,1059,3962,4932,2988
2,"State House District 2 (2018), Alaska",4260,71,2149,1912,128,9157,222,208,2760,2325,1336,1476,830,501,4909,5573,2434
3,"State House District 3 (2018), Alaska",1656,293,647,631,85,12242,246,597,2862,3543,1790,1937,1267,1136,3509,5964,3289
4,"State House District 4 (2018), Alaska",1506,113,623,667,103,12927,362,234,2336,3415,834,3224,2522,709,2959,4916,5849
5,"State House District 5 (2018), Alaska",2481,39,622,1587,233,10768,87,408,2334,3328,591,2008,2012,534,2956,5506,4253


In [None]:
education.shape

(4053, 18)

In [None]:
# Delete the 13 detailed census columns (positions 1-13); district and the summary categories remain
detail_columns = education.columns[1:14]
education = education.drop(columns=detail_columns)
education.head()

Unnamed: 0,district,less than highschool,highschool,college or associate,bachelor or higher
1,"State House District 1 (2018), Alaska",1059,3962,4932,2988
2,"State House District 2 (2018), Alaska",501,4909,5573,2434
3,"State House District 3 (2018), Alaska",1136,3509,5964,3289
4,"State House District 4 (2018), Alaska",709,2959,4916,5849
5,"State House District 5 (2018), Alaska",534,2956,5506,4253


In [None]:
education.head()

Unnamed: 0,district,less than highschool,highschool,college or associate,bachelor or higher
1,"State House District 1 (2018), Alaska",1059,3962,4932,2988
2,"State House District 2 (2018), Alaska",501,4909,5573,2434
3,"State House District 3 (2018), Alaska",1136,3509,5964,3289
4,"State House District 4 (2018), Alaska",709,2959,4916,5849
5,"State House District 5 (2018), Alaska",534,2956,5506,4253


#### Final Cleaning and District Dropping

In [None]:
def clean_census_data(df, percentage=True):
  """
  Final cleaning of a Census DataFrame: short district names, a "state" column and numeric features.

  Expects "district" as the first column, in the form "<district description>, <state>", followed only
  by feature columns. The "state" column is appended as last column. The passed DataFrame is not
  modified; a cleaned copy is returned.

  Parameters:
    df: DataFrame with the "district" column first and the feature columns after it.
    percentage: If True (default), every feature is divided by the row sum of all features and rounded
      to 4 decimals. If False, the features are only converted to float.

  Returns:
    Cleaned DataFrame without "not defined" districts and without New Hampshire.
  """
  # Work on a copy: the caller's DataFrame stays untouched and slices of other DataFrames
  # do not raise SettingWithCopyWarning
  df = df.copy()

  # Change strings to lower case
  df["district"] = df["district"].str.lower()

  # Clear district names
  # Delete erroneous instances
  df = df[~df["district"].str.contains("not defined", na=False)].copy()

  # Replace ; in massachusetts with ,
  # This has to happen before the state is extracted, otherwise this district gets the state
  # "nantucket district (2016); massachusetts" and is skipped by the Massachusetts cleaning below
  df.loc[df["district"] == "barnstable, dukes & nantucket district (2016); massachusetts",
         "district"] = "barnstable, dukes & nantucket district (2016), massachusetts"

  # Get states out of district column (the part after the last ", ")
  df["state"] = df["district"].str.split(", ").str[-1]

  # All states except Massachusetts and Vermont: keep the first token after "district "
  numbered = ~df["district"].str.contains("massachusetts|vermont", na=False)
  df.loc[numbered, "district"] = df.loc[numbered, "district"].str.split("district ").str[1].str.split().str[0]

  # Massachusetts: keep everything before " district"
  in_massachusetts = df["state"] == "massachusetts"
  df.loc[in_massachusetts, "district"] = df.loc[in_massachusetts, "district"].str.split(" district").str[0]

  # Vermont: keep everything before " state"
  in_vermont = df["state"] == "vermont"
  df.loc[in_vermont, "district"] = df.loc[in_vermont, "district"].str.split(" state").str[0]

  # Minor cleaning for South Carolina
  in_south_carolina = df["state"] == "south carolina"
  if in_south_carolina.any():
    # Keep the part after the last "-" and remove the zero padding.
    # NOTE(review): the "00" split is applied to the first 10 South Carolina rows only, so this
    # step depends on the row order of the input - confirm against the source file
    southcarolina_district = df.loc[in_south_carolina, "district"].apply(lambda x: x.split("-")[-1])
    southcarolina_district.iloc[:10] = southcarolina_district.iloc[:10].apply(lambda x: x.split("00")[-1])
    leading_zero = southcarolina_district.str.startswith("0")
    southcarolina_district[leading_zero] = southcarolina_district[leading_zero].str[1:]
    df.loc[in_south_carolina, "district"] = southcarolina_district

  # Delete all instances of New Hampshire -> in target dataframe not available
  df = df[df["state"] != "new hampshire"]

  # Change data types to numeric
  # Convert by column label: a positional df.iloc[:, 1:-1] = ... assignment writes into the
  # existing columns and can leave them with dtype object
  feature_columns = df.columns[1:-1]
  df = df.astype({col: "float" for col in feature_columns})

  if percentage and len(feature_columns) > 0:
    # Calculate percentage values of features
    total_population = df[feature_columns].sum(axis=1)
    df[feature_columns] = df[feature_columns].div(total_population, axis=0).round(4)

  return df

In [None]:
education = clean_census_data(education)

In [None]:
education.head()

Unnamed: 0,district,less than highschool,highschool,college or associate,bachelor or higher,state
1,1,0.0818,0.3062,0.3811,0.2309,alaska
2,2,0.0373,0.3659,0.4154,0.1814,alaska
3,3,0.0817,0.2525,0.4291,0.2367,alaska
4,4,0.0491,0.205,0.3406,0.4053,alaska
5,5,0.0403,0.2231,0.4156,0.321,alaska


In [None]:
import pandas as pd
target_df = pd.read_csv("drive/MyDrive/US Elections/data_target_2020.csv")
target_df.head()

Unnamed: 0,state,district,office,year,target
0,alaska,1,state house,2020,0
1,alaska,10,state house,2020,0
2,alaska,11,state house,2020,0
3,alaska,12,state house,2020,0
4,alaska,13,state house,2020,0


In [None]:
def check_states_districts(target_df, check_df):
  """
  Function to check for missing districts in both the target and the check dataframe.

  Only states which appear in both dataframes are compared; states of the target dataframe
  which do not appear in the check dataframe at all are skipped.

  Parameters:
    target_df: DataFrame with "state" and "district" columns (target variable).
    check_df: DataFrame with "state" and "district" columns which is compared to the target.

  Returns:
    Tuple (missing_in_check, missing_in_target) of lists with [state, district] entries:
    districts which are in target_df but not in check_df, and districts which are in check_df
    but not in target_df.
  """
  # Values which are in target df, but miss in new dataframe
  missing_in_check = []
  # Values which are in new dataframe, but miss in target df
  missing_in_target = []
  states_in_check = set(check_df["state"])
  for state in target_df["state"].sort_values().unique():
    if state not in states_in_check:
      continue
    target_district = target_df.loc[target_df["state"] == state, "district"]
    check_district = check_df.loc[check_df["state"] == state, "district"]
    # Always compare by membership. A positional comparison of both columns stops at the shorter
    # one, so additional districts at the end of the check dataframe would not be reported.
    target_lookup = set(target_district)
    check_lookup = set(check_district)
    missing_in_check.extend([state, district] for district in target_district if district not in check_lookup)
    missing_in_target.extend([state, district] for district in check_district if district not in target_lookup)
  return missing_in_check, missing_in_target

In [None]:
def drop_districts(df, missing_in_target):
  """
  Function to drop districts from the check_df which don't appear in the target_df.

  Only districts of hawaii, oklahoma and vermont are dropped; [state, district] entries of
  other states are ignored. Returns a new DataFrame, the passed one is not modified.
  """
  handled_states = ("hawaii", "oklahoma", "vermont")

  # Collect the districts to delete per handled state
  to_drop = {state: [] for state in handled_states}
  for state, district in missing_in_target:
    if state in to_drop:
      to_drop[state].append(district)

  # Delete districts state by state
  for state in handled_states:
    spare_rows = df[(df["state"] == state) & (df["district"].isin(to_drop[state]))]
    df = df.drop(spare_rows.index)

  return df

In [None]:
def drop_spare_districts(target_df, df):
  """
  Compare df with target_df and drop the districts of df which are missing in target_df.

  If target_df contains districts which are missing in df, an error message is printed and
  df is returned unchanged.
  """
  missing_in_check, missing_in_target = check_states_districts(target_df, df)
  if missing_in_check:
    print("Error: There should be no districts in target dataframe which are missing in the Census dataframe!")
    return df
  return drop_districts(df, missing_in_target)

In [None]:
education = drop_spare_districts(target_df, education)

#### Reset Index and Save DataFrame

In [None]:
def reset_and_save(df, df_name:str):
  """
  Reset the index of df, store df_name as its name attribute and save it as
  "<df_name>_2020_final.csv" in the "Final Files" folder. Returns the DataFrame with the new index.
  """
  final_files_dir = "drive/MyDrive/US Elections/Census Data/Final Files/"
  df = df.reset_index(drop=True)
  df.name = df_name
  file_path = final_files_dir + df.name + "_2020_final.csv"
  df.to_csv(file_path, index=False)
  return df

In [None]:
# Reset index so that it's possible to merge the dataframes on the indices
education = reset_and_save(education, "educational_attainment")
education.tail()

Unnamed: 0,district,less than highschool,highschool,college or associate,bachelor or higher,state
3755,56,0.0524,0.349,0.3895,0.2092,wyoming
3756,57,0.0566,0.269,0.4145,0.2599,wyoming
3757,58,0.1063,0.3804,0.4199,0.0933,wyoming
3758,59,0.0898,0.3443,0.4454,0.1204,wyoming
3759,60,0.0518,0.3158,0.4263,0.2061,wyoming


## Occupation

The end goal is to have a dataframe which divides the occupations into different subcategories and shows for every district how many people work in which sector.

Steps:
1. Clean redundant columns✅
2. Divide occupations into categories✅
3. Include state column to better join them later with other dataframes✅
4. Clean district names so that they match the ones of my target variable dataframe✅
5. Calculate percentage values✅
6. Drop all districts which are not available in the target variable dataframe✅

In [None]:
import pandas as pd
# The first data row of the export holds text labels while the rows below hold numbers.
# low_memory=False lets pandas infer each column's type from the whole file at once
# instead of chunk by chunk, which avoids the mixed-type DtypeWarning on import.
occupation = pd.read_csv("drive/MyDrive/US Elections/Census Data/Occupation 2020/occupation_2020.csv",
                         low_memory=False)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
occupation.head(3)

Unnamed: 0,GEO_ID,NAME,S2401_C01_001E,S2401_C01_001M,S2401_C01_002E,S2401_C01_002M,S2401_C01_003E,S2401_C01_003M,S2401_C01_004E,S2401_C01_004M,...,S2401_C05_032E,S2401_C05_032M,S2401_C05_033E,S2401_C05_033M,S2401_C05_034E,S2401_C05_034M,S2401_C05_035E,S2401_C05_035M,S2401_C05_036E,S2401_C05_036M
0,id,Geographic Area Name,Estimate!!Total!!Civilian employed population ...,Margin of Error!!Total!!Civilian employed popu...,Estimate!!Total!!Civilian employed population ...,Margin of Error!!Total!!Civilian employed popu...,Estimate!!Total!!Civilian employed population ...,Margin of Error!!Total!!Civilian employed popu...,Estimate!!Total!!Civilian employed population ...,Margin of Error!!Total!!Civilian employed popu...,...,Estimate!!Percent Female!!Civilian employed po...,Margin of Error!!Percent Female!!Civilian empl...,Estimate!!Percent Female!!Civilian employed po...,Margin of Error!!Percent Female!!Civilian empl...,Estimate!!Percent Female!!Civilian employed po...,Margin of Error!!Percent Female!!Civilian empl...,Estimate!!Percent Female!!Civilian employed po...,Margin of Error!!Percent Female!!Civilian empl...,Estimate!!Percent Female!!Civilian employed po...,Margin of Error!!Percent Female!!Civilian empl...
1,620L600US02001,"State House District 1 (2018), Alaska",7638,626,2205,419,855,287,566,233,...,0.0,7.6,9.4,6.0,14.1,13.9,7.3,8.4,7.4,7.4
2,620L600US02002,"State House District 2 (2018), Alaska",5678,634,1965,365,481,141,367,134,...,6.8,12.6,18.0,12.6,10.8,16.1,17.3,19.1,21.7,21.6


#### First Cleaning

In [None]:
occupation = first_clean(occupation)
occupation.shape

(4053, 181)

#### DataFrame dependent cleaning

In [None]:
occupation.head(1)

Unnamed: 0,district,estimate!!total!!civilian employed population 16 years and over,"estimate!!total!!civilian employed population 16 years and over!!management, business, science, and arts occupations:","estimate!!total!!civilian employed population 16 years and over!!management, business, science, and arts occupations:!!management, business, and financial occupations:","estimate!!total!!civilian employed population 16 years and over!!management, business, science, and arts occupations:!!management, business, and financial occupations:!!management occupations","estimate!!total!!civilian employed population 16 years and over!!management, business, science, and arts occupations:!!management, business, and financial occupations:!!business and financial operations occupations","estimate!!total!!civilian employed population 16 years and over!!management, business, science, and arts occupations:!!computer, engineering, and science occupations:","estimate!!total!!civilian employed population 16 years and over!!management, business, science, and arts occupations:!!computer, engineering, and science occupations:!!computer and mathematical occupations","estimate!!total!!civilian employed population 16 years and over!!management, business, science, and arts occupations:!!computer, engineering, and science occupations:!!architecture and engineering occupations","estimate!!total!!civilian employed population 16 years and over!!management, business, science, and arts occupations:!!computer, engineering, and science occupations:!!life, physical, and social science occupations",...,estimate!!percent female!!civilian employed population 16 years and over!!sales and office occupations:!!sales and related occupations,estimate!!percent female!!civilian employed population 16 years and over!!sales and office occupations:!!office and administrative support occupations,"estimate!!percent female!!civilian employed population 16 years and over!!natural resources, construction, and 
maintenance occupations:","estimate!!percent female!!civilian employed population 16 years and over!!natural resources, construction, and maintenance occupations:!!farming, fishing, and forestry occupations","estimate!!percent female!!civilian employed population 16 years and over!!natural resources, construction, and maintenance occupations:!!construction and extraction occupations","estimate!!percent female!!civilian employed population 16 years and over!!natural resources, construction, and maintenance occupations:!!installation, maintenance, and repair occupations","estimate!!percent female!!civilian employed population 16 years and over!!production, transportation, and material moving occupations:","estimate!!percent female!!civilian employed population 16 years and over!!production, transportation, and material moving occupations:!!production occupations","estimate!!percent female!!civilian employed population 16 years and over!!production, transportation, and material moving occupations:!!transportation occupations","estimate!!percent female!!civilian employed population 16 years and over!!production, transportation, and material moving occupations:!!material moving occupations"
1,"State House District 1 (2018), Alaska",7638,2205,855,566,289,372,153,141,78,...,53.8,70.4,3.1,8.3,3.7,0.0,9.4,14.1,7.3,7.4


In [None]:
occupation = delete_male_female(occupation)
occupation.shape

(4053, 37)

In [None]:
# Remove the shared "estimate!!total!!" prefix from the column names
occupation.columns = [label.rsplit("estimate!!total!!")[-1] for label in occupation.columns]

In [None]:
occupation.head()

Unnamed: 0,district,civilian employed population 16 years and over,"civilian employed population 16 years and over!!management, business, science, and arts occupations:","civilian employed population 16 years and over!!management, business, science, and arts occupations:!!management, business, and financial occupations:","civilian employed population 16 years and over!!management, business, science, and arts occupations:!!management, business, and financial occupations:!!management occupations","civilian employed population 16 years and over!!management, business, science, and arts occupations:!!management, business, and financial occupations:!!business and financial operations occupations","civilian employed population 16 years and over!!management, business, science, and arts occupations:!!computer, engineering, and science occupations:","civilian employed population 16 years and over!!management, business, science, and arts occupations:!!computer, engineering, and science occupations:!!computer and mathematical occupations","civilian employed population 16 years and over!!management, business, science, and arts occupations:!!computer, engineering, and science occupations:!!architecture and engineering occupations","civilian employed population 16 years and over!!management, business, science, and arts occupations:!!computer, engineering, and science occupations:!!life, physical, and social science occupations",...,civilian employed population 16 years and over!!sales and office occupations:!!sales and related occupations,civilian employed population 16 years and over!!sales and office occupations:!!office and administrative support occupations,"civilian employed population 16 years and over!!natural resources, construction, and maintenance occupations:","civilian employed population 16 years and over!!natural resources, construction, and maintenance occupations:!!farming, fishing, and forestry occupations","civilian employed population 16 years and over!!natural 
resources, construction, and maintenance occupations:!!construction and extraction occupations","civilian employed population 16 years and over!!natural resources, construction, and maintenance occupations:!!installation, maintenance, and repair occupations","civilian employed population 16 years and over!!production, transportation, and material moving occupations:","civilian employed population 16 years and over!!production, transportation, and material moving occupations:!!production occupations","civilian employed population 16 years and over!!production, transportation, and material moving occupations:!!transportation occupations","civilian employed population 16 years and over!!production, transportation, and material moving occupations:!!material moving occupations"
1,"State House District 1 (2018), Alaska",7638,2205,855,566,289,372,153,141,78,...,818,1140,925,121,518,286,1152,348,383,421
2,"State House District 2 (2018), Alaska",5678,1965,481,367,114,251,71,158,22,...,613,660,684,23,280,381,634,130,214,290
3,"State House District 3 (2018), Alaska",8921,3064,1165,828,337,302,66,135,101,...,754,1211,1412,35,771,606,1054,144,542,368
4,"State House District 4 (2018), Alaska",10060,4378,1285,773,512,829,197,259,373,...,589,1165,1354,45,638,671,755,178,543,34
5,"State House District 5 (2018), Alaska",8580,3771,848,569,279,817,175,251,391,...,711,877,794,59,439,296,909,252,391,266


In [None]:
# Keep the district and five of the columns whose name ends with ":" (the occupation groups)
remaining_columns = occupation.columns[occupation.columns.str.endswith(":")][[0, 5, 7, 8, 9]].tolist()
remaining_columns = ["district", *remaining_columns]
# .copy() turns the selection into an independent DataFrame, so the assignments
# in the following cells do not raise SettingWithCopyWarning
occupation = occupation[remaining_columns].copy()
occupation.head(1)

Unnamed: 0,district,"civilian employed population 16 years and over!!management, business, science, and arts occupations:",civilian employed population 16 years and over!!service occupations:,civilian employed population 16 years and over!!sales and office occupations:,"civilian employed population 16 years and over!!natural resources, construction, and maintenance occupations:","civilian employed population 16 years and over!!production, transportation, and material moving occupations:"
1,"State House District 1 (2018), Alaska",2205,1398,1958,925,1152


In [None]:
# Keep only the last "!!"-separated part of every column name
occupation.columns = [label.split("!!")[-1] for label in occupation.columns]

In [None]:
# Convert every column after "district" to numbers.
# Assign by column label so that the converted columns replace the old ones; a positional
# .iloc assignment writes into the existing columns and can leave them with dtype object
numeric_columns = occupation.columns[1:]
occupation[numeric_columns] = occupation[numeric_columns].apply(pd.to_numeric)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


In [None]:
# Sum the two blue collar groups by name. A positional iloc[:, -2:] would include
# "blue collar occupations" itself as soon as this cell is run a second time
blue_collar_groups = ["natural resources, construction, and maintenance occupations:",
                      "production, transportation, and material moving occupations:"]
occupation["blue collar occupations"] = occupation[blue_collar_groups].sum(axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
occupation.columns

Index(['district', 'management, business, science, and arts occupations:',
       'service occupations:', 'sales and office occupations:',
       'natural resources, construction, and maintenance occupations:',
       'production, transportation, and material moving occupations:',
       'blue collar occupations'],
      dtype='object')

In [None]:
# Delete the two groups which were merged into "blue collar occupations"
to_delete = [
  "natural resources, construction, and maintenance occupations:",
  "production, transportation, and material moving occupations:",
]
occupation.drop(columns=to_delete, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [None]:
# Remove every ":" from the column names
occupation.columns = [label.replace(":", "") for label in occupation.columns]

#### Final Cleaning and District Dropping

In [None]:
occupation = clean_census_data(occupation)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/u

In [None]:
occupation.head()

Unnamed: 0,district,"management, business, science, and arts occupations",service occupations,sales and office occupations,blue collar occupations,state
1,1,0.2887,0.183,0.2563,0.2719,alaska
2,2,0.3461,0.1976,0.2242,0.2321,alaska
3,3,0.3435,0.1598,0.2203,0.2764,alaska
4,4,0.4352,0.1808,0.1744,0.2096,alaska
5,5,0.4395,0.1769,0.1851,0.1985,alaska


In [None]:
# Keep only the districts that also exist in the target-variable frame
# (3760 rows remain, see `info()` below)
occupation = drop_spare_districts(target_df, occupation)

In [None]:
occupation.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3760 entries, 1 to 4053
Data columns (total 6 columns):
 #   Column                                               Non-Null Count  Dtype  
---  ------                                               --------------  -----  
 0   district                                             3760 non-null   object 
 1   management, business, science, and arts occupations  3760 non-null   float64
 2   service occupations                                  3760 non-null   float64
 3   sales and office occupations                         3760 non-null   float64
 4   blue collar occupations                              3760 non-null   float64
 5   state                                                3760 non-null   object 
dtypes: float64(4), object(2)
memory usage: 205.6+ KB


#### Reset Index and Save DataFrame

In [None]:
# Reset index so that it's possible to merge the dataframes on the indices
# (reset_and_save also saves the frame; "occupation" is presumably the output
# name — confirm in reset_and_save)
occupation = reset_and_save(occupation, "occupation")
occupation.tail()

Unnamed: 0,district,"management, business, science, and arts occupations",service occupations,sales and office occupations,blue collar occupations,state
3755,56,0.3812,0.1924,0.2095,0.217,wyoming
3756,57,0.3372,0.2043,0.2119,0.2466,wyoming
3757,58,0.2513,0.1728,0.1663,0.4097,wyoming
3758,59,0.2264,0.2672,0.209,0.2973,wyoming
3759,60,0.3538,0.1714,0.1373,0.3375,wyoming


## Earnings

In [None]:
# Raw Census table S2001 (earnings); row 0 holds the column descriptions
earnings = pd.read_csv("drive/MyDrive/US Elections/Census Data/Earnings 2020/earnings_2020.csv")

In [None]:
earnings.head(1)

Unnamed: 0,GEO_ID,NAME,S2001_C01_001E,S2001_C01_001M,S2001_C01_002E,S2001_C01_002M,S2001_C01_003E,S2001_C01_003M,S2001_C01_004E,S2001_C01_004M,...,S2001_C06_016E,S2001_C06_016M,S2001_C06_017E,S2001_C06_017M,S2001_C06_018E,S2001_C06_018M,S2001_C06_019E,S2001_C06_019M,S2001_C06_020E,S2001_C06_020M
0,id,Geographic Area Name,Estimate!!Total!!Population 16 years and over ...,Margin of Error!!Total!!Population 16 years an...,Estimate!!Total!!Population 16 years and over ...,Margin of Error!!Total!!Population 16 years an...,Estimate!!Total!!Population 16 years and over ...,Margin of Error!!Total!!Population 16 years an...,Estimate!!Total!!Population 16 years and over ...,Margin of Error!!Total!!Population 16 years an...,...,Estimate!!Percent Female!!MEDIAN EARNINGS BY E...,Margin of Error!!Percent Female!!MEDIAN EARNIN...,Estimate!!Percent Female!!MEDIAN EARNINGS BY E...,Margin of Error!!Percent Female!!MEDIAN EARNIN...,Estimate!!Percent Female!!MEDIAN EARNINGS BY E...,Margin of Error!!Percent Female!!MEDIAN EARNIN...,Estimate!!Percent Female!!MEDIAN EARNINGS BY E...,Margin of Error!!Percent Female!!MEDIAN EARNIN...,Estimate!!Percent Female!!MEDIAN EARNINGS BY E...,Margin of Error!!Percent Female!!MEDIAN EARNIN...


#### First Cleaning

In [None]:
# Shared first cleaning step. Afterwards (see the preview in the next cell) the
# lower-cased descriptions are the column names and NAME has become `district`.
earnings = first_clean(earnings)

In [None]:
# Remove the male/female breakdowns; the columns visible in the preview below
# belong to the "total" and "percent" groups only
earnings = delete_male_female(earnings)
earnings.head(1)

Unnamed: 0,district,estimate!!total!!population 16 years and over with earnings,estimate!!total!!population 16 years and over with earnings!!median earnings (dollars),"estimate!!total!!population 16 years and over with earnings!!full-time, year-round workers with earnings","estimate!!total!!population 16 years and over with earnings!!full-time, year-round workers with earnings!!$1 to $9,999 or loss","estimate!!total!!population 16 years and over with earnings!!full-time, year-round workers with earnings!!$10,000 to $14,999","estimate!!total!!population 16 years and over with earnings!!full-time, year-round workers with earnings!!$15,000 to $24,999","estimate!!total!!population 16 years and over with earnings!!full-time, year-round workers with earnings!!$25,000 to $34,999","estimate!!total!!population 16 years and over with earnings!!full-time, year-round workers with earnings!!$35,000 to $49,999","estimate!!total!!population 16 years and over with earnings!!full-time, year-round workers with earnings!!$50,000 to $64,999",...,"estimate!!percent!!population 16 years and over with earnings!!full-time, year-round workers with earnings!!$75,000 to $99,999","estimate!!percent!!population 16 years and over with earnings!!full-time, year-round workers with earnings!!$100,000 or more","estimate!!percent!!population 16 years and over with earnings!!full-time, year-round workers with earnings!!median earnings (dollars) for full-time, year-round workers with earnings","estimate!!percent!!population 16 years and over with earnings!!full-time, year-round workers with earnings!!mean earnings (dollars) for full-time, year-round workers with earnings",estimate!!percent!!median earnings by educational attainment!!population 25 years and over with earnings,estimate!!percent!!median earnings by educational attainment!!population 25 years and over with earnings!!less than high school graduate,estimate!!percent!!median earnings by educational attainment!!population 25 years and over 
with earnings!!high school graduate (includes equivalency),estimate!!percent!!median earnings by educational attainment!!population 25 years and over with earnings!!some college or associate's degree,estimate!!percent!!median earnings by educational attainment!!population 25 years and over with earnings!!bachelor's degree,estimate!!percent!!median earnings by educational attainment!!population 25 years and over with earnings!!graduate or professional degree
1,"State House District 1 (2018), Alaska",9532,35647,5653,52,89,621,920,1307,884,...,11.2,13.2,(X),(X),(X),(X),(X),(X),(X),(X)


#### DataFrame dependent cleaning

In [None]:
# Set the overall median earnings (position 2) aside before the cleaning
# functions run; it is attached to the frame again further down.
median_earnings = earnings.iloc[:, 2]

# Keep only the district name (position 0) and the nine earnings brackets of
# full-time, year-round workers (positions 4-12)
district_names = earnings.iloc[:, 0]
bracket_counts = earnings.iloc[:, 4:13]
earnings = pd.concat([district_names, bracket_counts], axis=1)

earnings.head(1)

Unnamed: 0,district,"estimate!!total!!population 16 years and over with earnings!!full-time, year-round workers with earnings!!$1 to $9,999 or loss","estimate!!total!!population 16 years and over with earnings!!full-time, year-round workers with earnings!!$10,000 to $14,999","estimate!!total!!population 16 years and over with earnings!!full-time, year-round workers with earnings!!$15,000 to $24,999","estimate!!total!!population 16 years and over with earnings!!full-time, year-round workers with earnings!!$25,000 to $34,999","estimate!!total!!population 16 years and over with earnings!!full-time, year-round workers with earnings!!$35,000 to $49,999","estimate!!total!!population 16 years and over with earnings!!full-time, year-round workers with earnings!!$50,000 to $64,999","estimate!!total!!population 16 years and over with earnings!!full-time, year-round workers with earnings!!$65,000 to $74,999","estimate!!total!!population 16 years and over with earnings!!full-time, year-round workers with earnings!!$75,000 to $99,999","estimate!!total!!population 16 years and over with earnings!!full-time, year-round workers with earnings!!$100,000 or more"
1,"State House District 1 (2018), Alaska",52,89,621,920,1307,884,400,631,749


In [None]:
# Change data type of every column except `district` to float.
# `astype` with a column -> dtype mapping returns a frame whose columns really are
# float. Writing the converted values back through `.iloc[:, 1:] = ...` is not
# reliable: newer pandas versions set the values in place for that form and can
# leave the columns with their previous object dtype.
count_columns = earnings.columns[1:]
earnings = earnings.astype({column: "float" for column in count_columns})

In [None]:
# Make new categories of earnings by summing neighbouring brackets.
# The slices are column positions of the nine original brackets (1-9); the new
# columns are appended after them, so the positions stay valid while looping.
bracket_groups = {
    "earnings $1 - $24'999": slice(1, 4),
    "earnings $25'000 - 49'999": slice(4, 6),
    "earnings $50'000 - 74'999": slice(6, 8),
    "earnings $75'000 or more": slice(8, 10),
}
for group_name, positions in bracket_groups.items():
    earnings[group_name] = earnings.iloc[:, positions].sum(axis=1)

In [None]:
# Keep `district` and the four aggregated brackets; the original bracket columns
# all contain "full-time" in their label.
final_columns = [col for col in earnings.columns if "full-time" not in col]
# Explicit copy: without it `earnings` is a slice of the wider frame, and the
# in-place changes made by the cleaning functions raise SettingWithCopyWarning.
earnings = earnings[final_columns].copy()
earnings.head(1)

Unnamed: 0,district,earnings $1 - $24'999,earnings $25'000 - 49'999,earnings $50'000 - 74'999,earnings $75'000 or more
1,"State House District 1 (2018), Alaska",762.0,2227.0,1284.0,1380.0


#### Final Cleaning and District Dropping

In [None]:
# Shared final cleaning step. As the preview below shows, the bracket counts
# become shares of the row total (762 of 5653 -> 0.1348 for the first district),
# `district` is reduced to its number and a `state` column is added.
earnings = clean_census_data(earnings)
earnings.head(1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/u

Unnamed: 0,district,earnings $1 - $24'999,earnings $25'000 - 49'999,earnings $50'000 - 74'999,earnings $75'000 or more,state
1,1,0.1348,0.394,0.2271,0.2441,alaska


In [None]:
# Keep only the districts that also exist in the target-variable frame
earnings = drop_spare_districts(target_df, earnings)
earnings.shape

(3760, 6)

In [None]:
# Re-attach the median earnings that were set aside before the cleaning. The
# assignment aligns on the row index, so only districts still in `earnings`
# receive a value.
# NOTE(review): median_earnings was taken before the float conversion of the
# bracket columns, so it is presumably still stored as text — confirm, and
# convert it (e.g. with pd.to_numeric) if a numeric column is expected.
earnings["median earnings"] = median_earnings
earnings.head(1)

Unnamed: 0,district,earnings $1 - $24'999,earnings $25'000 - 49'999,earnings $50'000 - 74'999,earnings $75'000 or more,state,median earnings
1,1,0.1348,0.394,0.2271,0.2441,alaska,35647


#### Reset Index and Save DataFrame

In [None]:
# Reset index so that it's possible to merge the dataframes on the indices.
# The result is stored back in `earnings`, as every other section does with its
# own frame; previously it only went to `df`, so `earnings` itself was never
# rebound to the returned frame.
earnings = reset_and_save(earnings, "earnings")
df = earnings  # former name of this result, kept for any cell that still reads it
earnings.tail()

Unnamed: 0,district,earnings $1 - $24'999,earnings $25'000 - 49'999,earnings $50'000 - 74'999,earnings $75'000 or more,state,median earnings
3755,56,0.1697,0.3633,0.2419,0.2251,wyoming,33162
3756,57,0.2177,0.4518,0.2058,0.1247,wyoming,34196
3757,58,0.1701,0.3644,0.175,0.2905,wyoming,34607
3758,59,0.1612,0.4634,0.2257,0.1497,wyoming,31297
3759,60,0.0954,0.2589,0.2663,0.3793,wyoming,47912


## Poverty

In [None]:
# low_memory=False lets pandas infer each column's dtype from the whole file
# instead of chunk by chunk. Row 0 holds text descriptions while the rows below
# hold numbers, so chunked inference can produce mixed-type columns; the
# truncated warning under this cell looks like pandas' mixed-dtype warning.
poverty = pd.read_csv("drive/MyDrive/US Elections/Census Data/Poverty 2020/poverty_2020.csv",
                      low_memory=False)
poverty.head(2)

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,GEO_ID,NAME,S1701_C01_001E,S1701_C01_001M,S1701_C01_002E,S1701_C01_002M,S1701_C01_003E,S1701_C01_003M,S1701_C01_004E,S1701_C01_004M,...,S1701_C03_057E,S1701_C03_057M,S1701_C03_058E,S1701_C03_058M,S1701_C03_059E,S1701_C03_059M,S1701_C03_060E,S1701_C03_060M,S1701_C03_061E,S1701_C03_061M
0,id,Geographic Area Name,Estimate!!Total!!Population for whom poverty s...,Margin of Error!!Total!!Population for whom po...,Estimate!!Total!!Population for whom poverty s...,Margin of Error!!Total!!Population for whom po...,Estimate!!Total!!Population for whom poverty s...,Margin of Error!!Total!!Population for whom po...,Estimate!!Total!!Population for whom poverty s...,Margin of Error!!Total!!Population for whom po...,...,Estimate!!Percent below poverty level!!UNRELAT...,Margin of Error!!Percent below poverty level!!...,Estimate!!Percent below poverty level!!UNRELAT...,Margin of Error!!Percent below poverty level!!...,Estimate!!Percent below poverty level!!UNRELAT...,Margin of Error!!Percent below poverty level!!...,Estimate!!Percent below poverty level!!UNRELAT...,Margin of Error!!Percent below poverty level!!...,Estimate!!Percent below poverty level!!UNRELAT...,Margin of Error!!Percent below poverty level!!...
1,620L600US02001,"State House District 1 (2018), Alaska",15989,892,3367,395,1064,233,2303,330,...,22.9,18.3,(X),(X),1.3,1.5,25.4,9.2,39.7,11.1


In [None]:
poverty.shape

(4054, 368)

#### First Cleaning

In [None]:
# Shared first cleaning step: the shape goes from (4054, 368) to (4053, 184),
# i.e. one row and half of the columns are removed
poverty = first_clean(poverty)
poverty.shape

(4053, 184)

In [None]:
poverty.head(1)

Unnamed: 0,district,estimate!!total!!population for whom poverty status is determined,estimate!!total!!population for whom poverty status is determined!!age!!under 18 years,estimate!!total!!population for whom poverty status is determined!!age!!under 18 years!!under 5 years,estimate!!total!!population for whom poverty status is determined!!age!!under 18 years!!5 to 17 years,estimate!!total!!population for whom poverty status is determined!!age!!under 18 years!!related children of householder under 18 years,estimate!!total!!population for whom poverty status is determined!!age!!18 to 64 years,estimate!!total!!population for whom poverty status is determined!!age!!18 to 64 years!!18 to 34 years,estimate!!total!!population for whom poverty status is determined!!age!!18 to 64 years!!35 to 64 years,estimate!!total!!population for whom poverty status is determined!!age!!60 years and over,...,estimate!!percent below poverty level!!unrelated individuals for whom poverty status is determined!!25 to 34 years,estimate!!percent below poverty level!!unrelated individuals for whom poverty status is determined!!35 to 44 years,estimate!!percent below poverty level!!unrelated individuals for whom poverty status is determined!!45 to 54 years,estimate!!percent below poverty level!!unrelated individuals for whom poverty status is determined!!55 to 64 years,estimate!!percent below poverty level!!unrelated individuals for whom poverty status is determined!!65 to 74 years,estimate!!percent below poverty level!!unrelated individuals for whom poverty status is determined!!75 years and over,estimate!!percent below poverty level!!unrelated individuals for whom poverty status is determined!!mean income deficit for unrelated individuals (dollars),"estimate!!percent below poverty level!!unrelated individuals for whom poverty status is determined!!worked full-time, year-round in the past 12 months","estimate!!percent below poverty level!!unrelated individuals for whom poverty status is 
determined!!worked less than full-time, year-round in the past 12 months",estimate!!percent below poverty level!!unrelated individuals for whom poverty status is determined!!did not work
1,"State House District 1 (2018), Alaska",15989,3367,1064,2303,3367,9999,5074,4925,3463,...,7.4,22.4,16.2,31.4,14.8,22.9,(X),1.3,25.4,39.7


#### DataFrame dependent cleaning

In [None]:
# Keep `district` and the overall poverty rate only. The column is selected by
# its full label rather than by position, so a missing or reordered column
# raises a KeyError instead of silently picking another estimate. `rename`
# returns a new frame, which avoids renaming a slice in place.
poverty_rate_column = "estimate!!percent below poverty level!!population for whom poverty status is determined"
poverty = poverty[["district", poverty_rate_column]].rename(
    columns={poverty_rate_column: "percent of tot. population in poverty"}
)
poverty.head()

Unnamed: 0,district,percent of tot. population in poverty
1,"State House District 1 (2018), Alaska",10.1
2,"State House District 2 (2018), Alaska",8.2
3,"State House District 3 (2018), Alaska",5.5
4,"State House District 4 (2018), Alaska",4.1
5,"State House District 5 (2018), Alaska",5.8


#### Final Cleaning and District dropping

In [None]:
# Shared final cleaning step. The column already holds a percentage, hence
# percentage=False: the values are the same before and after (10.1 for the
# first district), while `district` is shortened and `state` is added.
poverty = clean_census_data(poverty, percentage=False)
poverty.head()

Unnamed: 0,district,percent of tot. population in poverty,state
1,1,10.1,alaska
2,2,8.2,alaska
3,3,5.5,alaska
4,4,4.1,alaska
5,5,5.8,alaska


In [None]:
# Keep only the districts that also exist in the target-variable frame
poverty = drop_spare_districts(target_df, poverty)
poverty.shape

(3760, 3)

#### Reset Index and Save DataFrame

In [None]:
# Reset the index (0..n-1, see the tail below) so the frames can be merged on it,
# and save the result
poverty = reset_and_save(poverty, "poverty")
poverty.tail()

Unnamed: 0,district,percent of tot. population in poverty,state
3755,56,14.2,wyoming
3756,57,10.9,wyoming
3757,58,9.3,wyoming
3758,59,9.2,wyoming
3759,60,5.6,wyoming


## Foreign Borns

Share of foreign-born residents in each district's total population.

In [None]:
# Raw Census table B06008; unlike the other files, GEO_ID and NAME are the last
# columns here, and row 0 holds the column descriptions
foreign_borns = pd.read_csv("drive/MyDrive/US Elections/Census Data/Foreign Born 2020/foreign_born_2020.csv")

In [None]:
foreign_borns.head(1)

Unnamed: 0,B06008_001E,B06008_001M,B06008_002E,B06008_002M,B06008_003E,B06008_003M,B06008_004E,B06008_004M,B06008_005E,B06008_005M,...,B06008_027E,B06008_027M,B06008_028E,B06008_028M,B06008_029E,B06008_029M,B06008_030E,B06008_030M,GEO_ID,NAME
0,Estimate!!Total:,Margin of Error!!Total:,Estimate!!Total:!!Never married,Margin of Error!!Total:!!Never married,"Estimate!!Total:!!Now married, except separated","Margin of Error!!Total:!!Now married, except s...",Estimate!!Total:!!Divorced,Margin of Error!!Total:!!Divorced,Estimate!!Total:!!Separated,Margin of Error!!Total:!!Separated,...,"Estimate!!Total:!!Foreign born:!!Now married, ...",Margin of Error!!Total:!!Foreign born:!!Now ma...,Estimate!!Total:!!Foreign born:!!Divorced,Margin of Error!!Total:!!Foreign born:!!Divorced,Estimate!!Total:!!Foreign born:!!Separated,Margin of Error!!Total:!!Foreign born:!!Separated,Estimate!!Total:!!Foreign born:!!Widowed,Margin of Error!!Total:!!Foreign born:!!Widowed,id,Geographic Area Name


In [None]:
foreign_borns.shape

(4054, 62)

#### First Cleaning

In [None]:
# Shared first cleaning step; in this table `district` ends up as the last
# column (see the preview below)
foreign_borns = first_clean(foreign_borns)
foreign_borns.head(1)

Unnamed: 0,estimate!!total:,estimate!!total:!!never married,"estimate!!total:!!now married, except separated",estimate!!total:!!divorced,estimate!!total:!!separated,estimate!!total:!!widowed,estimate!!total:!!born in state of residence:,estimate!!total:!!born in state of residence:!!never married,"estimate!!total:!!born in state of residence:!!now married, except separated",estimate!!total:!!born in state of residence:!!divorced,...,estimate!!total:!!native; born outside the united states:!!divorced,estimate!!total:!!native; born outside the united states:!!separated,estimate!!total:!!native; born outside the united states:!!widowed,estimate!!total:!!foreign born:,estimate!!total:!!foreign born:!!never married,"estimate!!total:!!foreign born:!!now married, except separated",estimate!!total:!!foreign born:!!divorced,estimate!!total:!!foreign born:!!separated,estimate!!total:!!foreign born:!!widowed,district
1,13399,4924,5620,1884,402,569,4571,2490,1439,481,...,150,0,0,811,193,509,109,0,0,"State House District 1 (2018), Alaska"


#### DataFrame dependent Cleaning

In [None]:
# Keep the columns whose label has at most three "!!"-separated parts; this
# drops the deeper cross-tabulations (e.g. "foreign born:!!divorced") and keeps
# `district`.
label_depth = foreign_borns.columns.str.split("!!").str.len()
citizenship_cols = foreign_borns.columns[label_depth <= 3]
foreign_borns = foreign_borns[citizenship_cols]

# Counted from the end: `district` is last, "foreign born:" is second to last,
# and the three columns before it are summed as the native population.
district = foreign_borns["district"]
foreigns = foreign_borns.iloc[:, -2].astype("int")
natives = foreign_borns.iloc[:, -5:-2].astype("int").sum(axis=1)
foreign_borns = pd.DataFrame({"district": district, "native": natives, "foreign born": foreigns})
foreign_borns.head()

Unnamed: 0,district,native,foreign born
1,"State House District 1 (2018), Alaska",12588,811
2,"State House District 2 (2018), Alaska",12811,1221
3,"State House District 3 (2018), Alaska",14057,553
4,"State House District 4 (2018), Alaska",14387,677
5,"State House District 5 (2018), Alaska",12666,1050


#### Final Cleaning and District dropping

In [None]:
# Shared final cleaning step: the two counts become shares of the row total
# (12588 and 811 -> 0.9395 and 0.0605 for the first district)
foreign_borns = clean_census_data(foreign_borns)

In [None]:
foreign_borns.head()

Unnamed: 0,district,native,foreign born,state
1,1,0.9395,0.0605,alaska
2,2,0.913,0.087,alaska
3,3,0.9621,0.0379,alaska
4,4,0.9551,0.0449,alaska
5,5,0.9234,0.0766,alaska


In [None]:
# Keep only the districts that also exist in the target-variable frame
foreign_borns = drop_spare_districts(target_df, foreign_borns)
foreign_borns.shape

(3760, 4)

#### Reset Index and Save DataFrame

In [None]:
# Reset the index so the frames can be merged on it, and save the result
foreign_borns = reset_and_save(foreign_borns, "foreign_borns")

In [None]:
foreign_borns.tail()

Unnamed: 0,district,native,foreign born,state
3755,56,0.9752,0.0248,wyoming
3756,57,0.9907,0.0093,wyoming
3757,58,0.984,0.016,wyoming
3758,59,0.9859,0.0141,wyoming
3759,60,0.9772,0.0228,wyoming


## Average Family Size

In [None]:
# Raw Census table S1101; the same file also contains the housing-tenure
# columns used in the "Housing Tenure" section
family_size = pd.read_csv("drive/MyDrive/US Elections/Census Data/Family Size & Housing Tenure 2020/family_size_housing_tenure_2020.csv")

In [None]:
family_size.head(1)

Unnamed: 0,GEO_ID,NAME,S1101_C01_001E,S1101_C01_001M,S1101_C01_002E,S1101_C01_002M,S1101_C01_003E,S1101_C01_003M,S1101_C01_004E,S1101_C01_004M,...,S1101_C05_014E,S1101_C05_014M,S1101_C05_015E,S1101_C05_015M,S1101_C05_016E,S1101_C05_016M,S1101_C05_017E,S1101_C05_017M,S1101_C05_018E,S1101_C05_018M
0,id,Geographic Area Name,Estimate!!Total!!HOUSEHOLDS!!Total households,Margin of Error!!Total!!HOUSEHOLDS!!Total hous...,Estimate!!Total!!HOUSEHOLDS!!Average household...,Margin of Error!!Total!!HOUSEHOLDS!!Average ho...,Estimate!!Total!!FAMILIES!!Total families,Margin of Error!!Total!!FAMILIES!!Total families,Estimate!!Total!!FAMILIES!!Average family size,Margin of Error!!Total!!FAMILIES!!Average fami...,...,Estimate!!Nonfamily household!!Total household...,Margin of Error!!Nonfamily household!!Total ho...,Estimate!!Nonfamily household!!Total household...,Margin of Error!!Nonfamily household!!Total ho...,Estimate!!Nonfamily household!!Total household...,Margin of Error!!Nonfamily household!!Total ho...,Estimate!!Nonfamily household!!Total household...,Margin of Error!!Nonfamily household!!Total ho...,Estimate!!Nonfamily household!!Total household...,Margin of Error!!Nonfamily household!!Total ho...


In [None]:
family_size.shape

(4054, 182)

#### First Cleaning

In [None]:
# Shared first cleaning step (descriptive column names, `district` column; see
# the preview in the next cell)
family_size = first_clean(family_size)

In [None]:
family_size.head(1)

Unnamed: 0,district,estimate!!total!!households!!total households,estimate!!total!!households!!average household size,estimate!!total!!families!!total families,estimate!!total!!families!!average family size,estimate!!total!!age of own children!!households with own children of the householder under 18 years,estimate!!total!!age of own children!!households with own children of the householder under 18 years!!under 6 years only,estimate!!total!!age of own children!!households with own children of the householder under 18 years!!under 6 years and 6 to 17 years,estimate!!total!!age of own children!!households with own children of the householder under 18 years!!6 to 17 years only,estimate!!total!!total households,...,estimate!!nonfamily household!!total households,estimate!!nonfamily household!!total households!!selected households by type!!households with one or more people under 18 years,estimate!!nonfamily household!!total households!!selected households by type!!households with one or more people 60 years and over,estimate!!nonfamily household!!total households!!selected households by type!!householder living alone,estimate!!nonfamily household!!total households!!selected households by type!!householder living alone!!65 years and over,estimate!!nonfamily household!!total households!!units in structure!!1-unit structures,estimate!!nonfamily household!!total households!!units in structure!!2-or-more-unit structures,estimate!!nonfamily household!!total households!!units in structure!!mobile homes and all other types of units,estimate!!nonfamily household!!total households!!housing tenure!!owner-occupied housing units,estimate!!nonfamily household!!total households!!housing tenure!!renter-occupied housing units
1,"State House District 1 (2018), Alaska",6746,2.36,3923,2.94,1881,27.4,21.5,51.1,6746,...,2823,0.7,46.1,68.9,28.6,33.9,65.8,0.3,34.7,65.3


#### DataFrame dependent Cleaning

In [None]:
# Keep `district` plus every "average family size" estimate
# (one per household type, see the preview below)
labels = family_size.columns
wanted = (labels == "district") | labels.str.contains("average family size", regex=False)
columns = list(labels[wanted])
family_size = family_size[columns]
family_size.head()

Unnamed: 0,district,estimate!!total!!families!!average family size,estimate!!married-couple family household!!families!!average family size,"estimate!!male householder, no spouse present, family household!!families!!average family size","estimate!!female householder, no spouse present, family household!!families!!average family size",estimate!!nonfamily household!!families!!average family size
1,"State House District 1 (2018), Alaska",2.94,3.04,2.63,2.83,(X)
2,"State House District 2 (2018), Alaska",3.25,3.28,2.66,3.13,(X)
3,"State House District 3 (2018), Alaska",3.33,3.29,2.96,3.78,(X)
4,"State House District 4 (2018), Alaska",3.05,3.14,2.21,3.08,(X)
5,"State House District 5 (2018), Alaska",3.06,3.1,3.04,2.9,(X)


In [None]:
# Keep the first two columns only: `district` and the average family size of
# all families ("estimate!!total!!...")
surplus_columns = family_size.columns[2:]
family_size = family_size.drop(columns=surplus_columns)
family_size.head()

Unnamed: 0,district,estimate!!total!!families!!average family size
1,"State House District 1 (2018), Alaska",2.94
2,"State House District 2 (2018), Alaska",3.25
3,"State House District 3 (2018), Alaska",3.33
4,"State House District 4 (2018), Alaska",3.05
5,"State House District 5 (2018), Alaska",3.06


In [None]:
# Shorten the estimate's label to its last "!!" segment ("average family size")
short_label = family_size.columns[1].rsplit("!!", 1)[-1]
family_size.columns = ["district", short_label]

#### Final Cleaning and District dropping

In [None]:
# Shared final cleaning step. An average is not a count, hence
# percentage=False: the values are unchanged (2.94 for the first district),
# while `district` is shortened and `state` is added.
family_size = clean_census_data(family_size, percentage=False)
family_size.head()

Unnamed: 0,district,average family size,state
1,1,2.94,alaska
2,2,3.25,alaska
3,3,3.33,alaska
4,4,3.05,alaska
5,5,3.06,alaska


In [None]:
# Keep only the districts that also exist in the target-variable frame
family_size = drop_spare_districts(target_df, family_size)
family_size.shape

(3760, 3)

#### Reset Index and Save DataFrame

In [None]:
# Reset the index (0..n-1, see the tail below) so the frames can be merged on it,
# and save the result
family_size = reset_and_save(family_size, "family_size")
family_size.tail()

Unnamed: 0,district,average family size,state
3755,56,2.89,wyoming
3756,57,2.93,wyoming
3757,58,3.19,wyoming
3758,59,2.92,wyoming
3759,60,3.31,wyoming


## Housing Tenure

Percentage of owner-occupied housing units per district (the remainder are renter-occupied housing units).

In [None]:
# Same S1101 file as in the "Average Family Size" section; it is read again so
# that this section starts from the raw table
housing_tenure = pd.read_csv("drive/MyDrive/US Elections/Census Data/Family Size & Housing Tenure 2020/family_size_housing_tenure_2020.csv")

#### First Cleaning

In [None]:
# Shared first cleaning step (descriptive column names, `district` column; see
# the preview below)
housing_tenure = first_clean(housing_tenure)
housing_tenure.head(1)

Unnamed: 0,district,estimate!!total!!households!!total households,estimate!!total!!households!!average household size,estimate!!total!!families!!total families,estimate!!total!!families!!average family size,estimate!!total!!age of own children!!households with own children of the householder under 18 years,estimate!!total!!age of own children!!households with own children of the householder under 18 years!!under 6 years only,estimate!!total!!age of own children!!households with own children of the householder under 18 years!!under 6 years and 6 to 17 years,estimate!!total!!age of own children!!households with own children of the householder under 18 years!!6 to 17 years only,estimate!!total!!total households,...,estimate!!nonfamily household!!total households,estimate!!nonfamily household!!total households!!selected households by type!!households with one or more people under 18 years,estimate!!nonfamily household!!total households!!selected households by type!!households with one or more people 60 years and over,estimate!!nonfamily household!!total households!!selected households by type!!householder living alone,estimate!!nonfamily household!!total households!!selected households by type!!householder living alone!!65 years and over,estimate!!nonfamily household!!total households!!units in structure!!1-unit structures,estimate!!nonfamily household!!total households!!units in structure!!2-or-more-unit structures,estimate!!nonfamily household!!total households!!units in structure!!mobile homes and all other types of units,estimate!!nonfamily household!!total households!!housing tenure!!owner-occupied housing units,estimate!!nonfamily household!!total households!!housing tenure!!renter-occupied housing units
1,"State House District 1 (2018), Alaska",6746,2.36,3923,2.94,1881,27.4,21.5,51.1,6746,...,2823,0.7,46.1,68.9,28.6,33.9,65.8,0.3,34.7,65.3


In [None]:
housing_tenure.shape

(4053, 91)

#### DataFrame dependent Cleaning

In [None]:
# Keep `district` plus every "housing tenure" estimate
# (owner- and renter-occupied, one pair per household type)
all_labels = housing_tenure.columns
is_tenure = all_labels.str.contains("housing tenure", regex=False)
columns = list(all_labels[is_tenure | (all_labels == "district")])
housing_tenure = housing_tenure[columns]
housing_tenure.head(1)

Unnamed: 0,district,estimate!!total!!total households!!housing tenure!!owner-occupied housing units,estimate!!total!!total households!!housing tenure!!renter-occupied housing units,estimate!!married-couple family household!!total households!!housing tenure!!owner-occupied housing units,estimate!!married-couple family household!!total households!!housing tenure!!renter-occupied housing units,"estimate!!male householder, no spouse present, family household!!total households!!housing tenure!!owner-occupied housing units","estimate!!male householder, no spouse present, family household!!total households!!housing tenure!!renter-occupied housing units","estimate!!female householder, no spouse present, family household!!total households!!housing tenure!!owner-occupied housing units","estimate!!female householder, no spouse present, family household!!total households!!housing tenure!!renter-occupied housing units",estimate!!nonfamily household!!total households!!housing tenure!!owner-occupied housing units,estimate!!nonfamily household!!total households!!housing tenure!!renter-occupied housing units
1,"State House District 1 (2018), Alaska",47.4,52.6,74.6,25.4,35.2,64.8,18.7,81.3,34.7,65.3


In [None]:
# Keep the first two columns only: `district` and the owner-occupied share of
# all households. The labels to drop are passed explicitly; handing a DataFrame
# to `drop` only works because iterating a DataFrame yields its column labels.
housing_tenure = housing_tenure.drop(columns=housing_tenure.columns[2:])
housing_tenure.head(1)

Unnamed: 0,district,estimate!!total!!total households!!housing tenure!!owner-occupied housing units
1,"State House District 1 (2018), Alaska",47.4


In [None]:
# Shorten the estimate's label to "percent <last '!!' segment>".
# The prefix is only added when it is not there yet, so running this cell a
# second time no longer yields "percent percent ...".
tenure_label = housing_tenure.columns[1].split("!!")[-1]
if not tenure_label.startswith("percent "):
    tenure_label = "percent " + tenure_label
housing_tenure.columns = ["district", tenure_label]

#### Final Cleaning and District dropping

In [None]:
# Shared final cleaning step; the column already holds a percentage (47.4 for
# the first district), hence percentage=False
housing_tenure = clean_census_data(housing_tenure, percentage=False)

In [None]:
# Keep only the districts that also exist in the target-variable frame
housing_tenure = drop_spare_districts(target_df, housing_tenure)
housing_tenure.shape

(3760, 3)

#### Reset Index and Save DataFrame

In [None]:
# Reset the index (0..n-1, see the tail below) so the frames can be merged on it,
# and save the result
housing_tenure = reset_and_save(housing_tenure, "housing_tenure")
housing_tenure.tail()

Unnamed: 0,district,percent owner-occupied housing units,state
3755,56,62.3,wyoming
3756,57,59.5,wyoming
3757,58,77.3,wyoming
3758,59,72.7,wyoming
3759,60,79.1,wyoming


## Monthly Housing Costs

In [None]:
# low_memory=False lets pandas infer each column's dtype from the whole file
# instead of chunk by chunk. Row 0 holds text descriptions while the rows below
# hold numbers, so chunked inference can produce mixed-type columns; the
# truncated warning under this cell looks like pandas' mixed-dtype warning.
housing_costs = pd.read_csv("drive/MyDrive/US Elections/Census Data/Monthly Housing Costs 2020/housing_costs_2020.csv",
                            low_memory=False)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
housing_costs.shape

(4054, 554)

#### First Cleaning

In [None]:
# Shared first cleaning step (descriptive column names, `district` column; see
# the preview below)
housing_costs = first_clean(housing_costs)
housing_costs.head(1)

Unnamed: 0,district,estimate!!occupied housing units!!occupied housing units,"estimate!!occupied housing units!!occupied housing units!!household income in the past 12 months (in 2020 inflation-adjusted dollars)!!less than $5,000","estimate!!occupied housing units!!occupied housing units!!household income in the past 12 months (in 2020 inflation-adjusted dollars)!!$5,000 to $9,999","estimate!!occupied housing units!!occupied housing units!!household income in the past 12 months (in 2020 inflation-adjusted dollars)!!$10,000 to $14,999","estimate!!occupied housing units!!occupied housing units!!household income in the past 12 months (in 2020 inflation-adjusted dollars)!!$15,000 to $19,999","estimate!!occupied housing units!!occupied housing units!!household income in the past 12 months (in 2020 inflation-adjusted dollars)!!$20,000 to $24,999","estimate!!occupied housing units!!occupied housing units!!household income in the past 12 months (in 2020 inflation-adjusted dollars)!!$25,000 to $34,999","estimate!!occupied housing units!!occupied housing units!!household income in the past 12 months (in 2020 inflation-adjusted dollars)!!$35,000 to $49,999","estimate!!occupied housing units!!occupied housing units!!household income in the past 12 months (in 2020 inflation-adjusted dollars)!!$50,000 to $74,999",...,"estimate!!percent renter-occupied housing units!!occupied housing units!!monthly housing costs as a percentage of household income in the past 12 months!!$50,000 to $74,999","estimate!!percent renter-occupied housing units!!occupied housing units!!monthly housing costs as a percentage of household income in the past 12 months!!$50,000 to $74,999!!less than 20 percent","estimate!!percent renter-occupied housing units!!occupied housing units!!monthly housing costs as a percentage of household income in the past 12 months!!$50,000 to $74,999!!20 to 29 percent","estimate!!percent renter-occupied housing units!!occupied housing units!!monthly housing costs as a 
percentage of household income in the past 12 months!!$50,000 to $74,999!!30 percent or more","estimate!!percent renter-occupied housing units!!occupied housing units!!monthly housing costs as a percentage of household income in the past 12 months!!$75,000 or more","estimate!!percent renter-occupied housing units!!occupied housing units!!monthly housing costs as a percentage of household income in the past 12 months!!$75,000 or more!!less than 20 percent","estimate!!percent renter-occupied housing units!!occupied housing units!!monthly housing costs as a percentage of household income in the past 12 months!!$75,000 or more!!20 to 29 percent","estimate!!percent renter-occupied housing units!!occupied housing units!!monthly housing costs as a percentage of household income in the past 12 months!!$75,000 or more!!30 percent or more",estimate!!percent renter-occupied housing units!!occupied housing units!!monthly housing costs as a percentage of household income in the past 12 months!!zero or negative income,estimate!!percent renter-occupied housing units!!occupied housing units!!monthly housing costs as a percentage of household income in the past 12 months!!no cash rent
1,"State House District 1 (2018), Alaska",6746,131,105,300,235,242,769,614,1268,...,18.6,1.8,11.1,5.6,28.0,25.1,2.7,0.2,0.4,5.8


#### DataFrame dependent Cleaning

In [None]:
# Keep only the estimates for all occupied housing units (the owner- and
# renter-occupied breakdowns use other prefixes), plus the district label.
column_names = housing_costs.columns
keep_occupied = column_names.str.startswith("estimate!!occupied housing units") | (column_names == "district")
housing_costs = housing_costs.loc[:, keep_occupied]

# Strip the "estimate!!" prefix that all remaining value columns share
housing_costs.columns = housing_costs.columns.str.split("estimate!!").str[-1]

# Keep only the monthly-housing-cost columns of all occupied housing units
# (plus district). Take an explicit copy so that the in-place assignments in
# the following cells operate on an independent frame, not on a slice of the
# wide table.
column_names = housing_costs.columns
keep_monthly_costs = column_names.str.contains("monthly housing costs!!", regex=False) | (column_names == "district")
housing_costs = housing_costs.loc[:, keep_monthly_costs].copy()
housing_costs.head(1)

Unnamed: 0,district,occupied housing units!!occupied housing units!!monthly housing costs!!less than $300,occupied housing units!!occupied housing units!!monthly housing costs!!$300 to $499,occupied housing units!!occupied housing units!!monthly housing costs!!$500 to $799,occupied housing units!!occupied housing units!!monthly housing costs!!$800 to $999,"occupied housing units!!occupied housing units!!monthly housing costs!!$1,000 to $1,499","occupied housing units!!occupied housing units!!monthly housing costs!!$1,500 to $1,999","occupied housing units!!occupied housing units!!monthly housing costs!!$2,000 to $2,499","occupied housing units!!occupied housing units!!monthly housing costs!!$2,500 to $2,999","occupied housing units!!occupied housing units!!monthly housing costs!!$3,000 or more",occupied housing units!!occupied housing units!!monthly housing costs!!no cash rent,occupied housing units!!occupied housing units!!monthly housing costs!!median (dollars)
1,"State House District 1 (2018), Alaska",422,428,782,512,2277,1138,661,187,133,206,1205


In [None]:
housing_costs.shape

(4053, 12)

In [None]:
# The median column (last column) marks missing values with "-".
# Select those rows with a boolean mask through .loc: the frame's index labels
# start at 1, so handing them to the position-based .iloc would blank the
# median of the *following* district instead of the affected one.
housing_costs = housing_costs.copy()
median_column = housing_costs.columns[-1]
housing_costs.loc[housing_costs[median_column] == "-", median_column] = np.nan

# Convert the bracket counts (everything between district and median) to float.
# astype with a column mapping replaces the columns, so the dtype change does
# not depend on how an iloc assignment treats the existing columns.
bracket_columns = housing_costs.columns[1:-1]
housing_costs = housing_costs.astype({column: "float" for column in bracket_columns})

In [None]:
# Collapse the raw cost columns into four broader brackets plus the median.
# Positions refer to the column layout shown above: 1-9 are the dollar
# brackets, 10 is "no cash rent" (not part of any bracket), 11 is the median.
raw_cost_columns = housing_costs.columns[1:12]
bracket_slices = {
    "monthly housing costs - $499 or lower": slice(1, 3),
    "monthly housing costs - $500 - 999": slice(3, 5),
    "monthly housing costs - $1000 1999": slice(5, 7),
    "monthly housing costs - $2000 or higher": slice(7, 10),
}
for bracket_name, column_slice in bracket_slices.items():
    housing_costs[bracket_name] = housing_costs.iloc[:, column_slice].sum(axis=1)
housing_costs["monthly housing costs - median"] = housing_costs.iloc[:, 11]

# Only the aggregated columns (and district) remain afterwards
housing_costs.drop(columns=raw_cost_columns, inplace=True)
housing_costs.head()

Unnamed: 0,district,monthly housing costs - $499 or lower,monthly housing costs - $500 - 999,monthly housing costs - $1000 1999,monthly housing costs - $2000 or higher,monthly housing costs - median
1,"State House District 1 (2018), Alaska",850.0,1294.0,3415.0,981.0,1205
2,"State House District 2 (2018), Alaska",224.0,923.0,2762.0,1597.0,1514
3,"State House District 3 (2018), Alaska",528.0,1057.0,3294.0,1610.0,1540
4,"State House District 4 (2018), Alaska",911.0,2092.0,2639.0,1544.0,1239
5,"State House District 5 (2018), Alaska",664.0,1824.0,2643.0,1376.0,1294


#### Final Cleaning and District Dropping

In [None]:
housing_costs = clean_census_data(housing_costs, percentage=False)

In [None]:
housing_costs = drop_spare_districts(target_df, housing_costs)
housing_costs.shape

(3760, 7)

In [None]:
# Turn the bracket counts (columns 1 up to, but excluding, the last two) into
# percentages of their row total, rounded to two decimals
bracket_counts = housing_costs.iloc[:, 1:-2]
bracket_totals = bracket_counts.sum(axis=1)
housing_costs.iloc[:, 1:-2] = bracket_counts.div(bracket_totals, axis=0).mul(100).round(2)

#### Reset Index and Save DataFrame

In [None]:
housing_costs = reset_and_save(housing_costs, "housing_costs")
housing_costs.tail()

Unnamed: 0,district,monthly housing costs - $499 or lower,monthly housing costs - $500 - 999,monthly housing costs - $1000 1999,monthly housing costs - $2000 or higher,monthly housing costs - median,state
3755,56,26.91,41.72,24.06,7.31,744.0,wyoming
3756,57,23.8,37.64,36.86,1.69,825.0,wyoming
3757,58,21.69,34.52,37.58,6.21,923.0,wyoming
3758,59,29.09,27.46,41.58,1.86,879.0,wyoming
3759,60,22.0,27.83,40.84,9.33,1003.0,wyoming


## Age

* age = percentage of population in the respective cohorts and median age of district population


In [None]:
age_sex = pd.read_csv("drive/MyDrive/US Elections/Census Data/Age 2020/age_2020.csv")

  exec(code_obj, self.user_global_ns, self.user_ns)


In [None]:
age_sex.shape

(4054, 458)

#### First Cleaning

In [None]:
age_sex = first_clean(age_sex)
age_sex.head(1)

Unnamed: 0,district,estimate!!total!!total population,estimate!!total!!total population!!age!!under 5 years,estimate!!total!!total population!!age!!5 to 9 years,estimate!!total!!total population!!age!!10 to 14 years,estimate!!total!!total population!!age!!15 to 19 years,estimate!!total!!total population!!age!!20 to 24 years,estimate!!total!!total population!!age!!25 to 29 years,estimate!!total!!total population!!age!!30 to 34 years,estimate!!total!!total population!!age!!35 to 39 years,...,estimate!!percent female!!total population!!selected age categories!!62 years and over,estimate!!percent female!!total population!!selected age categories!!65 years and over,estimate!!percent female!!total population!!selected age categories!!75 years and over,estimate!!percent female!!total population!!summary indicators!!median age (years),estimate!!percent female!!total population!!summary indicators!!sex ratio (males per 100 females),estimate!!percent female!!total population!!summary indicators!!age dependency ratio,estimate!!percent female!!total population!!summary indicators!!old-age dependency ratio,estimate!!percent female!!total population!!summary indicators!!child dependency ratio,estimate!!percent female!!total population!!percent allocated!!sex,estimate!!percent female!!total population!!percent allocated!!age
1,"State House District 1 (2018), Alaska",16391,1086,1154,752,759,1572,1853,1454,792,...,22.4,18.6,7.8,(X),(X),(X),(X),(X),(X),(X)


In [None]:
age_sex.columns

Index(['district', 'estimate!!total!!total population',
       'estimate!!total!!total population!!age!!under 5 years',
       'estimate!!total!!total population!!age!!5 to 9 years',
       'estimate!!total!!total population!!age!!10 to 14 years',
       'estimate!!total!!total population!!age!!15 to 19 years',
       'estimate!!total!!total population!!age!!20 to 24 years',
       'estimate!!total!!total population!!age!!25 to 29 years',
       'estimate!!total!!total population!!age!!30 to 34 years',
       'estimate!!total!!total population!!age!!35 to 39 years',
       ...
       'estimate!!percent female!!total population!!selected age categories!!62 years and over',
       'estimate!!percent female!!total population!!selected age categories!!65 years and over',
       'estimate!!percent female!!total population!!selected age categories!!75 years and over',
       'estimate!!percent female!!total population!!summary indicators!!median age (years)',
       'estimate!!percent female

#### DataFrame dependent Cleaning

In [None]:
# Hand a copy to delete_male_female so that age_sex itself is not passed on
age = delete_male_female(age_sex.copy())
age.head(1)

Unnamed: 0,district,estimate!!total!!total population,estimate!!total!!total population!!age!!under 5 years,estimate!!total!!total population!!age!!5 to 9 years,estimate!!total!!total population!!age!!10 to 14 years,estimate!!total!!total population!!age!!15 to 19 years,estimate!!total!!total population!!age!!20 to 24 years,estimate!!total!!total population!!age!!25 to 29 years,estimate!!total!!total population!!age!!30 to 34 years,estimate!!total!!total population!!age!!35 to 39 years,...,estimate!!percent!!total population!!selected age categories!!60 years and over,estimate!!percent!!total population!!selected age categories!!62 years and over,estimate!!percent!!total population!!selected age categories!!65 years and over,estimate!!percent!!total population!!selected age categories!!75 years and over,estimate!!percent!!total population!!summary indicators!!median age (years),estimate!!percent!!total population!!summary indicators!!age dependency ratio,estimate!!percent!!total population!!summary indicators!!old-age dependency ratio,estimate!!percent!!total population!!summary indicators!!child dependency ratio,estimate!!percent!!total population!!percent allocated!!sex,estimate!!percent!!total population!!percent allocated!!age
1,"State House District 1 (2018), Alaska",16391,1086,1154,752,759,1572,1853,1454,792,...,21.5,19.4,16.3,5.7,(X),(X),(X),(X),0.0,1.4


In [None]:
# Shorten the labels of the total-population estimates by removing their common prefix
total_prefix = "estimate!!total!!total population!!"
age.columns = age.columns.str.split(total_prefix).str[-1]

# Keep the district label, the individual age brackets and the median age
exact_matches = ("district", "summary indicators!!median age (years)")
age_columns = []
for col in age.columns:
    if col in exact_matches or col.startswith("age!!"):
        age_columns.append(col)

In [None]:
age_columns

['district',
 'age!!under 5 years',
 'age!!5 to 9 years',
 'age!!10 to 14 years',
 'age!!15 to 19 years',
 'age!!20 to 24 years',
 'age!!25 to 29 years',
 'age!!30 to 34 years',
 'age!!35 to 39 years',
 'age!!40 to 44 years',
 'age!!45 to 49 years',
 'age!!50 to 54 years',
 'age!!55 to 59 years',
 'age!!60 to 64 years',
 'age!!65 to 69 years',
 'age!!70 to 74 years',
 'age!!75 to 79 years',
 'age!!80 to 84 years',
 'age!!85 years and over',
 'summary indicators!!median age (years)']

In [None]:
# Explicit copy: the following cells assign into `age`, which would otherwise
# be a slice of the wide table and trigger SettingWithCopyWarning.
age = age[age_columns].copy()
age.head()

Unnamed: 0,district,age!!under 5 years,age!!5 to 9 years,age!!10 to 14 years,age!!15 to 19 years,age!!20 to 24 years,age!!25 to 29 years,age!!30 to 34 years,age!!35 to 39 years,age!!40 to 44 years,age!!45 to 49 years,age!!50 to 54 years,age!!55 to 59 years,age!!60 to 64 years,age!!65 to 69 years,age!!70 to 74 years,age!!75 to 79 years,age!!80 to 84 years,age!!85 years and over,summary indicators!!median age (years)
1,"State House District 1 (2018), Alaska",1086,1154,752,759,1572,1853,1454,792,808,896,874,867,853,975,761,526,150,259,32.9
2,"State House District 2 (2018), Alaska",2177,1360,928,1167,3708,2183,1803,961,838,734,834,654,409,285,249,104,58,45,24.8
3,"State House District 3 (2018), Alaska",1279,1483,1745,1219,1149,2052,1418,1832,1276,1010,915,1334,758,724,476,177,207,63,31.5
4,"State House District 4 (2018), Alaska",1185,1011,1204,1214,923,1367,1228,1202,839,1143,1321,1493,1534,1201,776,453,256,114,39.0
5,"State House District 5 (2018), Alaska",1167,1143,837,1106,1842,1272,1465,1434,885,1072,999,802,1014,874,435,361,85,70,34.0


In [None]:
# Work on an independent frame so the assignment below does not hit a slice
age = age.copy()

# The median-age column (last column) marks missing values with "-"
median_age_column = age.columns[-1]
age.loc[age[median_age_column] == "-", median_age_column] = np.nan

# Convert every column except district to float. astype with a column mapping
# replaces the columns, so the dtype really changes (an iloc assignment may
# write the floats into the existing columns and keep their old dtype).
numeric_columns = age.columns[1:]
age = age.astype({column: "float" for column in numeric_columns})

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


In [None]:
# Merge the 18 five-year brackets (positions 1-18) into five cohorts.
# Position 19 holds the median age, which is kept aside as its own series.
raw_age_columns = age.columns[1:20]
cohort_slices = {
    "under 20": slice(1, 5),
    "20 to 34": slice(5, 8),
    "35 to 49": slice(8, 11),
    "50 to 64": slice(11, 14),
    "65 and over": slice(14, 19),
}
for cohort_name, column_slice in cohort_slices.items():
    age[cohort_name] = age.iloc[:, column_slice].sum(axis=1)
median_age = age.iloc[:, 19]

# Only district and the five cohort columns remain afterwards
age.drop(columns=raw_age_columns, inplace=True)
age.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using

Unnamed: 0,district,under 20,20 to 34,35 to 49,50 to 64,65 and over
1,"State House District 1 (2018), Alaska",3751.0,4879.0,2496.0,2594.0,2671.0
2,"State House District 2 (2018), Alaska",5632.0,7694.0,2533.0,1897.0,741.0
3,"State House District 3 (2018), Alaska",5726.0,4619.0,4118.0,3007.0,1647.0
4,"State House District 4 (2018), Alaska",4614.0,3518.0,3184.0,4348.0,2800.0
5,"State House District 5 (2018), Alaska",4253.0,4579.0,3391.0,2815.0,1825.0


#### Final Cleaning and District Dropping

> Eingerückter Textblock



In [None]:
age = clean_census_data(age)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/u

In [None]:
age = drop_spare_districts(target_df, age)

In [None]:
age.insert(6, "median age", median_age)

In [None]:
# Scale the five cohort columns by 100 and round to two decimals
age.iloc[:, 1:6] = age.iloc[:, 1:6].mul(100).round(2)

#### Reset Index and Save DataFrame

In [None]:
age = reset_and_save(age, "age")

In [None]:
age.tail()

Unnamed: 0,district,under 20,20 to 34,35 to 49,50 to 64,65 and over,median age,state
3755,56,22.45,23.02,16.8,20.64,17.09,38.2,wyoming
3756,57,28.34,25.03,15.24,15.31,16.09,34.1,wyoming
3757,58,23.35,22.35,15.54,25.49,13.27,38.3,wyoming
3758,59,24.52,22.3,19.91,18.37,14.9,35.8,wyoming
3759,60,29.41,18.76,19.56,18.69,13.58,36.3,wyoming


## Sex

* sex = share of women in the citizen voting-age population (18 and over) of each district

In [None]:
sex = pd.read_csv("drive/MyDrive/US Elections/Census Data/Sex 2020/sex_2020.csv")
sex.shape

  exec(code_obj, self.user_global_ns, self.user_ns)


(4054, 358)

#### First Cleaning

In [None]:
sex = first_clean(sex)
sex.head(1)

Unnamed: 0,estimate!!sex and age!!total population,percent!!sex and age!!total population,estimate!!sex and age!!total population!!male,percent!!sex and age!!total population!!male,estimate!!sex and age!!total population!!female,percent!!sex and age!!total population!!female,estimate!!sex and age!!total population!!sex ratio (males per 100 females),percent!!sex and age!!total population!!sex ratio (males per 100 females),estimate!!sex and age!!total population!!under 5 years,percent!!sex and age!!total population!!under 5 years,...,"percent!!hispanic or latino and race!!total population!!not hispanic or latino!!two or more races!!two races excluding some other race, and three or more races",estimate!!total housing units,percent!!total housing units,"estimate!!citizen, voting age population!!citizen, 18 and over population","percent!!citizen, voting age population!!citizen, 18 and over population","estimate!!citizen, voting age population!!citizen, 18 and over population!!male","percent!!citizen, voting age population!!citizen, 18 and over population!!male","estimate!!citizen, voting age population!!citizen, 18 and over population!!female","percent!!citizen, voting age population!!citizen, 18 and over population!!female",district
1,16391,16391,8516,52.0,7875,48.0,108.1,(X),1086,6.6,...,9.1,8260,(X),12564,12564,6668,53.1,5896,46.9,"State House District 1 (2018), Alaska"


#### DataFrame dependent Cleaning

In [None]:
# Keep the citizen voting-age population estimates and the district label
voting_age_prefix = "estimate!!citizen, voting age population"
sex_columns = []
for col in sex.columns:
    if col == "district" or col.startswith(voting_age_prefix):
        sex_columns.append(col)
sex = sex[sex_columns]
sex.head(1)

Unnamed: 0,"estimate!!citizen, voting age population!!citizen, 18 and over population","estimate!!citizen, voting age population!!citizen, 18 and over population!!male","estimate!!citizen, voting age population!!citizen, 18 and over population!!female",district
1,12564,6668,5896,"State House District 1 (2018), Alaska"


In [None]:
# Put district first, followed by the columns between the first and the last one
sex = sex[["district", *sex.columns[1:-1]]].copy()

In [None]:
# Label the two value columns "percent <last part of the original label>"
second_label = sex.columns[1].split("!!")[-1]
third_label = sex.columns[2].split("!!")[-1]
sex.columns = ["district", f"percent {second_label}", f"percent {third_label}"]

#### Final Cleaning and District Dropping

In [None]:
sex = clean_census_data(sex)

In [None]:
sex = drop_spare_districts(target_df, sex)
sex.shape

(3760, 4)

In [None]:
sex.drop(sex.columns[1], axis=1, inplace=True)

#### Reset Index and Save DataFrame

In [None]:
sex = reset_and_save(sex, "sex")
sex.tail()

Unnamed: 0,district,percent female,state
3755,56,0.4863,wyoming
3756,57,0.5664,wyoming
3757,58,0.4292,wyoming
3758,59,0.5263,wyoming
3759,60,0.5065,wyoming


## Race

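* race = share of non-white residents and share of Black or African American residents in each district's total population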

In [None]:
race = pd.read_csv("drive/MyDrive/US Elections/Census Data/Race 2020/race_2020.csv")
race.shape

(4054, 22)

In [None]:
race.head()

Unnamed: 0,B02001_001E,B02001_001M,B02001_002E,B02001_002M,B02001_003E,B02001_003M,B02001_004E,B02001_004M,B02001_005E,B02001_005M,...,B02001_007E,B02001_007M,B02001_008E,B02001_008M,B02001_009E,B02001_009M,B02001_010E,B02001_010M,GEO_ID,NAME
0,Estimate!!Total:,Margin of Error!!Total:,Estimate!!Total:!!White alone,Margin of Error!!Total:!!White alone,Estimate!!Total:!!Black or African American alone,Margin of Error!!Total:!!Black or African Amer...,Estimate!!Total:!!American Indian and Alaska N...,Margin of Error!!Total:!!American Indian and A...,Estimate!!Total:!!Asian alone,Margin of Error!!Total:!!Asian alone,...,Estimate!!Total:!!Some other race alone,Margin of Error!!Total:!!Some other race alone,Estimate!!Total:!!Two or more races:,Margin of Error!!Total:!!Two or more races:,Estimate!!Total:!!Two or more races:!!Two race...,Margin of Error!!Total:!!Two or more races:!!T...,Estimate!!Total:!!Two or more races:!!Two race...,Margin of Error!!Total:!!Two or more races:!!T...,id,Geographic Area Name
1,16391,905,10858,907,835,300,2024,535,598,231,...,280,168,1772,351,178,169,1594,308,620L600US02001,"State House District 1 (2018), Alaska"
2,18497,1211,12851,1199,2072,498,662,256,741,367,...,563,388,1156,321,127,95,1029,303,620L600US02002,"State House District 2 (2018), Alaska"
3,19117,1874,15188,1798,743,276,1501,633,248,137,...,228,148,1209,407,92,79,1117,394,620L600US02003,"State House District 3 (2018), Alaska"
4,18464,1725,15682,1492,106,96,563,277,650,266,...,39,63,1402,600,332,417,1070,413,620L600US02004,"State House District 4 (2018), Alaska"


#### First Cleaning

In [None]:
race = first_clean(race)

In [None]:
race.head(1)

Unnamed: 0,estimate!!total:,estimate!!total:!!white alone,estimate!!total:!!black or african american alone,estimate!!total:!!american indian and alaska native alone,estimate!!total:!!asian alone,estimate!!total:!!native hawaiian and other pacific islander alone,estimate!!total:!!some other race alone,estimate!!total:!!two or more races:,estimate!!total:!!two or more races:!!two races including some other race,"estimate!!total:!!two or more races:!!two races excluding some other race, and three or more races",district
1,16391,10858,835,2024,598,24,280,1772,178,1594,"State House District 1 (2018), Alaska"


#### DataFrame dependent Cleaning

In [None]:
# Drop the two columns right before the trailing district column
# (the sub-categories of "two or more races")
subcategory_columns = race.columns[-3:-1]
race = race.drop(columns=subcategory_columns)
race.head(1)

Unnamed: 0,estimate!!total:,estimate!!total:!!white alone,estimate!!total:!!black or african american alone,estimate!!total:!!american indian and alaska native alone,estimate!!total:!!asian alone,estimate!!total:!!native hawaiian and other pacific islander alone,estimate!!total:!!some other race alone,estimate!!total:!!two or more races:,district
1,16391,10858,835,2024,598,24,280,1772,"State House District 1 (2018), Alaska"


In [None]:
# Every column except the trailing district label holds counts. astype with a
# column mapping replaces the columns, so the dtype really changes (an iloc
# assignment may write the values into the existing columns and keep their
# old dtype).
count_columns = race.columns[:-1]
race = race.astype({column: "int" for column in count_columns})

# Non-white = sum of all race categories after "white alone"
# (positions 2 up to, but excluding, the district column)
race["non-white"] = race.iloc[:, 2:-1].sum(axis=1)

# Shorten the labels by removing the common "estimate!!total:!!" prefix
race.columns = race.columns.str.split("estimate!!total:!!").str[-1]

# Kept as its own series; the next cell places it in front of the selected columns
district = race.district

In [None]:
race.columns

Index(['estimate!!total:', 'white alone', 'black or african american alone',
       'american indian and alaska native alone', 'asian alone',
       'native hawaiian and other pacific islander alone',
       'some other race alone', 'two or more races:', 'district', 'non-white'],
      dtype='object')

In [None]:
# Keep the total, the non-white sum and the Black/African American count,
# with the district label as the first column
race_columns = ['estimate!!total:', 'non-white', 'black or african american alone']
race = race[race_columns].copy()
race.insert(0, "district", district)

In [None]:
race.head()

Unnamed: 0,district,estimate!!total:,non-white,black or african american alone
1,"State House District 1 (2018), Alaska",16391,5533.0,835
2,"State House District 2 (2018), Alaska",18497,5646.0,2072
3,"State House District 3 (2018), Alaska",19117,3929.0,743
4,"State House District 4 (2018), Alaska",18464,2782.0,106
5,"State House District 5 (2018), Alaska",16863,5605.0,751


In [None]:
# Convert every column except district to float. astype with a column mapping
# replaces the columns, so the dtype change does not depend on how an iloc
# assignment treats the existing columns.
value_columns = race.columns[1:]
race = race.astype({column: "float" for column in value_columns})

# Express both groups as a fraction of the district's total population
total_population = race["estimate!!total:"]
race["non-white"] = race["non-white"] / total_population
race["black/afroamerican"] = race["black or african american alone"] / total_population

In [None]:
# Remove the raw total (position 1) and the raw Black/African American count
# (position 3); only district and the two shares remain
raw_count_columns = race.columns[[1, 3]]
race = race.drop(columns=raw_count_columns)
race.head()

Unnamed: 0,district,non-white,black/afroamerican
1,"State House District 1 (2018), Alaska",0.337563,0.050943
2,"State House District 2 (2018), Alaska",0.305239,0.112018
3,"State House District 3 (2018), Alaska",0.205524,0.038866
4,"State House District 4 (2018), Alaska",0.150672,0.005741
5,"State House District 5 (2018), Alaska",0.332385,0.044535


#### Final Cleaning and District Dropping

In [None]:
race = clean_census_data(race, percentage=False)
race.shape

(3887, 4)

In [None]:
race = drop_spare_districts(target_df, race)
race.tail()

Unnamed: 0,district,non-white,black/afroamerican,state
4049,56,0.08201,0.007179,wyoming
4050,57,0.044766,0.011075,wyoming
4051,58,0.094292,0.003622,wyoming
4052,59,0.04825,0.003974,wyoming
4053,60,0.081037,0.0,wyoming


#### Reset Index and Save DataFrame

In [None]:
race = reset_and_save(race, "race")
race.tail()

Unnamed: 0,district,non-white,black/afroamerican,state
3755,56,0.08201,0.007179,wyoming
3756,57,0.044766,0.011075,wyoming
3757,58,0.094292,0.003622,wyoming
3758,59,0.04825,0.003974,wyoming
3759,60,0.081037,0.0,wyoming


## Health Insurance

* health_insurance = percent of uninsured people per district

In [None]:
health_insurance = pd.read_csv("drive/MyDrive/US Elections/Census Data/Health Insurance 2020/health_insurance_2020.csv")
health_insurance.head(1)

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,GEO_ID,NAME,S2701_C01_001E,S2701_C01_001M,S2701_C01_002E,S2701_C01_002M,S2701_C01_003E,S2701_C01_003M,S2701_C01_004E,S2701_C01_004M,...,S2701_C05_057E,S2701_C05_057M,S2701_C05_058E,S2701_C05_058M,S2701_C05_059E,S2701_C05_059M,S2701_C05_060E,S2701_C05_060M,S2701_C05_061E,S2701_C05_061M
0,id,Geographic Area Name,Estimate!!Total!!Civilian noninstitutionalized...,Margin of Error!!Total!!Civilian noninstitutio...,Estimate!!Total!!Civilian noninstitutionalized...,Margin of Error!!Total!!Civilian noninstitutio...,Estimate!!Total!!Civilian noninstitutionalized...,Margin of Error!!Total!!Civilian noninstitutio...,Estimate!!Total!!Civilian noninstitutionalized...,Margin of Error!!Total!!Civilian noninstitutio...,...,Estimate!!Percent Uninsured!!Civilian noninsti...,Margin of Error!!Percent Uninsured!!Civilian n...,Estimate!!Percent Uninsured!!Civilian noninsti...,Margin of Error!!Percent Uninsured!!Civilian n...,Estimate!!Percent Uninsured!!Civilian noninsti...,Margin of Error!!Percent Uninsured!!Civilian n...,Estimate!!Percent Uninsured!!Civilian noninsti...,Margin of Error!!Percent Uninsured!!Civilian n...,Estimate!!Percent Uninsured!!Civilian noninsti...,Margin of Error!!Percent Uninsured!!Civilian n...


In [None]:
health_insurance.shape

(4054, 612)

#### First Cleaning

In [None]:
health_insurance = first_clean(health_insurance)
health_insurance.head(1)

Unnamed: 0,district,estimate!!total!!civilian noninstitutionalized population,estimate!!total!!civilian noninstitutionalized population!!age!!under 6 years,estimate!!total!!civilian noninstitutionalized population!!age!!6 to 18 years,estimate!!total!!civilian noninstitutionalized population!!age!!19 to 25 years,estimate!!total!!civilian noninstitutionalized population!!age!!26 to 34 years,estimate!!total!!civilian noninstitutionalized population!!age!!35 to 44 years,estimate!!total!!civilian noninstitutionalized population!!age!!45 to 54 years,estimate!!total!!civilian noninstitutionalized population!!age!!55 to 64 years,estimate!!total!!civilian noninstitutionalized population!!age!!65 to 74 years,...,"estimate!!percent uninsured!!civilian noninstitutionalized population!!household income (in 2020 inflation-adjusted dollars)!!total household population!!under $25,000","estimate!!percent uninsured!!civilian noninstitutionalized population!!household income (in 2020 inflation-adjusted dollars)!!total household population!!$25,000 to $49,999","estimate!!percent uninsured!!civilian noninstitutionalized population!!household income (in 2020 inflation-adjusted dollars)!!total household population!!$50,000 to $74,999","estimate!!percent uninsured!!civilian noninstitutionalized population!!household income (in 2020 inflation-adjusted dollars)!!total household population!!$75,000 to $99,999","estimate!!percent uninsured!!civilian noninstitutionalized population!!household income (in 2020 inflation-adjusted dollars)!!total household population!!$100,000 and over",estimate!!percent uninsured!!civilian noninstitutionalized population!!ratio of income to poverty level in the past 12 months!!civilian noninstitutionalized population for whom poverty status is determined,estimate!!percent uninsured!!civilian noninstitutionalized population!!ratio of income to poverty level in the past 12 months!!civilian noninstitutionalized population for whom poverty status is determined!!below 
138 percent of the poverty threshold,estimate!!percent uninsured!!civilian noninstitutionalized population!!ratio of income to poverty level in the past 12 months!!civilian noninstitutionalized population for whom poverty status is determined!!138 to 399 percent of the poverty threshold,estimate!!percent uninsured!!civilian noninstitutionalized population!!ratio of income to poverty level in the past 12 months!!civilian noninstitutionalized population for whom poverty status is determined!!at or above 400 percent of the poverty threshold,estimate!!percent uninsured!!civilian noninstitutionalized population!!ratio of income to poverty level in the past 12 months!!civilian noninstitutionalized population for whom poverty status is determined!!below 100 percent of the poverty threshold
1,"State House District 1 (2018), Alaska",15594,1328,2245,2151,2407,1488,1656,1696,1727,...,10.2,12.3,7.8,9.6,9.1,9.7,11.4,13.5,4.8,14.1


#### DataFrame dependent Cleaning

In [None]:
# Collect the district label and every column that refers to the
# civilian noninstitutionalized population
population_marker = "civilian noninstitutionalized population"
civilian_columns = []
for col in health_insurance.columns:
    if col == "district" or population_marker in col:
        civilian_columns.append(col)

In [None]:
# Number of "!!"-separated parts per label; labels with at most three parts
# are the top-level figures without any further breakdown
label_depth = health_insurance[civilian_columns].columns.str.split("!!").str.len()
true_columns = label_depth <= 3

In [None]:
# Apply the depth mask to the civilian columns and keep only the matching ones
civilian_frame = health_insurance[civilian_columns]
final_columns = civilian_frame.columns[true_columns]
health_insurance = health_insurance[final_columns]
health_insurance.head(1)

Unnamed: 0,district,estimate!!total!!civilian noninstitutionalized population,estimate!!insured!!civilian noninstitutionalized population,estimate!!percent insured!!civilian noninstitutionalized population,estimate!!uninsured!!civilian noninstitutionalized population,estimate!!percent uninsured!!civilian noninstitutionalized population
1,"State House District 1 (2018), Alaska",15594,14082,90.3,1512,9.7


In [None]:
# Drop everything between the first column (district) and the last column
# (percent uninsured)
intermediate_columns = health_insurance.iloc[:, 1:-1].columns
health_insurance.drop(columns=intermediate_columns, inplace=True)
health_insurance.head(1)

Unnamed: 0,district,estimate!!percent uninsured!!civilian noninstitutionalized population
1,"State House District 1 (2018), Alaska",9.7


In [None]:
health_insurance.columns = ["district", "percent uninsured"]

#### Final Cleaning and District Dropping

In [None]:
health_insurance = clean_census_data(health_insurance, percentage=False)

In [None]:
health_insurance = drop_spare_districts(target_df, health_insurance)

#### Reset Index and Save DataFrame

In [None]:
health_insurance = reset_and_save(health_insurance, "health_insurance")
health_insurance.tail()

Unnamed: 0,district,percent uninsured,state
3755,56,13.9,wyoming
3756,57,11.3,wyoming
3757,58,15.4,wyoming
3758,59,15.0,wyoming
3759,60,8.2,wyoming


## Language Spoken at Home

In [None]:
language_spoken = pd.read_csv("drive/MyDrive/US Elections/Census Data/Language Spoken at Home 2020/language_spoken_at_home_2020.csv")
language_spoken.head(1)

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,GEO_ID,NAME,S1601_C01_001E,S1601_C01_001M,S1601_C01_002E,S1601_C01_002M,S1601_C01_003E,S1601_C01_003M,S1601_C01_004E,S1601_C01_004M,...,S1601_C06_020E,S1601_C06_020M,S1601_C06_021E,S1601_C06_021M,S1601_C06_022E,S1601_C06_022M,S1601_C06_023E,S1601_C06_023M,S1601_C06_024E,S1601_C06_024M
0,id,Geographic Area Name,Estimate!!Total!!Population 5 years and over,Margin of Error!!Total!!Population 5 years and...,Estimate!!Total!!Population 5 years and over!!...,Margin of Error!!Total!!Population 5 years and...,Estimate!!Total!!Population 5 years and over!!...,Margin of Error!!Total!!Population 5 years and...,Estimate!!Total!!Population 5 years and over!!...,Margin of Error!!Total!!Population 5 years and...,...,Estimate!!Percent speak English less than very...,Margin of Error!!Percent speak English less th...,Estimate!!Percent speak English less than very...,Margin of Error!!Percent speak English less th...,Estimate!!Percent speak English less than very...,Margin of Error!!Percent speak English less th...,Estimate!!Percent speak English less than very...,Margin of Error!!Percent speak English less th...,Estimate!!Percent speak English less than very...,Margin of Error!!Percent speak English less th...


#### First Cleaning

In [None]:
# Apply the generic first cleaning pass (helper defined outside this section).
# Judging by the output, the result uses the lower-cased descriptive labels as
# column names and has a `district` column — confirm details in the helper.
language_spoken = first_clean(language_spoken)
language_spoken.head(1)

Unnamed: 0,district,estimate!!total!!population 5 years and over,estimate!!total!!population 5 years and over!!speak only english,estimate!!total!!population 5 years and over!!speak a language other than english,estimate!!total!!population 5 years and over!!speak a language other than english!!spanish,estimate!!total!!population 5 years and over!!speak a language other than english!!spanish!!5 to 17 years old,estimate!!total!!population 5 years and over!!speak a language other than english!!spanish!!18 to 64 years old,estimate!!total!!population 5 years and over!!speak a language other than english!!spanish!!65 years old and over,estimate!!total!!population 5 years and over!!speak a language other than english!!other indo-european languages,estimate!!total!!population 5 years and over!!speak a language other than english!!other indo-european languages!!5 to 17 years old,...,"estimate!!percent speak english less than very well""!!percent of specified language speakers!!population 5 years and over!!speak a language other than english!!asian and pacific island languages!!65 years old and over""","estimate!!percent speak english less than very well""!!percent of specified language speakers!!population 5 years and over!!speak a language other than english!!other languages""","estimate!!percent speak english less than very well""!!percent of specified language speakers!!population 5 years and over!!speak a language other than english!!other languages!!5 to 17 years old""","estimate!!percent speak english less than very well""!!percent of specified language speakers!!population 5 years and over!!speak a language other than english!!other languages!!18 to 64 years old""","estimate!!percent speak english less than very well""!!percent of specified language speakers!!population 5 years and over!!speak a language other than english!!other languages!!65 years old and over""","estimate!!percent speak english less than very well""!!percent of specified language 
speakers!!citizens 18 years and over!!all citizens 18 years old and over""","estimate!!percent speak english less than very well""!!percent of specified language speakers!!citizens 18 years and over!!all citizens 18 years old and over!!speak only english""","estimate!!percent speak english less than very well""!!percent of specified language speakers!!citizens 18 years and over!!all citizens 18 years old and over!!speak a language other than english""","estimate!!percent speak english less than very well""!!percent of specified language speakers!!citizens 18 years and over!!all citizens 18 years old and over!!speak a language other than english!!spanish""","estimate!!percent speak english less than very well""!!percent of specified language speakers!!citizens 18 years and over!!all citizens 18 years old and over!!speak a language other than english!!other languages"""
1,"State House District 1 (2018), Alaska",15305,13247,2058,833,75,744,14,300,0,...,50.0,7.4,0.0,1.6,22.2,3.8,(X),30.7,30.4,30.9


#### DataFrame dependent Cleaning

In [None]:
# Keep only the district label and the percentage of the population (5 years
# and over) that speaks a language other than English.
columns = ["district", "estimate!!percent!!population 5 years and over!!speak a language other than english"]
# .copy() detaches the two-column selection from the wide frame, so the later
# rename and cleaning steps modify an independent DataFrame rather than a
# slice (the pattern that triggers pandas' SettingWithCopyWarning).
language_spoken = language_spoken[columns].copy()
language_spoken.head()

Unnamed: 0,district,estimate!!percent!!population 5 years and over!!speak a language other than english
1,"State House District 1 (2018), Alaska",13.4
2,"State House District 2 (2018), Alaska",13.7
3,"State House District 3 (2018), Alaska",5.8
4,"State House District 4 (2018), Alaska",4.9
5,"State House District 5 (2018), Alaska",14.0


In [None]:
# Replace the long census label with a short feature name (positional: the
# frame has exactly the two columns selected in the previous cell).
language_spoken.columns = ["district", "household language not english"]

#### Final Cleaning and District dropping

In [None]:
# Run the shared census clean-up helper (defined outside this section).
# percentage=False is passed — presumably because the value column already
# holds percentages; confirm against the helper.
language_spoken = clean_census_data(language_spoken, percentage=False)

In [None]:
# Align the census rows with the target data via the shared helper —
# presumably drops districts that do not appear in target_df; verify in helper.
language_spoken = drop_spare_districts(target_df, language_spoken)

#### Reset Index and Save DataFrame

In [None]:
# Reset the index and persist the frame through the shared helper; the string
# argument is presumably used to name the output file — confirm in helper.
language_spoken = reset_and_save(language_spoken, "language_spoken")
# Show the last rows as a quick sanity check of the saved frame.
language_spoken.tail()

Unnamed: 0,district,household language not english,state
3755,56,5.2,wyoming
3756,57,4.7,wyoming
3757,58,5.6,wyoming
3758,59,7.8,wyoming
3759,60,5.9,wyoming


## Food Stamps

In [None]:
# Load the raw 2020 food stamps (SNAP) census table.
# NOTE(review): the truncated warning recorded below this cell looks like
# pandas' DtypeWarning (mixed types within columns) — confirm; if so, consider
# an explicit dtype= or low_memory=False once downstream cleaning is checked.
food_stamps = pd.read_csv("drive/MyDrive/US Elections/Census Data/Food Stamps 2020/food_stamps_2020.csv")
# Row 0 carries the human-readable column descriptions (see output), so a
# single row is enough to inspect them.
food_stamps.head(1)

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,GEO_ID,NAME,S2201_C01_001E,S2201_C01_001M,S2201_C01_002E,S2201_C01_002M,S2201_C01_003E,S2201_C01_003M,S2201_C01_004E,S2201_C01_004M,...,S2201_C06_034E,S2201_C06_034M,S2201_C06_035E,S2201_C06_035M,S2201_C06_036E,S2201_C06_036M,S2201_C06_037E,S2201_C06_037M,S2201_C06_038E,S2201_C06_038M
0,id,Geographic Area Name,Estimate!!Total!!Households,Margin of Error!!Total!!Households,Estimate!!Total!!Households!!With one or more ...,Margin of Error!!Total!!Households!!With one o...,Estimate!!Total!!Households!!No people in the ...,Margin of Error!!Total!!Households!!No people ...,Estimate!!Total!!Households!!Married-couple fa...,Margin of Error!!Total!!Households!!Married-co...,...,Estimate!!Percent households not receiving foo...,Margin of Error!!Percent households not receiv...,Estimate!!Percent households not receiving foo...,Margin of Error!!Percent households not receiv...,Estimate!!Percent households not receiving foo...,Margin of Error!!Percent households not receiv...,Estimate!!Percent households not receiving foo...,Margin of Error!!Percent households not receiv...,Estimate!!Percent households not receiving foo...,Margin of Error!!Percent households not receiv...


#### First Cleaning

In [None]:
# Apply the generic first cleaning pass (helper defined outside this section).
# Judging by the output, the result uses the lower-cased descriptive labels as
# column names and has a `district` column — confirm details in the helper.
food_stamps = first_clean(food_stamps)
food_stamps.head(1)

Unnamed: 0,district,estimate!!total!!households,estimate!!total!!households!!with one or more people in the household 60 years and over,estimate!!total!!households!!no people in the household 60 years and over,estimate!!total!!households!!married-couple family,estimate!!total!!households!!other family:,"estimate!!total!!households!!other family:!!male householder, no spouse present","estimate!!total!!households!!other family:!!female householder, no spouse present",estimate!!total!!households!!nonfamily households,estimate!!total!!households!!with children under 18 years,...,estimate!!percent households not receiving food stamps/snap!!households!!race and hispanic or latino origin of householder!!native hawaiian and other pacific islander alone,estimate!!percent households not receiving food stamps/snap!!households!!race and hispanic or latino origin of householder!!some other race alone,estimate!!percent households not receiving food stamps/snap!!households!!race and hispanic or latino origin of householder!!two or more races,estimate!!percent households not receiving food stamps/snap!!households!!race and hispanic or latino origin of householder!!hispanic or latino origin (of any race),"estimate!!percent households not receiving food stamps/snap!!households!!race and hispanic or latino origin of householder!!white alone, not hispanic or latino",estimate!!percent households not receiving food stamps/snap!!households!!household income in the past 12 months (in 2020 inflation-adjusted dollars)!!median income (dollars),estimate!!percent households not receiving food stamps/snap!!work status!!families,estimate!!percent households not receiving food stamps/snap!!work status!!families!!no workers in past 12 months,estimate!!percent households not receiving food stamps/snap!!work status!!families!!1 worker in past 12 months,estimate!!percent households not receiving food stamps/snap!!work status!!families!!2 or more workers in past 12 months
1,"State House District 1 (2018), Alaska",6746,2574,4172,2503,1420,517,903,2823,2025,...,0.2,1.8,7.0,8.2,70.0,(X),(X),8.5,34.3,57.2


#### DataFrame dependent Cleaning

In [None]:
# Keep only the district label and the percentage of households receiving
# food stamps/SNAP.
# .copy() detaches the two-column selection from the wide frame; without it the
# later cleaning step assigns into a slice and pandas raises
# SettingWithCopyWarning.
food_stamps = food_stamps[["district", "estimate!!percent households receiving food stamps/snap!!households"]].copy()
food_stamps.head()

Unnamed: 0,district,estimate!!percent households receiving food stamps/snap!!households
1,"State House District 1 (2018), Alaska",13.5
2,"State House District 2 (2018), Alaska",5.5
3,"State House District 3 (2018), Alaska",1.9
4,"State House District 4 (2018), Alaska",5.5
5,"State House District 5 (2018), Alaska",3.6


In [None]:
# Replace the long census label with a short feature name (positional: the
# frame has exactly the two columns selected in the previous cell).
food_stamps.columns = ["district", "percent households receiving food stamps"]

#### Final Cleaning and District dropping

In [None]:
# Run the shared census clean-up helper (defined outside this section).
# percentage=False is passed — presumably because the value column already
# holds percentages; confirm against the helper.
# NOTE(review): the SettingWithCopyWarning recorded in this cell's output is
# what pandas emits when the frame passed in is still a slice of another
# DataFrame; an explicit .copy() at selection time avoids it.
food_stamps = clean_census_data(food_stamps, percentage=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  

In [None]:
# Align the census rows with the target data via the shared helper —
# presumably drops districts that do not appear in target_df; verify in helper.
food_stamps = drop_spare_districts(target_df, food_stamps)
# Inspect the last rows; the index still carries the pre-drop row labels here.
food_stamps.tail()

Unnamed: 0,district,percent households receiving food stamps,state
4049,56,7.5,wyoming
4050,57,10.2,wyoming
4051,58,7.1,wyoming
4052,59,9.3,wyoming
4053,60,3.9,wyoming


#### Reset Index and Save DataFrame

In [None]:
# Reset the index and persist the frame through the shared helper; the string
# argument is presumably used to name the output file — confirm in helper.
food_stamps = reset_and_save(food_stamps, "food_stamps")
# Show the last rows as a quick sanity check of the saved frame.
food_stamps.tail()

Unnamed: 0,district,percent households receiving food stamps,state
3755,56,7.5,wyoming
3756,57,10.2,wyoming
3757,58,7.1,wyoming
3758,59,9.3,wyoming
3759,60,3.9,wyoming


## Merge DataFrames

In [None]:
import os

# Folder containing the prepared per-topic census CSV files.
path = "drive/MyDrive/US Elections/Census Data/Final Files"

# Collect the file names that belong to the 2020 data.
# NOTE(review): [1:] discards whichever entry os.listdir happens to return
# first, and the listing order is not guaranteed — confirm which entry is
# meant to be skipped and filter it by name instead.
list_of_dfs = [entry for entry in os.listdir(path)[1:] if "2020" in entry]

# Load the target variable table; `df` starts as an independent copy of it and
# serves as the accumulator for the merge loop in the next cell.
target_df = pd.read_csv("drive/MyDrive/US Elections/data_target_2020.csv")
df = target_df.copy()

In [None]:
# Loop through files to merge them with target DataFrame
%%time
for file in list_of_dfs:
  census = pd.read_csv(path+"/"+file)
  df = pd.merge(left=census, right=df, how="inner", on=["district", "state"])

CPU times: user 194 ms, sys: 8.93 ms, total: 203 ms
Wall time: 225 ms


In [None]:
# Preview the first rows of the merged census + target frame.
df.head()

Unnamed: 0,district,percent households receiving food stamps,state,household language not english,percent uninsured,non-white,black/afroamerican,percent female,under 20,20 to 34,...,service occupations,sales and office occupations,blue collar occupations,less than highschool,highschool,college or associate,bachelor or higher,office,year,target
0,1,13.5,alaska,13.4,9.7,0.337563,0.050943,0.4693,22.88,29.77,...,0.183,0.2563,0.2719,0.0818,0.3062,0.3811,0.2309,state house,2020,0
1,2,5.5,alaska,13.7,7.6,0.305239,0.112018,0.3915,30.45,41.6,...,0.1976,0.2242,0.2321,0.0373,0.3659,0.4154,0.1814,state house,2020,0
2,3,1.9,alaska,5.8,10.2,0.205524,0.038866,0.4584,29.95,24.16,...,0.1598,0.2203,0.2764,0.0817,0.2525,0.4291,0.2367,state house,2020,0
3,4,5.5,alaska,4.9,8.1,0.150672,0.005741,0.484,24.99,19.05,...,0.1808,0.1744,0.2096,0.0491,0.205,0.3406,0.4053,state house,2020,1
4,5,3.6,alaska,14.0,11.8,0.332385,0.044535,0.4534,25.22,27.15,...,0.1769,0.1851,0.1985,0.0403,0.2231,0.4156,0.321,state house,2020,1


In [None]:
# (rows, columns) of the merged frame — a quick check that the inner joins
# kept the expected number of districts.
df.shape

(3760, 40)

In [None]:
# List the distinct states present in the target data.
target_df.state.unique()

array(['alaska', 'arkansas', 'california', 'colorado', 'connecticut',
       'delaware', 'florida', 'georgia', 'hawaii', 'illinois', 'indiana',
       'iowa', 'kansas', 'kentucky', 'maine', 'massachusetts', 'michigan',
       'minnesota', 'missouri', 'montana', 'nevada', 'new mexico',
       'new york', 'north carolina', 'ohio', 'oklahoma', 'oregon',
       'pennsylvania', 'rhode island', 'south carolina', 'tennessee',
       'texas', 'utah', 'vermont', 'west virginia', 'wisconsin',
       'wyoming'], dtype=object)

In [None]:
# Write the merged frame to the final 2020 dataset CSV; index=False leaves the
# row index out of the file.
df.to_csv("drive/MyDrive/US Elections/data_final_2020.csv", index=False)