# Data preparation
Because the full dataset is so large, we select a specific subset of attributes for each problem we study.
The dataset created in this file will be used for the classification problem. 
It will share many attributes with the other datasets.

In [1]:
import numpy as np
import pandas as pd
import re

In [2]:
# Read every column as a string; the placeholder markers "(X)", "N" and "-"
# are parsed as missing values.
acs = pd.read_csv(
    filepath_or_buffer="acs_alldata.csv",
    dtype=str,
    na_values=["(X)", "N", "-"],
)

In [3]:
# Preview the first five rows of the loaded table.
acs.head(n=5)

Unnamed: 0,GEO_ID,NAME,DP02_0001E,DP02_0001M,DP02_0001PE,DP02_0001PM,DP02_0002E,DP02_0002M,DP02_0002PE,DP02_0002PM,...,DP05_0087PE,DP05_0087PM,DP05_0088E,DP05_0088M,DP05_0088PE,DP05_0088PM,DP05_0089E,DP05_0089M,DP05_0089PE,DP05_0089PM
0,1600000US0100100,"Abanda CDP, Alabama",70,40,70,,14,22,20.0,35.8,...,131,,41,61,31.3,30.6,90,47,68.7,30.6
1,1600000US0100124,"Abbeville city, Alabama",982,113,982,,615,74,62.6,6.8,...,2083,,1079,111,51.8,4.0,1004,93,48.2,4.0
2,1600000US0100460,"Adamsville city, Alabama",1601,148,1601,,1085,87,67.8,6.7,...,3437,,1584,179,46.1,3.6,1853,147,53.9,3.6
3,1600000US0100484,"Addison town, Alabama",339,71,339,,198,55,58.4,10.2,...,579,,309,85,53.4,5.8,270,73,46.6,5.8
4,1600000US0100676,"Akron town, Alabama",110,33,110,,62,29,56.4,16.7,...,260,,120,67,46.2,12.7,140,51,53.8,12.7


In [4]:
# Dimensions of the loaded table as (rows, columns).
acs.shape

(29568, 2086)

## Selecting relevant columns
We now select the specific columns that are relevant to the problems we are looking at.

In [5]:
# Census column code -> readable name for the attributes used in the
# classification dataset. The keys are the codes selected from `acs`; their
# insertion order is the column order after filtering, so do not reorder.
# All readable names use snake_case.
attributes = {
    # household, citizenship, income
    "DP02_0015E": "ave_household_size",
    "DP02_0095PE": "p_not_citizen",
    "DP03_0062E": "median_income",
    # vacancy
    "DP04_0005E": "rental_vacancy_rate",
    # share of housing by units in structure
    "DP04_0007PE": "p_units_1_detached",
    "DP04_0008PE": "p_units_1",
    "DP04_0009PE": "p_units_2",
    "DP04_0010PE": "p_units_34",
    "DP04_0011PE": "p_units_59",
    "DP04_0012PE": "p_units_10",
    "DP04_0013PE": "p_units_20",
    # share of housing by construction period
    "DP04_0017PE": "p_built_2014",
    "DP04_0018PE": "p_built_2010",
    "DP04_0019PE": "p_built_2000",
    "DP04_0020PE": "p_built_1990",
    "DP04_0021PE": "p_built_1980",
    "DP04_0022PE": "p_built_1970",
    "DP04_0023PE": "p_built_1960",
    "DP04_0024PE": "p_built_1950",
    "DP04_0025PE": "p_built_1940",
    "DP04_0026PE": "p_built_1930",
    # rooms, rent, value
    "DP04_0037E": "median_rooms",
    "DP04_0134E": "median_rent",
    # underscore (not hyphen) so the name matches the others and stays usable
    # as an identifier, e.g. in attribute access or `DataFrame.query`
    "DP04_0089E": "median_value",
}

In [6]:
# Select the columns whose codes are keys of `attributes`, in that order.
# `filter` skips any code that is not a column of the table.
acs = acs.filter(items=list(attributes), axis="columns")
acs

NameError: name 'attributes' is not defined

In [5]:
# Remove the last row of the table, addressed by its index label 29567
# (the loaded table has 29568 rows labelled from 0).
# NOTE(review): the label is hard-coded, so running this cell a second time
# raises KeyError because the row is already gone.
acs = acs.drop(index=29567)

In [8]:
# Readable replacements for the census column codes, grouped by code prefix.
col_names = dict(
    # DP02: household size, education, nativity
    DP02_0015E="ave_household_size",
    DP02_0066PE="p_highschool_plus",
    DP02_0067PE="p_bachelors_plus",
    DP02_0087PE="p_native_born",
    # DP03: employment, occupation, income, poverty
    DP03_0009PE="p_unemployed",
    DP03_0027PE="p_occ_business_science_art",
    DP03_0028PE="p_occ_service",
    DP03_0029PE="p_occ_sales_office",
    DP03_0030PE="p_occ_resources_construction",
    DP03_0031PE="p_occ_production_transport",
    DP03_0062E="med_household_income",
    DP03_0063E="mean_household_income",
    DP03_0119PE="p_families_poverty",
    # DP04: tenure and the p_grapi_* bands
    DP04_0046PE="p_household_owner_occupied",
    DP04_0047PE="p_household_renter_occupied",
    DP04_0137PE="p_grapi_15",
    DP04_0138PE="p_grapi_15-19.9",
    DP04_0139PE="p_grapi_20-24.9",
    DP04_0140PE="p_grapi_25-29.9",
    DP04_0141PE="p_grapi_30-34.9",
    DP04_0142PE="p_grapi_35",
    # DP05: population and race / ethnicity shares
    DP05_0063E="total_pop",
    DP05_0064PE="p_race_white",
    DP05_0065PE="p_race_black",
    DP05_0066PE="p_race_am_indian",
    DP05_0067PE="p_race_asian",
    DP05_0068PE="p_race_nat_hawaiian_pac_islander",
    DP05_0069PE="p_race_other",
    DP05_0071PE="p_race_hispanic",
)

In [9]:
# Give the columns readable names.
# `col_names` on its own does not cover the codes selected through
# `attributes` (only DP02_0015E and DP03_0062E appear in both maps), which
# would leave most selected columns with their raw codes. The two maps are
# therefore merged. `col_names` is unpacked last so it wins on shared codes,
# keeping the names this cell already produced (e.g. "med_household_income"
# for DP03_0062E). Codes that are not columns of `acs` are ignored by `rename`.
acs = acs.rename(columns={**attributes, **col_names})

In [10]:
# Write the table to disk; the row index is written as the first column.
acs.to_csv(path_or_buf="acs.csv", index=True)

In [11]:
# Convert the measurement columns to numbers; any value that cannot be parsed
# becomes NaN (errors="coerce").
# Identifier columns are skipped by name, not by position, so the result does
# not depend on how many leading columns the table has: a positional
# `iloc[:, 2:]` slice would coerce NAME to NaN on the unfiltered table and
# would leave the first two attribute columns as strings on the filtered one.
id_columns = ("Unnamed: 0", "GEO_ID", "NAME")
for col in acs.columns:
    if col in id_columns:
        continue
    acs[col] = pd.to_numeric(acs[col], errors="coerce")

In [12]:
# Summary statistics with one row per described column, saved to disk.
acs.describe().T.to_csv(path_or_buf="summary.csv")