# Collect Census Data from US Census API
This notebook loads yearly median house price data from the US Census API. To get an API key, sign up using [this link](https://api.census.gov/data/key_signup.html). The notebook also reads an external file, `state_and_county_fips_master.csv`, which is sourced from this public GitHub [repository](https://github.com/kjhealy/fips-codes).

In [3]:
import requests
import time
import os
import pandas as pd

def fetch_census_data(api_key, year, state_code, county_codes, timeout=30):
    """Fetch the B25077_001E value for every county in one state.

    Queries the ACS 5-year endpoint for the given year and asks for the
    ``NAME`` and ``B25077_001E`` variables of all counties (``county:*``)
    inside ``state_code``.

    Args:
        api_key: Census API key sent as the ``key`` query parameter.
        year: Year inserted into the endpoint URL.
        state_code: State FIPS code used in the ``in=state:...`` filter.
        county_codes: Accepted for interface compatibility only. The request
            always asks for all counties in the state, so this argument does
            not narrow the result.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the caller indefinitely.

    Returns:
        The decoded JSON body when the response status is 200, otherwise
        ``None``.

    Raises:
        requests.exceptions.RequestException: On connection failures or when
            ``timeout`` elapses.
    """
    base_url = f"https://api.census.gov/data/{year}/acs/acs5"
    params = {
        'get': 'NAME,B25077_001E',
        'for': 'county:*',
        'in': f'state:{state_code}',
        'key': api_key,
    }
    response = requests.get(base_url, params=params, timeout=timeout)
    if response.status_code != 200:
        # Any non-200 status is reported to the caller as "no data".
        return None
    return response.json()


In [8]:
# Load the FIPS master table; the code below reads its 'fips' column to
# derive separate state and county codes.
fips_df = pd.read_csv('census_data/state_and_county_fips_master.csv')

def split_fips(row):
    """Split a row's FIPS code into its state and county parts.

    The value under ``row['fips']`` is converted to text and left-padded
    with zeros to five characters. The first two characters become
    ``state_fips`` and the remaining ones become ``county_fips``.

    Returns:
        A ``pd.Series`` indexed by ``'state_fips'`` and ``'county_fips'``.
    """
    padded = str(row['fips']).zfill(5)
    state_part, county_part = padded[:2], padded[2:]
    return pd.Series(
        [state_part, county_part],
        index=['state_fips', 'county_fips'],
    )

# Apply split_fips to every row and store the results as two new columns,
# 'state_fips' and 'county_fips'.
fips_df[['state_fips', 'county_fips']] = fips_df.apply(split_fips, axis=1)


In [None]:
def _append_rows_to_csv(rows, path):
    """Append rows to the CSV at path, writing the header only for a new file."""
    write_header = not os.path.isfile(path)
    frame = pd.DataFrame(rows, columns=['Name', 'MedianHouseValue', 'State', 'County', 'year'])
    frame.to_csv(path, mode='a', index=False, header=write_header)


# Map each state FIPS code to the list of its county FIPS codes.
counties_by_state = fips_df.groupby('state_fips')['county_fips'].apply(list).to_dict()

# Read the key from the CENSUS_API_KEY environment variable when it is set,
# so a real key does not have to be committed; otherwise keep the placeholder.
api_key = os.environ.get('CENSUS_API_KEY', 'fake_key')
years = range(2010, 2024)

all_data = []
filename = 'census_data/fake.csv'  # save census data into this csv
flush_threshold = 200  # write buffered rows to disk once this many accumulate

for year in years:
    for state_code, county_codes in counties_by_state.items():
        # fetch_census_data requests every county in the state ('county:*'),
        # so a single call per state is enough. Calling it once per chunk of
        # counties returned the same rows each time and wrote duplicates.
        data = fetch_census_data(api_key, year, state_code, county_codes)
        if data:
            # data[0] is skipped as the header row; tag each record with its year.
            all_data.extend(record + [year] for record in data[1:])
            if len(all_data) >= flush_threshold:
                print(data[1])  # progress indicator: first record of this response
                _append_rows_to_csv(all_data, filename)
                all_data = []
        # Pause between requests to avoid hammering the API.
        time.sleep(0.5)

# Write whatever is left in the buffer after the last request.
if all_data:
    _append_rows_to_csv(all_data, filename)