In [None]:
# SE_A00001_001:     Total Population (New col name county_population)
# SE_A00002_001:     Total Population   (drop)
# SE_A00002_002:     Population Density (Per Sq. Mile) (New col name county_pop_density)
# SE_A00002_003:     Area (Land) (drop)
# SE_A00003_001:     Area Total:  (New col name county_area_total)
# SE_A00003_002:     Area Total: Area (Land) (New col name county_area_land)
# SE_A00003_003:     Area Total: Area (Water)  (New col name county_area_water)


# Data Link: https://www.socialexplorer.com/explore-tables

In [20]:
# All data is in the Data/CountyPopulation folder and each csv is Pop5yr1Est<XX>.<YY>.csv
# where XX is the start year (2019 -> 19) and YY is the end year (2023 -> 23). We use the end year as the year column.


"""Function
- Iterate thru the csv files in the CountyPopulation folder
- Read the csv file
- rename Geo_STUSAB to state_code
- rename Geo_STATE to state_fips
- rename Geo_COUNTY to county_fips
- rename Geo_QName to county_state_name
- rename SE_A00001_001 to   county_population
same for other columns below
    # SE_A00002_002:     Population Density (Per Sq. Mile) (New col name county_pop_density)
    # SE_A00003_001:     Area Total:  (New col name county_area_total)
    # SE_A00003_002:     Area Total: Area (Land) (New col name county_area_land)
    # SE_A00003_003:     Area Total: Area (Water)  (New col name county_area_water)

- drop the columns that are not needed
- add the year column based on the file name (2000 + the two-digit end year)
- combine all the dataframes into one
- save the dataframe as a csv file in the Data folder
"""


import os
import pandas as pd

# Directory containing the per-period county population CSV exports
DATA_DIR = "Data/CountyPopulation"
# Destination of the combined, cleaned table
OUTPUT_FILE = "Data/county_population_processed.csv"

# Source column name -> output column name. Only these columns are kept.
rename_cols = {
    "Geo_STUSAB": "state_code",
    "Geo_STATE": "state_fips",
    "Geo_COUNTY": "county_fips",
    "Geo_QName": "county_state_name",
    "SE_A00001_001": "county_population",
    "SE_A00002_002": "county_pop_density",
    "SE_A00003_001": "county_area_total",
    "SE_A00003_002": "county_area_land",
    "SE_A00003_003": "county_area_water"
}


def _year_from_filename(file_name):
    """Return the four-digit end year encoded in a ``<prefix><XX>.<YY>.csv`` name.

    ``YY`` is the text between the last two dots of the file name
    (``Pop5yr1Est19.23.csv`` -> ``23`` -> ``2023``).

    Raises:
        ValueError: if the name has no ``.<YY>`` part or ``YY`` is not an integer.
    """
    stem = os.path.splitext(file_name)[0]
    _, dot, year_suffix = stem.rpartition(".")
    try:
        if not dot:
            raise ValueError("no '.<YY>' part before the extension")
        return 2000 + int(year_suffix)
    except ValueError:
        raise ValueError(
            f"Cannot extract end year from file name {file_name!r}; "
            "expected a name like 'Pop5yr1Est19.23.csv'"
        ) from None


def _load_county_file(file_path, year):
    """Read one CSV and return the renamed columns of interest plus a ``year`` column.

    Raises:
        KeyError: if any source column listed in ``rename_cols`` is absent.
    """
    # NOTE(review): state_fips / county_fips are read with inferred dtypes. If the
    # source stores zero-padded codes they will be parsed as integers and lose the
    # padding -- confirm whether downstream joins need them as strings.
    renamed = pd.read_csv(file_path, encoding="ISO-8859-1").rename(columns=rename_cols)

    missing = [src for src, dst in rename_cols.items() if dst not in renamed.columns]
    if missing:
        raise KeyError(f"{file_path} is missing expected columns: {missing}")

    # Keep only the renamed columns, then append the year as the last column
    return renamed[list(rename_cols.values())].assign(year=year)


# os.listdir returns names in arbitrary order; sort so the combined CSV is
# reproducible from run to run. Non-csv files (e.g. txt) are ignored.
csv_files = sorted(name for name in os.listdir(DATA_DIR) if name.endswith(".csv"))

# List to store the per-file dataframes
df_list = []

for file_name in csv_files:
    # Use the directory entry itself as the file name instead of re-splitting the
    # joined path, which depended on the path separator and on DATA_DIR's depth.
    file_path = os.path.join(DATA_DIR, file_name)
    print(f"Processing {file_name}")

    df_list.append(_load_county_file(file_path, _year_from_filename(file_name)))
    print(f"Processed {file_path}")

if not df_list:
    # pd.concat would raise a generic "No objects to concatenate" here
    raise ValueError(f"No .csv files found in {DATA_DIR}")

# Combine all dataframes
print("Combining dataframes...")
final_df = pd.concat(df_list, ignore_index=True)

# Capitalize state_code
final_df["state_code"] = final_df["state_code"].str.upper()

# Save to CSV
final_df.to_csv(OUTPUT_FILE, index=False)
print(f"Processed county population data saved to {OUTPUT_FILE}")


Processing Pop5yr1Est19.23.csv
Processed Data/CountyPopulation/Pop5yr1Est19.23.csv
Processing Pop5yr1Est12.16.csv
Processed Data/CountyPopulation/Pop5yr1Est12.16.csv
Processing Pop5yr1Est15.19.csv
Processed Data/CountyPopulation/Pop5yr1Est15.19.csv
Processing Pop5yr1Est17.21.csv
Processed Data/CountyPopulation/Pop5yr1Est17.21.csv
Processing Pop5yr1Est13.17.csv
Processed Data/CountyPopulation/Pop5yr1Est13.17.csv
Processing Pop5yr1Est18.22.csv
Processed Data/CountyPopulation/Pop5yr1Est18.22.csv
Processing Pop5yr1Est14.18.csv
Processed Data/CountyPopulation/Pop5yr1Est14.18.csv
Processing Pop5yr1Est16.20.csv
Processed Data/CountyPopulation/Pop5yr1Est16.20.csv
Combining dataframes...
Processed county population data saved to Data/county_population_processed.csv


In [22]:
# Print each column name of the combined frame on its own line.
# (The bare `final_df` that used to precede this loop was not the cell's last
# expression, so it displayed nothing and has been removed.)
for col in final_df.columns:
    print(col)

state_code
state_fips
county_fips
county_state_name
county_population
county_pop_density
county_area_total
county_area_land
county_area_water
year


In [18]:
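# NOTE: scratch cell -- a single-file (Pop5yr1Est19.23.csv) version of the processing
# cell above, left commented out and not needed to reproduce the output.
# import os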
# import pandas as pd

# # Directory containing county population data
# DATA_DIR = "Data/CountyPopulation"
# OUTPUT_FILE = "Data/county_population_test.csv"

# # Columns to rename
# rename_cols = {
#     "Geo_STUSAB": "state_code",
#     "Geo_STATE": "state_fips",
#     "Geo_COUNTY": "county_fips",
#     "Geo_QName": "county_state_name",
#     "SE_A00001_001": "county_population",
#     "SE_A00002_002": "county_pop_density",
#     "SE_A00003_001": "county_area_total",
#     "SE_A00003_002": "county_area_land",
#     "SE_A00003_003": "county_area_water"
# }

# # File to process
# file_name = "Pop5yr1Est19.23.csv"
# file_path = os.path.join(DATA_DIR, file_name)


# print(f"Processing {file_path}")

# # Extract year from filename

# year_suffix = file_name.split(".")[1]
# year = 2000 + int(year_suffix)  # Convert to full year format

# # Read CSV
# df = pd.read_csv(file_path, encoding="ISO-8859-1")

# # Rename columns
# df.rename(columns=rename_cols, inplace=True)



#     # Drop unnecessary columns (keep only renamed columns)
# df = df[list(rename_cols.values())]


# # Add year column
# df["year"] = year


# df

# #     # Save to CSV
# #     df.to_csv(OUTPUT_FILE, index=False)
# #     print(f"Processed county population data saved to {OUTPUT_FILE}")
# # else:
# #     print(f"File {file_path} not found.")