In [12]:
import pandas as pd
import os


In [39]:
filename = "2023 County Health Rankings Georgia Data - v3.xlsx"

if os.path.exists(filename):
    print(f"The file {filename} exists and is accessible.")
else:
    print(f"The file {filename} does not exist or is not accessible.")

The file 2023 County Health Rankings Georgia Data - v3.xlsx exists and is accessible.


In [40]:
ranked_measure_data = pd.read_excel(filename, sheet_name="Ranked Measure Data", header=1)
additional_measure_data = pd.read_excel(filename, sheet_name="Additional Measure Data", engine="openpyxl")

In [41]:
print(ranked_measure_data.head())
print(additional_measure_data.head())

    FIPS    State    County Unreliable    Deaths  \
0  13000  Georgia       NaN        NaN  143179.0   
1  13001  Georgia   Appling        NaN     379.0   
2  13003  Georgia  Atkinson        NaN     154.0   
3  13005  Georgia     Bacon        NaN     241.0   
4  13007  Georgia     Baker          x       NaN   

   Years of Potential Life Lost Rate  95% CI - Low  95% CI - High   Z-Score  \
0                        8016.730946   7955.977709    8077.484183       NaN   
1                       11113.699673   9345.538818   12881.860528  0.501568   
2                       11429.804405   8738.789750   14120.819061  0.644947   
3                       12649.824162  10392.750491   14906.897832  1.198325   
4                                NaN           NaN            NaN  0.000000   

   YPLL Rate (AIAN)  ...  % Drive Alone (Hispanic) 95% CI - Low  \
0       2896.759045  ...                              65.483151   
1               NaN  ...                                    NaN   
2          

In [42]:
print(ranked_measure_data.columns.tolist())

['FIPS', 'State', 'County', 'Unreliable', 'Deaths', 'Years of Potential Life Lost Rate', '95% CI - Low', '95% CI - High', 'Z-Score', 'YPLL Rate (AIAN)', 'YPLL Rate (AIAN) 95% CI - Low', 'YPLL Rate (AIAN) 95% CI - High', 'YPLL Rate (AIAN) Unreliable', 'YPLL Rate (Asian)', 'YPLL Rate (Asian) 95% CI - Low', 'YPLL Rate (Asian) 95% CI - High', 'YPLL Rate (Asian) Unreliable', 'YPLL Rate (Black)', 'YPLL Rate (Black) 95% CI - Low', 'YPLL Rate (Black) 95% CI - High', 'YPLL Rate (Black) Unreliable', 'YPLL Rate (Hispanic)', 'YPLL Rate (Hispanic) 95% CI - Low', 'YPLL Rate (Hispanic) 95% CI - High', 'YPLL Rate (Hispanic) Unreliable', 'YPLL Rate (White)', 'YPLL Rate (White) 95% CI - Low', 'YPLL Rate (White) 95% CI - High', 'YPLL Rate (White) Unreliable', '% Fair or Poor Health', '95% CI - Low.1', '95% CI - High.1', 'Z-Score.1', 'Average Number of Physically Unhealthy Days', '95% CI - Low.2', '95% CI - High.2', 'Z-Score.2', 'Average Number of Mentally Unhealthy Days', '95% CI - Low.3', '95% CI - High

In [43]:
# Initialize a dictionary to store the data
data_dict = {}

# Extract correct column names
column_names = ranked_measure_data.columns.to_list()

# Iterate over each row to populate the dictionary
for i, row in ranked_measure_data.iterrows():
    if i < 2:  # Skip the metadata rows
        continue
    
    # Extract the county name
    county = row['County']  # Extract using the correct column name
    if pd.isna(county):  # Skip rows where county name is NaN
        continue
    
    # Initialize a dictionary for the county
    data_dict[county] = {}
    
    # Iterate over each variable and extract the valid average value and its corresponding 95% confidence interval
    variable = None
    for idx in range(3, len(column_names) - 1):  # Start iterating from the 4th column and leave room for CI columns
        col = column_names[idx]
        value = row[col]
        
        # Check if the column is a valid variable with 'Rate' or 'Ratio' and has both 95% CI columns
        if ('Rate' in col or 'Ratio' in col) and '95% CI' in column_names[idx + 1] and '95% CI' in column_names[idx + 2]:
            variable = col
            data_dict[county][variable] = {'average': value,
                                           '95% CI - Low': row[column_names[idx + 1]],
                                           '95% CI - High': row[column_names[idx + 2]]}


In [44]:
# Print the dictionary for a couple of counties to preview the data structure
for county in list(data_dict.keys())[:2]:
    print(county)
    print(data_dict[county])

Atkinson
{'Years of Potential Life Lost Rate': {'average': 11429.804405, '95% CI - Low': 8738.7897501, '95% CI - High': 14120.819061}, 'YPLL Rate (AIAN)': {'average': nan, '95% CI - Low': nan, '95% CI - High': nan}, 'YPLL Rate (Asian)': {'average': nan, '95% CI - Low': nan, '95% CI - High': nan}, 'YPLL Rate (Black)': {'average': nan, '95% CI - Low': nan, '95% CI - High': nan}, 'YPLL Rate (Hispanic)': {'average': nan, '95% CI - Low': nan, '95% CI - High': nan}, 'YPLL Rate (White)': {'average': nan, '95% CI - Low': nan, '95% CI - High': nan}, 'Teen Birth Rate': {'average': 56.640625, '95% CI - Low': 46.333082986, '95% CI - High': 66.948167014}, 'Teen Birth Rate (AIAN)': {'average': nan, '95% CI - Low': nan, '95% CI - High': nan}, 'Teen Birth Rate (Asian)': {'average': nan, '95% CI - Low': nan, '95% CI - High': nan}, 'Teen Birth Rate (Black)': {'average': 45.226130653, '95% CI - Low': 26.803871308, '95% CI - High': 71.476784592}, 'Teen Birth Rate (Hispanic)': {'average': 44.709388972, '95