In [7]:
import pandas as pd
import os
import json
import sqlite3
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


In [8]:
import requests
from bs4 import BeautifulSoup
import os
from urllib.parse import urljoin, unquote

# Landing page that links to the Georgia County Health Rankings Excel workbooks
url = "https://www.countyhealthrankings.org/explore-health-rankings/georgia/data-and-resources"

# Local folder that receives the downloaded workbooks
download_dir = './County_Health_Rankings_Data/'

# Fetch the landing page. A timeout keeps the cell from hanging forever, and
# raise_for_status() stops us from scraping links out of an HTTP error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# Ensure the directory exists (done once instead of once per link)
os.makedirs(download_dir, exist_ok=True)

# Finding and downloading Excel files
for link in soup.find_all('a'):
    href = link.get('href')
    # Skip anchors without an href and anything that is not an Excel workbook
    if not href or not href.endswith(('.xlsx', '.xls')):
        continue

    # Links may be relative, so resolve them against the page URL
    full_url = urljoin(url, href)
    file_response = requests.get(full_url, timeout=60)
    if not file_response.ok:
        # Do not write an error body to disk under a workbook file name
        print(f"Skipping {full_url}: HTTP {file_response.status_code}")
        continue

    # Decode percent-escapes BEFORE taking the basename so an encoded '/'
    # in the link cannot smuggle directory components into the file name.
    file_name = os.path.basename(unquote(full_url))

    # Save the content in the specified directory
    with open(os.path.join(download_dir, file_name), 'wb') as file:
        file.write(file_response.content)

    print(f"Downloaded file: {file_name}")


Downloaded file: 2023 County Health Rankings Georgia Data - v3.xlsx
Downloaded file: 2022 County Health Rankings Georgia Data - v2.xlsx
Downloaded file: 2021 County Health Rankings Georgia Data - v1.xlsx
Downloaded file: 2020 County Health Rankings Georgia Data - v1_0.xlsx
Downloaded file: 2019 County Health Rankings Georgia Data - v1_0.xls
Downloaded file: 2018 County Health Rankings Georgia Data - v3.xls
Downloaded file: 2017 County Health Rankings Georgia Data - v2.xls
Downloaded file: 2016 County Health Rankings Georgia Data - v3.xls
Downloaded file: 2015 County Health Rankings Georgia Data - v3.xls
Downloaded file: 2014 County Health Rankings Georgia Data - v6.xls
Downloaded file: 2013 County Health Ranking Georgia Data - v1_0.xls
Downloaded file: 2012 County Health Ranking Georgia Data - v4.xls
Downloaded file: 2011 County Health Ranking Georgia Data - v4.xls
Downloaded file: 2010 County Health Ranking Georgia Data - v2.xls


In [18]:
# Measure columns copied into the per-county dictionary. They are matched
# verbatim against the column headers of the "Ranked Measure Data" sheet.
_RELEVANT_MEASURES = (
    'Average Number of Mentally Unhealthy Days',
    'Average Number of Physically Unhealthy Days',
    '% Adults Reporting Currently Smoking',
    '% Adults with Obesity',
    '% Excessive Drinking',
    'Teen Births',
    '% Uninsured',
    'Preventable Hospital Stays',
    '% Unemployed',
    'Injury Death Rate',
    'Drug Overdose Deaths',
    'Suicide Rate (Age-Adjusted)',
    '% Rural',
)


def _measure_columns(columns, relevant_cols):
    """Pair each relevant measure column with its confidence-interval columns.

    Args:
        columns: Ordered list of column headers from the sheet.
        relevant_cols: Headers of the measures to keep.

    Returns:
        A list of ``(measure, ci_low_col, ci_high_col)`` tuples. The CI entries
        are the headers one and two positions after the measure when those
        headers contain "CI"; otherwise they are ``None``.
    """
    triples = []
    for idx, col in enumerate(columns):
        if col not in relevant_cols:
            continue
        # Slicing never raises, so a measure sitting in the last or
        # second-to-last column is handled without an IndexError.
        following = columns[idx + 1:idx + 3]
        low_col = following[0] if len(following) > 0 and "CI" in str(following[0]) else None
        high_col = following[1] if len(following) > 1 and "CI" in str(following[1]) else None
        triples.append((col, low_col, high_col))
    return triples


def process_data(data_dict, full_path):
    """Merge the ranked measures of one workbook into ``data_dict``.

    Reads the "Ranked Measure Data" sheet (header on the second row) and, for
    every row with a non-missing ``County``, stores each relevant measure as
    ``data_dict[county][measure] = {'value': ...}``. When the two columns that
    follow the measure are both CI columns, the entry also carries
    ``'95% CI - Low'`` and ``'95% CI - High'``.

    Args:
        data_dict: Dictionary keyed by county name; updated in place.
        full_path: Path of the Excel workbook to read.

    Returns:
        ``data_dict`` itself. If either sheet cannot be read, the error is
        printed and ``data_dict`` is returned unchanged, so a caller that
        reassigns the result while looping over files keeps its accumulated
        data.
    """
    try:
        ranked_measure_data = pd.read_excel(full_path, sheet_name="Ranked Measure Data", header=1)
        # NOTE(review): this sheet is loaded but not consumed yet. The read is
        # kept so that workbooks lacking it are still rejected as before.
        additional_measure_data = pd.read_excel(full_path, sheet_name="Additional Measure Data", header=1)
    except Exception as e:
        print(f"Error processing file: {full_path}")
        print(e)
        return data_dict

    # 1. Extract information from the Ranked Measure Data

    # The column layout is the same for every row, so resolve it once.
    measure_columns = _measure_columns(ranked_measure_data.columns.to_list(), _RELEVANT_MEASURES)

    for _, row in ranked_measure_data.iterrows():
        county = row['County']
        if pd.isna(county):
            continue

        county_measures = data_dict.setdefault(county, {})

        for col, low_col, high_col in measure_columns:
            entry = {'value': row[col]}
            # Decide on the presence of the CI *columns*, not on the truthiness
            # of the cell values: a lower bound of 0.0 is a valid bound.
            if low_col is not None and high_col is not None:
                entry['95% CI - Low'] = row[low_col]
                entry['95% CI - High'] = row[high_col]
            county_measures[col] = entry

    print(data_dict)
    return data_dict

                
            

            

In [19]:
if __name__ == "__main__":
    # Folder holding the downloaded workbooks
    folder_name = "County_Health_Rankings_Data"

    # TODO: walk os.listdir(folder_name) and feed every workbook through
    # process_data instead of only the 2020 file.

    # Accumulator that process_data fills in place, keyed by county
    data_dict = {}
    full_path = os.path.join(
        folder_name,
        "2020 County Health Rankings Georgia Data - v1_0.xlsx",
    )
    process_data(data_dict, full_path)
    

{'Appling': {'Average Number of Physically Unhealthy Days': {'value': 4.1561778933, '95% CI - Low': 3.9869022184, '95% CI - High': 4.3269248169}, 'Average Number of Mentally Unhealthy Days': {'value': 4.2975712831, '95% CI - Low': 4.1134513348, '95% CI - High': 4.4724933167}, '% Excessive Drinking': {'value': 14.310234745, '95% CI - Low': 13.72369577, '95% CI - High': 14.964168012}, '% Uninsured': {'value': 19.145356662, '95% CI - Low': 17.000675811, '95% CI - High': 21.290037513}, '% Unemployed': {'value': 4.3454223795}, 'Injury Death Rate': {'value': 93.023255814, '95% CI - Low': 74.406597344, '95% CI - High': 114.88296418}}, 'Atkinson': {'Average Number of Physically Unhealthy Days': {'value': 4.570275717, '95% CI - Low': 4.4059996414, '95% CI - High': 4.7332004214}, 'Average Number of Mentally Unhealthy Days': {'value': 4.6174046313, '95% CI - Low': 4.4403051929, '95% CI - High': 4.7914613436}, '% Excessive Drinking': {'value': 13.949385202, '95% CI - Low': 13.386528481, '95% CI - 