In [1]:
import pandas as pd
import os
import json
import sqlite3
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3


In [8]:
import requests
from bs4 import BeautifulSoup
import os
from urllib.parse import urljoin, unquote

# County Health Rankings page for Georgia that links to the yearly Excel workbooks
url = "https://www.countyhealthrankings.org/explore-health-rankings/georgia/data-and-resources"

# Folder the workbooks are saved into; created once, before any download starts
download_dir = './County_Health_Rankings_Data/'
os.makedirs(download_dir, exist_ok=True)

# Fetch the listing page. Fail loudly on an HTTP error instead of parsing an
# error page that contains no workbook links; the timeout keeps a stalled
# connection from hanging the cell forever.
response = requests.get(url, timeout=60)
response.raise_for_status()
soup = BeautifulSoup(response.content, 'html.parser')

# Download every Excel workbook linked from the page
for link in soup.find_all('a'):
    href = link.get('href')
    if not href or not href.endswith(('.xlsx', '.xls')):
        continue

    # hrefs may be relative, so resolve them against the page URL
    full_url = urljoin(url, href)
    file_response = requests.get(full_url, timeout=60)
    if not file_response.ok:
        # Never write an HTTP error body to disk under an Excel file name
        print(f"Skipped {full_url}: HTTP {file_response.status_code}")
        continue

    # Decode percent-escapes (e.g. %20) so the saved name is human-readable
    file_name = unquote(os.path.basename(full_url))

    with open(os.path.join(download_dir, file_name), 'wb') as file:
        file.write(file_response.content)

    print(f"Downloaded file: {file_name}")


Downloaded file: 2023 County Health Rankings Georgia Data - v3.xlsx
Downloaded file: 2022 County Health Rankings Georgia Data - v2.xlsx
Downloaded file: 2021 County Health Rankings Georgia Data - v1.xlsx
Downloaded file: 2020 County Health Rankings Georgia Data - v1_0.xlsx
Downloaded file: 2019 County Health Rankings Georgia Data - v1_0.xls
Downloaded file: 2018 County Health Rankings Georgia Data - v3.xls
Downloaded file: 2017 County Health Rankings Georgia Data - v2.xls
Downloaded file: 2016 County Health Rankings Georgia Data - v3.xls
Downloaded file: 2015 County Health Rankings Georgia Data - v3.xls
Downloaded file: 2014 County Health Rankings Georgia Data - v6.xls
Downloaded file: 2013 County Health Ranking Georgia Data - v1_0.xls
Downloaded file: 2012 County Health Ranking Georgia Data - v4.xls
Downloaded file: 2011 County Health Ranking Georgia Data - v4.xls
Downloaded file: 2010 County Health Ranking Georgia Data - v2.xls


In [2]:
# Measures copied from the "Ranked Measure Data" sheet
_RANKED_MEASURES = (
    'Average Number of Mentally Unhealthy Days', 'Average Number of Physically Unhealthy Days',
    '% Adults Reporting Currently Smoking', '% Adults with Obesity', '% Excessive Drinking',
    'Teen Birth Rate', '% Uninsured', 'Preventable Hospitalization Rate', '% Unemployed',
    'Injury Death Rate', '% Children in Poverty', 'Violent Crime Rate', '% Smokers',
)

# Measures copied from the "Additional Measure Data" sheet
_ADDITIONAL_MEASURES = (
    'Drug Overdose Mortality Rate', 'Suicide Rate (Age-Adjusted)', '% rural', 'Median Household Income',
)


def _collect_measures(sheet, wanted_cols, data_dict):
    """Copy the wanted measure columns of one sheet into ``data_dict``, keyed by county."""
    col_names = sheet.columns.to_list()
    n_cols = len(col_names)

    def ci_column(position):
        # Name of the column at ``position`` when it exists and looks like a
        # confidence-interval column, otherwise None.
        if position < n_cols and "CI" in str(col_names[position]):
            return col_names[position]
        return None

    # Resolve (measure, CI-low column, CI-high column) once per sheet rather
    # than once per row. Columns before index 3 are never treated as measures
    # (presumably identifier columns such as FIPS/State/County -- confirm).
    # Every remaining column is examined, including the last one, and the
    # look-ahead for CI columns is bounds-checked.
    plan = []
    for idx in range(3, n_cols):
        name = col_names[idx]
        if name in wanted_cols:
            plan.append((name, ci_column(idx + 1), ci_column(idx + 2)))

    for _, row in sheet.iterrows():
        county = row['County']
        if pd.isna(county):
            continue

        county_metrics = data_dict.setdefault(county, {})
        for name, low_col, high_col in plan:
            entry = {'value': row[name]}
            # Decide on the presence of the CI columns, not on the truthiness
            # of their values: a bound of exactly 0 is a legitimate value.
            if low_col is not None and high_col is not None:
                entry['95% CI - Low'] = row[low_col]
                entry['95% CI - High'] = row[high_col]
            county_metrics[name] = entry


def process_data(data_dict, full_path):
    """Load the selected health measures of one workbook into ``data_dict``.

    Reads the "Ranked Measure Data" and "Additional Measure Data" sheets of
    the Excel file at ``full_path`` (with ``header=1``) and, for every row
    with a non-missing 'County', stores each selected measure as
    ``data_dict[county][measure] = {'value': ...}``. When the two columns
    immediately after the measure both have "CI" in their names, the entry
    also gets the keys '95% CI - Low' and '95% CI - High'.

    Parameters:
        data_dict: dict that is updated in place.
        full_path: path of the Excel workbook to read.

    Returns:
        ``data_dict`` on success. If either sheet cannot be read, the error
        is printed, ``data_dict`` is left untouched and None is returned.
    """
    try:
        ranked_measure_data = pd.read_excel(full_path, sheet_name="Ranked Measure Data", header=1)
        additional_measure_data = pd.read_excel(full_path, sheet_name="Additional Measure Data", header=1)
    except Exception as e:
        print(f"Error processing file: {full_path}")
        print(e)
        return None

    # 1. Extract information from the Ranked Measure Data
    _collect_measures(ranked_measure_data, _RANKED_MEASURES, data_dict)

    # 2. Extract information from the Additional Measure Data
    _collect_measures(additional_measure_data, _ADDITIONAL_MEASURES, data_dict)

    return data_dict

                
            

            

In [3]:
def create_database(db_name):
    """Create the Counties, Years and Metrics tables in the SQLite file ``db_name``.

    Each table is created with ``CREATE TABLE IF NOT EXISTS``, so calling this
    on a database that already has the tables leaves them as they are.

    Parameters:
        db_name: path passed to ``sqlite3.connect``.
    """
    schema_statements = (
        '''
            CREATE TABLE IF NOT EXISTS Counties (
                id INTEGER PRIMARY KEY,
                name TEXT NOT NULL UNIQUE
            )
        ''',
        '''
            CREATE TABLE IF NOT EXISTS Years (
                id INTEGER PRIMARY KEY,
                year INTEGER NOT NULL UNIQUE
            )
        ''',
        '''
            CREATE TABLE IF NOT EXISTS Metrics (
                id INTEGER PRIMARY KEY,
                county_id INTEGER,
                year_id INTEGER,
                variable TEXT NOT NULL,
                average REAL,
                ci_low REAL,
                ci_high REAL,
                FOREIGN KEY (county_id) REFERENCES Counties(id),
                FOREIGN KEY (year_id) REFERENCES Years(id)
            )
        ''',
    )

    conn = sqlite3.connect(db_name)
    try:
        c = conn.cursor()
        for statement in schema_statements:
            c.execute(statement)
        conn.commit()
    finally:
        # Release the file handle even when a statement fails
        conn.close()

In [4]:
def insert_data(db_name, data_dict, year):
    """Store one year's worth of county metrics in the SQLite file ``db_name``.

    Every county in ``data_dict`` is added to Counties and ``year`` is added
    to Years (both with INSERT OR IGNORE, so existing rows are reused). One
    Metrics row is then inserted per (county, variable) pair.

    Parameters:
        db_name: path passed to ``sqlite3.connect``; the Counties, Years and
            Metrics tables must already exist.
        data_dict: mapping ``county -> {variable -> values}`` where ``values``
            is a dict read with the keys 'value', '95% CI - Low' and
            '95% CI - High'; missing keys are stored as NULL.
        year: year the metrics belong to.
    """
    conn = sqlite3.connect(db_name)
    try:
        c = conn.cursor()

        # The year is the same for every county, so register it once -- and
        # only when there is data, so an empty dict adds no Years row.
        if data_dict:
            c.execute("INSERT OR IGNORE INTO Years (year) VALUES (?)", (year,))

        c.executemany(
            "INSERT OR IGNORE INTO Counties (name) VALUES (?)",
            [(county,) for county in data_dict],
        )

        # Insert metrics into Metrics table
        metric_rows = [
            (county, year, variable,
             values.get('value'), values.get('95% CI - Low'), values.get('95% CI - High'))
            for county, metrics in data_dict.items()
            for variable, values in metrics.items()
        ]
        c.executemany('''
                      INSERT INTO Metrics (county_id, year_id, variable, average, ci_low, ci_high)
                      VALUES ((SELECT id FROM Counties WHERE name = ?),
                               (SELECT id FROM Years WHERE year = ?),
                                ?, ?, ?, ?)
                      ''', metric_rows)

        conn.commit()
    finally:
        # Closing without a commit discards the partial batch, so a failure
        # leaves no half-inserted year behind and no open connection.
        conn.close()
        

In [5]:
def export_to_csv(db_name, output_file):
    """Write every Metrics row, joined with its county name and year, to a CSV file.

    The CSV has the columns County, Year, Variable, Average, CI_Low and
    CI_High and no index column.

    Parameters:
        db_name: path of the SQLite database to read.
        output_file: destination passed to ``DataFrame.to_csv``.
    """
    query = """
    SELECT c.name as  County, y.year as Year, m.variable as Variable, m.average as Average, m.ci_low as CI_Low, m.ci_high as CI_High
    FROM Metrics m
    JOIN Counties c ON m.county_id = c.id
    JOIN Years y on m.year_id = y.id
    """

    conn = sqlite3.connect(db_name)
    try:
        # Load the data into a pandas DataFrame
        data = pd.read_sql(query, conn)
    finally:
        # The frame is fully materialised, so the connection can be released
        # here -- and it is released even when the query fails.
        conn.close()

    # Save the data to a CSV file
    data.to_csv(output_file, index=False)

In [6]:
if __name__ == "__main__":
    # Build a temporary SQLite database from every downloaded workbook, export
    # it as a single CSV, then delete the database again.
    folder_name = "County_Health_Rankings_Data"
    db_name = 'Health_Rankings_2.db'
    csv_name = 'full_database_export_2.csv'

    # A database left behind by an interrupted run would be reused by
    # CREATE TABLE IF NOT EXISTS and every Metrics row would be duplicated.
    if os.path.exists(db_name):
        os.remove(db_name)
    create_database(db_name)

    # os.listdir order is unspecified; sorting makes row order reproducible
    for filename in sorted(os.listdir(folder_name)):
        # File names are expected to start with the year, e.g. "2023 County Health ..."
        name_parts = filename.split()
        year_token = name_parts[0] if name_parts else ""
        if not (filename.lower().endswith(('.xls', '.xlsx')) and year_token.isdigit()):
            print(f"Skipping unexpected file: {filename}")
            continue

        full_path = os.path.join(folder_name, filename)
        data_dict = {}
        process_data(data_dict, full_path)
        if not data_dict:
            # Nothing was extracted (process_data prints the reason on a read error)
            continue

        # Insert data into database; the Years column is INTEGER, so pass an int
        insert_data(db_name, data_dict, int(year_token))

    # Export database once, after every workbook has been loaded
    export_to_csv(db_name, csv_name)

    os.remove(db_name)

Error processing file: County_Health_Rankings_Data/2010 County Health Ranking Georgia Data - v2.xls
Worksheet named 'Ranked Measure Data' not found
