In [11]:
# Importing essential libraries for analysis
import pandas as pd        # For data manipulation and analysis
import matplotlib.pyplot as plt  # For plotting and visualization
import datetime as dt      # For handling date and time
import numpy as np         # For numerical computations
import seaborn as sns      # For statistical data visualization
import folium              # For creating interactive maps
import json


In [12]:
# Path to the VSRR county-level overdose CSV export, relative to the notebook's working directory
VSRR_Provisional = "VSRR_Provisional_County-Level_Drug_Overdose_Death_Counts_20240910.csv"

In [13]:
# Read the VSRR county-level overdose CSV from the path held in VSRR_Provisional
# (defined in the previous cell) so the file name is maintained in one place only.
VSRR_Provisional_df = pd.read_csv(VSRR_Provisional)

# Display the raw table (pandas truncates long frames in the rendered output)
VSRR_Provisional_df

Unnamed: 0,Data as of,Year,Month,ST_ABBREV,STATE_NAME,COUNTYNAME,FIPS,STATEFIPS,COUNTYFIPS,CODE2013,Provisional Drug Overdose Deaths,Footnote,Percentage Of Records Pending Investigation,HistoricalDataCompletenessNote,MonthEndingDate,Start Date,End Date
0,7/7/2024,2020,1,AL,Alabama,Autauga,1001,1,1,3,,One or more data cells have counts between 1-9...,0.56,,1/31/2020,1/31/2020,12/31/2023
1,7/7/2024,2020,2,AL,Alabama,Autauga,1001,1,1,3,,One or more data cells have counts between 1-9...,0.55,,2/29/2020,1/31/2020,12/31/2023
2,7/7/2024,2020,3,AL,Alabama,Autauga,1001,1,1,3,,One or more data cells have counts between 1-9...,0.55,,3/31/2020,1/31/2020,12/31/2023
3,7/7/2024,2020,4,AL,Alabama,Autauga,1001,1,1,3,,One or more data cells have counts between 1-9...,0.54,,4/30/2020,1/31/2020,12/31/2023
4,7/7/2024,2020,5,AL,Alabama,Autauga,1001,1,1,3,,One or more data cells have counts between 1-9...,0.54,,5/31/2020,1/31/2020,12/31/2023
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
150907,7/7/2024,2021,12,WY,Wyoming,Weston,56045,56,45,6,0.0,,0.00,,12/31/2021,1/31/2020,12/31/2023
150908,7/7/2024,2020,1,WY,Wyoming,Weston,56045,56,45,6,0.0,,0.00,,1/31/2020,1/31/2020,12/31/2023
150909,7/7/2024,2020,2,WY,Wyoming,Weston,56045,56,45,6,0.0,,0.00,,2/29/2020,1/31/2020,12/31/2023
150910,7/7/2024,2020,3,WY,Wyoming,Weston,56045,56,45,6,0.0,,0.00,,3/31/2020,1/31/2020,12/31/2023


In [14]:
# Columns kept for the analysis: year, state, county and the reported death count
selected_columns = ["Year", "STATE_NAME", "COUNTYNAME", "Provisional Drug Overdose Deaths"]

# Take every row but only the columns listed above
VSRR_filtered_df = VSRR_Provisional_df.loc[:, selected_columns]

# Show the narrowed table
VSRR_filtered_df



Unnamed: 0,Year,STATE_NAME,COUNTYNAME,Provisional Drug Overdose Deaths
0,2020,Alabama,Autauga,
1,2020,Alabama,Autauga,
2,2020,Alabama,Autauga,
3,2020,Alabama,Autauga,
4,2020,Alabama,Autauga,
...,...,...,...,...
150907,2021,Wyoming,Weston,0.0
150908,2020,Wyoming,Weston,0.0
150909,2020,Wyoming,Weston,0.0
150910,2020,Wyoming,Weston,0.0


In [15]:
# Keep only the rows that actually carry a death count (NaN counts are discarded)
has_death_count = VSRR_filtered_df["Provisional Drug Overdose Deaths"].notna()
VSRR_filtered_df = VSRR_filtered_df[has_death_count]

# Show the table once the rows without a count are gone
VSRR_filtered_df


Unnamed: 0,Year,STATE_NAME,COUNTYNAME,Provisional Drug Overdose Deaths
30,2022,Alabama,Autauga,10.0
48,2020,Alabama,Baldwin,27.0
49,2020,Alabama,Baldwin,28.0
50,2020,Alabama,Baldwin,34.0
51,2020,Alabama,Baldwin,37.0
...,...,...,...,...
150907,2021,Wyoming,Weston,0.0
150908,2020,Wyoming,Weston,0.0
150909,2020,Wyoming,Weston,0.0
150910,2020,Wyoming,Weston,0.0


In [16]:
# Restrict the table to Ohio: build a boolean mask on STATE_NAME and select with it
is_ohio = VSRR_filtered_df["STATE_NAME"].eq("Ohio")
ohio_df = VSRR_filtered_df.loc[is_ohio]

# Show the Ohio-only rows
ohio_df


Unnamed: 0,Year,STATE_NAME,COUNTYNAME,Provisional Drug Overdose Deaths
98064,2020,Ohio,Adams,13.0
98065,2020,Ohio,Adams,10.0
98068,2020,Ohio,Adams,10.0
98069,2020,Ohio,Adams,12.0
98070,2020,Ohio,Adams,12.0
...,...,...,...,...
102239,2021,Ohio,Wood,34.0
102276,2023,Ohio,Wyandot,0.0
102277,2023,Ohio,Wyandot,0.0
102278,2023,Ohio,Wyandot,0.0


In [17]:
# Total the reported counts for every (county, year) pair and store them as integers.
# NOTE(review): the raw file carries a MonthEndingDate column, which suggests each
# monthly value may already be a 12-month-ending count; if so, summing the months of
# a year overstates the annual total several-fold — confirm against the dataset docs.
# NOTE(review): rows whose count was NaN were dropped in an earlier cell, so months
# without a published count contribute nothing to these sums.
death_col = "Provisional Drug Overdose Deaths"
aggregated_data = (
    ohio_df
    .groupby(["COUNTYNAME", "Year"], as_index=False)[death_col]
    .sum()
    .astype({death_col: int})
)

# Show the per-county, per-year totals
aggregated_data


Unnamed: 0,COUNTYNAME,Year,Provisional Drug Overdose Deaths
0,Adams,2020,132
1,Adams,2021,188
2,Adams,2022,209
3,Adams,2023,223
4,Allen,2020,322
...,...,...,...
304,Wood,2020,322
305,Wood,2021,382
306,Wood,2022,270
307,Wood,2023,242


In [21]:
# Load the boundary GeoJSON. GeoJSON is UTF-8 by specification (RFC 7946), so the
# encoding is stated explicitly instead of being inherited from the platform default.
with open('Boundaries.geojson', encoding='utf-8') as f:
    geojson_data = json.load(f)

# Collect one record per GeoJSON feature: the county name and its raw coordinates.
# A feature may carry a null geometry; such a feature gets Coordinates=None (and is
# then reported by the unmatched check below) instead of raising a TypeError.
geo_data_list = [
    {
        'COUNTYNAME': feature['properties']['name'],  # The county name is stored in 'name'
        'Coordinates': (feature.get('geometry') or {}).get('coordinates'),
    }
    for feature in geojson_data['features']
]

# Convert the records to a DataFrame. The columns are fixed up front so that an
# empty feature list still yields a frame that can be merged on 'COUNTYNAME'.
geo_df = pd.DataFrame(geo_data_list, columns=['COUNTYNAME', 'Coordinates'])

# Left-merge so every aggregated (county, year) row is kept even without geometry.
# NOTE(review): the join is an exact string match on the county name; if the
# GeoJSON repeats a name (e.g. boundaries for several states) rows will be
# duplicated — confirm the file only covers the counties being analysed.
merged_data = pd.merge(aggregated_data, geo_df, on='COUNTYNAME', how='left')

# Report counties that ended up without coordinates after the merge
unmatched_counties = merged_data[merged_data['Coordinates'].isnull()]
if not unmatched_counties.empty:
    print("These counties were not matched with geo data:", unmatched_counties['COUNTYNAME'].unique())


In [22]:
# Write the merged county/year table to disk; index=True also writes the row index
# as the first (unnamed) CSV column.
merged_data.to_csv(path_or_buf='cleandata.csv', index=True)