In [136]:
# Step-by-step testing notebook for county comparison logic

import pandas as pd
import numpy as np
import pickle
from preprocessed_data import features, racial_features, get_pool_and_scaled

# --------------------------
# Inputs
# --------------------------
state_input = "South Dakota"
county_input = "Oglala Lakota"
variable_input = "Income"


| Left Behind County                  | Mirror Counties                      |
|------------------------------------|-------------------------------------|
| Imperial County, California        | Guadalupe County, Texas              |
| Oglala Lakota County, South Dakota | Big Horn County, Montana            |
| Holmes County, Mississippi         | Brunswick County, Virginia              |
| McDowell County, West Virginia     | Yancey County, North Carolina
| Bronx County, New York             | Fulton County, Georgia               |

In [137]:
df = pd.read_csv("preprocessed_data.csv")

In [138]:

# Step 1: Filter selected county
selected_row = df[(df['State'] == state_input) & (df['County'] == county_input)]
if selected_row.empty:
    raise ValueError(f"No data found for {county_input}, {state_input}")

population_value = selected_row['Population'].values[0]
print(f"Selected county: {county_input}, Population: {population_value}")
selected_row

Selected county: Oglala Lakota, Population: 13519


Unnamed: 0,FIPS,State,County,Population,% Black,% American Indian or Alaska Native,% Asian,% Native Hawaiian or Other Pacific Islander,% Hispanic,% Non-Hispanic White,...,"Arts, entertainment, and recreation",Accommodation and food services,"Other services, except public administration",Public administration,Top 1 industry,Top 2 industry,Top 1 Industry String,Top 2 Industry String,population_percentile,More Info
2413,46102,South Dakota,Oglala Lakota,13519,0.3,89.5,0.3,0.1,5.4,4.9,...,0.0581,0.03,0.0206,0.1535,Educational services,Health care and social assistance,Educational services: 23.35%,Health care and social assistance: 15.51%,30.544066,


In [139]:
print('Population' in features)


True


In [140]:
# --------------------------
# Step 2: Load scaler and weights
# --------------------------
with open('scaler.pkl', 'rb') as f:
    scaler = pickle.load(f)

with open('weights.pkl', 'rb') as f:
    all_weights = pickle.load(f)


In [141]:
# --------------------------
# Step 3: Scale features
# --------------------------
selected_scaled = scaler.transform(selected_row[features]).flatten()
df_scaled = scaler.transform(df[features])
print("Scaled shapes:", df_scaled.shape, selected_scaled.shape)


Scaled shapes: (3143, 28) (28,)


In [142]:
# --------------------------
# Step 4: Get weighted pool
# --------------------------
df_pool, df_scaled_pool, weights = get_pool_and_scaled(
    population_value, df, scaler, features, racial_features, all_weights
)
print("Pool shape:", df_pool.shape)

Pool shape: (3143, 42)


In [143]:
# --------------------------
# Step 5: Compute weighted Euclidean distances
# --------------------------
distances = np.linalg.norm((df_scaled_pool - selected_scaled) * weights, axis=1)
print("Distances shape:", distances.shape)


Distances shape: (3143,)


In [144]:
# --------------------------
# Step 6: Get k nearest neighbors
# --------------------------
k = 200
indices = np.argsort(distances)[:k]
similar_counties = df_pool.iloc[indices].copy()
print("Top k similar counties shape:", similar_counties.shape)
print(similar_counties[['State','County','Population']].head(10))

Top k similar counties shape: (200, 42)
             State         County  Population
2413  South Dakota  Oglala Lakota       13519
2370  South Dakota        Buffalo        1861
2030  North Dakota        Rolette       11933
2422  South Dakota           Todd        9220
1616       Montana        Glacier       13681
2382  South Dakota          Dewey        5140
97         Arizona         Apache       65432
1741      Nebraska       Thurston        6507
1600       Montana       Big Horn       12851
83          Alaska       Kusilvak        8278


In [145]:
# --------------------------
# Step 7: Remove same state
# --------------------------
similar_counties = similar_counties[similar_counties['State'] != state_input]
print("After state filter shape:", similar_counties.shape)
similar_counties


After state filter shape: (182, 42)


Unnamed: 0,FIPS,State,County,Population,% Black,% American Indian or Alaska Native,% Asian,% Native Hawaiian or Other Pacific Islander,% Hispanic,% Non-Hispanic White,...,"Arts, entertainment, and recreation",Accommodation and food services,"Other services, except public administration",Public administration,Top 1 industry,Top 2 industry,Top 1 Industry String,Top 2 Industry String,population_percentile,More Info
2030,38079,North Dakota,Rolette,11933,0.5,77.7,0.3,0.0,2.4,17.5,...,0.0324,0.0259,0.0150,0.0972,Educational services,Health care and social assistance,Educational services: 21.53%,Health care and social assistance: 17.99%,27.362393,
1616,30035,Montana,Glacier,13681,0.3,65.0,0.4,0.0,3.6,29.4,...,0.0304,0.0866,0.0270,0.1533,Educational services,Public administration,Educational services: 16.09%,Public administration: 15.33%,30.766783,
97,4001,Arizona,Apache,65432,0.6,73.4,0.5,0.1,7.2,18.9,...,0.0166,0.0577,0.0193,0.1149,Health care and social assistance,Educational services,Health care and social assistance: 22.68%,Educational services: 17.64%,73.687560,
1741,31173,Nebraska,Thurston,6507,0.5,58.5,0.8,0.1,8.6,34.3,...,0.0529,0.0460,0.0290,0.1592,Health care and social assistance,Public administration,Health care and social assistance: 16.61%,Public administration: 15.91%,14.237989,
1600,30003,Montana,Big Horn,12851,0.4,66.7,0.8,0.0,6.6,25.7,...,0.0191,0.0924,0.0248,0.1155,Health care and social assistance,Educational services,Health care and social assistance: 14.92%,Educational services: 14.77%,29.462297,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2958,53009,Washington,Clallam,77805,0.9,5.8,2.1,0.2,7.6,81.3,...,0.0342,0.0776,0.0620,0.0817,Health care and social assistance,Retail trade,Health care and social assistance: 16.99%,Retail trade: 12.91%,76.710150,
1308,26153,Michigan,Schoolcraft,8188,0.4,9.3,0.3,0.0,1.7,84.8,...,0.0093,0.0983,0.0362,0.0865,Health care and social assistance,Manufacturing,Health care and social assistance: 14.95%,Manufacturing: 14.36%,18.485523,
85,2170,Alaska,Matanuska-Susitna,113325,1.2,7.0,1.7,0.5,5.8,77.3,...,0.0157,0.0555,0.0492,0.0915,Health care and social assistance,Construction,Health care and social assistance: 15.4%,Construction: 12.86%,82.596246,
1207,24029,Maryland,Kent,19320,13.6,0.5,1.5,0.1,5.0,77.9,...,0.0219,0.0566,0.0590,0.0604,Educational services,Retail trade,Educational services: 14.63%,Retail trade: 11.78%,40.820872,


In [146]:
# --------------------------
# Step 8: Apply percentile filter for small counties
# --------------------------
if population_value <= 700_000:
    selected_percentile = selected_row['population_percentile'].values[0]
    percentile_min = selected_percentile - 3
    percentile_max = selected_percentile + 3
    similar_counties = similar_counties[
        (similar_counties['population_percentile'] >= percentile_min) &
        (similar_counties['population_percentile'] <= percentile_max)
    ]
print("After percentile filter shape:", similar_counties.shape)



After percentile filter shape: (13, 42)


In [147]:

# --------------------------
# Step 9: Rank by Income, exclude selected county
# --------------------------
ranked_counties = similar_counties.sort_values(by='Income', ascending=False)
ranked_counties = ranked_counties[ranked_counties.index != index]
top_10_counties = ranked_counties.head(10)
print("Top 10 counties shape:", top_10_counties.shape)

Top 10 counties shape: (10, 42)


In [148]:

# --------------------------
# Step 10: Add links (like More Info column)
# --------------------------
def make_compare_link(original_fips, compare_fips):
    original_fips = str(original_fips).zfill(5)
    compare_fips = str(compare_fips).zfill(5)
    return f"https://www.countyhealthrankings.org/health-data/compare-counties?year=2025&compareCounties={original_fips},{compare_fips}"

original_fips = selected_row['FIPS'].values[0]
top_10_counties['More Info'] = top_10_counties['FIPS'].apply(lambda fips: f"[Link]({make_compare_link(original_fips, fips)})")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_10_counties['More Info'] = top_10_counties['FIPS'].apply(lambda fips: f"[Link]({make_compare_link(original_fips, fips)})")


In [149]:
# --------------------------
# Step 11: Display top 10 with same columns as dashboard
# --------------------------
display_columns = ['State', 'County', 'Population', 'Income',
                   'Top 1 Industry String','Top 2 Industry String',
                   '% Rural', '% Black', '% American Indian or Alaska Native',
                   '% Asian', '% Native Hawaiian or Other Pacific Islander', '% Hispanic', 'More Info']

top_10_counties[display_columns]

Unnamed: 0,State,County,Population,Income,Top 1 Industry String,Top 2 Industry String,% Rural,% Black,% American Indian or Alaska Native,% Asian,% Native Hawaiian or Other Pacific Islander,% Hispanic,More Info
81,Alaska,Ketchikan Gateway,13741,82648.0,Health care and social assistance: 16.49%,Retail trade: 9.91%,14.1,1.0,14.3,8.6,0.4,5.8,[Link](https://www.countyhealthrankings.org/he...
931,Kansas,Jackson,13286,70445.0,Manufacturing: 14.39%,Health care and social assistance: 14.02%,100.0,1.0,8.8,0.6,0.0,5.7,[Link](https://www.countyhealthrankings.org/he...
2181,Oklahoma,Murray,13672,60307.0,Retail trade: 17.05%,Health care and social assistance: 12.17%,65.1,1.8,14.6,0.7,0.1,7.6,[Link](https://www.countyhealthrankings.org/he...
1977,North Carolina,Swain,13967,55462.0,Health care and social assistance: 14.35%,"Arts, entertainment, and recreation: 11.78%",100.0,1.4,30.6,0.8,0.1,7.3,[Link](https://www.countyhealthrankings.org/he...
1516,Missouri,Dent,14467,52945.0,Health care and social assistance: 18.98%,Educational services: 12.74%,67.5,0.6,1.2,0.8,0.1,2.2,[Link](https://www.countyhealthrankings.org/he...
2796,Utah,San Juan,14359,50452.0,Educational services: 19.08%,Health care and social assistance: 15.28%,100.0,0.4,47.1,0.6,0.1,6.7,[Link](https://www.countyhealthrankings.org/he...
1600,Montana,Big Horn,12851,49113.0,Health care and social assistance: 14.92%,Educational services: 14.77%,100.0,0.4,66.7,0.8,0.0,6.6,[Link](https://www.countyhealthrankings.org/he...
2149,Oklahoma,Craig,14123,48922.0,Health care and social assistance: 17.77%,Retail trade: 10.92%,64.1,2.8,21.9,1.4,0.2,5.2,[Link](https://www.countyhealthrankings.org/he...
1616,Montana,Glacier,13681,46270.0,Educational services: 16.09%,Public administration: 15.33%,100.0,0.3,65.0,0.4,0.0,3.6,[Link](https://www.countyhealthrankings.org/he...
2134,Oklahoma,Atoka,14262,46160.0,Retail trade: 12.48%,Health care and social assistance: 12.46%,100.0,5.3,15.3,0.9,0.0,4.6,[Link](https://www.countyhealthrankings.org/he...


In [150]:
print("Features to scale:", features)
print("Columns in df:", df.columns.tolist())

Features to scale: ['% Black', '% American Indian or Alaska Native', '% Asian', '% Native Hawaiian or Other Pacific Islander', '% Hispanic', '% Non-Hispanic White', 'Population', '% Rural', 'Agriculture, forestry, fishing and hunting', 'Mining, quarrying, and oil and gas extraction', 'Construction', 'Manufacturing', 'Wholesale trade', 'Retail trade', 'Transportation and warehousing', 'Utilities', 'Information', 'Finance and insurance', 'Real estate and rental and leasing', 'Professional, scientific, and technical services', 'Management of companies', 'Administrative and support and waste management services', 'Educational services', 'Health care and social assistance', 'Arts, entertainment, and recreation', 'Accommodation and food services', 'Other services, except public administration', 'Public administration']
Columns in df: ['FIPS', 'State', 'County', 'Population', '% Black', '% American Indian or Alaska Native', '% Asian', '% Native Hawaiian or Other Pacific Islander', '% Hispanic