In [14]:
# Use a Random Forest to identify which features are suitable for calculating access scores, rather than assigning them manually
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [45]:
# Load the merged CSV and narrow it to the columns this notebook works with.
# NOTE(review): the path below is absolute and machine-specific; consider
# making it relative to a configurable data directory.
pre_df = pd.read_csv(
    '/Users/manika/Desktop/Python : Learning/healthcare-accessibility-index/data/clean/merge_healthcare_accessibility_index.csv'
)
df = pre_df[[
    'state',
    'median_income($1k)',
    'insured',
    'broadband',
    'facility_100k',
    'avg_travel_time_min',
    'access_score',
]]


In [47]:
# Population is intentionally left out as a metric: facility_100k is already
# derived from population.
target = 'access_score'
features = [
    'median_income($1k)',
    'insured',
    'broadband',
    'facility_100k',
    'avg_travel_time_min',
]

In [48]:
# Separate the predictor columns from the target column. The state labels are
# kept in their own Series so they can be used for comparison later.
X, y = df[features], df[target]
state = df['state']

In [49]:
# 80/20 train/test split with a fixed seed. The state labels are passed
# through the same call so they are split together with X and y.
(
    X_train, X_test,
    y_train, y_test,
    state_train, state_test,
) = train_test_split(
    X,
    y,
    state,
    test_size=0.2,
    random_state=42,
)

In [50]:
# Configure a 100-tree random forest regressor with a fixed seed, then fit it
# on the training split. The fit call stays as the last expression so the
# cell still displays the fitted estimator.
rf_params = {'n_estimators': 100, 'random_state': 42}
model = RandomForestRegressor(**rf_params)
model.fit(X_train, y_train)

In [51]:
# Predict access scores for the held-out test rows with the fitted model
y_pred = model.predict(X=X_test)

In [52]:
# Evaluate the model on the held-out test split.
# RMSE is computed as the square root of the MSE instead of passing
# `squared=False`: that keyword was deprecated in scikit-learn 1.4 and removed
# in 1.6, so the previous call raises TypeError on current releases. Taking
# the root by hand works on every scikit-learn version.
rmse = mean_squared_error(y_test, y_pred) ** 0.5
# Coefficient of determination between the true and predicted scores
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.2f}")

RMSE: 5.45
R² Score: 0.34




In [53]:
# Put each held-out state next to its actual and predicted access score.
# .values strips the original index so the three columns line up by position.
comparison_columns = {
    'state': state_test.values,
    'actual_access_score': y_test.values,
    'predicted_scores': y_pred,
}
comparison = pd.DataFrame(comparison_columns)

In [54]:
# Display the actual vs. predicted access score for each test-set state
comparison

Unnamed: 0,state,actual_access_score,predicted_scores
0,Wyoming,44.25,44.3546
1,New Mexico,44.75,45.848
2,Alaska,42.15,56.6165
3,North Dakota,58.3,56.84
4,Iowa,52.94,54.9543
5,Florida,48.41,40.6714
6,Delaware,57.0,61.6116
7,South Carolina,47.99,46.517
8,New Jersey,64.29,64.2194
9,Missouri,50.13,48.3031


The predictions are a mix of accurate values and some that are noticeably off, so the next step is to extract the feature importances.

In [55]:
import matplotlib.pyplot as plt
# Grab the column labels of X together with the importance scores exposed by
# the fitted model.
feature_names, importances = X.columns, model.feature_importances_


In [56]:
# Tabulate the importance score the model gave each feature, ordered from the
# highest score to the lowest.
importance_df = pd.DataFrame(
    {'feature': feature_names, 'importance': importances}
)
importance_df = importance_df.sort_values('importance', ascending=False)

In [57]:
# Display the feature/importance table built in the previous cell
importance_df

Unnamed: 0,feature,importance
0,median_income($1k),0.503357
1,insured,0.364395
2,broadband,0.062813
4,avg_travel_time_min,0.046789
3,facility_100k,0.022647


In [58]:
# exporting feature importance 

import json

# Pair every column of X with the importance score reported by the fitted
# RandomForestRegressor held in `model`.
ml_feature_importance = {
    column: weight
    for column, weight in zip(X.columns, model.feature_importances_)
}

# Persist the mapping as JSON.
# NOTE(review): absolute, machine-specific output path.
importance_json_path = "/Users/manika/Desktop/Python : Learning/healthcare-accessibility-index/data/clean/ml_feature_importances.json"
with open(importance_json_path, "w") as out_file:
    json.dump(ml_feature_importance, out_file)



In [59]:
# exporting model 
import pickle

# Serialize the trained estimator held in `model` to disk with pickle.
# The file is opened in binary mode because pickle writes bytes.
# NOTE(review): absolute, machine-specific output path; pickle files should
# only ever be loaded back from trusted sources.
model_pickle_path = "/Users/manika/Desktop/Python : Learning/healthcare-accessibility-index/model/rf_model.pkl"
with open(model_pickle_path, "wb") as out_file:
    pickle.dump(model, out_file)
