In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, mean_squared_error

In [2]:
# Read the clustered dataset; the path is resolved relative to the current working directory.
df = pd.read_csv(filepath_or_buffer='../Data/clustered_df.csv')

In [3]:
# Show the column labels of the loaded frame (displayed as the cell's output).
df.columns

Index(['Unnamed: 0', 'PhysicalHealthDays', 'SleepHours', 'HeightInMeters',
       'GeneralHealth', 'RemovedTeeth', 'AgeCategory', 'SmokerStatus_ord',
       'AlcoholDrinkers', 'BlindOrVisionDifficulty', 'ChestScan', 'CovidPosNo',
       'CovidPosYes', 'DifficultyConcentrating', 'DifficultyDressingBathing',
       'DifficultyErrands', 'DifficultyWalking', 'FluVaxLast12', 'HIVTesting',
       'HadAngina', 'HadArthritis', 'HadDiabetesOther', 'HadDiabetesYes',
       'HadKidneyDisease', 'HadSkinCancer', 'HadStroke', 'LastCheckupTime',
       'PhysicalActivities', 'PneumoVaxEver', 'RaceEthnicityCategoryHispanic',
       'RaceEthnicityCategoryMultiracial._Non.Hispanic',
       'RaceEthnicityCategoryOther_race_only._Non.Hispanic',
       'RaceEthnicityCategoryWhite_only._Non.Hispanic',
       'RegionEast_South_Central', 'RegionMiddle_Atlantic',
       'RegionMountain_Region', 'RegionNew_England', 'RegionTerritories',
       'Sex', 'TetanusLast10Tdap', 'HadHeartAttack', 'cluster'],
      dtype

In [4]:
# Drop the 'Unnamed: 0' column that came in with the CSV.
# Rebinding instead of `inplace=True`, together with errors='ignore', keeps this
# cell idempotent: re-running it after the column is already gone no longer
# raises KeyError.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')

In [5]:
# Print the frame summary (row count, per-column non-null counts and dtypes).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 246022 entries, 0 to 246021
Data columns (total 41 columns):
 #   Column                                              Non-Null Count   Dtype  
---  ------                                              --------------   -----  
 0   PhysicalHealthDays                                  246022 non-null  int64  
 1   SleepHours                                          246022 non-null  int64  
 2   HeightInMeters                                      246022 non-null  float64
 3   GeneralHealth                                       246022 non-null  int64  
 4   RemovedTeeth                                        246022 non-null  int64  
 5   AgeCategory                                         246022 non-null  int64  
 6   SmokerStatus_ord                                    246022 non-null  int64  
 7   AlcoholDrinkers                                     246022 non-null  int64  
 8   BlindOrVisionDifficulty                             246022 non-n

In [6]:
def get_significant_vars(df, cluster_num, threshold=0.58):
    """Fit a logistic regression for one cluster and return its coefficients.

    The model is fitted and evaluated on the same rows (there is no train/test
    split), so the printed metrics are in-sample.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows belonging to a single cluster. Must contain the target column
        'HadHeartAttack' and the 'cluster' column; every other column is used
        as a predictor.
    cluster_num : int
        Cluster label, used only in the printed header.
    threshold : float, default 0.58
        Probability cut-off above which a row is predicted as the positive
        class.

    Returns
    -------
    pandas.Series
        Fitted logistic-regression coefficients indexed by predictor name.
        These are raw coefficients, not p-values. No scaling is applied in
        this function, so each magnitude is on its feature's own scale.
    """
    log_model = LogisticRegression(max_iter=1000)
    # Take the target from the frame that was passed in. Reading it from a
    # notebook-level global would silently pair X and y from different frames
    # whenever the caller's variable is not that exact global.
    y = df['HadHeartAttack']
    prep_cluster = df.drop(columns=['HadHeartAttack', 'cluster'])

    log_model.fit(prep_cluster, y)

    # Column 1 of predict_proba is the probability of the second class in
    # log_model.classes_; cast the boolean mask to 0/1 labels.
    y_pred = (log_model.predict_proba(prep_cluster)[:, 1] > threshold).astype(int)

    print(f'Results for Cluster {cluster_num}')
    # Summary
    print(classification_report(y, y_pred))
    # RMSE between the target and the thresholded 0/1 predictions; for 0/1
    # values this equals the square root of the misclassification rate.
    rmse = np.sqrt(mean_squared_error(y, y_pred))
    print('RMSE:', rmse)

    # Return the fitted coefficients, one per predictor column.
    feature_importances = pd.Series(log_model.coef_[0], index=prep_cluster.columns)
    return feature_importances

# Fit one model per cluster and collect the coefficient Series under the key
# 'cluster_<label>'.
# Iterate over the labels actually present in the data rather than assuming
# they are exactly 1..max: a 0 label is no longer skipped, and a missing label
# no longer leads to fitting on an empty frame.
significant_vars = {}
for i in sorted(int(label) for label in df['cluster'].dropna().unique()):
    print(f'Cluster {i}')
    cluster = df[df['cluster'] == i]
    significant_vars[f'cluster_{i}'] = get_significant_vars(cluster, i)


Cluster 1
Results for Cluster 1
              precision    recall  f1-score   support

           0       0.96      1.00      0.98     48641
           1       0.46      0.06      0.11      1915

    accuracy                           0.96     50556
   macro avg       0.71      0.53      0.55     50556
weighted avg       0.95      0.96      0.95     50556

RMSE: 0.19558785430259287
Cluster 2
Results for Cluster 2
              precision    recall  f1-score   support

           0       0.99      1.00      0.99     44419
           1       0.41      0.04      0.07       536

    accuracy                           0.99     44955
   macro avg       0.70      0.52      0.53     44955
weighted avg       0.98      0.99      0.98     44955

RMSE: 0.11000449935692475
Cluster 3
Results for Cluster 3
              precision    recall  f1-score   support

           0       0.99      1.00      0.99     48766
           1       0.44      0.01      0.01       620

    accuracy                      

In [7]:
# Combine the per-cluster coefficient Series into one table (one column per
# dictionary key) and write it to disk as CSV.
importance_df = pd.DataFrame(data=significant_vars)
importance_df.to_csv(path_or_buf='../Data/cluster_feat_importance.csv')