In [28]:
import pandas as pd
import statsmodels.api as sm
from sklearn.metrics import mean_squared_error as MSE
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

# read in all three filled datasets
mean = pd.read_csv('hospitaldata_mean.csv')  # filled with mean values
iterative = pd.read_csv('hospitaldata_iterative.csv')  # filled with linear regression
knn = pd.read_csv('hospitaldata_knn.csv')  # filled with knn

mean.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Provider ID,COMP_HIP_KNEE,MORT_30_AMI,MORT_30_CABG,MORT_30_COPD,MORT_30_HF,MORT_30_PN,MORT_30_STK,...,H_QUIET_LINEAR_SCORE,H_RECMND_LINEAR_SCORE,Hospital Name,Address,City,State,ZIP Code,Number of Completed Surveys,Survey Response Rate Percent,STAR_RATING
0,0,0,10001,3.0,14.5,4.5,8.9,12.5,16.2,13.7,...,86,84,SOUTHEAST ALABAMA MEDICAL CENTER,1108 ROSS CLARK CIRCLE,DOTHAN,AL,36301,506,21,2
1,1,1,10005,3.0,15.5,3.230405,10.9,15.3,20.8,15.7,...,85,88,MARSHALL MEDICAL CENTER SOUTH,2505 U S HIGHWAY 431 NORTH,BOAZ,AL,35957,1135,34,3
2,2,2,10006,3.6,14.9,3.2,8.8,13.2,17.2,17.2,...,82,84,ELIZA COFFEE MEMORIAL HOSPITAL,205 MARENGO STREET,FLORENCE,AL,35631,579,22,2
3,3,3,10007,2.8,13.172227,3.230405,8.3,13.3,21.6,15.9,...,86,85,MIZELL MEMORIAL HOSPITAL,702 N MAIN ST,OPP,AL,36467,185,27,2
4,4,4,10011,2.6,14.1,3.7,8.9,13.1,16.1,12.4,...,81,86,ST VINCENT'S EAST,50 MEDICAL PARK EAST DRIVE,BIRMINGHAM,AL,35235,2193,33,3


In [13]:
def run_OLS(df):
    
    # split train and test data
    xtrain, xtest, ytrain, ytest = train_test_split(
        df[['COMP_HIP_KNEE', 'MORT_30_AMI', 'MORT_30_CABG', 'MORT_30_COPD', 'MORT_30_HF', 'MORT_30_PN',
                  'MORT_30_STK', 'PSI_10_POST_KIDNEY', 'PSI_11_POST_RESP', 'PSI_12_POSTOP_PULMEMB_DVT', 'PSI_13_POST_SEPSIS',
                  'PSI_14_POSTOP_DEHIS', 'PSI_15_ACC_LAC', 'PSI_3_ULCER', 'PSI_4_SURG_COMP', 'PSI_6_IAT_PTX',
                  'PSI_8_POST_HIP', 'PSI_90_SAFETY', 'PSI_9_POST_HEM', 'H_CLEAN_LINEAR_SCORE', 'H_COMP_1_LINEAR_SCORE', 'H_COMP_2_LINEAR_SCORE', 'H_COMP_3_LINEAR_SCORE',
                  'H_COMP_5_LINEAR_SCORE', 'H_COMP_6_LINEAR_SCORE', 'H_COMP_7_LINEAR_SCORE', 'H_HSP_RATING_LINEAR_SCORE',
                  'H_QUIET_LINEAR_SCORE', 'Number of Completed Surveys', 'Survey Response Rate Percent']], 
        df[['H_RECMND_LINEAR_SCORE']], test_size=0.2, random_state=0)

    X = xtrain
    y = ytrain

    # fit multiple regression model
    model = sm.OLS(y, X).fit()
    predictions = model.predict(X) # make the predictions by the model

    # Print out the statistics
    model.summary()
    
    # Compute y_pred_ols
    y_pred_ols = model.predict(xtest)

    # Compute mse_dt
    mse_ols = MSE(ytest, y_pred_ols)

    # Compute rmse_dt
    rmse_ols = mse_ols**(1/2)

    # Print rmse_dt
    print("Test set RMSE of OLS: {:.2f}".format(rmse_ols))

In [19]:
run_OLS(mean)

Test set RMSE of OLS: 1.34


In [20]:
run_OLS(iterative)

Test set RMSE of OLS: 1.34


In [21]:
run_OLS(knn)

Test set RMSE of OLS: 1.34


No difference between the 3 filled datasets when using multiple linear regression. All achieved RMSE of 1.34. 

In [40]:
def run_SVM(df, kernel_type):
    
    # split trainn and test data
    xtrain, xtest, ytrain, ytest = train_test_split(
        df[['COMP_HIP_KNEE', 'MORT_30_AMI', 'MORT_30_CABG', 'MORT_30_COPD', 'MORT_30_HF', 'MORT_30_PN',
                  'MORT_30_STK', 'PSI_10_POST_KIDNEY', 'PSI_11_POST_RESP', 'PSI_12_POSTOP_PULMEMB_DVT', 'PSI_13_POST_SEPSIS',
                  'PSI_14_POSTOP_DEHIS', 'PSI_15_ACC_LAC', 'PSI_3_ULCER', 'PSI_4_SURG_COMP', 'PSI_6_IAT_PTX',
                  'PSI_8_POST_HIP', 'PSI_90_SAFETY', 'PSI_9_POST_HEM', 'H_CLEAN_LINEAR_SCORE', 'H_COMP_1_LINEAR_SCORE', 'H_COMP_2_LINEAR_SCORE', 'H_COMP_3_LINEAR_SCORE',
                  'H_COMP_5_LINEAR_SCORE', 'H_COMP_6_LINEAR_SCORE', 'H_COMP_7_LINEAR_SCORE', 'H_HSP_RATING_LINEAR_SCORE',
                  'H_QUIET_LINEAR_SCORE', 'Number of Completed Surveys', 'Survey Response Rate Percent']], 
        df[['STAR_RATING']], test_size=0.2, random_state=0)
    
    # scale data
    scaler = StandardScaler()
    xtrain = scaler.fit_transform(xtrain)
    xtest = scaler.fit_transform(xtest)

    # run SVM
    svc = svm.SVC(kernel=kernel_type, C=1)
    svc.fit(xtrain, ytrain)
    y_pred_svm = svc.predict(xtest)
    print(classification_report(ytest, y_pred_svm))

In [41]:
run_SVM(mean, 'linear')

  y = column_or_1d(y, warn=True)


              precision    recall  f1-score   support

           1       0.97      0.75      0.85        40
           2       0.82      0.72      0.77       143
           3       0.77      0.88      0.82       302
           4       0.82      0.74      0.78       190
           5       0.73      0.71      0.72        31

    accuracy                           0.80       706
   macro avg       0.82      0.76      0.79       706
weighted avg       0.80      0.80      0.80       706



In [38]:
df = knn
xtrain, xtest, ytrain, ytest = train_test_split(
    df[['COMP_HIP_KNEE', 'MORT_30_AMI', 'MORT_30_CABG', 'MORT_30_COPD', 'MORT_30_HF', 'MORT_30_PN',
              'MORT_30_STK', 'PSI_10_POST_KIDNEY', 'PSI_11_POST_RESP', 'PSI_12_POSTOP_PULMEMB_DVT', 'PSI_13_POST_SEPSIS',
              'PSI_14_POSTOP_DEHIS', 'PSI_15_ACC_LAC', 'PSI_3_ULCER', 'PSI_4_SURG_COMP', 'PSI_6_IAT_PTX',
              'PSI_8_POST_HIP', 'PSI_90_SAFETY', 'PSI_9_POST_HEM', 'H_CLEAN_LINEAR_SCORE', 'H_COMP_1_LINEAR_SCORE', 'H_COMP_2_LINEAR_SCORE', 'H_COMP_3_LINEAR_SCORE',
              'H_COMP_5_LINEAR_SCORE', 'H_COMP_6_LINEAR_SCORE', 'H_COMP_7_LINEAR_SCORE', 'H_HSP_RATING_LINEAR_SCORE',
              'H_QUIET_LINEAR_SCORE', 'Number of Completed Surveys', 'Survey Response Rate Percent']], 
    df[['STAR_RATING']], test_size=0.2, random_state=0)
scaler = StandardScaler()
xtrain = scaler.fit_transform(xtrain)
xtest = scaler.fit_transform(xtest)

# run SVM
svc = svm.SVC(kernel='rbf', C=1)
svc.fit(xtrain, ytrain)
y_pred_svm = svc.predict(xtest)
print(classification_report(ytest, y_pred_svm))

  y = column_or_1d(y, warn=True)


              precision    recall  f1-score   support

         1.0       0.88      0.38      0.53        40
         2.0       0.71      0.63      0.67       143
         3.0       0.72      0.89      0.80       302
         4.0       0.81      0.70      0.75       190
         5.0       0.73      0.61      0.67        31

    accuracy                           0.75       706
   macro avg       0.77      0.64      0.68       706
weighted avg       0.75      0.75      0.74       706



In [42]:
run_SVM(mean, 'rbf')

  y = column_or_1d(y, warn=True)


              precision    recall  f1-score   support

           1       0.88      0.38      0.53        40
           2       0.71      0.62      0.66       143
           3       0.72      0.89      0.80       302
           4       0.80      0.72      0.76       190
           5       0.81      0.55      0.65        31

    accuracy                           0.75       706
   macro avg       0.78      0.63      0.68       706
weighted avg       0.75      0.75      0.74       706



In [43]:
run_SVM(mean, 'poly')

  y = column_or_1d(y, warn=True)


              precision    recall  f1-score   support

           1       0.88      0.53      0.66        40
           2       0.60      0.35      0.44       143
           3       0.60      0.91      0.72       302
           4       0.80      0.47      0.59       190
           5       0.76      0.61      0.68        31

    accuracy                           0.64       706
   macro avg       0.72      0.57      0.62       706
weighted avg       0.67      0.64      0.62       706



In [44]:
run_SVM(iterative, 'linear')

  y = column_or_1d(y, warn=True)


              precision    recall  f1-score   support

         1.0       0.97      0.75      0.85        40
         2.0       0.82      0.73      0.77       143
         3.0       0.78      0.89      0.83       302
         4.0       0.82      0.76      0.79       190
         5.0       0.78      0.68      0.72        31

    accuracy                           0.80       706
   macro avg       0.83      0.76      0.79       706
weighted avg       0.81      0.80      0.80       706



In [45]:
run_SVM(iterative, 'rbf')

  y = column_or_1d(y, warn=True)


              precision    recall  f1-score   support

         1.0       0.89      0.42      0.58        40
         2.0       0.71      0.63      0.67       143
         3.0       0.72      0.88      0.79       302
         4.0       0.79      0.71      0.75       190
         5.0       0.78      0.58      0.67        31

    accuracy                           0.75       706
   macro avg       0.78      0.65      0.69       706
weighted avg       0.75      0.75      0.74       706



In [46]:
run_SVM(iterative, 'poly')

  y = column_or_1d(y, warn=True)


              precision    recall  f1-score   support

         1.0       0.92      0.57      0.71        40
         2.0       0.61      0.37      0.46       143
         3.0       0.60      0.89      0.72       302
         4.0       0.79      0.49      0.60       190
         5.0       0.79      0.61      0.69        31

    accuracy                           0.65       706
   macro avg       0.74      0.59      0.64       706
weighted avg       0.68      0.65      0.63       706



In [47]:
run_SVM(knn, 'linear')

  y = column_or_1d(y, warn=True)


              precision    recall  f1-score   support

         1.0       0.97      0.70      0.81        40
         2.0       0.80      0.72      0.76       143
         3.0       0.77      0.88      0.83       302
         4.0       0.81      0.76      0.78       190
         5.0       0.74      0.65      0.69        31

    accuracy                           0.80       706
   macro avg       0.82      0.74      0.77       706
weighted avg       0.80      0.80      0.79       706



In [48]:
run_SVM(knn, 'rbf')

  y = column_or_1d(y, warn=True)


              precision    recall  f1-score   support

         1.0       0.88      0.38      0.53        40
         2.0       0.71      0.63      0.67       143
         3.0       0.72      0.89      0.80       302
         4.0       0.81      0.70      0.75       190
         5.0       0.73      0.61      0.67        31

    accuracy                           0.75       706
   macro avg       0.77      0.64      0.68       706
weighted avg       0.75      0.75      0.74       706



In [49]:
run_SVM(knn, 'poly')

  y = column_or_1d(y, warn=True)


              precision    recall  f1-score   support

         1.0       0.84      0.53      0.65        40
         2.0       0.58      0.35      0.44       143
         3.0       0.60      0.90      0.72       302
         4.0       0.80      0.49      0.61       190
         5.0       0.79      0.61      0.69        31

    accuracy                           0.64       706
   macro avg       0.72      0.58      0.62       706
weighted avg       0.67      0.64      0.63       706



The filled datasets also performed roughly the same, with a linear kernel achieving accuracy of .80, rbf .75, and poly .64-.65. 