In [6]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
import statsmodels.api as sm
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier


In [None]:
import cms_procedures

### Assume the number of procedure ids (observations) in the dataset is n, where n is an integer.

#### The three API functions return values of the following form:

#### a. get_procedure_attributes(procedure_id = None) 
= {'procedure_id': value1, 'type of procedure': value2, 'how long it lasted' : value3, 'severity of the condition being addressed' : value4}


#### b. get_procedure_success(procedure_id) 

= 'True' / 'False' (returns True if the procedure succeeded and False otherwise; 'outcome': "Success" / "Failure")



#### c. get_procedure_outcomes(procedure_id) 

= more granular measures of the outcome

= {'severity of post procedure complications': value1, 'pain': value2, 'recurrence of original condition': value3}




In [None]:
# Extracting the data using the specified API (three functions of cms_procedures).
# The functions are reached through the module because the notebook only does
# `import cms_procedures`; bare function names would not be in scope.

#lists of attributes:
Procedure_ids, Type_of_procedure, How_long_it_lasted, Severity_of_the_condition_being_addressed  = [], [], [], []

#lists of outcomes:
Severity_of_post_procedure_complications, Pain, Recurrence_of_original_condition = [], [], []

#list of the outcome(True/False) of procedures:
Outcomes = []

# Ids already collected; a set keeps the duplicate check O(1).
# (assumes procedure ids are hashable, e.g. int or str — TODO confirm)
seen_procedure_ids = set()

# `n` is the assumed number of distinct procedure ids / observations and must be
# defined before this cell runs. The strict `<` stops once exactly n ids have been
# collected; `<=` would wait for an (n+1)-th unique id and never terminate when
# only n exist.
while len(Procedure_ids) < n:

    # Calling with procedure_id=None yields the attributes of some procedure
    # (presumably chosen by the API); only its id is used to detect duplicates.
    procedure_id = cms_procedures.get_procedure_attributes(procedure_id = None)['procedure_id']
    if procedure_id in seen_procedure_ids:
        continue
    seen_procedure_ids.add(procedure_id)

    # Query each endpoint once per procedure instead of once per field.
    attributes = cms_procedures.get_procedure_attributes(procedure_id)
    granular_outcomes = cms_procedures.get_procedure_outcomes(procedure_id)

    Procedure_ids.append(procedure_id)
    Type_of_procedure.append(attributes['type of procedure'])
    How_long_it_lasted.append(attributes['how long it lasted'])
    Severity_of_the_condition_being_addressed.append(attributes['severity of the condition being addressed'])

    Severity_of_post_procedure_complications.append(granular_outcomes['severity of post procedure complications'])
    Pain.append(granular_outcomes['pain'])
    Recurrence_of_original_condition.append(granular_outcomes['recurrence of original condition'])

    Outcomes.append(cms_procedures.get_procedure_success(procedure_id))


In [None]:
# Construct dataframe using the data pulled out above:

d = {'procedure_id': Procedure_ids, 'type of procedure': Type_of_procedure, 'how long it lasted': How_long_it_lasted, 'severity of the condition being addressed': Severity_of_the_condition_being_addressed,
    'severity of post procedure complications': Severity_of_post_procedure_complications, 'pain': Pain, 'recurrence of original condition': Recurrence_of_original_condition, 'outcome': Outcomes}

df = pd.DataFrame(data = d)

# Dealing with missing values:

#if the percentage of rows with missing values is not so high (< 30%), we choose to drop those rows:
if len(df.dropna())/len(df) > 0.7:
    new_df = df.dropna().copy()

#otherwise, only keep the columns whose share of missing values is below the 30% threshold:
else:
    missing_values_perc = df.isnull().mean() # share of missing values per column, indexed by column name
    new_variables = [
        column for column in df.columns
        # the identifier and the label are needed downstream, so they are always kept
        if column in ('procedure_id', 'outcome') or missing_values_perc[column] < 0.3
    ]
    # Rows that still contain gaps are dropped, since the logistic regression
    # fitted later cannot be estimated on NaN values.
    new_df = df[new_variables].dropna().copy()


def _encode_outcome(value):
    """Map a procedure outcome to 1 (success) or 0 (failure).

    Accepts the documented string forms ('True'/'False', "Success"/"Failure")
    as well as real booleans, so a boolean False is not mistaken for a success.
    """
    if isinstance(value, str):
        return 0 if value.strip().lower() in ('false', 'failure') else 1
    return int(bool(value))


#replace the failure/success labels in 'outcome' by 0 and 1 respectively
#(new_df is an explicit copy, so this assignment does not write into a view of df):

new_df['outcome'] = new_df['outcome'].apply(_encode_outcome)


In [None]:
labels = new_df["outcome"]
procedure_ids = new_df["procedure_id"]

#features X: taken from the cleaned frame (not the raw `df`) so that the rows
#stay aligned with `labels` after missing values were removed
X = new_df.drop(columns = ['procedure_id', 'outcome'])

# Changing categorical variables in features X into dummy variables:
# every column that is neither numeric nor boolean is treated as categorical.
string_variables = X.select_dtypes(exclude = ['number', 'bool']).columns.tolist()
X = pd.get_dummies(X, columns = string_variables)

X_features = X.columns # all features


## Feature evaluation using Logistic Regression model with Minimum AIC

 - I am going to find the model with the minimum AIC.
 - I first split the data into training and test sets, use the training set to select the variables to include in the model, and then estimate the model parameters on the test set.


In [None]:
# this function returns a logistic regression model with the minimum AIC
def minAIC_Logit(X,y):
    """Backward-eliminate predictors of a logistic regression while AIC improves.

    Starting from the model on all columns of ``X``, each round drops the
    predictor(s) that do not have a p-value strictly below the current maximum
    (i.e. the least significant ones) and refits. The reduced model is accepted
    only if its AIC is strictly lower; otherwise the search stops.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric design matrix; its columns are the candidate predictors.
    y : array-like
        Binary response aligned with the rows of ``X``.

    Returns
    -------
    model
        The fitted ``sm.Logit`` results object of the last accepted model.
    variables : pandas.Index
        Column labels of ``X`` used by that model.
    """
    variables = X.columns
    model = sm.Logit(y,X[variables]).fit()
    while True:
        print(f'old model aic: {model.aic}')
        maxp = np.max(model.pvalues)
        # Plain boolean array, so indexing the Index does not depend on label alignment.
        keep_mask = (model.pvalues < maxp).to_numpy()
        newvariables = variables[keep_mask]
        if len(newvariables) == 0:
            # Nothing would be left to fit (single predictor, or all p-values
            # tied/undefined); keep the current model instead of crashing.
            break
        removed = variables[~keep_mask].values
        print(f'considering a model with these variables removed: {removed}')
        newmodel = sm.Logit(y,X[newvariables]).fit()
        print(f'new model aic: {newmodel.aic}')
        if newmodel.aic < model.aic:
            model = newmodel
            variables = newvariables
        else:
            break
    return model,variables


X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.3, random_state=2)

# Variable selection uses the training split only.
new_train_model, logit_variables = minAIC_Logit(X_train.astype(float), y_train)

# Now fit the selected variables on the held-out test data, as the section text
# describes. Refitting on the training split would only reproduce new_train_model.
# The cast to float mirrors the one used during selection.
new_model = sm.Logit(y_test, X_test[logit_variables].astype(float)).fit()
results = new_model.summary()
results

In [None]:
# AIC on the test split with all predictors vs. only the selected ones.
# No trailing commas here: they would wrap each AIC in a 1-tuple.
# The float cast matches the one used when fitting on the training split.
AIC_test_all_pred = sm.Logit(y_test, X_test.astype(float)).fit().aic

AIC_test_selected_pred = sm.Logit(y_test, X_test[logit_variables].astype(float)).fit().aic

#the AIC is supposed to decrease when using logit_variables.
AIC_test_all_pred, AIC_test_selected_pred

## Modeling and model evaluation


In [None]:
# Fresh 70/30 split restricted to the predictors kept by the AIC selection.
# NOTE(review): this split uses a different random_state (42) than the split used
# for variable selection (2), so this test set may contain rows that took part in
# the selection — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    X[logit_variables],
    labels,
    random_state=42,
    test_size=0.3,
)

# Shapes of the four pieces, in the order train X, train y, test X, test y.
tuple(part.shape for part in (X_train, y_train, X_test, y_test))


## Use RandomForest Classifier:
 - Reasons: 
     1. this classifier algorithm can overcome the overfitting problem in decision trees;
     2. it works well for both categorical and continuous values.
     3. it can handle missing values in the data (natively in scikit-learn only from version 1.4 onward; older versions require imputing or dropping them first).
     
     
 - In order to get a good model, we need to tune the hyper-parameters:



 ### rfc is the trained model

In [None]:
# Pick the number of trees by 5-fold cross-validated accuracy on the training split.
cvs = float('-inf')   # best mean CV accuracy seen so far
estimator = 0         # n_estimators that achieved it
for i in range(100,3000,50):
    candidate = RandomForestClassifier(random_state=42,n_estimators=i)
    # cross_val_score returns one score per fold; compare candidates on the mean,
    # since comparing the raw array with a float is ambiguous and raises ValueError.
    cv = cross_val_score(candidate,X_train,y_train,cv=5,scoring='accuracy').mean()
    if cv > cvs:
        cvs = cv
        estimator = i


# Refit on the whole training split with the selected number of trees.
rfc = RandomForestClassifier(random_state=42,n_estimators=estimator)
rfc = rfc.fit(X_train,y_train) # rfc is the trained model
predictions = rfc.predict(X_test)

#### F1 score (as a percentage) on test set:

In [None]:
import sklearn.metrics as metrics
from sklearn.metrics import f1_score
# F1 of the test-set predictions, rescaled from a 0-1 fraction to a percentage.
f1_fraction = metrics.f1_score(y_test, predictions)
score = 100 * f1_fraction
score