In [None]:
pip install PyYAML==5.4.1 --no-build-isolation


In [None]:
pip install kfp==1.8.11

In [None]:
pip install scikit-learn

In [None]:
import kfp
import kfp.dsl as dsl
import kfp.notebook
import kfp.components as comp
from kfp import compiler
from kfp.components import func_to_container_op, InputPath, OutputPath
from kubernetes import client as k8s_client

In [None]:
def Preprocess(out_dir: OutputPath(str),dataOrigin:str):
    """Download a no-show dataset, dummy-code its categorical columns and save it as CSV.

    Args:
        out_dir: Path of the CSV file to write (despite the name it is a file
            path, not a directory).
        dataOrigin: File name of the CSV inside the ``noshowdata`` GitHub
            repository, e.g. ``"No-show_Data.csv"``.

    Raises:
        KeyError: If one of the nine required categorical columns is missing.

    The ``Status`` column is dummy-coded only when the file contains it, so
    the same component handles files with and without that column.
    """
    # Imports stay inside the function: it is passed to func_to_container_op
    # and therefore has to be self-contained.
    import pandas as pd

    base_url = "https://raw.githubusercontent.com/Durbek-Gafur/noshowdata/main/"
    NoShowData = pd.read_csv(base_url + dataOrigin)

    categorical_columns = [
        "Gender", "DOW", "SMS_received", "Scholarship", "Smoking_Status",
        "Hypertension", "Diabetes", "Alcoholism", "Tuberculosis",
    ]
    for column in categorical_columns:
        NoShowData[column] = NoShowData[column].astype('category')

    # Check for "Status" explicitly instead of catching every exception from
    # get_dummies: a bare except would also hide unrelated failures.
    dummy_columns = list(categorical_columns)
    if "Status" in NoShowData.columns:
        dummy_columns.append("Status")

    # Dummy columns are appended after the untouched columns in the order
    # given here, so "Status" (when present) ends up last.
    NoShowData = pd.get_dummies(
        NoShowData,
        columns=dummy_columns,
        prefix=dummy_columns,
        drop_first=True,
    )
    NoShowData.to_csv(out_dir, index=False)


In [None]:
def SelectFeatureAndSplit(in_dir: InputPath(),
                          x_train: OutputPath(str), 
                          x_test: OutputPath(str),
                          y_train: OutputPath(str), 
                          y_test: OutputPath(str)):
    """Split the preprocessed dataset into train/test predictors and outcome.

    Every column except the last is treated as a predictor and the last
    column as the outcome. The split is 75% train / 25% test with a fixed
    random seed.

    Args:
        in_dir: Path of the preprocessed CSV to read.
        x_train: Path of the CSV to write the training predictors to.
        x_test: Path of the CSV to write the test predictors to.
        y_train: Path of the CSV to write the training outcome to.
        y_test: Path of the CSV to write the test outcome to.
    """
    # Imports stay inside the function: it is passed to func_to_container_op
    # and therefore has to be self-contained.
    import os

    import pandas as pd
    from sklearn.model_selection import train_test_split

    NoShowData = pd.read_csv(in_dir)

    NoShow_Predictors = NoShowData.iloc[:, :-1]
    # A list indexer keeps the outcome as a one-column DataFrame so the saved
    # CSV carries the column name as its header.
    NoShow_Outcome = NoShowData.iloc[:, [-1]]

    # train_test_split returns X_train, X_test, y_train, y_test in that order.
    splits = train_test_split(
        NoShow_Predictors,
        NoShow_Outcome,
        test_size=0.25,
        random_state=8810,
    )

    for frame, path in zip(splits, (x_train, x_test, y_train, y_test)):
        parent = os.path.dirname(path)
        # os.makedirs("") raises, so only create a parent when the path has one.
        if parent:
            os.makedirs(parent, exist_ok=True)
        frame.to_csv(path, index=False)



In [None]:
def TrainClassifier(x_train_dir: InputPath(),y_train_dir: InputPath(), out_dir: OutputPath(str),classifierName:str):
    """Fit the requested classifier on the training split and pickle it.

    Args:
        x_train_dir: Path of the CSV holding the training predictors.
        y_train_dir: Path of the CSV holding the training outcome.
        out_dir: Path of the pickle file to write the fitted model to.
        classifierName: ``"DecisionTreeClassifier"`` or
            ``"RandomForestClassifier"``.

    Raises:
        ValueError: If ``classifierName`` is not one of the supported names.
    """
    # Validate first so an unsupported name fails with a clear message before
    # any data is read (previously the classifier class was simply left unbound).
    supported = ("DecisionTreeClassifier", "RandomForestClassifier")
    if classifierName not in supported:
        raise ValueError(
            f"Unsupported classifierName {classifierName!r}; expected one of {supported}"
        )

    # Imports stay inside the function: it is passed to func_to_container_op
    # and therefore has to be self-contained.
    import pickle

    import pandas as pd
    if classifierName == "DecisionTreeClassifier":
        from sklearn.tree import DecisionTreeClassifier as Classifier
    else:
        from sklearn.ensemble import RandomForestClassifier as Classifier

    X_Train_NoShow = pd.read_csv(x_train_dir)
    # Flatten the one-column outcome frame to the 1-D shape both classifiers expect.
    y_Train_NoShow = pd.read_csv(y_train_dir).values.ravel()

    # Fixed seed (same value as the train/test split) makes training reproducible.
    model = Classifier(random_state=8810).fit(X_Train_NoShow, y_Train_NoShow)

    with open(out_dir, 'wb') as handle:
        pickle.dump(model, handle)


In [None]:
def TestClassifier(pickle_dir: InputPath(),x_test_dir: InputPath(),y_test_dir: InputPath(),classifierName: str) -> float:
    """Score a pickled classifier on the test split and return its ROC AUC.

    Args:
        pickle_dir: Path of the pickled, fitted model.
        x_test_dir: Path of the CSV holding the test predictors.
        y_test_dir: Path of the CSV holding the test outcome.
        classifierName: Not used by the computation; kept so existing callers
            keep working.

    Returns:
        Area under the ROC curve built from the model's predictions, with
        label ``1`` as the positive class.
    """
    # Imports stay inside the function: it is passed to func_to_container_op
    # and therefore has to be self-contained. The classifier class itself
    # does not need importing here: pickle imports what the model requires.
    import pickle

    import pandas as pd
    from sklearn import metrics

    # NOTE(review): pickle.load executes code from the file; only use it on
    # models produced by this pipeline.
    with open(pickle_dir, 'rb') as handle:
        model = pickle.load(handle)

    X_Test_NoShow = pd.read_csv(x_test_dir)
    # Flatten the one-column outcome frame to a 1-D label array.
    y_Test_NoShow = pd.read_csv(y_test_dir).values.ravel()

    # NOTE(review): the curve is built from hard class predictions, not
    # probability scores, so it has a single operating point. Switching to
    # predict_proba would change the reported value — confirm before changing.
    y_pred = model.predict(X_Test_NoShow)
    fpr, tpr, _ = metrics.roc_curve(y_Test_NoShow, y_pred, pos_label=1)
    return float(metrics.auc(fpr, tpr))

In [None]:
def SmartScheduling(pickle_dir: InputPath(), data_dir: InputPath(),classifierName:str) -> str:
    """Simulate sequential patient call-ins and build an appointment schedule.

    Up to 20 patients are sampled at random from the new-examples file, the
    pickled model predicts each one's no-show risk (``1`` = low-risk,
    ``0`` = high-risk), and patients are assigned to 10 slots in call order:

    * low-risk patients take the first empty slot from the start of the day,
      high-risk patients the first empty slot from the end;
    * when no slot is empty, a patient may be double-booked into a slot held
      by a patient of the opposite risk type, for at most 3 slots;
    * otherwise the patient is deferred.

    Patients whose prediction is neither ``0`` nor ``1`` are ignored.

    Args:
        pickle_dir: Path of the pickled, fitted model.
        data_dir: Path of the preprocessed CSV with the patients to schedule.
        classifierName: Not used by the computation; kept so existing callers
            keep working.

    Returns:
        A text summary of the schedule, the double-booked slots and the number
        of deferred patients. The same text is written to ``output.txt`` in
        the working directory.
    """
    # Imports stay inside the function: it is passed to func_to_container_op
    # and therefore has to be self-contained.
    import pickle

    import numpy as np
    import pandas as pd

    # NOTE(review): pickle.load executes code from the file; only use it on
    # models produced by this pipeline.
    with open(pickle_dir, 'rb') as handle:
        model = pickle.load(handle)

    NoShowData_NewExamples = pd.read_csv(data_dir)

    # Sample at most 20 callers; capping at the row count avoids the error
    # DataFrame.sample raises when asked for more rows than exist.
    tot_patients = min(20, len(NoShowData_NewExamples))
    patient_info = NoShowData_NewExamples.sample(tot_patients)
    if tot_patients:
        risk_predictions = np.asarray(model.predict(patient_info)).ravel()
    else:
        risk_predictions = np.empty(0)

    # Model parameters.
    tot_slots = 10   # total number of bookable slots
    DB = 0           # slots double-booked so far
    DB_max = 3       # at most 3 slots may be double-booked (30% of 10 slots)
    deferred = 0     # patients that could not be scheduled

    # np.nan marks an unassigned slot (the np.NaN alias no longer exists in NumPy 2).
    appointment_schedule = np.full(tot_slots, np.nan)
    appointment_schedule_DB = np.full(tot_slots, np.nan)

    # Remaining capacity per slot: 2 = empty, 1 = single-booked, 0 = double-booked.
    slot_capacity = np.full(tot_slots, 2.0)
    # Risk type of the first patient in each slot; 2 means "nobody yet".
    slot_risktype = np.full(tot_slots, 2.0)

    for p in range(tot_patients):  # patients call in one after another
        risk = risk_predictions[p]
        if risk == 1:
            # Low-risk: search from the start; may share only with high-risk.
            slot_order = range(tot_slots)
            partner_risk = 0
        elif risk == 0:
            # High-risk: search from the end; may share only with low-risk.
            slot_order = range(tot_slots - 1, -1, -1)
            partner_risk = 1
        else:
            continue

        empty_slot = next((s for s in slot_order if slot_capacity[s] == 2), None)
        if empty_slot is not None:
            appointment_schedule[empty_slot] = p
            slot_risktype[empty_slot] = risk
            slot_capacity[empty_slot] -= 1
            continue

        if DB < DB_max:
            shared_slot = next(
                (s for s in slot_order
                 if slot_capacity[s] == 1 and slot_risktype[s] == partner_risk),
                None,
            )
            if shared_slot is not None:
                appointment_schedule_DB[shared_slot] = p
                slot_capacity[shared_slot] -= 1
                DB += 1
                continue

        deferred += 1

    summary = f"Schedule Generated: {appointment_schedule}\nDoubleBooked Slots: {appointment_schedule_DB}\nDeferred Patients: {deferred}"
    with open('output.txt', 'w') as f:
        print(summary, file=f)

    return summary

In [None]:
def GetImportantFeatures(pickle_dir: InputPath(), data_dir: InputPath(), classifierName: str ) -> str:
    """List the model's features ordered from most to least important.

    Feature names are taken from every column of the dataset except the last
    one, which is treated as the outcome.

    Args:
        pickle_dir: Path of the pickled, fitted model; it must expose
            ``feature_importances_``.
        data_dir: Path of the preprocessed CSV the model was trained from.
        classifierName: Not used by the computation; kept so existing callers
            keep working.

    Returns:
        One ``"<feature>\\t:\\t<importance>\\n"`` line per feature, highest
        importance first.

    Raises:
        ValueError: If the number of importances does not match the number of
            predictor columns.
    """
    # Imports stay inside the function: it is passed to func_to_container_op
    # and therefore has to be self-contained.
    import pickle

    import numpy as np
    import pandas as pd

    # NOTE(review): pickle.load executes code from the file; only use it on
    # models produced by this pipeline.
    with open(pickle_dir, 'rb') as handle:
        model = pickle.load(handle)

    NoShowData = pd.read_csv(data_dir)
    features = list(NoShowData.columns[:-1])
    importances = model.feature_importances_

    # Fail loudly instead of pairing importances with the wrong column names.
    if len(importances) != len(features):
        raise ValueError(
            f"Model reports {len(importances)} feature importances but the "
            f"dataset has {len(features)} predictor columns"
        )

    ranked = np.argsort(importances)[::-1]
    return "".join(f"{features[i]}\t:\t{importances[i]}\n" for i in ranked)


In [None]:
def EvalClassifiers(r1:float, r2:float, pickle1: InputPath(),pickle2: InputPath(), out_dir: OutputPath(str)):
    """Publish the pickled model that has the higher score.

    Args:
        r1: Score of the first model.
        r2: Score of the second model.
        pickle1: Path of the first pickled model.
        pickle2: Path of the second pickled model.
        out_dir: Path to write the selected model file to.

    The first model is selected only when ``r1`` is strictly greater than
    ``r2``; on a tie the second model is selected.
    """
    # Imports stay inside the function: it is passed to func_to_container_op
    # and therefore has to be self-contained.
    import shutil

    best_pickle = pickle1 if r1 > r2 else pickle2
    # Copy the file as-is: loading and re-dumping the pickle would execute its
    # payload and require the model's libraries just to move bytes around.
    shutil.copyfile(best_pickle, out_dir)

## Turning functions into container operations

In [None]:
# Every component runs on the same base image.
# NOTE(review): 'latest' is a floating tag, so the Python/NumPy versions inside
# the containers can change between runs — consider pinning a specific tag.
BASE_IMAGE = 'tensorflow/tensorflow:latest'


def _as_container_op(func, packages):
    """Wrap ``func`` as a KFP container op on BASE_IMAGE with ``packages`` passed as packages_to_install."""
    return comp.func_to_container_op(func,
                                     base_image=BASE_IMAGE,
                                     packages_to_install=packages)


# 'pickle' is part of the standard library, so no extra package is listed for
# it; the training op already pickles its model without one.
preprocess_op = _as_container_op(Preprocess, ['pandas'])

selectFeatureAndSplit_op = _as_container_op(SelectFeatureAndSplit, ['pandas', 'scikit-learn'])

trainClassifier_op = _as_container_op(TrainClassifier, ['pandas', 'scikit-learn'])

testClassifier_op = _as_container_op(TestClassifier, ['pandas', 'scikit-learn'])

smartScheduling_op = _as_container_op(SmartScheduling, ['pandas', 'scikit-learn', 'numpy'])

getImportantFeatures_op = _as_container_op(GetImportantFeatures,
                                           ['pandas', 'matplotlib', 'numpy', 'scikit-learn'])

evalClassifiers_op = _as_container_op(EvalClassifiers, ['scikit-learn'])

## Define pipeline

In [None]:
@dsl.pipeline(
    name="Smart Scheduling ",
    description="Smart Outpatient Appointment Scheduling System"
)
def smart_scheduling():
    """Train two classifiers, keep the better one, and schedule new patients with it.

    Steps: preprocess the training data, split it, train and test a decision
    tree and a random forest, select the model with the higher test score,
    report its feature importances, and run the scheduling simulation on the
    preprocessed new examples.
    """
    # Task 1: download and preprocess the training data.
    preprocess_task = preprocess_op("No-show_Data.csv")

    # Task 2: feature selection and train/test split.
    split_task = selectFeatureAndSplit_op(preprocess_task.output)
    x_train = split_task.outputs["x_train"]
    y_train = split_task.outputs["y_train"]
    x_test = split_task.outputs["x_test"]
    y_test = split_task.outputs["y_test"]

    # Task 3: train both candidate classifiers.
    dt_train_task = trainClassifier_op(x_train, y_train, "DecisionTreeClassifier")
    rf_train_task = trainClassifier_op(x_train, y_train, "RandomForestClassifier")

    # Task 4: score both classifiers on the test split.
    dt_test_task = testClassifier_op(dt_train_task.output, x_test, y_test, "DecisionTreeClassifier")
    rf_test_task = testClassifier_op(rf_train_task.output, x_test, y_test, "RandomForestClassifier")

    # Select the pickle of the better-scoring classifier (done at run time
    # inside the component).
    evalClassifiers_task = evalClassifiers_op(dt_test_task.output,
                                              rf_test_task.output,
                                              dt_train_task.output,
                                              rf_train_task.output)

    # Task 6: preprocess the new examples to be scheduled.
    preprocess_new_task = preprocess_op("No-show_Data_Testing.csv")

    def _report_and_schedule(best_classifier):
        """Add the feature-importance and scheduling tasks for the selected model."""
        # Task 5: identify the variables important for predicting no-shows.
        getImportantFeatures_op(evalClassifiers_task.output, preprocess_task.output, best_classifier)
        # Task 7: smart scheduling according to the predictions.
        smartScheduling_op(evalClassifiers_task.output, preprocess_new_task.output, best_classifier)

    # Task outputs are placeholders while the pipeline is being compiled, so a
    # plain Python `if` on them cannot depend on the scores produced at run
    # time. dsl.Condition defers the decision to run time; the two conditions
    # mirror the `r1 > r2` rule used inside EvalClassifiers.
    with dsl.Condition(dt_test_task.output > rf_test_task.output, name="decision-tree-is-best"):
        _report_and_schedule("DecisionTreeClassifier")

    with dsl.Condition(dt_test_task.output <= rf_test_task.output, name="random-forest-is-best"):
        _report_and_schedule("RandomForestClassifier")


## Execute pipeline

In [None]:
# Compile the pipeline function into a deployable package archive.
compiler.Compiler().compile(pipeline_func=smart_scheduling,
                            package_path="smart_scheduling.zip")

In [None]:
# Confirm that the compiled pipeline archive exists and show its size.
!ls -al ./smart_scheduling.zip

In [None]:
# Extract the archive into the working directory; -o overwrites existing files without prompting.
!unzip -o ./smart_scheduling.zip

In [None]:
# !pygmentize pipeline.yaml

In [None]:

EXPERIMENT_NAME = "Smart Scheduling Experiment 3"
client = kfp.Client()

# Reuse the experiment when it already exists, otherwise create it.
try:
    experiment = client.get_experiment(experiment_name=EXPERIMENT_NAME)
except ValueError:
    # kfp 1.8 raises ValueError when no experiment has this name. Other errors
    # (e.g. connection problems) are left to propagate instead of being hidden.
    experiment = client.create_experiment(EXPERIMENT_NAME)

# Submit the compiled package as a new run in that experiment.
my_run = client.run_pipeline(experiment.id, "smart-scheduling-pipeline", "smart_scheduling.zip")