In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import joblib

from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

import warnings
warnings.filterwarnings("ignore")

In [2]:
df = pd.read_csv("Resources/admissions.csv")
df.head()

Unnamed: 0,patient_id,date_admission,ciprofloxacin,gentamicin,amoxicillin_clavulanic_acid
0,7923,3,S,S,S
1,7825,1,S,S,R
2,5375,2,S,R,S
3,2118,2,S,R,R
4,12,1,S,R,S


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10108 entries, 0 to 10107
Data columns (total 5 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   patient_id                   10108 non-null  int64 
 1   date_admission               10108 non-null  int64 
 2   ciprofloxacin                10108 non-null  object
 3   gentamicin                   10108 non-null  object
 4   amoxicillin_clavulanic_acid  10108 non-null  object
dtypes: int64(2), object(3)
memory usage: 395.0+ KB


In [4]:
df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=1)

df_train_full = df_train_full.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

df_train, df_val = train_test_split(df_train_full, test_size=0.33, random_state=11)

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)

In [5]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2022 entries, 0 to 2021
Data columns (total 5 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   patient_id                   2022 non-null   int64 
 1   date_admission               2022 non-null   int64 
 2   ciprofloxacin                2022 non-null   object
 3   gentamicin                   2022 non-null   object
 4   amoxicillin_clavulanic_acid  2022 non-null   object
dtypes: int64(2), object(3)
memory usage: 79.1+ KB


In [6]:
df_test.head()

Unnamed: 0,patient_id,date_admission,ciprofloxacin,gentamicin,amoxicillin_clavulanic_acid
0,4440,1,S,S,S
1,3190,1,S,R,S
2,8334,1,S,R,S
3,2786,1,R,R,S
4,1176,3,S,S,S


In [7]:
# convert above code into a function for all antibiotics ciprofloxacin, gentamicin, and amoxicillin_clavulanic_acid
def prepare(df, antibiotic):
    df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=1)

    df_train_full = df_train_full.reset_index(drop=True)
    df_test = df_test.reset_index(drop=True)

    df_train, df_val = train_test_split(df_train_full, test_size=0.33, random_state=11)

    df_train = df_train.reset_index(drop=True)
    df_val = df_val.reset_index(drop=True)

    y_train = df_train[antibiotic].values
    y_val = df_val[antibiotic].values

    del df_train[antibiotic]
    del df_val[antibiotic]

    categorical = []
    numerical = ['date_admission']

    return df_train, df_val, y_train, y_val, categorical, numerical


2. Create a model of your choice to predict AMR (i.e., antibiotics which are R). The specifics of your model are up to you, it could be a frequentist approach, bayesian approach, or machine learning approach. 'Black box' approaches are also fine.

In [8]:
categorical = []
numerical = ['date_admission']
# combine all the code above into a function for three antibiotics
def train(df, y, C=1.0):
    cat = df[categorical + numerical].to_dict(orient='rows')
    
    dv = DictVectorizer(sparse=False)
    dv.fit(cat)

    X = dv.transform(cat)

    model = LogisticRegression(solver='liblinear', C=C)
    model.fit(X, y)

    return dv, model

def predict(df, dv, model):
    cat = df[categorical + numerical].to_dict(orient='rows')
    
    X = dv.transform(cat)

    y_pred = model.predict_proba(X)[:, 1]

    return y_pred

def train_and_predict(df_train, df_test, antibiotic):
    y_train = df_train[antibiotic].values
    y_test = df_test[antibiotic].values

    dv, model = train(df_train, y_train, C=0.5)
    y_pred = predict(df_test, dv, model)

    auc = roc_auc_score(y_test, y_pred)
    print(antibiotic, auc)

    joblib.dump(model, f"model_{antibiotic}.pkl")
    joblib.dump(dv, f"dv_{antibiotic}.pkl")

    return dv, model

In [9]:
# run the function for all three antibiotics using a for loop
antibiotics = ["ciprofloxacin", "gentamicin", "amoxicillin_clavulanic_acid"]
for antibiotic in antibiotics:
    print(antibiotic)
    df_train, df_val, y_train, y_val, categorical, numerical = prepare(df, antibiotic)
    dv, model = train(df_train, y_train, C=0.5)
    y_pred = predict(df_val, dv, model)
    auc = roc_auc_score(y_val, y_pred)
    print(auc)
    if y_pred[0] > 0.5:
        print("Resistant")
    else:
        print("Sensitive")

    joblib.dump(model, f"model_{antibiotic}.pkl")
    joblib.dump(dv, f"dv_{antibiotic}.pkl")

ciprofloxacin
0.5190745365261886
Resistant
gentamicin
0.5065583423116433
Sensitive
amoxicillin_clavulanic_acid
0.5205069459101854
Resistant


In [10]:
# To use the training script, first create a directory where you will store the file.
import os
app_dir = "./app"
os.makedirs(app_dir, exist_ok=True)

3. Develop an algorithm that determines whether each patient has had AMR to a particular antibiotic at any point in the past. 

In [11]:
# get 10 customers data

customers = df_test[categorical + numerical].iloc[:10].to_dict(orient='records')

# predict the probability of the 10 customers for all three antibiotics ciprofloxacin, gentamicin, and amoxicillin_clavulanic_acid
dv_ciprofloxacin = joblib.load("dv_ciprofloxacin.pkl")
model_ciprofloxacin = joblib.load("model_ciprofloxacin.pkl")
y_pred_ciprofloxacin = predict(df_test, dv_ciprofloxacin, model_ciprofloxacin)

dv_gentamicin = joblib.load("dv_gentamicin.pkl")
model_gentamicin = joblib.load("model_gentamicin.pkl")
y_pred_gentamicin = predict(df_test, dv_gentamicin, model_gentamicin)

dv_amoxicillin_clavulanic_acid = joblib.load("dv_amoxicillin_clavulanic_acid.pkl")
model_amoxicillin_clavulanic_acid = joblib.load("model_amoxicillin_clavulanic_acid.pkl")
y_pred_amoxicillin_clavulanic_acid = predict(df_test, dv_amoxicillin_clavulanic_acid, model_amoxicillin_clavulanic_acid)

# if the probability is greater than 0.5, then create a new column called ciprofloxacin_previous_R and set the value to True else False
df_test['ciprofloxacin_previous_R'] = y_pred_ciprofloxacin > 0.5
df_test['gentamicin_previous_R'] = y_pred_gentamicin > 0.5
df_test['amoxicillin_clavulanic_acid_previous_R'] = y_pred_amoxicillin_clavulanic_acid > 0.5
df_test.head(10)

Unnamed: 0,patient_id,date_admission,ciprofloxacin,gentamicin,amoxicillin_clavulanic_acid,ciprofloxacin_previous_R,gentamicin_previous_R,amoxicillin_clavulanic_acid_previous_R
0,4440,1,S,S,S,True,False,True
1,3190,1,S,R,S,True,False,True
2,8334,1,S,R,S,True,False,True
3,2786,1,R,R,S,True,False,True
4,1176,3,S,S,S,True,False,True
5,1276,1,S,R,S,True,False,True
6,4333,1,S,S,S,True,False,True
7,1259,1,S,S,S,True,False,True
8,840,2,S,R,S,True,False,True
9,5861,1,S,S,S,True,False,True


4. Create a function, called previous_resistance that accepts patient_id and antibiotic name as an input, and returns TRUE if patient has had antibiotic resistance to a specific antibiotic (provided by antibiotic name argument) at any time point within the dataset. Function should return FALSE if the patient does not have resistance to the antibiotic, or if the patient is not in the dataset (we assume they do not have resistance in this case).

In [12]:
# Create a function, called previous_resistance that accepts patient_id and antibiotic name as an input, and returns TRUE if patient has had antibiotic resistance to a specific antibiotic (provided by antibiotic name argument) at any time point within the dataset. Function should return FALSE if the patient does not have resistance to the antibiotic, or if the patient is not in the dataset (we assume they do not have resistance in this case).
def previous_resistance(patient_id, antibiotic):
    if patient_id in df_test.patient_id.values:
        if antibiotic == "ciprofloxacin":
            return df_test[df_test.patient_id == patient_id].ciprofloxacin_previous_R.values[0]
        elif antibiotic == "gentamicin":
            return df_test[df_test.patient_id == patient_id].gentamicin_previous_R.values[0]
        elif antibiotic == "amoxicillin_clavulanic_acid":
            return df_test[df_test.patient_id == patient_id].amoxicillin_clavulanic_acid_previous_R.values[0]
    else:
        return False

In [13]:
print(previous_resistance(1, "ciprofloxacin"))

False


5. Use your model and previous_resistance function and create a dashboard/web frontend for a doctor to use. The doctor should input patient_id and an antibiotic name (ciprofloxacin, gentamicin or amoxicillin-clavulanic acid), and the algorithm should do the following:

if the patient has resistance within the dataset to the antibiotic, then it should alert the doctor of this
otherwise, if resistance to the agent is not present in the dataset, or the patient is not in the dataset, the algorithm should use the prediction model from above to predict the risk of R for the antibiotic, and return this prediction to the doctor.

In [17]:
pip install streamlit

Note: you may need to restart the kernel to use updated packages.


In [14]:
%%writefile app_dir/app.py
import streamlit as st
import pandas as pd
import numpy as np
import joblib

df_test = pd.read_csv("Resources/admissions.csv")

categorical = []
numerical = ['date_admission']

customers = df_test[categorical + numerical].iloc[:10].to_dict(orient='records')

def predict(df, dv, model):
    cat = df[categorical + numerical].to_dict(orient='rows')
    
    X = dv.transform(cat)

    y_pred = model.predict_proba(X)[:, 1]

    return y_pred

# predict the probability of the 10 customers for all three antibiotics ciprofloxacin, gentamicin, and amoxicillin_clavulanic_acid
dv_ciprofloxacin = joblib.load("dv_ciprofloxacin.pkl")
model_ciprofloxacin = joblib.load("model_ciprofloxacin.pkl")
y_pred_ciprofloxacin = predict(df_test, dv_ciprofloxacin, model_ciprofloxacin)

dv_gentamicin = joblib.load("dv_gentamicin.pkl")
model_gentamicin = joblib.load("model_gentamicin.pkl")
y_pred_gentamicin = predict(df_test, dv_gentamicin, model_gentamicin)

dv_amoxicillin_clavulanic_acid = joblib.load("dv_amoxicillin_clavulanic_acid.pkl")
model_amoxicillin_clavulanic_acid = joblib.load("model_amoxicillin_clavulanic_acid.pkl")
y_pred_amoxicillin_clavulanic_acid = predict(df_test, dv_amoxicillin_clavulanic_acid, model_amoxicillin_clavulanic_acid)

# if the probability is greater than 0.5, then create a new column called ciprofloxacin_previous_R and set the value to True else False
df_test['ciprofloxacin_previous_R'] = y_pred_ciprofloxacin > 0.5
df_test['gentamicin_previous_R'] = y_pred_gentamicin > 0.5
df_test['amoxicillin_clavulanic_acid_previous_R'] = y_pred_amoxicillin_clavulanic_acid > 0.5


def previous_resistance(patient_id, antibiotic):
    if patient_id in df_test.patient_id.values:
        if antibiotic == "ciprofloxacin":
            return df_test[df_test.patient_id == patient_id].ciprofloxacin_previous_R.values[0]
        elif antibiotic == "gentamicin":
            return df_test[df_test.patient_id == patient_id].gentamicin_previous_R.values[0]
        elif antibiotic == "amoxicillin_clavulanic_acid":
            return df_test[df_test.patient_id == patient_id].amoxicillin_clavulanic_acid_previous_R.values[0]
    else:
        return False
    
def predict(df, dv, model):
    cat = df[categorical + numerical].to_dict(orient='rows')
    
    X = dv.transform(cat)

    y_pred = model.predict_proba(X)[:, 1]

    return y_pred

def doctor_alert(patient_id, antibiotic):
    if previous_resistance(patient_id, antibiotic):
        return "Patient has resistance to " + antibiotic
    else:
        if antibiotic == "ciprofloxacin":
            y_pred = predict(df_test[df_test.patient_id == patient_id], dv_ciprofloxacin, model_ciprofloxacin)
            return "Patient has a " + str(y_pred[0]) + " chance of resistance to " + antibiotic
        elif antibiotic == "gentamicin":
            y_pred = predict(df_test[df_test.patient_id == patient_id], dv_gentamicin, model_gentamicin)
            return "Patient has a " + str(y_pred[0]) + " chance of resistance to " + antibiotic
        elif antibiotic == "amoxicillin_clavulanic_acid":
            y_pred = predict(df_test[df_test.patient_id == patient_id], dv_amoxicillin_clavulanic_acid, model_amoxicillin_clavulanic_acid)
            return "Patient has a " + str(y_pred[0]) + " chance of resistance to " + antibiotic
        
st.title('Antibiotic Resistance Prediction')

patient_id = st.number_input('Enter patient ID')
antibiotic = st.selectbox('Select antibiotic', ["ciprofloxacin", "gentamicin", "amoxicillin_clavulanic_acid"])

if st.button('Predict'):
    result = doctor_alert(patient_id, antibiotic)
    st.write(result)

Overwriting app_dir/app.py


In [16]:
!streamlit run app_dir/app.py

^C
