In [None]:
import io
import os
from pathlib import Path

import boto3
import mlflow
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Point MLflow at a local, file-based tracking store.
# The directory comes from the MLFLOW_TRACKING_PATH environment variable
# (falling back to ./mlruns) so this cell runs on a fresh kernel instead of
# depending on a variable that is never defined in the notebook.
mlflow_tracking_path = os.environ.get("MLFLOW_TRACKING_PATH", "mlruns")
# Path.as_uri() needs an absolute path, hence resolve(); it produces a
# well-formed file:// URI on every OS, replacing the hand-built "file:\" prefix.
mlflow.set_tracking_uri(Path(mlflow_tracking_path).resolve().as_uri())

In [None]:
# Location of the raw patient CSV in S3.
bucket_name, data_file_name = "lneg-loka", "patient_data_raw/patient_data_raw.csv"

# Fetch the object, decode its bytes as UTF-8 text and parse it with pandas.
s3 = boto3.client('s3')
obj = s3.get_object(Key=data_file_name, Bucket=bucket_name)
raw_bytes = obj['Body'].read()
csv_string = raw_bytes.decode('utf-8')
with io.StringIO(csv_string) as csv_buffer:
    data = pd.read_csv(csv_buffer)


A decision tree is a suitable first approach for tabular data. Class-conditional feature plots and statistical tests suggest that the features are not informative for predicting Chronic Obstructive Pulmonary Disease (COPD), so cross-validation and hyperparameter tuning were skipped.

In [None]:
# Columns kept for modelling; the last entry is the target column.
model_features = [
    'age',
    'bmi',
    'smoker',
    'sex',
    'chronic_obstructive_pulmonary_disease',
]
# Select only those columns from the loaded data and display the result.
df = data.loc[:, model_features]
df

In [None]:
# Separate the label column (y) from the remaining predictor columns (X).
target_name = 'chronic_obstructive_pulmonary_disease'
y = df[target_name]
X = df.drop(columns=target_name)

In [None]:
# Hold out 10% of the rows for evaluation with a fixed seed.
# Stratifying on y keeps the class ratio of the target the same in the train
# and test splits, so the small test set cannot end up with a skewed share of
# COPD cases.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=42, stratify=y
)

In [None]:
# Hyperparameters of the decision tree: Gini criterion, depth capped at 5,
# fixed random_state.
tree_settings = {'criterion': 'gini', 'max_depth': 5, 'random_state': 42}
model = DecisionTreeClassifier(**tree_settings)


In [None]:
# One-hot encode the categorical columns (dropping the first level of each);
# every other column is passed through unchanged.
categorical_columns = ['sex', 'smoker']
categorical_encoder = OneHotEncoder(drop='first')
preprocessor = ColumnTransformer(
    transformers=[('cat', categorical_encoder, categorical_columns)],
    remainder='passthrough',
)

In [None]:
# Chain the preprocessing step and the classifier into a single estimator.
pipeline_steps = [("preprocess", preprocessor), ("model", model)]
pipe = Pipeline(steps=pipeline_steps)

In [None]:
# Select the MLflow experiment that subsequent runs are recorded under.
mlflow.set_experiment(experiment_name="COPD_classifier_experiments")

In [None]:
with mlflow.start_run(run_name="COPD_DecisionTreeClassifier") as mlrun:
    # Read the tree's hyperparameters from the pipeline so they can be logged.
    tree_params = pipe.named_steps['model'].get_params()
    max_depth, criterion = tree_params['max_depth'], tree_params['criterion']

    # Fit on the training split, then score predictions on the test split.
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    # Record the metric, the hyperparameters, and finally the fitted pipeline.
    mlflow.log_metric("accuracy", accuracy)
    for param_name, param_value in (('max_depth', max_depth), ('criterion', criterion)):
        mlflow.log_param(param_name, param_value)
    mlflow.sklearn.log_model(
        pipe,
        name="COPD_Classifier",
        input_example=X_train.head(1),
    )