In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import io
import boto3

In [None]:
# S3 location of the dataset. Only the processed file is read in this cell;
# file_name (the raw export) is defined here but not used below.
bucket_name = "lneg-loka"
file_name = "patient_data_raw.csv"
processed_file_name = "patient_data_processed.csv"

# Fetch the processed CSV object and decode its body as UTF-8 text.
s3 = boto3.client('s3')
obj = s3.get_object(Key=processed_file_name, Bucket=bucket_name)
csv_string = str(obj['Body'].read(), 'utf-8')

# Parse the in-memory text into a DataFrame.
data = pd.read_csv(io.StringIO(csv_string))


In [None]:
# Show the frame as loaded from S3 (the notebook renders a cell's last expression).
data

In [None]:
# Remove the 'Unnamed: 0' column (presumably a leftover index column written
# when the CSV was saved — confirm against the preprocessing step).
# errors='ignore' keeps this cell idempotent: re-running it after the column
# has already been dropped is a no-op instead of raising KeyError.
data = data.drop(columns='Unnamed: 0', errors='ignore')

In [None]:
# Re-display the frame to confirm the 'Unnamed: 0' column is gone.
data

In [None]:
# Label column to predict; every other column of `data` is used as a feature.
target_name = 'chronic_obstructive_pulmonary_disease'
y = data[target_name]
X = data.drop(columns=[target_name])

In [None]:
# Feature matrix: all columns of `data` except the target column.
X

In [None]:
# Target series: the 'chronic_obstructive_pulmonary_disease' column.
y

In [None]:
# Hold out 10% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.1)

In [None]:
# Entropy-criterion decision tree limited to depth 5, seeded for repeatable fits.
model = DecisionTreeClassifier(random_state=42, max_depth=5, criterion='entropy')
# fit() stays the last expression so the notebook displays the fitted estimator.
model.fit(X_train, y_train)

In [None]:
# Depth the fitted tree actually reached (the constructor caps it at max_depth=5).
model.get_depth()

In [None]:
# Number of leaf nodes in the fitted tree.
model.get_n_leaves()

In [None]:
# Predict labels for the held-out test rows (10% of the data, see the split above).
y_pred = model.predict(X_test)

In [None]:
# Overall accuracy on the held-out test split.
# NOTE(review): accuracy is only a fair summary when the classes are roughly
# balanced, which the author assumes here — nothing in this notebook checks it;
# confirm with y.value_counts() before relying on this number alone.
print("Accuracy:", accuracy_score(y_test, y_pred))


In [None]:
# Per-class metrics (scikit-learn's text classification report) on the test split.
print("Classification Report:\n", classification_report(y_test, y_pred))

In [None]:
# Draw the fitted tree on a wide canvas so node labels stay legible.
plt.figure(figsize=(20, 10))
tree.plot_tree(
    model,
    # Pass a plain list: some scikit-learn releases validate feature_names as
    # a list and reject a pandas Index.
    feature_names=list(X.columns),
    # True asks plot_tree for symbolic class labels rather than explicit names.
    class_names=True,
    filled=True,
)
plt.show()

In [None]:
import joblib
import os

In [None]:
# Directory one level above the notebook's working directory, built from
# os.pardir so it resolves on any OS (a hard-coded backslash path only works
# on Windows and relies on invalid string escapes).
model_dir = os.path.join(os.pardir, 'models')
model_name = 'decision_tree_classifier.joblib'

# Create the target directory if needed so the dump does not fail on a fresh checkout.
os.makedirs(model_dir, exist_ok=True)

# Persist the fitted model; dump() stays the last expression so the notebook
# shows the written file path(s).
joblib.dump(model, os.path.join(model_dir, model_name))

In [None]:
# Round-trip check: reload the artifact that was just written and display it
# to confirm it deserializes into an estimator.
# NOTE: joblib.load is pickle-based and can execute arbitrary code while
# loading; only load model files from trusted sources.
model_copy = joblib.load(os.path.join(model_dir,model_name))
model_copy