In [10]:
# Suppress every FutureWarning for the remainder of the session.
import warnings

warnings.simplefilter('ignore', FutureWarning)

In [27]:
import numpy as np
import pandas as pd 

from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RepeatedStratifiedKFold
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report, ConfusionMatrixDisplay, accuracy_score, precision_score, recall_score, f1_score

In [28]:
# Read the raw dataset from disk into a DataFrame.
csv_path = "./dataset/tbpred.csv"
df = pd.read_csv(csv_path)

In [29]:
# Map each distinct 'Country of Birth' value to an integer code and store the
# codes in a new 'encoded_country' column (the original column is kept).
le = preprocessing.LabelEncoder()
country_codes = le.fit_transform(df['Country of Birth'])
df['encoded_country'] = country_codes

In [30]:
# Expand 'Gender' into indicator columns, then cast the male/female
# indicators to integers.
df_encoded = pd.get_dummies(df, columns=['Gender'])
for dummy_col in ('Gender_male', 'Gender_female'):
    df_encoded[dummy_col] = df_encoded[dummy_col].astype(int)

In [31]:
# Drop the identifier column and the raw country text (its encoded form stays).
X = df_encoded.drop(columns=['Name', 'Country of Birth'])

In [32]:
def metrics_calculator(y_test, y_pred, model_name):
    '''
    Summarise a model's predictions as a one-column DataFrame.

    y_test     : true labels.
    y_pred     : predicted labels, compared against y_test.
    model_name : label used as the name of the single result column.

    Returns a DataFrame indexed by 'Accuracy', 'Precision', 'Recall' and
    'F1-score'. Precision, recall and F1 are macro-averaged.
    '''
    # Insertion order of this mapping fixes the row order of the result.
    scores = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, average='macro'),
        'Recall': recall_score(y_test, y_pred, average='macro'),
        'F1-score': f1_score(y_test, y_pred, average='macro'),
    }
    return pd.DataFrame(data=list(scores.values()),
                        index=list(scores.keys()),
                        columns=[model_name])

In [None]:
# Separate the feature columns from the 'tb_pred' target column.
Y1 = X.loc[:, 'tb_pred']
X1 = X.drop(columns='tb_pred')

In [None]:
# Rescale every feature with MinMaxScaler and rebuild a DataFrame so the
# original column names and row labels are preserved.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test-set minima/maxima influence the transform -- consider fitting on the
# training partition only.
scaler = MinMaxScaler()
X_scaled = pd.DataFrame(
    scaler.fit_transform(X1),
    columns=X1.columns,
    index=X1.index,
)

In [None]:
# Hold out 15% of the rows for testing. The seed is fixed so that a fresh
# "Restart & Run All" reproduces the same partition (and the same scores).
# NOTE(review): the other split in this notebook stratifies on the target;
# consider stratify=Y1 here as well if the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, Y1, test_size=0.15, random_state=0
)

In [26]:
# Fit a seeded decision tree on the scaled training partition.
dt_classifier = DecisionTreeClassifier(random_state=20)
dt_classifier.fit(X=X_train, y=y_train)

NameError: name 'predictions5' is not defined

In [None]:
# Predict on both partitions first, then report the two accuracies.
prediction_dt = dt_classifier.predict(X_train)    # training-set predictions
predictions_dt = dt_classifier.predict(X_test)    # test-set predictions

train_accuracy_dt = accuracy_score(y_train, prediction_dt)
accuracy_dt = accuracy_score(y_test, predictions_dt)

print(f"Train Accuracy: {train_accuracy_dt}")
print(f"Test Accuracy: {accuracy_dt}")

In [None]:
# Features as a DataFrame; target as a NumPy column vector of shape (n_rows, 1).
X3 = X.drop(columns='tb_pred')
Y3 = X[['tb_pred']].to_numpy()

In [None]:
# Report the dimensions of the feature matrix and the target array.
features_shape, target_shape = X3.shape, Y3.shape
print('X:', features_shape, '\nY:', target_shape)

In [None]:
# Split X and y into training (75%) and testing (25%) sets.
# Stratify on the target array Y3 so both partitions keep its class
# proportions; the seed is fixed so the partition is reproducible.
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X3, Y3, stratify=Y3, test_size=0.25, random_state=0
)

In [None]:
# Baseline model: a shallow tree capped at depth 3.
# random_state is fixed because scikit-learn permutes candidate features at
# each split, so an unseeded tree can differ between runs when splits tie.
DT_model = DecisionTreeClassifier(max_depth=3, random_state=0)

# Fit the model to the training set
DT_model.fit(X_train2, y_train2)

In [None]:
# Predict on the held-out split with the baseline tree; y_pred3 is reused by
# the reporting cells that follow.
y_pred3 = DT_model.predict(X_test2)

print('Model accuracy score: {0:0.4f}'. format(accuracy_score(y_test2, y_pred3)))

In [None]:
# Per-class precision / recall / F1 for the baseline tree.
base_report = classification_report(y_test2, y_pred3)
print(base_report)

In [None]:
# Collect the baseline tree's headline metrics into a one-column table.
BaseDT_result = metrics_calculator(
    y_test=y_test2,
    y_pred=y_pred3,
    model_name='Base Decision Tree',
)
BaseDT_result

In [None]:
# Finding optimal hyperparameters (GridSearchCV)

# Base estimator; seeded so repeated runs of the search give the same trees.
model = DecisionTreeClassifier(random_state=0)

# Evaluation scheme: 10-fold stratified CV, repeated 3 times with a fixed seed.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=0)

# Search space: tree depths 2..18 crossed with both split criteria.
max_depth = range(2, 19)
criterion = ['entropy', 'gini']
param_grid = {'max_depth': max_depth, 'criterion': criterion}

# Define search
search = GridSearchCV(model, param_grid, scoring='accuracy', n_jobs=-1, cv=cv)

# Execute search. The result is bound to its own name: assigning it to
# `GridSearchCV` would replace the imported class with a fitted instance and
# make this cell fail the next time it is run.
grid_result = search.fit(X_train2, y_train2)

# Set the clf to the best combination of parameters
DT_modelcv = grid_result.best_estimator_

# Summarize result
print('Best Score: %s' % grid_result.best_score_)
print('Best Hyperparameters: %s' % grid_result.best_params_)

In [None]:
# Fit the tuned tree on the full training partition.
# NOTE(review): with GridSearchCV's default refit=True the best estimator is
# already fitted on this data, so this call looks redundant -- confirm.
DT_modelcv.fit(X=X_train2, y=y_train2)

In [None]:
# Held-out predictions from the tuned tree.
y_pred4 = DT_modelcv.predict(X=X_test2)

In [None]:
# Test-set accuracy of the tuned tree, printed to four decimals.
tuned_accuracy = accuracy_score(y_test2, y_pred4)
print('Model accuracy score: {0:0.4f}'.format(tuned_accuracy))

In [None]:
# Per-class precision / recall / F1 for the tuned tree.
tuned_report = classification_report(y_test2, y_pred4)
print(tuned_report)

In [None]:
# Collect the tuned tree's headline metrics into a one-column table.
TunedDT_result = metrics_calculator(
    y_test=y_test2,
    y_pred=y_pred4,
    model_name='Tuned Decision Tree',
)
TunedDT_result