## Import Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, accuracy_score, balanced_accuracy_score, average_precision_score, recall_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder

## Split Train and Testing Set

In [2]:
X, y = load_breast_cancer(return_X_y=(True))
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33)

In [3]:
print("Size of train set: {}".format(len(y_train)))
print("Size of test set: {}".format(len(y_test)))
print("Unique classes: {}".format(len(set(y_test))))

Size of train set: 381
Size of test set: 188
Unique classes: 2


## Training Simple Classifiers

In [4]:
classifier_gini = DecisionTreeClassifier()
classifier_igain = DecisionTreeClassifier(criterion = "entropy")

classifier_gini.fit(X_train, y_train)
classifier_igain.fit(X_train, y_train)

prediction_gini = classifier_gini.predict(X_test)
prediction_igain = classifier_igain.predict(X_test)

f_measure_gini = f1_score(y_test, prediction_gini)
f_measure_igain = f1_score(y_test, prediction_igain)

In [5]:
print("F-Measure Gini: {}".format(f_measure_gini))
print("F-Measure Information Gain: {}".format(f_measure_igain))

F-Measure Gini: 0.9641434262948207
F-Measure Information Gain: 0.952


### Training Different Classifiers based on max_depth

In [6]:
depth = classifier_gini.get_depth()
fscores = {}
fscores['train'] = np.zeros(depth)
fscores['test'] = np.zeros(depth) 

for i in range(depth):
    classifier_gini = DecisionTreeClassifier(max_depth = i+1)
    classifier_gini.fit(X_train, y_train)
    prediction_gini = classifier_gini.predict(X_test)
    prediction_gini_train = classifier_gini.predict(X_train)
    fscores['test'][i] = round(100*f1_score(y_test, prediction_gini),2)
    fscores['train'][i] = round(100*f1_score(y_train, prediction_gini_train),2)

In [7]:
print("Fscores Train: {}".format(fscores['train']))
print("Fscores Test:  {}".format(fscores['test']))

Fscores Train: [ 93.94  96.52  97.6   98.71  99.35 100.  ]
Fscores Test:  [92.5  92.56 92.5  96.41 93.88 96.39]


## Setting Up Pipelines

In [8]:
data = pd.read_csv("income_train_trees.csv", header = 0)
train_set = data.iloc[:, :-1].values
y_train = data.iloc[:, -1].values

# any other code you need
data_test = pd.read_csv("income_test_trees.csv", header = 0)
test_set = data_test.iloc[:, :-1].values
y_test = data_test.iloc[:, -1].values

### Imputer and One-hot Encoder for Preprocessing + Decision Tree Classifier

In [9]:
categorical_cols = [1,3,5,6,7,8,9]
imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent') # Handle Missing Values
ct = ColumnTransformer(transformers=[('encoder', OneHotEncoder(sparse_output = False), [1,3,5,6,7,8,9])], remainder='passthrough') # Handle Categorical Values
classifier = DecisionTreeClassifier()    
clf = Pipeline([
    ('imp', imp),
    ('ct', ct),
    ('classifier', classifier),
])
clf.fit(train_set, y_train)
y_predict =  clf.predict(test_set)

In [10]:
print("Model score Accuracy: %.3f" % accuracy_score(y_test, y_predict))
print("Model score F1 Weighted: %.3f" % f1_score(y_test, y_predict,average='weighted'))

Model score Accuracy: 0.806
Model score F1 Weighted: 0.807


### Grid Search for Optimal Parameters

In [11]:
param_grid = {
    "classifier__criterion": ["entropy", "gini"],
    "classifier__max_depth": list(range(1,depth+1)),
}

grid_search = GridSearchCV(clf, param_grid)
grid_search.fit(train_set, y_train)
y_predict = grid_search.predict(test_set)

print("Best params:")
print(grid_search.best_params_)

  _data = np.array(data, dtype=dtype, copy=copy,


Best params:
{'classifier__criterion': 'gini', 'classifier__max_depth': 6}


In [12]:
print("Model score Accuracy: %.3f" % accuracy_score(y_test,y_predict))
print("Model score F1 Weighted: %.3f" % f1_score(y_test,y_predict,average='weighted'))

Model score Accuracy: 0.852
Model score F1 Weighted: 0.841


## Pipeline for Custom Data

In [13]:
columns = ['age','fnlwgt','education_num','hours-per-week',"capital-loss","capital-gain","income"]
data = pd.read_csv('income_train_trees.csv',usecols=columns)
data_test = pd.read_csv('income_test_trees.csv',usecols=columns)
# Convert target variable to 0 and 1
data["income"] = data["income"].map({ "<=50K": 0, ">50K": 1 })
data_test["income"] = data_test["income"].map({ "<=50K": 0, ">50K": 1 })
# Create X and y
X_train = data.drop(["income"],axis=1)
y_train = data['income'].values
X_test = data_test.drop(["income"],axis=1)
y_test = data_test['income'].values
# Classifier
classifier = DecisionTreeClassifier(min_samples_leaf=4)
classifier.fit(X_train,y_train)
y_predict = classifier.predict(X_test)
accuracy = accuracy_score(y_test,y_predict)
print("Model score accuracy: %.3f" % accuracy)

Model score accuracy: 0.791


In [14]:
y_predict = classifier.predict(X_test)

avg_precision = average_precision_score(y_test, y_predict)
balanced_acc = balanced_accuracy_score(y_test, y_predict)
recall = recall_score(y_test, y_predict)

In [15]:
print("Model Average Precision: %.3f" % avg_precision)
print("Model Balanced Accuracy Score: %.3f" % balanced_acc)
print("Model Recall: %.3f" % recall)

Model Average Precision: 0.414
Model Balanced Accuracy Score: 0.688
Model Recall: 0.485


In [18]:
# Imbalanced Dataset - Optimize Params
depth = {"max_depth": range(1,11)}
skf = StratifiedKFold(n_splits=10, shuffle = True)
classifier = DecisionTreeClassifier(max_depth = 7)
clf = GridSearchCV(estimator=classifier, param_grid=depth, cv=skf.split(X_train, y_train))
clf.fit(X_train, y_train)
pred = clf.predict(X_test)

accuracy = accuracy_score(y_test,pred)
avg_precision = average_precision_score(y_test, pred)
balanced_acc = balanced_accuracy_score(y_test, pred)
recall = recall_score(y_test, pred)

print(clf.best_params_) # Best Depth

{'max_depth': 9}


In [19]:
print("Model score accuracy: %.3f" % accuracy)
print("Model Average Precision: %.3f" % avg_precision)
print("Model Balanced Accuracy Score: %.3f" % balanced_acc)
print("Model Recall: %.3f" % recall)

Model score accuracy: 0.824
Model Average Precision: 0.462
Model Balanced Accuracy Score: 0.689
Model Recall: 0.421


Despite the optimized parameters, the model still struggles due to the class imbalance