In [82]:
import os
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold, cross_val_score

def read_data_from_csv(path):
  assert os.path.exists(path), f'File not found: {path}!'
  assert os.path.splitext(path)[-1] == '.csv', f'Unsupported file type {os.path.splitext(path)[-1]}!'
  data = pd.read_csv(path)
  column_list = data.columns.values.tolist()
  if 'Label' in column_list:
    # for the public dataset, label column is provided.
    column_list.remove('Label')
    X = data[column_list].values
    y = data['Label'].astype('int').values
    return X, y
  else:
    # for the private dataset, label column is not provided.
    X = data[column_list].values
    return X

# Load the Features and Labels from the public dataset
X_public, y_public = read_data_from_csv('A3_dataset/assignment_3_public.csv')

# Normalize the Features
scaler = MinMaxScaler()
normalized_X_public = scaler.fit_transform(X_public)

# Split the public dataset into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(normalized_X_public, y_public, test_size=0.2, random_state=42)


param_grid = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [3, 4, 5, 6, 7, 8, 9],
    'min_samples_leaf': [2, 3, 4, 5, 6, 7, 8, 9],
    'min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9]
}

grid_search = GridSearchCV(estimator=DecisionTreeClassifier(), param_grid=param_grid, cv=5)

X_train = normalized_X_public
y_train = y_public
grid_search.fit(X_train, y_train)

# Get the best parameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Use the pruned model to make predictions on the testing set
# predictions = best_model.predict(X_test)

# Calculate the accuracy
# accuracy = accuracy_score(y_test, predictions)
# print("Best parameters:", best_params)
# print("Accuracy:", accuracy)

# X_private = read_data_from_csv('A3_dataset/assignment_3_private.csv')

# preds = best_model.predict(X_private)

# submission = pd.DataFrame({'Label': preds})
# submission.to_csv('assignment_3.csv', index=True, index_label='Id')

In [83]:
grid_search.best_params_
# 1
# {'criterion': 'gini',
#  'max_depth': 5,
#  'min_samples_leaf': 4,
#  'min_samples_split': 7,
#  'splitter': 'random'}


# 2
# {'criterion': 'gini',
#  'max_depth': 5,
#  'min_samples_leaf': 3,
#  'min_samples_split': 5,
#  'splitter': 'random'}

# 3
# {'criterion': 'entropy',
#  'max_depth': 7,
#  'min_samples_leaf': 7,
#  'min_samples_split': 4,
#  'splitter': 'random'}

{'criterion': 'gini',
 'max_depth': 8,
 'min_samples_leaf': 8,
 'min_samples_split': 9,
 'splitter': 'random'}

In [136]:
def validatingModel(tree_cv: DecisionTreeClassifier):
    k_folds = KFold(n_splits = 5)

    scores = cross_val_score(tree_cv, X_train, y_train, cv = k_folds)
    print("Cross Validation Scores: ", scores)
    print("Average CV Score: ", scores.mean())

    tree_cv.fit(X_train, y_train)

    X_private = read_data_from_csv('A3_dataset/assignment_3_private.csv')
    preds = tree_cv.predict(X_private)

    # Count the number of '0' and '1'
    count_0 = (preds == 0).sum()
    count_1 = (preds == 1).sum()

    # Print the counts
    print("Count of '0':", count_0)
    print("Count of '1':", count_1)

# test different cases 

In [209]:
tree01_cv = DecisionTreeClassifier(
    criterion='gini',
    max_depth= 5,
    min_samples_leaf= 4,
    min_samples_split= 7,
    splitter= 'best'
)

validatingModel(tree01_cv)

Cross Validation Scores:  [0.8220339  0.81355932 0.87288136 0.79661017 0.8559322 ]
Average CV Score:  0.8322033898305085
Count of '0': 38
Count of '1': 62


In [228]:
tree02_cv = DecisionTreeClassifier(
    criterion='gini',
    max_depth= 5,
    min_samples_leaf= 3,
    min_samples_split= 5,
    splitter= 'best'
)

validatingModel(tree02_cv)

Cross Validation Scores:  [0.8220339  0.81355932 0.86440678 0.79661017 0.8559322 ]
Average CV Score:  0.8305084745762713
Count of '0': 38
Count of '1': 62


In [234]:
tree03_cv = DecisionTreeClassifier(
    criterion='entropy',
    max_depth= 7,
    min_samples_leaf= 7,
    min_samples_split= 4,
    splitter= 'best'
)

validatingModel(tree03_cv)


Cross Validation Scores:  [0.86440678 0.81355932 0.80508475 0.78813559 0.84745763]
Average CV Score:  0.8237288135593221
Count of '0': 37
Count of '1': 63
