In [56]:
import os
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler

def read_data_from_csv(path):
  assert os.path.exists(path), f'File not found: {path}!'
  assert os.path.splitext(path)[-1] == '.csv', f'Unsupported file type {os.path.splitext(path)[-1]}!'
  data = pd.read_csv(path)
  column_list = data.columns.values.tolist()
  if 'Label' in column_list:
    # for the public dataset, label column is provided.
    column_list.remove('Label')
    X = data[column_list].values
    y = data['Label'].astype('int').values
    return X, y
  else:
    # for the private dataset, label column is not provided.
    X = data[column_list].values
    return X

# Load the Features and Labels from the public dataset
X_public, y_public = read_data_from_csv('A3_dataset/assignment_3_public.csv')

# Normalize the Features
scaler = MinMaxScaler()
normalized_X_public = scaler.fit_transform(X_public)

# Split the public dataset into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(normalized_X_public, y_public, test_size=0.2, random_state=42)


param_grid = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [3, 5, 7, 9],
    'min_samples_leaf': [1, 3, 5],
    'min_samples_split': [2, 4, 6]
}

grid_search = GridSearchCV(estimator=DecisionTreeClassifier(), param_grid=param_grid, cv=5)

X_train = normalized_X_public
y_train = y_public
grid_search.fit(X_train, y_train)

# Get the best parameters and model
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Use the pruned model to make predictions on the testing set
# predictions = best_model.predict(X_test)

# Calculate the accuracy
# accuracy = accuracy_score(y_test, predictions)
# print("Best parameters:", best_params)
# print("Accuracy:", accuracy)

X_private = read_data_from_csv('A3_dataset/assignment_3_private.csv')

preds = best_model.predict(X_private)

submission = pd.DataFrame({'Label': preds})
submission.to_csv('assignment_3.csv', index=True, index_label='Id')

In [59]:
grid_search.best_params_

{'criterion': 'entropy',
 'max_depth': 7,
 'min_samples_leaf': 5,
 'min_samples_split': 4,
 'splitter': 'random'}

In [58]:
# Print the cross-validation score of the best model
print("Best Cross-Validation Score:", grid_search.best_score_)

print("n_splits_:", grid_search.n_splits_)



print("Best Cross-Validation Score:", grid_search.best_score_)

Best Cross-Validation Score: 0.8661016949152541
n_splits_: 5
Best Cross-Validation Score: 0.8661016949152541


In [53]:
pd.DataFrame(grid_search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_min_samples_leaf,param_min_samples_split,param_splitter,params,...,split43_test_score,split44_test_score,split45_test_score,split46_test_score,split47_test_score,split48_test_score,split49_test_score,mean_test_score,std_test_score,rank_test_score
0,0.000924,0.000275,0.000200,0.000400,gini,3,1,2,best,"{'criterion': 'gini', 'max_depth': 3, 'min_sam...",...,0.813559,0.830508,0.830508,0.864407,0.813559,0.847458,0.838983,0.840678,0.029308,93
1,0.000432,0.000513,0.000158,0.000363,gini,3,1,2,random,"{'criterion': 'gini', 'max_depth': 3, 'min_sam...",...,0.830508,0.838983,0.889831,0.855932,0.855932,0.855932,0.872881,0.854068,0.032966,7
2,0.000959,0.000344,0.000242,0.000431,gini,3,1,4,best,"{'criterion': 'gini', 'max_depth': 3, 'min_sam...",...,0.813559,0.830508,0.830508,0.864407,0.813559,0.847458,0.838983,0.841186,0.029347,85
3,0.000340,0.000474,0.000230,0.000415,gini,3,1,4,random,"{'criterion': 'gini', 'max_depth': 3, 'min_sam...",...,0.830508,0.830508,0.872881,0.864407,0.881356,0.847458,0.822034,0.850847,0.030836,21
4,0.001045,0.000420,0.000260,0.000438,gini,3,1,6,best,"{'criterion': 'gini', 'max_depth': 3, 'min_sam...",...,0.813559,0.830508,0.830508,0.864407,0.813559,0.847458,0.838983,0.841695,0.028885,75
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139,0.000602,0.000492,0.000181,0.000386,entropy,9,5,2,random,"{'criterion': 'entropy', 'max_depth': 9, 'min_...",...,0.838983,0.847458,0.864407,0.855932,0.881356,0.813559,0.822034,0.846271,0.029698,50
140,0.001835,0.000417,0.000119,0.000322,entropy,9,5,4,best,"{'criterion': 'entropy', 'max_depth': 9, 'min_...",...,0.830508,0.872881,0.779661,0.906780,0.881356,0.754237,0.822034,0.838305,0.031383,106
141,0.000480,0.000500,0.000252,0.000431,entropy,9,5,4,random,"{'criterion': 'entropy', 'max_depth': 9, 'min_...",...,0.813559,0.847458,0.898305,0.864407,0.906780,0.855932,0.855932,0.851864,0.028875,16
142,0.001812,0.000681,0.000117,0.000318,entropy,9,5,6,best,"{'criterion': 'entropy', 'max_depth': 9, 'min_...",...,0.838983,0.864407,0.779661,0.906780,0.881356,0.771186,0.838983,0.837288,0.028513,110


In [57]:
result = pd.read_csv('assignment_3.csv', index_col=0)
# result.head()

label_counts = result['Label'].value_counts()

# Count the number of '0' and '1'
count_0 = (result['Label'] == 0).sum()
count_1 = (result['Label'] == 1).sum()

# Print the counts
print("Count of '0':", count_0)
print("Count of '1':", count_1)


Count of '0': 38
Count of '1': 62
