In [1]:
import os
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold, cross_val_score

def read_data_from_csv(path):
  """Read a CSV file and return its contents as NumPy arrays.

  Args:
    path: Path to an existing file whose extension is exactly '.csv'.

  Returns:
    (X, y) when the file has a 'Label' column: X holds every other column
    in file order and y holds the 'Label' column cast to int.
    X alone when there is no 'Label' column: all columns of the file.

  Raises:
    AssertionError: if the file does not exist or is not a '.csv' file.
  """
  assert os.path.exists(path), f'File not found: {path}!'
  extension = os.path.splitext(path)[-1]
  assert extension == '.csv', f'Unsupported file type {extension}!'

  frame = pd.read_csv(path)
  all_columns = frame.columns.values.tolist()

  # Unlabelled (private) dataset: every column is a feature.
  if 'Label' not in all_columns:
    return frame[all_columns].values

  # Labelled (public) dataset: separate the target from the features.
  feature_columns = [name for name in all_columns if name != 'Label']
  features = frame[feature_columns].values
  labels = frame['Label'].astype('int').values
  return features, labels

# Load the features and labels from the public (labelled) dataset.
X_public, y_public = read_data_from_csv('assignment_3_public.csv')

# Rescale every feature to the [0, 1] range. The model below is trained on
# these scaled values, so any data passed to it later must go through this
# same fitted `scaler` (transform only, never re-fit).
scaler = MinMaxScaler()
normalized_X_public = scaler.fit_transform(X_public)

# Hold-out split used during development (kept for reference only):
# X_train, X_test, y_train, y_test = train_test_split(normalized_X_public, y_public, test_size=0.2, random_state=42)

# Hyper-parameter grid explored during development (kept for reference only):
# param_grid = {
#     'criterion': ['gini', 'entropy'],
#     'splitter': ['best'],
#     'max_depth': [4, 5, 6, 7, 8, 9],
#     'min_samples_leaf': [4, 5, 6, 7, 8, 9],
#     'min_samples_split': [4, 5, 6, 7, 8, 9]
# }
#
# grid_search = GridSearchCV(estimator=DecisionTreeClassifier(), param_grid=param_grid, cv=8)

# The parameters below were chosen after comparing the grid-search results.
# NOTE(review): min_samples_leaf=3 is not part of the grid shown above;
# presumably it came from a different search run - confirm.

# The final model is trained on the whole public dataset.
X_train = normalized_X_public
y_train = y_public

tree_clf = DecisionTreeClassifier(
    criterion='gini',
    max_depth=5,
    min_samples_leaf=3,
    min_samples_split=5,
    splitter='best',
    # scikit-learn randomly permutes the features at every split even with
    # splitter='best', so without a fixed seed the CV scores and the fitted
    # tree can change from run to run.
    random_state=42,
)

# 8-fold cross-validation (contiguous, unshuffled folds) to estimate accuracy.
k_folds = KFold(n_splits=8)

scores = cross_val_score(tree_clf, X_train, y_train, cv=k_folds)
print("Cross Validation Scores: ", scores)
print("Average CV Score: ", scores.mean())

# cross_val_score fits clones of the estimator, so the model itself still
# has to be fitted on the full training data before it can predict.
tree_clf.fit(X_train, y_train)






Cross Validation Scores:  [0.86486486 0.86486486 0.78378378 0.89189189 0.83783784 0.7972973
 0.80821918 0.8630137 ]
Average CV Score:  0.8389716771566087


In [6]:
# Use the fitted decision tree to predict labels for the private dataset.
X_private = read_data_from_csv('assignment_3_private.csv')

# The tree was trained on MinMax-scaled features, so its split thresholds only
# make sense on the same scale. Apply the scaler fitted on the public data
# (transform only) before predicting; raw features would be routed through
# the tree incorrectly.
normalized_X_private = scaler.transform(X_private)
preds = tree_clf.predict(normalized_X_private)

# Optional sanity check of the predicted class balance:
# count_0 = (preds == 0).sum()
# count_1 = (preds == 1).sum()
# print("Count of '0':", count_0)
# print("Count of '1':", count_1)

# Write the submission file: the row index becomes the 'Id' column.
submission = pd.DataFrame({'Label': preds})
submission.to_csv('assignment_3.csv', index=True, index_label='Id')

Count of '0': 38
Count of '1': 62
