In [19]:
import os
import pandas as pd
from sklearn import datasets
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import accuracy_score

In [18]:
def read_data_from_csv(path):
    """Load a dataset from a CSV file, splitting off the label column if present.

    Args:
        path (str): Path to the CSV file. The extension is matched
            case-insensitively, so ``data.CSV`` is accepted as well as
            ``data.csv``.
    Returns:
        (X, y) or X:
            When the file contains a ``Label`` column (public dataset), a
            tuple ``(X, y)`` where ``X`` (np.ndarray) holds every other column
            and ``y`` (np.ndarray) holds the labels cast to int.
            When there is no ``Label`` column (private dataset), only ``X``
            (np.ndarray) with all columns is returned.
    Raises:
        AssertionError: If ``path`` does not exist or does not have a
            ``.csv`` extension.
    """
    assert os.path.exists(path), f'File not found: {path}!'
    extension = os.path.splitext(path)[-1]
    # Compare case-insensitively: a file named 'DATA.CSV' is still a CSV file.
    assert extension.lower() == '.csv', f'Unsupported file type {extension}!'

    data = pd.read_csv(path)

    if 'Label' in data.columns:
        # Public dataset: the label column is provided, so separate it
        # from the feature columns.
        feature_columns = [name for name in data.columns if name != 'Label']
        X = data[feature_columns].values
        y = data['Label'].astype('int').values
        return X, y

    # Private dataset: no label column, every column is a feature.
    return data.values


# Load the labelled (public) training data.
X_public, y_public = read_data_from_csv('assignment_3_public.csv')
print('Shape of X_public:', X_public.shape)  # n_sample, m_feature (590, 14)
print('Shape of y_public:', y_public.shape)  # n_sample (590,)

# Build the Regression tree.
# random_state is pinned so that Restart & Run All reproduces the same tree:
# scikit-learn permutes the features at every split even with
# splitter='best', so equally good splits can otherwise be resolved
# differently from run to run.
reg_tree = DecisionTreeRegressor(
    criterion='squared_error',
    splitter='best',
    max_depth=5,
    min_samples_split=10,
    random_state=0,
)

# Fit the model to the training data
reg_tree.fit(X_public, y_public)

# Load the unlabelled (private) data; no 'Label' column, so only X is returned.
X_private = read_data_from_csv('assignment_3_private.csv')
print('Shape of X_private:', X_private.shape)  # k_sample, m_feature (100, 14)

# Predict on the testing set
# NOTE(review): a regression tree predicts float values, while the loader
# casts 'Label' to int — if the task expects discrete class labels, these
# predictions need to be mapped to labels (or a classifier used). TODO confirm.
preds = reg_tree.predict(X_private)

# Write the submission file with an 'Id' index column and a 'Label' column.
submission = pd.DataFrame({'Label': preds})
submission.to_csv('assignment_3.csv', index=True, index_label='Id')

Shape of X_public: (590, 14)
Shape of y_public: (590,)
Shape of X_private: (100, 14)
