In [14]:
import pandas as pd
import os

def read_data_from_csv(path):
    """Load datasets from CSV files.
    Args:
        path (str): Path to the CSV file.
    Returns:
        X (np.ndarray): Features of samples.
        y (np.ndarray): Labels of samples, only provided in the public datasets.
    """
    assert os.path.exists(path), f'File not found: {path}!'
    assert os.path.splitext(path)[
        -1] == '.csv', f'Unsupported file type {os.path.splitext(path)[-1]}!'

    data = pd.read_csv(path)
    column_list = data.columns.values.tolist()

    if 'Label' in column_list:
        # for the public dataset, label column is provided.
        column_list.remove('Label')
        X = data[column_list].values
        y = data['Label'].astype('int').values
        return X, y
    else:
        # for the private dataset, label column is not provided.
        X = data[column_list].values
        return X


X_public, y_public = read_data_from_csv('assignment_5_public.csv')
print('Shape of X_public:', X_public.shape)  # n_sample, m_feature (30000, 58)
print('Shape of y_public:', y_public.shape)  # n_sample (30000,)

'''
CODE HERE!
'''

X_private = read_data_from_csv('assignment_5_private.csv')
print('Shape of X_private:', X_private.shape)  # k_sample, m_feature (5000, 58)

import numpy as np

# remove and make your own predictions.
preds = np.full(len(X_private), -1,
                dtype=int)
'''
CODE HERE!
e.g.,
preds = np.full(len(X_private), -1, dtype=int)
'''

submission = pd.DataFrame({'Label': preds})
submission.to_csv('assignment_5.csv', index=True, index_label='Id')

Shape of X_public: (30000, 58)
Shape of y_public: (30000,)
Shape of X_private: (5000, 58)


In [19]:
# Step 6: Implement feature scaling
from sklearn.preprocessing import MinMaxScaler

# Exclude the output column from feature scaling
columns_to_scale = [58]

# Initialize the scaler
scaler = MinMaxScaler()

# Fit and transform the data
X_public[columns_to_scale] = scaler.fit_transform(X_public[columns_to_scale])

# Display the modified dataset after feature scaling
print(X_public)

[[8.00000000e+00 2.06000000e+02 6.37305696e-01 ... 8.00000000e-01
  5.00000000e-01 8.00000000e-01]
 [8.00000000e+00 2.10000000e+02 6.66666663e-01 ... 0.00000000e+00
  5.00000000e-01 0.00000000e+00]
 [9.00000000e+00 2.64000000e+02 5.66265058e-01 ... 0.00000000e+00
  5.00000000e-01 0.00000000e+00]
 ...
 [9.00000000e+00 3.25000000e+02 5.89506171e-01 ... 0.00000000e+00
  5.00000000e-01 0.00000000e+00]
 [1.10000000e+01 1.52800000e+03 4.25333333e-01 ... 0.00000000e+00
  3.75000000e-01 0.00000000e+00]
 [1.00000000e+01 1.27000000e+02 6.98412693e-01 ... 3.50000000e-01
  5.00000000e-02 3.50000000e-01]]


In [21]:
import warnings
warnings.filterwarnings("ignore",category=FutureWarning,module="sklearn")

from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split


# Step 8: Split the dataset into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X_public, y_public, test_size=0.2, random_state=42)

# Step 9: Define the parameter grid
param_grid = {
    'n_neighbors': [3, 5, 7, 10],
    'metric': ['euclidean', 'manhattan']
}

# Step 10: Initialize the K-Nearest Neighbors classifier
knn = KNeighborsClassifier()

# Step 11: Perform grid search to find the best parameters
grid_search = GridSearchCV(knn, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Step 12: Make predictions on the test data using the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# Step 13: Evaluate the accuracy of the best model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy (Best Model): {:.2f}%".format(accuracy * 100))

# Step 14: Print the best parameters found by grid search
print("Best Parameters:", grid_search.best_params_)

Accuracy (Best Model): 57.58%
Best Parameters: {'metric': 'manhattan', 'n_neighbors': 7}


In [22]:
pred=grid_search.predict(X_private)

In [23]:
pred

array([0, 0, 1, ..., 0, 1, 0])