<a href="https://colab.research.google.com/github/nicolai5965/Galaxy-Redshift-Prediction-using-K-Nearest-Neighbors/blob/main/K_Nearest_neighbors_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
from google.colab import drive

# Mount Google Drive to load the dataset
drive.mount('/content/drive') 

Mounted at /content/drive


In [2]:
# Define the file path and load the train and test data
filepath = '/content/drive/My Drive/Colab Notebooks/Machine Learning/K_Nearest_Neighbors/'
train_data = pd.read_csv(f"{filepath}galaxies_train.csv")
test_data = pd.read_csv(f"{filepath}galaxies_test.csv")

In [3]:
# Extract the features and target variable from the train and test data
X_train = train_data.iloc[:, 1:].values
y_train = train_data.iloc[:, 0].values
X_test = test_data.iloc[:, 1:].values
y_test = test_data.iloc[:, 0].values
print("Number of training instances: %i" % X_train.shape[0])
print("Number of test instances: %i" % X_test.shape[0])
print("Number of attributes: %i" % X_train.shape[1])

Number of training instances: 500
Number of test instances: 500
Number of attributes: 10


In [4]:
# Display the first few rows and summary statistics of the train data
display(train_data.head())

Unnamed: 0,redshift,norm(psf_u),norm(psf_g),norm(psf_r),norm(psf_i),norm(psf_z),norm(model_u),norm(model_g),norm(model_r),norm(model_i),norm(model_z)
0,0.398406,0.376969,0.490428,0.518303,0.486947,0.558504,0.389864,0.404095,0.433127,0.372722,0.466342
1,0.978559,0.45393,0.54779,0.555669,0.530337,0.624752,0.452256,0.455277,0.46843,0.412959,0.527218
2,1.444914,0.425427,0.533386,0.549183,0.500099,0.586683,0.428505,0.442912,0.461761,0.3875,0.493657
3,1.935515,0.472868,0.589674,0.611404,0.574513,0.661662,0.46768,0.489993,0.51515,0.445684,0.559775
4,0.859419,0.472581,0.556951,0.574008,0.554864,0.643174,0.465837,0.462316,0.483696,0.430512,0.543073


In [5]:
display(train_data.describe())

Unnamed: 0,redshift,norm(psf_u),norm(psf_g),norm(psf_r),norm(psf_i),norm(psf_z),norm(model_u),norm(model_g),norm(model_r),norm(model_i),norm(model_z)
count,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0,500.0
mean,1.530397,0.508189,0.585233,0.598743,0.56542,0.657623,0.49207,0.482041,0.4988,0.433464,0.549286
std,1.015298,0.113784,0.066347,0.055955,0.062742,0.062334,0.093213,0.055912,0.049474,0.051918,0.05881
min,0.053544,0.1606,0.323741,0.36081,0.277867,0.389103,0.218967,0.266641,0.288442,0.212495,0.296608
25%,0.857138,0.443429,0.54773,0.566612,0.532446,0.623552,0.441367,0.450879,0.470741,0.409644,0.519541
50%,1.448329,0.482858,0.576151,0.593819,0.561914,0.654375,0.471931,0.474597,0.497387,0.432526,0.549716
75%,1.953585,0.538532,0.621724,0.637215,0.60507,0.699897,0.513634,0.514286,0.534556,0.467385,0.589345
max,6.786635,0.937156,0.925228,0.886462,0.926476,0.931657,0.862775,0.740183,0.686946,0.722382,0.790126


In [6]:
# KNearestNeighbors class

class KNearestNeighbors:
    def __init__(self, n_neighbors=3, metric='euclidean', p=None, weighted=False):
        self.n_neighbors = n_neighbors
        self.metric = metric
        self.p = p
        self.weighted = weighted

    @staticmethod
    def euclidean_distance(a, b):
        return np.sqrt(np.sum((a - b)**2))

    @staticmethod
    def manhattan_distance(a, b):
        return np.sum(np.abs(a - b))

    @staticmethod
    def minkowski_distance(a, b, p):
        return np.power(np.sum(np.power(np.abs(a - b), p)), 1/p)

    def fit(self, X_train, y_train):
        self.X_train = X_train
        self.y_train = y_train

    def predict_instance(self, X_test_instance):
        if self.metric == 'euclidean':
            distance_func = self.euclidean_distance
        elif self.metric == 'manhattan':
            distance_func = self.manhattan_distance
        elif self.metric == 'minkowski':
            if self.p is None:
                raise ValueError("Parameter 'p' must be provided for Minkowski distance.")
            distance_func = lambda a, b: self.minkowski_distance(a, b, self.p)
        else:
            raise ValueError("Invalid distance metric. Choose from 'euclidean', 'manhattan', or 'minkowski'.")

        distances = np.array([distance_func(X_test_instance, T) for T in self.X_train])
        sorted_indices = np.argsort(distances)
        nearest_indices = sorted_indices[:self.n_neighbors]
        nearest_labels = self.y_train[nearest_indices]

        if self.weighted:
            nearest_distances = distances[nearest_indices]
            weights = 1 / (nearest_distances + 1e-10)  # Adding a small constant to avoid division by zero
            predicted_label = np.sum(nearest_labels * weights) / np.sum(weights)
        else:
            predicted_label = np.mean(nearest_labels)

        return predicted_label

    def predict(self, X_test):
        return np.array([self.predict_instance(x) for x in X_test])

# Define the evaluation metric: Root Mean Squared Error (RMSE)
def RMSE(t_test, predictions):
    return np.sqrt(((predictions - y_test) ** 2).mean())

In [7]:
# Find the best value of k based on the RMSE for the validation set
def find_best_k(X_train, y_train, X_val, y_val, min_k=1, max_k=20, metric='euclidean', p=None, weighted=False):
    best_k = min_k
    best_rmse = float('inf')

    for k in range(min_k, max_k + 1):
        knn = KNearestNeighbors(n_neighbors=k, metric=metric, p=p, weighted=weighted)
        knn.fit(X_train, y_train)
        predictions = knn.predict(X_val)
        rmse = RMSE(y_val, predictions)

        if rmse < best_rmse:
            best_rmse = rmse
            best_k = k

    return best_k, best_rmse

# Split the dataset into train, validation, and test sets
def train_test_split(X, y, test_size=0.5, random_state=None):
    np.random.seed(random_state)
    indices = np.arange(len(X))
    np.random.shuffle(indices)

    test_count = int(test_size * len(X))
    X_test = X[indices[:test_count]]
    y_test = y[indices[:test_count]]
    X_val = X[indices[test_count:]]
    y_val = y[indices[test_count:]]

    return X_test, y_test, X_val, y_val

# Split the test_data into test and validation sets
X_test, y_test, X_val, y_val = train_test_split(X_test, y_test, test_size=0.5)


In [8]:
# Find the best k and RMSE for each distance metric
best_k_euclidean, best_rmse_euclidean = find_best_k(X_train, y_train, X_val, y_val, metric='euclidean', weighted=True)
best_k_manhattan, best_rmse_manhattan = find_best_k(X_train, y_train, X_val, y_val, metric='manhattan', weighted=True)
best_k_minkowski, best_rmse_minkowski = find_best_k(X_train, y_train, X_val, y_val, metric='minkowski', p=3, weighted=True)

print("Euclidean Distance:")
print(f"Best k: {best_k_euclidean}, RMSE: {best_rmse_euclidean}")

print("\nManhattan Distance:")
print(f"Best k: {best_k_manhattan}, RMSE: {best_rmse_manhattan}")

print("\nMinkowski Distance:")
print(f"Best k: {best_k_minkowski}, RMSE: {best_rmse_minkowski}")

# Train the models with the best k values for each distance metric and make predictions on the test set
knn_best_euclidean = KNearestNeighbors(n_neighbors=best_k_euclidean, metric='euclidean', weighted=True)
knn_best_euclidean.fit(X_train, y_train)
predictions_best_k_euclidean = knn_best_euclidean.predict(X_test)

knn_best_manhattan = KNearestNeighbors(n_neighbors=best_k_manhattan, metric='manhattan', weighted=True)
knn_best_manhattan.fit(X_train, y_train)
predictions_best_k_manhattan = knn_best_manhattan.predict(X_test)

knn_best_minkowski = KNearestNeighbors(n_neighbors=best_k_minkowski, metric='minkowski', p=3, weighted=True)
knn_best_minkowski.fit(X_train, y_train)
predictions_best_k_minkowski = knn_best_minkowski.predict(X_test)

# Calculate RMSE for the best k values for each distance metric
rmse_best_k_euclidean = RMSE(y_test, predictions_best_k_euclidean)
rmse_best_k_manhattan = RMSE(y_test, predictions_best_k_manhattan)
rmse_best_k_minkowski = RMSE(y_test, predictions_best_k_minkowski)

print("\nTest Set RMSE:")
print("RMSE (best k, Euclidean):", rmse_best_k_euclidean)
print("RMSE (best k, Manhattan):", rmse_best_k_manhattan)
print("RMSE (best k, Minkowski):", rmse_best_k_minkowski)


Euclidean Distance:
Best k: 20, RMSE: 1.2201003580123162

Manhattan Distance:
Best k: 20, RMSE: 1.2080812967424353

Minkowski Distance:
Best k: 20, RMSE: 1.2127105819801542

Test Set RMSE:
RMSE (best k, Euclidean): 0.7593575522746242
RMSE (best k, Manhattan): 0.7732395074157699
RMSE (best k, Minkowski): 0.7602897964166686


In [9]:
def MAE(y_true, y_pred): # Mean Absolute Error
    return np.mean(np.abs(y_true - y_pred))


def r_squared(y_true, y_pred): # Coefficient of Determination
    y_mean = np.mean(y_true)
    ss_total = np.sum((y_true - y_mean) ** 2)
    ss_residual = np.sum((y_true - y_pred) ** 2)
    return 1 - (ss_residual / ss_total)


def MSLE(y_true, y_pred): # Mean Squared Logarithmic Error
    return np.mean((np.log1p(y_true) - np.log1p(y_pred)) ** 2)


def MAPE(y_true, y_pred): # Mean Absolute Percentage Error
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [10]:
# Calculate MAE, R-squared, MSLE, and MAPE for the best k values for each distance metric
mae_best_k_euclidean = MAE(y_test, predictions_best_k_euclidean)
mae_best_k_manhattan = MAE(y_test, predictions_best_k_manhattan)
mae_best_k_minkowski = MAE(y_test, predictions_best_k_minkowski)

r_squared_best_k_euclidean = r_squared(y_test, predictions_best_k_euclidean)
r_squared_best_k_manhattan = r_squared(y_test, predictions_best_k_manhattan)
r_squared_best_k_minkowski = r_squared(y_test, predictions_best_k_minkowski)

msle_best_k_euclidean = MSLE(y_test, predictions_best_k_euclidean)
msle_best_k_manhattan = MSLE(y_test, predictions_best_k_manhattan)
msle_best_k_minkowski = MSLE(y_test, predictions_best_k_minkowski)

mape_best_k_euclidean = MAPE(y_test, predictions_best_k_euclidean)
mape_best_k_manhattan = MAPE(y_test, predictions_best_k_manhattan)
mape_best_k_minkowski = MAPE(y_test, predictions_best_k_minkowski)

# Print the results
print("Euclidean Distance:")
print(f"MAE: {mae_best_k_euclidean}, R-squared: {r_squared_best_k_euclidean}, MSLE: {msle_best_k_euclidean}, MAPE: {mape_best_k_euclidean}")

print("\nManhattan Distance:")
print(f"MAE: {mae_best_k_manhattan}, R-squared: {r_squared_best_k_manhattan}, MSLE: {msle_best_k_manhattan}, MAPE: {mape_best_k_manhattan}")

print("\nMinkowski Distance:")
print(f"MAE: {mae_best_k_minkowski}, R-squared: {r_squared_best_k_minkowski}, MSLE: {msle_best_k_minkowski}, MAPE: {mape_best_k_minkowski}")


Euclidean Distance:
MAE: 0.5658499976933106, R-squared: 0.4701101432119601, MSLE: 0.09392645135105897, MAPE: 86.3890280812257

Manhattan Distance:
MAE: 0.579521141864003, R-squared: 0.45055902517493696, MSLE: 0.09524701249024914, MAPE: 85.94177254698589

Minkowski Distance:
MAE: 0.5635348939490178, R-squared: 0.46880827970651906, MSLE: 0.09361466248889026, MAPE: 85.64657490005575
