In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error


In [2]:
file_path = "robust_student_loan_dataset.csv" 
data = pd.read_csv(file_path)

In [3]:
data = data.dropna()  # Dropping rows with missing values

# 2. Remove duplicates
data = data.drop_duplicates()

In [7]:
data['Debt-to-Income Ratio'] = data['Monthly Installment (USD)'] / data['Monthly Income (USD)']
data['Loan-to-Income Ratio'] = data['Loan Amount (USD)'] / data['Monthly Income (USD)']

In [8]:
X = data[['Loan Amount (USD)', 'Monthly Income (USD)', 'Monthly Expenses (USD)', 
          'Interest Rate (%)', 'Disposable Income (USD)', 'Monthly Installment (USD)', 
          'Debt-to-Income Ratio', 'Loan-to-Income Ratio']]
y = data['Time to Repay (Months)'] 

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [10]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# =========================
# KNN Regressor with Hyperparameter Tuning
# =========================
knn = KNeighborsRegressor()

# Define parameter grid for GridSearchCV
param_grid = {
    'n_neighbors': [3, 5, 7, 10],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

# GridSearchCV to find the best parameters
grid_search = GridSearchCV(knn, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train_scaled, y_train)

# Best parameters
best_params = grid_search.best_params_
print(f"Best Parameters: {best_params}")

# Train the best model
best_knn = grid_search.best_estimator_

# =========================
# Model Evaluation
# =========================
y_pred = best_knn.predict(X_test_scaled)

# Calculate RMSE
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")

Best Parameters: {'metric': 'manhattan', 'n_neighbors': 10, 'weights': 'uniform'}
Mean Squared Error (MSE): 146324.8245755271
Root Mean Squared Error (RMSE): 382.52427972029056


In [14]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(best_knn, X_test_scaled, y, cv=5, scoring='neg_mean_squared_error')
print(f"Cross-validated RMSE: {(-scores.mean()) ** 0.5}")


ValueError: Found input variables with inconsistent numbers of samples: [129, 642]