In [None]:
import pandas as pd
import preprocessing as pps
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Read the pre-cleaned dataset from disk.
df = pd.read_csv("data/cleaned_data.csv")

# Outlier handling: any row whose target exceeds the cut-off is reported and
# then discarded. The report is computed on the unfiltered frame.
outlier_min = 480
total_rows = len(df)
outlier_count = df['target'].gt(outlier_min).sum()
outlier_percentage = outlier_count / total_rows * 100
print(f"Number of outliers (>{outlier_min} min): {outlier_count}")
print(f"Percentage of outliers: {outlier_percentage:.2f}%")

# Keep only rows at or below the cut-off. The comparison is False for a
# missing target, so such rows are dropped here as well.
df = df.loc[df['target'].le(outlier_min)]

# Separate the prediction target from the feature matrix.
y = df['target']
X = df.drop(columns=['target'])

# Route every feature column to exactly one transformer.
# ColumnTransformer discards columns that are not listed (its default is
# remainder='drop'), so the dtype filters have to cover everything in X:
#   * 'number' matches every integer/float width, not only int64/float64;
#   * bool columns are handled as two-level categoricals ('number' does not
#     match bool, so the two lists cannot overlap).
categorical_cols = X.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()
numerical_cols = X.select_dtypes(include='number').columns.tolist()

# Surface anything that is still unrouted (e.g. datetime columns) instead of
# letting it disappear from the model silently.
assigned_cols = set(categorical_cols) | set(numerical_cols)
unassigned_cols = [col for col in X.columns if col not in assigned_cols]
if unassigned_cols:
    print(f"Warning: columns ignored by the preprocessor (unsupported dtype): {unassigned_cols}")

# Preprocessing: standardise numeric features (KNN is distance based, so
# features need comparable scales) and one-hot encode categorical ones.
# handle_unknown='ignore' keeps prediction from failing on categories that
# were not present when the encoder was fitted.
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numerical_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
])

# Full model: the column preprocessing followed by a k-nearest-neighbours
# regressor, wrapped in a single estimator so both steps are fitted together.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", KNeighborsRegressor()),
    ]
)

# Search space for RandomizedSearchCV. The "regressor__" prefix addresses
# parameters of the pipeline step named "regressor".
# n_neighbors covers 1, 6, ..., 96 (20 values); with the two weighting
# schemes that is 40 possible combinations.
param_distributions = {
    "regressor__n_neighbors": range(1, 101, 5),
    "regressor__weights": ["uniform", "distance"],
}

# Hold out a quarter of the rows for the final evaluation; the fixed seed
# keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

# Randomised hyper-parameter search: 20 sampled candidates, each scored with
# 5-fold cross-validation on the training portion using negated MAE.
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_distributions,
    n_iter=20,
    scoring='neg_mean_absolute_error',
    cv=5,
    n_jobs=-1,  # run the fits in parallel on all available cores
    random_state=42,
)
random_search.fit(X_train, y_train)

print(f"\nBest parameters: {random_search.best_params_}")
# The search was scored with 'neg_mean_absolute_error', so best_score_ is the
# negated mean CV MAE of the winning candidate; flip the sign for display.
print(f"Best CV MAE: {-random_search.best_score_:.4f}")

# Predictions and evaluation on the held-out test split.
# predict() uses the best candidate refitted on the whole training split
# (RandomizedSearchCV's default refit=True is not overridden).
y_test_pred = random_search.predict(X_test)
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
# Also report MAE: it is the metric the model was selected on, so it is the
# number that can be compared directly with the CV score above.
mae_test = np.mean(np.abs(y_test - y_test_pred))
print("RMSE on Test Data:", rmse_test)
print("MAE on Test Data:", mae_test)

# Plot: Actual vs Predicted
# Points on the dashed red diagonal are perfect predictions; the diagonal
# spans the range of the actual test values.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_test_pred, alpha=0.5)
actual_range = [y_test.min(), y_test.max()]
ax.plot(actual_range, actual_range, 'r--')
ax.set(
    xlabel='Actual Delay',
    ylabel='Predicted Delay',
    title='KNN Regression Actual vs Predicted Delay',
)
ax.grid(True)
plt.show()

# Plot: Residuals distribution
# Residual = actual - predicted, so positive values are under-predictions.
residuals = y_test - y_test_pred
fig, ax = plt.subplots(figsize=(8, 5))
sns.histplot(residuals, kde=True, bins=30, ax=ax)
ax.set(
    title='KNN Regression Distribution of Residuals (Prediction Errors)',
    xlabel='Prediction Error (Actual - Predicted)',
)
ax.grid(True)
plt.show()

# Plot: Residuals vs Predicted
# The dashed red line marks zero error.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test_pred, residuals, alpha=0.5)
ax.axhline(y=0, color='red', linestyle='--')
ax.set(
    xlabel='Regression Predicted Delay',
    ylabel='Residuals',
    title='KNN Regression - Residuals vs Predicted',
)
ax.grid(True)
plt.show()