In [30]:
# Cell 1: Import libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt


%matplotlib inline


In [31]:
# Cell 2: Load and preprocess dataset
data = pd.read_csv("../data/cleaned_california_housing.csv")
data["Target_Class"] = pd.qcut(data["MedHouseVal"], q=4, labels=["Low", "Medium", "High", "Very High"])

X = data.drop(columns=["MedHouseVal", "Target_Class"])
y = data["Target_Class"]

print("Features (X) shape:", X.shape)
print("Target (y) shape:", y.shape)


Features (X) shape: (19794, 8)
Target (y) shape: (19794,)


In [32]:
# Cell 3: Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
print("Training set size:", X_train.shape)
print("Testing set size:", X_test.shape)


Training set size: (15835, 8)
Testing set size: (3959, 8)


In [33]:
# Cell 4: K-Nearest Neighbors
knn_model = KNeighborsClassifier()
param_grid_knn = {"n_neighbors": range(3, 11)}
grid_search_knn = GridSearchCV(knn_model, param_grid_knn, scoring='accuracy', cv=3, n_jobs=-1, verbose=1)
grid_search_knn.fit(X_train, y_train)

best_knn_model = grid_search_knn.best_estimator_
y_pred_knn = best_knn_model.predict(X_test)

print("KNN Classification Report:")
print(classification_report(y_test, y_pred_knn))


Fitting 3 folds for each of 8 candidates, totalling 24 fits
KNN Classification Report:
              precision    recall  f1-score   support

        High       0.27      0.42      0.33       987
         Low       0.40      0.41      0.40       991
      Medium       0.28      0.25      0.27       991
   Very High       0.47      0.26      0.34       990

    accuracy                           0.33      3959
   macro avg       0.36      0.33      0.33      3959
weighted avg       0.36      0.33      0.33      3959



In [34]:
# Cell 5: Decision Tree Classifier
dt_model = DecisionTreeClassifier(random_state=42)
param_grid_dt = {"max_depth": [None, 5, 10], "min_samples_split": [2, 5]}
grid_search_dt = GridSearchCV(dt_model, param_grid_dt, scoring='accuracy', cv=3, n_jobs=-1, verbose=1)
grid_search_dt.fit(X_train, y_train)

best_dt_model = grid_search_dt.best_estimator_
y_pred_dt = best_dt_model.predict(X_test)

print("Decision Tree Classification Report:")
print(classification_report(y_test, y_pred_dt))


Fitting 3 folds for each of 6 candidates, totalling 18 fits


Decision Tree Classification Report:
              precision    recall  f1-score   support

        High       0.56      0.66      0.61       987
         Low       0.79      0.79      0.79       991
      Medium       0.62      0.60      0.61       991
   Very High       0.80      0.69      0.75       990

    accuracy                           0.69      3959
   macro avg       0.70      0.69      0.69      3959
weighted avg       0.70      0.69      0.69      3959



In [35]:
# Cell 6: Save classification results
classification_results = pd.DataFrame({
    "Actual": y_test,
    "KNN_Predictions": y_pred_knn,
    "DT_Predictions": y_pred_dt
})
classification_results.to_csv("../results/classification/classification_predictions.csv", index=False)

metrics = pd.DataFrame({
    "Model": ["KNN", "Decision Tree"],
    "Accuracy": [accuracy_score(y_test, y_pred_knn), accuracy_score(y_test, y_pred_dt)]
})
metrics.to_csv("../results/classification/classification_metrics.csv", index=False)

print("Classification results and metrics saved.")


Classification results and metrics saved.
