In [60]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.cluster import KMeans
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score,accuracy_score,precision_score, recall_score, f1_score
from sklearn.metrics import silhouette_score, davies_bouldin_score, calinski_harabasz_score

from scipy.stats import shapiro
import statsmodels.api as sm



In [3]:
data = pd.read_csv("../data/1553768847-housing.csv")

In [5]:
df=data

In [12]:
# Display basic information about the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  int64  
 3   total_rooms         20640 non-null  int64  
 4   total_bedrooms      20640 non-null  float64
 5   population          20640 non-null  int64  
 6   households          20640 non-null  int64  
 7   median_income       20640 non-null  float64
 8   ocean_proximity     20640 non-null  object 
 9   median_house_value  20640 non-null  int64  
dtypes: float64(4), int64(5), object(1)
memory usage: 1.6+ MB


In [13]:
df.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,ocean_proximity,median_house_value
0,-122.23,37.88,41,880,129.0,322,126,8.3252,NEAR BAY,452600
1,-122.22,37.86,21,7099,1106.0,2401,1138,8.3014,NEAR BAY,358500
2,-122.24,37.85,52,1467,190.0,496,177,7.2574,NEAR BAY,352100
3,-122.25,37.85,52,1274,235.0,558,219,5.6431,NEAR BAY,341300
4,-122.25,37.85,52,1627,280.0,565,259,3.8462,NEAR BAY,342200


In [8]:
# Check for missing values
print(df.isnull().sum())

longitude               0
latitude                0
housing_median_age      0
total_rooms             0
total_bedrooms        207
population              0
households              0
median_income           0
ocean_proximity         0
median_house_value      0
dtype: int64


In [None]:
# Handle missing valuesin the total_bedrooms column by filling with median
df.fillna({"total_bedrooms": df["total_bedrooms"].median()}, inplace=True)


In [84]:
# Re-Check for missing values
print(df.isnull().sum())

longitude                     0
latitude                      0
housing_median_age            0
total_rooms                   0
total_bedrooms                0
population                    0
households                    0
median_income                 0
median_house_value            0
ocean_proximity_INLAND        0
ocean_proximity_ISLAND        0
ocean_proximity_NEAR BAY      0
ocean_proximity_NEAR OCEAN    0
Cluster                       0
dtype: int64


In [14]:
# Convert categorical  column Ocean Proximity into numerical using one-hot encoding to ensure models perform optimally.
df = pd.get_dummies(df, columns=["ocean_proximity"], drop_first=True)

In [22]:
# Define features and target variable
X = df.drop(columns=["median_house_value"])
y = df["median_house_value"]

In [23]:

#Normalize numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [24]:
# Split into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [25]:
# Check shapes of training and test sets
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((16512, 12), (4128, 12), (16512,), (4128,))

### 1. Logistic Regression Model ###

In [27]:

# Convert median_house_value into binary classification target
# Class 1: High value (above median), Class 0: Low value (below median)
median_value = np.median(y)
y_class = (y > median_value).astype(int)

In [29]:
# Split classification target into training and testing sets
y_train_class, y_test_class = train_test_split(y_class, test_size=0.2, random_state=42)

In [30]:
# Train Logistic Regression model
log_reg = LogisticRegression(max_iter=1000, random_state=42)
log_reg.fit(X_train, y_train_class)


### Evaluating Logistic Regression Model Performance

In [55]:
def evaluate_classification(model_name, y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    
    print(f"🔹 {model_name} Performance:")
    print(f"   - Accuracy: {accuracy:.4f}")
    print(f"   - Precision: {precision:.4f}")
    print(f"   - Recall: {recall:.4f}")
    print(f"   - F1 Score: {f1:.4f}")
    print("-" * 40)


In [None]:
# Predicting Logistic Regression
y_pred_class = log_reg.predict(X_test)
log_reg_accuracy = accuracy_score(y_test_class, y_pred_class)

In [None]:
# Evaluate Logistic Regression
evaluate_classification("Logistic Regression", y_test_class, y_pred_class)

🔹 Logistic Regression Performance:
   - Accuracy: 0.8333
   - Precision: 0.8385
   - Recall: 0.8230
   - F1 Score: 0.8307
----------------------------------------


### 2. K-MEANS CLUSTERING ###

In [65]:

# Determine the optimal number of clusters using the Elbow Method
inertia = []
k_values = range(2, 11)


for k in k_values:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(X_scaled)
    inertia.append(kmeans.inertia_)

In [66]:
# Train K-Means with optimal clusters (e.g., k=4)
optimal_k = 4
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
df["Cluster"] = kmeans.fit_predict(X_scaled)

In [67]:
# Display cluster distribution
print("K-Means Cluster Distribution:\n", df["Cluster"].value_counts())

K-Means Cluster Distribution:
 Cluster
1    10430
3     6470
0     2209
2     1531
Name: count, dtype: int64


### Evaluating K Means Clustering Performace 

In [68]:
def evaluate_kmeans(model_name, X, labels):
    """Evaluates K-Means Clustering performance using key metrics."""
    silhouette = silhouette_score(X, labels)  # Measures how well clusters are separated
    davies_bouldin = davies_bouldin_score(X, labels)  # Lower is better
    calinski_harabasz = calinski_harabasz_score(X, labels)  # Higher is better
    
    print(f"🔹 {model_name} Performance:")
    print(f"   - Silhouette Score: {silhouette:.4f} (Higher is better)")
    print(f"   - Davies-Bouldin Index: {davies_bouldin:.4f} (Lower is better)")
    print(f"   - Calinski-Harabasz Index: {calinski_harabasz:.4f} (Higher is better)")
    print("-" * 40)

In [69]:
evaluate_kmeans("K-Means Clustering", X_scaled, df["Cluster"])


🔹 K-Means Clustering Performance:
   - Silhouette Score: 0.2828 (Higher is better)
   - Davies-Bouldin Index: 1.3159 (Lower is better)
   - Calinski-Harabasz Index: 4859.7751 (Higher is better)
----------------------------------------



### 3. DECISION TREE REGRESSION

In [71]:
# Train Decision Tree Regressor
dt_reg = DecisionTreeRegressor(random_state=42)
dt_reg.fit(X_train, y_train)

In [82]:
# Predicting Decision Tree
y_pred_dt = dt_reg.predict(X_test)
dt_rmse = np.sqrt(mean_squared_error(y_test, y_pred_dt))


### 4. RANDOM FOREST REGRESSION ###

In [74]:
# Train Random Forest Regressor
rf_reg = RandomForestRegressor(n_estimators=100, random_state=42)
rf_reg.fit(X_train, y_train)

In [83]:

# Predicting  Random Forest
y_pred_rf = rf_reg.predict(X_test)
rf_rmse = np.sqrt(mean_squared_error(y_test, y_pred_rf))



### 5. SUPPORT VECTOR REGRESSION (SVR) ###

In [77]:
# Train Support Vector Regressor with RBF Kernel
svr_reg = SVR(kernel="rbf")
svr_reg.fit(X_train, y_train)

In [None]:
# Predicting SVR model
y_pred_svr = svr_reg.predict(X_test)
svr_rmse = np.sqrt(mean_squared_error(y_test, y_pred_svr))


### Evaluating Model Performance

In [79]:
def evaluate_model(model_name, y_true, y_pred):
    """Evaluates a regression model and prints key metrics."""
    mae = mean_absolute_error(y_true, y_pred)  # Mean Absolute Error
    mse = mean_squared_error(y_true, y_pred)   # Mean Squared Error
    rmse = np.sqrt(mse)                        # Root Mean Squared Error
    r2 = r2_score(y_true, y_pred)              # R-squared (coefficient of determination)
    
    print(f"🔹 {model_name} Performance:")
    print(f"   - MAE:  {mae:.4f}")
    print(f"   - MSE:  {mse:.4f}")
    print(f"   - RMSE: {rmse:.4f}")
    print(f"   - R² Score: {r2:.4f}")
    print("-" * 40)

In [80]:
# Decision Tree Evaluation
evaluate_model("Decision Tree Regression", y_test, y_pred_dt)

# Random Forest Evaluation
evaluate_model("Random Forest Regression", y_test, y_pred_rf)

# Support Vector Regression (SVR) Evaluation
evaluate_model("Support Vector Regression", y_test, y_pred_svr)

🔹 Decision Tree Regression Performance:
   - MAE:  44338.0247
   - MSE:  4890905995.7766
   - RMSE: 69935.0127
   - R² Score: 0.6268
----------------------------------------
🔹 Random Forest Regression Performance:
   - MAE:  31609.5208
   - MSE:  2409093745.1586
   - RMSE: 49082.5198
   - R² Score: 0.8162
----------------------------------------
🔹 Support Vector Regression Performance:
   - MAE:  86961.4326
   - MSE:  13656001636.2463
   - RMSE: 116858.8963
   - R² Score: -0.0421
----------------------------------------
