In [1]:
# ============================================================
# HOUSE PRICE PREDICTION USING LINEAR REGRESSION
# Dataset: California Housing (housing.csv)
# ============================================================

# -----------------------------
# STEP 1: IMPORT LIBRARIES
# -----------------------------
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


# -----------------------------
# STEP 2: LOAD DATASET
# -----------------------------
# Read the housing data from "housing.csv" (path is relative to the
# current working directory).
df = pd.read_csv("housing.csv")

# Display first rows
print(df.head())


# -----------------------------
# STEP 3: DATA EXPLORATION
# -----------------------------
print("\nDataset Shape:", df.shape)
print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() would emit a stray "None" line afterwards.
df.info()
print("\nStatistical Summary:")
print(df.describe())
# Per-column count of missing entries; used to decide what needs imputing.
print("\nMissing Values:")
print(df.isnull().sum())


# -----------------------------
# STEP 4: HANDLE MISSING VALUES
# -----------------------------
# Decide which rows are train and which are test *before* computing any
# statistics, so the imputation means come from the training rows only.
# Computing the means over the full dataset would leak information about
# the test rows into the training features.
# train_test_split shuffles based on the number of samples and
# random_state, so splitting row positions here with the same settings
# picks the same rows as splitting X and y directly.
train_rows, test_rows = train_test_split(
    np.arange(len(df)),
    test_size=0.2,
    random_state=42
)

# Fill missing numeric values with the mean of the training rows
df.fillna(df.iloc[train_rows].mean(numeric_only=True), inplace=True)


# -----------------------------
# STEP 5: HANDLE CATEGORICAL DATA
# -----------------------------
# Convert 'ocean_proximity' to numeric using one-hot encoding.
# drop_first=True omits one category column to avoid a redundant
# (perfectly collinear) indicator.
df = pd.get_dummies(df, columns=['ocean_proximity'], drop_first=True)


# -----------------------------
# STEP 6: SPLIT FEATURES & TARGET
# -----------------------------
X = df.drop('median_house_value', axis=1)  # Input features
y = df['median_house_value']               # Target variable


# -----------------------------
# STEP 7: TRAIN-TEST SPLIT
# -----------------------------
# Apply the row assignment chosen in STEP 4 (80% train / 20% test).
X_train, X_test = X.iloc[train_rows], X.iloc[test_rows]
y_train, y_test = y.iloc[train_rows], y.iloc[test_rows]


# -----------------------------
# STEP 8: FEATURE SCALING
# -----------------------------
scaler = StandardScaler()

# Fit the scaler on the training features only, then reuse the same
# scaling parameters for the test features.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


# -----------------------------
# STEP 9: TRAIN LINEAR REGRESSION MODEL
# -----------------------------
# fit() returns the estimator itself, so creation and training can chain.
model = LinearRegression().fit(X_train, y_train)


# -----------------------------
# STEP 10: MAKE PREDICTIONS
# -----------------------------
# Predict the target for the held-out test features.
y_pred = model.predict(X_test)


# -----------------------------
# STEP 11: EVALUATE MODEL
# -----------------------------
# Score the predictions against the true test targets with both metrics.
mse, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, r2_score)
)

print("\nMean Squared Error (MSE):", mse)
print("R2 Score:", r2)


# -----------------------------
# STEP 12: ACTUAL vs PREDICTED
# -----------------------------
# Side-by-side table of true and predicted values, indexed like y_test.
results = pd.DataFrame({'Actual Price': y_test}).assign(
    **{'Predicted Price': y_pred}
)

print("\nSample Predictions:")
print(results.head())


   longitude  latitude  housing_median_age  total_rooms  total_bedrooms  \
0    -122.23     37.88                41.0        880.0           129.0   
1    -122.22     37.86                21.0       7099.0          1106.0   
2    -122.24     37.85                52.0       1467.0           190.0   
3    -122.25     37.85                52.0       1274.0           235.0   
4    -122.25     37.85                52.0       1627.0           280.0   

   population  households  median_income  median_house_value ocean_proximity  
0       322.0       126.0         8.3252            452600.0        NEAR BAY  
1      2401.0      1138.0         8.3014            358500.0        NEAR BAY  
2       496.0       177.0         7.2574            352100.0        NEAR BAY  
3       558.0       219.0         5.6431            341300.0        NEAR BAY  
4       565.0       259.0         3.8462            342200.0        NEAR BAY  

Dataset Shape: (20640, 10)

Dataset Info:
<class 'pandas.core.frame.DataFr