🎓 Capstone Project 1 — California Housing Prediction

1. Project Overview
1.1 Overview and Problem Statement

## Import the required packages

In [None]:
import pickle

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error

from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense


: 

## Load Data

In [None]:
# Read the raw housing table, then preview its first rows and its (rows, columns) shape.
df = pd.read_csv('housing.csv')
for preview in (df.head(), df.shape):
    print(preview)


## 2. Data Preparation and Exploratory Data Analysis (EDA)

In [None]:
# 2. Data Preparation and Exploratory Data Analysis (EDA)

# Show the columns
print(df.columns)

# ocean_proximity is a categorical variable
to_categorical = ['ocean_proximity']

# housing_median_age, total_rooms, households and population are cast to integers
to_integer = ['housing_median_age', 'total_rooms', 'households', 'population']

# Apply every dtype conversion in a single pass.
dtype_map = {
    **dict.fromkeys(to_categorical, 'category'),
    **dict.fromkeys(to_integer, 'int64'),
}
df = df.astype(dtype_map)

# Columns used by the histogram and correlation plots below.
numerical_features = ['longitude', 'latitude', 'housing_median_age', 'total_rooms', 'total_bedrooms', 'population', 'households', 'median_income']

# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() would emit a stray "None" line after the report.
df.info()
print(df.describe())


In [None]:
# Plot the histogram of the numerical features (in 2 columns).
# The grid height is derived from the number of features instead of being
# hard-coded, so adding or removing a feature cannot index past the grid.
n_cols = 2
n_rows = -(-len(numerical_features) // n_cols)  # ceiling division
fig, axes = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(20, 5 * n_rows), squeeze=False)
flat_axes = axes.ravel()

for ax, feature in zip(flat_axes, numerical_features):
    # Draw and label on the explicit Axes only; pyplot-level plt.title/xlabel/ylabel
    # would target the "current" axes (the last subplot) rather than this one.
    sns.histplot(df[feature], bins=30, kde=False, ax=ax)
    ax.set_title(f'Histogram of {feature}')
    ax.set_xlabel(feature)
    ax.set_ylabel('Frequency')

# Hide any unused panel when the feature count is odd.
for ax in flat_axes[len(numerical_features):]:
    ax.set_visible(False)

fig.tight_layout()
plt.show()

# Plot the scatter plot of longitude and latitude, coloured by the target.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x='longitude', y='latitude', data=df, hue='median_house_value', palette='viridis', ax=ax)
ax.set_title('Scatter Plot of Longitude and Latitude with Median House Value')
ax.set_xlabel('Longitude')
ax.set_ylabel('Latitude')
plt.show()

# Plot heatmap of the correlation matrix of the numerical features
# on its own explicitly sized figure.
corr_matrix = df[numerical_features].corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', ax=ax)
ax.set_title('Correlation Matrix of Numerical Features')
plt.show()



## 3. Data Preprocessing

In [None]:
# Split the data into training (60%), validation (20%) and testing (20%) sets:
# 20% is held out for testing, then 25% of the remaining 80% becomes validation.
df_all_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_all_train, test_size=0.25, random_state=42)

target = 'median_house_value'

y_train = df_train[target].values
y_val = df_val[target].values
y_test = df_test[target].values
y_all_train = df_all_train[target].values

df_train = df_train.drop(columns=target)
df_val = df_val.drop(columns=target)
df_test = df_test.drop(columns=target)
# Feature-only view of train+validation; df_all_train itself is left untouched.
df_all_train_features = df_all_train.drop(columns=target)

# Fill missing numeric values with medians learned from the training split only,
# so no statistic from validation/test data leaks into preprocessing.
# NOTE(review): assumes some numeric columns may contain NaN (the estimators used
# later reject NaN input) — confirm against df.info(); this is a no-op otherwise.
train_medians = df_train.median(numeric_only=True)
df_train = df_train.fillna(train_medians)
df_val = df_val.fillna(train_medians)
df_test = df_test.fillna(train_medians)
df_all_train_features = df_all_train_features.fillna(train_medians)

dv = DictVectorizer(sparse=True)

# Fit the vectorizer exactly once, on the training split. Every other split is
# only transformed, so all matrices share the same columns. Re-fitting on
# df_all_train (which still holds the target) would change the feature space
# after X_train was built and leak the target into the features.
train_dict = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

all_train_dict = df_all_train_features.to_dict(orient='records')
X_all_train = dv.transform(all_train_dict)

val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

test_dict = df_test.to_dict(orient='records')
X_test = dv.transform(test_dict)

print(dv.get_feature_names_out())


## 4. Model Training and Evaluation

In [None]:
# 4. Model Training and Evaluation


def tune_alpha(estimator_cls, alphas, label, X_fit, y_fit, X_eval, y_eval):
    """Fit one regularised model per alpha and keep the best one.

    Parameters
    ----------
    estimator_cls : estimator class accepting an ``alpha`` keyword (e.g. Lasso, Ridge).
    alphas : iterable of regularisation strengths to try.
    label : name used in the printed progress lines.
    X_fit, y_fit : data the candidates are fitted on.
    X_eval, y_eval : data the candidates are scored on.

    Returns
    -------
    tuple ``(best_alpha, best_model, best_predictions, best_mse)`` for the
    candidate with the lowest mean squared error on ``X_eval``.

    Raises
    ------
    ValueError if ``alphas`` is empty.
    """
    best = None
    for alpha in alphas:
        candidate = estimator_cls(alpha=alpha)
        candidate.fit(X_fit, y_fit)
        predictions = candidate.predict(X_eval)
        mse = mean_squared_error(y_eval, predictions)
        print(f"{label} Model with alpha={alpha} has MSE: {mse}")
        if best is None or mse < best[3]:
            best = (alpha, candidate, predictions, mse)
    if best is None:
        raise ValueError("alphas must contain at least one value")
    return best


# Train the model with Linear Regression (Baseline Model)
baseline_model = LinearRegression()
baseline_model.fit(X_train, y_train)
baseline_predictions = baseline_model.predict(X_val)
baseline_mse = mean_squared_error(y_val, baseline_predictions)
print(f"Baseline Model has MSE: {baseline_mse}")

# Train the model with Lasso Regression with hyperparameters tuning.
# lasso_model / lasso_predictions / lasso_mse refer to the best alpha, not
# merely the last one tried, because later cells plot and test these names.
lasso_alpha = [0.001, 0.01, 0.1, 1, 10]
best_lasso_alpha, lasso_model, lasso_predictions, lasso_mse = tune_alpha(
    Lasso, lasso_alpha, "Lasso", X_train, y_train, X_val, y_val
)
print(f"Best Lasso alpha={best_lasso_alpha} with validation MSE: {lasso_mse}")

# Train the model with Ridge Regression with hyperparameters tuning
ridge_alpha = [0.001, 0.01, 0.1, 1, 10]
best_ridge_alpha, ridge_model, ridge_predictions, ridge_mse = tune_alpha(
    Ridge, ridge_alpha, "Ridge", X_train, y_train, X_val, y_val
)
print(f"Best Ridge alpha={best_ridge_alpha} with validation MSE: {ridge_mse}")


# Define the Neural Network (Keras)
# NOTE(review): the network is only built and compiled here; no fit() call is
# made in this cell, so `model` holds untrained weights. TODO: train it
# (a dense copy of X_train may be required) before evaluating or exporting it.
model = Sequential()
model.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mean_squared_error')

## Visualize the Results

In [None]:
# Plot the validation-set predictions of each linear model against the actual values.
fig, ax = plt.subplots(figsize=(10, 6))

# Semi-transparent markers keep the three overlapping point clouds readable.
ax.scatter(y_val, baseline_predictions, label='Baseline Model', alpha=0.4)
ax.scatter(y_val, lasso_predictions, label='Lasso Model', alpha=0.4)
ax.scatter(y_val, ridge_predictions, label='Ridge Model', alpha=0.4)

# Identity line: points on it are perfectly predicted.
value_range = [y_val.min(), y_val.max()]
ax.plot(value_range, value_range, color='black', linestyle='--', label='Perfect prediction')

ax.set_xlabel('Actual Values')
ax.set_ylabel('Predicted Values')
ax.set_title('Predicted vs Actual Values (validation set)')
# The labels passed to scatter() are only rendered once legend() is called.
ax.legend()
plt.show()

In [None]:
# Score each fitted linear model on the held-out test split.
baseline_predictions_test, lasso_predictions_test, ridge_predictions_test = [
    estimator.predict(X_test)
    for estimator in (baseline_model, lasso_model, ridge_model)
]

# Mean squared error of every model's test-set predictions.
baseline_mse_test, lasso_mse_test, ridge_mse_test = [
    mean_squared_error(y_test, predicted)
    for predicted in (
        baseline_predictions_test,
        lasso_predictions_test,
        ridge_predictions_test,
    )
]

# Report the three test-set errors.
print(f"Baseline Model has MSE: {baseline_mse_test}")
print(f"Lasso Model has MSE: {lasso_mse_test}")
print(f"Ridge Model has MSE: {ridge_mse_test}")

## Model Export and Scripting

In [None]:
# Use pickle to export the fitted vectorizer together with the best linear model.
#
# The original cell pickled `model`, the Keras network, which is compiled but
# never fitted in this notebook, and it did not save the DictVectorizer needed
# to encode new records at inference time.
# NOTE(review): model.pkl now holds a (dv, final_model) tuple instead of a bare
# model — confirm any loading script unpacks it as `dv, model = pickle.load(f)`.
candidates = {
    'baseline': (baseline_model, baseline_mse),
    'lasso': (lasso_model, lasso_mse),
    'ridge': (ridge_model, ridge_mse),
}

# Keep the candidate with the lowest validation MSE.
best_name = min(candidates, key=lambda name: candidates[name][1])
final_model = candidates[best_name][0]

with open('model.pkl', 'wb') as f:
    pickle.dump((dv, final_model), f)

print(f"Exported the {best_name} model and its DictVectorizer to model.pkl")