## House Price Index Prediction

In [None]:
# Cell 1: Import necessary libraries and load the datasets
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import os
%matplotlib inline
import warnings
import tensorflow as tf
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Set up visualization settings
# NOTE: this silences every warning for the whole session (including
# deprecation notices from pandas / seaborn); narrow the filter when debugging.
warnings.filterwarnings("ignore")
sns.set(style="darkgrid", font_scale=1.5)
# Use the exact option key instead of a pattern that merely regex-matches it.
pd.set_option("display.max_columns", None)

# Load the datasets
# Paths are resolved against the current working directory, so the kernel must
# be started from the folder that contains "Group-7-Project-4".
current_dir = os.getcwd()
data_dir = "Group-7-Project-4/Data"
train_csv_path = os.path.join(current_dir, data_dir, "train.csv")
test_csv_path = os.path.join(current_dir, data_dir, "test.csv")

df_train = pd.read_csv(train_csv_path)
df_test = pd.read_csv(test_csv_path)


## Data Cleaning and Handling Missing Values

In [None]:
# Data Cleaning and Handling Missing Values
#
# Every fill below is written as an explicit assignment (df[col] = ...).
# Chained in-place calls such as df[col].fillna(..., inplace=True) act on a
# temporary object and silently do nothing once pandas Copy-on-Write is active.

# Keep the test Ids aside before the column is removed from both frames.
test_id = df_test["Id"]
df_train = df_train.drop(columns="Id")
df_test = df_test.drop(columns="Id")

# Columns dropped for being too sparse (the list is hard-coded; the original
# note cites "more than 45% missing values" -- the ratio is not computed here).
cols_to_drop = ["FireplaceQu", "Fence", "Alley", "MiscFeature", "PoolQC"]
df_train = df_train.drop(columns=cols_to_drop)
df_test = df_test.drop(columns=cols_to_drop)

# Separate the target variable "SalePrice" from the train dataset
target = df_train[["SalePrice"]].reset_index(drop=True)
df_train = df_train.drop(columns=["SalePrice"])

# Concatenate the train and test datasets so both receive identical cleanup.
# Train rows come first; the split at the end of the cell relies on that order.
df = pd.concat([df_train, df_test]).reset_index(drop=True)

# Garage-related features: numeric columns get 0, everything else "None".
garage_cols = ["GarageYrBlt", "GarageArea", "GarageCars", "GarageType", "GarageFinish", "GarageQual", "GarageCond"]
for col in garage_cols:
    fill_value = 0 if pd.api.types.is_numeric_dtype(df[col]) else "None"
    df[col] = df[col].fillna(fill_value)

# Basement-related categorical features: missing is encoded as "None".
basement_cols = ["BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2"]
df[basement_cols] = df[basement_cols].fillna("None")


def _fill_with_mode(values):
    """Fill NaNs with the most frequent value; leave an all-NaN series untouched."""
    modes = values.mode()
    if modes.empty:
        return values
    return values.fillna(modes.iloc[0])


# MSZoning: most frequent zoning within the same MSSubClass, then the global
# mode for any subclass that has no known zoning at all.
df["MSZoning"] = df.groupby("MSSubClass")["MSZoning"].transform(_fill_with_mode)
df["MSZoning"] = _fill_with_mode(df["MSZoning"])

# Other categorical features: global mode of each column.
cat_cols = ['Utilities', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Electrical', 'KitchenQual', 'Functional', 'SaleType']
for col in cat_cols:
    df[col] = _fill_with_mode(df[col])

# LotFrontage: median of the same Neighborhood.
df["LotFrontage"] = df.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))

# MasVnrArea, BsmtFinSF1 and BsmtFinSF2: median within the matching type group
# (the grouping columns were already filled above, so no row is left ungrouped).
df["MasVnrArea"] = df.groupby("MasVnrType")["MasVnrArea"].transform(lambda x: x.fillna(x.median()))
df["BsmtFinSF1"] = df.groupby("BsmtFinType1")["BsmtFinSF1"].transform(lambda x: x.fillna(x.median()))
df["BsmtFinSF2"] = df.groupby("BsmtFinType2")["BsmtFinSF2"].transform(lambda x: x.fillna(x.median()))

# BsmtFullBath and BsmtHalfBath: missing means no basement bathroom.
bath_cols = ["BsmtFullBath", "BsmtHalfBath"]
df[bath_cols] = df[bath_cols].fillna(0)

# TotalBsmtSF: only fill the gaps. Overwriting an existing column with
# BsmtFinSF1 + BsmtFinSF2 would discard the unfinished area for every row.
# If the data has no such column, it is created from the finished areas.
finished_bsmt_sf = df["BsmtFinSF1"] + df["BsmtFinSF2"]
if "TotalBsmtSF" in df.columns:
    df["TotalBsmtSF"] = df["TotalBsmtSF"].fillna(finished_bsmt_sf)
else:
    df["TotalBsmtSF"] = finished_bsmt_sf

# BsmtUnfSF: global median.
df["BsmtUnfSF"] = df["BsmtUnfSF"].fillna(df["BsmtUnfSF"].median())

# Check if there are any missing values left
print("Total Missing Values Left is:", df.isnull().sum().sum())

# Separate the train and test datasets after data cleanup
n_train_rows = len(df_train)
df_train_cleaned = df.iloc[:n_train_rows].copy()
df_test_cleaned = df.iloc[n_train_rows:].copy()

# Features and target must stay row-aligned for the plots and models below.
assert len(df_train_cleaned) == len(target)


## Data Analysis and Visualization

In [None]:
# Histogram (with KDE overlay) of the target variable "SalePrice"
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(target["SalePrice"], kde=True, color="blue", ax=ax)
ax.set_title("Distribution of SalePrice")
ax.set_xlabel("SalePrice")
ax.set_ylabel("Frequency")
plt.show()


In [None]:
# Log-transform the target "SalePrice" to reduce skewness, reporting the
# skewness before and after.
#
# The transform overwrites target["SalePrice"], so a marker in target.attrs
# records that it has been applied. Re-running this cell then neither applies
# log1p a second time nor reports the already-transformed skewness as "raw".
already_logged = target.attrs.get("SalePrice_log1p", False)

# Check the skewness of the untransformed target (recovered via expm1 on a re-run)
raw_sale_price = np.expm1(target["SalePrice"]) if already_logged else target["SalePrice"]
print("Skewness of SalePrice:", raw_sale_price.skew())

if not already_logged:
    target["SalePrice"] = np.log1p(target["SalePrice"])
    target.attrs["SalePrice_log1p"] = True

# Check the skewness of the log-transformed target variable "SalePrice"
print("Skewness of log-transformed SalePrice:", target["SalePrice"].skew())


In [None]:
# Box plot: sale price grouped by overall quality rating ('OverallQual')
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=df_train_cleaned["OverallQual"], y=target["SalePrice"], ax=ax)
ax.set_title("OverallQual vs. SalePrice")
plt.show()


In [None]:
# Scatter plot: sale price against above-ground living area ('GrLivArea')
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x=df_train_cleaned["GrLivArea"], y=target["SalePrice"], ax=ax)
ax.set_title("GrLivArea vs. SalePrice")
plt.show()


In [None]:
# Box plot: sale price grouped by number of rooms above ground ('TotRmsAbvGrd')
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(x=df_train_cleaned["TotRmsAbvGrd"], y=target["SalePrice"], ax=ax)
ax.set_title("TotRmsAbvGrd vs. SalePrice")
plt.show()


In [None]:
# Scatter plot: sale price against garage area ('GarageArea')
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(x=df_train_cleaned["GarageArea"], y=target["SalePrice"], ax=ax)
ax.set_title("GarageArea vs. SalePrice")
plt.show()


In [None]:
# Box plot: sale price grouped by construction year ('YearBuilt')
fig, ax = plt.subplots(figsize=(14, 6))
sns.boxplot(x=df_train_cleaned["YearBuilt"], y=target["SalePrice"], ax=ax)
# One box per year: rotate the tick labels so they stay legible.
ax.tick_params(axis="x", labelrotation=90)
ax.set_title("YearBuilt vs. SalePrice")
plt.show()


In [None]:
# Box plot: sale price grouped by neighborhood ('Neighborhood')
fig, ax = plt.subplots(figsize=(14, 6))
sns.boxplot(x=df_train_cleaned["Neighborhood"], y=target["SalePrice"], ax=ax)
# Neighborhood names are long: rotate the tick labels so they do not overlap.
ax.tick_params(axis="x", labelrotation=90)
ax.set_title("Neighborhood vs. SalePrice")
plt.show()


In [None]:
# Calculate the correlation matrix
# The cleaned frame still holds string-valued categorical columns, and
# DataFrame.corr() raises on those in pandas >= 2.0. Restricting to numeric
# columns first gives the same matrix on every pandas version.
numeric_features = df_train_cleaned.select_dtypes(include="number")
correlation_matrix = numeric_features.corr()

# Visualize the correlation matrix as a heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, cmap="coolwarm", annot=True, fmt=".2f", linewidths=0.5)
plt.title("Correlation Matrix")
plt.show()


## Train and Evaluate the Scikit-learn Linear Regression Model

In [None]:
# Train and evaluate a Scikit-learn Linear Regression baseline.
# 'df_train_cleaned' holds the cleaned training features and 'target' the
# matching prices, kept in a separate frame (so nothing has to be dropped from X).
# NOTE: the metrics below are on whatever scale target["SalePrice"] currently
# has -- the log1p scale if the log-transform cell has been run.
X = df_train_cleaned.copy()
y = target["SalePrice"]

# Perform one-hot encoding on categorical columns
# drop_first=True avoids perfectly collinear indicator columns.
# dtype=float keeps every column numeric: recent pandas emits bool indicators,
# and a mixed bool/float frame converts to an object array that the
# neural-network cells cannot ingest.
X = pd.get_dummies(X, drop_first=True, dtype=float)

# Split the dataset into training and testing sets (80% train, 20% test)
# random_state is fixed so every model is compared on the same split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and train the Scikit-learn Linear Regression model
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluate the Scikit-learn Linear Regression model
mse = mean_squared_error(y_test, y_pred)
r_squared = r2_score(y_test, y_pred)
print("Scikit-learn Linear Regression Model - Mean Squared Error:", mse)
print("Scikit-learn Linear Regression Model - R-squared:", r_squared)


## Train and Evaluate the TensorFlow Model

In [None]:
# Train and evaluate a small fully-connected regression network (tf.keras).

# Seed TensorFlow so weight initialisation is repeatable between runs.
tf.random.set_seed(42)

# Keras needs one homogeneous float array; a DataFrame mixing bool and float
# columns would otherwise be converted to an unsupported object array.
X_train_tf = np.asarray(X_train, dtype="float64")
X_test_tf = np.asarray(X_test, dtype="float64")

# Standardise the features with statistics from the training split only.
# The raw columns differ in magnitude, which makes gradient-based training
# of an unscaled network unstable.
tf_feature_mean = X_train_tf.mean(axis=0)
tf_feature_std = X_train_tf.std(axis=0)
tf_feature_std[tf_feature_std == 0] = 1.0  # constant columns: avoid division by zero
X_train_tf = ((X_train_tf - tf_feature_mean) / tf_feature_std).astype("float32")
X_test_tf = ((X_test_tf - tf_feature_mean) / tf_feature_std).astype("float32")
y_train_tf = np.asarray(y_train, dtype="float32")

# Create the TensorFlow model
tf_model = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation='relu', input_shape=(X_train_tf.shape[1],)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)  # Output layer with no activation function for regression
])
# Compile the model
tf_model.compile(optimizer='adam', loss='mean_squared_error')

# Train the TensorFlow model
# validation_split holds out part of the training data for per-epoch monitoring.
tf_model.fit(X_train_tf, y_train_tf, epochs=100, batch_size=32, validation_split=0.2)

# Evaluate the TensorFlow model on the held-out test split
tf_y_pred = tf_model.predict(X_test_tf)
tf_mse = mean_squared_error(y_test, tf_y_pred)
tf_r_squared = r2_score(y_test, tf_y_pred)
print("TensorFlow Model - Mean Squared Error:", tf_mse)
print("TensorFlow Model - R-squared:", tf_r_squared)


## Train and Evaluate the Keras Model

In [None]:
# Train and evaluate the "Keras" model.
# NOTE: this uses the same tf.keras API, architecture and hyper-parameters as
# the TensorFlow model cell, so any difference in score comes from training
# randomness rather than from a different library or model.

# Seed TensorFlow so weight initialisation is repeatable between runs.
tf.random.set_seed(42)

# Keras needs one homogeneous float array; a DataFrame mixing bool and float
# columns would otherwise be converted to an unsupported object array.
X_train_keras = np.asarray(X_train, dtype="float64")
X_test_keras = np.asarray(X_test, dtype="float64")

# Standardise the features with statistics from the training split only.
keras_feature_mean = X_train_keras.mean(axis=0)
keras_feature_std = X_train_keras.std(axis=0)
keras_feature_std[keras_feature_std == 0] = 1.0  # constant columns: avoid division by zero
X_train_keras = ((X_train_keras - keras_feature_mean) / keras_feature_std).astype("float32")
X_test_keras = ((X_test_keras - keras_feature_mean) / keras_feature_std).astype("float32")
y_train_keras = np.asarray(y_train, dtype="float32")

# Create the Keras model (same architecture as TensorFlow model)
keras_model = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation='relu', input_shape=(X_train_keras.shape[1],)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)  # Output layer with no activation function for regression
])
# Compile the model
keras_model.compile(optimizer='adam', loss='mean_squared_error')

# Train the Keras model
# validation_split holds out part of the training data for per-epoch monitoring.
keras_model.fit(X_train_keras, y_train_keras, epochs=100, batch_size=32, validation_split=0.2)

# Evaluate the Keras model on the held-out test split
keras_y_pred = keras_model.predict(X_test_keras)
keras_mse = mean_squared_error(y_test, keras_y_pred)
keras_r_squared = r2_score(y_test, keras_y_pred)
print("Keras Model - Mean Squared Error:", keras_mse)
print("Keras Model - R-squared:", keras_r_squared)


## Train and Evaluate the XGBoost Model

In [None]:
# Fit an XGBoost regressor with default hyper-parameters on the training split
xgb_model = xgb.XGBRegressor()
xgb_model = xgb_model.fit(X_train, y_train)

# Score the held-out test split with the same two metrics used for the other models
xgb_y_pred = xgb_model.predict(X_test)
xgb_mse, xgb_r_squared = (
    metric(y_test, xgb_y_pred) for metric in (mean_squared_error, r2_score)
)
print("XGBoost Model - Mean Squared Error:", xgb_mse)
print("XGBoost Model - R-squared:", xgb_r_squared)


## Compare model performances

In [None]:
# Compare model performances
# Each entry: (label, mean squared error, R-squared) as computed in the cells above.
model_scores = [
    ("Scikit-learn Linear Regression Model", mse, r_squared),
    ("TensorFlow Model", tf_mse, tf_r_squared),
    ("Keras Model", keras_mse, keras_r_squared),
    ("XGBoost Model", xgb_mse, xgb_r_squared),
]

for model_label, model_mse, model_r_squared in model_scores:
    print(model_label + " - Mean Squared Error:", model_mse)
    print(model_label + " - R-squared:", model_r_squared)
