In [None]:
# Cell 1: Import necessary libraries and load the datasets

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
# Silence library warnings so they do not clutter the notebook output.
# NOTE(review): this blanket filter also hides pandas FutureWarnings/deprecations;
# consider narrowing it to specific categories.
warnings.filterwarnings("ignore")
# Seaborn theme (dark grid, enlarged fonts) applied to every plot that follows.
sns.set(style="darkgrid", font_scale=1.5)
# Show all columns when a DataFrame is rendered. Use the documented option key
# "display.max_columns"; the dotted spelling only resolved through pandas' regex
# based option lookup.
pd.set_option("display.max_columns", None)

# Load the train and test datasets.
# The data directory can be overridden with the HOUSE_PRICES_DATA_DIR environment
# variable so the notebook is not tied to one machine; when the variable is unset
# the original location is used.
import os
from pathlib import Path

DATA_DIR = Path(
    os.environ.get(
        "HOUSE_PRICES_DATA_DIR",
        "/Users/nels.jacobson2/Desktop/Analytics_Class_Folder/Group-7-Project-4/Data",
    )
)
df_train = pd.read_csv(DATA_DIR / "train.csv")
df_test = pd.read_csv(DATA_DIR / "test.csv")

In [None]:
# Cell 2: Display information about the train dataset
# A notebook cell only renders its LAST bare expression, so every intermediate
# table is passed to display() explicitly (display is provided by the IPython kernel).

# Report the dimensions of both datasets
print("Train Dataset has", df_train.shape[0], "Records/Rows and", df_train.shape[1], "attributes/columns.")
print("Test Dataset has", df_test.shape[0], "Records/Rows and", df_test.shape[1], "attributes/columns.")

# First few rows of the train dataset
display(df_train.head())

# Concise information about the train dataset (info() prints directly)
df_train.info(verbose=False)

# Summary statistics for categorical columns in the train dataset
display(df_train.describe(include="object"))

# Summary statistics for numerical columns in the train dataset
display(df_train.describe(include=[int, float]))

# Sample 5 rows from the train dataset; the fixed seed keeps the sample stable across re-runs
display(df_train.sample(5, random_state=42))


In [None]:
# Cell 3: Data Cleaning and Handling Missing Values
# Keep the test 'Id' values for future reference, then drop 'Id' from both datasets
test_id = df_test["Id"]
df_train = df_train.drop(columns="Id")
df_test = df_test.drop(columns="Id")

# Percentage of missing values per column. Each dataset is normalised by its OWN
# row count (the test percentages were previously divided by len(df_train)).
train_missing_pct = round(df_train.isnull().sum() / len(df_train) * 100, 2)
test_missing_pct = round(df_test.isnull().sum() / len(df_test) * 100, 2)
null_df = train_missing_pct.sort_values().to_frame(name="Train % of Missing Values")
# Assignment aligns on column name; columns absent from the test set become NaN here.
null_df["Test % of Missing Values"] = test_missing_pct

# Display features with more than 45% missing values in either train or test dataset.
# display() is required because this is not the last expression of the cell.
mostly_missing = (null_df["Train % of Missing Values"] > 45) | (null_df["Test % of Missing Values"] > 45)
display(null_df[mostly_missing])

# Drop columns with more than 45% missing values from both train and test datasets
cols_to_drop = ["FireplaceQu", "Fence", "Alley", "MiscFeature", "PoolQC"]
df_train = df_train.drop(columns=cols_to_drop)
df_test = df_test.drop(columns=cols_to_drop)

# Separate the target variable "SalePrice" (kept as a one-column DataFrame) from the train dataset
target = df_train[["SalePrice"]].reset_index(drop=True)
df_train = df_train.drop(columns=["SalePrice"])


In [None]:
# Cell 4: Stack train on top of test so cleanup and analysis run once over both.
# ignore_index=True renumbers the rows 0..n-1 in the combined frame.
df = pd.concat([df_train, df_test], ignore_index=True)


In [None]:
# Cell 5: Handling Missing Values for Garage-related Features
# Numeric garage columns are filled with 0, all other garage columns with the label "None".
garage_cols = ["GarageYrBlt", "GarageArea", "GarageCars", "GarageType", "GarageFinish", "GarageQual", "GarageCond"]
for col in garage_cols:
    # Decide by "is numeric" rather than "dtype == object" so that string columns stored
    # with a non-object dtype still receive the "None" label instead of 0.
    fill_value = 0 if pd.api.types.is_numeric_dtype(df[col]) else "None"
    # Assign the result back: df[col].fillna(..., inplace=True) acts on a temporary
    # object and does not update df when pandas Copy-on-Write is active.
    df[col] = df[col].fillna(fill_value)


In [None]:
# Cell 6: Handling Missing Values for Basement-related Features
# Missing basement descriptors are filled with the label "None".
basement_cols = ["BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2"]
# Assign the filled columns back in one step; the chained in-place form
# df[col].fillna(..., inplace=True) does not update df under pandas Copy-on-Write.
df[basement_cols] = df[basement_cols].fillna("None")


In [None]:
# Cell 7: Handling Missing Values for Other Categorical Features
# MSZoning: fill with the most frequent zoning of the same MSSubClass. If a class has
# no observed zoning at all its mode is empty, so fall back to the overall mode
# instead of failing on mode()[0].
overall_zoning_mode = df["MSZoning"].mode()[0]


def _fill_with_group_mode(zoning):
    """Fill missing values of one MSSubClass group with that group's mode."""
    group_mode = zoning.mode()
    fallback = overall_zoning_mode if group_mode.empty else group_mode.iloc[0]
    return zoning.fillna(fallback)


df["MSZoning"] = df.groupby("MSSubClass")["MSZoning"].transform(_fill_with_group_mode)

# Remaining categorical columns: fill with the column's most frequent value.
# NOTE(review): if missing MasVnrType actually means "no veneer" in this data,
# filling it with the mode is misleading - confirm against the data dictionary.
cat_cols = ['Utilities', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Electrical', 'KitchenQual', 'Functional', 'SaleType']
for col in cat_cols:
    # Assign back instead of chained inplace=True, which does not update df
    # under pandas Copy-on-Write.
    df[col] = df[col].fillna(df[col].mode()[0])


In [None]:
# Cell 8: Handling Missing Values for LotFrontage
# Within each Neighborhood group, replace missing LotFrontage with that group's median.
def _fill_with_group_median(frontage):
    """Return the group's values with gaps replaced by the group's median."""
    return frontage.fillna(frontage.median())


frontage_by_neighborhood = df.groupby('Neighborhood')['LotFrontage']
df["LotFrontage"] = frontage_by_neighborhood.transform(_fill_with_group_median)


In [None]:
# Cell 9: Handling Missing Values for MasVnrArea, BsmtFinSF1, and BsmtFinSF2
# Each area column is filled with the median area of rows sharing the same value
# in its companion type column.
area_to_type_col = {
    "MasVnrArea": "MasVnrType",
    "BsmtFinSF1": "BsmtFinType1",
    "BsmtFinSF2": "BsmtFinType2",
}
for area_col, type_col in area_to_type_col.items():
    # Per-row median of the row's group (NaN where the grouping key itself is missing).
    group_median = df.groupby(type_col)[area_col].transform("median")
    # Only fill the gaps. Assigning the transform output directly would also turn
    # observed areas into NaN for rows whose type column is missing.
    df[area_col] = df[area_col].fillna(group_median)


In [None]:
# Cell 10: Handling Missing Values for BsmtFullBath and BsmtHalfBath
# Missing basement bathroom counts are filled with 0.
bsmt_bath_cols = ["BsmtFullBath", "BsmtHalfBath"]
# Assign the result back; the chained form df[col].fillna(0, inplace=True)
# does not update df under pandas Copy-on-Write.
df[bsmt_bath_cols] = df[bsmt_bath_cols].fillna(0)


In [None]:
# Cell 11: Completing the 'TotalBsmtSF' feature
# Sum of the two finished-basement areas.
finished_bsmt_sf = df["BsmtFinSF1"] + df["BsmtFinSF2"]
if "TotalBsmtSF" in df.columns:
    # The column is already part of the data: keep the recorded totals and only fill
    # the gaps, instead of overwriting every row with the finished-area sum.
    # NOTE(review): the recorded total presumably also covers unfinished area
    # (BsmtUnfSF), which the fallback sum does not - confirm against the data dictionary.
    df["TotalBsmtSF"] = df["TotalBsmtSF"].fillna(finished_bsmt_sf)
else:
    # No such column yet: create it from the finished areas, as before.
    df["TotalBsmtSF"] = finished_bsmt_sf


In [None]:
# Cell 12: Handling Missing Values for BsmtUnfSF
# Fill missing BsmtUnfSF with the column median. The result is assigned back because
# the chained form df[col].fillna(..., inplace=True) does not update df under
# pandas Copy-on-Write.
df["BsmtUnfSF"] = df["BsmtUnfSF"].fillna(df["BsmtUnfSF"].median())


In [None]:
# Cell 13: Report how many missing cells remain in the combined frame
# Count NaNs per column first, then add the per-column counts together.
missing_per_column = df.isnull().sum()
remaining_missing = missing_per_column.sum()
print("Total Missing Values Left is:", remaining_missing)


In [None]:
# Cell 14: Split the cleaned frame back into its train and test parts
# The first len(df_train) rows came from the train set; everything after is test.
n_train_rows = len(df_train)
df_train_cleaned = df.iloc[:n_train_rows].copy()
df_test_cleaned = df.iloc[n_train_rows:].copy()


In [None]:
# Cell 15: Data Analysis and Visualization
# Histogram (with KDE overlay) of the target variable "SalePrice"
sale_price = target["SalePrice"]
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(sale_price, kde=True, color="blue", ax=ax)
ax.set_title("Distribution of SalePrice")
ax.set_xlabel("SalePrice")
ax.set_ylabel("Frequency")
plt.show()

# Skewness before the transform
print("Skewness of SalePrice:", sale_price.skew())

# Replace "SalePrice" with log(1 + SalePrice) to reduce skewness.
# NOTE: the column is overwritten, so re-running this cell applies the log again.
target["SalePrice"] = np.log1p(sale_price)

# Skewness after the transform
print("Skewness of log-transformed SalePrice:", target["SalePrice"].skew())

# Visualize how selected features relate to the sale price, one figure per feature.
# Each entry: (seaborn plot function, feature column, figure size, rotate x tick labels?)
#   OverallQual  - overall quality               -> box plot
#   GrLivArea    - above-ground living area      -> scatter plot
#   TotRmsAbvGrd - total rooms above grade       -> box plot
#   GarageArea   - garage area                   -> scatter plot
#   YearBuilt    - year built                    -> wide box plot, rotated labels
#   Neighborhood - neighborhood                  -> wide box plot, rotated labels
price_plot_specs = [
    (sns.boxplot, "OverallQual", (10, 6), False),
    (sns.scatterplot, "GrLivArea", (10, 6), False),
    (sns.boxplot, "TotRmsAbvGrd", (10, 6), False),
    (sns.scatterplot, "GarageArea", (10, 6), False),
    (sns.boxplot, "YearBuilt", (14, 6), True),
    (sns.boxplot, "Neighborhood", (14, 6), True),
]

for draw_plot, feature, fig_size, rotate_ticks in price_plot_specs:
    plt.figure(figsize=fig_size)
    # y is target["SalePrice"] as it stands when this code runs.
    draw_plot(x=df_train_cleaned[feature], y=target["SalePrice"])
    if rotate_ticks:
        # Many distinct x values: turn the labels vertical so they stay legible.
        plt.xticks(rotation=90)
    plt.title(f"{feature} vs. SalePrice")
    plt.show()

# Calculate the correlation matrix over the numeric features only.
# DataFrame.corr() no longer drops non-numeric columns silently in pandas >= 2.0
# and raises on string columns, so the numeric columns are selected explicitly
# (this also works on older pandas versions).
numeric_features = df_train_cleaned.select_dtypes(include="number")
correlation_matrix = numeric_features.corr()

# Visualize the correlation matrix as an annotated heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, cmap="coolwarm", annot=True, fmt=".2f", linewidths=0.5)
plt.title("Correlation Matrix")
plt.show()
