In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from zipfile import ZipFile
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Unpack every member of the competition archive into the current working
# directory; the context manager closes the archive afterwards.
with ZipFile('house-prices-advanced-regression-techniques.zip', mode='r') as archive:
    archive.extractall()

In [None]:
# Load both splits, using the first CSV column as the row index.
train_df = pd.read_csv('train.csv', index_col=0)
test_df = pd.read_csv('test.csv', index_col=0)

# Data exploration.
# Only a cell's last expression is rendered automatically, so the preview of
# the first rows has to be shown explicitly; a bare `train_df.head()` in the
# middle of the cell is evaluated and thrown away.
display(train_df.head())
train_df.info()        # prints its summary itself
train_df.describe()    # last expression -> rendered as the cell output

In [None]:
# Fill missing values with each column's most frequent value.
# `DataFrame.mode()` returns a *frame* (one row per tied mode, labelled 0..k-1).
# Passing that frame to `fillna` aligns it on row labels as well as columns, so
# hardly any cell is actually filled. Taking the first row yields a Series keyed
# by column name, which `fillna` applies column by column.
# NOTE(review): with the imputation effective here, the missing-value summaries
# computed in later cells operate on already-filled frames; move this cell
# below them if the raw missing counts are wanted.
train_df = train_df.fillna(train_df.mode().iloc[0])
test_df = test_df.fillna(test_df.mode().iloc[0])

In [None]:
sample_submission_df = pd.read_csv('sample_submission.csv', index_col=False)
# `info()` prints its summary directly. `head` has to be *called*, and placed
# last, for the preview to be rendered as the cell output; the bare attribute
# `sample_submission_df.head` did nothing.
sample_submission_df.info()
sample_submission_df.head()

In [None]:
# Count of missing entries per column, keeping only the columns that have any.
train_df.isnull().sum().loc[lambda counts: counts > 0]

In [None]:
# Share of missing entries (in percent of the training rows) for every column
# that has at least one missing value.
train_missing_perectage = (
    train_df.isnull().sum()
    .loc[lambda counts: counts > 0]
    .div(len(train_df))
    .mul(100)
)
train_missing_perectage

In [None]:
# Replace the missing entries of each column with that column's first mode,
# then show the remaining per-column missing counts as a check.
train_df = train_df.fillna(
    {name: train_df[name].mode()[0] for name in train_df.columns}
)
train_df.isnull().sum()

In [None]:
# Share of missing entries (in percent) for every test column that has at
# least one missing value. The denominator is the size of the *test* frame;
# dividing by len(train_df) reported percentages against the wrong row count.
test_missing_perectage = (
    test_df.isnull().sum()
    .loc[lambda counts: counts > 0]
    .div(len(test_df))
    .mul(100)
)
test_missing_perectage

In [None]:
# Replace the missing entries of each test column with that column's first
# mode, then show the remaining per-column missing counts as a check.
test_df = test_df.fillna(
    {name: test_df[name].mode()[0] for name in test_df.columns}
)
test_df.isnull().sum()

In [None]:
# Report how many fully duplicated rows each split contains.
train_duplicate_count = train_df.duplicated().sum()
test_duplicate_count = test_df.duplicated().sum()
print(f"Number of duplicate rows in the train dataframe: {train_duplicate_count}")
print(f"Number of duplicate rows in the test dataframe: {test_duplicate_count}")

In [None]:
# Split the feature names by dtype family. Numeric columns are selected
# positively (include='number') instead of as "everything that is not object",
# so a categorical or boolean column can no longer end up in the numeric list
# (and, for categoricals, in both lists at once).
train_cat_columns = train_df.select_dtypes(include=['object', 'category']).columns.tolist()
train_num_columns = train_df.select_dtypes(include='number').columns.tolist()

In [None]:
# Histogram of SalePrice with a kernel-density overlay.
fig, ax = plt.subplots(figsize=(4, 6))
sns.histplot(train_df['SalePrice'], kde=True, ax=ax)
ax.set_title("SalePrice Distribution")
ax.set_xlabel('SalePrice')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Restrict the training frame to its numerical features.
train_df_numerical = train_df[train_num_columns]

# Heatmap of the pairwise correlations between those features.
correlation_matrix = train_df_numerical.corr()
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, cmap='coolwarm', ax=ax)
ax.set_title("Correlation Heatmap for Training Data")
plt.show()

In [None]:
# Number of numerical features to draw (one box plot each).
num_columns = len(train_df_numerical.columns)

# Grid geometry: three plots per row and just enough rows to hold them all.
# Ceiling division avoids the blank extra row that `n // 3 + 1` allocates when
# n is an exact multiple of 3; max(1, ...) keeps the grid valid for n == 0.
grid_width = 3
grid_rows = max(1, (num_columns + grid_width - 1) // grid_width)

# squeeze=False guarantees a 2-D array of axes even for a single row.
fig, axes = plt.subplots(grid_rows, grid_width, figsize=(25, 25), squeeze=False)
flat_axes = axes.ravel()

# Create box plots for each numerical column
for ax, column in zip(flat_axes, train_df_numerical.columns):
    sns.boxplot(x=train_df_numerical[column], ax=ax)
    ax.set_title(f"Box Plot for {column}", fontsize=12)

# Remove the unused cells of the last row so no empty axes are drawn.
for ax in flat_axes[num_columns:]:
    ax.remove()

# Manually adjust the layout to add more spacing between subplots
fig.subplots_adjust(hspace=2.5, wspace=0.5)  # vertical (hspace) and horizontal (wspace) spacing

# Display the plots
plt.show()