In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from zipfile import ZipFile
from sklearn.ensemble import RandomForestRegressor

In [2]:
with ZipFile('Boston-house-prices.zip', 'r') as f:
    f.extractall()

In [4]:
boston_df = pd.read_csv('boston.csv', index_col=0)

#data exploration
boston_df.head()
boston_df.info()
boston_df.describe()

<class 'pandas.core.frame.DataFrame'>
Index: 506 entries, 0.00632 to 0.04741
Data columns (total 13 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   ZN       506 non-null    float64
 1   INDUS    506 non-null    float64
 2   CHAS     506 non-null    int64  
 3   NOX      506 non-null    float64
 4   RM       506 non-null    float64
 5   AGE      506 non-null    float64
 6   DIS      506 non-null    float64
 7   RAD      506 non-null    int64  
 8   TAX      506 non-null    float64
 9   PTRATIO  506 non-null    float64
 10  B        506 non-null    float64
 11  LSTAT    506 non-null    float64
 12  MEDV     506 non-null    float64
dtypes: float64(11), int64(2)
memory usage: 55.3 KB


Unnamed: 0,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,MEDV
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063,22.532806
std,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062,9.197104
min,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73,5.0
25%,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95,17.025
50%,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36,21.2
75%,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955,25.0
max,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97,50.0


In [None]:
train_df.fillna(train_df.mode(), inplace=True)
test_df.fillna(test_df.mode(), inplace=True)

In [None]:
sample_submission_df = pd.read_csv('sample_submission.csv', index_col=False)
sample_submission_df.head
sample_submission_df.info()

In [None]:
# number of rows with missing data
train_df.isnull().sum()[train_df.isnull().sum() > 0]

In [None]:
# identify the percentage of missing values
train_missing_perectage = train_df.isnull().sum()[train_df.isnull().sum() > 0]/len(train_df)*100
train_missing_perectage

In [None]:
# fill missing data values with mode
train_df = train_df.apply(lambda x: x.fillna(x.mode()[0]), axis=0)
train_df.isnull().sum()

In [None]:
test_missing_perectage = test_df.isnull().sum()[test_df.isnull().sum() > 0]/len(train_df)*100
test_missing_perectage

In [None]:
test_df = test_df.apply(lambda x: x.fillna(x.mode()[0]), axis=0)
test_df.isnull().sum()

In [None]:
# Check for duplicates in the train and test data
print(f"Number of duplicate rows in the train dataframe: {train_df.duplicated().sum()}")
print(f"Number of duplicate rows in the test dataframe: {test_df.duplicated().sum()}")

In [None]:
train_cat_columns = train_df.select_dtypes(include=['object', 'category']).columns.tolist()
train_num_columns = train_df.select_dtypes(exclude=['object']).columns.tolist()

In [None]:
# Plot the distribution of SalePrice
plt.figure(figsize=(4, 6))
sns.histplot(train_df['SalePrice'], kde=True)
plt.title("SalePrice Distribution")
plt.xlabel('SalePrice')
plt.ylabel('Frequency')
plt.show()

In [None]:
# numerical columns for the training dataframe
train_df_numerical = train_df[train_num_columns]

# Correlation heatmap
plt.figure(figsize=(12, 8))
sns.heatmap(train_df_numerical.corr(), cmap='coolwarm')
plt.title("Correlation Heatmap for Training Data")
plt.show()

In [None]:
# Set the figure size
plt.figure(figsize=(25, 25))

# Number of columns for the grid of subplots
num_columns = len(train_df_numerical.columns)

# Create box plots for each numerical column
for i, column in enumerate(train_df_numerical.columns):
    plt.subplot(num_columns // 3 + 1, 3, i + 1)  
    sns.boxplot(x=train_df_numerical[column])
    plt.title(f"Box Plot for {column}", fontsize=12)

# Manually adjust the layout to add more spacing between subplots
plt.subplots_adjust(hspace=2.5, wspace=0.5)  # Adjust vertical (hspace) and horizontal (wspace) spacing

# Display the plots
plt.show()