In [1]:
# Data processing
import pandas as pd
import numpy as np
import itertools

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder

import shap

# local python scripts package
from utils import plots
from utils.plots import *

# Logging
import logging
# Set the level of the *root* logger to DEBUG. Any logger that does not set
# its own level inherits this one, so third-party libraries may become
# verbose as well, not only this notebook's own messages.
logging.getLogger().setLevel(logging.DEBUG)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the housing CSV once and keep the result untouched in `data`;
# every transformation below is applied to the independent copy `df`.
data = pd.read_csv("../dataset/king_country_houses_aa.csv")
df = data.copy()

# Preview the first five rows (the default of head()).
df.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,20141013T000000,221900.0,3,1.0,1180,5650,1.0,0,0,...,7,1180,0,1955,0,98178,47.5112,-122.257,1340,5650
1,6414100192,20141209T000000,538000.0,3,2.25,2570,7242,2.0,0,0,...,7,2170,400,1951,1991,98125,47.721,-122.319,1690,7639
2,5631500400,20150225T000000,180000.0,2,1.0,770,10000,1.0,0,0,...,6,770,0,1933,0,98028,47.7379,-122.233,2720,8062
3,2487200875,20141209T000000,604000.0,4,3.0,1960,5000,1.0,0,0,...,7,1050,910,1965,0,98136,47.5208,-122.393,1360,5000
4,1954400510,20150218T000000,510000.0,3,2.0,1680,8080,1.0,0,0,...,8,1680,0,1987,0,98074,47.6168,-122.045,1800,7503


In [3]:
# Name of the column the model predicts; also handed to the plotting helper
# as its `target_column`.
target_variable = 'price'

In [4]:
# Create the plotting helper, sized to the number of columns in `df`
# (presumably so the colour palette has one entry per column — confirm in utils.plots).
# NOTE(review): this assignment rebinds the name `plots`, which the import
# cell already bound to the `utils.plots` module. From here on `plots` is the
# Gryffindor_plots instance and the module is no longer reachable by that name.
plots = Gryffindor_plots(n_columns=df.shape[1], target_column=target_variable)

## Feature engineering

Clustering zip-codes using K-Means

- To group nearby zip codes together → Similar economic/geographic areas may influence house prices.
- To simplify data → Instead of many zip codes, we now have just 5 clusters, making it easier for the model to learn patterns.
- To reduce complexity → Avoid the issue of sparse features from one-hot encoding.

In [5]:
# Number of zip-code groups; adjust based on experimentation.
num_clusters = 5

# One row per distinct zip code, in order of first appearance.
zip_df = pd.DataFrame({'zipcode': df['zipcode'].unique()})

# Cluster on the numeric zip-code value itself (a single 1-D feature).
# random_state pins the initialisation so the labels are reproducible.
kmeans = KMeans(n_clusters=num_clusters, random_state=42, n_init=10)
zip_df['zipcode_cluster'] = kmeans.fit_predict(zip_df[['zipcode']])

# Attach the cluster label to every sale through its zip code.
df = df.merge(zip_df, how='left', on='zipcode')

# Print the zip code -> cluster mapping, ordered by zip code.
zip_to_cluster = df[['zipcode', 'zipcode_cluster']].drop_duplicates()
print(zip_to_cluster.sort_values(by='zipcode'))

     zipcode  zipcode_cluster
38     98001                2
18     98002                2
6      98003                2
66     98004                2
70     98005                2
..       ...              ...
120    98177                1
0      98178                1
299    98188                1
7      98198                1
91     98199                1

[70 rows x 2 columns]


Encode zipcodes to make categorical data useful for training a model.

In [6]:
# Fit the encoder on the zip codes; it stays available as `label_enc` so the
# integer codes can be mapped back to zip codes later.
label_enc = LabelEncoder().fit(df['zipcode'])

# Add the integer-coded column, then drop the raw 'zipcode' column, which is
# no longer needed once the encoded and clustered versions exist.
df = (
    df.assign(zipcode_encoded=label_enc.transform(df['zipcode']))
      .drop(columns=['zipcode'])
)

Interpreting date information:
- Convert date column to datetime format and extract year as 'yr_sold'
- Compute the age of the building at the time of sale and save it as a new column
- Compute the years since the last renovation (0 if the house was never renovated) and save it as a new column

In [7]:
# Derive all date-based features in one pass. Each entry may use the columns
# created by the entries above it.
df = df.assign(
    # Parse strings such as '20141013T000000' into datetimes.
    date=lambda d: pd.to_datetime(d['date'], format='%Y%m%dT%H%M%S'),
    # Calendar year of the sale.
    yr_sold=lambda d: d['date'].dt.year,
    # Age of the building in the year it was sold.
    building_age=lambda d: d['yr_sold'] - d['yr_built'],
    # Years since the last renovation; yr_renovated == 0 marks "never
    # renovated", for which the feature is set to 0 instead of yr_sold - 0.
    yrs_since_renovation=lambda d: (d['yr_sold'] - d['yr_renovated']).where(
        d['yr_renovated'] > 0, 0
    ),
)


Interpreting space information:
- Total bathrooms per bedroom ratio: bath_bed_ratio = bathrooms / bedrooms (handle division by zero)
- Total space per room: sqft_per_room = sqft_living / (bedrooms + bathrooms)


In [8]:
def _safe_ratio(numerator, denominator):
    """Divide two Series element-wise, returning 0 wherever the denominator is 0.

    Plain Series division gives +/-inf for x/0 but NaN for 0/0, so replacing
    only the infinities would leave NaN behind for rows where both values are
    zero. Masking on the denominator covers both cases. Rows with a non-zero
    denominator are left exactly as the division produced them.
    """
    return (numerator / denominator).mask(denominator == 0, 0)


# Bathrooms per bedroom; 0 for listings with no bedrooms.
df['bath_bed_ratio'] = _safe_ratio(df['bathrooms'], df['bedrooms'])

# Living area per room, counting bedrooms and bathrooms; 0 when there are none.
df['sqft_per_room'] = _safe_ratio(df['sqft_living'], df['bedrooms'] + df['bathrooms'])

Log transformation for highly skewed variables:
- sqft_lot
- sqft_lot15
- sqft_above
- sqft_basement
- sqft_living


In [9]:
# Add a log-scaled copy of each highly skewed area column.
# log1p(x) = log(1 + x), so a zero area (e.g. sqft_basement == 0) maps to 0
# rather than -inf.
skewed_columns = ['sqft_lot', 'sqft_lot15', 'sqft_above', 'sqft_basement', 'sqft_living']
df = df.assign(**{f'log_{name}': np.log1p(df[name]) for name in skewed_columns})

Extracting data from ID column and dropping it:
- Which houses were sold more than once?
- How many times the house was sold?
- Years since last sale?

In [10]:
# Number of rows (sales) recorded for each house id.
df['sale_count'] = df.groupby('id')['id'].transform('count')

# Flag houses that appear more than once: 1 = sold multiple times, 0 = sold once.
df['multiple_sales'] = (df['sale_count'] > 1).astype('int64')

# Order each house's sales by year so diff() compares consecutive sales.
df = df.sort_values(by=['id', 'yr_sold'])

# Whole calendar years since the same house's previous sale. The first
# recorded sale of a house has no predecessor (NaN from diff) and is set to 0.
years_between_sales = df.groupby('id')['yr_sold'].diff()
df['yrs_since_previous_sale'] = years_between_sales.fillna(0)

# All information has been extracted from the identifier; it is not a feature.
df = df.drop(columns=['id'])

### Collecting the features

In [11]:
# Feature-name registries, one list per kind of feature; filled in by the cells below.
continuous_features, descrete_features, ordinal_encoded_categorical_features = [], [], []

In [12]:
# Columns whose dtype is not numeric — these need separate handling before modelling.
list(df.select_dtypes(exclude="number").columns)

['date']

In [13]:
# Confirm that 'date' was parsed into a datetime column.
df['date'].dtype

dtype('<M8[ns]')

In [14]:
# Register the datetime column with the discrete group. It is taken out again
# when the model's input list is assembled, because it is not numeric.
# NOTE: re-running this cell appends 'date' a second time.
descrete_features.append('date')

In [15]:
# Names of all numeric columns, in alphabetical order.
numeric_columns_list = sorted(df.select_dtypes(include="number").columns)
numeric_columns_list

['bath_bed_ratio',
 'bathrooms',
 'bedrooms',
 'building_age',
 'condition',
 'floors',
 'grade',
 'lat',
 'log_sqft_above',
 'log_sqft_basement',
 'log_sqft_living',
 'log_sqft_lot',
 'log_sqft_lot15',
 'long',
 'multiple_sales',
 'price',
 'sale_count',
 'sqft_above',
 'sqft_basement',
 'sqft_living',
 'sqft_living15',
 'sqft_lot',
 'sqft_lot15',
 'sqft_per_room',
 'view',
 'waterfront',
 'yr_built',
 'yr_renovated',
 'yr_sold',
 'yrs_since_previous_sale',
 'yrs_since_renovation',
 'zipcode_cluster',
 'zipcode_encoded']

In [16]:
# Sort every numeric column (except the target 'price') into one of the three
# groups. Each list keeps the alphabetical order of the column names.
# NOTE: extend() adds to the existing lists, so re-running this cell without
# re-running the initialisation cell duplicates the entries.
continuous_features.extend([
    'bath_bed_ratio',
    'lat',
    'log_sqft_above',
    'log_sqft_basement',
    'log_sqft_living',
    'log_sqft_lot',
    'log_sqft_lot15',
    'long',
    'sqft_above',
    'sqft_basement',
    'sqft_living',
    'sqft_living15',
    'sqft_lot',
    'sqft_lot15',
    'sqft_per_room',
])

descrete_features.extend([
    'bathrooms',
    'bedrooms',
    'building_age',
    'floors',
    'multiple_sales',
    'sale_count',
    'view',
    'yr_built',
    'yr_renovated',
    'yr_sold',
    'yrs_since_previous_sale',
    'yrs_since_renovation',
    'zipcode_cluster',
    'zipcode_encoded',
])

ordinal_encoded_categorical_features.extend([
    'condition',
    'grade',
    'waterfront',
])

print(f"continuous_features: \n {continuous_features}\n")
print(f"descrete_features: \n {descrete_features}\n")
print(f"ordinal_encoded_categorical_features: \n {ordinal_encoded_categorical_features}\n")

continuous_features: 
 ['bath_bed_ratio', 'lat', 'log_sqft_above', 'log_sqft_basement', 'log_sqft_living', 'log_sqft_lot', 'log_sqft_lot15', 'long', 'sqft_above', 'sqft_basement', 'sqft_living', 'sqft_living15', 'sqft_lot', 'sqft_lot15', 'sqft_per_room']

descrete_features: 
 ['date', 'bathrooms', 'bedrooms', 'building_age', 'floors', 'multiple_sales', 'sale_count', 'view', 'yr_built', 'yr_renovated', 'yr_sold', 'yrs_since_previous_sale', 'yrs_since_renovation', 'zipcode_cluster', 'zipcode_encoded']

ordinal_encoded_categorical_features: 
 ['condition', 'grade', 'waterfront']



In [17]:
# Model inputs: the continuous and discrete groups, without 'date', which is a
# datetime column rather than a numeric one.
# Filtering (instead of list.remove) drops *every* occurrence of 'date' and
# does not raise when it is absent, so the result stays correct even if the
# cell that registers 'date' was run more than once or not at all.
# NOTE(review): ordinal_encoded_categorical_features ('condition', 'grade',
# 'waterfront') is not part of this list, so those columns never reach the
# model — confirm that this is intended.
all_numeric_features = [
    name for name in continuous_features + descrete_features if name != 'date'
]
print(all_numeric_features)

['bath_bed_ratio', 'lat', 'log_sqft_above', 'log_sqft_basement', 'log_sqft_living', 'log_sqft_lot', 'log_sqft_lot15', 'long', 'sqft_above', 'sqft_basement', 'sqft_living', 'sqft_living15', 'sqft_lot', 'sqft_lot15', 'sqft_per_room', 'bathrooms', 'bedrooms', 'building_age', 'floors', 'multiple_sales', 'sale_count', 'view', 'yr_built', 'yr_renovated', 'yr_sold', 'yrs_since_previous_sale', 'yrs_since_renovation', 'zipcode_cluster', 'zipcode_encoded']


### Feature importance

In [18]:
# Target: the sale price. Inputs: the numeric feature columns selected above.
y = df['price']
X = df[all_numeric_features]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [19]:
# Random forest of 100 trees, seeded for reproducibility; it serves as the
# model whose predictions SHAP explains below.
model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
)
model.fit(X_train, y_train)


In [20]:
# Build a SHAP TreeExplainer around the fitted random forest.
explainer = shap.TreeExplainer(model)


In [21]:
# Compute SHAP values for the held-out test rows.
# Presumably one row per test sample and one column per feature — the next
# cells average over axis 0 and pair the result with the feature names.
shap_values = explainer.shap_values(X_test)



In [22]:
# Global importance per feature: the absolute SHAP value averaged over all test rows.
shap_summary = np.mean(np.abs(shap_values), axis=0)



In [23]:
# Pair each feature name with its mean |SHAP| value and rank from most to
# least important.
shap_summary_df = (
    pd.DataFrame({'Feature': all_numeric_features, 'SHAP values': shap_summary})
    .sort_values('SHAP values', ascending=False)
)

In [26]:
# Display the features ranked by mean |SHAP| value, highest first.
shap_summary_df

Unnamed: 0,Feature,SHAP values
1,lat,127451.356525
10,sqft_living,67991.286462
4,log_sqft_living,67779.359729
7,long,42709.079822
11,sqft_living15,27953.672247
21,view,14822.927587
8,sqft_above,6546.325316
2,log_sqft_above,6485.693467
28,zipcode_encoded,6337.290617
0,bath_bed_ratio,4032.392476


# Key features based on SHAP Feature Importance

1. **Latitude (`lat`)**: 🚀 The strongest single feature (mean |SHAP| ≈ 127,451).  
   - Suggests that location heavily influences house prices.  

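2. **Living Area (`sqft_living`, `log_sqft_living`)**: (mean |SHAP| ≈ 68,000 each).  
   - Larger houses tend to be more expensive.
   - Both columns carry the same information (one is a monotonic transform of the other), so the model splits the credit between them; taken together (≈ 135,800) living area outranks latitude.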

3. **Longitude (`long`)**: (~42,709 SHAP value).  
   - Together with latitude, it defines neighborhood influence.

4. **Neighborhood Influence (`sqft_living15`)**: (~27,953 SHAP value).  
   - Nearby homes' sizes impact pricing.

## ⚖️ Moderately Important Features:
5. **View Quality (`view`)**: (~14,822 SHAP value).  
   - A better view significantly increases price.

6. **Above-Ground Area (`sqft_above`, `log_sqft_above`)**: (~6,500 SHAP value).  
   - Matters, but less than total living area.

7. **Zip Code (`zipcode_encoded`)**: (~6,337 SHAP value).  
   - Some zip codes impact pricing, but **latitude and longitude are better indicators**.

## 📉 Least Important Features:
8. **Years Since Last Sale (`yrs_since_previous_sale`)**: (~265 SHAP value).  
   - A house's **sales history has little impact** on price.

9. **Total Sale Count (`sale_count`)**: (~113 SHAP value).  
   - The number of times a house was sold does not significantly affect price.

10. **Multiple Sales Flag (`multiple_sales`)**: (~100 SHAP value).  
    - Houses sold multiple times do not differ much in pricing.

11. **Floors (`floors`) and Bedrooms (`bedrooms`)**: (~500 SHAP value).  
    - Simply counting floors or bedrooms is **not as important as total square footage**.

