In [None]:
import polars as pl
import polars.selectors as cs
import sklearn
import catboost

import warnings 
# Blanket 'ignore' filter: suppresses every warning category for the rest of the session.
# NOTE(review): this also hides deprecation notices from the libraries imported above.
warnings.filterwarnings('ignore')

In [None]:
# Section banner for the data-exploration cells that follow.
print('1. Data Exploration')

In [None]:
# King County House Sales dataset from OpenML (includes Seattle).
# The download is an ARFF file: plain text with a header section ahead of the CSV-like rows.
url = 'https://www.openml.org/data/download/22044765/dataset'

# Column names in file order; the file carries no CSV header row, so they are supplied here.
cols = [
    'id', 'price',
    'bedrooms', 'bathrooms',
    'sqft_living', 'sqft_lot', 'floors',
    'waterfront', 'view', 'condition', 'grade',
    'sqft_above', 'sqft_basement',
    'yr_built', 'yr_renovated',
    'zipcode', 'lat', 'long',
    'sqft_living15', 'sqft_lot15',
    'date_year', 'date_month', 'date_day',
]

raw = pl.read_csv(
    url,
    has_header=False,
    skip_rows=31,  # leading lines skipped because of the dataset format
    new_columns=cols,
)

# Summary statistics of every column (last expression, so the notebook renders it).
raw.describe()

In [None]:
# Data visualization

print("1) correlation")
# Pairwise correlations of all columns, computed in pandas and shaded on a fixed [-1, 1] scale.
corr_table = raw.to_pandas(use_pyarrow_extension_array=True).corr()
display(corr_table.style.background_gradient(cmap='RdBu', vmin=-1, vmax=1))

print("2) scatter plot")
# Living area against sale price; points are drawn mostly transparent (alpha=0.1).
living_vs_price = raw.plot.scatter('sqft_living', 'price', alpha=0.1)
display(living_vs_price)

print("3) plot the trend of price by date, grouped in zip code")
# Group on a real calendar month (first day of the sale month) rather than on
# 'date_month' alone: otherwise the same month from different 'date_year' values
# is averaged together and the x-axis is not in chronological order.
display(raw
        .with_columns(sale_month=pl.date(pl.col('date_year'), pl.col('date_month'), 1))
        .group_by('sale_month', 'zipcode')
        .agg(pl.col('price').mean())  # mean sale price per (month, zipcode)
        .sort('sale_month')  # chronological order for the line plot
        .plot.line('sale_month', 'price', by='zipcode', alpha=0.5)
       )

print("4) lat/long scatter plot")
# observed that prices are higher around the coast
# Rows are ordered by ascending price before plotting (presumably so the most
# expensive houses are drawn last). To focus on luxury houses, filter on
# pl.col('price') > 1_000_000 before sorting.
price_sorted = raw.sort('price')
display(price_sorted.plot.scatter(x='long', y='lat', alpha=0.5, c='price', s=1))  # prices on a map

In [None]:
# Section banner for the preprocessing cells that follow.
print('2. Data Preprocessing')

In [None]:
# import sklearn pipelines

from sklearn.pipeline import Pipeline
# ColumnTransformer runs given steps only on selected columns instead of on every column.
from sklearn.compose import ColumnTransformer
# StandardScaler standardizes each column to zero mean and unit standard deviation.
# OneHotEncoder encodes categorical data as numeric columns, because most machine
# learning algorithms cannot work with text or categorical data directly.
from sklearn.preprocessing import StandardScaler, OneHotEncoder
# SimpleImputer fills in missing values.
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
# FunctionTransformer turns a plain function into a transformer usable as a pipeline step.
from sklearn.preprocessing import FunctionTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn import set_config
# Ask scikit-learn transformers to return polars DataFrames from transform().
set_config(transform_output='polars')

In [None]:
# 1.1 Numerical variables: fill gaps with the column median, then standardize.
# To list the numeric columns: tweak_housing(raw).select(cs.numeric()).columns
# Note: 'zip_mean' is not in the raw data; step 3.0 adds it as the average price of a zipcode.
numeric_features = [
    'bedrooms', 'bathrooms',
    'sqft_living', 'sqft_lot', 'floors',
    'waterfront', 'view', 'condition', 'grade',
    'sqft_above', 'sqft_basement',
    'yr_built', 'yr_renovated',
    'lat', 'long',
    'sqft_living15', 'sqft_lot15',
    'zip_mean',
]
numeric_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

# 1.2 Categorical variables: one-hot encode the zip code, capped at 10 output
# categories so the high-dimensional zip codes do not explode the feature count.
categorical_features = ['zipcode']
categorical_transformer = OneHotEncoder(
    handle_unknown='ignore',
    sparse_output=False,
    max_categories=10,
)

# *1.3 Column Transformer: route numeric columns through the numeric pipeline
# and categorical columns through the one-hot encoder.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features),
    ]
)


In [None]:
# 2.1 Tweak function: condense the three date parts into one date column and
# turn the 0 placeholder in yr_renovated into a missing value.
def tweak_housing(df):
    """Return a cleaned copy of the housing frame.

    Changes relative to the input:
      * ``zipcode`` is cast to String and then to Categorical.
      * ``date`` is built from ``date_year`` / ``date_month`` / ``date_day``.
      * ``yr_renovated`` values of 0 are replaced with null.

    Only the columns listed in ``kept_columns`` are returned, so the three
    separate date-part columns are not part of the result.
    """
    kept_columns = [
        'id', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
        'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement',
        'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long', 'sqft_living15',
        'sqft_lot15', 'date',
    ]

    zipcode_as_category = pl.col('zipcode').cast(pl.String).cast(pl.Categorical)
    sale_date = pl.date(pl.col('date_year'), pl.col('date_month'), pl.col('date_day'))
    renovated_or_null = pl.col('yr_renovated').replace(0, None)

    tweaked = df.with_columns(
        zipcode=zipcode_as_category,
        date=sale_date,
        yr_renovated=renovated_or_null,
    )
    return tweaked.select(kept_columns)
#tweak_housing(raw)

# *2.2 Tweak Transformer: wrap the tweak function so it can be used as a pipeline step
tweak_transformer = FunctionTransformer(func=tweak_housing)

In [None]:
# *3.0 Custom Transformer
class ZipAvgPriceAdder(BaseEstimator, TransformerMixin):
    """Add a ``zip_mean`` column: the mean ``price`` of each row's ``zipcode``.

    The per-zipcode means are learned in ``fit`` and joined onto the input in
    ``transform``. Both methods expect polars DataFrames.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Learn the mean price per zipcode.

        Parameters
        ----------
        X : polars DataFrame containing the columns ``zipcode`` and ``price``.
        y : ignored; accepted for scikit-learn API compatibility.

        Returns
        -------
        self
        """
        # One row per zipcode: zipcode -> average price of that zipcode.
        self.zip_avg_price = (X
                              .group_by('zipcode')
                              .agg(zip_mean=pl.col('price').mean())
        )
        return self

    def transform(self, X, y=None):
        """Return ``X`` with the learned ``zip_mean`` column appended.

        A left join is used so that every input row is kept. With the default
        inner join, rows whose zipcode was not seen during ``fit`` would be
        silently dropped, leaving X with fewer rows than its target. Unseen
        zipcodes get a null ``zip_mean`` instead.
        """
        # NOTE(review): recent Polars releases expose a `maintain_order` option on
        # join; confirm the installed version keeps left-frame row order here.
        return X.join(self.zip_avg_price, on='zipcode', how='left')

#zip_adder = ZipAvgPriceAdder()
#zip_adder.fit_transform(raw.select(['zipcode', 'price'])) 

In [None]:
# 4.0 Make the Pipeline!
# Chain the tweak step, the zip-average feature and the column preprocessor.
# No estimator is appended here, so this pipeline only produces the transformed features.
pipe = Pipeline(steps=[('tweak', tweak_transformer),
                      ('zip_avg_price', ZipAvgPriceAdder()),
                      ('preprocessor', preprocessor),
                      ])

# 'price' is deliberately kept in X: tweak_housing selects it and
# ZipAvgPriceAdder.fit reads it to compute the per-zipcode mean.
X = raw
y = raw.select('price') # Note sklearn wants a Polars dataframe for y

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# outputs:
display(pipe)
# Fit on the training split only. Fitting on the full `raw` frame would let the
# held-out rows influence the zip-code means, imputation medians and scaling statistics.
display(pipe.fit_transform(X_train, y_train)) # Standardize features