In [1]:
import pandas as pd

TEST = '/kaggle/input/house-prices-advanced-regression-techniques/test.csv'
TRAIN = '/kaggle/input/house-prices-advanced-regression-techniques/train.csv'

test_raw_df = pd.read_csv(filepath_or_buffer=TEST, )
submission_df = test_raw_df[['Id']]
test_raw_df = test_raw_df.drop(columns=['Id'])
train_raw_df = pd.read_csv(filepath_or_buffer=TRAIN, ).drop(columns=['Id'])
train_raw_df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


We're going to process our data in three phases:
* Impute missing values for numerical data
* Fill in unknown data for missing categorical data and use a label encoder to convert non-numerical data to integers
* Remove a handful of price outliers from our training data
We want our code to be reintrant so we're going to do this all in one cell and we're going to make copies of our data as we proceed so we can look at the intermediate data if necessary. We would need to do this differently if we had streaming data, or very large data, or whatever.

In [2]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder

TARGET = 'SalePrice'

# first we want to separate the numerical columns from the categorical columns
keys = [key for key, value in train_raw_df.drop(columns=[TARGET]).dtypes.to_dict().items() if str(value) in {'int64', 'float64'}]

# doing this a row at a time may be dumb but let's do it initially just to get code working
train_knn_df = train_raw_df.copy()
test_knn_df = test_raw_df.copy()
for key in keys:
    imputer = KNNImputer()
    train_knn_df[key] = imputer.fit_transform(X=train_knn_df[key].values.reshape(-1, 1))
    test_knn_df[key] = imputer.transform(X=test_knn_df[key].values.reshape(-1, 1))

# now let's fill in the unknown values in our categorical columns
train_df = train_knn_df.copy()
test_df = test_knn_df.copy()
categorical = [column for column in train_knn_df.columns if column not in keys and column != TARGET]
for column in categorical:
    encoder = LabelEncoder()
    # we can have missing values in our training data or our test data or both, so we need to fit the encoder carefully
    values = train_df[column].fillna(value='unknown').unique().tolist() + test_df[column].fillna(value='unknown').unique().tolist()
    encoder.fit(y=values)
    train_df[column] = encoder.transform(X=train_df[column].fillna(value='unknown'))
    test_df[column]= encoder.transform(X=test_df[column].fillna(value='unknown'))
    
# remove price outliers from the training data
train_df = train_df[train_df[TARGET] < 500000]
print(train_df.shape, test_df.shape)

(1451, 80) (1459, 79)


This is our first look so we're just going to run a nominal number of iterations, pick the best iteration according to the validation set and be done. We will come back and try to tune the model.

In [3]:
from lightgbm import LGBMRegressor
from sklearn.model_selection import train_test_split

BAGGING_FRACTION = 7e-1
LEARNING_RATE = 1e-1 # was 5e-2
RANDOM_STATE = 1 # was 2013
X_train, X_valid, y_train, y_valid = train_test_split(train_df.drop(columns=[TARGET]), train_df[TARGET], test_size=0.20, random_state=RANDOM_STATE)

model = LGBMRegressor(boosting_type='gbdt', bagging_fraction=BAGGING_FRACTION, bagging_freq=5, feature_fraction=9e-1, learning_rate=LEARNING_RATE,
                      metric='rmse', num_leaves=31, verbose=1, random_state=RANDOM_STATE, force_col_wise=True,)
model.fit(X=X_train, y=y_train, eval_set=[(X_valid, y_valid), (X_train, y_train)])
result = model.predict(X=test_df, num_iteration=model.best_iteration_)

[LightGBM] [Info] Total Bins 3097
[LightGBM] [Info] Number of data points in the train set: 1160, number of used features: 72
[LightGBM] [Info] Start training from score 177781.313793


In [4]:
import pandas as pd
from plotly import express

training_df = pd.DataFrame(data={'training': model.evals_result_['training']['rmse'], 'validation': model.evals_result_['valid_0']['rmse']})
express.line(data_frame=training_df, y=['training', 'validation'])

In [5]:
import numpy as np
from sklearn.metrics import mean_squared_error

def rmse(y_true, y_pred) -> float:
    return mean_squared_error(squared=False, y_true=np.log(1 + y_true), y_pred=np.log(1 + y_pred))

print('RMSE: {:7.5f}'.format(rmse(y_true=y_valid, y_pred=model.predict(X=X_valid, num_iteration=model.best_iteration_))))

RMSE: 0.13899


In [6]:
SUBMISSION = '/kaggle/working/submission.csv'

submission_df[TARGET] = result
submission_df.to_csv(path_or_buf=SUBMISSION, index=False)
print('done.')

done.


The model doesn't use all of our features so let's look at the feature importances; this may suggest some future work.

In [7]:
import pandas as pd
from plotly import express

feature_df = pd.DataFrame(data={'feature': model.feature_name_, 'importance': model.feature_importances_}).sort_values(ascending=False, by='importance')
express.histogram(data_frame=feature_df, x='feature', y='importance', )

How stable is our validation RMSE as a function of the random seed?

In [8]:
# from plotly import express

# results = []

# bagging_fraction = 7e-1
# bagging_fractions = [1e-1, 2e-1, 3e-1, 4e-1, 5e-1, 6e-1, 7e-1, 8e-1, 9e-1]
# random_state = 2024
# random_states = range(2000, 2020)
# for random_state in random_states:
#     Xi_train, Xi_valid, yi_train, yi_valid = train_test_split(train_df.drop(columns=[TARGET]), train_df[TARGET], test_size=0.20, random_state=random_state)
#     modeli = LGBMRegressor(boosting_type='gbdt', bagging_fraction=bagging_fraction, bagging_freq=5, feature_fraction=9e-1, learning_rate=5e-2, metric='rmse',
#                            num_leaves=31, verbose=0, random_state=random_state)
#     modeli.fit(X=Xi_train, y=yi_train, eval_set=[(Xi_valid, yi_valid), (Xi_train, yi_train)])
#     results.append(
#         rmse(y_true=yi_valid, y_pred=modeli.predict(X=Xi_valid, num_iteration=modeli.best_iteration_, verbose=0))
#     )
# plot_df = pd.DataFrame(data={'random state': list(random_states), 'rmse': results})
# express.line(data_frame=plot_df, x='random state', y='rmse').show()
# print('mean rmse: {:7.5f}'.format(sum(results)/len(results)))