In [1]:
import pandas as pd

# Competition files as mounted by Kaggle; both CSVs live in the same input directory.
DATA_DIR = '/kaggle/input/house-prices-advanced-regression-techniques'
TEST = f'{DATA_DIR}/test.csv'
TRAIN = f'{DATA_DIR}/train.csv'

test_raw_df = pd.read_csv(filepath_or_buffer=TEST)
# Keep the test ids in an independent frame. A prediction column is written
# into it later, and the explicit copy means that write never depends on
# pandas' view-vs-copy handling of a `[['Id']]` selection.
submission_df = test_raw_df[['Id']].copy()
# Drop 'Id' from both tables so it is not passed on as a feature column.
test_raw_df = test_raw_df.drop(columns=['Id'])
train_raw_df = pd.read_csv(filepath_or_buffer=TRAIN).drop(columns=['Id'])
train_raw_df.head()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
1,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
2,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
3,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
4,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [2]:
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder

TARGET = 'SalePrice'

# Separate the numerical feature columns from the categorical ones. The split
# is decided on the training frame's dtypes and reused for the test frame.
NUMERIC_DTYPES = {'int64', 'float64'}
keys = [
    key
    for key, value in train_raw_df.drop(columns=[TARGET]).dtypes.items()
    if str(value) in NUMERIC_DTYPES
]

# Impute every numerical column with ONE imputer fitted on all of them.
# Fitting a separate KNNImputer per column leaves it no other feature to
# measure distances on, so each gap is filled with the column mean and the
# nearest-neighbour search is never used.
# NOTE: the features are not scaled here, so columns with large values
# dominate the neighbour distances.
train_knn_df = train_raw_df.copy()
test_knn_df = test_raw_df.copy()
imputer = KNNImputer()
# Fit on the training rows only, then apply the same fitted imputer to test.
train_knn_df[keys] = imputer.fit_transform(X=train_knn_df[keys])
test_knn_df[keys] = imputer.transform(X=test_knn_df[keys])

# Now fill in the unknown values in the categorical columns and turn each
# category into an integer code.
train_df = train_knn_df.copy()
test_df = test_knn_df.copy()
numeric_columns = set(keys)
categorical = [
    column
    for column in train_knn_df.columns
    if column not in numeric_columns and column != TARGET
]
for column in categorical:
    train_filled = train_df[column].fillna(value='unknown')
    test_filled = test_df[column].fillna(value='unknown')
    # Fit on the union of train and test labels so a category that appears
    # only in the test set can still be transformed.
    values = train_filled.unique().tolist() + test_filled.unique().tolist()
    encoder = LabelEncoder()
    # LabelEncoder's documented signatures are fit(y) / transform(y). The data
    # is passed positionally so the call does not rely on a keyword name.
    encoder.fit(values)
    train_df[column] = encoder.transform(train_filled)
    test_df[column] = encoder.transform(test_filled)
print(train_df.shape, test_df.shape)

(1460, 80) (1459, 79)


In [3]:
from lightgbm import Dataset
from lightgbm import train
from sklearn.model_selection import train_test_split


# Hold out 20% of the labelled rows so boosting can be monitored on data the
# trees were not fitted to. The fixed random_state keeps the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(
    train_df.drop(columns=[TARGET]),
    train_df[TARGET],
    test_size=0.20,
    random_state=2024,
)
train_data = Dataset(data=X_train, label=y_train)
# The validation Dataset references the training Dataset, as the LightGBM
# documentation asks for validation data.
valid_data = Dataset(data=X_valid, label=y_valid, reference=train_data)

parameters = {
    'bagging_fraction' : 8e-1,
    'bagging_freq' : 5,
    'boosting_type' : 'gbdt',
    # Stop once the validation RMSE has not improved for this many rounds.
    # Without early stopping `model.best_iteration` is never set to a real
    # round, so prediction always used all `num_boost_round` trees and the
    # validation set had no effect on the final model.
    'early_stopping_round' : 50,
    'feature_fraction' : 9e-1,
    'learning_rate' : 5e-2,
    'metric' : 'rmse',
    'num_leaves': 31,
    'objective' : 'regression',
    'verbose' : 0,
}

# num_boost_round is now an upper bound; early stopping picks the actual count.
model = train(
    params=parameters,
    train_set=train_data,
    valid_sets=[valid_data],
    num_boost_round=1000,
)

# Predict the test set with the round that scored best on the validation split.
result = model.predict(data=test_df, num_iteration=model.best_iteration)

In [4]:
# Destination of the competition submission file.
SUBMISSION = '/kaggle/working/submission.csv'

# Put the predictions next to the test ids, then write the two-column table
# without the pandas index.
submission_df[TARGET] = result
submission_df.to_csv(SUBMISSION, index=False)
print('done.')

done.
