In [4]:
pip install joblib

Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error

# Columns used by the model: five predictors plus the regression target.
FEATURE_COLUMNS = ['OverallQual', 'YearBuilt', 'TotalBsmtSF', '1stFlrSF', 'GrLivArea']
TARGET_COLUMN = 'SalePrice'

# Load the data and keep only the modelling columns.
# Rows with a missing target are dropped first, so the forward fill that
# follows only imputes feature values and never invents a sale price from the
# preceding row. The chain builds a new frame instead of mutating a column
# subset in place, which keeps the cell idempotent on re-run.
data = (
    pd.read_csv("train.csv")[FEATURE_COLUMNS + [TARGET_COLUMN]]
    .dropna(subset=[TARGET_COLUMN])
    .ffill()  # Forward fill for simplicity
)

# Split the data into features and target
X = data[FEATURE_COLUMNS]
y = data[TARGET_COLUMN]

# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyperparameters for the gradient-boosted regressor: 1000 boosting iterations,
# trees of depth 6, learning rate 0.1, optimising RMSE.
catboost_params = dict(
    iterations=1000,
    depth=6,
    learning_rate=0.1,
    loss_function='RMSE',
)
model = CatBoostRegressor(**catboost_params)

# verbose=100 prints a progress line every 100 iterations.
model.fit(X_train, y_train, verbose=100)

# Predict on both splits so the training error can be compared with the
# held-out error.
y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

# Evaluate the model.
# The `squared=False` flag of mean_squared_error was deprecated in
# scikit-learn 1.4 and removed in 1.6, so take the square root of the MSE
# explicitly; this form works on every scikit-learn version.
train_rmse = mean_squared_error(y_train, y_pred_train) ** 0.5
test_rmse = mean_squared_error(y_test, y_pred_test) ** 0.5

print("Train RMSE:", train_rmse)
print("Test RMSE:", test_rmse)


0:	learn: 71996.9525532	total: 53.7ms	remaining: 53.6s
100:	learn: 22953.8302150	total: 139ms	remaining: 1.23s
200:	learn: 19665.0990863	total: 221ms	remaining: 878ms
300:	learn: 17144.0042575	total: 307ms	remaining: 712ms
400:	learn: 15387.6776472	total: 388ms	remaining: 580ms
500:	learn: 13947.5359498	total: 469ms	remaining: 467ms
600:	learn: 12800.8750076	total: 552ms	remaining: 367ms
700:	learn: 11670.9236604	total: 633ms	remaining: 270ms
800:	learn: 10804.3838988	total: 715ms	remaining: 178ms
900:	learn: 10065.4545648	total: 798ms	remaining: 87.7ms
999:	learn: 9306.6663193	total: 880ms	remaining: 0us
Train RMSE: 9306.666319253523
Test RMSE: 28637.558247799414


In [2]:
import joblib

# Requires the fitted `model` object from the training cell.

# Destination file for the serialized model.
model_file_path = 'catboost_model_house.cbm'

# Write the model to disk using CatBoost's own `cbm` format.
model.save_model(fname=model_file_path, format="cbm")

In [3]:
# Show the first held-out row (all feature columns) as a sample model input.
X_test.iloc[0, :]

OverallQual       6
YearBuilt      1963
TotalBsmtSF    1059
1stFlrSF       1068
GrLivArea      1068
Name: 892, dtype: int64