In [9]:

import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error


df = pd.read_csv("train.csv")


df['HouseAge'] = df['YrSold'] - df['YearBuilt']
df['RemodAge'] = df['YrSold'] - df['YearRemodAdd']
df['TotalBath'] = df['FullBath'] + 0.5*df['HalfBath']
df['TotalSF'] = df['TotalBsmtSF'] + df['1stFlrSF'] + df['2ndFlrSF'] + df['GarageArea']

num_cols = df.select_dtypes(include=['int64','float64']).columns
for col in num_cols:
    df[col] = df[col].fillna(df[col].median())

cat_cols = df.select_dtypes(include=['object']).columns
for col in cat_cols:
    df[col] = df[col].fillna("None")


ordered_cols = ['ExterQual','ExterCond','HeatingQC','KitchenQual']
for col in ordered_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str))

df = pd.get_dummies(df, drop_first=True)


X = df.drop("SalePrice", axis=1)
y = df["SalePrice"]


X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)


n_estimators_list = [100, 200]
max_depth_list = [10, 20, None]
min_samples_split_list = [2, 5]

best_rmse = float('inf')
best_params = {}
best_rf = None

for n in n_estimators_list:
    for depth in max_depth_list:
        for split in min_samples_split_list:
            rf = RandomForestRegressor(n_estimators=n, max_depth=depth, min_samples_split=split,
                                       random_state=42, n_jobs=-1)
            rf.fit(X_train, y_train)
            y_pred = rf.predict(X_valid)
            rmse = np.sqrt(mean_squared_error(y_valid, y_pred))
            if rmse < best_rmse:
                best_rmse = rmse
                best_params = {"n_estimators": n, "max_depth": depth, "min_samples_split": split}
                best_rf = rf

print("Best Parameters:", best_params)
print("Best RMSE on validation set:", best_rmse)


test_df = pd.read_csv("test.csv")
test_ids = test_df['Id']


test_df['HouseAge'] = test_df['YrSold'] - test_df['YearBuilt']
test_df['RemodAge'] = test_df['YrSold'] - test_df['YearRemodAdd']
test_df['TotalBath'] = test_df['FullBath'] + 0.5*test_df['HalfBath']
test_df['TotalSF'] = test_df['TotalBsmtSF'] + test_df['1stFlrSF'] + test_df['2ndFlrSF'] + test_df['GarageArea']


num_cols_test = test_df.select_dtypes(include=['int64','float64']).columns
for col in num_cols_test:
    if col in X_train.columns:
        test_df[col] = test_df[col].fillna(X_train[col].median())
    else:
        test_df[col] = test_df[col].fillna(0)

cat_cols_test = test_df.select_dtypes(include=['object']).columns
for col in cat_cols_test:
    test_df[col] = test_df[col].fillna("None")

for col in ordered_cols:
    if col in test_df.columns:
        le = LabelEncoder()
        # fit on combined train+test to avoid unseen labels
        le.fit(list(df[col].astype(str).values) + list(test_df[col].astype(str).values))
        test_df[col] = le.transform(test_df[col].astype(str))


test_df = pd.get_dummies(test_df, drop_first=True)


test_df = test_df.reindex(columns=X_train.columns, fill_value=0)


y_test_pred = best_rf.predict(test_df)


submission = pd.DataFrame({'Id': test_ids, 'SalePrice': y_test_pred})
submission.to_csv("submission.csv", index=False)
print("Submission file created successfully!")


Best Parameters: {'n_estimators': 200, 'max_depth': 10, 'min_samples_split': 5}
Best RMSE on validation set: 31231.007912285677
Submission file created successfully!
