In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [None]:
# Locations of the CSV files, relative to the notebook's working directory.
train_path, test_path = "train.csv", "test.csv"
submission_path = "sample_submission.csv"

# Read the training and test tables into DataFrames.
train_df, test_df = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

In [None]:
# Keep the test identifiers for the submission file, then strip the identifier
# column from both frames so it is not used as a feature.
id_column = "Id"
test_ids = test_df[id_column]
train_df, test_df = (frame.drop(columns=[id_column]) for frame in (train_df, test_df))

In [None]:
# Partition the training columns by dtype: numeric columns vs. everything else.
num_cols = list(train_df.select_dtypes(include=["number"]).columns)
cat_cols = list(train_df.select_dtypes(exclude=["number"]).columns)

In [None]:
# The target must not be treated as a feature. The membership guard keeps this
# cell idempotent: a bare list.remove() raises ValueError when the cell is
# re-run after "SalePrice" has already been taken out.
if "SalePrice" in num_cols:
    num_cols.remove("SalePrice")

In [None]:
# One imputer per column group:
#   - categorical columns are filled with the most frequent value
#   - numeric columns are filled with the median
imputer_cat = SimpleImputer(strategy="most_frequent")
imputer_num = SimpleImputer(strategy="median")

In [None]:
# Fill gaps one column group at a time: each imputer learns its fill values
# from the training frame and then applies those same values to the test frame.
for imputer, cols in ((imputer_num, num_cols), (imputer_cat, cat_cols)):
    train_df[cols] = imputer.fit_transform(train_df[cols])
    test_df[cols] = imputer.transform(test_df[cols])

In [None]:
# Integer-encode every categorical column. Each encoder is fitted on the
# training values only; test values the encoder has not seen are routed to an
# extra "Unknown" class appended after fitting.
label_encoders = {}
for col in cat_cols:
    le = LabelEncoder()
    train_df[col] = le.fit_transform(train_df[col])

    # Labels learned from the training column (captured before "Unknown" is added).
    known_labels = set(le.classes_)
    test_df[col] = test_df[col].map(
        lambda value: value if value in known_labels else "Unknown"
    )

    # Register the placeholder so transform() accepts it.
    le.classes_ = np.append(le.classes_, "Unknown")
    test_df[col] = le.transform(test_df[col])

    label_encoders[col] = le

In [None]:
# Features are the columns present in both frames, kept in train_df's column
# order. Building this list through a set intersection would give an order that
# can change between kernel sessions (string hashing is randomized per
# process), and feature order can influence the fitted forest even with a fixed
# random_state, so an ordered list keeps runs reproducible.
common_cols = [name for name in train_df.columns if name in test_df.columns]

X = train_df[common_cols]
y = train_df["SalePrice"]

# Explicit copy so X_test can be modified later without writing into a slice of
# test_df (which would trigger pandas' SettingWithCopyWarning).
X_test = test_df[common_cols].copy()

In [None]:
# Hold out 20% of the training rows for validation; the seed fixes the split.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Random forest with 100 trees and a fixed seed, fitted on the training split.
model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
)
model.fit(X_train, y_train)


In [None]:
# Score the model on the held-out split: RMSE is the square root of the MSE.
y_pred = model.predict(X_valid)
valid_mse = mean_squared_error(y_valid, y_pred)
rmse = np.sqrt(valid_mse)
print(f"Validation RMSE: {rmse}")

In [None]:
# Safety net for any NaNs still present in the test features: fill them with
# the median of the same column in the training features.
# X_test is rebound to the frame returned by fillna() instead of being assigned
# column by column; writing columns into a frame that was sliced out of test_df
# is what triggers pandas' SettingWithCopyWarning. Only columns that actually
# contain NaNs are touched.
missing_values = X_test.isnull().sum()
cols_with_gaps = missing_values[missing_values > 0].index
if len(cols_with_gaps) > 0:
    X_test = X_test.fillna(X[cols_with_gaps].median())

In [None]:
# Predict the target for the test features with the fitted model.
test_predictions = model.predict(X_test)

In [None]:
# Pair each test Id with its predicted SalePrice and write the result as CSV
# (without the DataFrame index column).
submission = pd.DataFrame(
    {
        "Id": test_ids,
        "SalePrice": test_predictions,
    }
)
submission.to_csv("submission.csv", index=False)