In [29]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
# Load the labelled training data and pull out the regression target.
df = pd.read_csv("train.csv")
y = df["SalePrice"]

# Predictors: everything except "Id" and the target, with categorical columns
# one-hot encoded (the first level of each category is dropped).
X = pd.get_dummies(df.drop(columns=["Id", "SalePrice"]), drop_first=True)

# Train-validation split (60% train, 40% validation) -- test_size=0.4
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=42,
)

print("Train shape:", X_train.shape)
print("Validation shape:", X_val.shape)

# Standardise every feature column (one-hot columns included). The scaler is
# fitted on the training split only and then re-used for the validation split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)


def _rmse(y_true, y_pred):
    """Return the root-mean-squared error between targets and predictions."""
    return np.sqrt(mean_squared_error(y_true, y_pred))


# Random Forest Regressor
rf = RandomForestRegressor(n_estimators=123, random_state=42)
rf.fit(X_train_scaled, y_train)
rf_preds = rf.predict(X_val_scaled)

# XGBoost Regressor
xgb = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
xgb.fit(X_train_scaled, y_train)
xgb_preds = xgb.predict(X_val_scaled)

# Evaluation (RMSE on the held-out validation split)
rf_rmse = _rmse(y_val, rf_preds)
xgb_rmse = _rmse(y_val, xgb_preds)

print("Random Forest RMSE on Validation:", rf_rmse)
print("XGBoost RMSE on Validation:", xgb_rmse)



Train shape: (876, 244)
Validation shape: (584, 244)
Random Forest RMSE on Validation: 30284.243134406872
XGBoost RMSE on Validation: 33550.81352217856


In [30]:
# Fit the final XGBoost model and write predictions for the test set.
#
# Inputs from the earlier training cell: X, y, X_train, X_val, y_train, y_val.
# Outputs: X_all, X_test, scaler, model, val_preds, val_rmse, test_preds,
# submission, and the file "submission.csv".
test_df = pd.read_csv("test.csv")

# Build the feature matrices explicitly so the cell survives
# "Restart Kernel -> Run All". The previous version sliced a frame called
# `combined` that this notebook never creates.
X_all = X

# Encode the test predictors WITHOUT drop_first, then align to the training
# columns. Dropping the first level on the test frame alone could drop a
# different level than on the training frame and silently shift the encoding.
# Levels never seen in training are discarded; training levels absent from the
# test set become all-zero columns.
X_test = (
    pd.get_dummies(test_df.drop(columns=["Id"]))
    .reindex(columns=X_all.columns, fill_value=0)
)
assert list(X_test.columns) == list(X_all.columns), "train/test columns misaligned"
assert len(X_test) == len(test_df), "test rows were lost during encoding"

# --- Validation pass: scaler statistics come from the training split only ---
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

model = XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42)
model.fit(X_train_scaled, y_train)

val_preds = model.predict(X_val_scaled)
val_rmse = np.sqrt(mean_squared_error(y_val, val_preds))
print("Validation RMSE:", val_rmse)

# --- Final fit on all labelled rows ---
# The scaler is refitted on X_all, so the test matrix must be transformed with
# that SAME refitted scaler. Previously X_test_scaled was produced by the
# train-split scaler, i.e. with different means/variances than the data the
# final model was trained on.
X_all_scaled = scaler.fit_transform(X_all)
X_test_scaled = scaler.transform(X_test)

model.fit(X_all_scaled, y)
test_preds = model.predict(X_test_scaled)

submission = pd.DataFrame({
    "Id": test_df["Id"],
    "SalePrice": test_preds
})

submission.to_csv("submission.csv", index=False)
print("Submission file saved as 'submission.csv'.")

Validation RMSE: 33550.81352217856
Submission file saved as 'submission.csv'.


In [31]:
# Inspect the raw test frame: per-column non-null counts and dtypes.
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 80 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1459 non-null   int64  
 1   MSSubClass     1459 non-null   int64  
 2   MSZoning       1455 non-null   object 
 3   LotFrontage    1232 non-null   float64
 4   LotArea        1459 non-null   int64  
 5   Street         1459 non-null   object 
 6   Alley          107 non-null    object 
 7   LotShape       1459 non-null   object 
 8   LandContour    1459 non-null   object 
 9   Utilities      1457 non-null   object 
 10  LotConfig      1459 non-null   object 
 11  LandSlope      1459 non-null   object 
 12  Neighborhood   1459 non-null   object 
 13  Condition1     1459 non-null   object 
 14  Condition2     1459 non-null   object 
 15  BldgType       1459 non-null   object 
 16  HouseStyle     1459 non-null   object 
 17  OverallQual    1459 non-null   int64  
 18  OverallC

In [32]:
# Sanity check: one prediction per row of the validation split.
print(val_preds.shape)

(584,)


In [33]:
# Sanity check: (rows, columns) of the test feature matrix.
X_test.shape

(1459, 244)