In [2]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from sklearn.preprocessing import LabelEncoder

# Load the raw training and test tables. low_memory=False makes pandas read
# each file in one pass instead of inferring dtypes chunk by chunk.
train_df = pd.read_csv("Train.csv", low_memory=False)
test_df = pd.read_csv("Test.csv", low_memory=False)

# Replace the raw 'saledate' column with numeric year and month features.
# Both frames get exactly the same treatment so their columns stay aligned.
for frame in (train_df, test_df):
    sale_dates = pd.to_datetime(frame['saledate'])
    frame['saleYear'] = sale_dates.dt.year
    frame['saleMonth'] = sale_dates.dt.month
    frame.drop(columns='saledate', inplace=True)
# Fraction of missing values per column, measured on the training data only.
missing = train_df.isna().sum().div(len(train_df))

# Columns that are more than 80% empty in training are removed everywhere.
drop_cols = [name for name, fraction in missing.items() if fraction > 0.80]

train_df.drop(columns=drop_cols, inplace=True)
# errors='ignore' skips any of these columns that the test table does not have.
test_df.drop(columns=drop_cols, errors='ignore', inplace=True)

# --- Missing-value imputation -------------------------------------------
# Text (object-dtype) columns: missing entries become the literal category
# "None" in both tables.
cat_cols_train = train_df.select_dtypes(include='object').columns
cat_cols_test = test_df.select_dtypes(include='object').columns

train_df[cat_cols_train] = train_df[cat_cols_train].fillna("None")
test_df[cat_cols_test] = test_df[cat_cols_test].fillna("None")

# Numeric feature columns of the training table; the target 'SalePrice' is
# excluded so it is never imputed.
num_cols_train = train_df.select_dtypes(include=['float64', 'int64']).columns.drop('SalePrice')

# Medians are computed once, from the training data, and reused for the test
# table below so both tables are preprocessed with the same statistics.
train_medians = {col: train_df[col].median() for col in num_cols_train}
for col in num_cols_train:
    train_df[col] = train_df[col].fillna(train_medians[col])

num_cols_test = test_df.select_dtypes(include=['float64', 'int64']).columns
for col in num_cols_test:
    fill_value = train_medians.get(col)
    # Fall back to the test column's own median only when no usable training
    # median exists (column is not numeric in train, or its median is NaN).
    if fill_value is None or pd.isna(fill_value):
        fill_value = test_df[col].median()
    test_df[col] = test_df[col].fillna(fill_value)

# Stack train features (target removed) on top of the test rows so every
# text column is encoded with one shared mapping across both tables.
n_train_rows = len(train_df)
combined = pd.concat([train_df.drop(columns='SalePrice'), test_df], axis=0)

# One fitted LabelEncoder per text column, kept so the integer codes can be
# mapped back to the original labels later.
label_encoders = {}
object_columns = combined.select_dtypes(include='object').columns
for col in object_columns:
    encoder = LabelEncoder()
    # astype(str) gives the encoder uniform string input for every row.
    combined[col] = encoder.fit_transform(combined[col].astype(str))
    label_encoders[col] = encoder

# Split back by position: the first n_train_rows rows are the training rows.
train_encoded = combined.iloc[:n_train_rows]
test_encoded = combined.iloc[n_train_rows:]

# Feature matrix and target for supervised training.
X, y = train_encoded, train_df['SalePrice']

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible.
# NOTE(review): this is a random split; if sale time matters for how the model
# will be used, a time-ordered split may give a more honest estimate - confirm.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# 100-tree random forest using all available cores.
model = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=42)
model.fit(X_train, y_train)

# Score the hold-out set with root mean squared logarithmic error.
valid_preds = model.predict(X_valid)
msle = mean_squared_log_error(y_valid, valid_preds)
rmsle = np.sqrt(msle)
print(f"RMSLE : {rmsle:.4f}")


# Predict on the encoded test rows. A 'SalePrice' column is dropped if one is
# present; errors='ignore' makes this a no-op otherwise.
test_features = test_encoded.drop(columns=["SalePrice"], errors='ignore')
test_preds = model.predict(test_features)

# Two-column submission table: the sale identifier and its predicted price.
submission = test_df[["SalesID"]].assign(SalePrice=test_preds)

submission.to_csv("test_predictions.csv", index=False)



RMSLE : 0.2159
