In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error, r2_score

# Load datasets
# Both CSVs are read from the same input directory, so the path is named once.
DATA_DIR = "/kaggle/input/house-prices-advanced-regression-techniques"
train_data = pd.read_csv(f"{DATA_DIR}/train.csv")
test_data = pd.read_csv(f"{DATA_DIR}/test.csv")

# Drop rows with missing target
train_data = train_data.dropna(subset=["SalePrice"])

# Separate target variable.
# "Id" is removed from both feature frames; the test ids are kept aside in
# `test_ids` so they remain available after the column is dropped.
y = train_data["SalePrice"]
X = train_data.drop(columns=["Id", "SalePrice"])
test_ids = test_data["Id"]
test_data = test_data.drop(columns=["Id"])

# Identify categorical & numerical features (column lists come from the
# training features and are applied to both frames below)
categorical_features = X.select_dtypes(include=["object"]).columns
numerical_features = X.select_dtypes(include=[np.number]).columns

# Handle missing values for numerical features.
# The medians are computed once from the training features and reused for the
# test set, so both frames are imputed with the same per-column values instead
# of the test set being filled with statistics derived from itself.
numeric_medians = X[numerical_features].median()
X[numerical_features] = X[numerical_features].fillna(numeric_medians)
test_data[numerical_features] = test_data[numerical_features].fillna(numeric_medians)

# Handle missing values for categorical features: NaN becomes its own
# explicit "Missing" category in both frames
X[categorical_features] = X[categorical_features].fillna("Missing")
test_data[categorical_features] = test_data[categorical_features].fillna("Missing")

# Integer-encode every categorical column, keeping each fitted encoder
# under its column name in `label_encoders`.
label_encoders = {}
for feature in categorical_features:
    # Fit on the train and test values together so that every label present
    # in either frame is known to the encoder before transforming.
    combined_values = pd.concat([X[feature], test_data[feature]], axis=0)
    encoder = LabelEncoder().fit(combined_values)
    label_encoders[feature] = encoder

    # Replace the string labels with their integer codes in both frames.
    X[feature] = encoder.transform(X[feature])
    test_data[feature] = encoder.transform(test_data[feature])

# Hold out 20% of the rows as a validation set (fixed seed for repeatability)
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Build the Random Forest regressor and fit it on the training portion
model = RandomForestRegressor(
    n_estimators=300,
    max_depth=15,
    min_samples_split=5,
    random_state=42,
).fit(X_train, y_train)

# Score the held-out rows
y_pred = model.predict(X_valid)
r2 = r2_score(y_valid, y_pred)
mae = mean_absolute_error(y_valid, y_pred)

# "Accuracy" as reported here is 100 minus the MAE expressed as a percentage
# of the mean actual validation target.
mean_actual = np.mean(y_valid)
accuracy = 100 - (mae / mean_actual * 100)

# Report the validation metrics
print(f"Validation MAE: {mae:.2f}")
print(f"Validation Accuracy: {accuracy:.2f}%")
print(f"R² Score: {r2:.4f}")

# Predict on the preprocessed test features with the fitted model
predictions = model.predict(test_data)

# Assemble the submission table (Id column first, then the predicted
# SalePrice) and write it to the working directory without the index
submission = pd.DataFrame({"Id": test_ids})
submission["SalePrice"] = predictions
submission.to_csv("submission.csv", index=False)
print("✅ Submission file created!")

Validation MAE: 17575.47
Validation Accuracy: 90.17%
R² Score: 0.8912
✅ Submission file created!
