In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [3]:
# Load the raw dataset; the relative path is resolved against the working directory
csv_path = 'house_prices.csv'
df = pd.read_csv(csv_path)

In [4]:
# 1. Data Preprocessing

# Drop columns with too many missing values.
# DataFrame.drop returns a new frame, so the raw `df` is left untouched.
df_cleaned = df.drop(columns=['Alley', 'PoolQC', 'Fence', 'MiscFeature'])

# Fill missing numerical values with the per-column median
numerical_columns = df_cleaned.select_dtypes(include=['float64', 'int64']).columns
df_cleaned[numerical_columns] = df_cleaned[numerical_columns].fillna(df_cleaned[numerical_columns].median())

# Fill missing categorical values with the mode (most frequent value)
categorical_columns = df_cleaned.select_dtypes(include=['object']).columns
for column in categorical_columns:
    column_modes = df_cleaned[column].mode()
    if column_modes.empty:
        # The column holds no non-missing value, so there is nothing to fill with.
        continue
    # Assign the result back instead of calling fillna(inplace=True) on
    # df_cleaned[column]: the in-place form operates on an intermediate object
    # and does not update df_cleaned when pandas Copy-on-Write is active.
    df_cleaned[column] = df_cleaned[column].fillna(column_modes[0])


In [5]:
# Integer-encode every categorical column. One LabelEncoder is created per
# column and kept in `label_encoders`, keyed by column name, so each fitted
# encoder stays available after the loop.
label_encoders = {column: LabelEncoder() for column in categorical_columns}
for column, le in label_encoders.items():
    # Values are cast to str before fitting, so each entry is encoded as text.
    df_cleaned[column] = le.fit_transform(df_cleaned[column].astype(str))

In [6]:
# Standardize the numeric columns of df_cleaned with StandardScaler.
# NOTE(review): numerical_columns was selected from all of df_cleaned, so if
# 'Id' and 'Property_Sale_Price' are numeric they are standardized here too,
# and downstream errors/predictions would be in standardized units rather than
# prices. The scaler is also fit on every row, before the train/test split.
# Confirm both are intended.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df_cleaned[numerical_columns])
df_cleaned[numerical_columns] = scaled_values

# Separate the target from the predictors; 'Id' is excluded from the features
y = df_cleaned['Property_Sale_Price']
X = df_cleaned.drop(columns=['Id', 'Property_Sale_Price'])


In [7]:
# 2. Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 3. Hyperparameter Tuning: candidate values to be explored by GridSearchCV
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5, 10],
)

In [8]:
# Search every combination in param_grid with 5-fold cross-validation,
# scoring by negative MSE and using all available cores (n_jobs=-1).
rf_model = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

In [10]:
# Estimator selected by the grid search
best_rf_model = grid_search.best_estimator_

# 4. Predict on the held-out test set and measure the mean squared error
y_pred = best_rf_model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)

In [11]:
# Report the test error, the winning hyperparameters, and the first five predictions
report_lines = [
    f'Mean Squared Error: {mse}',
    f'Best Parameters: {grid_search.best_params_}',
    f'Sample Predictions: {y_pred[:5]}',
]
print('\n'.join(report_lines))


Mean Squared Error: 0.12567924404884617
Best Parameters: {'max_depth': 20, 'min_samples_split': 5, 'n_estimators': 100}
Sample Predictions: [-0.50458802  1.81910199 -0.82265606 -0.22021661  1.79692256]


In [12]:
# 5. Feature Importance Analysis
# Label each importance score with its feature name, then rank from highest to lowest
feature_importances = pd.Series(data=best_rf_model.feature_importances_, index=X.columns)
important_features = feature_importances.sort_values(ascending=False)

# Output top 10 important features
top_features = important_features.head(10)
print("\nTop 10 Important Features:", top_features, sep="\n")



Top 10 Important Features:
OverallQual    0.563069
GrLivArea      0.122022
2ndFlrSF       0.035896
TotalBsmtSF    0.035344
BsmtFinSF1     0.029448
1stFlrSF       0.026374
LotArea        0.017345
GarageArea     0.016231
YearBuilt      0.012130
GarageCars     0.012053
dtype: float64
