In [None]:
# Install dependencies (notebook shell command; pip skips packages that are already installed)
!pip install pandas numpy scikit-learn xgboost lightgbm seaborn matplotlib geopandas folium

# Import required libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import geopandas as gpd
import folium

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb

# Load dataset (modify path if needed)
df = pd.read_csv("house_prices.csv")

# Define target variable
target = "SalePrice"

# Drop columns with too many missing values. errors="ignore" lets the cell
# run on extracts of the data that lack some of these columns.
df.drop(columns=["Alley", "PoolQC", "Fence", "MiscFeature"], inplace=True, errors="ignore")

# A row without a target cannot be trained or scored on. Remove such rows up
# front; otherwise the numeric imputer below would fill the missing SalePrice
# with the median and the models would learn from invented labels.
df.dropna(subset=[target], inplace=True)
df.reset_index(drop=True, inplace=True)

# Fill missing values for numerical and categorical columns.
# The target is still part of num_cols, but it has no gaps left at this point,
# so the median imputer passes its values through unchanged.
num_cols = df.select_dtypes(include=["number"]).columns
cat_cols = df.select_dtypes(include=["object"]).columns

num_imputer = SimpleImputer(strategy="median")
cat_imputer = SimpleImputer(strategy="most_frequent")

# NOTE(review): the imputers and the encoder are fit on every row, i.e. before
# the train/test split further down, so test-set statistics leak into the
# preprocessing. Fitting them on the training fold only (e.g. inside a
# Pipeline/ColumnTransformer) would give a cleaner RMSE estimate.
df[num_cols] = num_imputer.fit_transform(df[num_cols])
df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])

# Encode categorical variables. The encoded frame is built on df's own index
# so the column-wise concat below lines rows up by construction.
encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
encoded_cats = pd.DataFrame(
    encoder.fit_transform(df[cat_cols]),
    columns=encoder.get_feature_names_out(cat_cols),
    index=df.index,
)

# Combine processed numerical and categorical data
df_processed = pd.concat([df[num_cols], encoded_cats], axis=1)

# Separate features from the target
X = df_processed.drop(columns=[target])
y = df_processed[target]

# One seed shared by the split and by every stochastic estimator, so the
# printed RMSEs and the fitted models can be reproduced run to run.
RANDOM_STATE = 42

# Split dataset into training and test sets (80 % / 20 %)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Candidate regression models, keyed by the label used in the report below.
# The linear models are deterministic; the tree ensembles receive the seed.
models = {
    "Linear Regression": LinearRegression(),
    "Ridge Regression": Ridge(alpha=1.0),
    "Gradient Boosting": GradientBoostingRegressor(
        n_estimators=200, learning_rate=0.05, random_state=RANDOM_STATE
    ),
    "XGBoost": xgb.XGBRegressor(
        n_estimators=200, learning_rate=0.05, random_state=RANDOM_STATE
    ),
    "LightGBM": lgb.LGBMRegressor(
        n_estimators=200, learning_rate=0.05, random_state=RANDOM_STATE
    ),
}

# Train each model on the training fold and report its RMSE on the test fold.
# The fitted estimators stay in `models` for later inspection.
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print(f"{name} RMSE: {rmse:.2f}")

# Geospatial visualization -- skipped entirely unless the data has both
# coordinate columns.
if {"Longitude", "Latitude"}.issubset(df.columns):
    # Attach a point geometry built from the coordinate columns.
    points = gpd.points_from_xy(df["Longitude"], df["Latitude"])
    gdf = gpd.GeoDataFrame(df, geometry=points)

    # Interactive map centred on the mean coordinate of all rows.
    map_center = [df["Latitude"].mean(), df["Longitude"].mean()]
    m = folium.Map(location=map_center, zoom_start=10)

    # Every marker shares the same look (uniform blue); the sale price is
    # exposed through the popup text only.
    marker_style = {
        "radius": 5,
        "color": "blue",
        "fill": True,
        "fill_color": "blue",
        "fill_opacity": 0.5,
    }
    for _, house in gdf.iterrows():
        marker = folium.CircleMarker(
            location=[house["Latitude"], house["Longitude"]],
            popup=f"Price: ${house['SalePrice']:,}",
            **marker_style,
        )
        marker.add_to(m)

    m.save("real_estate_map.html")
    print("\n📍 Geospatial price visualization saved as 'real_estate_map.html'.")

# Feature importance, read from the fitted Gradient Boosting model and
# ranked from most to least important.
gb_model = models["Gradient Boosting"]
importance_by_feature = pd.Series(gb_model.feature_importances_, index=X.columns)
feature_importances = importance_by_feature.sort_values(ascending=False)

# Horizontal bar chart of the ten highest-ranked features.
top_features = feature_importances.head(10)
plt.figure(figsize=(10, 5))
sns.barplot(x=top_features.values, y=top_features.index)
plt.title("Top 10 Important Features for Price Prediction")
plt.xlabel("Feature Importance Score")
plt.show()
