In [29]:
import os
import datetime
import numpy as np
import pandas as pd
import joblib
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.ensemble import RandomForestRegressor, StackingRegressor
import xgboost as xgb
import lightgbm as lgb

In [30]:
# Try different encodings if necessary: 'utf-8', 'latin1', 'iso-8859-1'
data = pd.read_csv("df_FATURE ENG_DATA.csv", encoding="utf-8")

# Display a sample of the data
data.head()

Unnamed: 0,price,rooms,bathroom,lift,terrace,square_meters,real_state,neighborhood,square_meters_price
0,750,3,1,True,False,60.0,flat,Horta- Guinardo,12.5
1,770,2,1,True,False,59.0,flat,Sant Andreu,13.050847
2,1300,1,1,True,True,30.0,flat,Gràcia,43.333333
3,2800,1,1,True,True,70.0,flat,Ciutat Vella,40.0
4,720,2,1,True,False,44.0,flat,Sant Andreu,16.363636


In [31]:
# Load Data
    #data = pd.read_csv(data_path, encoding="utf-8")  # New data WITHOUT MISSING VALUES
    #data.columns = data.columns.str.strip()  # Removes leading/trailing spaces

# Outlier handling (drop values >3 standard deviations)
data = data[(np.abs(data.select_dtypes(include=np.number).apply(zscore)) < 3).all(axis=1)]


In [32]:

# Feature Selection
target = "price"
features = ['rooms', 'bathroom', 'lift', 'terrace', 'square_meters', 'real_state', 'neighborhood']    

In [33]:
# Create dummy variables for categorical features
data = pd.get_dummies(data, columns=['real_state', 'neighborhood'], drop_first=False)
for feature, baseline in {'real_state': "flat", 'neighborhood': "Eixample"}.items():
    if f"{feature}_{baseline}" in data.columns:
        data.drop(columns=[f"{feature}_{baseline}"], inplace=True)

In [35]:
data.head()

Unnamed: 0,price,rooms,bathroom,lift,terrace,square_meters,square_meters_price,real_state_apartment,real_state_attic,real_state_study,neighborhood_Ciutat Vella,neighborhood_Gràcia,neighborhood_Horta- Guinardo,neighborhood_Les Corts,neighborhood_Nou Barris,neighborhood_Sant Andreu,neighborhood_Sant Martí,neighborhood_Sants-Montjuïc,neighborhood_Sarria-Sant Gervasi
0,750,3,1,1,0,60.0,12.5,0,0,0,0,0,1,0,0,0,0,0,0
1,770,2,1,1,0,59.0,13.050847,0,0,0,0,0,0,0,0,1,0,0,0
2,1300,1,1,1,1,30.0,43.333333,0,0,0,0,1,0,0,0,0,0,0,0
3,2800,1,1,1,1,70.0,40.0,0,0,0,1,0,0,0,0,0,0,0,0
4,720,2,1,1,0,44.0,16.363636,0,0,0,0,0,0,0,0,1,0,0,0


In [34]:
# Convert boolean columns to numeric (0 and 1)
bool_cols = data.select_dtypes(['bool']).columns
data[bool_cols] = data[bool_cols].astype(int)

In [24]:
model_features=data.columns

In [26]:
X = data[model_features]
y = data[target]


In [28]:
X.head()

Unnamed: 0,price,rooms,bathroom,lift,terrace,square_meters,square_meters_price,real_state_apartment,real_state_attic,real_state_study,neighborhood_Ciutat Vella,neighborhood_Gràcia,neighborhood_Horta- Guinardo,neighborhood_Les Corts,neighborhood_Nou Barris,neighborhood_Sant Andreu,neighborhood_Sant Martí,neighborhood_Sants-Montjuïc,neighborhood_Sarria-Sant Gervasi
0,750,3,1,1,0,60.0,12.5,0,0,0,0,0,1,0,0,0,0,0,0
1,770,2,1,1,0,59.0,13.050847,0,0,0,0,0,0,0,0,1,0,0,0
2,1300,1,1,1,1,30.0,43.333333,0,0,0,0,1,0,0,0,0,0,0,0
3,2800,1,1,1,1,70.0,40.0,0,0,0,1,0,0,0,0,0,0,0,0
4,720,2,1,1,0,44.0,16.363636,0,0,0,0,0,0,0,0,1,0,0,0


In [27]:

# Apply Log Transformation to Reduce Skewness
y = np.log1p(y)


In [None]:

# Create Polynomial & Interaction Features
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_poly = poly.fit_transform(X)


In [None]:

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.2, random_state=42)

In [None]:





    # Standardize Features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Define Base Models
    rf = RandomForestRegressor(n_estimators=300, max_depth=20, min_samples_split=5, random_state=42)
    xgbr = xgb.XGBRegressor(n_estimators=300, max_depth=10, learning_rate=0.05, random_state=42)
    lgbr = lgb.LGBMRegressor(n_estimators=300, max_depth=10, learning_rate=0.05, random_state=42)

    # Stacking Model
    stacked_model = StackingRegressor(
        estimators=[("rf", rf), ("xgb", xgbr), ("lgb", lgbr)],
        final_estimator=xgb.XGBRegressor(n_estimators=100, learning_rate=0.1, max_depth=5, random_state=42)
    )

    # Train Model
    stacked_model.fit(X_train, y_train)

    # Evaluate Model
    r2_score = stacked_model.score(X_test, y_test)
    print(f"Improved R² Score: {r2_score:.4f}")

    # Create models directory if it doesn't exist
    os.makedirs("models", exist_ok=True)

    # Get current date
    current_date = datetime.datetime.now().strftime("%Y-%m-%d")

    # Export the best model
    joblib.dump(stacked_model, f"models/stacked_model_at_{current_date}.pkl")
    joblib.dump(scaler, f"models/scaler_at_{current_date}.pkl")
    joblib.dump(poly, f"models/poly_at_{current_date}.pkl")
    
    print("Models saved successfully!")