In [None]:
#Import libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler , OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import gc
import warnings
from google.colab import files
import os
warnings.filterwarnings('ignore')
#set plot style
sns.set(style="whitegrid")

In [None]:
#Step1 : Mount Google Drive and load dataset
print("Mounting Google Drive to access dataset")
try:
  from google.colab import drive
  drive.mount('/content/drive')
  #define path to dataset
  data_folder = '/content/drive/My Drive/data/'
  data_file=data_folder + 'Property_with_Feature_Engineering.csv'
  #check if file exists
  if not os.path.exists(data_file):
    raise FileNotFoundError(f"Property_with_Feature_Engineering.csv not found in {data_folder}")

  #Load dataset with specified dtype and encoding
  numeric_cols=['price','latitude','longitude','baths','area_marla','area_sqft','bedrooms','year','month','day']
  dtypes={'property_id':str,'location_id':str,'page_url':str,'property_type':str,'price_bin':str,
          'location':str,'city':str,'province_name':str,'locality':str,'purpose':str,'agency':str,'agent':str}
  converters={col : lambda x: pd.to_numeric(x,errors='coerce') for col in numeric_cols}
  df= pd.read_csv(data_file,encoding='latin1',sep=',',dtype=dtypes,converters=converters)
  print(f"Dataset loaded successfully , shape: {df.shape}")

  #clean columns names
  df.columns=[col.strip().strip('"').lower() for col in df.columns]
  #Inspect data
  print("\n==== Dataset Info ====")
  print(f"Rows: {len(df)}")
  print("Columns: ",df.columns.tolist())
  print("\nData Types: \n",df.dtypes)
  print("\nNulls: \n",df.isnull().sum())
  print("Duplicates: ",df.duplicated().sum())
  print("\nSample Data : \n",df.head())

except Exception as e :
  print(f"Error loading dataset: {e}")
  exit()

Mounting Google Drive to access dataset
Mounted at /content/drive
Dataset loaded successfully , shape: (191393, 24)

==== Dataset Info ====
Rows: 191393
Columns:  ['property_id', 'location_id', 'page_url', 'property_type', 'price', 'price_bin', 'location', 'city', 'province_name', 'locality', 'latitude', 'longitude', 'baths', 'area', 'area_marla', 'area_sqft', 'purpose', 'bedrooms', 'date_added', 'year', 'month', 'day', 'agency', 'agent']

Data Types: 
 property_id       object
location_id       object
page_url          object
property_type     object
price              int64
price_bin         object
location          object
city              object
province_name     object
locality          object
latitude         float64
longitude        float64
baths              int64
area              object
area_marla       float64
area_sqft        float64
purpose           object
bedrooms           int64
date_added        object
year               int64
month              int64
day              

In [None]:
#Step 2 : Dropping unnecessary columns
print("\n==== Data Cleaning ====")
try :
  # Verify if 'area' is redundant with 'area_marla' or 'area_sqft'
  print("Checking 'area' vs. 'area_marla' and 'area_sqft'...")
  df['area']=pd.to_numeric(df['area'],errors='coerce') #convert 'area' to numeric
  area_correlation=df[['area','area_marla','area_sqft']].corr()
  print("Correlation between area columns:\n", area_correlation)
  if area_correlation.loc['area','area_marla']>0.9 or area_correlation.loc['area','area_sqft']>0.9:
    print("'area' is highly correlated with 'area_marla' or 'area_sqft', dropping 'area'")
    df=df.drop(columns=['area'])
  else:
    print("'area' not dropped, keeping for now ")

  #Drop Specified columns
  columns_to_drop=['property_id', 'location_id', 'page_url', 'date_added', 'province_name', 'agency', 'agent']
  df=df.drop(columns=[col for col in columns_to_drop if col in df.columns])
  print(f"Columns dropped: {columns_to_drop}")
  #Define numerical and categorical columns
  numerical_cols=['price', 'latitude', 'longitude', 'baths', 'area_marla', 'area_sqft', 'bedrooms', 'year', 'month', 'day']
  if 'area' in df.columns:
    numerical_cols.append('area')
  categorical_cols=['property_type', 'price_bin', 'location', 'city', 'locality', 'purpose']
  #Handling missing values
  for col in numerical_cols:
    if col in df.columns:
      df[col]=df[col].fillna(df[col].median())
  for col in categorical_cols:
    if col in df.columns:
      df[col]=df[col].fillna(df[col].mode()[0])

  #Verfiy no nulls remain
  print("\nNulls After Imputation:")
  print(df.isnull().sum())

  #Inspect updated dataset
  print("\nUpdated Dataset Info:")
  print(f"Rows : {len(df)}")
  print("Columns: ",df.columns.tolist())
  print("\nSample Data: \n", df.head())

except Exception as e :
  print(f"Error in data cleaning: {e}")
  exit()


==== Data Cleaning ====
Checking 'area' vs. 'area_marla' and 'area_sqft'...
Correlation between area columns:
             area  area_marla  area_sqft
area         NaN         NaN        NaN
area_marla   NaN         1.0        1.0
area_sqft    NaN         1.0        1.0
'area' not dropped, keeping for now 
Columns dropped: ['property_id', 'location_id', 'page_url', 'date_added', 'province_name', 'agency', 'agent']

Nulls After Imputation:
property_type         0
price                 0
price_bin             0
location              0
city                  0
locality              0
latitude              0
longitude             0
baths                 0
area             191393
area_marla            0
area_sqft             0
purpose               0
bedrooms              0
year                  0
month                 0
day                   0
dtype: int64

Updated Dataset Info:
Rows : 191393
Columns:  ['property_type', 'price', 'price_bin', 'location', 'city', 'locality', 'latitude', 'lon

In [None]:
#Step3 : Data Preprocessing (drop area,encode features,remove outliers,scale)
print("\n==== Data Preprocessing ====")
try:
  #drop 'area'
  df=df.drop(columns=['area'])
  print("Dropped 'area' due to invalid format and redundancy with 'area_marla'/'area_sqft'")
  #define numerical and categorical columns
  numerical_cols=['latitude','longitude','baths','area_marla','bedrooms','year','month','day']
  categorical_cols=['property_type','price_bin','location','city','locality','purpose']
  target='price'
  #Limit high-cardinality categorical features
  for col in ['locality','city','location']:
    if col in df.columns:
      top_values=df[col].value_counts().head(50).index
      df[col]=df[col].where(df[col].isin(top_values),'Other')
      print(f"Limited '{col}' to top 50 values")
  #Remove outliers using IQR for numerical columns
  print("\nRemoving outliers...")
  for col in numerical_cols + [target]:
    if col in df.columns:
      Q1=df[col].quantile(0.25)
      Q3=df[col].quantile(0.75)
      IQR=Q3-Q1
      lower_bound=Q1-1.5*IQR
      upper_bound=Q3+1.5*IQR
      outliers=df[(df[col]<lower_bound) | (df[col]>upper_bound)][col]
  #Log-transform skewed target ('price') and numerical features
  skewed_cols= ['price','area_marla','area_sqft']
  for col in skewed_cols:
    if col in df.columns:
      df[col]=np.log1p(df[col])
      print(f"Log-transformed '{col}' to reduce skewness")
  #Create preprocessing pipline
  numerical_transformer=Pipeline(steps=[('scaler',StandardScaler())])
  categorical_transformer=Pipeline(steps=[('onehot',OneHotEncoder(handle_unknown='ignore',sparse_output=True))])
  preprocessor=ColumnTransformer(
      transformers=[
          ('num',numerical_transformer,numerical_cols),
          ('cat',categorical_transformer,categorical_cols)
      ])
  #applying preprocessing to features
  features=numerical_cols + categorical_cols
  X=df[features]
  y=df[target]
  X_preprocessed=preprocessor.fit_transform(X)
  #saving the preprocessor for later use
  joblib.dump(preprocessor,'/content/drive/My Drive/data/preprocessor.pkl')
  print("Saved to 'preprocessor.pkl'")
  #Inspect preprocessed data
  print("\nPreprocessed Data Info")
  print(f"X shape: {X_preprocessed.shape}")
  print(f"y shape: {y.shape}")
  print(f"Remaining rows after outlier removal: {len(df)}")
  print("Feature names:", preprocessor.get_feature_names_out().tolist()[:10], "...")

except Exception as e :
  print(f"Error in data preprocessing: {e}")
  exit()



==== Data Preprocessing ====
Dropped 'area' due to invalid format and redundancy with 'area_marla'/'area_sqft'
Limited 'locality' to top 50 values
Limited 'city' to top 50 values
Limited 'location' to top 50 values

Removing outliers...
Log-transformed 'price' to reduce skewness
Log-transformed 'area_marla' to reduce skewness
Log-transformed 'area_sqft' to reduce skewness
Saved to 'preprocessor.pkl'

Preprocessed Data Info
X shape: (191393, 128)
y shape: (191393,)
Remaining rows after outlier removal: 191393
Feature names: ['num__latitude', 'num__longitude', 'num__baths', 'num__area_marla', 'num__bedrooms', 'num__year', 'num__month', 'num__day', 'cat__property_type_Farm House', 'cat__property_type_Flat'] ...


In [None]:
# Step 4: Optimized Data Splitting with Correct Saving
print("\n=== Optimized Data Splitting ===")
try:
    from sklearn.model_selection import train_test_split
    import pandas as pd
    import numpy as np
    import os
    from scipy.sparse import issparse

    # Reload raw DataFrame to access 'price_bin' and filter price
    raw_df = df.copy()  # Use df from previous cells

    # Filter out rows where price <= 0
    initial_rows = len(raw_df)
    raw_df = raw_df[raw_df['price'] > 0]
    filtered_rows = len(raw_df)
    print(f"Filtered out {initial_rows - filtered_rows} rows with price <= 0. Remaining rows: {filtered_rows}")

    # Create stratified train/test split using 'price_bin'
    X = raw_df.drop(columns=['price'])
    y = raw_df['price']
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=raw_df['price_bin']
    )
    print(f"Training set shape: X_train_raw {X_train_raw.shape}, y_train {y_train.shape}")
    print(f"Test set shape: X_test_raw {X_test_raw.shape}, y_test {y_test.shape}")

    # Verify stratification
    print("\nPrice_bin distribution in train vs. test:")
    train_price_bin_dist = X_train_raw['price_bin'].value_counts(normalize=True)
    test_price_bin_dist = X_test_raw['price_bin'].value_counts(normalize=True)
    print("Train:\n", train_price_bin_dist)
    print("Test:\n", test_price_bin_dist)

    # Apply preprocessor and convert to dense arrays
    data_path = '/content/drive/My Drive/data/'
    preprocessor = joblib.load(data_path + 'preprocessor.pkl')
    X_train = preprocessor.transform(X_train_raw)
    X_test = preprocessor.transform(X_test_raw)
    if issparse(X_train):
        X_train = X_train.toarray()
    if issparse(X_test):
        X_test = X_test.toarray()
    print(f"Preprocessed training set shape: {X_train.shape}")
    print(f"Preprocessed test set shape: {X_test.shape}")

    # Save raw and preprocessed data
    os.makedirs(data_path, exist_ok=True)
    # Save raw data as CSV
    X_train_raw.to_csv(data_path + 'X_train_raw.csv', index=False)
    y_train.to_csv(data_path + 'y_train.csv', index=False)
    X_test_raw.to_csv(data_path + 'X_test_raw.csv', index=False)
    y_test.to_csv(data_path + 'y_test.csv', index=False)
    # Save preprocessed data as .npy (dense arrays)
    np.save(data_path + 'X_train.npy', X_train)
    np.save(data_path + 'y_train.npy', y_train)
    np.save(data_path + 'X_test.npy', X_test)
    np.save(data_path + 'y_test.npy', y_test)
    print("Saved raw (X_train_raw.csv, y_train.csv, X_test_raw.csv, y_test.csv) and preprocessed (X_train.npy, y_train.npy, X_test.npy, y_test.npy) to Google Drive")

    # Verify numerical feature stats
    print("\nPrice stats (log-transformed) in train vs. test:")
    print("Train price stats:\n", pd.Series(y_train).describe())
    print("Test price stats:\n", pd.Series(y_test).describe())

except Exception as e:
    print(f"Error in optimized data splitting: {e}")
    exit()


=== Optimized Data Splitting ===
Filtered out 3 rows with price <= 0. Remaining rows: 191390
Training set shape: X_train_raw (153112, 15), y_train (153112,)
Test set shape: X_test_raw (38278, 15), y_test (38278,)

Price_bin distribution in train vs. test:
Train:
 price_bin
Low          0.262148
High         0.251385
Medium       0.245454
Very High    0.241013
Name: proportion, dtype: float64
Test:
 price_bin
Low          0.262135
High         0.251372
Medium       0.245467
Very High    0.241026
Name: proportion, dtype: float64
Preprocessed training set shape: (153112, 128)
Preprocessed test set shape: (38278, 128)
Saved raw (X_train_raw.csv, y_train.csv, X_test_raw.csv, y_test.csv) and preprocessed (X_train.npy, y_train.npy, X_test.npy, y_test.npy) to Google Drive

Price stats (log-transformed) in train vs. test:
Train price stats:
 count    153112.000000
mean         14.596523
std           2.815985
min           0.693147
25%          11.289794
50%          15.789592
75%          16.

In [None]:
# Step 5: Optimized Model Training and Evaluation
print("\n=== Optimized Model Training and Evaluation ===")
try:
    from xgboost import XGBRegressor
    from catboost import CatBoostRegressor
    from sklearn.model_selection import KFold, RandomizedSearchCV
    from sklearn.pipeline import Pipeline
    from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
    import numpy as np
    import pandas as pd
    import joblib
    import os
    from google.colab import files
    from scipy.sparse import issparse

    # Load preprocessed data
    data_path = '/content/drive/My Drive/data/'
    X_train = np.load(data_path + 'X_train.npy', allow_pickle=True)
    y_train = np.load(data_path + 'y_train.npy', allow_pickle=True)
    X_test = np.load(data_path + 'X_test.npy', allow_pickle=True)
    y_test = np.load(data_path + 'y_test.npy', allow_pickle=True)

    # Convert sparse matrices to dense if necessary
    if issparse(X_train):
        print("Converting X_train from sparse to dense...")
        X_train = X_train.toarray()
    if issparse(X_test):
        print("Converting X_test from sparse to dense...")
        X_test = X_test.toarray()

    # Verify shapes
    print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
    print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

    # Check if shapes are valid
    if len(X_train.shape) != 2 or len(X_test.shape) != 2:
        raise ValueError("X_train or X_test is not a 2D array")

    # Define models
    models = {
        'XGBoost': XGBRegressor(random_state=42, n_jobs=1),
        'CatBoost': CatBoostRegressor(random_state=42, verbose=0, thread_count=1)
    }

    # Define hyperparameter grid for XGBoost
    param_dist = {
        'model__learning_rate': [0.01, 0.05, 0.1, 0.3],
        'model__max_depth': [5, 7, 9],
        'model__n_estimators': [100, 200, 300],
        'model__subsample': [0.7, 0.9]
    }

    # Load preprocessor for feature names
    preprocessor = joblib.load(data_path + 'preprocessor.pkl')
    feature_names = preprocessor.get_feature_names_out().tolist()

    # Initialize results
    results = {}
    best_model = None
    best_r2 = -float('inf')
    best_model_name = None

    # Cross-validation setup
    kf = KFold(n_splits=5, shuffle=True, random_state=42)

    for model_name, model in models.items():
        print(f"\nTraining and tuning {model_name}...")
        try:
            if model_name == 'XGBoost':
                # Create pipeline for XGBoost
                pipeline = Pipeline([('model', model)])
                # Randomized search for XGBoost
                search = RandomizedSearchCV(
                    pipeline,
                    param_distributions=param_dist,
                    n_iter=10,
                    cv=kf,
                    scoring='neg_mean_squared_error',
                    random_state=42,
                    n_jobs=1
                )
                search.fit(X_train, y_train)
                best_estimator = search.best_estimator_
                print(f"Best parameters for {model_name}: {search.best_params_}")
            else:
                # CatBoost (no pipeline needed)
                best_estimator = model
                best_estimator.fit(X_train, y_train)

            # Evaluate on test set (original price scale)
            y_pred_log = best_estimator.predict(X_test)
            y_pred = np.expm1(y_pred_log)  # Inverse of log1p
            y_test_orig = np.expm1(y_test)  # Inverse of log1p
            rmse = np.sqrt(mean_squared_error(y_test_orig, y_pred))
            mae = mean_absolute_error(y_test_orig, y_pred)
            r2 = r2_score(y_test_orig, y_pred)

            results[model_name] = {'RMSE': rmse, 'MAE': mae, 'R2': r2}
            print(f"{model_name} Results (original price scale):")
            print(f"RMSE: {rmse:.2f}")
            print(f"MAE: {mae:.2f}")
            print(f"R2: {r2:.2f}")

            # Save model if it has the best R2
            if r2 > best_r2:
                best_r2 = r2
                best_model = best_estimator
                best_model_name = model_name
                model_filename = data_path + f'{model_name.lower()}_best_model.pkl'
                joblib.dump(best_estimator, model_filename, compress=3)
                print(f"Saved {model_name} model to {model_filename}")

            # Feature importance
            if model_name in ['XGBoost', 'CatBoost']:
                importance = best_estimator.get_feature_importance() if model_name == 'CatBoost' else best_estimator.named_steps['model'].feature_importances_
                importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importance})
                print(f"\nTop 5 features for {model_name}:")
                print(importance_df.sort_values(by='Importance', ascending=False).head(5))

        except Exception as e:
            print(f"Error training {model_name}: {e}")

    # Compare models
    print("\nModel Comparison:")
    if results:
        results_df = pd.DataFrame(results).T
        print(results_df.sort_values(by='R2', ascending=False))
    else:
        print("No models trained successfully")

    # Download best model
    if best_model is not None:
        files.download(model_filename)
        print(f"Downloaded best model: {model_filename}")

except Exception as e:
    print(f"Error in model training: {e}")
    exit()

In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [None]:
# Step 6: Fixed and Optimized Model Refinement and Validation
print("\n=== Fixed and Optimized Model Refinement and Validation ===")
try:
    from xgboost import XGBRegressor
    from catboost import CatBoostRegressor
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.model_selection import KFold, RandomizedSearchCV, train_test_split
    from sklearn.pipeline import Pipeline
    from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
    from sklearn.preprocessing import StandardScaler, OneHotEncoder
    from sklearn.compose import ColumnTransformer
    import numpy as np
    import pandas as pd
    import joblib
    import os
    from google.colab import files
    from scipy.sparse import issparse

    # Load raw data
    data_path = '/content/drive/My Drive/data/'
    X_train_raw = pd.read_csv(data_path + 'X_train_raw.csv')
    X_test_raw = pd.read_csv(data_path + 'X_test_raw.csv')
    y_train = np.load(data_path + 'y_train.npy', allow_pickle=True)
    y_test = np.load(data_path + 'y_test.npy', allow_pickle=True)

    # Drop price_bin
    X_train_raw = X_train_raw.drop(columns=['price_bin'])
    X_test_raw = X_test_raw.drop(columns=['price_bin'])
    print("Dropped 'price_bin' to prevent leakage")

    # Feature engineering
    X_train_raw['bedrooms_area_marla'] = X_train_raw['bedrooms'] * X_train_raw['area_marla']
    X_test_raw['bedrooms_area_marla'] = X_test_raw['bedrooms'] * X_test_raw['area_marla']
    X_train_raw['baths_area_marla'] = X_train_raw['baths'] * X_train_raw['area_marla']
    X_test_raw['baths_area_marla'] = X_test_raw['baths'] * X_test_raw['area_marla']
    X_train_raw['lat_lon_interaction'] = X_train_raw['latitude'] * X_train_raw['longitude']
    X_test_raw['lat_lon_interaction'] = X_test_raw['latitude'] * X_test_raw['longitude']
    print("Added features: bedrooms_area_marla, baths_area_marla, lat_lon_interaction")

    # Define numerical and categorical columns
    numerical_cols = ['latitude', 'longitude', 'baths', 'area_marla', 'bedrooms', 'year', 'month', 'day',
                     'bedrooms_area_marla', 'baths_area_marla', 'lat_lon_interaction']
    categorical_cols = ['property_type', 'location', 'city', 'locality', 'purpose']

    # Check for infinities or NaNs
    if np.any(np.isinf(X_train_raw[numerical_cols])) or np.any(np.isnan(X_train_raw[numerical_cols])):
        print("Warning: Infinities or NaNs detected, replacing with median")
        for col in numerical_cols:
            X_train_raw[col] = X_train_raw[col].replace([np.inf, -np.inf], np.nan).fillna(X_train_raw[col].median())
            X_test_raw[col] = X_test_raw[col].replace([np.inf, -np.inf], np.nan).fillna(X_train_raw[col].median())

    # Update preprocessor (keep sparse output)
    numerical_transformer = Pipeline(steps=[('scaler', StandardScaler())])
    categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=True))])
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ])
    X_train = preprocessor.fit_transform(X_train_raw)
    X_test = preprocessor.transform(X_test_raw)
    print(f"Preprocessed shapes: X_train {X_train.shape}, X_test {X_test.shape}")

    # Save preprocessor
    joblib.dump(preprocessor, data_path + 'preprocessor_final.pkl')
    print("Saved final preprocessor to 'preprocessor_final.pkl'")

    # Create holdout set
    X_test, X_holdout, y_test, y_holdout = train_test_split(
        X_test, y_test, test_size=0.2, random_state=42
    )
    print(f"Test set shape: {X_test.shape}, Holdout set shape: {X_holdout.shape}")

    # Train and tune XGBoost
    print("\nTuning XGBoost...")
    pipeline = Pipeline([('xgb', XGBRegressor(random_state=42, n_jobs=1, early_stopping_rounds=10))])
    param_dist_xgb = {
        'xgb__learning_rate': [0.05, 0.1],
        'xgb__max_depth': [5, 7],
        'xgb__n_estimators': [100, 200],
        'xgb__subsample': [0.8, 1.0]
    }
    search = RandomizedSearchCV(
        pipeline,
        param_distributions=param_dist_xgb,
        n_iter=5,
        cv=KFold(n_splits=3, shuffle=True, random_state=42),
        scoring='neg_mean_squared_error',
        random_state=42,
        n_jobs=1
    )
    search.fit(X_train, y_train, xgb__eval_set=[(X_test, y_test)], xgb__verbose=False)
    xgb_model = search.best_estimator_
    print(f"Best parameters for XGBoost: {search.best_params_}")

    # Train CatBoost and RandomForest
    print("\nTraining CatBoost and RandomForest...")
    cat_model = CatBoostRegressor(random_state=42, verbose=0, thread_count=1, early_stopping_rounds=10)
    rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=1)
    cat_model.fit(X_train, y_train, eval_set=(X_test, y_test))
    rf_model.fit(X_train, y_train)

    # Feature selection with lower threshold
    importance = xgb_model.named_steps['xgb'].feature_importances_
    feature_names = preprocessor.get_feature_names_out().tolist()
    importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importance})
    top_features = importance_df[importance_df['Importance'] > 0.001]['Feature'].tolist()
    print(f"\nSelected {len(top_features)} features with importance > 0.001:")
    print(importance_df.sort_values(by='Importance', ascending=False).head(5))

    # Filter data
    feature_indices = [feature_names.index(f) for f in top_features]
    X_train_filtered = X_train[:, feature_indices]
    X_test_filtered = X_test[:, feature_indices]
    X_holdout_filtered = X_holdout[:, feature_indices]
    print(f"Filtered shapes: X_train {X_train_filtered.shape}, X_test {X_test_filtered.shape}, X_holdout {X_holdout_filtered.shape}")

    # Retrain models on filtered features
    print("\nRetraining models on filtered features...")
    xgb_model_filtered = XGBRegressor(
        **{k.split('__')[1]: v for k, v in search.best_params_.items()},
        random_state=42, n_jobs=1, early_stopping_rounds=10
    )
    cat_model_filtered = CatBoostRegressor(random_state=42, verbose=0, thread_count=1, early_stopping_rounds=10)
    rf_model_filtered = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=1)
    xgb_model_filtered.fit(X_train_filtered, y_train, eval_set=[(X_test_filtered, y_test)], verbose=False)
    cat_model_filtered.fit(X_train_filtered, y_train, eval_set=(X_test_filtered, y_test))
    rf_model_filtered.fit(X_train_filtered, y_train)

    # Weighted ensemble predictions
    print("\nEvaluating Weighted Ensemble...")
    y_pred_xgb_log = xgb_model_filtered.predict(X_test_filtered)
    y_pred_cat_log = cat_model_filtered.predict(X_test_filtered)
    y_pred_rf_log = rf_model_filtered.predict(X_test_filtered)
    weights = [0.5, 0.3, 0.2]  # XGBoost: 50%, CatBoost: 30%, RandomForest: 20%
    y_pred_log = weights[0] * y_pred_xgb_log + weights[1] * y_pred_cat_log + weights[2] * y_pred_rf_log
    y_pred = np.expm1(y_pred_log)
    y_test_orig = np.expm1(y_test)
    rmse = np.sqrt(mean_squared_error(y_test_orig, y_pred))
    mae = mean_absolute_error(y_test_orig, y_pred)
    r2 = r2_score(y_test_orig, y_pred)
    print("Weighted Ensemble Test Results (original price scale):")
    print(f"RMSE: {rmse:.2f}")
    print(f"MAE: {mae:.2f}")
    print(f"R2: {r2:.2f}")

    # Validate on holdout set
    y_pred_xgb_log_holdout = xgb_model_filtered.predict(X_holdout_filtered)
    y_pred_cat_log_holdout = cat_model_filtered.predict(X_holdout_filtered)
    y_pred_rf_log_holdout = rf_model_filtered.predict(X_holdout_filtered)
    y_pred_log_holdout = weights[0] * y_pred_xgb_log_holdout + weights[1] * y_pred_cat_log_holdout + weights[2] * y_pred_rf_log_holdout
    y_pred_holdout = np.expm1(y_pred_log_holdout)
    y_holdout_orig = np.expm1(y_holdout)
    r2_holdout = r2_score(y_holdout_orig, y_pred_holdout)
    print(f"Weighted Ensemble Holdout R2: {r2_holdout:.2f}")

    # Save best model (XGBoost) and artifacts
    model_filename = data_path + 'xgboost_final_model.pkl'
    joblib.dump(xgb_model_filtered, model_filename, compress=3)
    importance_df.to_csv(data_path + 'feature_importance.csv', index=False)
    print(f"Saved XGBoost model to {model_filename}")
    print("Saved feature importance to 'feature_importance.csv'")

    # Save selected feature indices for deployment
    joblib.dump(feature_indices, data_path + 'selected_feature_indices.pkl')
    print("Saved selected feature indices to 'selected_feature_indices.pkl'")

    # Download artifacts
    files.download(model_filename)
    files.download(data_path + 'preprocessor_final.pkl')
    files.download(data_path + 'feature_importance.csv')
    files.download(data_path + 'selected_feature_indices.pkl')
    print(f"Downloaded {model_filename}, preprocessor_final.pkl, feature_importance.csv, selected_feature_indices.pkl")

except Exception as e:
    print(f"Error in model refinement: {e}")
    exit()


In [None]:
# Step 8: Best Optimized Model to Achieve R² ≈ 0.88
print("\n=== Best Optimized Model to Achieve R² ≈ 0.88 ===")
try:
    from xgboost import XGBRegressor
    from catboost import CatBoostRegressor
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.model_selection import train_test_split
    from sklearn.pipeline import Pipeline
    from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
    from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
    from sklearn.compose import ColumnTransformer
    from sklearn.cluster import KMeans
    import numpy as np
    import pandas as pd
    import joblib
    import os
    from google.colab import files
    from scipy.sparse import issparse

    # Load raw data
    data_path = '/content/drive/My Drive/data/'
    X_train_raw = pd.read_csv(data_path + 'X_train_raw.csv')
    X_test_raw = pd.read_csv(data_path + 'X_test_raw.csv')
    y_train = np.load(data_path + 'y_train.npy', allow_pickle=True)
    y_test = np.load(data_path + 'y_test.npy', allow_pickle=True)

    # Optional: Sample 70% of training data to reduce runtime (uncomment if needed)
    # sample_frac = 0.7
    # X_train_raw, _, y_train, _ = train_test_split(
    #     X_train_raw, y_train, train_size=sample_frac, random_state=42
    # )
    # print(f"Sampled training data: {X_train_raw.shape[0]} rows")

    # Drop price_bin
    X_train_raw = X_train_raw.drop(columns=['price_bin'])
    X_test_raw = X_test_raw.drop(columns=['price_bin'])
    print("Dropped 'price_bin' to prevent leakage")

    # Feature engineering
    X_train_raw['bedrooms_area_marla'] = X_train_raw['bedrooms'] * X_train_raw['area_marla']
    X_test_raw['bedrooms_area_marla'] = X_test_raw['bedrooms'] * X_test_raw['area_marla']
    X_train_raw['baths_area_marla'] = X_train_raw['baths'] * X_train_raw['area_marla']
    X_test_raw['baths_area_marla'] = X_test_raw['baths'] * X_test_raw['area_marla']
    X_train_raw['lat_lon_interaction'] = X_train_raw['latitude'] * X_train_raw['longitude']
    X_test_raw['lat_lon_interaction'] = X_test_raw['latitude'] * X_test_raw['longitude']

    # Add distance to city center
    city_centers = {
        'Lahore': (31.5204, 74.3587),
        'Karachi': (24.8607, 67.0011),
        'Islamabad': (33.6844, 73.0479)
    }
    def calculate_distance(row, city):
        if row['city'] in city_centers:
            lat_center, lon_center = city_centers[row['city']]
            return np.sqrt((row['latitude'] - lat_center)**2 + (row['longitude'] - lon_center)**2)
        return 0.0
    X_train_raw['distance_to_center'] = X_train_raw.apply(lambda row: calculate_distance(row, row['city']), axis=1)
    X_test_raw['distance_to_center'] = X_test_raw.apply(lambda row: calculate_distance(row, row['city']), axis=1)

    # Add location clusters
    kmeans = KMeans(n_clusters=10, random_state=42)
    X_train_raw['location_cluster'] = kmeans.fit_predict(X_train_raw[['latitude', 'longitude']])
    X_test_raw['location_cluster'] = kmeans.predict(X_test_raw[['latitude', 'longitude']])

    # Define numerical and categorical columns
    numerical_cols = ['latitude', 'longitude', 'baths', 'area_marla', 'bedrooms', 'year', 'month', 'day',
                     'bedrooms_area_marla', 'baths_area_marla', 'lat_lon_interaction', 'distance_to_center']
    categorical_cols = ['property_type', 'location', 'city', 'locality', 'purpose', 'location_cluster']

    # Add polynomial features for area_marla and bedrooms
    poly = PolynomialFeatures(degree=2, include_bias=False)
    poly_cols = ['area_marla', 'bedrooms']
    poly_features_train = poly.fit_transform(X_train_raw[poly_cols])
    poly_features_test = poly.transform(X_test_raw[poly_cols])
    poly_feature_names = poly.get_feature_names_out(poly_cols)
    for i, name in enumerate(poly_feature_names):
        X_train_raw[f'poly_{name}'] = poly_features_train[:, i]
        X_test_raw[f'poly_{name}'] = poly_features_test[:, i]
    numerical_cols.extend([f'poly_{name}' for name in poly_feature_names])

    # Check for infinities or NaNs
    if np.any(np.isinf(X_train_raw[numerical_cols])) or np.any(np.isnan(X_train_raw[numerical_cols])):
        print("Warning: Infinities or NaNs detected, replacing with median")
        for col in numerical_cols:
            X_train_raw[col] = X_train_raw[col].replace([np.inf, -np.inf], np.nan).fillna(X_train_raw[col].median())
            X_test_raw[col] = X_test_raw[col].replace([np.inf, -np.inf], np.nan).fillna(X_train_raw[col].median())

    # Update preprocessor
    numerical_transformer = Pipeline(steps=[('scaler', StandardScaler())])
    categorical_transformer = Pipeline(steps=[('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=True))])
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numerical_transformer, numerical_cols),
            ('cat', categorical_transformer, categorical_cols)
        ])
    X_train = preprocessor.fit_transform(X_train_raw)
    X_test = preprocessor.transform(X_test_raw)
    print(f"Preprocessed shapes: X_train {X_train.shape}, X_test {X_test.shape}")

    # Save preprocessor
    joblib.dump(preprocessor, data_path + 'preprocessor_final.pkl')
    print("Saved final preprocessor to 'preprocessor_final.pkl'")

    # Create holdout set
    X_test, X_holdout, y_test, y_holdout = train_test_split(
        X_test, y_test, test_size=0.2, random_state=42
    )
    print(f"Test set shape: {X_test.shape}, Holdout set shape: {X_holdout.shape}")

    # Train models with optimized parameters
    print("\nTraining models...")
    xgb_model = XGBRegressor(
        learning_rate=0.1, max_depth=7, n_estimators=200, subsample=0.8,
        random_state=42, n_jobs=1, early_stopping_rounds=10
    )
    cat_model = CatBoostRegressor(random_state=42, verbose=0, thread_count=1, early_stopping_rounds=10)
    rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=1)
    xgb_model.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False)
    cat_model.fit(X_train, y_train, eval_set=(X_test, y_test))
    rf_model.fit(X_train, y_train)

    # Feature selection
    importance = xgb_model.feature_importances_
    feature_names = preprocessor.get_feature_names_out().tolist()
    importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importance})
    top_features = importance_df[importance_df['Importance'] > 0.0001]['Feature'].tolist()
    print(f"\nSelected {len(top_features)} features with importance > 0.0001:")
    print(importance_df.sort_values(by='Importance', ascending=False).head(5))

    # Filter data
    feature_indices = [feature_names.index(f) for f in top_features]
    X_train_filtered = X_train[:, feature_indices]
    X_test_filtered = X_test[:, feature_indices]
    X_holdout_filtered = X_holdout[:, feature_indices]
    print(f"Filtered shapes: X_train {X_train_filtered.shape}, X_test {X_test_filtered.shape}, X_holdout {X_holdout_filtered.shape}")

    # Retrain models on filtered features
    print("\nRetraining models on filtered features...")
    xgb_model_filtered = XGBRegressor(
        learning_rate=0.1, max_depth=7, n_estimators=200, subsample=0.8,
        random_state=42, n_jobs=1, early_stopping_rounds=10
    )
    cat_model_filtered = CatBoostRegressor(random_state=42, verbose=0, thread_count=1, early_stopping_rounds=10)
    rf_model_filtered = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=1)
    xgb_model_filtered.fit(X_train_filtered, y_train, eval_set=[(X_test_filtered, y_test)], verbose=False)
    cat_model_filtered.fit(X_train_filtered, y_train, eval_set=(X_test_filtered, y_test))
    rf_model_filtered.fit(X_train_filtered, y_train)

    # Evaluate weighted ensemble
    print("\nEvaluating Weighted Ensemble...")
    y_pred_xgb_log = xgb_model_filtered.predict(X_test_filtered)
    y_pred_cat_log = cat_model_filtered.predict(X_test_filtered)
    y_pred_rf_log = rf_model_filtered.predict(X_test_filtered)
    weights = [0.6, 0.3, 0.1]  # XGBoost: 60%, CatBoost: 30%, RandomForest: 10%
    y_pred_log = weights[0] * y_pred_xgb_log + weights[1] * y_pred_cat_log + weights[2] * y_pred_rf_log
    y_pred = np.expm1(y_pred_log)
    y_test_orig = np.expm1(y_test)
    rmse = np.sqrt(mean_squared_error(y_test_orig, y_pred))
    mae = mean_absolute_error(y_test_orig, y_pred)
    r2 = r2_score(y_test_orig, y_pred)
    print("Weighted Ensemble Test Results (original price scale):")
    print(f"RMSE: {rmse:.2f}")
    print(f"MAE: {mae:.2f}")
    print(f"R2: {r2:.2f}")

    # Validate on holdout set
    y_pred_xgb_log_holdout = xgb_model_filtered.predict(X_holdout_filtered)
    y_pred_cat_log_holdout = cat_model_filtered.predict(X_holdout_filtered)
    y_pred_rf_log_holdout = rf_model_filtered.predict(X_holdout_filtered)
    y_pred_log_holdout = weights[0] * y_pred_xgb_log_holdout + weights[1] * y_pred_cat_log_holdout + weights[2] * y_pred_rf_log_holdout
    y_pred_holdout = np.expm1(y_pred_log_holdout)
    y_holdout_orig = np.expm1(y_holdout)
    r2_holdout = r2_score(y_holdout_orig, y_pred_holdout)
    print(f"Weighted Ensemble Holdout R2: {r2_holdout:.2f}")

    # Save artifacts
    model_filename = data_path + 'xgboost_final_model.pkl'
    joblib.dump(xgb_model_filtered, model_filename, compress=3)
    joblib.dump(cat_model_filtered, data_path + 'catboost_final_model.pkl', compress=3)
    joblib.dump(rf_model_filtered, data_path + 'rf_final_model.pkl', compress=3)
    importance_df.to_csv(data_path + 'feature_importance.csv', index=False)
    joblib.dump(feature_indices, data_path + 'selected_feature_indices.pkl')
    joblib.dump(weights, data_path + 'ensemble_weights.pkl')
    print(f"Saved XGBoost model to {model_filename}")
    print(f"Saved CatBoost model to 'catboost_final_model.pkl'")
    print(f"Saved RandomForest model to 'rf_final_model.pkl'")
    print("Saved feature importance to 'feature_importance.csv'")
    print("Saved selected feature indices to 'selected_feature_indices.pkl'")
    print("Saved ensemble weights to 'ensemble_weights.pkl'")

    # Download artifacts
    files.download(model_filename)
    files.download(data_path + 'catboost_final_model.pkl')
    files.download(data_path + 'rf_final_model.pkl')
    files.download(data_path + 'preprocessor_final.pkl')
    files.download(data_path + 'feature_importance.csv')
    files.download(data_path + 'selected_feature_indices.pkl')
    files.download(data_path + 'ensemble_weights.pkl')
    print(f"Downloaded {model_filename}, catboost_final_model.pkl, rf_final_model.pkl, preprocessor_final.pkl, feature_importance.csv, selected_feature_indices.pkl, ensemble_weights.pkl")

except Exception as e:
    print(f"Error in model optimization: {e}")
    exit()


=== Best Optimized Model to Achieve R² ≈ 0.88 ===
Dropped 'price_bin' to prevent leakage
Preprocessed shapes: X_train (153112, 143), X_test (38278, 143)
Saved final preprocessor to 'preprocessor_final.pkl'
Test set shape: (30622, 143), Holdout set shape: (7656, 143)

Training models...

Selected 88 features with importance > 0.0001:
                                               Feature  Importance
131                              cat__purpose_For Rent    0.931306
3                                      num__area_marla    0.010590
19                            cat__property_type_House    0.006050
93   cat__locality_DHA Defence, Islamabad, Islamaba...    0.002944
79                                cat__city_Rawalpindi    0.002750
Filtered shapes: X_train (153112, 88), X_test (30622, 88), X_holdout (7656, 88)

Retraining models on filtered features...

Evaluating Weighted Ensemble...
Weighted Ensemble Test Results (original price scale):
RMSE: 12523118.45
MAE: 3112168.39
R2: 0.86
Weighted

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded /content/drive/My Drive/data/xgboost_final_model.pkl, catboost_final_model.pkl, rf_final_model.pkl, preprocessor_final.pkl, feature_importance.csv, selected_feature_indices.pkl, ensemble_weights.pkl


In [None]:
# Prediction Script: Validating Model on Test Data
print("\n=== Validating Model on Test Data ===")
try:
    from xgboost import XGBRegressor
    from catboost import CatBoostRegressor
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
    from sklearn.compose import ColumnTransformer
    from sklearn.cluster import KMeans
    from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
    import numpy as np
    import pandas as pd
    import joblib
    import os
    from google.colab import files
    from scipy.sparse import issparse

    # Load saved artifacts
    data_path = '/content/drive/My Drive/data/'
    xgb_model = joblib.load(data_path + 'xgboost_final_model.pkl')
    cat_model = joblib.load(data_path + 'catboost_final_model.pkl')
    rf_model = joblib.load(data_path + 'rf_final_model.pkl')
    preprocessor = joblib.load(data_path + 'preprocessor_final.pkl')
    feature_indices = joblib.load(data_path + 'selected_feature_indices.pkl')
    weights = joblib.load(data_path + 'ensemble_weights.pkl')
    print("Loaded saved models, preprocessor, feature indices, and ensemble weights")

    # Load test data (using X_test_raw.csv and y_test.npy for validation)
    new_data = pd.read_csv(data_path + 'X_test_raw.csv')
    y_test = np.load(data_path + 'y_test.npy', allow_pickle=True)
    print("Loaded test data for validation")

    # Drop price_bin if present
    if 'price_bin' in new_data.columns:
        new_data = new_data.drop(columns=['price_bin'])
        print("Dropped 'price_bin' from test data")

    # Feature engineering (same as training)
    new_data['bedrooms_area_marla'] = new_data['bedrooms'] * new_data['area_marla']
    new_data['baths_area_marla'] = new_data['baths'] * new_data['area_marla']
    new_data['lat_lon_interaction'] = new_data['latitude'] * new_data['longitude']

    # Add distance to city center
    city_centers = {
        'Lahore': (31.5204, 74.3587),
        'Karachi': (24.8607, 67.0011),
        'Islamabad': (33.6844, 73.0479)
    }
    def calculate_distance(row, city):
        if row['city'] in city_centers:
            lat_center, lon_center = city_centers[row['city']]
            return np.sqrt((row['latitude'] - lat_center)**2 + (row['longitude'] - lon_center)**2)
        return 0.0
    new_data['distance_to_center'] = new_data.apply(lambda row: calculate_distance(row, row['city']), axis=1)

    # Add location clusters
    kmeans = KMeans(n_clusters=10, random_state=42)
    kmeans.fit(new_data[['latitude', 'longitude']])  # Fit on test data (or load saved KMeans if available)
    new_data['location_cluster'] = kmeans.predict(new_data[['latitude', 'longitude']])

    # Add polynomial features
    poly = PolynomialFeatures(degree=2, include_bias=False)
    poly_cols = ['area_marla', 'bedrooms']
    poly_features = poly.fit_transform(new_data[poly_cols])
    poly_feature_names = poly.get_feature_names_out(poly_cols)
    for i, name in enumerate(poly_feature_names):
        new_data[f'poly_{name}'] = poly_features[:, i]

    # Define numerical and categorical columns
    numerical_cols = ['latitude', 'longitude', 'baths', 'area_marla', 'bedrooms', 'year', 'month', 'day',
                     'bedrooms_area_marla', 'baths_area_marla', 'lat_lon_interaction', 'distance_to_center']
    numerical_cols.extend([f'poly_{name}' for name in poly_feature_names])
    categorical_cols = ['property_type', 'location', 'city', 'locality', 'purpose', 'location_cluster']

    # Check for infinities or NaNs
    if np.any(np.isinf(new_data[numerical_cols])) or np.any(np.isnan(new_data[numerical_cols])):
        print("Warning: Infinities or NaNs detected in test data, replacing with median")
        for col in numerical_cols:
            new_data[col] = new_data[col].replace([np.inf, -np.inf], np.nan).fillna(new_data[col].median())

    # Preprocess test data
    X_new = preprocessor.transform(new_data)
    X_new_filtered = X_new[:, feature_indices]
    print(f"Preprocessed test data shape: {X_new_filtered.shape}")

    # Predict with weighted ensemble
    print("\nMaking predictions...")
    y_pred_xgb_log = xgb_model.predict(X_new_filtered)
    y_pred_cat_log = cat_model.predict(X_new_filtered)
    y_pred_rf_log = rf_model.predict(X_new_filtered)
    y_pred_log = weights[0] * y_pred_xgb_log + weights[1] * y_pred_cat_log + weights[2] * y_pred_rf_log
    y_pred = np.expm1(y_pred_log)

    # Evaluate predictions
    y_test_orig = np.expm1(y_test)
    rmse = np.sqrt(mean_squared_error(y_test_orig, y_pred))
    mae = mean_absolute_error(y_test_orig, y_pred)
    r2 = r2_score(y_test_orig, y_pred)
    print("Validation Results on Test Data (original price scale):")
    print(f"RMSE: {rmse:.2f}")
    print(f"MAE: {mae:.2f}")
    print(f"R2: {r2:.2f}")

    # Save predictions
    new_data['predicted_price'] = y_pred
    new_data.to_csv(data_path + 'test_predictions.csv', index=False)
    print("Saved predictions to 'test_predictions.csv'")
    files.download(data_path + 'test_predictions.csv')
    print("Downloaded test_predictions.csv")

except Exception as e:
    print(f"Error in prediction: {e}")
    exit()


=== Validating Model on Test Data ===
Loaded saved models, preprocessor, feature indices, and ensemble weights
Loaded test data for validation
Dropped 'price_bin' from test data
Preprocessed test data shape: (38278, 88)

Making predictions...
Validation Results on Test Data (original price scale):
RMSE: 12263856.91
MAE: 3103779.52
R2: 0.86
Saved predictions to 'test_predictions.csv'


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Downloaded test_predictions.csv
