# Feature Engineering

### •	Generate new features to support later modeling (e.g., price per square meter, total rooms, encoded categorical values).
### •	Apply appropriate feature scaling (e.g., MinMaxScaler, StandardScaler).


In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder

# Load data
df = pd.read_csv('merged_final.csv')
print("Data loaded successfully")
print(f"Shape: {df.shape}")
print(f"Columns: {list(df.columns)}")

# Check for missing values
print("\nMissing values:")
print(df.isnull().sum())

# STEP 1: CREATE NEW FEATURES
# These are simple calculations that might help the model understand patterns better

# Price per square meter - this is very important for real estate
# It tells us how expensive each square meter is
if 'price' in df.columns and 'area' in df.columns:
    df['price_per_sqm'] = df['price'] / df['area']
    print("Created price_per_sqm feature")

# Total number of rooms - bedrooms + bathrooms
# More rooms usually means higher price
if 'bedrooms' in df.columns and 'bathrooms' in df.columns:
    df['total_rooms'] = df['bedrooms'] + df['bathrooms']
    print("Created total_rooms feature")

# Price per bedroom - how much you pay per bedroom
# Useful to compare properties with different bedroom counts
if 'price' in df.columns and 'bedrooms' in df.columns:
    df['price_per_bedroom'] = df['price'] / df['bedrooms']
    print("Created price_per_bedroom feature")

# Property age - how old is the property
# Newer properties might be more expensive
if 'year_built' in df.columns:
    current_year = 2024
    df['property_age'] = current_year - df['year_built']
    print("Created property_age feature")

# Area in different scales - sometimes models work better with transformed data
if 'area' in df.columns:
    df['area_squared'] = df['area'] ** 2  # Square of area
    df['area_log'] = np.log(df['area'] + 1)  # Log of area (adding 1 to avoid log(0))
    print("Created area_squared and area_log features")

# STEP 2: HANDLE CATEGORICAL VARIABLES
# Categorical variables are non-numeric like 'city', 'property_type'
# Models need numbers, so we convert categories to numbers

# Find all text/categorical columns
categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
print(f"\nCategorical columns: {categorical_cols}")

# For each categorical column, convert to numbers
for col in categorical_cols:
    if col in df.columns:
        # LabelEncoder converts categories to numbers (0, 1, 2, etc.)
        # For example: 'apartment' becomes 0, 'house' becomes 1
        le = LabelEncoder()
        
        # Fill missing values with 'Unknown' first
        df[col] = df[col].fillna('Unknown')
        
        # Create new column with encoded values
        df[col + '_encoded'] = le.fit_transform(df[col])
        print(f"Encoded {col} -> {col}_encoded")

# STEP 3: REMOVE PROBLEMATIC VALUES
# Replace infinite values with NaN (happens when dividing by zero)
df = df.replace([np.inf, -np.inf], np.nan)

# Fill any remaining NaN values with the median (middle value)
# Median is better than mean because it's not affected by extreme values
numeric_cols = df.select_dtypes(include=[np.number]).columns
for col in numeric_cols:
    if df[col].isnull().sum() > 0:
        median_value = df[col].median()
        df[col] = df[col].fillna(median_value)
        print(f"Filled missing values in {col} with median: {median_value}")

# STEP 4: PREPARE DATA FOR SCALING
# Remove original categorical columns since we have encoded versions
cols_to_drop = []
for col in categorical_cols:
    if col in df.columns:
        cols_to_drop.append(col)

# Also remove any ID columns or index columns
if 'Unnamed: 0' in df.columns:
    cols_to_drop.append('Unnamed: 0')

# Create clean dataset without categorical columns
df_clean = df.drop(columns=cols_to_drop)
print(f"\nCleaned dataset shape: {df_clean.shape}")

# STEP 5: FEATURE SCALING
# Scaling makes all features have similar ranges
# This helps many machine learning algorithms work better

# Get list of columns to scale (all numeric columns except target)
# Assuming 'price' is your target variable
cols_to_scale = df_clean.columns.tolist()
if 'price' in cols_to_scale:
    cols_to_scale.remove('price')  # Don't scale the target variable

print(f"Columns to scale: {len(cols_to_scale)}")

# Method 1: StandardScaler
# Makes features have mean=0 and standard deviation=1
# Good for most algorithms like linear regression, SVM
scaler_std = StandardScaler()
df_standardized = df_clean.copy()
df_standardized[cols_to_scale] = scaler_std.fit_transform(df_clean[cols_to_scale])

# Method 2: MinMaxScaler  
# Makes features range from 0 to 1
# Good for neural networks and algorithms sensitive to outliers
scaler_minmax = MinMaxScaler()
df_minmax = df_clean.copy()
df_minmax[cols_to_scale] = scaler_minmax.fit_transform(df_clean[cols_to_scale])

print("Scaling completed")

# STEP 6: SAVE THE RESULTS
# Save one clean file ready for modeling

# Use standardized features (good for most models)
df_standardized.to_csv('features_engineered.csv', index=False)
print("Saved features_engineered.csv")

# STEP 7: SUMMARY
print(f"\nSUMMARY:")
print(f"Original columns: {len(df.columns)}")
print(f"Final columns: {len(df_clean.columns)}")
print(f"Rows: {len(df_clean)}")

# Show correlation with price if it exists
if 'price' in df_clean.columns:
    print(f"\nTop 5 features most correlated with price:")
    correlations = df_clean.corr()['price'].abs().sort_values(ascending=False)
    print(correlations.head(6))  # Top 6 includes price itself


Data loaded successfully
Shape: (558, 6)
Columns: ['bedrooms', 'bathrooms', 'property_type', 'title', 'price', 'source']

Missing values:
bedrooms         0
bathrooms        0
property_type    0
title            0
price            0
source           0
dtype: int64
Created total_rooms feature
Created price_per_bedroom feature

Categorical columns: ['property_type', 'title', 'source']
Encoded property_type -> property_type_encoded
Encoded title -> title_encoded
Encoded source -> source_encoded

Cleaned dataset shape: (558, 8)
Columns to scale: 7
Scaling completed
Saved features_engineered.csv

SUMMARY:
Original columns: 11
Final columns: 8
Rows: 558

Top 5 features most correlated with price:
price                    1.000000
price_per_bedroom        0.586067
bathrooms                0.362752
total_rooms              0.358535
bedrooms                 0.346896
property_type_encoded    0.250079
Name: price, dtype: float64


In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import joblib

df = pd.read_csv('features_engineered.csv')
print("Data loaded")
print(f"Shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")

#target
target = 'price'
X = df.drop(columns=[target])
y = df[target]

# Optional: drop any non-numeric columns (in case encoding failed earlier)
X = X.select_dtypes(include=[np.number])

# Step 3: Scale features (optional but helpful for linear models)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 4: Split into train/test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
print("Data split into training and testing sets")

# Train models
# Linear Regression
lr = LinearRegression()
lr.fit(X_train, y_train)
lr_preds = lr.predict(X_test)

# Random Forest
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)
rf_preds = rf.predict(X_test)

# Evaluation function
def evaluate_model(name, y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_true, y_pred)
    print(f"\n{name} Results:")
    print(f"RMSE: {rmse:.2f}")
    print(f"R² Score: {r2:.2f}")


# Evaluate both models
evaluate_model("Linear Regression", y_test, lr_preds)
evaluate_model("Random Forest", y_test, rf_preds)

# Save the better model (e.g., Random Forest)
joblib.dump(rf, 'best_model.pkl')
joblib.dump(scaler, 'scaler.pkl')



Data loaded
Shape: (558, 8)
Columns: ['bedrooms', 'bathrooms', 'price', 'total_rooms', 'price_per_bedroom', 'property_type_encoded', 'title_encoded', 'source_encoded']
Data split into training and testing sets

Linear Regression Results:
RMSE: 89744.82
R² Score: 0.65

Random Forest Results:
RMSE: 52045.22
R² Score: 0.88


['scaler.pkl']