In [29]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import pickle
import matplotlib.pyplot as plt
import seaborn as sns

def load_and_preprocess_data(filepath):
    """Load and preprocess the dataset with advanced feature engineering."""
    data = pd.read_csv(filepath)
    
    # Define brand tiers
    premium_brands = ['Apple', 'Google', 'OnePlus']
    mid_brands = ['Nothing', 'Samsung']
    
    # Create brand tiers
    data['brand_tier'] = data['Brand'].apply(
        lambda x: 'premium' if x in premium_brands 
        else 'mid' if x in mid_brands 
        else 'budget'
    )
    
    # Create RAM tiers
    data['ram_tier'] = pd.cut(
        data['RAM (GB)'],
        bins=[0, 4, 8, 12, float('inf')],
        labels=['basic', 'mid', 'high', 'ultra']
    )
    
    # Create interaction features
    data['storage_per_ram'] = data['ROM (GB)'] / data['RAM (GB)']
    data['camera_total'] = data['Front Camera (MP)'] + data['Back Camera (MP)']
    data['price_per_ram'] = data['Price (Rs.)'] / data['RAM (GB)']
    
    # Log transform the price
    data['Log_Price'] = np.log(data['Price (Rs.)'])
    
    # Create price segments
    data['price_segment'] = pd.qcut(
        data['Price (Rs.)'],
        q=3,
        labels=['budget', 'mid', 'premium']
    )
    
    # One-hot encode categorical features
    categorical_columns = ['Brand', 'brand_tier', 'ram_tier']
    data_encoded = pd.get_dummies(data, columns=categorical_columns)
    
    return data_encoded

def train_segmented_models(X, y, segment_column='price_segment'):
    """Train separate models for different price segments."""
    models = {}
    scalers = {}
    feature_columns = None
    
    for segment in ['budget', 'mid', 'premium']:
        # Filter data for this segment
        segment_mask = X[segment_column] == segment
        X_segment = X[segment_mask].drop(columns=[segment_column])
        y_segment = y[segment_mask]
        
        # Store feature columns for the API
        if feature_columns is None:
            feature_columns = X_segment.columns.tolist()
        
        # Split the data
        X_train, X_test, y_train, y_test = train_test_split(
            X_segment, y_segment, test_size=0.2, random_state=42
        )
        
        # Scale features
        scaler = RobustScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)
        
        # Train model
        # model = RandomForestRegressor(
        #     n_estimators=300,
        #     max_depth=12,
        #     min_samples_leaf=5,
        #     random_state=42,
        #     n_jobs=-1
        # )
        model = DecisionTreeRegressor()
        
        model.fit(X_train_scaled, y_train)
        
        # Evaluate segment performance
        y_pred = model.predict(X_test_scaled)
        mae = mean_absolute_error(np.exp(y_test), np.exp(y_pred))
        rmse = np.sqrt(mean_squared_error(np.exp(y_test), np.exp(y_pred)))
        r2 = r2_score(y_test, y_pred)
        
        print(f"\n{segment.title()} Segment Metrics:")
        print(f"MAE: ₹{mae:,.2f}")
        print(f"RMSE: ₹{rmse:,.2f}")
        print(f"R² Score: {r2:.4f}")
        
        models[segment] = model
        scalers[segment] = scaler
    
    return models, scalers, feature_columns

def save_models(models, scalers, feature_columns):
    """Save the trained models, scalers, and feature columns."""
    with open('../models/segmented_models.pkl', 'wb') as f:
        pickle.dump(models, f)
    
    with open('../models/segmented_scalers.pkl', 'wb') as f:
        pickle.dump(scalers, f)
        
    with open('../models/feature_columns.pkl', 'wb') as f:
        pickle.dump(feature_columns, f)

def main():
    # Load and preprocess data
    print("Loading and preprocessing data...")
    data = load_and_preprocess_data('../data/processed/5g_smartphones_dataset.csv')
    
    # Prepare features and target
    X = data.drop(['Model', 'Price (Rs.)', 'Log_Price'], axis=1)
    y = data['Log_Price']
    
    # Train segmented models
    print("\nTraining segmented models...")
    models, scalers, feature_columns = train_segmented_models(X, y)
    
    # Save models and artifacts
    print("\nSaving models and artifacts...")
    save_models(models, scalers, feature_columns)
    
    print("\nTraining completed successfully!")

if __name__ == "__main__":
    main()

Loading and preprocessing data...

Training segmented models...

Budget Segment Metrics:
MAE: ₹94.03
RMSE: ₹217.28
R² Score: 0.9962

Mid Segment Metrics:
MAE: ₹264.09
RMSE: ₹1,045.54
R² Score: 0.8971

Premium Segment Metrics:
MAE: ₹2,785.49
RMSE: ₹6,822.95
R² Score: 0.9744

Saving models and artifacts...

Training completed successfully!
