# Amazon ML Challenge 2025: Product Pricing Solution

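This notebook implements the winning solution strategy: custom feature engineering on the `catalog_content` text (regex-extracted attributes plus TF-IDF n-grams) feeding a LightGBM regressor trained on log-transformed prices.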

In [None]:
import pandas as pd
import numpy as np
from scipy.sparse import hstack, csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler, RobustScaler
import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation
from datetime import datetime
import warnings
import gc
import os
import re
import matplotlib.pyplot as plt
import seaborn as sns

warnings.filterwarnings('ignore')

In [None]:
class MaxAccuracyFeatureEngineer:
    def __init__(self):
        """Create an unfitted pipeline; all learned state starts empty."""
        # Fitted transformers and the trained estimator
        self.tfidf = None
        self.model = None
        self.scaler = RobustScaler()  # median/quantile-range scaling of numeric features

        # Category vocabularies recorded during the training pass
        self.unit_categories = None
        self.brand_categories = None
        self.category_categories = None
        self.categorical_columns = None

        # Numeric bookkeeping: imputation medians, numeric column list,
        # and the width of the training feature matrix
        self.numeric_medians = {}
        self.feature_columns = []
        self.expected_feature_count = None
        
    def extract_brand(self, text):
        """Return a lower-case brand label for a product text.

        Known brands are detected by keyword. Keywords must start at a token
        boundary and must not be followed by a letter, so 'apple' no longer
        fires on "pineapple" and 'lg' no longer fires on "algae", while
        model-style suffixes such as "rtx3080" still match.

        Args:
            text: Raw product text; may be NaN/None or a non-string value.

        Returns:
            'unknown' for missing input, a known brand key when a keyword is
            found, otherwise the first word longer than two characters that
            starts with an upper-case letter (lower-cased), or 'generic'.
        """
        if pd.isna(text):
            return 'unknown'
        # Normalise once so the fallback below also works for non-string input
        text = str(text)
        text_lower = text.lower()

        # Brand key -> keywords that identify it (first matching brand wins)
        brands = {
            'amazon': ['amazon basics', 'amazon', 'amazonbasics'],
            'samsung': ['samsung'],
            'apple': ['apple', 'iphone', 'ipad', 'macbook', 'airpods'],
            'sony': ['sony', 'playstation'],
            'lg': ['lg'],
            'intel': ['intel', 'core i3', 'core i5', 'core i7', 'core i9'],
            'nvidia': ['nvidia', 'geforce', 'rtx', 'gtx'],
            'amd': ['amd', 'ryzen', 'radeon'],
            'corsair': ['corsair'],
            'logitech': ['logitech'],
            'razer': ['razer'],
            'dell': ['dell', 'alienware'],
            'hp': ['hp', 'hewlett'],
            'lenovo': ['lenovo', 'thinkpad'],
            'canon': ['canon', 'eos'],
            'nikon': ['nikon'],
            'gopro': ['gopro'],
            'dji': ['dji'],
            'bose': ['bose'],
            'jbl': ['jbl'],
            'beats': ['beats'],
            'anker': ['anker'],
            'belkin': ['belkin'],
            'netgear': ['netgear'],
            'tp-link': ['tp-link', 'tplink'],
            'asus': ['asus', 'rog'],
            'microsoft': ['microsoft', 'surface', 'xbox'],
            'panasonic': ['panasonic'],
            'philips': ['philips'],
            'sandisk': ['sandisk'],
            'western digital': ['western digital', 'wd'],
            'seagate': ['seagate'],
            'kingston': ['kingston'],
            'crucial': ['crucial']
        }

        for brand, keywords in brands.items():
            alternatives = '|'.join(re.escape(kw) for kw in keywords)
            # Left side: not preceded by a letter/digit. Right side: not
            # followed by a letter (digits are allowed, e.g. "gtx1080").
            if re.search(rf'(?<![a-z0-9])(?:{alternatives})(?![a-z])', text_lower):
                return brand

        # Fallback: first capitalised word longer than two characters
        for word in text.split():
            if len(word) > 2 and word[0].isupper():
                return word.lower()

        return 'generic'
    
    def extract_product_category(self, text):
        """Map a product text to a coarse product category.

        Categories are tried in declaration order and the first one with a
        matching keyword wins. Keywords are matched as whole tokens rather
        than raw substrings: a keyword must not be preceded by a letter or
        digit nor followed by a letter. This stops 'phone' firing on
        "headphones", 'ram' on "gram" and 'ic' on "organic". A trailing
        plural ("s"/"es") is accepted for keywords longer than two
        characters, so "laptops" still matches while "pcs" (pieces) does not
        match 'pc'. A trailing digit is allowed so prefix keywords such as
        'core i' still match "core i7".

        Args:
            text: Raw product text; may be NaN/None.

        Returns:
            The category key, or 'other' when the input is missing or no
            keyword matches.
        """
        if pd.isna(text):
            return 'other'
        text_lower = str(text).lower()

        categories = {
            'laptop': ['laptop', 'notebook', 'macbook', 'chromebook'],
            'phone': ['phone', 'smartphone', 'iphone', 'galaxy'],
            'tablet': ['tablet', 'ipad'],
            'computer': ['desktop', 'pc', 'tower', 'workstation'],
            'cpu': ['cpu', 'processor', 'ryzen', 'core i'],
            'gpu': ['gpu', 'graphics card', 'geforce', 'radeon'],
            'monitor': ['monitor', 'display', 'screen'],
            'keyboard': ['keyboard'],
            'mouse': ['mouse', 'mice'],
            'headphone': ['headphone', 'headset', 'earbuds', 'airpods'],
            'speaker': ['speaker', 'soundbar'],
            'networking': ['router', 'modem', 'ethernet', 'wifi', 'switch', 'network'],
            'storage': ['ssd', 'hard drive', 'hdd', 'usb', 'sd card', 'flash'],
            'ram': ['ram', 'memory', 'ddr4', 'ddr5'],
            'motherboard': ['motherboard', 'mobo'],
            'psu': ['power supply', 'psu'],
            'case': ['case', 'chassis', 'tower case'],
            'cooling': ['cooler', 'cooling', 'fan', 'radiator'],
            'cable': ['cable', 'cord', 'wire', 'hdmi', 'usb-c'],
            'charger': ['charger', 'adapter', 'power adapter'],
            'camera': ['camera', 'webcam', 'dslr', 'mirrorless'],
            'components': ['resistor', 'capacitor', 'transistor', 'diode', 'ic', 'led'],
            'food': ['food', 'drink', 'sauce', 'snack', 'cookie', 'butter', 'protein'],
            'home': ['furniture', 'decoration', 'lamp', 'curtain', 'bedding'],
            'tools': ['tool', 'drill', 'saw', 'hammer', 'wrench'],
            'sports': ['sports', 'bike', 'yoga', 'fitness', 'gym'],
            'beauty': ['beauty', 'shampoo', 'makeup', 'cosmetics'],
            'cooking': ['pot', 'pan', 'knife', 'blender', 'cookware']
        }

        def keyword_fragment(keyword):
            # Two-letter keywords get no plural form: "pcs" usually means pieces
            plural = '(?:e?s)?' if len(keyword) > 2 else ''
            return re.escape(keyword) + plural

        for category, keywords in categories.items():
            alternatives = '|'.join(keyword_fragment(kw) for kw in keywords)
            if re.search(rf'(?<![a-z0-9])(?:{alternatives})(?![a-z])', text_lower):
                return category

        return 'other'
    
    def extract_numbers(self, text):
        """Return every strictly positive number found in *text* as floats.

        Missing input yields an empty list. Numbers are unsigned digit runs
        with an optional decimal part.
        """
        if pd.isna(text):
            return []
        parsed = (float(token) for token in re.findall(r'\d+\.?\d*', str(text)))
        return [value for value in parsed if value > 0]
    
    def extract_storage_capacity(self, text):
        """Extract a storage capacity in gigabytes from a product text.

        A terabyte figure takes precedence over a gigabyte figure. The unit
        must end at a word boundary, so "2 tbsp" and "10 Gbps" are not
        treated as capacities. Decimal values such as "1.5TB" are read in
        full.

        Args:
            text: Raw product text; may be NaN/None.

        Returns:
            Capacity in GB as a float (TB values are multiplied by 1024), or
            0 when the input is missing or no capacity is found.
        """
        if pd.isna(text):
            return 0
        text_lower = str(text).lower()

        # Terabytes first, converted to GB
        tb_match = re.search(r'(\d+(?:\.\d+)?)\s*tb\b', text_lower)
        if tb_match:
            return float(tb_match.group(1)) * 1024

        gb_match = re.search(r'(\d+(?:\.\d+)?)\s*gb\b', text_lower)
        if gb_match:
            return float(gb_match.group(1))

        return 0
    
    def extract_ram_capacity(self, text):
        """Return the RAM size in GB mentioned in *text*, or 0 if none.

        Three phrasings are tried in order: "<n> gb ram", "ram: <n> gb" and
        "<n>gb ddr". The first phrasing that matches decides the result.
        """
        if pd.isna(text):
            return 0
        lowered = str(text).lower()

        phrasings = (
            r'(\d+)\s*gb\s*ram',
            r'ram[:\s]+(\d+)\s*gb',
            r'(\d+)gb\s*ddr',
        )
        attempts = (re.search(phrasing, lowered) for phrasing in phrasings)
        first_hit = next((hit for hit in attempts if hit), None)
        if first_hit is None:
            return 0
        return float(first_hit.group(1))
    
    def extract_wattage(self, text):
        """Extract a power rating in watts from a product text.

        The unit ("w", "watt" or "watts") must end at a word boundary, so a
        number followed by an ordinary word starting with "w" ("12 with",
        "5 white") is not read as wattage. Decimal values are read in full.

        Args:
            text: Raw product text; may be NaN/None.

        Returns:
            The first wattage found as a float, or 0 when the input is
            missing or nothing matches.
        """
        if pd.isna(text):
            return 0
        text_lower = str(text).lower()

        watt_match = re.search(r'(\d+(?:\.\d+)?)\s*(?:watts?|w)\b', text_lower)
        if watt_match:
            return float(watt_match.group(1))

        return 0
    
    def extract_screen_size(self, text):
        """Extract a plausible screen size in inches from a product text.

        Measurements written as "<n> inch", '<n>"' or "<n>″" are considered,
        in that pattern order. Every occurrence is checked, not just the
        first, so an out-of-range measurement earlier in the text (e.g.
        "2 inch thick") does not hide a valid one later on.

        Args:
            text: Raw product text; may be NaN/None.

        Returns:
            The first measurement between 5 and 100 inches (inclusive) as a
            float, or 0 when the input is missing or none qualifies.
        """
        if pd.isna(text):
            return 0
        text_lower = str(text).lower()

        inch_patterns = [
            r'(\d+\.?\d*)\s*inch',
            r'(\d+\.?\d*)"',
            r'(\d+\.?\d*)″'
        ]

        for pattern in inch_patterns:
            for match in re.finditer(pattern, text_lower):
                size = float(match.group(1))
                if 5 <= size <= 100:  # Reasonable screen sizes
                    return size

        return 0
    
    def extract_weight(self, text):
        """Return the weight mentioned in *text*, expressed in kilograms.

        Units are tried in a fixed order (pounds, then kilograms, then
        ounces) and the first unit found decides the result. Returns 0 for
        missing input or when no weight is present.
        """
        if pd.isna(text):
            return 0
        lowered = str(text).lower()

        # (pattern, factor to kilograms); None means the value is already in kg
        unit_table = (
            (r'(\d+\.?\d*)\s*(?:lb|pound)', 0.453592),
            (r'(\d+\.?\d*)\s*kg', None),
            (r'(\d+\.?\d*)\s*oz', 0.0283495),
        )

        for pattern, to_kg in unit_table:
            found = re.search(pattern, lowered)
            if not found:
                continue
            amount = float(found.group(1))
            return amount if to_kg is None else amount * to_kg

        return 0
    
    def extract_quality_tier(self, text):
        """Classify a product text as 'premium', 'budget' or 'standard'.

        Premium keywords are checked before budget keywords. Keywords are
        matched as whole tokens, so 'pro' no longer fires on "Product" or
        "protein". A keyword directly followed by a colon is treated as a
        field label and ignored, so the "Value: <n>" field of the catalog
        text does not count as the budget keyword 'value'.

        Args:
            text: Raw product text; may be NaN/None.

        Returns:
            'premium', 'budget', or 'standard' (also used for missing input).
        """
        if pd.isna(text):
            return 'standard'
        text_lower = str(text).lower()

        def mentions(keywords):
            alternatives = '|'.join(re.escape(kw) for kw in keywords)
            # Token boundaries on both sides, and not a "label:" prefix
            pattern = rf'(?<![a-z0-9])(?:{alternatives})(?![a-z])(?!\s*:)'
            return re.search(pattern, text_lower) is not None

        premium_keywords = ['premium', 'luxury', 'pro', 'professional', 'elite',
                            'high-end', 'limited edition', 'exclusive', 'titanium',
                            'diamond', 'ultra', 'max', 'plus', 'advanced']
        if mentions(premium_keywords):
            return 'premium'

        budget_keywords = ['budget', 'basic', 'entry-level', 'starter', 'economy',
                           'value', 'cheap', 'inexpensive', 'affordable', 'lite']
        if mentions(budget_keywords):
            return 'budget'

        return 'standard'
    
    def extract_pack_quantity(self, text):
        """Detect a multi-unit quantity and return ``(quantity, pack_type)``.

        Phrasings are tried in order: "pack of N", "N-pack"/"N pack", then
        "N piece/set/box/count". Only quantities greater than one count as a
        multi-pack; a phrasing that yields 0 or 1 is skipped and later
        phrasings are still tried. This keeps the label set to 'single',
        'pack_2'..'pack_10' and 'multi_2'..'multi_10' (quantities above ten
        share the '_10' bucket).

        Args:
            text: Raw product text; may be NaN/None.

        Returns:
            Tuple ``(quantity, label)``; ``(1, 'single')`` when the input is
            missing or no multi-unit quantity is found.
        """
        if pd.isna(text):
            return 1, 'single'
        text_lower = str(text).lower()

        # (pattern, label prefix), in priority order
        quantity_patterns = (
            (r'pack of (\d+)', 'pack'),
            (r'(\d+)[\s-]pack', 'pack'),
            (r'(\d+)\s*(?:piece|set|box|count)', 'multi'),
        )

        for pattern, label in quantity_patterns:
            match = re.search(pattern, text_lower)
            if match:
                qty = int(match.group(1))
                if qty > 1:
                    return qty, f'{label}_{min(qty, 10)}'

        return 1, 'single'
    
    def count_specifications(self, text):
        """Count how many distinct spec keywords occur in *text*.

        Each keyword contributes at most one to the count and is tested as a
        case-insensitive substring. Missing input counts as zero.
        """
        if pd.isna(text):
            return 0
        lowered = str(text).lower()

        spec_keywords = (
            # electrical / frequency
            'voltage', 'current', 'watt', 'hz', 'mhz', 'ghz', 'rpm',
            # imaging
            'resolution', 'fps', 'dpi', 'ppi',
            # data sizes
            'gb', 'mb', 'kb', 'tb',
            # lengths
            'inch', 'cm', 'mm', 'meter',
            # weights
            'kg', 'lb', 'oz', 'gram',
            # volumes
            'liter', 'ml', 'gallon',
            # temperature and pressure
            'temperature', 'celsius', 'fahrenheit',
            'pressure', 'psi', 'bar',
            # throughput
            'speed', 'bandwidth', 'latency',
            'capacity', 'output', 'input',
            # connectivity
            'compatibility', 'wireless', 'bluetooth',
        )

        return sum(keyword in lowered for keyword in spec_keywords)
    
    def has_warranty_mention(self, text):
        """Return 1 if *text* contains a warranty-related phrase, else 0."""
        if pd.isna(text):
            return 0
        lowered = str(text).lower()
        phrases = ('warranty', 'guarantee', 'guaranteed', 'year warranty',
                   'lifetime warranty', 'coverage', 'protected')
        for phrase in phrases:
            if phrase in lowered:
                return 1
        return 0
    
    def has_discount_mention(self, text):
        """Return 1 if *text* mentions a discount or promotion, else 0.

        Word keywords are matched as whole words (with common inflections
        such as "discounted", "sales", "savings", "deals", "promotion"), so
        'off' no longer fires on "coffee", 'deal' on "ideal" or 'sale' on
        "wholesale". A percent sign anywhere in the text still counts.

        Args:
            text: Raw product text; may be NaN/None.

        Returns:
            1 when a discount phrase or '%' is present, otherwise 0.
        """
        if pd.isna(text):
            return 0
        text_lower = str(text).lower()
        discount_pattern = (
            r'\b(?:discount(?:s|ed)?|sales?|promo(?:s|tions?|tional)?|limited time|'
            r'sav(?:e|es|ings?)|off|deals?|clearance|reduced)\b|%'
        )
        return 1 if re.search(discount_pattern, text_lower) else 0
    
    def has_color_mention(self, text):
        """Return 1 if *text* names one of the listed colours, else 0.

        Colours are matched as whole words, so 'red' no longer fires on
        words that merely contain it ("ingredients", "powdered").

        Args:
            text: Raw product text; may be NaN/None.

        Returns:
            1 when a colour word is present, otherwise 0.
        """
        if pd.isna(text):
            return 0
        text_lower = str(text).lower()
        colors = ['black', 'white', 'red', 'blue', 'green', 'silver', 'gold',
                  'gray', 'pink', 'purple', 'yellow', 'orange', 'brown']
        color_pattern = r'\b(?:' + '|'.join(colors) + r')\b'
        return 1 if re.search(color_pattern, text_lower) else 0
    
    def has_model_number(self, text):
        """Return 1 if *text* contains a model-number-like token, else 0.

        A token qualifies when it is either two or more upper-case letters
        followed by digits (e.g. GTX1080), or a single upper-case letter
        followed by digits, a hyphen and more digits (e.g. A5-100). The check
        is case-sensitive, so lower-case forms do not match.
        """
        if pd.isna(text):
            return 0
        model_like = re.compile(r'[A-Z]{2,}\d+|[A-Z]\d+-\d+')
        return int(model_like.search(str(text)) is not None)
    
    def extract_year(self, text):
        """Extract a model year between 2015 and 2025 from a product text.

        The four digits must stand alone, i.e. not be part of a longer digit
        run, so capacities or codes such as "20200mAh" are not mistaken for
        the year 2020.

        Args:
            text: Raw product text; may be NaN/None.

        Returns:
            The first qualifying year as an int, or 0 when the input is
            missing or no year is found.
        """
        if pd.isna(text):
            return 0

        # Look for years 2015-2025 that are not embedded in a longer number
        year_match = re.search(r'(?<!\d)20(?:1[5-9]|2[0-5])(?!\d)', str(text))
        if year_match:
            return int(year_match.group(0))

        return 0
    
    def safe_one_hot_encode(self, series, prefix, expected_categories):
        """One-hot encode *series* against a fixed category list.

        The result always has exactly one float32 column per entry of
        *expected_categories*, named ``<prefix>_<category>`` and in the given
        order. Categories absent from the data become all-zero columns;
        values not in the list are dropped.
        """
        wanted_columns = [f"{prefix}_{category}" for category in expected_categories]
        encoded = pd.get_dummies(series, prefix=prefix)
        # reindex both inserts missing columns (as zeros) and discards extras
        encoded = encoded.reindex(columns=wanted_columns, fill_value=0)
        return encoded.astype(np.float32)
    
    def extract_advanced_features(self, df, is_training=True):
        """Derive all hand-crafted feature columns from ``df['catalog_content']``.

        Columns are added to *df* in place and the same frame is returned.
        When *is_training* is true, the unique Unit/brand/category values and
        the per-column medians used for imputation are recorded on ``self``;
        otherwise the previously recorded medians are reused.

        Args:
            df: DataFrame with a ``catalog_content`` text column.
                NOTE(review): the ``.astype(np.int8)`` casts on the
                ``str.contains`` results assume this column has no missing
                values — confirm callers fill it first.
            is_training: Whether to learn categories and medians from *df*.

        Returns:
            The input DataFrame with the feature columns added.
        """
        print("Extracting advanced features...")
        
        # Structured "Value: <number>" and "Unit: <word>" fields embedded in the text
        df['Value'] = pd.to_numeric(
            df['catalog_content'].str.extract(r'Value:\s*([\d\.]+)')[0], 
            errors='coerce'
        )
        df['Unit'] = df['catalog_content'].str.extract(r'Unit:\s*(\w+)')[0]
        df['Unit'] = df['Unit'].fillna('missing')
        
        if is_training:
            self.unit_categories = df['Unit'].unique()
        
        # Categorical features (vocabularies are recorded on the training pass)
        print("  Extracting brands...")
        df['brand'] = df['catalog_content'].apply(self.extract_brand)
        if is_training:
            self.brand_categories = df['brand'].unique()
        
        print("  Extracting categories...")
        df['category'] = df['catalog_content'].apply(self.extract_product_category)
        if is_training:
            self.category_categories = df['category'].unique()
        
        print("  Extracting quality tiers...")
        df['quality_tier'] = df['catalog_content'].apply(self.extract_quality_tier)
        
        print("  Extracting pack quantities...")
        # extract_pack_quantity returns a (quantity, label) tuple per row
        pack_info = df['catalog_content'].apply(self.extract_pack_quantity)
        df['pack_quantity'] = pack_info.apply(lambda x: x[0])
        df['pack_type'] = pack_info.apply(lambda x: x[1])
        
        # Numeric attributes parsed from the text
        print("  Extracting technical specs...")
        df['storage_gb'] = df['catalog_content'].apply(self.extract_storage_capacity)
        df['ram_gb'] = df['catalog_content'].apply(self.extract_ram_capacity)
        df['wattage'] = df['catalog_content'].apply(self.extract_wattage)
        df['screen_size'] = df['catalog_content'].apply(self.extract_screen_size)
        df['weight_kg'] = df['catalog_content'].apply(self.extract_weight)
        df['year'] = df['catalog_content'].apply(self.extract_year)
        
        # Keyword count and binary keyword flags
        df['spec_count'] = df['catalog_content'].apply(self.count_specifications)
        df['has_warranty'] = df['catalog_content'].apply(self.has_warranty_mention)
        df['has_discount'] = df['catalog_content'].apply(self.has_discount_mention)
        df['has_color'] = df['catalog_content'].apply(self.has_color_mention)
        df['has_model_number'] = df['catalog_content'].apply(self.has_model_number)
        
        # Text statistics
        df['content_length'] = df['catalog_content'].str.len().fillna(0)
        df['word_count'] = df['catalog_content'].str.split().str.len().fillna(0)
        # replace(0, 1) avoids division by zero for empty texts
        df['char_density'] = df['content_length'] / (df['word_count'].replace(0, 1))
        df['numeric_count'] = df['catalog_content'].str.count(r'\d+\.?\d*')
        df['has_decimal'] = df['catalog_content'].str.contains(r'\d+\.\d+').astype(np.int8)
        df['bullet_points'] = df['catalog_content'].str.count('Bullet Point')
        df['has_description'] = df['catalog_content'].str.contains('Product Description:', case=False).astype(np.int8)
        df['uppercase_ratio'] = df['catalog_content'].apply(lambda x: sum(1 for c in str(x) if c.isupper()) / max(len(str(x)), 1))
        
        # Value-based features. These run BEFORE the median imputation below,
        # so value_per_word stays NaN for rows with no parsed Value.
        df['has_value_flag'] = (~df['Value'].isna()).astype(np.int8)
        df['value_per_word'] = df['Value'] / (df['word_count'].replace(0, 1))
        
        # Interaction features built from the columns above
        df['storage_x_ram'] = df['storage_gb'] * df['ram_gb']
        df['brand_is_premium'] = df['brand'].isin(['apple', 'samsung', 'sony', 'bose', 'dell']).astype(np.int8)
        df['is_electronics'] = df['category'].isin(['laptop', 'phone', 'tablet', 'computer', 'gpu', 'cpu']).astype(np.int8)
        df['premium_electronics'] = df['brand_is_premium'] * df['is_electronics']
        
        # Handle missing values: medians are learned on the training pass and
        # reused afterwards (falling back to 0 for a column never seen in training)
        numeric_cols = ['Value', 'storage_gb', 'ram_gb', 'wattage', 'screen_size', 'weight_kg']
        for col in numeric_cols:
            if col in df.columns:
                if is_training:
                    median_val = df[col].median()
                    self.numeric_medians[col] = median_val
                else:
                    median_val = self.numeric_medians.get(col, 0)
                df[col] = df[col].fillna(median_val)
        
        # Log transforms (log1p) of the imputed values, stored as <col>_log
        for col in ['Value', 'storage_gb', 'ram_gb', 'wattage']:
            if col in df.columns:
                df[f'{col}_log'] = np.log1p(df[col])
        
        # Type conversion: listed count-like columns become int32, every
        # other numeric column (including the int8 flags) becomes float32
        for col in df.select_dtypes(include=[np.number]).columns:
            if col in ['content_length', 'word_count', 'numeric_count', 'bullet_points', 'spec_count', 'pack_quantity', 'year']:
                df[col] = df[col].astype(np.int32)
            else:
                df[col] = df[col].astype(np.float32)
        
        return df

    def create_tfidf_features(self, texts):
        """Vectorise *texts* with TF-IDF, fitting the vectorizer on first use.

        If no vectorizer exists yet, one is created, fitted on *texts* and
        stored on ``self.tfidf``; later calls only transform with the stored
        vectorizer.
        """
        if self.tfidf is not None:
            return self.tfidf.transform(texts)

        vectorizer_options = dict(
            max_features=10000,      # vocabulary cap
            stop_words='english',
            ngram_range=(1, 3),      # unigrams through trigrams
            min_df=3,                # drop terms seen in fewer than 3 documents
            max_df=0.90,             # drop terms seen in more than 90% of documents
            sublinear_tf=True,
            dtype=np.float32,
            norm='l2',
            use_idf=True,
        )
        self.tfidf = TfidfVectorizer(**vectorizer_options)
        return self.tfidf.fit_transform(texts)

    def prepare_full_features(self, df, is_training=True):
        """Build the full sparse feature matrix for *df*.

        The matrix concatenates, in this order: TF-IDF text features, scaled
        numeric features, one-hot Unit columns, and one-hot brand / category /
        quality-tier / pack-type columns.

        Args:
            df: DataFrame with a ``catalog_content`` column. It is modified
                in place (missing text is filled, feature columns are added).
            is_training: When true, the scaler is fitted and the category
                vocabularies and expected feature count are recorded on
                ``self``. When false, previously fitted state is reused.

        Returns:
            Tuple ``(X_combined, df)``: a float32 CSR matrix and the
            feature-augmented frame.
        """
        df['catalog_content'] = df['catalog_content'].fillna('missing_content')
        df = self.extract_advanced_features(df, is_training=is_training)

        # Numeric columns fed to the scaler, in matrix order
        self.feature_columns = [
            'Value', 'Value_log', 'content_length', 'word_count', 'char_density',
            'numeric_count', 'has_decimal', 'has_value_flag', 'value_per_word',
            'bullet_points', 'has_description', 'spec_count', 'pack_quantity',
            'has_warranty', 'has_discount', 'has_color', 'has_model_number',
            'storage_gb', 'storage_gb_log', 'ram_gb', 'ram_gb_log',
            'wattage', 'wattage_log', 'screen_size', 'weight_kg',
            'year', 'uppercase_ratio', 'storage_x_ram',
            'brand_is_premium', 'is_electronics', 'premium_electronics'
        ]

        for col in self.feature_columns:
            if col not in df.columns:
                df[col] = 0.0

        def fitted_or_observed(fitted, column):
            # The attributes always exist (set to None in __init__), so test
            # for None rather than attribute presence before falling back.
            return fitted if fitted is not None else df[column].unique()

        # Categorical encoding
        categorical_features = {
            'brand': fitted_or_observed(self.brand_categories, 'brand'),
            'category': fitted_or_observed(self.category_categories, 'category'),
            'quality_tier': ['premium', 'standard', 'budget'],
            'pack_type': ['single'] + [f'multi_{i}' for i in range(2, 11)] + [f'pack_{i}' for i in range(2, 11)]
        }

        categorical_encoded_list = [
            self.safe_one_hot_encode(df[col], col, categories)
            for col, categories in categorical_features.items()
            if col in df.columns
        ]

        if categorical_encoded_list:
            categorical_encoded = pd.concat(categorical_encoded_list, axis=1)
        else:
            categorical_encoded = pd.DataFrame()

        if is_training:
            self.categorical_columns = categorical_encoded.columns.tolist()

        # Unit encoding
        if self.unit_categories is None:
            self.unit_categories = df['Unit'].unique()

        unit_encoded = self.safe_one_hot_encode(df['Unit'], 'Unit', self.unit_categories)

        # TF-IDF (fitted on the first call, transform-only afterwards)
        X_tfidf = self.create_tfidf_features(df['catalog_content'])

        # Numeric features with robust scaling
        numeric_features = df[self.feature_columns].values.astype(np.float32)
        if is_training:
            numeric_features = self.scaler.fit_transform(numeric_features)
        else:
            numeric_features = self.scaler.transform(numeric_features)

        # Combine
        if categorical_encoded.empty:
            cat_array = np.zeros((len(df), 0), dtype=np.float32)
        else:
            cat_array = categorical_encoded.values.astype(np.float32)

        # Request CSR explicitly: the matrix is row-indexed during
        # cross-validation, which not every sparse format supports.
        X_combined = hstack([
            X_tfidf,
            csr_matrix(numeric_features),
            csr_matrix(unit_encoded.values),
            csr_matrix(cat_array)
        ], format='csr').astype(np.float32)

        if is_training:
            self.expected_feature_count = X_combined.shape[1]
            print(f"✓ Total features: {self.expected_feature_count}")

        return X_combined, df

    def align_features(self, X_test):
        """Ensure feature dimensions match"""
        if X_test.shape[1] < self.expected_feature_count:
            missing = self.expected_feature_count - X_test.shape[1]
            padding = csr_matrix((X_test.shape[0], missing), dtype=np.float32)
            X_test = hstack([X_test, padding])
        elif X_test.shape[1] > self.expected_feature_count:
            X_test = X_test[:, :self.expected_feature_count]
        
        return X_test

    def train_model(self, X_train, y_train):
        """Cross-validate and then fit the final LightGBM regressor.

        A 5-fold shuffled K-Fold run reports per-fold and average RMSE. The
        final model, stored on ``self.model``, is then fitted on 85% of the
        data with the remaining 15% held out purely for early stopping.

        Args:
            X_train: Feature matrix (sparse or dense), one row per sample.
            y_train: Target values aligned with the rows of *X_train*; any
                array-like, including a pandas Series with a non-default
                index.

        Returns:
            None. The fitted estimator is available as ``self.model``.
        """
        print("\n" + "="*60)
        print("TRAINING MODEL (MAXIMUM ACCURACY)")
        print("="*60)

        # Fold indices are positional: use a plain array for the target and a
        # row-indexable (CSR) layout for sparse features.
        y_train = np.asarray(y_train)
        if hasattr(X_train, 'tocsr'):
            X_train = X_train.tocsr()

        # Single source of truth for the hyperparameters shared by the CV
        # models and the final model.
        # NOTE(review): subsample is set without subsample_freq; LightGBM's
        # documentation says bagging needs a non-zero frequency — confirm
        # whether row subsampling is actually intended to be active.
        model_params = dict(
            n_estimators=3000,
            learning_rate=0.02,
            max_depth=12,
            num_leaves=200,
            subsample=0.80,
            colsample_bytree=0.80,
            min_child_samples=25,
            reg_alpha=0.15,
            reg_lambda=0.15,
            min_split_gain=0.01,
            n_jobs=-1,
            verbose=-1,
            extra_trees=True,
        )

        def fit_with_early_stopping(seed, X_tr, y_tr, X_val, y_val):
            # Fresh callbacks per fit so no early-stopping state is shared
            model = lgb.LGBMRegressor(random_state=seed, **model_params)
            model.fit(
                X_tr, y_tr,
                eval_set=[(X_val, y_val)],
                eval_metric='rmse',
                callbacks=[early_stopping(150), log_evaluation(0)]
            )
            return model

        n_splits = 5
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

        cv_scores = []
        for fold, (train_idx, val_idx) in enumerate(kf.split(X_train), 1):
            print(f"\nFold {fold}/{n_splits}")

            X_val, y_val = X_train[val_idx], y_train[val_idx]
            # A different seed per fold, as in the final-model seed + fold
            model = fit_with_early_stopping(
                42 + fold, X_train[train_idx], y_train[train_idx], X_val, y_val
            )

            y_val_pred = model.predict(X_val)
            rmse = np.sqrt(np.mean((y_val - y_val_pred) ** 2))
            cv_scores.append(rmse)
            print(f"  Fold {fold} RMSE: {rmse:.4f}")

        print(f"\n✓ Average CV RMSE: {np.mean(cv_scores):.4f} (+/- {np.std(cv_scores):.4f})")

        # Final model: 85% for fitting, 15% held out for early stopping
        print("\nTraining final model on full dataset...")
        X_tr, X_val, y_tr, y_val = train_test_split(
            X_train, y_train, test_size=0.15, random_state=42
        )
        self.model = fit_with_early_stopping(42, X_tr, y_tr, X_val, y_val)

        print("✓ Final model trained!")

    def predict(self, X_test):
        """Prediction with dimension check"""
        print("\nGenerating predictions...")
        
        if hasattr(self, 'expected_feature_count') and X_test.shape[1] != self.expected_feature_count:
            X_test = self.align_features(X_test)
        
        predictions = self.model.predict(X_test)
        
        # Smart clipping based on training data percentiles
        predictions = np.maximum(predictions, 0.1)
        predictions = np.minimum(predictions, 1000)  # INCREASED ceiling
        
        return predictions

In [None]:
# Configuration
# Submissions are written next to the dataset files; create the folder up front.
SUBMISSION_DIR = r'68e8d1d70b66d_student_resource/student_resource/dataset'
os.makedirs(SUBMISSION_DIR, exist_ok=True)

# Input tables
TRAIN_PATH = r'68e8d1d70b66d_student_resource/student_resource/dataset/train.csv'
TEST_PATH = r'68e8d1d70b66d_student_resource/student_resource/dataset/test.csv'

In [None]:
engineer = MaxAccuracyFeatureEngineer()

# Read the training table; rows the CSV parser cannot handle are skipped
print("Loading training data...")
df_train = pd.read_csv(TRAIN_PATH, on_bad_lines='skip', encoding='utf-8')

print(f"Training set: {len(df_train):,} records")

# Trim the most extreme 0.5% of prices at each end of the distribution
print("\nRemoving outliers...")
price_q1, price_q99 = df_train['price'].quantile([0.005, 0.995])

print(f"  Price range before: ${df_train['price'].min():.2f} - ${df_train['price'].max():.2f}")
within_range = df_train['price'].between(price_q1, price_q99)  # inclusive on both ends
df_train = df_train[within_range]
print(f"  Price range after: ${df_train['price'].min():.2f} - ${df_train['price'].max():.2f}")
print(f"  Samples after outlier removal: {len(df_train):,}")

# The model is trained on log1p(price)
print("\nApplying log transform to target...")
y_train_original = df_train['price'].to_numpy().astype(np.float32)
y_train_log = np.log1p(y_train_original)

In [None]:
# Fit the feature pipeline on the training frame and report the matrix layout
print("\nPreparing training features...")
X_train, df_train = engineer.prepare_full_features(df_train, is_training=True)

print(f"Training features shape: {X_train.shape}")
tfidf_vocabulary = engineer.tfidf.get_feature_names_out()
print(f"  - TF-IDF features: {tfidf_vocabulary.shape[0]}")
numeric_feature_count = len(engineer.feature_columns)
print(f"  - Numeric features: {numeric_feature_count}")
print(f"  - Total features: {X_train.shape[1]:,}")

In [None]:
# Cross-validate, then fit the final model on the log-transformed target
engineer.train_model(X_train=X_train, y_train=y_train_log)

In [None]:
print("\nLoading test data...")
df_test = pd.read_csv(TEST_PATH, on_bad_lines='skip', encoding='utf-8')
print(f"Test set: {len(df_test):,} records")

print("Preparing test features...")
X_test, df_test = engineer.prepare_full_features(df_test, is_training=False)
print(f"Test features shape: {X_test.shape}")

# Make predictions and inverse transform (the model predicts log1p(price))
predictions_log = engineer.predict(X_test)
predictions = np.expm1(predictions_log)

# Post-processing: category-specific nudges, applied by position with boolean
# masks so the result does not depend on the DataFrame's index labels.
print("\nApplying post-processing...")
if 'category' in df_test.columns:
    test_categories = df_test['category'].to_numpy()
else:
    test_categories = np.full(len(df_test), 'other', dtype=object)

# Low predictions for electronics are raised by 20%
is_electronics = np.isin(test_categories, ['laptop', 'phone', 'tablet', 'gpu', 'cpu'])
predictions = np.where(is_electronics & (predictions < 50), predictions * 1.2, predictions)

# High predictions for everyday consumables are lowered by 15%
# (the two category groups are disjoint, so the order of the rules is irrelevant)
is_consumable = np.isin(test_categories, ['food', 'beauty', 'cooking'])
predictions = np.where(is_consumable & (predictions > 100), predictions * 0.85, predictions)

predictions = np.clip(predictions, 0.1, 1000)

In [None]:
# Analysis & Submission
print("\nPrediction Analysis:")
print(f"  Min: ${predictions.min():.2f}")
print(f"  Max: ${predictions.max():.2f}")
print(f"  Mean: ${predictions.mean():.2f}")

# Save a timestamped submission file
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
submission_file = os.path.join(SUBMISSION_DIR, f'outler_both_main{timestamp}.csv')

submission = pd.DataFrame({
    'sample_id': df_test['sample_id'], 
    'price': predictions
})
submission.to_csv(submission_file, index=False)

print(f"\n{'='*60}")
print(f"✅ SUBMISSION SAVED: {submission_file}")
print(f"{'='*60}")
print(submission.head())

# Feature Importance Plot
if hasattr(engineer.model, 'feature_importances_'):
    # Draw on an explicitly created axes: plot_importance otherwise opens its
    # own figure, leaving a blank one behind and ignoring the requested size.
    fig, ax = plt.subplots(figsize=(10, 6))
    lgb.plot_importance(engineer.model, ax=ax, max_num_features=20, importance_type='split')
    ax.set_title("Top 20 Features by Split Importance")
    plt.show()