# Modeling: KNN, Decision Tree, Random Forest, Neural Network

In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
import pickle
from sklearn.neighbors import KNeighborsRegressor
from sklearn.utils.validation import check_is_fitted
from sklearn.exceptions import NotFittedError

sns.set(style='whitegrid')

In [42]:
# Load data
FILE_PATH = "/media/hoang/HDD_Code/T√†i li·ªáu h·ªçc t·∫≠p/AI/Code_example/Mobile_Price_Prediction/mobiles_dataset_2025_processed.csv"
df = pd.read_csv(FILE_PATH)
print(f"Loaded {len(df)} rows and {len(df.columns)} columns")

# Target and feature selection
TARGET = 'Launched Price (USA)'

# If your file contains columns with encoded companies/processors like in prior notebook, include them.
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
if TARGET not in numeric_cols:
    raise KeyError(f"Target column '{TARGET}' not found as numeric column in dataframe")
feature_cols = [c for c in numeric_cols if c != TARGET]

# Minimal cleaning: drop rows with missing target, impute numeric features later in pipeline
df = df.dropna(subset=[TARGET]).reset_index(drop=True)
X = df[feature_cols]
y = df[TARGET]

print(f"Using {len(feature_cols)} features for modeling")


Loaded 923 rows and 9 columns
Using 8 features for modeling


In [43]:
df.head()

Unnamed: 0,RAM,ROM,Battery Capacity,Front Camera,Back Camera,Screen Size,Company_encoded,Processor_encoded,Launched Price (USA)
0,3.0,0.125,5.3,3.2,10.8,6.9,4.652936,6.907234,3.79
1,0.75,0.0625,5.0,0.8,5.0,6.52,4.652936,3.714551,1.39
2,1.5,0.125,4.2,1.2,4.8,6.1,9.854878,9.362926,10.99
3,3.0,0.25,4.5,3.2,5.0,6.7,5.098215,5.716301,5.49
4,3.0,0.25,4.5,1.6,5.0,6.7,4.652936,6.850996,7.49


In [44]:
# Train/test split
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Preprocessing pipeline: impute then scale (fit on train)
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

X_train_prep = numeric_pipeline.fit_transform(X_train)
X_test_prep = numeric_pipeline.transform(X_test)

print('Shapes:', X_train_prep.shape, X_test_prep.shape)


Shapes: (738, 8) (185, 8)


In [45]:
print("Current X columns:", X.columns.tolist())
print("Number of features:", X_train_prep.shape[1])

Current X columns: ['RAM', 'ROM', 'Battery Capacity', 'Front Camera', 'Back Camera', 'Screen Size', 'Company_encoded', 'Processor_encoded']
Number of features: 8


In [46]:
def evaluate_regression(true, pred):
    mae = mean_absolute_error(true, pred)
    rmse = np.sqrt(mean_squared_error(true, pred))
    r2 = r2_score(true, pred)
    return {'MAE': mae, 'RMSE': rmse, 'R2': r2}

def print_eval(name, true, pred):
    res = evaluate_regression(true, pred)
    print(f"{name}: MAE={res['MAE']:.3f}, RMSE={res['RMSE']:.3f}, R2={res['R2']:.3f}")
    return res

def _is_fitted(est):
    try:
        check_is_fitted(est)
        return True
    except (NotFittedError, AttributeError):
        return False

In [47]:
# Train models from scratch with new processed data
import os

print("="*60)
print("TRAINING MODELS WITH TARGET-ENCODED DATA")
print("="*60)
print("\nNote: Using fresh models because data format changed")
print("(Target Encoding instead of One-Hot + TF-IDF/PCA)")

# Initialize fresh models for REGRESSION (not classification!)
best_knn = KNeighborsRegressor(
    n_neighbors=7,
    weights='distance',
    metric='euclidean'
)

best_dt = DecisionTreeRegressor(
    max_depth=15,
    min_samples_split=10,
    min_samples_leaf=4,
    random_state=RANDOM_STATE
)

rf = RandomForestRegressor(
    n_estimators=200,
    max_depth=20,
    min_samples_split=5,
    min_samples_leaf=2,
    max_features='sqrt',
    bootstrap=True,
    random_state=RANDOM_STATE,
    n_jobs=-1
)

print("\n" + "="*60)
print("FITTING MODELS")
print("="*60)

# KNN needs scaled data
print("‚öôÔ∏è Fitting KNN with scaled data...")
best_knn.fit(X_train_prep, y_train)
print("‚úì KNN fitted")

# Decision Tree needs original data with feature names
print("‚öôÔ∏è Fitting Decision Tree with original data...")
best_dt.fit(X_train, y_train)
print("‚úì Decision Tree fitted")

# Random Forest needs original data with feature names
print("‚öôÔ∏è Fitting Random Forest with original data...")
rf.fit(X_train, y_train)
print("‚úì Random Forest fitted")

# Build pipelines that include the fitted numeric_pipeline (so preprocessing is saved together)
from sklearn.pipeline import Pipeline as SKPipeline
knn_pipe = SKPipeline([('preprocessor', numeric_pipeline), ('model', best_knn)])
dt_pipe  = SKPipeline([('preprocessor', numeric_pipeline), ('model', best_dt)])
rf_pipe  = SKPipeline([('preprocessor', numeric_pipeline), ('model', rf)])

print("\n" + "="*60)
print("All models trained and ready!")
print("="*60)

TRAINING MODELS WITH TARGET-ENCODED DATA

Note: Using fresh models because data format changed
(Target Encoding instead of One-Hot + TF-IDF/PCA)

FITTING MODELS
‚öôÔ∏è Fitting KNN with scaled data...
‚úì KNN fitted
‚öôÔ∏è Fitting Decision Tree with original data...
‚úì Decision Tree fitted
‚öôÔ∏è Fitting Random Forest with original data...


‚úì Random Forest fitted

All models trained and ready!


In [48]:
# First, let's check and debug the data shapes
print("Debug information:")
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"X_train_prep shape: {X_train_prep.shape}")
print(f"X_test_prep shape: {X_test_prep.shape}")
print("\nFeature columns:", X.columns.tolist())

# Evaluate models
results = {}

# KNN evaluation - KNN needs SCALED data (X_test_prep)
try:
    knn_pred = best_knn.predict(X_test_prep)
    # Remember: predictions are scaled /100, multiply by 100 to get actual price for metrics
    results['KNN'] = print_eval('KNN', y_test, knn_pred)
except Exception as e:
    print(f"‚ùå KNN prediction failed: {e}")
    import traceback
    traceback.print_exc()

# Decision Tree evaluation - Tree models need ORIGINAL data (X_test) with feature names
try:
    dt_pred = best_dt.predict(X_test)
    results['DecisionTree'] = print_eval('DecisionTree', y_test, dt_pred)
except Exception as e:
    print(f"‚ùå Decision Tree prediction failed: {e}")
    import traceback
    traceback.print_exc()

# Random Forest evaluation - Tree models need ORIGINAL data (X_test) with feature names
try:
    rf_pred = rf.predict(X_test)
    results['RandomForest'] = print_eval('RandomForest', y_test, rf_pred)
except Exception as e:
    print(f"‚ùå Random Forest prediction failed: {e}")
    import traceback
    traceback.print_exc()

# Display results
print("\n" + "="*60)
print("MODEL EVALUATION SUMMARY")
print("="*60)

if results:
    summary = pd.DataFrame(results).T
    display(summary)
    
    # Display best model
    if 'R2' in summary.columns:
        best_model = summary['R2'].idxmax()
        print(f"\nüèÜ Best Model: {best_model} (R¬≤ = {summary.loc[best_model, 'R2']:.4f})")
        
        print("\n" + "="*60)
        print("IMPORTANT NOTE:")
        print("="*60)
        print("Target variable (price) is scaled by /100")
        print("MAE and RMSE are in scaled units (0-20 range)")
        print("To get actual price error: multiply MAE/RMSE by 100")
        print(f"  Actual MAE for {best_model}: ${summary.loc[best_model, 'MAE'] * 100:.2f}")
        print(f"  Actual RMSE for {best_model}: ${summary.loc[best_model, 'RMSE'] * 100:.2f}")
    else:
        print("‚ö†Ô∏è No R2 metric found in results")
else:
    print("‚ùå No successful predictions. Please check the error messages above.")

Debug information:
X_train shape: (738, 8)
X_test shape: (185, 8)
X_train_prep shape: (738, 8)
X_test_prep shape: (185, 8)

Feature columns: ['RAM', 'ROM', 'Battery Capacity', 'Front Camera', 'Back Camera', 'Screen Size', 'Company_encoded', 'Processor_encoded']
KNN: MAE=0.952, RMSE=1.448, R2=0.848
DecisionTree: MAE=0.991, RMSE=1.395, R2=0.859
RandomForest: MAE=0.827, RMSE=1.165, R2=0.902

MODEL EVALUATION SUMMARY


Unnamed: 0,MAE,RMSE,R2
KNN,0.952484,1.448195,0.848378
DecisionTree,0.990656,1.394972,0.859317
RandomForest,0.827317,1.164834,0.901907



üèÜ Best Model: RandomForest (R¬≤ = 0.9019)

IMPORTANT NOTE:
Target variable (price) is scaled by /100
MAE and RMSE are in scaled units (0-20 range)
To get actual price error: multiply MAE/RMSE by 100
  Actual MAE for RandomForest: $82.73
  Actual RMSE for RandomForest: $116.48


In [49]:
# Save sklearn models with pkl
import pickle
from sklearn.pipeline import Pipeline

# T·∫°o pipeline ch·ª©a c·∫£ preprocessing ƒë√£ fit + model (ti·ªán cho inference)
knn_pipe = Pipeline([('preprocessor', numeric_pipeline), ('model', best_knn)])
dt_pipe  = Pipeline([('preprocessor', numeric_pipeline), ('model', best_dt)])
rf_pipe  = Pipeline([('preprocessor', numeric_pipeline), ('model', rf)])

# L∆∞u b·∫±ng pickle (.pkl) 
with open('knn_model.pkl', 'wb') as f:
    pickle.dump(knn_pipe, f)
with open('decision_tree_model.pkl', 'wb') as f:
    pickle.dump(dt_pipe, f)
with open('random_forest_model.pkl', 'wb') as f:
    pickle.dump(rf_pipe, f)

print('Saved: knn_model.pkl, decision_tree_model.pkl, random_forest_model.pkl,')
print('       knn_model.joblib, decision_tree_model.joblib, random_forest_model.joblib,')


Saved: knn_model.pkl, decision_tree_model.pkl, random_forest_model.pkl,
       knn_model.joblib, decision_tree_model.joblib, random_forest_model.joblib,


In [50]:
import pandas as pd
import numpy as np
import pickle
import re

class MobilePreprocessor:
    def __init__(self):
        # Features for NEW Target Encoding approach (8 features)
        self.numeric_features = [
            'RAM', 'ROM', 'Battery Capacity', 
            'Front Camera', 'Back Camera', 'Screen Size'
        ]
        self.encoded_features = ['Company_encoded', 'Processor_encoded']
        
        self.target_encoder = None
        self.top_companies = ['Apple', 'Samsung', 'Oppo', 'Honor', 'Vivo']

    def clean_numeric(self, value, remove_str="", round_to_int=False):
        """Extract numeric value from string, supporting decimals"""
        try:
            value = str(value).replace(remove_str, "").replace(",", "").strip()
            # Match integers or floats (e.g., "7.6", "12", "48.5")
            match = re.search(r'\d+\.?\d*', value)
            if match:
                num = float(match.group())
                return int(num) if round_to_int else num
            return np.nan
        except:
            return np.nan
    
    def extract_rom(self, model_name):
        """Extract ROM from model name (e.g., '256GB' or '1TB')"""
        model_name = str(model_name).upper()
        match_tb = re.search(r'(\d+)TB', model_name)
        if match_tb:
            return float(match_tb.group(1))  # Keep in TB
        match_gb = re.search(r'(\d+)GB', model_name)
        if match_gb:
            return float(match_gb.group(1)) / 1024  # Convert GB to TB
        return np.nan

    def preprocess_input(self, phone_data):
        """
        Preprocess phone data using Target Encoding approach
        Input: DataFrame with columns like 'RAM', 'ROM', 'Company Name', 'Processor', etc.
        Output: DataFrame with 8 processed features ready for model prediction
        """
        processed = pd.DataFrame(index=[0])
        
        # Process numeric features with SAME scaling as training
        # RAM: Remove 'GB', divide by 4
        if 'RAM' in phone_data.columns:
            val = self.clean_numeric(phone_data['RAM'].iloc[0], 'GB', round_to_int=True)
            processed['RAM'] = (val / 4) if not pd.isna(val) else 0
        
        # ROM: Extract from 'Model Name' or 'ROM', convert to TB
        if 'Model Name' in phone_data.columns:
            val = self.extract_rom(phone_data['Model Name'].iloc[0])
        elif 'ROM' in phone_data.columns:
            val = self.clean_numeric(phone_data['ROM'].iloc[0], 'GB')
            if not pd.isna(val):
                val = val / 1024  # GB to TB
        else:
            val = np.nan
        processed['ROM'] = val if not pd.isna(val) else 0.256  # Default 256GB
        
        # Battery Capacity: Remove 'mAh', divide by 1000
        if 'Battery Capacity' in phone_data.columns:
            val = self.clean_numeric(phone_data['Battery Capacity'].iloc[0], 'mAh', round_to_int=True)
            processed['Battery Capacity'] = (val / 1000) if not pd.isna(val) else 0
        
        # Front Camera: Remove 'MP', divide by 10
        if 'Front Camera' in phone_data.columns:
            val = self.clean_numeric(phone_data['Front Camera'].iloc[0], 'MP')
            processed['Front Camera'] = (val / 10) if not pd.isna(val) else 0
        
        # Back Camera: Remove 'MP', divide by 10
        if 'Back Camera' in phone_data.columns:
            val = self.clean_numeric(phone_data['Back Camera'].iloc[0], 'MP')
            processed['Back Camera'] = (val / 10) if not pd.isna(val) else 0
        
        # Screen Size: Remove 'inches', keep as-is
        if 'Screen Size' in phone_data.columns:
            val = self.clean_numeric(phone_data['Screen Size'].iloc[0], 'inches')
            processed['Screen Size'] = val if not pd.isna(val) else 0
        
        # Prepare Company and Processor for Target Encoding
        categorical_data = pd.DataFrame(index=[0])
        
        # Simplify Company Name (same logic as training)
        if 'Company Name' in phone_data.columns:
            company = phone_data['Company Name'].iloc[0]
            categorical_data['Company'] = company if company in self.top_companies else 'Other'
        else:
            categorical_data['Company'] = 'Other'
        
        # Processor
        if 'Processor' in phone_data.columns:
            categorical_data['Processor'] = phone_data['Processor'].iloc[0]
        else:
            categorical_data['Processor'] = 'Unknown'
        
        # Apply Target Encoding using fitted encoder
        if self.target_encoder is not None:
            encoded = self.target_encoder.transform(categorical_data)
            processed['Company_encoded'] = encoded['Company_encoded'].values[0]
            processed['Processor_encoded'] = encoded['Processor_encoded'].values[0]
        else:
            # Fallback if encoder not loaded
            processed['Company_encoded'] = 5.0  # Median value
            processed['Processor_encoded'] = 5.0
        
        # Ensure correct column order (8 features)
        feature_order = self.numeric_features + self.encoded_features
        return processed[feature_order]

    def load_preprocessor(self):
        """Load saved Target Encoder"""
        try:
            with open('target_encoder_fitted.pkl', 'rb') as f:
                self.target_encoder = pickle.load(f)
            print("‚úì Loaded Target Encoder (target_encoder_fitted.pkl)")
            return True
        except Exception as e:
            print(f"‚ùå Error loading target encoder: {e}")
            print("‚ö†Ô∏è Will use fallback encoding values")
            return False


def predict_price(phone_data, model_path='random_forest_model.pkl'):
    """
    Predict phone price using Target Encoding and saved model
    
    Args:
        phone_data: DataFrame with phone specifications
        model_path: Path to saved model pickle file (default: random_forest_model.pkl)
        
    Returns:
        Predicted price in USD, or None if prediction fails
    """
    print("="*60)
    print("PHONE SPECIFICATIONS:")
    print("="*60)
    for col, val in phone_data.items():
        print(f"  {col}: {val[0]}")
    
    # Initialize preprocessor
    preprocessor = MobilePreprocessor()
    preprocessor.load_preprocessor()  # Load target encoder
    
    # Preprocess input data
    processed_data = preprocessor.preprocess_input(phone_data)
    
    print("\n" + "="*60)
    print("PROCESSED FEATURES:")
    print("="*60)
    print(processed_data.to_string())
    
    # Load and predict with model
    try:
        with open(model_path, 'rb') as f:
            model_pipeline = pickle.load(f)
        
        # Model pipeline contains: preprocessor (scaler) + model
        # We pass the already-processed data (8 features)
        prediction = model_pipeline.predict(processed_data)
        
        # IMPORTANT: Prediction is in scaled units (divided by 100)
        # Multiply by 100 to get actual price in USD
        predicted_price = float(prediction[0]) * 100
        
        return predicted_price
        
    except Exception as e:
        print(f"\n‚ùå Prediction failed: {e}")
        import traceback
        traceback.print_exc()
        return None


if __name__ == "__main__":
    print("\n" + "="*60)
    print("MOBILE PRICE PREDICTION - TARGET ENCODING VERSION")
    print("="*60)
    
    # Test case: Samsung Galaxy Z Fold 6
    phone_data = pd.DataFrame({
        'Model Name': ['Galaxy Z Fold 6 256GB'],  # ROM extracted from here
        'RAM': ['12GB'],
        'Front Camera': ['10MP'],
        'Back Camera': ['50MP'],
        'Battery Capacity': ['4400mAh'],
        'Screen Size': ['7.6 inches'],
        'Company Name': ['Samsung'],
        'Processor': ['Snapdragon 8 Gen 3']
    })
    
    # Predict using Random Forest (best model with R¬≤=0.902)
    predicted_price = predict_price(phone_data, model_path='random_forest_model.pkl')
    
    if predicted_price:
        print("\n" + "="*60)
        print(f"üí∞ PREDICTED PRICE: ${predicted_price:.2f}")
        print("="*60)
        print(f"Model: Random Forest (R¬≤ = 0.902)")
        print(f"MAE: $82.73 | RMSE: $116.48")
    
    # Test case 2: iPhone example
    print("\n" + "="*60)
    print("TEST CASE 2: iPhone 15 Pro Max")
    print("="*60)
    
    phone_data_2 = pd.DataFrame({
        'Model Name': ['iPhone 15 Pro Max 512GB'],
        'RAM': ['8GB'],
        'Front Camera': ['12MP'],
        'Back Camera': ['48MP'],
        'Battery Capacity': ['4685mAh'],
        'Screen Size': ['6.9 inches'],
        'Company Name': ['Apple'],
        'Processor': ['Apple A18 Pro']
    })
    
    predicted_price_2 = predict_price(phone_data_2, model_path='random_forest_model.pkl')
    
    if predicted_price_2:
        print("\n" + "="*60)
        print(f"üí∞ PREDICTED PRICE: ${predicted_price_2:.2f}")
        print("="*60)



MOBILE PRICE PREDICTION - TARGET ENCODING VERSION
PHONE SPECIFICATIONS:
  Model Name: Galaxy Z Fold 6 256GB
  RAM: 12GB
  Front Camera: 10MP
  Back Camera: 50MP
  Battery Capacity: 4400mAh
  Screen Size: 7.6 inches
  Company Name: Samsung
  Processor: Snapdragon 8 Gen 3
‚úì Loaded Target Encoder (target_encoder_fitted.pkl)

PROCESSED FEATURES:
   RAM   ROM  Battery Capacity  Front Camera  Back Camera  Screen Size  Company_encoded  Processor_encoded
0  3.0  0.25               4.4           1.0          5.0          7.6         7.143009            9.29864

üí∞ PREDICTED PRICE: $416.44
Model: Random Forest (R¬≤ = 0.902)
MAE: $82.73 | RMSE: $116.48

TEST CASE 2: iPhone 15 Pro Max
PHONE SPECIFICATIONS:
  Model Name: iPhone 15 Pro Max 512GB
  RAM: 8GB
  Front Camera: 12MP
  Back Camera: 48MP
  Battery Capacity: 4685mAh
  Screen Size: 6.9 inches
  Company Name: Apple
  Processor: Apple A18 Pro
‚úì Loaded Target Encoder (target_encoder_fitted.pkl)

üí∞ PREDICTED PRICE: $416.44
Model: Random




PROCESSED FEATURES:
   RAM  ROM  Battery Capacity  Front Camera  Back Camera  Screen Size  Company_encoded  Processor_encoded
0  2.0  0.5             4.685           1.2          4.8          6.9         9.854878           5.684192

üí∞ PREDICTED PRICE: $517.15

üí∞ PREDICTED PRICE: $517.15


Notes:
- Inspect feature importance from RandomForest via rf.feature_importances_ if desired.
- Further tuning (e.g., RandomizedSearchCV), target transformation (log), or categorical encoding may improve performance.
- Adjust dataset path and feature selection according to your processed CSV structure.