# Modeling: KNN, Decision Tree, Random Forest, Neural Network

In [165]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
import pickle
from sklearn.neighbors import KNeighborsRegressor
from sklearn.utils.validation import check_is_fitted
from sklearn.exceptions import NotFittedError


sns.set(style='whitegrid')


In [166]:
# Load data
FILE_PATH = "mobiles_dataset_2025_processed.csv"
df = pd.read_csv(FILE_PATH)
print(f"Loaded {len(df)} rows and {len(df.columns)} columns")

# Target and feature selection
TARGET = 'Launched Price (USA)'

# If your file contains columns with encoded companies/processors like in prior notebook, include them.
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
if TARGET not in numeric_cols:
    raise KeyError(f"Target column '{TARGET}' not found as numeric column in dataframe")
feature_cols = [c for c in numeric_cols if c != TARGET]

# Minimal cleaning: drop rows with missing target, impute numeric features later in pipeline
df = df.dropna(subset=[TARGET]).reset_index(drop=True)
X = df[feature_cols]
y = df[TARGET]

print(f"Using {len(feature_cols)} features for modeling")


Loaded 908 rows and 16 columns
Using 15 features for modeling


In [167]:
# Train/test split
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Preprocessing pipeline: impute then scale (fit on train)
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

X_train_prep = numeric_pipeline.fit_transform(X_train)
X_test_prep = numeric_pipeline.transform(X_test)

print('Shapes:', X_train_prep.shape, X_test_prep.shape)


Shapes: (726, 15) (182, 15)


In [168]:
def evaluate_regression(true, pred):
    mae = mean_absolute_error(true, pred)
    rmse = np.sqrt(mean_squared_error(true, pred))
    r2 = r2_score(true, pred)
    return {'MAE': mae, 'RMSE': rmse, 'R2': r2}

def print_eval(name, true, pred):
    res = evaluate_regression(true, pred)
    print(f"{name}: MAE={res['MAE']:.3f}, RMSE={res['RMSE']:.3f}, R2={res['R2']:.3f}")
    return res

def _is_fitted(est):
    try:
        check_is_fitted(est)
        return True
    except (NotFittedError, AttributeError):
        return False

In [169]:


# Ensure estimator objects exist
if 'best_knn' in globals() and isinstance(best_knn, int):
    best_knn = KNeighborsRegressor(n_neighbors=best_knn)
elif 'best_knn' not in globals():
    best_knn = KNeighborsRegressor(n_neighbors=5)

if 'best_dt' not in globals():
    best_dt = DecisionTreeRegressor(random_state=RANDOM_STATE)

if 'rf' not in globals():
    rf = RandomForestRegressor(n_estimators=100, random_state=RANDOM_STATE, n_jobs=-1)

# Fit if needed (use train data already prepared)
for name, est in [('best_knn', best_knn), ('best_dt', best_dt), ('rf', rf)]:
    if not _is_fitted(est):
        print(f"{name} not fitted -> fitting now")
        est.fit(X_train_prep, y_train)
    else:
        print(f"{name} already fitted")
# Build pipelines that include the fitted numeric_pipeline (so preprocessing is saved together)
from sklearn.pipeline import Pipeline as SKPipeline
knn_pipe = SKPipeline([('preprocessor', numeric_pipeline), ('model', best_knn)])
dt_pipe  = SKPipeline([('preprocessor', numeric_pipeline), ('model', best_dt)])
rf_pipe  = SKPipeline([('preprocessor', numeric_pipeline), ('model', rf)])


best_knn already fitted
best_dt already fitted
rf already fitted


In [170]:

# Evaluate on test set (use X_test_prep which is preprocessed)
results = {}
knn_pred = best_knn.predict(X_test_prep)
results['KNN'] = print_eval('KNN', y_test, knn_pred)

dt_pred = best_dt.predict(X_test_prep)
results['DecisionTree'] = print_eval('DecisionTree', y_test, dt_pred)

rf_pred = rf.predict(X_test_prep)
results['RandomForest'] = print_eval('RandomForest', y_test, rf_pred)

summary = pd.DataFrame(results).T
display(summary)

KNN: MAE=1.101, RMSE=1.640, R2=0.838
DecisionTree: MAE=1.035, RMSE=1.574, R2=0.851
RandomForest: MAE=0.957, RMSE=1.356, R2=0.889


Unnamed: 0,MAE,RMSE,R2
KNN,1.100837,1.640172,0.838063
DecisionTree,1.035423,1.574037,0.850859
RandomForest,0.956592,1.356167,0.889288


In [171]:
# Save sklearn models with pkl
import pickle
from sklearn.pipeline import Pipeline

# Tạo pipeline chứa cả preprocessing đã fit + model (tiện cho inference)
knn_pipe = Pipeline([('preprocessor', numeric_pipeline), ('model', best_knn)])
dt_pipe  = Pipeline([('preprocessor', numeric_pipeline), ('model', best_dt)])
rf_pipe  = Pipeline([('preprocessor', numeric_pipeline), ('model', rf)])

# Lưu bằng pickle (.pkl) 
with open('knn_model.pkl', 'wb') as f:
    pickle.dump(knn_pipe, f)
with open('decision_tree_model.pkl', 'wb') as f:
    pickle.dump(dt_pipe, f)
with open('random_forest_model.pkl', 'wb') as f:
    pickle.dump(rf_pipe, f)

print('Saved: knn_model.pkl, decision_tree_model.pkl, random_forest_model.pkl,')
print('       knn_model.joblib, decision_tree_model.joblib, random_forest_model.joblib,')
print('       neural_net_model.h5')


Saved: knn_model.pkl, decision_tree_model.pkl, random_forest_model.pkl,
       knn_model.joblib, decision_tree_model.joblib, random_forest_model.joblib,
       neural_net_model.h5


In [172]:
# ...existing code...
import re, os, pickle, joblib

# 1) Tạo DataFrame input
new_phone = pd.DataFrame([{
    'Company Name': 'Oppo',
    'RAM': '12GB',
    'ROM': '256GB',
    'Front Camera': '50MP',
    'Back Camera': '50MP',
    'Battery Capacity': '5000mAh',
    'Screen Size': '6.9 inches',
    'Processor': 'Snapdragon 8 gen 2'
}])

# 2) Tiền xử lý nhỏ giống notebook
def clean_numeric(series, remove_str="", round_to_int=False):
    s = (series.astype(str)
         .str.replace(remove_str, "", regex=False)
         .str.replace("Not available", "", regex=False)
         .str.replace(",", "", regex=False)
         .str.extract(r"(\d+\.?\d*)")[0])
    out = pd.to_numeric(s, errors="coerce")
    if round_to_int:
        out = out.round().astype("Int64")
    return out

def extract_rom(model_name):
    mn = str(model_name).upper()
    m_tb = re.search(r'(\d+)TB', mn)
    if m_tb:
        return float(m_tb.group(1)) / 64 * 1024
    m_gb = re.search(r'(\d+)GB', mn)
    if m_gb:
        return float(m_gb.group(1)) / 64
    return np.nan

new_phone["RAM"] = clean_numeric(new_phone["RAM"], remove_str="GB", round_to_int=True)
new_phone['ROM'] = clean_numeric(new_phone["ROM"], remove_str="GB", round_to_int=True)
new_phone["Front Camera"] = clean_numeric(new_phone["Front Camera"], remove_str="MP") / 10
new_phone["Back Camera"]  = clean_numeric(new_phone["Back Camera"], remove_str="MP") / 10
new_phone["Battery Capacity"] = clean_numeric(new_phone["Battery Capacity"], remove_str="mAh", round_to_int=True) / 1000
new_phone["Screen Size"] = clean_numeric(new_phone["Screen Size"], remove_str="inches")

# Company dummy (safest: keep the value and create dummies later based on model's expected cols)
company_value = new_phone.loc[0, "Company Name"]

# 3) Load processor vectorizer + pca if available
proc_vec = None
proc_cols = []
if os.path.exists('processor_vectorizer.pkl') and os.path.exists('processor_pca.pkl'):
    with open('processor_vectorizer.pkl','rb') as f:
        vectorizer = pickle.load(f)
    with open('processor_pca.pkl','rb') as f:
        pca = pickle.load(f)
    try:
        tf = vectorizer.transform(new_phone["Processor"])
        pv = pca.transform(tf.toarray())
        proc_df = pd.DataFrame(pv, index=new_phone.index,
                               columns=[f"Processor_vec{i+1}" for i in range(pv.shape[1])])
        new_phone = pd.concat([new_phone.reset_index(drop=True), proc_df.reset_index(drop=True)], axis=1)
        proc_cols = proc_df.columns.tolist()
    except Exception as e:
        print("Processor transform failed:", e)

# 4) Load model (try best_model first)
model = None
for fname in ['knn_model.pkl','decision_tree_model.pkl','random_forest_model.pkl']:
    if os.path.exists(fname):
        try:
            model = joblib.load(fname) if fname.endswith('.joblib') else pickle.load(open(fname,'rb'))
            print("Loaded model from", fname)
            break
        except Exception as e:
            print("Failed loading", fname, ":", e)
if model is None:
    raise FileNotFoundError("Không tìm thấy model .pkl/.joblib")

# 5) Determine required feature names (prefer preprocessor/imputer feature_names_in_ if pipeline)
required_cols = None
if hasattr(model, 'named_steps') and 'preprocessor' in model.named_steps:
    pre = model.named_steps['preprocessor']
    # pre may be a Pipeline where imputer has feature_names_in_
    if hasattr(pre, 'feature_names_in_'):
        required_cols = list(pre.feature_names_in_)
    else:
        # try to get imputer inside pre
        try:
            imputer = pre.named_steps.get('imputer', None)
            if imputer is not None and hasattr(imputer, 'feature_names_in_'):
                required_cols = list(imputer.feature_names_in_)
        except Exception:
            required_cols = None

if required_cols is None:
    if hasattr(model, 'feature_names_in_'):
        required_cols = list(model.feature_names_in_)
    else:
        # fallback: use columns in training X if you stored them somewhere (not available) -> use new_phone cols
        required_cols = list(new_phone.columns)

# 6) Build aligned single-row DataFrame X_in with required_cols
X_in = pd.DataFrame(index=[0])
for col in required_cols:
    if col in new_phone.columns:
        X_in[col] = new_phone.loc[0, col]
    elif col.startswith("Company_"):
        # set company dummy: e.g. Company_Oppo -> 1 if company_value == "Oppo"
        comp_name = col.split("Company_")[-1]
        X_in[col] = 1 if comp_name == company_value else 0
    elif re.match(r'Processor_vec\d+', col):
        X_in[col] = new_phone.loc[0, col] if col in new_phone.columns else 0.0
    else:
        X_in[col] = 0.0

# Ensure column order matches required_cols
X_in = X_in.reindex(columns=required_cols, fill_value=0)

# 7) Predict (pipeline or estimator)
try:
    pred = model.predict(X_in)
except Exception as e:
    # if model is pipeline expecting DataFrame with original columns, try passing X_in.values
    pred = model.predict(X_in.values)
print("Giá dự đoán (raw):", pred[0])
try:
    print("≈ Giá (USD):", float(pred[0]) * 100)
except Exception:
    pass

Loaded model from knn_model.pkl
Giá dự đoán (raw): 11.656666666666666
≈ Giá (USD): 1165.6666666666665


Notes:
- Inspect feature importance from RandomForest via rf.feature_importances_ if desired.
- Further tuning (e.g., RandomizedSearchCV), target transformation (log), or categorical encoding may improve performance.
- Adjust dataset path and feature selection according to your processed CSV structure.