# Modeling: KNN, Decision Tree, Random Forest, Neural Network

In [327]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
import pickle
from sklearn.neighbors import KNeighborsRegressor
from sklearn.utils.validation import check_is_fitted
from sklearn.exceptions import NotFittedError


sns.set(style='whitegrid')


In [328]:
# Load data
FILE_PATH = "/media/hoang/HDD_Code/Tài liệu học tập/Kỳ 1 năm 4/Khai phá dữ liệu/mobiles_dataset_2025_clustered_labeled.csv"
df = pd.read_csv(FILE_PATH)
print(f"Loaded {len(df)} rows and {len(df.columns)} columns")

# Target and feature selection
TARGET = 'Launched Price (USA)'

# If your file contains columns with encoded companies/processors like in prior notebook, include them.
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
if TARGET not in numeric_cols:
    raise KeyError(f"Target column '{TARGET}' not found as numeric column in dataframe")
feature_cols = [c for c in numeric_cols if c != TARGET]

# Minimal cleaning: drop rows with missing target, impute numeric features later in pipeline
df = df.dropna(subset=[TARGET]).reset_index(drop=True)
X = df[feature_cols]
y = df[TARGET]

print(f"Using {len(feature_cols)} features for modeling")


Loaded 908 rows and 17 columns
Using 16 features for modeling


In [329]:
print("Current X columns:", X.columns.tolist())
print("Number of features:", X_train_prep.shape[1])

Current X columns: ['RAM', 'Front Camera', 'Back Camera', 'Battery Capacity', 'Screen Size', 'ROM', 'Company_Apple', 'Company_Honor', 'Company_Oppo', 'Company_Other', 'Company_Samsung', 'Company_Vivo', 'Processor_vec1', 'Processor_vec2', 'Processor_vec3', 'Cluster']
Number of features: 16


In [330]:
# Train/test split
RANDOM_STATE = 42
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)

# Preprocessing pipeline: impute then scale (fit on train)
numeric_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

X_train_prep = numeric_pipeline.fit_transform(X_train)
X_test_prep = numeric_pipeline.transform(X_test)

print('Shapes:', X_train_prep.shape, X_test_prep.shape)


Shapes: (726, 16) (182, 16)


In [331]:
def evaluate_regression(true, pred):
    mae = mean_absolute_error(true, pred)
    rmse = np.sqrt(mean_squared_error(true, pred))
    r2 = r2_score(true, pred)
    return {'MAE': mae, 'RMSE': rmse, 'R2': r2}

def print_eval(name, true, pred):
    res = evaluate_regression(true, pred)
    print(f"{name}: MAE={res['MAE']:.3f}, RMSE={res['RMSE']:.3f}, R2={res['R2']:.3f}")
    return res

def _is_fitted(est):
    try:
        check_is_fitted(est)
        return True
    except (NotFittedError, AttributeError):
        return False

In [332]:

# Ensure estimator objects exist
# train with knn
if 'best_knn' in globals() and isinstance(best_knn, int):
    best_knn = KNeighborsRegressor(n_neighbors=best_knn)
elif 'best_knn' not in globals():
    best_knn = KNeighborsRegressor(n_neighbors=5)

# train with decision tree
if 'best_dt' not in globals():
    best_dt = DecisionTreeRegressor(random_state=RANDOM_STATE)
# train with random forest
if 'rf' not in globals():
    rf = RandomForestRegressor(n_estimators=100, random_state=RANDOM_STATE, n_jobs=-1)

# Fit if needed (use train data already prepared)
for name, est in [('best_knn', best_knn), ('best_dt', best_dt), ('rf', rf)]:
    if not _is_fitted(est):
        print(f"{name} not fitted -> fitting now")
        est.fit(X_train_prep, y_train)
    else:
        print(f"{name} already fitted")
# Build pipelines that include the fitted numeric_pipeline (so preprocessing is saved together)
from sklearn.pipeline import Pipeline as SKPipeline
knn_pipe = SKPipeline([('preprocessor', numeric_pipeline), ('model', best_knn)])
dt_pipe  = SKPipeline([('preprocessor', numeric_pipeline), ('model', best_dt)])
rf_pipe  = SKPipeline([('preprocessor', numeric_pipeline), ('model', rf)])


best_knn already fitted
best_dt already fitted
rf already fitted


In [333]:
# First, let's check and debug the data shapes
print("Debug information:")
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"X_train_prep shape: {X_train_prep.shape}")
print(f"X_test_prep shape: {X_test_prep.shape}")
print("\nFeature columns:", X.columns.tolist())
# Evaluate models
results = {}

# KNN evaluation
try:
    knn_pred = best_knn.predict(X_test_prep)
    results['KNN'] = print_eval('KNN', y_test, knn_pred)
except Exception as e:
    print(f"KNN prediction failed: {e}")

# Decision Tree evaluation    
try:
    dt_pred = best_dt.predict(X_test_prep)
    results['DecisionTree'] = print_eval('DecisionTree', y_test, dt_pred)
except Exception as e:
    print(f"Decision Tree prediction failed: {e}")

# Random Forest evaluation
try:
    rf_pred = rf.predict(X_test_prep)
    results['RandomForest'] = print_eval('RandomForest', y_test, rf_pred)
except Exception as e:
    print(f"Random Forest prediction failed: {e}")

# Display results
summary = pd.DataFrame(results).T
display(summary)

Debug information:
X_train shape: (726, 16)
X_test shape: (182, 16)
X_train_prep shape: (726, 16)
X_test_prep shape: (182, 16)

Feature columns: ['RAM', 'Front Camera', 'Back Camera', 'Battery Capacity', 'Screen Size', 'ROM', 'Company_Apple', 'Company_Honor', 'Company_Oppo', 'Company_Other', 'Company_Samsung', 'Company_Vivo', 'Processor_vec1', 'Processor_vec2', 'Processor_vec3', 'Cluster']
KNN prediction failed: X has 16 features, but KNeighborsRegressor is expecting 15 features as input.
Decision Tree prediction failed: X has 16 features, but DecisionTreeRegressor is expecting 15 features as input.
Random Forest prediction failed: X has 16 features, but RandomForestRegressor is expecting 15 features as input.


In [334]:
# Save sklearn models with pkl
import pickle
from sklearn.pipeline import Pipeline

# Tạo pipeline chứa cả preprocessing đã fit + model (tiện cho inference)
knn_pipe = Pipeline([('preprocessor', numeric_pipeline), ('model', best_knn)])
dt_pipe  = Pipeline([('preprocessor', numeric_pipeline), ('model', best_dt)])
rf_pipe  = Pipeline([('preprocessor', numeric_pipeline), ('model', rf)])

# Lưu bằng pickle (.pkl) 
with open('knn_model.pkl', 'wb') as f:
    pickle.dump(knn_pipe, f)
with open('decision_tree_model.pkl', 'wb') as f:
    pickle.dump(dt_pipe, f)
with open('random_forest_model.pkl', 'wb') as f:
    pickle.dump(rf_pipe, f)

print('Saved: knn_model.pkl, decision_tree_model.pkl, random_forest_model.pkl,')
print('       knn_model.joblib, decision_tree_model.joblib, random_forest_model.joblib,')
print('       neural_net_model.h5')


Saved: knn_model.pkl, decision_tree_model.pkl, random_forest_model.pkl,
       knn_model.joblib, decision_tree_model.joblib, random_forest_model.joblib,
       neural_net_model.h5


Notes:
- Inspect feature importance from RandomForest via rf.feature_importances_ if desired.
- Further tuning (e.g., RandomizedSearchCV), target transformation (log), or categorical encoding may improve performance.
- Adjust dataset path and feature selection according to your processed CSV structure.