# Buat Modelling 

## Import Library

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


## Load dataset

In [2]:
df = pd.read_csv("../data/processed/laptops_clean.csv")
df_new = df.drop('original_price', axis=1)
df_new.head()

Unnamed: 0,id,name,price,discount_percentage,shop_name,shop_city,price_clean,ram_gb,storage_gb,storage_type,cpu_brand,cpu_model,cpu_series
0,102252755112,LAPTOP ACER ASPIRE LITE AL14-32P : INTEL N150/...,5349000,100.0,mdpsuperstore,Palembang,5349000,8.0,512.0,ssd,Intel,N150,N150
1,102231397091,Laptop Lenovo IdeaPad Slim 3 14IRH10 3NID Inte...,8599000,100.0,Starcomp Semarang,Semarang,8599000,8.0,512.0,ssd,Intel,I5-13420H,I5
2,102298229731,Laptop Lenovo IdeaPad Slim 3 14IAU7 GCID Intel...,5749000,100.0,Starcomp Semarang,Semarang,5749000,8.0,256.0,ssd,Intel,I3-1215U,I3
3,100780129365,Acer Laptop AL14-32P-C9VS Intel N150 8GB 512GB...,5099000,100.0,MyHartono,Surabaya,5099000,8.0,512.0,ssd,Intel,N150,N150
4,100780129080,Acer Laptop Nitro ANV16-71-79NR Intel Core i7-...,16499000,100.0,MyHartono,Surabaya,16499000,16.0,512.0,ssd,Intel,I7-14650HX,I7


## Selecting fitur

In [3]:
features = [
    'cpu_brand',
    'cpu_series',
    'ram_gb',
    'storage_gb',
]

target = 'price_clean'


## Split dataset into training and testing

In [4]:
X = df[features]
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


## Preprocessing Pipeline

In [5]:
categorical_cols = ['cpu_brand', 'cpu_series']
numeric_cols = ['ram_gb', 'storage_gb']

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('num', StandardScaler(), numeric_cols)
    ]
)

## Testing Model

### Linear Regression

In [6]:
model_lr = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('model', LinearRegression())
])

model_lr.fit(X_train, y_train)


0,1,2
,steps,"[('preprocess', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


#### Evaluation Result

In [7]:
y_pred_lr = model_lr.predict(X_test)

mae_lr = mean_absolute_error(y_test, y_pred_lr)
rmse_lr = mean_squared_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)

mae_lr, rmse_lr, r2_lr
print(f"MSE: {mae_lr}")
print(f"RMSE: {np.sqrt(mae_lr)}")
print(f"R2: {r2_lr}")


MSE: 2734240.911033965
RMSE: 1653.5540242259897
R2: 0.7168831767459516


### Random Forest

In [8]:
model_rf = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('model', RandomForestRegressor(
        n_estimators=300,
        random_state=42,
        n_jobs=-1
    ))
])

model_rf.fit(X_train, y_train)


0,1,2
,steps,"[('preprocess', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,n_estimators,300
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,1.0
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


#### Evaluation Result

In [9]:
y_pred_rf = model_rf.predict(X_test)

mae_rf = mean_absolute_error(y_test, y_pred_rf)
rmse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

mae_rf, rmse_rf, r2_rf
print(f"MSE: {mae_rf}")
print(f"RMSE: {np.sqrt(mae_rf)}")
print(f"R2: {r2_rf}")

MSE: 2453098.0734381685
RMSE: 1566.2369148497837
R2: 0.7594258777874845


### XG Boost

In [10]:
pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [11]:

from xgboost import XGBRegressor

model_xgb = Pipeline(steps=[
    ('preprocess', preprocessor),
    ('model', XGBRegressor(
        n_estimators=500,
        learning_rate=0.05,
        max_depth=7,
        subsample=0.9,
        colsample_bytree=0.9,
        random_state=42
    ))
])

model_xgb.fit(X_train, y_train)


0,1,2
,steps,"[('preprocess', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('cat', ...), ('num', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.9
,device,
,early_stopping_rounds,
,enable_categorical,False


#### Evaluation Result

In [12]:
y_pred_xgb = model_xgb.predict(X_test)

mae_xgb = mean_absolute_error(y_test, y_pred_xgb)
rmse_xgb = mean_squared_error(y_test, y_pred_xgb)
r2_xgb = r2_score(y_test, y_pred_xgb)

mae_xgb, rmse_xgb, r2_xgb
print(f"MSE: {mae_xgb}")
print(f"RMSE: {np.sqrt(mae_xgb)}")
print(f"R2: {r2_xgb}")

MSE: 2444831.25
RMSE: 1563.5956158802696
R2: 0.7609754800796509


## Bandingkan semua model yg telah digunakan

In [13]:
results = pd.DataFrame({
    "Model": ["Linear Regression", "Random Forest", "XGBoost"],
    "MAE": [mae_lr, mae_rf, mae_xgb],
    "RMSE": [rmse_lr, rmse_rf, rmse_xgb],
    "R2": [r2_lr, r2_rf, r2_xgb],
})

results

Unnamed: 0,Model,MAE,RMSE,R2
0,Linear Regression,2734241.0,15579700000000.0,0.716883
1,Random Forest,2453098.0,13238610000000.0,0.759426
2,XGBoost,2444831.0,13153340000000.0,0.760975
