<a href="https://colab.research.google.com/github/naufalfrdss/DeepLearning/blob/main/UAS/Chapter%202/End_to_End_ML_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score


Load Dataset

In [2]:
# Load California housing dataset dari URL
housing = pd.read_csv("https://raw.githubusercontent.com/ageron/handson-ml2/master/datasets/housing/housing.csv")

# Tampilkan 5 baris pertama
housing.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,NEAR BAY
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,NEAR BAY
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,NEAR BAY
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,NEAR BAY
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,NEAR BAY


Pembuatan kolom kategori

In [3]:
# Buat kolom baru untuk mengelompokkan pendapatan ke dalam kategori
housing["income_cat"] = pd.cut(housing["median_income"],
                               bins=[0., 1.5, 3.0, 4.5, 6., np.inf],
                               labels=[1, 2, 3, 4, 5])

Split Data

In [4]:
from sklearn.model_selection import StratifiedShuffleSplit

# Lakukan pembagian data stratifikasi berdasarkan income_cat
split = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

for train_idx, test_idx in split.split(housing, housing["income_cat"]):
    strat_train_set = housing.loc[train_idx]
    strat_test_set = housing.loc[test_idx]

Memisahkan fitur dan label

In [5]:
# Pisahkan label (target) dari fitur
housing = strat_train_set.drop("median_house_value", axis=1)      # Fitur
housing_labels = strat_train_set["median_house_value"].copy()     # Label

Preprocessing Pipeline Untuk fitur numerik

In [6]:
# Pipeline untuk fitur numerik
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy="median")),   # Isi nilai kosong dengan median
    ('std_scaler', StandardScaler()),                # Standardisasi fitur numerik (mean=0, std=1)
])

Gabung Fitur numerik dan kategorikal

In [7]:
# Tentukan fitur numerik dan kategorikal
num_attribs = housing.drop("ocean_proximity", axis=1).columns
cat_attribs = ["ocean_proximity"]

# ColumnTransformer: gabungkan pipeline numerik & kategorikal
full_pipeline = ColumnTransformer([
    ("num", num_pipeline, num_attribs),
    ("cat", OneHotEncoder(), cat_attribs),
])


transformasi data fitur

In [8]:
# Terapkan pipeline preprocessing ke data training
housing_prepared = full_pipeline.fit_transform(housing)

Train model regresi linear

In [9]:
# Buat dan latih model regresi linear
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

Evaluasi Model

In [10]:
# Prediksi dan hitung RMSE (Root Mean Squared Error)
housing_predictions = lin_reg.predict(housing_prepared)
lin_mse = mean_squared_error(housing_labels, housing_predictions)
lin_rmse = np.sqrt(lin_mse)

print("Linear Regression RMSE:", lin_rmse)

Linear Regression RMSE: 68866.78550087014


Coba Random Forest

In [16]:
# Buat dan latih model Random Forest
forest_reg = RandomForestRegressor(n_estimators=100, random_state=42)
forest_reg.fit(housing_prepared, housing_labels)

# Evaluasi model
forest_preds = forest_reg.predict(housing_prepared)
forest_rmse = np.sqrt(mean_squared_error(housing_labels, forest_preds))
forest_r2 = r2_score(housing_labels, forest_preds)

print("Random Forest RMSE:", forest_rmse)
print("Random Forest R2:", forest_r2)

Random Forest RMSE: 18337.634681213494
Random Forest R2: 0.9748790257114038
