# Mount Google Drive dan load dataset

Hubungkan Colab ke Google Drive, baca file midterm-regresi-dataset.csv.

In [2]:
from google.colab import drive

import numpy as np
import pandas as pd

# Make the user's Google Drive visible under /content/drive.
drive.mount("/content/drive")

# CHANGE this path to wherever the file lives in your own Google Drive.
base_path = "/content/drive/MyDrive/DATASETML"
data_path = f"{base_path}/midterm-regresi-dataset.csv"

# The file is read with header=None: the first line is treated as data and
# the columns receive integer labels.
df = pd.read_csv(data_path, header=None)

print("Shape dataset:", df.shape)  # (row_count, column_count)
print("\n5 baris pertama:")
display(df.head())


Mounted at /content/drive
Shape dataset: (515345, 91)

5 baris pertama:


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,81,82,83,84,85,86,87,88,89,90
0,2001,49.94357,21.47114,73.0775,8.74861,-17.40628,-13.09905,-25.01202,-12.23257,7.83089,...,13.0162,-54.40548,58.99367,15.37344,1.11144,-23.08793,68.40795,-1.82223,-27.46348,2.26327
1,2001,48.73215,18.4293,70.32679,12.94636,-10.32437,-24.83777,8.7663,-0.92019,18.76548,...,5.66812,-19.68073,33.04964,42.87836,-9.90378,-32.22788,70.49388,12.04941,58.43453,26.92061
2,2001,50.95714,31.85602,55.81851,13.41693,-6.57898,-18.5494,-3.27872,-2.35035,16.07017,...,3.038,26.05866,-50.92779,10.93792,-0.07568,43.2013,-115.00698,-0.05859,39.67068,-0.66345
3,2001,48.2475,-1.89837,36.29772,2.58776,0.9717,-26.21683,5.05097,-10.34124,3.55005,...,34.57337,-171.70734,-16.96705,-46.67617,-12.51516,82.58061,-72.08993,9.90558,199.62971,18.85382
4,2001,50.9702,42.20998,67.09964,8.46791,-15.85279,-16.81409,-12.48207,-9.37636,12.63699,...,9.92661,-55.95724,64.92712,-17.72522,-1.49237,-7.50035,51.76631,7.88713,55.66926,28.74903


Kolom pertama di tiap baris adalah target (tahun rilis lagu).
Kolom sisanya adalah fitur numerik.

# Pisahkan target, cek missing, deskripsi data

In [3]:
# Split by position: column 0 is the target (year), every remaining column is a feature.
y, X = df.iloc[:, 0], df.iloc[:, 1:]

print("Shape fitur X :", X.shape)
print("Shape target y:", y.shape)

# Count missing values per feature column; only the head of the result is printed.
print("\nJumlah missing di tiap kolom (beberapa kolom pertama):")
missing_counts = X.isna().sum()
print(missing_counts.head())

# Basic statistics to inspect scale and outliers; transposed so each feature
# is one row, limited to the first 10 features.
print("\nDeskripsi singkat fitur:")
feature_summary = X.describe().T
display(feature_summary.head(10))

print("\nDeskripsi target (tahun rilis):")
display(y.describe())


Shape fitur X : (515345, 90)
Shape target y: (515345,)

Jumlah missing di tiap kolom (beberapa kolom pertama):
1    0
2    0
3    0
4    0
5    0
dtype: int64

Deskripsi singkat fitur:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
1,515345.0,43.387126,6.067558,1.749,39.95469,44.2585,47.83389,61.97014
2,515345.0,1.289554,51.580351,-337.0925,-26.05952,8.41785,36.12401,384.06573
3,515345.0,8.658347,35.268585,-301.00506,-11.46271,10.47632,29.76482,322.85143
4,515345.0,1.164124,16.32279,-154.18358,-8.4875,-0.65284,8.78754,335.77182
5,515345.0,-6.553601,22.860785,-181.95337,-20.66645,-6.00777,7.74187,262.06887
6,515345.0,-9.521975,12.857751,-81.79429,-18.44099,-11.18839,-2.38896,166.23689
7,515345.0,-2.391089,14.571873,-188.214,-10.7806,-2.04667,6.50858,172.40268
8,515345.0,-1.793236,7.963827,-72.50385,-6.46842,-1.73645,2.91345,126.74127
9,515345.0,3.727876,10.582861,-126.47904,-2.29366,3.82231,9.96182,146.29795
10,515345.0,1.882385,6.530232,-41.63166,-2.44485,1.78352,6.14722,60.34535



Deskripsi target (tahun rilis):


Unnamed: 0,0
count,515345.0
mean,1998.397082
std,10.931046
min,1922.0
25%,1994.0
50%,2002.0
75%,2006.0
max,2011.0


# Bagi train dan test (internal)

Kita tidak punya file test terpisah, jadi kita buat sendiri data test untuk evaluasi.

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as an internal test set; the fixed seed keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Jumlah data train:", len(X_train))
print("Jumlah data test :", len(X_test))


Jumlah data train: 412276
Jumlah data test : 103069


# Preprocessing numerik

In [5]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Every column of X is numeric, so a single pipeline handles all features
# (no separate categorical branch is needed).
numeric_preprocess = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),  # fill missing values with the column median
        ("scaler", StandardScaler()),  # standardise each feature
    ]
)

# numeric_preprocess is combined with an estimator inside each model pipeline.


# Fungsi evaluasi regresi

In [6]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

def evaluate_regression(name, model, X_test, y_test):
    """Score a fitted regressor on held-out data, print the metrics and return them.

    Args:
        name: Label printed in the header and stored under the "model" key.
        model: Object exposing ``predict(X_test)``.
        X_test: Features passed to ``model.predict``.
        y_test: True target values the predictions are compared against.

    Returns:
        dict with keys "model", "mse", "rmse", "mae" and "r2".
    """
    predictions = model.predict(X_test)

    mse = mean_squared_error(y_test, predictions)
    scores = {
        "model": name,
        "mse": mse,
        "rmse": np.sqrt(mse),  # RMSE is the square root of the MSE
        "mae": mean_absolute_error(y_test, predictions),
        "r2": r2_score(y_test, predictions),
    }

    print(f"\n=== Hasil model: {name} ===")
    report_rows = (
        ("MSE  :", "mse"),
        ("RMSE :", "rmse"),
        ("MAE  :", "mae"),
        ("R^2  :", "r2"),
    )
    for label, key in report_rows:
        print(label, scores[key])

    return scores


# Model 1: Linear Regression (baseline)

In [7]:
from sklearn.linear_model import LinearRegression

# Baseline model: linear regression on the imputed and standardised features.
linreg = LinearRegression()
linreg_pipe = Pipeline([("preprocess", numeric_preprocess), ("model", linreg)])

print("Training Linear Regression...")
linreg_pipe.fit(X_train, y_train)

# `results` collects one metrics dict per evaluated model; it is (re)created here.
results = []
res_lin = evaluate_regression("Linear Regression", linreg_pipe, X_test, y_test)
results += [res_lin]


Training Linear Regression...

=== Hasil model: Linear Regression ===
MSE  : 90.69347247878291
RMSE : 9.523312054048366
MAE  : 6.778168687522753
R^2  : 0.23796616215080735


# Model 2: RandomForestRegressor (baseline tree)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Untuned random forest as a tree-based reference point:
# 100 trees, no depth limit, fixed seed, n_jobs=-1 for parallel fitting.
rf_base = RandomForestRegressor(n_estimators=100, max_depth=None, n_jobs=-1, random_state=42)
rf_pipe = Pipeline([("preprocess", numeric_preprocess), ("model", rf_base)])

print("\nTraining RandomForest (baseline)...")
rf_pipe.fit(X_train, y_train)

# Score on the held-out test split and add the metrics to the shared results list.
res_rf_base = evaluate_regression("RandomForest (baseline)", rf_pipe, X_test, y_test)
results += [res_rf_base]



Training RandomForest (baseline)...


# Tuning sederhana hyperparameter RandomForest

In [None]:
from sklearn.base import clone
from sklearn.model_selection import GridSearchCV

# (Optional) run the search on a random subset of the training rows to keep it cheap.
MAX_TUNE_ROWS = 30000
if X_train.shape[0] > MAX_TUNE_ROWS:
    X_tune = X_train.sample(n=MAX_TUNE_ROWS, random_state=42)
    y_tune = y_train.loc[X_tune.index]  # targets for the sampled rows, matched by index label
    print("Tuning memakai subset:", X_tune.shape[0], "baris")
else:
    X_tune = X_train
    y_tune = y_train

rf_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
    n_jobs=-1
)

rf_tune_pipe = Pipeline(steps=[
    ("preprocess", numeric_preprocess),
    ("model", rf_model)
])

# The "model__" prefix routes each parameter to the "model" step of the pipeline.
param_grid = {
    "model__n_estimators": [100, 200],
    "model__max_depth": [None, 10, 20]
}

grid = GridSearchCV(
    estimator=rf_tune_pipe,
    param_grid=param_grid,
    cv=3,
    scoring="neg_root_mean_squared_error",  # RMSE, negated so that higher is better
    n_jobs=-1,
    verbose=2
)

print("\nMulai GridSearch RandomForest...")
grid.fit(X_tune, y_tune)

print("\nBest params:", grid.best_params_)
print("Best CV RMSE:", -grid.best_score_)  # flip the sign back to a positive RMSE

# grid.best_estimator_ is refit on X_tune only. When tuning used a subset,
# refit the winning configuration on the full training split so its test
# metrics are comparable with the other models, which were trained on all of
# X_train. clone() returns an unfitted copy with the same (best) parameters.
if X_tune is X_train:
    best_rf_pipe = grid.best_estimator_
else:
    print("\nTraining ulang RandomForest terbaik di seluruh data train...")
    best_rf_pipe = clone(grid.best_estimator_)
    best_rf_pipe.fit(X_train, y_train)

res_rf_tuned = evaluate_regression("RandomForest (tuned)", best_rf_pipe, X_test, y_test)
results.append(res_rf_tuned)

# One row per evaluated model, for a side-by-side comparison.
results_df = pd.DataFrame(results)
print("\nRingkasan semua model:")
display(results_df)


# Train final model dan simpan

In [None]:
# Pick the best model by RMSE or R^2 from results_df.
# Here RandomForest (tuned) is chosen as the final model.
from sklearn.base import clone
import joblib

# Fit an unfitted copy rather than best_rf_pipe itself: fitting the alias in
# place would overwrite the estimator whose test metrics were reported with one
# that has seen the test rows, so re-running the evaluation would be leaky.
final_model = clone(best_rf_pipe)

print("\nTraining final model di seluruh data...")
final_model.fit(X, y)

# Persist the fitted pipeline with joblib.
# NOTE(review): /content is presumably the Colab VM's local disk and may not
# survive a runtime reset; consider saving under the Drive folder instead.
joblib.dump(final_model, "/content/final_regression_model.joblib")
print("Model disimpan ke /content/final_regression_model.joblib")
