In [None]:
import os
import warnings
warnings.filterwarnings('ignore')


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# ---------- Part A: Load the dataset and run a quick EDA ----------
# Path of the CSV file, relative to the notebook's working directory.
DATA_PATH = 'Life_Expectancy_WHO.csv'

# Check that the dataset exists before reading it, so a missing file produces
# an actionable message instead of a bare pandas traceback.
if not os.path.isfile(DATA_PATH):
    raise FileNotFoundError(
        f"Dataset tidak ditemukan: '{DATA_PATH}' (direktori kerja: {os.getcwd()}). "
        "Letakkan file CSV di direktori tersebut atau perbarui DATA_PATH."
    )

# Read the dataset (parsed as latin-1 text with ';' as the field separator).
df = pd.read_csv(DATA_PATH, encoding='latin1', sep=';')

print('Dataset shape:', df.shape)
print('Kolom:', list(df.columns))

# A single parsed column almost always means the separator does not match the
# file; warn instead of letting later cells fail with confusing errors.
if df.shape[1] == 1:
    print("Peringatan: hanya 1 kolom terbaca; periksa argumen sep pada pd.read_csv.")


# Show the first 5 rows
print(df.head())


# Quick EDA: missing values per column (top 20)
print('\nMissing per kolom:')
print(df.isnull().sum().sort_values(ascending=False).head(20))


print('\nDescriptive statistics untuk kolom numerik:')
print(df.describe().T)


# Simple visualisation of the life-expectancy distribution.
# Prefer the exact 'Life' column when present; otherwise fall back to the first
# column whose name mentions "life" or "expectancy".
hist_candidates = [
    c for c in df.columns
    if 'life' in str(c).lower() or 'expectancy' in str(c).lower()
]
if 'Life' in df.columns:
    hist_col = 'Life'
elif hist_candidates:
    hist_col = hist_candidates[0]
else:
    hist_col = None

if hist_col is None:
    print('Histogram dilewati: kolom life expectancy tidak ditemukan.')
else:
    try:
        fig, ax = plt.subplots(figsize=(8, 4))
        df[hist_col].hist(bins=30, ax=ax)
        ax.set_title('Distribusi Life Expectancy')
        ax.set_xlabel('Life expectancy')
        fig.tight_layout()
        plt.show()
    except Exception as exc:
        # Plotting is best-effort (e.g. headless run or non-numeric column):
        # report why it was skipped rather than hiding the failure.
        print(f"Histogram '{hist_col}' dilewati: {exc}")


# ---------- Part B: Preprocessing & Feature Engineering ----------
# The target column name may vary between copies of the WHO dataset, so collect
# every column whose name mentions "life" or "expectancy".
possible_targets = []
for column_name in df.columns:
    lowered_name = column_name.lower()
    if 'life' in lowered_name or 'expectancy' in lowered_name:
        possible_targets.append(column_name)
print('\nKemungkinan kolom target:', possible_targets)


# Without any candidate there is nothing to predict: stop here.
if not possible_targets:
    raise ValueError('Tidak menemukan kolom target Life Expectancy. Pastikan CSV yang digunakan benar.')


# The first candidate is used as the regression target.
target_col = possible_targets[0]
print('Menggunakan target:', target_col)


# Numeric columns are the candidate features for a simple regression;
# the target itself must not be one of them.
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
try:
    numeric_cols.remove(target_col)
except ValueError:
    # Target is not numeric, so it was never in the list.
    pass


print('\nKolom numerik yang tersedia (contoh):', numeric_cols[:20])

FileNotFoundError: [Errno 2] No such file or directory: 'Life_Expectancy_WHO.csv'

In [1]:
from sklearn.impute import SimpleImputer

# Raw numeric features. Imputation is deliberately done AFTER the train/test
# split (see below) so test-set statistics never influence the fitted medians.
X_num = df[numeric_cols]


# Categorical columns (if any) get a minimal one-hot encoding.
# The target is excluded so it can never be encoded into the features when it
# happens to be read as a non-numeric column.
cat_cols = [
    c for c in df.select_dtypes(include=['object', 'category']).columns.tolist()
    if c != target_col
]
print('\nKolom kategorikal (jika ada):', cat_cols)
if len(cat_cols) > 0:
    X_cat = df[cat_cols].fillna('missing')
    X_cat_ohe = pd.get_dummies(X_cat, drop_first=True)
    X = pd.concat([X_num, X_cat_ohe], axis=1)
else:
    X = X_num.copy()


# The target is not imputed: rows with a missing target are dropped.
y = df[target_col]
mask = y.notnull()
X = X.loc[mask].reset_index(drop=True)
y = y.loc[mask].reset_index(drop=True)


# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Median imputation of the numeric columns: fit on the training rows only, then
# apply the same medians to the test rows (and to X, so it stays NaN-free).
num_imputer = SimpleImputer(strategy='median')
X_train = X_train.copy()
X_test = X_test.copy()
X_train[numeric_cols] = pd.DataFrame(
    num_imputer.fit_transform(X_train[numeric_cols]),
    columns=numeric_cols,
    index=X_train.index,
)
X_test[numeric_cols] = pd.DataFrame(
    num_imputer.transform(X_test[numeric_cols]),
    columns=numeric_cols,
    index=X_test.index,
)
X[numeric_cols] = pd.DataFrame(
    num_imputer.transform(X[numeric_cols]),
    columns=numeric_cols,
    index=X.index,
)


print('\nSetelah imputasi: X shape =', X.shape, 'y shape =', y.shape)
print('Train/Test sizes:', X_train.shape, X_test.shape)

NameError: name 'df' is not defined

In [2]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split's numeric columns only.
scaler = StandardScaler().fit(X_train[numeric_cols])

# Work on copies so the unscaled frames stay available for models that do not
# need scaling; only the original numeric columns are standardised.
X_train_scaled, X_test_scaled = X_train.copy(), X_test.copy()
for scaled_frame, raw_frame in ((X_train_scaled, X_train), (X_test_scaled, X_test)):
    scaled_frame[numeric_cols] = scaler.transform(raw_frame[numeric_cols])

NameError: name 'X_train' is not defined

In [3]:
# ---------- Part C: Model training (2 algorithms) ----------
# Models compared: 1) Ridge (linear regression with L2 regularisation)
#                  2) Random Forest regressor
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor


models = {
    'Ridge': Ridge(random_state=42),
    'RandomForest': RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1),
}


# For every model: 5-fold CV RMSE on the training split, then fit on the whole
# training split and score on the held-out test split.
# Everything below the print must stay INSIDE the loop; otherwise only the last
# model in `models` is ever trained and recorded.
results = {}
cv = KFold(n_splits=5, shuffle=True, random_state=42)
for name, model in models.items():
    print('\nTraining', name)
    if name == 'Ridge':
        # Ridge uses the standardised features
        train_features, test_features = X_train_scaled, X_test_scaled
    else:
        # Random forest is fed the unscaled features
        train_features, test_features = X_train, X_test

    scores = cross_val_score(
        model, train_features, y_train,
        scoring='neg_root_mean_squared_error', cv=cv,
    )
    model.fit(train_features, y_train)
    preds = model.predict(test_features)

    # The scorer returns negated RMSE, so flip the sign of the fold average.
    rmse_cv = -np.mean(scores)
    # sqrt(MSE) instead of mean_squared_error(..., squared=False): the `squared`
    # argument is deprecated and removed in newer scikit-learn releases.
    rmse_test = np.sqrt(mean_squared_error(y_test, preds))
    mae_test = mean_absolute_error(y_test, preds)
    r2_test = r2_score(y_test, preds)

    results[name] = {
        'model': model,
        'rmse_cv': rmse_cv,
        'rmse_test': rmse_test,
        'mae_test': mae_test,
        'r2_test': r2_test,
    }
    print(f'{name} - RMSE(CV)={rmse_cv:.4f}, RMSE(Test)={rmse_test:.4f}, MAE={mae_test:.4f}, R2={r2_test:.4f}')


# Compare the results
print('\nRingkasan hasil:')
for name, r in results.items():
    print(name, r['rmse_test'], r['mae_test'], r['r2_test'])


# Pick the best model by test RMSE (lower is better)
best_name = min(results, key=lambda k: results[k]['rmse_test'])
best_model = results[best_name]['model']
print('\nModel terbaik:', best_name)

NameError: name 'KFold' is not defined

In [None]:
import joblib

# Make sure the output folder exists before anything is written into it.
os.makedirs('models', exist_ok=True)

# Artifacts to persist, keyed by destination path:
#  - the selected model
#  - the scaler, so an application can reproduce the scaling if the model needs it
artifacts = {
    f'models/{best_name}_model.joblib': best_model,
    'models/scaler.joblib': scaler,
}
for artifact_path, artifact in artifacts.items():
    joblib.dump(artifact, artifact_path)


print('Model dan scaler tersimpan di folder models/')