In [11]:
import os
os.getcwd()

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Cek apakah dataset ada
DATA_PATH = 'Life Expectancy Data.csv'
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f"Dataset tidak ditemukan di path: {DATA_PATH}. Pastikan file CSV tersedia.")

# Baca dataset
df = pd.read_csv(DATA_PATH, sep= ';', encoding='latin1')

print('Dataset shape:', df.shape)
print('Kolom:', list(df.columns))


# Tampilkan 5 baris pertama
print(df.head())


# Quick EDA: missing, tipe
print('\nMissing per kolom:')
print(df.isnull().sum().sort_values(ascending=False).head(20))


print('\nDescriptive statistics untuk kolom numerik:')
print(df.describe().T)


# Visualisasi sederhana (jika menjalankan secara interaktif)
try:
    plt.figure(figsize=(8,4))
    df['Life'].hist(bins=30)
    plt.title('Distribusi Life Expectancy')
    plt.xlabel('Life expectancy')
    plt.tight_layout()
    plt.show()
except Exception:
    pass


# ---------- Bagian B: Preprocessing & Feature Engineering ----------
# Asumsi nama kolom target kemungkinan bervariasi pada dataset WHO. Cari kolom target yang mirip.
possible_targets = [c for c in df.columns if 'life' in c.lower() or 'expectancy' in c.lower()]
print('\nKemungkinan kolom target:', possible_targets)


# Jika kolom target ditemukan, ambil kolom pertama sebagai target
if len(possible_targets) == 0:
    raise ValueError('Tidak menemukan kolom target Life Expectancy. Pastikan CSV yang digunakan benar.')


target_col = possible_targets[0]
print('Menggunakan target:', target_col)


# Pilih fitur numerik untuk regresi sederhana
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
# Hapus kolom target dari fitur
if target_col in numeric_cols:
    numeric_cols.remove(target_col)


print('\nKolom numerik yang tersedia (contoh):', numeric_cols[:20])

Dataset shape: (1000, 22)
Kolom: ['ï»¿Country', 'Year', 'Status', 'Life expectancy ', 'Adult Mortality', 'infant deaths', 'Alcohol', 'percentage expenditure', 'Hepatitis B', 'Measles ', ' BMI ', 'under-five deaths ', 'Polio', 'Total expenditure', 'Diphtheria ', ' HIV/AIDS', 'GDP', 'Population', ' thinness  1-19 years', ' thinness 5-9 years', 'Income composition of resources', 'Schooling']
    ï»¿Country  Year      Status Life expectancy   Adult Mortality  \
0  Afghanistan  2015  Developing               65            263.0   
1  Afghanistan  2014  Developing             59,9            271.0   
2  Afghanistan  2013  Developing             59,9            268.0   
3  Afghanistan  2012  Developing             59,5            272.0   
4  Afghanistan  2011  Developing             59,2            275.0   

   infant deaths Alcohol percentage expenditure  Hepatitis B  Measles   ...  \
0             62    0,01            71,27962362         65.0      1154  ...   
1             64    0,01     

<Figure size 800x400 with 0 Axes>

In [12]:
from sklearn.impute import SimpleImputer
num_imputer = SimpleImputer(strategy='median')
X_num = df[numeric_cols]
X_num_imputed = pd.DataFrame(num_imputer.fit_transform(X_num), columns=numeric_cols)


# Untuk kategori (jika ada), lakukan one-hot minimal
cat_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
print('\nKolom kategorikal (jika ada):', cat_cols)
if len(cat_cols) > 0:
    X_cat = df[cat_cols].fillna('missing')
    X_cat_ohe = pd.get_dummies(X_cat, drop_first=True)
    X = pd.concat([X_num_imputed, X_cat_ohe], axis=1)
else:
    X = X_num_imputed.copy()


# Target imputation: drop baris yang targetnya kosong
y = df[target_col]
mask = y.notnull()
X = X.loc[mask].reset_index(drop=True)
y = y.loc[mask].reset_index(drop=True)


print('\nSetelah imputasi: X shape =', X.shape, 'y shape =', y.shape)


# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print('Train/Test sizes:', X_train.shape, X_test.shape)


Kolom kategorikal (jika ada): ['ï»¿Country', 'Status', 'Life expectancy ', 'Alcohol', 'percentage expenditure', ' BMI ', 'Total expenditure', ' HIV/AIDS', 'GDP', ' thinness  1-19 years', ' thinness 5-9 years', 'Income composition of resources', 'Schooling']

Setelah imputasi: X shape = (998, 4557) y shape = (998,)
Train/Test sizes: (798, 4557) (200, 4557)


In [13]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = X_train.copy()
X_test_scaled = X_test.copy()


# Apply scaler only to original numeric columns
X_train_scaled[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test_scaled[numeric_cols] = scaler.transform(X_test[numeric_cols])

In [None]:
# ---------- Bagian C: Pelatihan model (2 algoritma) ----------
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Pastikan tidak ada angka dengan koma
X_train = X_train.apply(lambda x: x.astype(str).str.replace(',', '.')).apply(pd.to_numeric, errors='coerce')
X_test = X_test.apply(lambda x: x.astype(str).str.replace(',', '.')).apply(pd.to_numeric, errors='coerce')
y_train = y_train.astype(str).str.replace(',', '.')
y_train = pd.to_numeric(y_train, errors='coerce')
y_test = y_test.astype(str).str.replace(',', '.')
y_test = pd.to_numeric(y_test, errors='coerce')

models = {
    'Ridge': Ridge(random_state=42),
    'RandomForest': RandomForestRegressor(n_estimators=200, random_state=42, n_jobs=-1)
}

results = {}
cv = KFold(n_splits=5, shuffle=True, random_state=42)

for name, model in models.items():
    print(f'\nTraining {name}')

    if name == 'Ridge':
        # gunakan scaled features
        scores = cross_val_score(model, X_train_scaled, y_train, scoring='neg_root_mean_squared_error', cv=cv)
        model.fit(X_train_scaled, y_train)
        preds = model.predict(X_test_scaled)
    else:
        # random forest tahan terhadap skala
        scores = cross_val_score(model, X_train, y_train, scoring='neg_root_mean_squared_error', cv=cv)
        model.fit(X_train, y_train)
        preds = model.predict(X_test)

    rmse_cv = -np.mean(scores)
    rmse_test = mean_squared_error(y_test, preds) ** 0.5
    mae_test = mean_absolute_error(y_test, preds)
    r2_test = r2_score(y_test, preds)

    results[name] = {
        'model': model,
        'rmse_cv': rmse_cv,
        'rmse_test': rmse_test,
        'mae_test': mae_test,
        'r2_test': r2_test
    }

    print(f'{name} - RMSE(CV)={rmse_cv:.4f}, RMSE(Test)={rmse_test:.4f}, MAE={mae_test:.4f}, R2={r2_test:.4f}')

# Bandingkan hasil
print('\nRingkasan hasil:')
for name, r in results.items():
    print(name, r['rmse_test'], r['mae_test'], r['r2_test'])

# Pilih model terbaik berdasarkan RMSE Test
best_name = min(results, key=lambda k: results[k]['rmse_test'])
best_model = results[best_name]['model']
print('\nModel terbaik:', best_name)



Training Ridge
Ridge - RMSE(CV)=3.4245, RMSE(Test)=3.7921, MAE=2.5775, R2=0.8672

Training RandomForest


In [6]:
import joblib
os.makedirs('models', exist_ok=True)
# Simpan model
joblib.dump(best_model, f'models/{best_name}_model.joblib')
# Simpan scaler untuk penggunaan di aplikasi jika model membutuhkan scaling
joblib.dump(scaler, 'models/scaler.joblib')


print('Model dan scaler tersimpan di folder models/')

Model dan scaler tersimpan di folder models/
