In [10]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
import pandas as pd
import numpy as np

# Load data
df = pd.read_csv('../data/data.csv')

# Drop unwanted columns and define target
X = df.drop(columns=['Life_expectancy', 'Country', 'Year', 'Region'])
y = df['Life_expectancy']

# Split unbalanced data
X_train_un, X_test_un, y_train_un, y_test_un = train_test_split(X, y, test_size=0.2, random_state=1)

# Initialize model
model = RandomForestRegressor(random_state=1)

# Train on unbalanced data
model.fit(X_train_un, y_train_un)
y_pred_un = model.predict(X_test_un)

# Metrics for unbalanced data
mae_un = mean_absolute_error(y_test_un, y_pred_un)
mse_un = mean_squared_error(y_test_un, y_pred_un)
r2_un = r2_score(y_test_un, y_pred_un)

# Bin 'Life_expectancy' to balance data
df['Life_expectancy_bin'] = pd.cut(y, bins=np.arange(40, 90, 5))

# Resample to balance dataset
balanced_data = pd.DataFrame()
for life_bin in df['Life_expectancy_bin'].unique():
    bin_data = df[df['Life_expectancy_bin'] == life_bin]
    if len(bin_data) > 0:
        bin_sample = resample(bin_data, replace=True, n_samples=100, random_state=1)
        balanced_data = pd.concat([balanced_data, bin_sample])

# Drop bin column and separate features and target for balanced data
balanced_data = balanced_data.drop(columns=['Life_expectancy_bin'])
X_balanced = balanced_data.drop(columns=['Life_expectancy', 'Country', 'Year', 'Region'])
y_balanced = balanced_data['Life_expectancy']

# Split balanced data
X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.2, random_state=1)

# Train on balanced data
model.fit(X_train, y_train)
y_pred_bal = model.predict(X_test)

# Metrics for balanced data
mae_bal = mean_absolute_error(y_test, y_pred_bal)
mse_bal = mean_squared_error(y_test, y_pred_bal)
r2_bal = r2_score(y_test, y_pred_bal)

# Print results
print("Metrics for Unbalanced Data:")
print("MAE:", mae_un)
print("MSE:", mse_un)
print("R² Score:", r2_un)

print("\nMetrics for Balanced Data:")
print("MAE:", mae_bal)
print("MSE:", mse_bal)
print("R² Score:", r2_bal)


Metrics for Unbalanced Data:
MAE: 0.37302617801047033
MSE: 0.2725273856893515
R² Score: 0.9968681189204636

Metrics for Balanced Data:
MAE: 0.586027777777784
MSE: 0.7017822944444458
R² Score: 0.9958026441864478
