# Life Expectancy Prediction
This project predicts life expectancy across countries using health, economic, and demographic indicators with machine learning.

## Dataset Overview
- **Rows**: 2,848 (train), 80 (test)
- **Columns**: 18 (train), 17 (test)
- **Features**: Country, Year, Status, Population, Health indicators (Hepatitis B, Measles, etc.), Economic (GDP, Total expenditure), Lifestyle (BMI, Alcohol, Schooling)
- **Target**: `Life expectancy` (average lifespan in years)
- **Source**: train.csv, test.csv

In [None]:
# Import libraries
import numpy as np
import pandas as pd
from pathlib import Path
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

In [None]:
# Read the train and test CSV files from the shared data directory
data_dir = Path('../data')
train_data, test_data = (
    pd.read_csv(data_dir / csv_name) for csv_name in ('train.csv', 'test.csv')
)
# Preview the first rows of the training data
train_data.head()

## Feature Engineering
### 1. Handle Missing Values
- Fill NaNs with country-specific means; fall back to the global median if needed.
- Encode `Status` as 0 (`Developing`) or 1 (`Developed`).

In [None]:
class DataPreprocessor:
    """Impute missing numeric values and encode the ``Status`` column.

    Missing values are filled with the mean of the same column for the
    row's country, as learned in :meth:`fit`. When no usable country mean
    exists (the country was not seen during fitting, or every fitted value
    for that country was missing), the column's global median is used.
    """

    def __init__(self):
        # column name -> Series of per-country means, indexed by country
        self.means_by_country = {}
        # column name -> median of the column over all fitted rows
        self.global_medians = {}

    def fit(self, data):
        """Learn imputation statistics from ``data``.

        Only numeric columns that contain at least one missing value in
        ``data`` are recorded. ``data`` must have a ``Country`` column.

        Returns:
            The fitted preprocessor (``self``), so calls can be chained.
        """
        # Start from a clean slate so refitting does not keep stale columns.
        self.means_by_country = {}
        self.global_medians = {}

        # Means/medians are only defined for numeric columns; text columns
        # with gaps (e.g. a missing Status) would make ``mean`` raise.
        numeric = data.select_dtypes(include='number')
        cols_with_na = numeric.columns[numeric.isna().any()].tolist()
        for col in cols_with_na:
            self.means_by_country[col] = data.groupby('Country')[col].mean()
            self.global_medians[col] = data[col].median()
        return self

    def transform(self, data):
        """Return a copy of ``data`` with gaps imputed and ``Status`` encoded.

        ``data`` must contain ``Country``, ``Status`` and every column
        recorded by :meth:`fit`. The input frame is not modified.
        ``Status`` values other than ``'Developing'``/``'Developed'`` map
        to NaN.
        """
        data = data.copy()
        for col, country_means in self.means_by_country.items():
            # Align each row with its country's mean; unseen countries and
            # countries whose fitted values were all missing yield NaN here.
            per_country = data['Country'].map(country_means)
            # Second fillna covers rows the country mean could not fill.
            data[col] = (
                data[col]
                .fillna(per_country)
                .fillna(self.global_medians[col])
            )
        # Encode Status
        data['Status'] = data['Status'].map({'Developing': 0, 'Developed': 1})
        return data

    def fit_transform(self, data):
        """Fit on ``data`` and return its transformed copy."""
        self.fit(data)
        return self.transform(data)

### 2. Add New Features
- `GDP_per_capita`: GDP divided by Population.
- `Child_mortality_ratio`: Ratio of under-five deaths to infant deaths.
- `Health_expenditure_per_GDP`: Total expenditure divided by GDP.

In [None]:
def add_features(df):
    """Return a copy of ``df`` extended with three ratio features.

    Added columns:
        GDP_per_capita: ``GDP`` divided by ``Population``.
        Child_mortality_ratio: ``under-five deaths`` divided by
            ``infant deaths``.
        Health_expenditure_per_GDP: ``Total expenditure`` divided by ``GDP``.

    A zero denominator is treated as missing, so the ratio is NaN for
    that row. The input frame is left unchanged.
    """
    def _denominator(column):
        # Swap zeros for NaN before dividing.
        return df[column].replace(0, np.nan)

    return df.assign(
        GDP_per_capita=df['GDP'] / _denominator('Population'),
        Child_mortality_ratio=df['under-five deaths'] / _denominator('infant deaths'),
        Health_expenditure_per_GDP=df['Total expenditure'] / _denominator('GDP'),
    )

### 3. Prepare Data
- Split train data into train/validation sets.
- Apply preprocessing and feature engineering, then drop `Country` and standardize the features.

In [None]:
# Separate the target from the predictors and hold out 20% for validation
y = train_data['Life expectancy']
X = train_data.drop(columns='Life expectancy')
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Impute/encode, add the ratio features, then discard the Country column.
# The preprocessor is fitted on the training split only and reused as-is
# for the validation and test frames.
preprocessor = DataPreprocessor()
X_train = add_features(preprocessor.fit_transform(X_train)).drop(columns='Country')
X_val = add_features(preprocessor.transform(X_val)).drop(columns='Country')
X_test = add_features(preprocessor.transform(test_data)).drop(columns='Country')

# Standardize every split with statistics learned from the training split
scaler = StandardScaler()
scaler.fit(X_train)
X_train, X_val, X_test = (
    scaler.transform(split) for split in (X_train, X_val, X_test)
)

## Model Training
- Use a LightGBM regressor with early stopping on the validation set.
- Evaluate with R² score.

In [None]:
# Train model
lgb_model = lgb.LGBMRegressor(n_estimators=1000, learning_rate=0.05, random_state=42)
# LightGBM 4.0 removed the `early_stopping_rounds` and `verbose` keyword
# arguments from `fit`; the callbacks below are the supported equivalent:
# stop once the validation metric has not improved for 50 rounds, and log
# the validation metric every 100 rounds.
lgb_model.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=100),
    ],
)

# Evaluate on the held-out validation split
y_pred_val = lgb_model.predict(X_val)
r2 = r2_score(y_val, y_pred_val)
print(f'Validation R² Score: {r2:.4f}')

## Prediction and Submission
- Predict life expectancy for test data.
- Save submission file.

In [None]:
# Score the prepared test matrix with the trained model
test_pred = lgb_model.predict(X_test)

# Write the predictions as a single-column CSV without the row index
submission = pd.Series(test_pred, name='Life expectancy').to_frame()
submission.to_csv('submission.csv', index=False)

## Save Outputs
- Save the model, preprocessor, and scaler, then compress them together with the submission and notebook.

In [None]:
import joblib
import zipfile

# Persist the trained model and both fitted transformers with joblib
for artifact, target in (
    (lgb_model, 'model'),
    (preprocessor, 'preprocessor'),
    (scaler, 'scaler'),
):
    joblib.dump(artifact, target)

# Bundle the saved artifacts, the submission, and the notebook into
# outputs/result.zip (the directory is created if it does not exist yet)
output_dir = Path('outputs')
output_dir.mkdir(exist_ok=True)
files = ['model', 'preprocessor', 'scaler', 'submission.csv', 'life_expectancy_prediction.ipynb']
with zipfile.ZipFile(output_dir / 'result.zip', mode='w', compression=zipfile.ZIP_DEFLATED) as zf:
    for file in files:
        # Store each file under its own name inside the archive
        zf.write(file, arcname=file)