# In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import log_loss
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
import optuna
from optuna.samplers import TPESampler
from optuna.pruners import MedianPruner
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
import warnings
import joblib

# Silence library warnings so the notebook output stays readable
warnings.filterwarnings("ignore")

# 1. Read the raw CSV files (paths are relative to the working directory)
education_train = pd.read_csv('module_Education_train_set.csv')
household_train = pd.read_csv('module_HouseholdInfo_train_set.csv')
poverty_train = pd.read_csv('module_SubjectivePoverty_train_set.csv')
education_test = pd.read_csv('module_Education_test_set.csv')
household_test = pd.read_csv('module_HouseholdInfo_test_set.csv')
sample_submission = pd.read_csv('sample_submission.csv')

# 2. The poverty labels carry a single combined key; break it into the three
#    integer key columns ('psu', 'hh', 'idcode') that the merges below join on
if 'psu_hh_idcode' not in poverty_train.columns:
    raise KeyError("'psu_hh_idcode' column is missing from 'module_SubjectivePoverty_train_set.csv'")
key_parts = poverty_train['psu_hh_idcode'].str.split('_', expand=True)
poverty_train[['psu', 'hh', 'idcode']] = key_parts.astype(int)

# 3. Training frame: education + household + poverty labels (inner joins,
#    so only rows present in every module survive)
train_data = education_train.merge(household_train, on=['psu', 'hh', 'idcode'], how='inner')
train_data = train_data.merge(poverty_train, on=['psu', 'hh', 'idcode'], how='inner')

# 4. Test frame: education + household (no labels available)
test_data = education_test.merge(household_test, on=['psu', 'hh', 'idcode'], how='inner')

# 5. Work out which columns must not reach the model
# The ten one-hot target indicators, listed once and reused below
target_columns = [
    'subjective_poverty_1', 'subjective_poverty_2', 'subjective_poverty_3',
    'subjective_poverty_4', 'subjective_poverty_5', 'subjective_poverty_6',
    'subjective_poverty_7', 'subjective_poverty_8', 'subjective_poverty_9',
    'subjective_poverty_10',
]

# Training frame: drop the combined key, the targets and the split key columns
drop_columns_train = ['psu_hh_idcode'] + target_columns + ['psu', 'hh', 'idcode']

# Test frame: the combined key is kept for the submission file, so rebuild it
# from its three parts when the merged test frame does not already have it
if 'psu_hh_idcode' not in test_data.columns:
    if not {'psu', 'hh', 'idcode'}.issubset(test_data.columns):
        raise KeyError("test_data must contain 'psu', 'hh', and 'idcode' columns to create 'psu_hh_idcode'.")
    id_parts = [test_data[part].astype(str) for part in ('psu', 'hh', 'idcode')]
    test_data['psu_hh_idcode'] = id_parts[0] + '_' + id_parts[1] + '_' + id_parts[2]
    print("'psu_hh_idcode' column was missing in test_data and has been created.")

# Only the split key columns are removed from the test frame;
# 'psu_hh_idcode' deliberately stays
drop_columns_test = ['psu', 'hh', 'idcode']

# 6. Features and target for training
X = train_data.drop(columns=drop_columns_train)
# idxmax yields, per row, the name of the target column holding the largest
# value; its numeric suffix (1-10) is shifted to a 0-based class label (0-9)
target_names = train_data[target_columns].idxmax(axis=1)
y = target_names.str.split('_').str[-1].astype(int) - 1

# 7. Features for the test set
X_test = test_data.drop(columns=drop_columns_test)

# 8. Identify numerical and categorical columns
# 'number' matches every numeric dtype (int32, float32, uint8, ...), not just
# int64/float64. This matters because ColumnTransformer drops any column that
# is not listed in one of its transformers (remainder='drop' by default), so a
# numeric column with a different width would otherwise vanish silently.
numerical_cols = X.select_dtypes(include='number').columns.tolist()
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

# 9. Define preprocessing for numerical and categorical data
# Numerical features: impute missing values with median
numerical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median'))
])

# Categorical features: impute missing values with most frequent and then One-Hot Encode
# handle_unknown='ignore' keeps transform() from failing on categories that
# only appear in the test set
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Combine preprocessing steps; columns are selected by name, so extra columns
# in X_test (e.g. the identifier) are ignored
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ]
)

# 10. Apply preprocessing to the data
# The preprocessor is fitted on the full training frame and reused for the test set
X_preprocessed = preprocessor.fit_transform(X)
X_test_preprocessed = preprocessor.transform(X_test)

# 11. Encode target variable
# Since the target is multiclass (0-9), ensure it's properly encoded
# No need for LabelEncoder as y is already integer

# 12. Split data into training and validation sets using Stratified Split
# 80/20 split; stratify=y keeps the class proportions equal in both parts
X_train_main, X_val, y_train_main, y_val = train_test_split(
    X_preprocessed, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("Data preprocessing and splitting complete.")

# 13. Define the Optuna optimization function
def objective(trial):
    """Score one hyperparameter configuration of the stacked ensemble.

    Samples hyperparameters for CatBoost, XGBoost and LightGBM, stacks the
    three models under a logistic-regression meta-learner, and evaluates the
    stack with 3-fold stratified cross-validation on the module-level
    ``X_train_main`` / ``y_train_main``.

    Args:
        trial: The Optuna trial used to sample hyperparameters and to report
            intermediate results.

    Returns:
        The mean multiclass log loss over the CV folds (lower is better).

    Raises:
        optuna.TrialPruned: If the study's pruner decides, after a fold has
            been reported, that the trial should stop early.
    """
    # Learning rates are sampled on a log scale with suggest_float(log=True);
    # suggest_loguniform is deprecated in Optuna. Parameter names are kept
    # unchanged so study.best_params has the same keys.
    catboost_params = {
        'learning_rate': trial.suggest_float("catboost_learning_rate", 0.01, 0.1, log=True),
        'depth': trial.suggest_int("catboost_depth", 4, 10),
        'iterations': trial.suggest_int("catboost_iterations", 100, 1000),
        'l2_leaf_reg': trial.suggest_float("catboost_l2_leaf_reg", 1e-3, 10.0, log=True),
        'random_state': 42,
        'verbose': False,
        'task_type': 'GPU',        # Requests GPU training
        'thread_count': 1          # Limit threads to prevent GPU conflicts
    }

    xgb_params = {
        'learning_rate': trial.suggest_float("xgb_learning_rate", 0.01, 0.1, log=True),
        'max_depth': trial.suggest_int("xgb_max_depth", 3, 12),
        'n_estimators': trial.suggest_int("xgb_n_estimators", 100, 1000),
        'subsample': trial.suggest_float("xgb_subsample", 0.5, 1.0),
        'colsample_bytree': trial.suggest_float("xgb_colsample_bytree", 0.5, 1.0),
        'gamma': trial.suggest_float("xgb_gamma", 0, 5.0),
        'random_state': 42,
        'use_label_encoder': False,
        'eval_metric': 'mlogloss',
        'tree_method': 'gpu_hist',  # Requests the GPU histogram algorithm
        'n_jobs': 1                 # Limit threads to prevent GPU conflicts
    }

    # NOTE(review): LightGBM documents bagging_fraction as taking effect only
    # when bagging_freq > 0, which is not set here - confirm whether this
    # search dimension is actually being tuned.
    lgb_params = {
        'learning_rate': trial.suggest_float("lgb_learning_rate", 0.01, 0.1, log=True),
        'max_depth': trial.suggest_int("lgb_max_depth", 3, 12),
        'num_leaves': trial.suggest_int("lgb_num_leaves", 20, 150),
        'bagging_fraction': trial.suggest_float("lgb_bagging_fraction", 0.5, 1.0),
        'feature_fraction': trial.suggest_float("lgb_feature_fraction", 0.5, 1.0),
        'n_estimators': trial.suggest_int("lgb_n_estimators", 100, 1000),
        'random_state': 42,
        'n_jobs': 1  # Limit threads to prevent potential conflicts
    }

    # Stack the three boosters; the meta-learner sees only their out-of-fold
    # predictions (passthrough=False), produced with an inner 3-fold CV.
    stacking_model = StackingClassifier(
        estimators=[
            ('catboost', CatBoostClassifier(**catboost_params)),
            ('xgb', XGBClassifier(**xgb_params)),
            ('lgb', LGBMClassifier(**lgb_params))
        ],
        final_estimator=LogisticRegression(max_iter=1000, random_state=42),
        passthrough=False,
        cv=3,
        n_jobs=1  # Limit to 1 to prevent GPU conflicts
    )

    # Outer cross-validation used to score this configuration
    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

    cv_log_loss = []
    for fold, (train_idx, valid_idx) in enumerate(skf.split(X_train_main, y_train_main)):
        X_train_cv, X_valid_cv = X_train_main[train_idx], X_train_main[valid_idx]
        y_train_cv, y_valid_cv = y_train_main.iloc[train_idx], y_train_main.iloc[valid_idx]

        stacking_model.fit(X_train_cv, y_train_cv)
        y_pred_probs = stacking_model.predict_proba(X_valid_cv)

        # Pass the fitted class order explicitly: without `labels`, log_loss
        # infers the classes from y_valid_cv and raises when a rare class is
        # absent from this validation fold.
        loss = log_loss(y_valid_cv, y_pred_probs, labels=stacking_model.classes_)
        cv_log_loss.append(loss)

        # Report the running mean so the study's pruner can stop a clearly
        # unpromising configuration before the remaining folds are trained.
        trial.report(float(np.mean(cv_log_loss)), step=fold)
        if trial.should_prune():
            raise optuna.TrialPruned()

    avg_log_loss = np.mean(cv_log_loss)
    return avg_log_loss

# 14. Set up Optuna study
# The sampler is seeded like every model in this script (42) so that a rerun
# proposes the same sequence of hyperparameters.
# The objective runs 3 CV folds, so at most 3 intermediate steps can ever be
# reported per trial; a warmup of 30 steps could never be exceeded and would
# keep the pruner permanently inactive. With n_warmup_steps=1 the first fold
# is always completed before pruning is considered.
study = optuna.create_study(
    direction="minimize",
    sampler=TPESampler(seed=42),
    pruner=MedianPruner(n_startup_trials=5, n_warmup_steps=1)
)

# 15. Optimize the study
# Stops after 50 trials or one hour, whichever comes first
study.optimize(objective, n_trials=50, timeout=3600)  # Adjust timeout as needed

# 16. Display the best parameters and log loss
print("Best Parameters:", study.best_params)
print("Best Log Loss:", study.best_value)

# 17. Train final models with optimized hyperparameters
# Extract the best parameters
best_params = study.best_params

# Initialize models with the optimized parameters.
# The .get() defaults are only used if a key is absent from best_params.
catboost_final = CatBoostClassifier(
    learning_rate=best_params.get("catboost_learning_rate", 0.03),
    depth=best_params.get("catboost_depth", 6),
    iterations=best_params.get("catboost_iterations", 500),
    l2_leaf_reg=best_params.get("catboost_l2_leaf_reg", 3),
    random_state=42,
    verbose=False,
    task_type='GPU',        # Requests GPU training
    thread_count=1          # Limit threads to prevent GPU conflicts
)

xgb_final = XGBClassifier(
    learning_rate=best_params.get("xgb_learning_rate", 0.03),
    max_depth=best_params.get("xgb_max_depth", 6),
    n_estimators=best_params.get("xgb_n_estimators", 500),
    subsample=best_params.get("xgb_subsample", 0.8),
    colsample_bytree=best_params.get("xgb_colsample_bytree", 0.8),
    gamma=best_params.get("xgb_gamma", 1.0),
    random_state=42,
    use_label_encoder=False,
    eval_metric='mlogloss',
    tree_method='gpu_hist',  # Requests the GPU histogram algorithm
    n_jobs=1                 # Limit threads to prevent GPU conflicts
)

lgb_final = LGBMClassifier(
    learning_rate=best_params.get("lgb_learning_rate", 0.05),
    max_depth=best_params.get("lgb_max_depth", 7),
    num_leaves=best_params.get("lgb_num_leaves", 31),
    bagging_fraction=best_params.get("lgb_bagging_fraction", 0.8),
    feature_fraction=best_params.get("lgb_feature_fraction", 0.8),
    n_estimators=best_params.get("lgb_n_estimators", 500),
    random_state=42,
    n_jobs=1  # Limit threads to prevent potential conflicts
)

# Define the final stacking model (5-fold inner CV for the meta-learner's
# training predictions, versus 3 during the hyperparameter search)
final_stacking_model = StackingClassifier(
    estimators=[
        ('catboost', catboost_final),
        ('xgb', xgb_final),
        ('lgb', lgb_final)
    ],
    final_estimator=LogisticRegression(max_iter=1000, random_state=42),
    passthrough=False,
    cv=5,
    n_jobs=1  # Limit to 1 to prevent GPU conflicts
)

# 18. Train the final stacked model on the training split only
# (X_val / y_val stay held out so the score below is an honest estimate)
final_stacking_model.fit(X_train_main, y_train_main)
print("Final stacked model trained successfully with optimized parameters.")

# 19. Evaluate on the validation set
y_val_pred_probs = final_stacking_model.predict_proba(X_val)
# Pass the fitted class order explicitly: without `labels`, log_loss infers
# the classes from y_val and raises if a rare class is absent from it.
validation_log_loss = log_loss(
    y_val, y_val_pred_probs, labels=final_stacking_model.classes_
)
print(f"Validation Log Loss: {validation_log_loss}")

# 20. Generate predictions for the test set
test_pred_probs = final_stacking_model.predict_proba(X_test_preprocessed)

# 21. Ensure 'psu_hh_idcode' exists in test_data (already handled earlier)
# Already ensured in step 5

# 22. Prepare the submission DataFrame
submission_columns = [
    "subjective_poverty_1", "subjective_poverty_2", "subjective_poverty_3",
    "subjective_poverty_4", "subjective_poverty_5", "subjective_poverty_6",
    "subjective_poverty_7", "subjective_poverty_8", "subjective_poverty_9",
    "subjective_poverty_10"
]

# predict_proba returns one column per class seen during fit, ordered as
# classes_. Scatter those columns into a fixed 10-wide array (class k ->
# column k) so a class missing from the training split yields a zero column
# instead of a shape mismatch when the DataFrame is built.
fitted_classes = np.asarray(final_stacking_model.classes_, dtype=int)
full_pred_probs = np.zeros((test_pred_probs.shape[0], len(submission_columns)))
full_pred_probs[:, fitted_classes] = test_pred_probs

submission = pd.DataFrame(full_pred_probs, columns=submission_columns)

# Include the identifier column; insert by position (to_numpy) so the ids
# cannot be misaligned by differing DataFrame indexes
submission.insert(0, "psu_hh_idcode", test_data["psu_hh_idcode"].to_numpy())

# Ensure probabilities sum to 1
submission.iloc[:, 1:] = submission.iloc[:, 1:].div(submission.iloc[:, 1:].sum(axis=1), axis=0)

# 23. Save the submission file
submission.to_csv('submission.csv', index=False)
print("Submission file created and saved as 'submission.csv'.")


# Notebook output from the run above:
# FileNotFoundError: [Errno 2] No such file or directory: 'module_Education_train_set.csv'