In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

# Load the raw survey modules (train and test)
education_train = pd.read_csv('module_Education_train_set.csv')
household_train = pd.read_csv('module_HouseholdInfo_train_set.csv')
poverty_train = pd.read_csv('module_SubjectivePoverty_train_set.csv')
education_test = pd.read_csv('module_Education_test_set.csv')
household_test = pd.read_csv('module_HouseholdInfo_test_set.csv')

# Join keys shared by the modules, and the ten target columns, each defined once
id_columns = ['psu', 'hh', 'idcode']
target_columns = [f'subjective_poverty_{level}' for level in range(1, 11)]

# Split `psu_hh_idcode` into `psu`, `hh`, and `idcode` so the poverty module
# can be joined on the same integer keys as the other modules
poverty_train[id_columns] = poverty_train['psu_hh_idcode'].str.split('_', expand=True).astype(int)

# Merge the training datasets (inner joins: rows missing from any module are dropped)
train_data = pd.merge(education_train, household_train, on=id_columns, how='inner')
train_data = pd.merge(train_data, poverty_train, on=id_columns, how='inner')

# Merge the test datasets
test_data = pd.merge(education_test, household_test, on=id_columns, how='inner')

# Features: everything except the identifiers and the target columns
X = train_data.drop(columns=['psu_hh_idcode'] + target_columns + id_columns)
# Target: numeric suffix of the target column holding the row maximum,
# shifted to a 0-based class id (0..9)
y = train_data[target_columns].idxmax(axis=1).str.split('_').str[-1].astype(int) - 1

# Align the test data columns with training data features
X_test = test_data[X.columns]

# Imputer fitted on ALL training rows: used for the full matrix and the test set
imputer = SimpleImputer(strategy='median')
X_imputed = imputer.fit_transform(X)
X_test_imputed = imputer.transform(X_test)

# Train-validation split. The raw rows are split first and a separate imputer is
# fitted on the training portion only, so validation rows do not influence the
# medians used to fill them (fitting before the split leaks validation data).
# Same test_size/random_state as before, so the same rows land in each split.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
split_imputer = SimpleImputer(strategy='median')
X_train = split_imputer.fit_transform(X_train_raw)
X_val = split_imputer.transform(X_val_raw)



In [10]:
from xgboost import XGBClassifier
from sklearn.metrics import log_loss

# Baseline XGBoost configuration: 100 trees of depth 5, soft-probability
# multiclass objective over 10 classes, GPU histogram tree method.
baseline_xgb_params = {
    'n_estimators': 100,
    'max_depth': 5,
    'learning_rate': 0.1,
    'objective': 'multi:softprob',
    'num_class': 10,
    'random_state': 42,
    'tree_method': 'gpu_hist',
}
model = XGBClassifier(**baseline_xgb_params)

# Train on the training split, then predict class probabilities for the held-out split
model.fit(X_train, y_train)
y_val_pred_proba = model.predict_proba(X_val)

# Multiclass log loss of the validation probabilities
validation_log_loss = log_loss(y_val, y_val_pred_proba)
print("Validation Log Loss on Original Data:", validation_log_loss)

ModuleNotFoundError: No module named 'xgboost'

In [None]:
from sklearn.calibration import CalibratedClassifierCV
from xgboost import XGBClassifier
from sklearn.metrics import log_loss

# XGBoost settings for this experiment: depth-4 trees with 80% row and column
# subsampling, GPU histogram tree method.
optimized_xgb_params = dict(
    objective='multi:softprob',
    num_class=10,
    n_estimators=100,
    max_depth=4,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    tree_method='gpu_hist',
)
model_optimized = XGBClassifier(**optimized_xgb_params)
model_optimized.fit(X_train, y_train)

# Wrap the model in an isotonic-regression calibrator (3-fold CV on the training split)
calibrated_model = CalibratedClassifierCV(
    estimator=model_optimized,
    method='isotonic',
    cv=3,
)
calibrated_model.fit(X_train, y_train)

# Score the calibrated probabilities on the validation split
y_val_pred_proba_calibrated = calibrated_model.predict_proba(X_val)
validation_log_loss_calibrated = log_loss(y_val, y_val_pred_proba_calibrated)
print("Validation Log Loss after Calibration:", validation_log_loss_calibrated)

In [None]:
!pip install lightgbm

In [None]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import log_loss
import numpy as np

# Hyperparameters shared by both boosters
shared_boost_params = dict(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.1,
    num_class=10,
    random_state=42,
)

# XGBoost with the GPU histogram tree method; LightGBM with device='gpu'
model_xgb = XGBClassifier(objective='multi:softprob', tree_method='gpu_hist', **shared_boost_params)
model_lgb = LGBMClassifier(objective='multiclass', device='gpu', **shared_boost_params)

# Fit both models on the training split
model_xgb.fit(X_train, y_train)
model_lgb.fit(X_train, y_train)

# Class probabilities on the validation split
y_val_pred_proba_xgb = model_xgb.predict_proba(X_val)
y_val_pred_proba_lgb = model_lgb.predict_proba(X_val)

# Individual validation log losses
log_loss_xgb = log_loss(y_val, y_val_pred_proba_xgb)
log_loss_lgb = log_loss(y_val, y_val_pred_proba_lgb)

print("Validation Log Loss for XGBoost Model:", log_loss_xgb)
print("Validation Log Loss for LightGBM Model:", log_loss_lgb)

# Ensemble: equal-weight average of the two probability matrices,
# then rescale every row so it sums to one
y_val_pred_proba_ensemble = 0.5 * (y_val_pred_proba_xgb + y_val_pred_proba_lgb)
ensemble_row_totals = y_val_pred_proba_ensemble.sum(axis=1, keepdims=True)
y_val_pred_proba_ensemble = y_val_pred_proba_ensemble / ensemble_row_totals

# Validation log loss of the averaged predictions
validation_log_loss_ensemble_normalized = log_loss(y_val, y_val_pred_proba_ensemble)
print("Validation Log Loss with Normalized Ensemble:", validation_log_loss_ensemble_normalized)

In [None]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

# Search space: 3 x 3 x 3 = 27 candidate settings
param_grid = dict(
    n_estimators=[200, 300, 500],
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[4, 6, 8],
)

# Estimator whose hyperparameters are searched (GPU histogram tree method)
grid_base_estimator = XGBClassifier(
    random_state=42,
    tree_method='gpu_hist',
    use_label_encoder=False,
    eval_metric='mlogloss',
)

# 3-fold grid search scored by negative log loss, run on all available cores
grid_search = GridSearchCV(
    estimator=grid_base_estimator,
    param_grid=param_grid,
    cv=3,
    scoring='neg_log_loss',
    verbose=2,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# best_score_ is a negated log loss, so flip the sign when reporting it
print("Best Parameters:", grid_search.best_params_)
print("Best Log Loss:", -grid_search.best_score_)

In [None]:
# Feature names = every column of train_data except the three split
# identifiers and the ten subjective_poverty_* target columns
non_feature_columns = ['psu', 'hh', 'idcode', *(f'subjective_poverty_{k}' for k in range(1, 11))]
feature_only_frame = train_data.drop(columns=non_feature_columns)
original_feature_names = feature_only_frame.columns.tolist()

print("Feature names:", original_feature_names)

In [None]:
# The combined identifier is not a feature; drop it from the list when present
try:
    original_feature_names.remove('psu_hh_idcode')
except ValueError:
    pass  # not in the list, nothing to remove

print("Updated feature names without 'psu_hh_idcode':", original_feature_names)

In [None]:
# Rebuild both feature frames from the same column list so train and test line up
X_train, X_test = (
    frame[original_feature_names].copy()
    for frame in (train_data, test_data)
)

print("X_train and X_test recreated with consistent feature names.")
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)

In [None]:
from sklearn.impute import SimpleImputer

# Learn per-column medians from the training frame, then fill both frames with them
imputer = SimpleImputer(strategy="median").fit(X_train)
X_train_imputed, X_test_imputed = imputer.transform(X_train), imputer.transform(X_test)

print("Missing values handled successfully.")

In [None]:
# Columns that are entirely missing (identified by the notebook author)
columns_to_drop = ['Q16', 'Q31', 'Q49', 'Q60']

# errors="ignore" drops only the labels that are still present. Because this
# cell reassigns X_train / X_test, re-running it used to raise KeyError once
# the columns were already gone; now a re-run is a no-op.
X_train = X_train.drop(columns=columns_to_drop, errors="ignore")
X_test = X_test.drop(columns=columns_to_drop, errors="ignore")

print("Columns with only missing values dropped.")
print("New shape of X_train:", X_train.shape)
print("New shape of X_test:", X_test.shape)

In [None]:
# Refit the median imputer now that the all-missing columns are gone
imputer = SimpleImputer(strategy="median")
imputer.fit(X_train)

# Fill both frames with the medians learned from the training frame
X_train_imputed = imputer.transform(X_train)
X_test_imputed = imputer.transform(X_test)

print("Missing values handled successfully after dropping columns with all missing values.")

In [None]:
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

# Degree-2 interaction terms only (interaction_only=True), without a bias column;
# the expansion is fitted on the imputed training matrix
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False).fit(X_train_imputed)
X_train_poly, X_test_poly = (
    poly.transform(matrix) for matrix in (X_train_imputed, X_test_imputed)
)

# Standardize using statistics learned from the expanded training matrix,
# then apply the same scaling to the test matrix
scaler = StandardScaler().fit(X_train_poly)
X_train_poly_scaled, X_test_poly_scaled = (
    scaler.transform(matrix) for matrix in (X_train_poly, X_test_poly)
)

print("Polynomial features created and scaled for both training and test sets.")

In [None]:
pip install xgboost

In [None]:
# Report the feature-matrix shape next to the label count so a mismatch is visible
print(f"Shape of X_train_poly_scaled: {X_train_poly_scaled.shape}")
print(f"Length of y_train: {len(y_train)}")

In [None]:
# Assuming the target columns are `subjective_poverty_1` through `subjective_poverty_10`
target_columns = [
    "subjective_poverty_1", "subjective_poverty_2", "subjective_poverty_3",
    "subjective_poverty_4", "subjective_poverty_5", "subjective_poverty_6",
    "subjective_poverty_7", "subjective_poverty_8", "subjective_poverty_9",
    "subjective_poverty_10"
]

# Target class = numeric suffix of the target column holding the row maximum,
# shifted to a 0-based class id (0..9). The `- 1` keeps this encoding identical
# to the `y` built in the data-preparation cells; without it the labels here
# were 1..10 while every other cell used 0..9.
y_train = train_data[target_columns].idxmax(axis=1).str.split("_").str[-1].astype(int) - 1

# Enforce (not just print) that labels and features have the same number of rows
assert len(y_train) == X_train_poly_scaled.shape[0], (
    "y_train and X_train_poly_scaled have different numbers of rows"
)
print("Recreated y_train with length:", len(y_train))

In [None]:
# Re-run the comparison after y_train was rebuilt
print("Shape of X_train_poly_scaled: {}".format(X_train_poly_scaled.shape))
print("Length of y_train: {}".format(len(y_train)))

In [None]:
!pip install lightgbm catboost

In [None]:
from sklearn.ensemble import StackingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression

# Level-0 learners, both configured to train on the GPU
catboost_base = CatBoostClassifier(
    iterations=200,
    learning_rate=0.01,
    depth=4,
    task_type="GPU",
    logging_level="Silent",
)
xgb_base = XGBClassifier(
    n_estimators=200,
    learning_rate=0.01,
    max_depth=4,
    use_label_encoder=False,
    eval_metric='mlogloss',
    tree_method='gpu_hist',
)
base_models = [('catboost', catboost_base), ('xgb', xgb_base)]

# Level-1 learner that combines the base-model outputs
meta_model = LogisticRegression()

# Stack the base models under the meta-model using 5-fold internal CV
stacked_model = StackingClassifier(estimators=base_models, final_estimator=meta_model, cv=5)

# Fit on the polynomial, scaled training features
stacked_model.fit(X_train_poly_scaled, y_train)

print("Stacked model trained successfully with GPU support.")

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss
import numpy as np

from sklearn.base import clone  # imported here so the cell is self-contained

# Initialize Stratified K-Fold
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_log_loss = []

# kfold.split yields POSITIONAL indices. Indexing a pandas Series with them via
# `y_train[idx]` is label-based and only works while the index happens to be
# 0..n-1, so the labels are converted to a plain array first.
y_train_values = np.asarray(y_train)

# Loop through each fold
for train_index, val_index in kfold.split(X_train_poly_scaled, y_train_values):
    # Split the data into training and validation for the current fold
    X_train_fold, X_val_fold = X_train_poly_scaled[train_index], X_train_poly_scaled[val_index]
    y_train_fold, y_val_fold = y_train_values[train_index], y_train_values[val_index]

    # Fit a fresh, unfitted copy so the `stacked_model` trained on the full
    # data in the previous cell is not overwritten by a model fitted on 4/5 of it
    fold_model = clone(stacked_model)
    fold_model.fit(X_train_fold, y_train_fold)

    # Predict probabilities on the validation fold
    val_pred_probs = fold_model.predict_proba(X_val_fold)

    # Pass the model's class list so the probability columns stay aligned
    # even when a validation fold contains no sample of some class
    fold_log_loss = log_loss(y_val_fold, val_pred_probs, labels=fold_model.classes_)
    cv_log_loss.append(fold_log_loss)
    print(f"Log Loss for current fold: {fold_log_loss}")

# Calculate the average log loss across all folds
average_cv_log_loss = np.mean(cv_log_loss)
print("Average CV Log Loss:", average_cv_log_loss)

In [None]:
!pip install optuna

In [None]:
import optuna
from optuna.samplers import TPESampler  # Bayesian optimization sampler
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

# Hold out 20% of the polynomial/scaled training matrix for scoring Optuna trials
optuna_split = train_test_split(X_train_poly_scaled, y_train, test_size=0.2, random_state=42)
X_train_main, X_val, y_train_main, y_val = optuna_split

def objective(trial):
    """Score one Optuna trial of the CatBoost + XGBoost stacking ensemble.

    Draws a learning rate, tree depth and tree count for each base model from
    `trial`, fits a `StackingClassifier` (LogisticRegression meta-model,
    10-fold internal CV) on the module-level `X_train_main` / `y_train_main`,
    and scores it on the module-level `X_val` / `y_val`.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters. The parameter names are the
        keys that later appear in `study.best_params`.

    Returns
    -------
    float
        Multiclass log loss on the validation split.
    """
    # Narrowed hyperparameter ranges around the previous best values.
    # suggest_float(..., log=True) samples on a log scale and replaces the
    # deprecated suggest_loguniform.
    catboost_learning_rate = trial.suggest_float("catboost_learning_rate", 0.03, 0.05, log=True)
    catboost_depth = trial.suggest_int("catboost_depth", 5, 7)
    catboost_iterations = trial.suggest_int("catboost_iterations", 300, 500)

    xgb_learning_rate = trial.suggest_float("xgb_learning_rate", 0.03, 0.05, log=True)
    xgb_max_depth = trial.suggest_int("xgb_max_depth", 4, 6)
    xgb_n_estimators = trial.suggest_int("xgb_n_estimators", 400, 500)

    # CatBoost base model, trained on the GPU with logging silenced
    catboost_model = CatBoostClassifier(
        learning_rate=catboost_learning_rate,
        depth=catboost_depth,
        iterations=catboost_iterations,
        task_type="GPU",
        logging_level="Silent"
    )

    # XGBoost base model using the GPU histogram tree method
    xgb_model = XGBClassifier(
        learning_rate=xgb_learning_rate,
        max_depth=xgb_max_depth,
        n_estimators=xgb_n_estimators,
        eval_metric='mlogloss',
        use_label_encoder=False,
        tree_method='gpu_hist'
    )

    # Stacking model with a LogisticRegression meta-model and 10-fold internal CV
    trial_stack = StackingClassifier(
        estimators=[('catboost', catboost_model), ('xgb', xgb_model)],
        final_estimator=LogisticRegression(),
        cv=10
    )

    # Fit on the tuning-train split and evaluate on the validation split
    trial_stack.fit(X_train_main, y_train_main)
    val_pred_probs = trial_stack.predict_proba(X_val)
    # labels= keeps the probability columns aligned with the model's classes
    # even when the validation split lacks a class seen in training
    return log_loss(y_val, val_pred_probs, labels=trial_stack.classes_)

# Optimize with Optuna. The TPE sampler is seeded (42, like every other
# random_state in this notebook) so its proposals do not vary from run to run
# given the same objective values.
study = optuna.create_study(direction="minimize", sampler=TPESampler(seed=42))
study.optimize(objective, n_trials=30)  # Increase trials for better tuning

# Display the best parameters and log loss
print("Best Parameters:", study.best_params)
print("Best Log Loss:", study.best_value)

In [None]:
import pandas as pd
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
import optuna
from optuna.samplers import TPESampler
from sklearn.metrics import log_loss

# Load datasets
education_train = pd.read_csv('module_Education_train_set.csv')
household_train = pd.read_csv('module_HouseholdInfo_train_set.csv')
poverty_train = pd.read_csv('module_SubjectivePoverty_train_set.csv')
education_test = pd.read_csv('module_Education_test_set.csv')
household_test = pd.read_csv('module_HouseholdInfo_test_set.csv')

# Join keys shared by the modules, and the ten target columns, each defined once
id_columns = ['psu', 'hh', 'idcode']
target_columns = [f'subjective_poverty_{level}' for level in range(1, 11)]

# Split psu_hh_idcode into integer keys for merge and processing
poverty_train[id_columns] = poverty_train['psu_hh_idcode'].str.split('_', expand=True).astype(int)

# Merge datasets (inner joins: rows missing from any module are dropped)
train_data = pd.merge(education_train, household_train, on=id_columns, how='inner')
train_data = pd.merge(train_data, poverty_train, on=id_columns, how='inner')
test_data = pd.merge(education_test, household_test, on=id_columns, how='inner')

# Features: everything except the identifiers and the target columns
X = train_data.drop(columns=['psu_hh_idcode'] + target_columns + id_columns)
# Target: numeric suffix of the target column holding the row maximum,
# shifted to a 0-based class id (0..9)
y = train_data[target_columns].idxmax(axis=1).str.split('_').str[-1].astype(int) - 1

X_test = test_data[X.columns]

# Full-data preprocessing, used for the final fit and the test predictions:
# median imputation fitted on all training rows
imputer = SimpleImputer(strategy='median')
X_imputed = imputer.fit_transform(X)
X_test_imputed = imputer.transform(X_test)

# Degree-2 interaction features.
# NOTE: despite the `_scaled` suffix, no scaler is applied in this cell.
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_train_poly_scaled = poly.fit_transform(X_imputed)
X_test_poly_scaled = poly.transform(X_test_imputed)

# Train-validation split for hyperparameter tuning. The raw rows are split
# first and a separate imputer is fitted on the training portion only, so
# validation rows do not influence the medians used to fill them (fitting
# before the split leaks validation data). Same test_size/random_state as
# before, so the same rows land in each split.
X_main_raw, X_val_raw, y_train_main, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
tuning_imputer = SimpleImputer(strategy='median')
tuning_poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_train_main = tuning_poly.fit_transform(tuning_imputer.fit_transform(X_main_raw))
X_val = tuning_poly.transform(tuning_imputer.transform(X_val_raw))

# Optuna hyperparameter tuning function
# (redefines the `objective` from the earlier Optuna cell with the same logic)
def objective(trial):
    """Score one Optuna trial of the CatBoost + XGBoost stacking ensemble.

    Draws a learning rate, tree depth and tree count for each base model from
    `trial`, fits a `StackingClassifier` (LogisticRegression meta-model,
    10-fold internal CV) on the module-level `X_train_main` / `y_train_main`,
    and scores it on the module-level `X_val` / `y_val`.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters. The parameter names are the
        keys later read from `study.best_params`.

    Returns
    -------
    float
        Multiclass log loss on the validation split.
    """
    # Narrowed hyperparameter ranges around the previous best values.
    # suggest_float(..., log=True) samples on a log scale and replaces the
    # deprecated suggest_loguniform.
    catboost_learning_rate = trial.suggest_float("catboost_learning_rate", 0.03, 0.05, log=True)
    catboost_depth = trial.suggest_int("catboost_depth", 5, 7)
    catboost_iterations = trial.suggest_int("catboost_iterations", 300, 500)

    xgb_learning_rate = trial.suggest_float("xgb_learning_rate", 0.03, 0.05, log=True)
    xgb_max_depth = trial.suggest_int("xgb_max_depth", 4, 6)
    xgb_n_estimators = trial.suggest_int("xgb_n_estimators", 400, 500)

    # CatBoost base model, trained on the GPU with logging silenced
    catboost_model = CatBoostClassifier(
        learning_rate=catboost_learning_rate,
        depth=catboost_depth,
        iterations=catboost_iterations,
        task_type="GPU",
        logging_level="Silent"
    )

    # XGBoost base model using the GPU histogram tree method
    xgb_model = XGBClassifier(
        learning_rate=xgb_learning_rate,
        max_depth=xgb_max_depth,
        n_estimators=xgb_n_estimators,
        eval_metric='mlogloss',
        use_label_encoder=False,
        tree_method='gpu_hist'
    )

    # Stacking model with a LogisticRegression meta-model and 10-fold internal CV
    trial_stack = StackingClassifier(
        estimators=[('catboost', catboost_model), ('xgb', xgb_model)],
        final_estimator=LogisticRegression(),
        cv=10
    )

    # Fit on the tuning-train split and evaluate on the validation split
    trial_stack.fit(X_train_main, y_train_main)
    val_pred_probs = trial_stack.predict_proba(X_val)
    # labels= keeps the probability columns aligned with the model's classes
    # even when the validation split lacks a class seen in training
    return log_loss(y_val, val_pred_probs, labels=trial_stack.classes_)

# Optimize with Optuna. The TPE sampler is seeded (42, like the random_state
# of the train/validation split) so its proposals do not vary from run to run
# given the same objective values.
study = optuna.create_study(direction="minimize", sampler=TPESampler(seed=42))
study.optimize(objective, n_trials=30)  # Increase trials for better tuning

# Display the best parameters and log loss
print("Best Parameters:", study.best_params)
print("Best Log Loss:", study.best_value)

# Retrieve the best parameters from the Optuna study
best_params = study.best_params

# Rebuild both base models with the tuned hyperparameters
catboost_best = CatBoostClassifier(
    learning_rate=best_params["catboost_learning_rate"],
    depth=best_params["catboost_depth"],
    iterations=best_params["catboost_iterations"],
    task_type="GPU",
    logging_level="Silent"
)

xgb_best = XGBClassifier(
    learning_rate=best_params["xgb_learning_rate"],
    max_depth=best_params["xgb_max_depth"],
    n_estimators=best_params["xgb_n_estimators"],
    eval_metric='mlogloss',
    use_label_encoder=False,
    tree_method='gpu_hist'
)

# Define the final stacking model with the optimized base models
stacked_model_best = StackingClassifier(
    estimators=[('catboost', catboost_best), ('xgb', xgb_best)],
    final_estimator=LogisticRegression(),
    cv=10
)

# Train the final stacked model on the full training data.
# The labels are `y`, built in this cell from the same rows as
# X_train_poly_scaled. The previous `y_train` is never defined in this cell:
# it raised NameError on a fresh kernel and otherwise silently picked up a
# stale variable left behind by an earlier cell.
stacked_model_best.fit(X_train_poly_scaled, y)
print("Final stacked model trained successfully with optimized parameters.")

# Class-probability predictions for every test row
test_pred_probs = stacked_model_best.predict_proba(X_test_poly_scaled)

# The submission is keyed by 'psu_hh_idcode'; build it from its three parts
# when test_data does not already carry it
if 'psu_hh_idcode' not in test_data.columns:
    if not {'psu', 'hh', 'idcode'}.issubset(test_data.columns):
        raise ValueError("test_data is missing 'psu_hh_idcode' and 'psu', 'hh', 'idcode' columns.")
    id_parts = [test_data[part].astype(str) for part in ('psu', 'hh', 'idcode')]
    test_data['psu_hh_idcode'] = id_parts[0] + '_' + id_parts[1] + '_' + id_parts[2]

# One probability column per target, named subjective_poverty_1 .. subjective_poverty_10
target_columns = [f"subjective_poverty_{level}" for level in range(1, 11)]
submission = pd.DataFrame(test_pred_probs, columns=target_columns)

# Put the identifier in the first column
submission.insert(0, "psu_hh_idcode", test_data["psu_hh_idcode"])

# Write the submission next to the notebook, without the DataFrame index
submission.to_csv('submission.csv', index=False)
print("Submission file 'submission.csv' created and saved to the local directory.")