### Second Competition Model: Stacking Classifier

For the second competition, we used a **Stacking Classifier** that combines multiple base models (Logistic Regression, Random Forest, K-Nearest Neighbors, and a Support Vector Classifier) with a **Logistic Regression** meta-model. The meta-model is trained on the base models' cross-validated predictions, so the ensemble can leverage the strengths of each base model. The final predictions were made with this stacked model, and the results were evaluated using balanced accuracy.


In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    classification_report,
)
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [None]:
# Load the preprocessed training and competition-test tables from ./data.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("./data/train_processed.csv", "./data/test_processed.csv")
)

In [None]:
# Separate the label from the features.
# NOTE: assumes 'income' is the target column of the training table.
y = train['income']
X = train.drop(columns='income')

In [None]:
# Hold out 20% of the labelled data for local evaluation (80% is used for
# training). The split is stratified on the target so both parts keep the
# same class proportions; without it a random split can shift the class
# balance and distort class-sensitive metrics such as balanced accuracy.
# random_state is fixed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
# Preprocessing pipeline: fill missing values with the column mean, then
# standardise every feature.
mean_imputer = SimpleImputer(strategy='mean')
feature_scaler = StandardScaler()
preprocessor = Pipeline(steps=[
    ('imputer', mean_imputer),
    ('scaler', feature_scaler),
])

In [None]:
# Learn the imputation and scaling statistics from the training split only,
# then apply the fitted preprocessor to both splits so the held-out rows
# never influence those statistics.
preprocessor.fit(X_train)
X_train_scaled = preprocessor.transform(X_train)
X_test_scaled = preprocessor.transform(X_test)

In [None]:
# Base learners for the Stacking Classifier, as (name, estimator) pairs.
# The model description lists four base models; K-Nearest Neighbors was
# missing from this list and is added here so the code matches it.
base_learners = [
    # Logistic Regression; max_iter raised to 1000
    ('lr', LogisticRegression(max_iter=1000)),
    # Random Forest with 100 trees, seeded for reproducibility
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    # K-Nearest Neighbors with scikit-learn's default settings
    ('knn', KNeighborsClassifier()),
    # Support Vector Classifier with probability estimates enabled
    ('svc', SVC(probability=True, random_state=42)),
]

In [None]:
# Assemble the stacked ensemble: a Logistic Regression meta-learner combines
# the base models' predictions, which are produced with 5-fold
# cross-validation.
meta_learner = LogisticRegression()
stacking_model = StackingClassifier(
    estimators=base_learners,
    final_estimator=meta_learner,
    cv=5,
)

In [None]:
# Fit the stacked ensemble on the preprocessed training split and predict
# labels for the held-out split in one chained call (fit returns the fitted
# estimator).
y_pred = stacking_model.fit(X_train_scaled, y_train).predict(X_test_scaled)

In [None]:
# Evaluate the stacked model on the held-out split.
print("Stacking Classifier Performance on Test Data:")
# Per-class precision / recall / F1
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))
# Balanced accuracy (the mean of per-class recall) is the metric the model
# description says results are evaluated with; plain accuracy alone can look
# good on imbalanced classes, so report both.
print("Balanced accuracy:", balanced_accuracy_score(y_test, y_pred))

In [None]:
# Predict labels for the competition test set.
# Select exactly the feature columns used for training, in training order,
# so a stray extra column or a different column order in the test CSV does
# not break (or silently misalign) the fitted preprocessor.
test_features = test[X.columns]
test_scaled = preprocessor.transform(test_features)
test_predictions = stacking_model.predict(test_scaled)

In [None]:
# Write the predictions to the submission CSV as a single 'income' column,
# without the DataFrame index.
submission_path = 'census_income_mustafozoda_sharif_2_stacking.csv'
submission = pd.DataFrame({'income': test_predictions})
submission.to_csv(submission_path, index=False)

print(f"Predictions saved to '{submission_path}'")

In [None]:
import os
from nbconvert import HTMLExporter
import nbformat


# Export this notebook to a standalone HTML file next to it.
notebook_filename = 'census_income_mustafozoda_sharif_2_stacking.ipynb'
output_filename = 'census_income_mustafozoda_sharif_2_stacking.html'

# Parse the .ipynb file as a version-4 notebook.
with open(notebook_filename, 'r', encoding='utf-8') as f:
    notebook_content = nbformat.read(f, as_version=4)

# Render the notebook to HTML; the accompanying resources are not written out.
html_exporter = HTMLExporter()
html_data, resources = html_exporter.from_notebook_node(notebook_content)

with open(output_filename, 'w', encoding='utf-8') as f:
    f.write(html_data)

print(f"Notebook successfully exported to {output_filename}")