<a href="https://colab.research.google.com/github/osommersell264/MLsessions/blob/main/Sommersell_Ensemble_Exercise.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Your Task: Create an Ensemble



# Get the data

In [1]:
!pip install ucimlrepo
from ucimlrepo import fetch_ucirepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [2]:
# Some useful imports - feel free to modify
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from ucimlrepo import fetch_ucirepo
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix



In [3]:
# fetch dataset
predict_students_dropout_and_academic_success = fetch_ucirepo(id=697)

# data (as pandas dataframes)
X = predict_students_dropout_and_academic_success.data.features
y = predict_students_dropout_and_academic_success.data.targets

# Drop enrolled
X = X[y != 'Enrolled']
y = y[y != 'Enrolled']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
# Why Big X and little y?
# X represents the matrix of x features, y represents the vector of y outcome.

In [5]:
Calculate Base-Rate Accuracy (Naive Model)
base_rate_accuracy = y.value_counts().max() / len(y)
print(f"Base-Rate Accuracy (Naive Model): {base_rate_accuracy:.2f}")

SyntaxError: invalid syntax (<ipython-input-5-5bbfcd1fa78f>, line 1)

# Create a few classifiers classifiers
- you don't need to fit or predict here, just initialize the model


In [6]:
# Create a few classifiers classifiers
modelA = DecisionTreeClassifier(max_depth=5, random_state=42)  # Simple decision tree
modelB = LogisticRegression(max_iter=1000, random_state=42)    # Logistic regression for binary classification
modelC = RandomForestClassifier(n_estimators=100, random_state=42)  # More complex decision trees combined

In [None]:
# Data Cleaning & Preprocessing
# Convert target variable to binary format: Graduate = 1, Dropout = 0
y = y.map({"Graduate": 1, "Dropout": 0})

In [None]:
# Encode categorical variables to numerical values
for col in X.select_dtypes(include=['object']).columns:
    X[col] = LabelEncoder().fit_transform(X[col])

In [None]:
# Standardize numerical features to improve model performance
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
#Train-Test Split
# We split the dataset into 80% training and 20% testing to evaluate model performance.
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


# Create an Ensemble

In [None]:
# Step 6: Create the Ensemble Model
ensemble = VotingClassifier(
    estimators=[
        ('decision_tree', modelA),
        ('logistic_regression', modelB),
        ('random_forest', modelC)
    ],
    voting='soft'  # Changed to soft voting for better probability distribution
)


# Train you ensemble (hint use `.fit()`)
ensemble.fit(X_train, y_train)

# Create predictions (hint use `.predict()`)
y_pred_ensemble = ensemble.predict(X_test)

In [None]:
#Evaluate the Ensemble Model
ensemble_accuracy = accuracy_score(y_test, y_pred_ensemble)
print(f"Ensemble Model Accuracy: {ensemble_accuracy:.2f}")
print("Ensemble Classification Report:\n", classification_report(y_test, y_pred_ensemble))

#Feature Importance Analysis (Random Forest)
feature_importances = pd.Series(modelC.feature_importances_, index=X.columns)
print("Feature Importances (Random Forest):\n", feature_importances.sort_values(ascending=False))


In [None]:
# Compare with Individual Models
modelA.fit(X_train, y_train)
y_pred_dt = modelA.predict(X_test)
dt_accuracy = accuracy_score(y_test, y_pred_dt)
print(f"Decision Tree Accuracy: {dt_accuracy:.2f}")

modelB.fit(X_train, y_train)
y_pred_lr = modelB.predict(X_test)
lr_accuracy = accuracy_score(y_test, y_pred_lr)
print(f"Logistic Regression Accuracy: {lr_accuracy:.2f}")

modelC.fit(X_train, y_train)
y_pred_rf = modelC.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Accuracy: {rf_accuracy:.2f}")

In [None]:
# Step 12: Visualizing Confusion Matrices
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

sns.heatmap(confusion_matrix(y_test, y_pred_ensemble), annot=True, fmt='d', cmap='Reds', ax=axes[0, 0])
axes[0, 0].set_title('Ensemble Model')

sns.heatmap(confusion_matrix(y_test, y_pred_dt), annot=True, fmt='d', cmap='Blues', ax=axes[0, 1])
axes[0, 1].set_title('Decision Tree')

sns.heatmap(confusion_matrix(y_test, y_pred_lr), annot=True, fmt='d', cmap='Greens', ax=axes[1, 0])
axes[1, 0].set_title('Logistic Regression')

sns.heatmap(confusion_matrix(y_test, y_pred_rf), annot=True, fmt='d', cmap='Oranges', ax=axes[1, 1])
axes[1, 1].set_title('Random Forest')

plt.tight_layout()
plt.show()

In [None]:
# Calculate accuracy metrics
# how are FP/FN different between your models, what balance is the ensemble striking?