<a href="https://colab.research.google.com/github/osommersell264/MLsessions/blob/main/Ensemble022825.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Create an Ensemble

# Step 1: Install and Load Libraries
!pip install ucimlrepo scikit-learn pandas numpy matplotlib seaborn

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from ucimlrepo import fetch_ucirepo
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Fetch Dataset
predict_students_dropout_and_academic_success = fetch_ucirepo(id=697)

# Data (as pandas dataframes)
X = predict_students_dropout_and_academic_success.data.features
y = predict_students_dropout_and_academic_success.data.targets

# Drop enrolled
# `y` is a one-column DataFrame, so `y != 'Enrolled'` is a boolean *DataFrame*.
# Indexing with a boolean DataFrame (`X[...]` / `y[...]`) masks cells to NaN
# instead of removing rows, so build a row-wise boolean Series from the target
# column and select rows with .loc.
not_enrolled = y.iloc[:, 0] != 'Enrolled'
X = X.loc[not_enrolled].copy()  # copy so later in-place column encoding does not write into a slice
y = y.loc[not_enrolled]         # still a one-column DataFrame; Step 4 converts it to a 0/1 Series

# NOTE: the train/test split is performed in Step 5, after preprocessing.
# An earlier split of the raw data at this point was overwritten before use.

# Step 3: Calculate Base-Rate Accuracy (Naive Model)
# Accuracy of always predicting the most frequent remaining class.
base_rate_accuracy = y.value_counts().max() / len(y)
print(f"Base-Rate Accuracy (Naive Model): {base_rate_accuracy:.2f}")

# Create the three base classifiers used by the ensemble (all seeded with 42)

# Simple decision tree, depth-limited to 5 levels
modelA = DecisionTreeClassifier(
    random_state=42,
    max_depth=5,
)

# Logistic regression for binary classification (iteration cap raised to 1000)
modelB = LogisticRegression(
    random_state=42,
    max_iter=1000,
)

# Random forest: 100 decision trees combined
modelC = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
)

# Step 4: Data Cleaning & Preprocessing
# Convert target variable to binary format: Graduate = 1, Dropout = 0
y = y.iloc[:, 0].map({"Graduate": 1, "Dropout": 0})  # Apply map to the target column

# Encode categorical (object-dtype) feature columns to integer codes
for col in X.select_dtypes(include=['object']).columns:
    X[col] = LabelEncoder().fit_transform(X[col])

# Step 5: Train-Test Split
# We split the dataset into 80% training and 20% testing to evaluate model performance.
# The split happens BEFORE scaling so that the scaler never sees test rows;
# fitting it on the full dataset would leak test-set statistics into training.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize numerical features: learn mean/std on the training rows only,
# then apply that same transform to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Full dataset in the training-fitted scale. Kept so the `X_scaled` name stays
# available; it is not used for model evaluation.
X_scaled = scaler.transform(X)

# Steps 6-7: Build the soft-voting ensemble from the three base models and train it.
# Estimator names are paired with their models in order.
ensemble = VotingClassifier(
    voting='soft',
    estimators=list(
        zip(
            ('decision_tree', 'logistic_regression', 'random_forest'),
            (modelA, modelB, modelC),
        )
    ),
).fit(X_train, y_train)  # fit() returns the fitted ensemble itself

# Step 8: Predict labels for the held-out test rows
y_pred_ensemble = ensemble.predict(X_test)

# Step 9: Report accuracy and the per-class precision/recall/F1 summary
ensemble_accuracy = accuracy_score(y_test, y_pred_ensemble)
print(f"Ensemble Model Accuracy: {ensemble_accuracy:.2f}")
print(
    "Ensemble Classification Report:\n",
    classification_report(y_test, y_pred_ensemble),
)

# Step 10: Feature Importance Analysis (Random Forest)
# VotingClassifier fits *clones* of the estimators it is given, so `modelC`
# itself is still unfitted at this point and `modelC.feature_importances_`
# would raise NotFittedError. Read the importances from the fitted clone that
# the ensemble stores under the name it was registered with.
fitted_forest = ensemble.named_estimators_['random_forest']
feature_importances = pd.Series(fitted_forest.feature_importances_, index=X.columns)
print("Feature Importances (Random Forest):\n", feature_importances.sort_values(ascending=False))

# Step 11: Compare with Individual Models
# Each base model is trained on its own with the same train/test data as the
# ensemble; fit() returns the estimator, so fit and predict are chained.

# Decision tree
y_pred_dt = modelA.fit(X_train, y_train).predict(X_test)
dt_accuracy = accuracy_score(y_test, y_pred_dt)
print(f"Decision Tree Accuracy: {dt_accuracy:.2f}")

# Logistic regression
y_pred_lr = modelB.fit(X_train, y_train).predict(X_test)
lr_accuracy = accuracy_score(y_test, y_pred_lr)
print(f"Logistic Regression Accuracy: {lr_accuracy:.2f}")

# Random forest
y_pred_rf = modelC.fit(X_train, y_train).predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print(f"Random Forest Accuracy: {rf_accuracy:.2f}")

# Step 12: Visualizing Confusion Matrices
# One heatmap per model on a 2x2 grid, filled row by row.
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# (panel title, predictions, colormap) in grid order
panels = [
    ('Ensemble Model', y_pred_ensemble, 'Reds'),
    ('Decision Tree', y_pred_dt, 'Blues'),
    ('Logistic Regression', y_pred_lr, 'Greens'),
    ('Random Forest', y_pred_rf, 'Oranges'),
]

for ax, (title, preds, cmap) in zip(axes.flat, panels):
    sns.heatmap(confusion_matrix(y_test, preds), annot=True, fmt='d', cmap=cmap, ax=ax)
    ax.set_title(title)

plt.tight_layout()
plt.show()



# Step 13: Analyzing False Positives & False Negatives
# Print the raw confusion matrix of every model, ensemble first.
for model_label, preds in (
    ("Ensemble Model", y_pred_ensemble),
    ("Decision Tree", y_pred_dt),
    ("Logistic Regression", y_pred_lr),
    ("Random Forest", y_pred_rf),
):
    print(f"Confusion Matrix - {model_label}:\n", confusion_matrix(y_test, preds))

# Step 14: Answer Key Questions
# Q1: How does the ensemble balance False Positives/Negatives compared to individual models?
# Answer: The ensemble averages predictions, reducing variance compared to a single model.

# Q2: What actions will happen based on predictions?
# Answer: Schools could use these predictions to identify at-risk students and intervene early.

# Q3: What are the data requirements?
# Answer: We need historical student records, attendance, GPA, and demographic factors to train the model.

# Q4: What classification problem does this solve?
# Answer: This is a binary classification problem (Graduate = 1, Dropout = 0).

"""
Answer Key Questions
Q1: How does the ensemble balance False Positives/Negatives compared to individual models?
Answer:

An ensemble model combines multiple classifiers, reducing the weaknesses of individual models.
If one model is too sensitive (high false positives) and another is too strict (high false negatives), the ensemble balances their predictions by averaging the class probabilities each model predicts (soft voting).
Example: If the Decision Tree leans toward "Dropout," but Logistic Regression and Random Forest both assign a high probability to "Graduate," the averaged probability favors "Graduate" and the ensemble classifies the student as "Graduate."
This reduces variance and increases reliability over a single model.
Q2: What actions will happen based on predictions?
Answer:

Schools and policymakers can identify at-risk students early and provide interventions such as tutoring, mentoring, or financial aid.
Example: If a student is predicted to drop out, the school can assign counselors or offer academic support to improve retention.
This model can help allocate resources efficiently and improve graduation rates.
Q3: What are the data requirements?
Answer:
To train this model, we need historical and demographic student data, including:
✅ Academic Records: GPA, test scores, course completion rates.
✅ Attendance Data: Absenteeism, tardiness.
✅ Demographic Information: Socioeconomic status, family background.
✅ Behavioral Data: Disciplinary records, engagement in extracurricular activities.

🔹 More data → Better model performance!

Q4: What classification problem does this solve?
Answer:

This is a binary classification problem where the goal is to predict student graduation status:
Graduate (1)
Dropout (0)
The model helps categorize students based on historical trends and their likelihood of success.
Why binary classification?
The target variable has only two possible outcomes (Graduate or Dropout).
The model learns from past patterns and predicts the probability of each student’s outcome.
🚀 Key Takeaways
✔ Ensemble models improve accuracy and reduce false positives/negatives.
✔ Schools can use predictions for targeted interventions to support at-risk students.
✔ High-quality, diverse data improves model performance.
✔ This is a binary classification task that predicts student outcomes (Graduate vs. Dropout).

"""