### Data:
* Number of records: 214
* Number of features: 9 + 1 (target feature)
* Repository URL: https://archive.ics.uci.edu/dataset/42/glass+identification
 
##### Problems:
a. Find the best two models by creating a complete pipeline for each model that explores both models and hyperparameters. Comment on and compare the results.\
b. Benchmark the best two models from __a.__ using different cross-validation techniques (at least 3). Comment on the results.\
c. Run one AutoML calculation on the dataset. How do these results compare with those obtained in __a.__?

In [2]:
# !pip install ucimlrepo

In [1]:
from ucimlrepo import fetch_ucirepo
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Fetch dataset id=42 (Glass Identification) through ucimlrepo and separate
# the feature table from the target table.
glass_identification = fetch_ucirepo(id=42)
X = glass_identification.data.features
y = glass_identification.data.targets

# Draw the feature-correlation heatmap on an explicit figure and render it.
# sns.heatmap returns the matplotlib Axes it drew on, so wrapping the call in
# print() only emitted that object's repr instead of showing the plot.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(X.corr().round(2), annot=True, cmap='coolwarm', center=0, ax=ax)
ax.set_title("Feature correlation matrix")
plt.tight_layout()
plt.show()

# Textual overview of the data: shape, column names, class balance,
# descriptive statistics and per-feature missing-value counts.
print("\n1. DATASET EXPLORATION")
print("-" * 80)
print(f"Dataset shape: {X.shape}")
print(f"\nFeatures:\n{X.columns.tolist()}")
print(f"\nTarget variable:\n{y.columns.tolist()}\n")
print(f"\nClass distribution:\n{y.value_counts()}")
print(f"\nFeature statistics:\n{X.describe()}")
print(f"\nMissing values:\n{X.isnull().sum()}")

# Flatten the target table into a 1-D array before splitting; from here on
# `y` is a NumPy array, not a DataFrame.
y = y.values.ravel()

# Hold out 20% of the rows as a test set. `stratify=y` keeps the class
# proportions the same in both parts; `random_state` makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"\nTraining set size: {X_train.shape[0]}")
print(f"Test set size: {X_test.shape[0]}")

# ---------------------------------------------------------------------------
# Model 1: random forest tuned by an exhaustive grid search on the train split
# ---------------------------------------------------------------------------
print("2. MODEL 1: RANDOM FOREST CLASSIFIER")

# Two-step pipeline: standardise the features, then fit the forest.
# The step name 'rf' is the prefix used by the grid keys below.
rf_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('rf', RandomForestClassifier(random_state=42)),
])

# Candidate hyperparameter values; every combination is evaluated.
rf_param_grid = dict(
    rf__n_estimators=[50, 100, 200, 300],
    rf__max_depth=[None, 10, 20, 30],
    rf__min_samples_split=[2, 5, 10],
    rf__min_samples_leaf=[1, 2, 4],
    rf__max_features=['sqrt', 'log2'],
)

print("\nHyperparameter grid:")
for param, values in rf_param_grid.items():
    print(f"  {param}: {values}")

# 5-fold cross-validated search scored on accuracy, using all CPU cores.
print("\nPerforming GridSearchCV (this may take a moment)...")
rf_grid_search = GridSearchCV(
    estimator=rf_pipeline,
    param_grid=rf_param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=0,
)
rf_grid_search.fit(X_train, y_train)

print("\n--- Random Forest Results ---")
print(f"Best parameters: {rf_grid_search.best_params_}")
print(f"Best cross-validation accuracy: {rf_grid_search.best_score_:.4f}")

# Score the best pipeline found by the search on the held-out test split.
rf_best_model = rf_grid_search.best_estimator_
y_pred_rf = rf_best_model.predict(X_test)
rf_test_accuracy = accuracy_score(y_test, y_pred_rf)

print("\nTest Set Performance:")
print(f"Accuracy: {rf_test_accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_test, y_pred_rf))
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_rf))

# Pair each training column with the importance reported by the fitted
# forest and rank the features from most to least important.
feature_names = X_train.columns
feature_importance_rf = rf_best_model.named_steps['rf'].feature_importances_

feature_importance_df_rf = pd.DataFrame(
    {'Feature': feature_names, 'Importance': feature_importance_rf}
).sort_values(by='Importance', ascending=False)

print("\nFeature Importance (Top 5):")
print(feature_importance_df_rf.head())

# ---------------------------------------------------------------------------
# Model 2: gradient boosting tuned by an exhaustive grid search
# ---------------------------------------------------------------------------
print("3. MODEL 2: GRADIENT BOOSTING CLASSIFIER")

# Two-step pipeline: standardise the features, then fit the booster.
# The step name 'gb' is the prefix used by the grid keys below.
gb_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('gb', GradientBoostingClassifier(random_state=42))
])

# Candidate hyperparameter values; every combination is evaluated.
gb_param_grid = {
    'gb__n_estimators': [50, 100, 200],
    'gb__learning_rate': [0.01, 0.1, 0.2],
    'gb__max_depth': [3, 5, 7],
    'gb__min_samples_split': [2, 5, 10],
    'gb__min_samples_leaf': [1, 2, 4],
    'gb__subsample': [0.8, 1.0]
}

print("\nHyperparameter grid:")
for param, values in gb_param_grid.items():
    print(f"  {param}: {values}")

# 5-fold cross-validated grid search scored on accuracy, using all CPU cores.
print("\nPerforming GridSearchCV (this may take a moment)...")
gb_grid_search = GridSearchCV(
    gb_pipeline,
    gb_param_grid,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    verbose=0
)

gb_grid_search.fit(X_train, y_train)

print("\n--- Gradient Boosting Results ---")
print(f"Best parameters: {gb_grid_search.best_params_}")
print(f"Best cross-validation accuracy: {gb_grid_search.best_score_:.4f}")

# Score the best pipeline found by the search on the held-out test split.
gb_best_model = gb_grid_search.best_estimator_
y_pred_gb = gb_best_model.predict(X_test)
gb_test_accuracy = accuracy_score(y_test, y_pred_gb)

print(f"\nTest Set Performance:")
print(f"Accuracy: {gb_test_accuracy:.4f}")
print(f"\nClassification Report:")
print(classification_report(y_test, y_pred_gb))
print(f"\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred_gb))

# Feature importance, ranked from most to least important. The labels come
# from X_train.columns because that is the frame the pipeline was fitted on,
# which also matches how the random forest importances are labelled.
feature_importance_gb = gb_best_model.named_steps['gb'].feature_importances_
feature_importance_df_gb = pd.DataFrame({
    'Feature': X_train.columns,
    'Importance': feature_importance_gb
}).sort_values('Importance', ascending=False)

print(f"\nFeature Importance (Top 5):")
print(feature_importance_df_gb.head())

# ---------------------------------------------------------------------------
# Side-by-side comparison of the two tuned models and a printed commentary.
# Reads the grid-search objects, test accuracies and importance tables that
# were produced when the two models were tuned.
# ---------------------------------------------------------------------------
print("4. COMPREHENSIVE MODEL COMPARISON")
print("=" * 80)

# Best mean cross-validation accuracy of each search, and how far it sits
# above (positive) or below (negative) the accuracy on the held-out test set.
# Computed once here and reused by the table and the commentary.
rf_cv_accuracy = rf_grid_search.best_score_
gb_cv_accuracy = gb_grid_search.best_score_
rf_gap = rf_cv_accuracy - rf_test_accuracy
gb_gap = gb_cv_accuracy - gb_test_accuracy

comparison_df = pd.DataFrame({
    'Model': ['Random Forest', 'Gradient Boosting'],
    'Best CV Accuracy': [rf_cv_accuracy, gb_cv_accuracy],
    'Test Accuracy': [rf_test_accuracy, gb_test_accuracy],
    'CV-Test Gap': [rf_gap, gb_gap],
})

print("\n" + comparison_df.to_string(index=False))

print("\n" + "-" * 80)
print("DETAILED ANALYSIS AND COMMENTS")
print("-" * 80)

# Determine winner by test accuracy. Both accuracies are computed on the same
# test split, so an exact tie is possible; in that case the cross-validation
# accuracy breaks the tie instead of silently defaulting to one model.
# NOTE(review): picking the winner on the test set makes the winner's test
# accuracy an optimistic estimate; treat it as indicative only.
test_tie = rf_test_accuracy == gb_test_accuracy
if test_tie:
    winner = "Random Forest" if rf_cv_accuracy >= gb_cv_accuracy else "Gradient Boosting"
elif rf_test_accuracy > gb_test_accuracy:
    winner = "Random Forest"
else:
    winner = "Gradient Boosting"
winner_accuracy = max(rf_test_accuracy, gb_test_accuracy)
diff = abs(rf_test_accuracy - gb_test_accuracy)

print(f"\n✓ WINNER: {winner}")
print(f"  - Test Accuracy: {winner_accuracy:.4f}")
if test_tie:
    print("  - Tie on test accuracy; cross-validation accuracy used as tie-breaker")
else:
    print(f"  - Advantage: {diff:.4f} ({diff*100:.2f}%) over the other model")

print(f"\n1. PERFORMANCE ANALYSIS:")
print(f"   - Random Forest achieved {rf_test_accuracy:.2%} test accuracy")
print(f"   - Gradient Boosting achieved {gb_test_accuracy:.2%} test accuracy")
# Label is 'good' only when the weaker of the two models exceeds 70% accuracy.
print(f"   - Both models show {'good' if min(rf_test_accuracy, gb_test_accuracy) > 0.70 else 'moderate'} performance")

print(f"\n2. GENERALIZATION:")
# A CV score more than 0.05 above the test score is flagged as overfitting.
print(f"   - Random Forest CV-Test gap: {rf_gap:.4f} ({'overfitting' if rf_gap > 0.05 else 'good generalization'})")
print(f"   - Gradient Boosting CV-Test gap: {gb_gap:.4f} ({'overfitting' if gb_gap > 0.05 else 'good generalization'})")

# NOTE(review): apart from the best-parameter lines, the characteristics
# printed below are generic statements about the two algorithms; nothing in
# this script measures them.
print(f"\n3. MODEL CHARACTERISTICS:")
print(f"   Random Forest:")
print(f"   - Ensemble of decision trees with bagging")
print(f"   - Best params: {rf_grid_search.best_params_}")
print(f"   - More robust to outliers and less prone to overfitting")
print(f"   - Faster training with parallel processing")
print(f"\n   Gradient Boosting:")
print(f"   - Sequential ensemble with boosting")
print(f"   - Best params: {gb_grid_search.best_params_}")
print(f"   - Better at capturing complex patterns")
print(f"   - More sensitive to hyperparameters")

print(f"\n4. FEATURE IMPORTANCE COMPARISON:")
print(f"\n   Top 3 features - Random Forest:")
for _, row in feature_importance_df_rf.head(3).iterrows():
    print(f"   - {row['Feature']}: {row['Importance']:.4f}")
print(f"\n   Top 3 features - Gradient Boosting:")
for _, row in feature_importance_df_gb.head(3).iterrows():
    print(f"   - {row['Feature']}: {row['Importance']:.4f}")

print("\n5. RECOMMENDATIONS:")
# On a tie the winner does not have a higher test accuracy, so say what
# actually decided the recommendation.
if test_tie:
    accuracy_note = "   - Same test accuracy; chosen on cross-validation accuracy"
else:
    accuracy_note = "   - Higher test accuracy"

if winner == "Random Forest":
    print(f"   ✓ Use Random Forest for deployment")
    print(accuracy_note)
    print(f"   - Faster prediction times")
    print(f"   - More interpretable and stable")
else:
    print(f"   ✓ Use Gradient Boosting for deployment")
    print(accuracy_note)
    print(f"   - Better at handling complex relationships")
    print(f"   - Consider ensemble of both models for production")