In [1]:
# ============================================
# 📘 NOTEBOOK 3: Feature Importance & Explainability
# ============================================

# 🎯 Objective:
# Identify which material features most strongly affect the predicted bandgap.
# We'll use the Random Forest model (best from previous notebook) and visualize feature importances.
# Optionally, we'll use SHAP for deeper model interpretability.

# --------------------------------------------
# 🧩 STEP 1. Import Libraries
# --------------------------------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error

# Optional: install and import SHAP for model explainability
try:
    import shap
    has_shap = True
except ImportError:
    print("⚠️ SHAP not installed. Skipping SHAP analysis.")
    has_shap = False

# --------------------------------------------
# 🧩 STEP 2. Load Dataset
# --------------------------------------------
data = pd.read_csv("sample_bandgap_data.csv")
print("✅ Dataset loaded successfully!")
display(data.head())

# --------------------------------------------
# 🧩 STEP 3. Prepare Features and Target
# --------------------------------------------
X = data.drop(columns=["bandgap"])
y = data["bandgap"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --------------------------------------------
# 🧩 STEP 4. Train the Best Model (Random Forest)
# --------------------------------------------
rf = RandomForestRegressor(random_state=42, n_estimators=200)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print(f"✅ Model trained! R² = {r2:.3f}, MSE = {mse:.3f}")

# --------------------------------------------
# 🧩 STEP 5. Feature Importance Visualization
# --------------------------------------------
importances = rf.feature_importances_
feature_names = X.columns
feat_imp = pd.DataFrame({"Feature": feature_names, "Importance": importances})
feat_imp = feat_imp.sort_values(by="Importance", ascending=False)

plt.figure(figsize=(8,5))
sns.barplot(data=feat_imp.head(10), x="Importance", y="Feature", palette="viridis")
plt.title("Top 10 Important Features (Random Forest)")
plt.show()

# --------------------------------------------
# 🧩 STEP 6. SHAP Analysis (Optional)
# --------------------------------------------
if has_shap:
    explainer = shap.TreeExplainer(rf)
    shap_values = explainer.shap_values(X_test)

    shap.summary_plot(shap_values, X_test, plot_type="bar")
    shap.summary_plot(shap_values, X_test)

# --------------------------------------------
# 🧩 STEP 7. Interpretation
# --------------------------------------------
print("""
🔍 **Interpretation:**

- The top-ranked features are those the model relies on most when predicting bandgaps.
- Physically, features linked to average electronegativity, atomic volume, or mean bond strength
  often dominate bandgap behavior.
- If SHAP is available, its plots show how each feature value (high vs low) affects predictions
  — providing a more interpretable understanding of materials design.
""")


✅ Dataset loaded successfully!


Unnamed: 0,formula,bandgap
0,Fe2O3,2.1
1,SiO2,1.2
2,Al2O3,3.5
3,TiO2,3.0


ValueError: could not convert string to float: 'TiO2'