In [2]:
# ============================================
# 📘 NOTEBOOK 2: Model Comparison for Bandgap Prediction
# ============================================

# 🎯 Objective:
# Compare multiple machine learning models (Linear Regression, Random Forest, XGBoost)
# on the same Magpie feature set to understand trade-offs in accuracy and complexity.

# --------------------------------------------
# 🧩 STEP 1. Import Libraries
# --------------------------------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor

# Try importing XGBoost; skip if unavailable
try:
    from xgboost import XGBRegressor
    has_xgb = True
except:
    print("⚠️ XGBoost not found. Skipping.")
    has_xgb = False

# --------------------------------------------
# 🧩 STEP 2. Load Dataset
# --------------------------------------------
# 🔹 You can use your previously saved dataset from the feature-comparison notebook
#    (for now, let's mock a small Magpie-like dataset)

data = pd.read_csv("sample_bandgap_data.csv")
print("✅ Dataset loaded successfully!")
display(data.head())

# --------------------------------------------
# 🧩 STEP 3. Prepare Features and Target
# --------------------------------------------
X = data.drop(columns=["bandgap"])
y = data["bandgap"]

# Train/Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Feature Scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# --------------------------------------------
# 🧩 STEP 4. Train Multiple Models
# --------------------------------------------
results = []

# 1️⃣ Linear Regression
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)
y_pred_lr = lr.predict(X_test_scaled)
results.append({
    "Model": "Linear Regression",
    "MSE": mean_squared_error(y_test, y_pred_lr),
    "R2": r2_score(y_test, y_pred_lr)
})

# 2️⃣ Random Forest
rf = RandomForestRegressor(random_state=42, n_estimators=100)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
results.append({
    "Model": "Random Forest",
    "MSE": mean_squared_error(y_test, y_pred_rf),
    "R2": r2_score(y_test, y_pred_rf)
})

# 3️⃣ XGBoost (if available)
if has_xgb:
    xgb = XGBRegressor(random_state=42, n_estimators=200, learning_rate=0.1)
    xgb.fit(X_train, y_train)
    y_pred_xgb = xgb.predict(X_test)
    results.append({
        "Model": "XGBoost",
        "MSE": mean_squared_error(y_test, y_pred_xgb),
        "R2": r2_score(y_test, y_pred_xgb)
    })

# --------------------------------------------
# 🧩 STEP 5. Compare Performance
# --------------------------------------------
results_df = pd.DataFrame(results)
display(results_df)

# Bar Plot
plt.figure(figsize=(8,4))
sns.barplot(data=results_df, x="Model", y="R2", palette="crest")
plt.title("R² Score Comparison")
plt.show()

# --------------------------------------------
# 🧩 STEP 6. Interpretation
# --------------------------------------------
print("""
🔍 **Interpretation:**

- Linear Regression gives a baseline; it assumes linear relationships.
- Random Forest captures non-linear patterns, usually improving accuracy.
- XGBoost (if used) may outperform others on tabular data due to boosting.

The best model balances accuracy (R² ↑, MSE ↓) and interpretability.
""")


✅ Dataset loaded successfully!


Unnamed: 0,formula,bandgap
0,Fe2O3,2.1
1,SiO2,1.2
2,Al2O3,3.5
3,TiO2,3.0


ValueError: could not convert string to float: 'TiO2'