In [2]:
# Basic Libraries
import pandas as pd
import numpy as np

# Matminer and Pymatgen for features
from matminer.datasets import load_dataset
from pymatgen.core.composition import Composition
from matminer.featurizers.composition import ElementProperty

# Scikit-learn for modeling
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

sns.set(style="whitegrid")

# Load the Matbench experimental band-gap dataset.
# matminer does not ship a dataset called "matbench_bandgap"; the Matbench
# "band gap from composition" task is published as "matbench_expt_gap", whose
# columns are "composition" (a formula string) and "gap expt" (the target).
# They are renamed here to the "formula" / "band_gap" names that the rest of
# this script refers to, so nothing downstream has to change.
df = load_dataset("matbench_expt_gap").rename(
    columns={"composition": "formula", "gap expt": "band_gap"}
)
print("Dataset Shape:", df.shape)
df.head()

# Drop rows that contain any missing value
df = df.dropna()

# Parse each formula string into a pymatgen Composition object; the
# featurizers below read this "composition" column.
df["composition"] = df["formula"].apply(Composition)

print("After cleaning:", df.shape)
df.head()
# Build one ElementProperty featurizer per preset ("magpie" first, then "deml")
magpie, deml = (
    ElementProperty.from_preset(preset_name)
    for preset_name in ("magpie", "deml")
)

# Featurize the "composition" column once per featurizer. Each call receives
# its own copy of df, so the two feature tables are built independently and
# df itself is not the frame handed to the featurizer.
df_magpie = magpie.featurize_dataframe(df.copy(), "composition")
df_deml = deml.featurize_dataframe(df.copy(), "composition")

print("Magpie features shape:", df_magpie.shape)
print("DEML features shape:", df_deml.shape)
# Target vector: the band gap column of the cleaned dataframe
y = df["band_gap"]

# Feature matrices: the featurized frames minus the raw input/target columns
non_feature_columns = ["formula", "composition", "band_gap"]
X_magpie = df_magpie.drop(columns=non_feature_columns)
X_deml = df_deml.drop(columns=non_feature_columns)

# Split data (80% train, 20% test).
# All three objects go through ONE train_test_split call so that the Magpie
# features, the DEML features and the target are guaranteed to be partitioned
# on the same rows, instead of relying on two separate calls happening to
# draw the same shuffle from an identical seed.
(
    X_train_m, X_test_m,
    X_train_d, X_test_d,
    y_train, y_test,
) = train_test_split(X_magpie, X_deml, y, test_size=0.2, random_state=42)
# Estimators to compare, keyed by the display name used in the results table
linear_regressor = LinearRegression()
forest_regressor = RandomForestRegressor(random_state=42)  # fixed seed
models = {
    "Linear Regression": linear_regressor,
    "Random Forest": forest_regressor,
}

# Function to train and evaluate
def evaluate_model(model, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and score it on the test split.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
            fitted in place by this call.
        X_train: Training features passed to ``model.fit``.
        X_test: Test features passed to ``model.predict``.
        y_train: Training targets passed to ``model.fit``.
        y_test: True targets the predictions are compared against.

    Returns:
        A ``(mse, r2)`` tuple: the results of ``mean_squared_error`` and
        ``r2_score`` for the test-set predictions.
    """
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return (
        mean_squared_error(y_test, predictions),
        r2_score(y_test, predictions),
    )

# Score every model on every feature set.
# Each entry is (label, train features, test features); Magpie is evaluated
# before DEML for each model, and both share the same y_train / y_test.
feature_splits = (
    ("Magpie", X_train_m, X_test_m),
    ("DEML", X_train_d, X_test_d),
)

results = []
for name, model in models.items():
    for feature_set, X_tr, X_te in feature_splits:
        mse, r2 = evaluate_model(model, X_tr, X_te, y_train, y_test)
        results.append([name, feature_set, mse, r2])

# One row per (model, feature set) pair
results_df = pd.DataFrame(
    results,
    columns=["Model", "Feature_Set", "MSE", "R2"],
)
results_df
# Grouped bar chart of R²: one group per model, one bar per feature set
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(data=results_df, x="Model", y="R2", hue="Feature_Set", ax=ax)
ax.set_title("Model Performance (R²) by Feature Set")
plt.show()



ModuleNotFoundError: No module named 'matminer'