In [None]:
import numpy as np 
import pandas as pd

In [None]:
# Load the raw spreadsheet into a DataFrame.
DATA_PATH = "input/1-s2.0-S0038092X23004784-mmc1.xlsx"
df = pd.read_excel(DATA_PATH)

In [None]:
# Preview the first rows of the freshly loaded sheet.
df.head()

In [None]:
# Coerce every column to a numeric dtype; cells that cannot be parsed become NaN.
df = df.apply(pd.to_numeric, errors='coerce')

# Drop columns in which fewer than half of the rows hold a usable number.
threshold = len(df) * 0.5
df = df.dropna(thresh=threshold, axis=1)

# Keep only rows with a known target. After the numeric coercion above the
# column can no longer contain empty strings, so the NaN check is sufficient.
# The explicit .copy() makes the filtered frame independent of its parent.
df = df[df['PCE(%)'].notna()].copy()

# Impute the remaining gaps with the most frequent value of each column.
# The result is assigned back to the frame: calling fillna(..., inplace=True)
# on the df[column] selection is a chained assignment that pandas warns about
# and that has no effect once Copy-on-Write is active.
for column in df.columns:
    mode_value = df[column].mode()[0]
    df[column] = df[column].fillna(mode_value)

In [None]:

# Text summary (count, mean, std, quartiles, ...) of the cleaned frame.
summary_stats = df.describe()
print(summary_stats)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [None]:
# Pairwise correlation between every column that survived cleaning.
correlation_matrix = df.corr()

# Draw the annotated heatmap on an explicitly created axes.
fig, ax = plt.subplots(figsize=(36, 36))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    ax=ax,
)
ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
# Show the column names that remain after cleaning.
df.columns

In [None]:
# Columns of the cleaned frame that are used as model inputs (one per line).
lst = [
    'Thickness():a',
    'Thickness():c',
    'Band gap,Eg():a',
    'Band gap,Eg():c',
    'Band gap,Eg():e',
    'Electron affinity,χ():a',
    'Electron affinity,χ():e',
    'Dielectric permittivity,ℇr:a',
    'Dielectric permittivity,ℇr:c',
    'Dielectric permittivity,ℇr:e',
    'CB effective density of states,Nc(cm-3):c',
    'VB effective density of states,Nc(cm-3):a',
    'VB effective density of states,Nc(cm-3):c',
    'VB effective density of states,Nc(cm-3):e',
    'Hole mobility ,μp(cm2/Vs):a',
    'Hole mobility ,μp(cm2/Vs):c',
    'Hole mobility ,μp(cm2/Vs):e',
    'Shallow donor density,ND(cm-3):c',
    'Shallow donor density,ND(cm-3):e',
    'Shallow acceptor density,NA(cm-3):a',
    'Shallow acceptor density,NA(cm-3):c',
    'Defect density,Nt(cm-3):a',
]

# Number of selected features.
len(lst)

In [None]:
# Feature frame: all rows, restricted to the hand-picked columns in lst.
dataset = df.loc[:, lst]

In [None]:
# Summary statistics of the selected feature columns.
dataset.describe()

In [None]:
# Short display labels for the long spreadsheet column names.
# In most entries the ":a" / ":c" / ":e" column suffix becomes 1 / 2 / 3.
# NOTE(review): "Electron affinity,χ():e" maps to "X2" rather than a "...3"
# label, and "Thickness():e" is mapped here although it is not part of lst.
param_mapping = {
    "Thickness():a": "Th1",
    "Thickness():c": "Th2",
    "Thickness():e": "Th3",
    "Band gap,Eg():a": "Eg1",
    "Band gap,Eg():c": "Eg2",
    "Band gap,Eg():e": "Eg3",
    "Electron affinity,χ():a": "X1",
    "Electron affinity,χ():e": "X2",
    "Dielectric permittivity,ℇr:a": "DP1",
    "Dielectric permittivity,ℇr:c": "DP2",
    "Dielectric permittivity,ℇr:e": "DP3",
    "CB effective density of states,Nc(cm-3):c": "CB2",
    "VB effective density of states,Nc(cm-3):a": "VB1",
    "VB effective density of states,Nc(cm-3):c": "VB2",
    "VB effective density of states,Nc(cm-3):e": "VB3",
    "Hole mobility ,μp(cm2/Vs):a": "HM1",
    "Hole mobility ,μp(cm2/Vs):c": "HM2",
    "Hole mobility ,μp(cm2/Vs):e": "HM3",
    "Shallow donor density,ND(cm-3):c": "Nd2",
    "Shallow donor density,ND(cm-3):e": "Nd3",
    "Shallow acceptor density,NA(cm-3):c": "Na2",
    "Shallow acceptor density,NA(cm-3):a": "Na1",
    "Defect density,Nt(cm-3):a": "Nt1"
}

# Copy of the feature frame with the short labels, used for plotting.
heat_map_df=dataset.rename(columns=param_mapping)

In [None]:
# Pairwise correlation between the selected features (short labels).
correlation_matrix = heat_map_df.corr()

# Draw the annotated heatmap on an explicitly created axes.
fig, ax = plt.subplots(figsize=(18, 18))
sns.heatmap(
    correlation_matrix,
    annot=True,
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    ax=ax,
)
ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
from sklearn.ensemble import RandomForestRegressor,ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, mean_absolute_error, r2_score, mean_squared_error, roc_auc_score
from bayes_opt import BayesianOptimization
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
from sklearn.tree import DecisionTreeRegressor


In [None]:
# Model inputs are the selected feature columns; the target is the PCE column.
X, y = dataset, df['PCE(%)']

In [None]:
# Hold out 20 % of the rows for evaluation; the seed keeps the split repeatable.
split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Learn the standardisation statistics from the training rows only, then
# apply the same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scale = scaler.transform(X_train)
X_test_scale = scaler.transform(X_test)


In [None]:
# Summary statistics of the training target.
y_train.describe()

In [None]:
# Random forest baseline, trained on the standardised training features.
rf_params = {"n_estimators": 100, "random_state": 42}
model_rf = RandomForestRegressor(**rf_params)
model_rf.fit(X_train_scale, y_train)


In [None]:
# Depth-limited decision tree, trained on the standardised training features.
tree_params = {
    "criterion": "squared_error",
    "max_depth": 5,
    "random_state": 42,
}
regressor = DecisionTreeRegressor(**tree_params)
regressor.fit(X_train_scale, y_train)

In [None]:
# Binarise the target at its median so that a ranking metric (AUC-ROC) can be
# reported alongside the regression metrics.
threshold = np.median(y)
y_binary = (y > threshold).astype(int)
y_train_binary = (y_train > threshold).astype(int)
y_test_binary = (y_test > threshold).astype(int)

# Regression models to compare.
models_regression = {
    "Decision Tree": DecisionTreeRegressor(max_depth=5, random_state=42),
    "KNN": KNeighborsRegressor(n_neighbors=5),
    "LightGBM": lgb.LGBMRegressor(n_estimators=100, random_state=42),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Extra Trees": ExtraTreesRegressor(n_estimators=100, random_state=42),
    "XGBoost": XGBRegressor(n_estimators=100, random_state=42)
}

# Per-model metrics, stored in the iteration order of models_regression.
mae_values = []
r2_values = []
auc_values = []
model_names = []

# Train every model on the standardised training set and score it on the
# standardised held-out set.
for name, model in models_regression.items():
    model.fit(X_train_scale, y_train)
    y_pred = model.predict(X_test_scale)

    # Mean absolute error and R² of the continuous prediction.
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    # AUC-ROC: the continuous prediction is used as the ranking score for the
    # above/below-median label.
    auc = roc_auc_score(y_test_binary, y_pred)

    print(name, mae, r2, auc)

    mae_values.append(mae)
    r2_values.append(r2)
    auc_values.append(float(auc))
    model_names.append(name)

# Grouped bar chart of the two unit-free metrics (MAE is on the target's own
# scale and is therefore only printed, not plotted).
x = np.arange(len(model_names))  # one group per model
width = 0.25  # width of a single bar

fig, ax = plt.subplots(figsize=(12, 6))

ax.bar(x, r2_values, width, label='R² Score', color='lightgreen')
ax.bar(x + width, auc_values, width, label='AUC-ROC', color='salmon')

ax.set_xlabel("Models", fontsize=12)
ax.set_title("Comparison of Models on R², and AUC-ROC", fontsize=14)
ax.set_xticks(x)
ax.set_xticklabels(model_names, rotation=45, fontsize=10)
ax.set_ylabel("Metric Value", fontsize=12)
ax.legend()

plt.tight_layout()
print(r2_values)
print(auc_values)
print(model_names)
plt.show()

# Final summary, one line per evaluated model. Iterating over the collected
# results (instead of a hard-coded range(6)) keeps this in sync with
# models_regression when models are added or removed.
for summary_row in zip(model_names, mae_values, r2_values, auc_values):
    print(*summary_row)



In [None]:
# LightGBM regressor with explicit hyper-parameters, trained on the
# standardised training features.
lgbm_params = {
    "n_estimators": 100,
    "learning_rate": 0.1,
    "max_depth": 7,
    "random_state": 42,
}
lgbm = lgb.LGBMRegressor(**lgbm_params)

lgbm.fit(X_train_scale, y_train)

In [None]:
# Extra-trees regressor, trained on the standardised training features.
et_params = {"n_estimators": 100, "random_state": 42}
model_et = ExtraTreesRegressor(**et_params)
model_et.fit(X_train_scale, y_train)

In [None]:
# model_et was fitted on the standardised features, so the held-out rows must
# go through the same scaling before prediction (X_test_scale, not X_test).
y_predicted = model_et.predict(X_test_scale)

# Actual vs. predicted target values on the held-out split.
plt.figure(figsize=(8,6))
plt.scatter(y_test, y_predicted, color='blue', alpha=0.5)

# Axis labels
plt.xlabel('Actual Values', fontsize=12)
plt.ylabel('Predicted Values', fontsize=12)

# Show plot
plt.grid(True)
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Column labels and the matching impurity-based importances of the extra-trees
# model (both follow the column order of X).
feature_names = X.columns
importances = model_et.feature_importances_

In [None]:

# Table of importances with short feature labels, most important first.
importance_df = (
    pd.DataFrame({'Feature': feature_names, 'Importance': importances})
    .assign(Feature=lambda frame: frame['Feature'].replace(param_mapping))
    .sort_values(by='Importance', ascending=False)
)

# Horizontal bar chart of the sorted importances.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='Importance', y='Feature', data=importance_df, palette='viridis', ax=ax)

ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
plt.show()

In [None]:
# Feature importances, sorted from most to least important.
importance_df

In [None]:
# XGBoost regressor with explicit hyper-parameters, trained on the
# standardised training features.
xgb_params = {
    "objective": 'reg:squarederror',
    "max_depth": 6,
    "learning_rate": 0.1,
    "n_estimators": 100,
    "random_state": 42,
}
xgboost_regressor = XGBRegressor(**xgb_params)

xgboost_regressor.fit(X_train_scale, y_train)


In [None]:
# 3-nearest-neighbour regressor, trained on the standardised training features.
knn_params = {"n_neighbors": 3}
knn = KNeighborsRegressor(**knn_params)
knn.fit(X_train_scale, y_train)

In [None]:

# Inspect the (unscaled) training features.
X_train

In [None]:
# Re-label the training features with the names from lst.
column_names = [str(name) for name in lst]
X_train_df = pd.DataFrame(X_train, columns=column_names)

# Observed (min, max) range of every training feature, keyed by column name.
dct = {
    str(col): (X_train_df[col].min(), X_train_df[col].max())
    for col in X_train_df.columns
}

print(dct)

In [None]:
# Ask for the PCE value the optimiser should aim for and parse it as a float.
raw_target = input("Enter the target PCE percentage: ")
target_pce = float(raw_target)

In [None]:
def objective(**kwargs):
    """Score a candidate feature set by how close its predicted PCE is to the target.

    Parameters
    ----------
    **kwargs : float
        One value per key of the module-level ``pbounds`` dict, expressed in
        the original (unscaled) feature units.

    Returns
    -------
    float
        Negative absolute difference between the extra-trees prediction and
        ``target_pce``; 0 is the best achievable score.
    """
    names = list(pbounds.keys())
    # Build a one-row frame whose column order matches the order of pbounds.
    features = pd.DataFrame([[kwargs[key] for key in names]], columns=names)

    # model_et was trained on StandardScaler output, while the search bounds
    # are raw feature ranges: apply the fitted scaler before predicting.
    features_scaled = scaler.transform(features)

    pred_pce = model_et.predict(features_scaled)[0]

    return -abs(pred_pce - target_pce)


# Search space: the observed (min, max) range of every training feature.
pbounds = dct

# Bayesian search for the feature values whose prediction is closest to the
# requested target (objective returns the negated distance, so we maximise).
optimizer = BayesianOptimization(f=objective, pbounds=pbounds, random_state=42)
optimizer.maximize(n_iter=100, init_points=10)

# Parameters of the best point found so far.
best_result = optimizer.max
best_features = best_result['params']

print("Feature values for the target PCE percentage:", best_features)


In [None]:
from tabulate import tabulate

# One row per optimised feature, shown with its short label.
headers = ['Feature', 'Value']
table = []
for key, val in best_features.items():
    table.append((param_mapping[key], val))

print(tabulate(table, headers=headers, tablefmt='pretty'))


In [None]:
def objective(**kwargs):
    """Score a candidate feature set with a random-forest / XGBoost blend.

    Parameters
    ----------
    **kwargs : float
        One value per key of the module-level ``pbounds`` dict, expressed in
        the original (unscaled) feature units.

    Returns
    -------
    float
        Negative absolute difference between the blended prediction
        (0.47 * random forest + 0.53 * XGBoost) and ``target_pce``; 0 is the
        best achievable score.
    """
    names = list(pbounds.keys())
    # Build a one-row frame whose column order matches the order of pbounds.
    features = pd.DataFrame([[kwargs[key] for key in names]], columns=names)

    # Both models were trained on StandardScaler output, while the search
    # bounds are raw feature ranges: apply the fitted scaler before predicting.
    features_scaled = scaler.transform(features)

    rf_pred = model_rf.predict(features_scaled)[0]
    xgb_pred = xgboost_regressor.predict(features_scaled)[0]
    pred_pce = 0.47*rf_pred + 0.53*xgb_pred

    return -abs(pred_pce - target_pce)


# Search space: the observed (min, max) range of every training feature.
pbounds = dct

# Bayesian search using the currently defined objective (the negated distance
# between prediction and target, hence maximisation).
optimizer = BayesianOptimization(f=objective, pbounds=pbounds, random_state=42)
optimizer.maximize(n_iter=100, init_points=10)

# Parameters of the best point found so far.
best_result = optimizer.max
best_features = best_result['params']

print("Feature values for the target PCE percentage:", best_features)


In [None]:

# One row per optimised feature, shown with its full column name.
headers = ['Feature', 'Value']
table = list(best_features.items())

# Render the rows as a text table.
print(tabulate(table, headers=headers, tablefmt='pretty'))

    