In [None]:
from pymatgen.core import Composition
from matminer.featurizers.conversions import StrToComposition
import pandas as pd
def convert_formula(formula):
    """
    Rewrite a chemical formula as concatenated "<symbol><amount>" pieces.

    The formula is parsed with pymatgen's ``Composition``. Each element is written
    as its symbol followed by its amount rounded to 4 decimals; an amount equal
    to 1 is omitted, so only the symbol is written for that element.
    :param formula: Chemical formula string accepted by ``Composition``
    :return: Re-written formula string
    """
    pieces = []
    for species, quantity in Composition(formula).items():
        # Amounts of exactly 1 are left implicit; everything else is spelled out.
        suffix = "" if quantity == 1 else str(round(quantity, 4))
        pieces.append(str(species) + suffix)
    return "".join(pieces)
# Load the raw dataset; the 'formula' column holds the chemical formula strings.
all_data=pd.read_excel('./O_p_band.xlsx')
# Rewrite every formula with convert_formula into the 'formula composition' column.
all_data['formula composition'] = all_data['formula'].apply(convert_formula)
# Featurize the 'formula composition' strings with matminer's StrToComposition,
# writing the result to a 'formula_obj' column of the returned frame.
cathode_data = StrToComposition(target_col_id='formula_obj').featurize_dataframe(all_data, 'formula composition')
# Copy the featurized column back onto all_data.
# NOTE(review): this is redundant if featurize_dataframe already modified all_data
# in place (depends on the matminer version) - TODO confirm.
all_data['formula_obj'] = cathode_data['formula_obj']


StrToComposition:   0%|          | 0/230 [00:00<?, ?it/s]

In [2]:
# Preview the last rows to check the derived 'formula composition' and 'formula_obj' columns.
all_data.tail()

Unnamed: 0,formula,p Band Center,formula composition,formula_obj
225,Sr0.500Li0.500Fe1O3,-1.679989,Sr0.5Li0.5FeO3.0,"(Sr, Li, Fe, O)"
226,La0.167Sr0.167K0.167Nd0.167Gd0.167Bi0.167Fe1O3,-1.077504,La0.167Sr0.167K0.167Nd0.167Gd0.167Bi0.167FeO3.0,"(La, Sr, K, Nd, Gd, Bi, Fe, O)"
227,La0.333Sr0.333Bi0.333Fe1O3,-2.048143,La0.333Sr0.333Bi0.333FeO3.0,"(La, Sr, Bi, Fe, O)"
228,La0.333Sr0.333Pr0.333Fe1O3,-1.683672,La0.333Sr0.333Pr0.333FeO3.0,"(La, Sr, Pr, Fe, O)"
229,La0.333Sr0.333Gd0.333Fe1O3,-1.759687,La0.333Sr0.333Gd0.333FeO3.0,"(La, Sr, Gd, Fe, O)"


In [None]:
import math
import pandas as pd
from pymatgen.core import Composition

# Element symbols that extract_element_moles assigns to the A site and to the B site.
# NOTE(review): an element in neither list (other than "O") is not counted at all.
# "Bi" occurs in the dataset formulas but in neither list, and "Mg" is listed as a
# B-site element - confirm both are intended.
A_sites = ["Li", "Na", "K", "Ca", "Sr", "Ba",
           "La", "Ce", "Pr", "Nd", "Sm", "Eu", "Gd", "Y"]
B_sites = ["Ti", "Zr", "Hf", "V", "Nb", "Ta", "Cr", "Mo", "W",
           "Mn", "Fe", "Co", "Ni", "Cu", "Zn", "Ga", "In", "Sn", "Pb","Mg"]


def extract_element_moles(formula):
    """
    Extract each element and its mole amount from a chemical formula and
    classify them into A, B, and O sites.

    The formula must be made of whitespace-separated tokens, each an element
    symbol optionally followed by its amount (e.g. "La0.5 Sr0.5 Fe1 O3").
    A token without an amount counts as 1.0. An element that is neither in
    ``A_sites`` nor in ``B_sites`` nor "O" is left out of every dictionary and
    reported with a ``UserWarning``.
    :param formula: Chemical formula string with whitespace-separated elements
    :return: Dictionaries of mole amounts for A, B, and O site elements
    """
    import warnings

    A_moles = {}
    B_moles = {}
    O_moles = {}
    for token in formula.split():
        element_symbol = ''.join(filter(str.isalpha, token))
        # Keep digits and the decimal point so fractional amounts survive.
        amount_text = ''.join(ch for ch in token if ch.isdigit() or ch == '.')
        # A bare symbol such as "Fe" carries an implicit amount of 1.
        element_moles = float(amount_text) if amount_text else 1.0
        if element_symbol in A_sites:
            A_moles[element_symbol] = element_moles
        elif element_symbol in B_sites:
            B_moles[element_symbol] = element_moles
        elif element_symbol == 'O':
            O_moles[element_symbol] = element_moles
        else:
            # Make the omission visible instead of dropping the element silently.
            warnings.warn(
                f"Element {element_symbol!r} in formula {formula!r} is not assigned "
                "to the A, B or O site and is ignored",
                stacklevel=2,
            )
    return A_moles, B_moles, O_moles


def calculate_position_entropy(moles_dict):
    """
    Compute the mixing-entropy term -sum(x * ln(x)) for one site, where x is
    each element's share of the site's total mole amount.

    Entries whose amount is not greater than zero are skipped; an empty
    dictionary yields 0.
    :param moles_dict: Dictionary mapping element symbol to mole amount at the site
    :return: Entropy contribution of the site
    """
    site_total = sum(moles_dict.values())
    site_entropy = 0
    for amount in moles_dict.values():
        if not amount > 0:
            continue
        fraction = amount / site_total
        site_entropy = site_entropy - fraction * math.log(fraction)
    return site_entropy


def calculate_entropy(formula):
    """
    Combine the A-, B- and O-site entropy terms of a formula into one value.

    A ``Composition`` argument is converted with ``str`` first. The formula is
    split into sites by ``extract_element_moles`` and each site is scored with
    ``calculate_position_entropy``; the result is the negated sum of the three
    site terms.
    NOTE(review): each site term is already non-negative, so the negation makes
    the returned value <= 0 - confirm this sign is intended.
    :param formula: Chemical formula string (or ``Composition``)
    :return: Negated sum of the three site entropy terms
    """
    formula_text = str(formula) if isinstance(formula, Composition) else formula
    a_site, b_site, o_site = extract_element_moles(formula_text)
    a_term = calculate_position_entropy(a_site)
    b_term = calculate_position_entropy(b_site)
    o_term = calculate_position_entropy(o_site)
    return -(a_term + b_term + o_term)

# Score every row from the string form of its 'formula_obj' entry.
all_data['formula entropy'] = [calculate_entropy(str(composition)) for composition in all_data['formula_obj']]

In [5]:
# Display the frame to inspect the new 'formula entropy' column.
all_data

Unnamed: 0,formula,p Band Center,formula composition,formula_obj,formula entropy
0,La0.167Sr0.167Mg0.167Pr0.167Nd0.167Bi0.167Fe1O3,-2.342580,La0.167Sr0.167Mg0.167Pr0.167Nd0.167Bi0.167FeO3.0,"(La, Sr, Mg, Pr, Nd, Bi, Fe, O)",-1.796849
1,La0.167Sr0.167Na0.167Mg0.167Sm0.167Bi0.167Fe1O3,-2.102172,La0.167Sr0.167Na0.167Mg0.167Sm0.167Bi0.167FeO3.0,"(La, Sr, Na, Mg, Sm, Bi, Fe, O)",-1.796849
2,La0.167Sr0.167Mg0.167Nd0.167Gd0.167Bi0.167Fe1O3,-2.360420,La0.167Sr0.167Mg0.167Nd0.167Gd0.167Bi0.167FeO3.0,"(La, Sr, Mg, Nd, Gd, Bi, Fe, O)",-1.796849
3,La0.167Sr0.167Mg0.167Sm0.167Eu0.167Bi0.167Fe1O3,-2.428931,La0.167Sr0.167Mg0.167Sm0.167Eu0.167Bi0.167FeO3.0,"(La, Sr, Mg, Sm, Eu, Bi, Fe, O)",-1.796849
4,La0.167Sr0.167Mg0.167Nd0.167Gd0.167Y0.167Fe1O3,-1.959921,La0.167Sr0.167Mg0.167Nd0.167Gd0.167Y0.167FeO3.0,"(La, Sr, Mg, Nd, Gd, Y, Fe, O)",-2.019993
...,...,...,...,...,...
225,Sr0.500Li0.500Fe1O3,-1.679989,Sr0.5Li0.5FeO3.0,"(Sr, Li, Fe, O)",-0.693147
226,La0.167Sr0.167K0.167Nd0.167Gd0.167Bi0.167Fe1O3,-1.077504,La0.167Sr0.167K0.167Nd0.167Gd0.167Bi0.167FeO3.0,"(La, Sr, K, Nd, Gd, Bi, Fe, O)",-1.609438
227,La0.333Sr0.333Bi0.333Fe1O3,-2.048143,La0.333Sr0.333Bi0.333FeO3.0,"(La, Sr, Bi, Fe, O)",-0.693147
228,La0.333Sr0.333Pr0.333Fe1O3,-1.683672,La0.333Sr0.333Pr0.333FeO3.0,"(La, Sr, Pr, Fe, O)",-1.098612


In [7]:
# Preview the first rows, including the 'A_site_obj' and 'B_site_obj' columns.
# NOTE(review): no cell shown in this notebook creates 'A_site_obj' / 'B_site_obj';
# confirm the cell that derives them is present so Restart & Run All works.
all_data.head()


Unnamed: 0,formula,p Band Center,formula composition,formula_obj,formula entropy,A_site_obj,B_site_obj
0,La0.167Sr0.167Mg0.167Pr0.167Nd0.167Bi0.167Fe1O3,-2.34258,La0.167Sr0.167Mg0.167Pr0.167Nd0.167Bi0.167FeO3.0,La0.167 Sr0.167 Mg0.167 Pr0.167 Nd0.167 Bi0.16...,-1.796849,La0.167Sr0.167Mg0.167Pr0.167Nd0.167Bi0.167,Fe1
1,La0.167Sr0.167Na0.167Mg0.167Sm0.167Bi0.167Fe1O3,-2.102172,La0.167Sr0.167Na0.167Mg0.167Sm0.167Bi0.167FeO3.0,La0.167 Sr0.167 Na0.167 Mg0.167 Sm0.167 Bi0.16...,-1.796849,La0.167Sr0.167Na0.167Mg0.167Sm0.167Bi0.167,Fe1
2,La0.167Sr0.167Mg0.167Nd0.167Gd0.167Bi0.167Fe1O3,-2.36042,La0.167Sr0.167Mg0.167Nd0.167Gd0.167Bi0.167FeO3.0,La0.167 Sr0.167 Mg0.167 Nd0.167 Gd0.167 Bi0.16...,-1.796849,La0.167Sr0.167Mg0.167Nd0.167Gd0.167Bi0.167,Fe1
3,La0.167Sr0.167Mg0.167Sm0.167Eu0.167Bi0.167Fe1O3,-2.428931,La0.167Sr0.167Mg0.167Sm0.167Eu0.167Bi0.167FeO3.0,La0.167 Sr0.167 Mg0.167 Sm0.167 Eu0.167 Bi0.16...,-1.796849,La0.167Sr0.167Mg0.167Sm0.167Eu0.167Bi0.167,Fe1
4,La0.167Sr0.167Mg0.167Nd0.167Gd0.167Y0.167Fe1O3,-1.959921,La0.167Sr0.167Mg0.167Nd0.167Gd0.167Y0.167FeO3.0,La0.167 Sr0.167 Mg0.167 Nd0.167 Gd0.167 Y0.167...,-2.019993,La0.167Sr0.167Mg0.167Nd0.167Gd0.167Y0.167,Fe1


In [None]:
import re
def parse_formula(formula):
    """
    Parse a chemical formula and return its elements with their ratios.

    An element symbol is one uppercase letter followed by optional lowercase
    letters; the number that directly follows it is its ratio. A symbol with no
    number has ratio 1.0. If a symbol occurs more than once, its ratios are
    added together.
    :param formula: Chemical formula string such as "La0.5Sr0.5"
    :return: Dictionary mapping element symbol to its ratio (float)
    """
    elements = {}
    # Regular expression to match element symbols and their corresponding numbers
    pattern = re.compile(r'([A-Z][a-z]*)(\d*(?:\.\d+)?)')
    for element, ratio in pattern.findall(formula):
        amount = float(ratio) if ratio else 1.0
        # Accumulate so a repeated symbol does not overwrite its earlier amount.
        elements[element] = elements.get(element, 0.0) + amount
    return elements


def calculate_weighted_average(formula, element_table):
    """
    Calculate the ratio-weighted average of elemental properties in a formula.

    The formula is split with ``parse_formula``. For every element found in the
    'Element' column of ``element_table``, the first matching row's value in each
    property column is multiplied by the element's ratio and summed; each sum is
    then divided by the total ratio of ALL parsed elements. Elements missing
    from the table add nothing to the sums but still count in that total.
    :param formula: Chemical formula string
    :param element_table: DataFrame with an 'Element' column and the property
        columns 'Z', 'The first ionization energy', 'Eletronegativity', 'Radius',
        'Lewis acid strengths', 'g', 'p', 'mass', 'Valence electron',
        'Polarization rate'
    :return: Tuple of 10 averages in the column order above, or a tuple of
        10 ``None`` values when the total ratio is not positive
    """
    # Order defines the positions of the returned tuple; callers index into it.
    property_columns = (
        'Z',
        'The first ionization energy',
        'Eletronegativity',
        'Radius',
        'Lewis acid strengths',
        'g',
        'p',
        'mass',
        'Valence electron',
        'Polarization rate',
    )
    elements = parse_formula(formula)
    total_ratio = sum(elements.values())
    if not total_ratio > 0:
        # Same length as the success tuple so that indexing [0]..[9] stays valid.
        return (None,) * len(property_columns)

    weighted_sums = [0] * len(property_columns)
    for element, ratio in elements.items():
        element_data = element_table[element_table['Element'] == element]
        if element_data.empty:
            continue
        for position, column in enumerate(property_columns):
            weighted_sums[position] += element_data[column].values[0] * ratio

    return tuple(total / total_ratio for total in weighted_sums)


# Load the per-element property tables for the A-site and B-site elements.
element_A = pd.read_excel('element.xlsx', sheet_name='A_site_element')
# element_B is loaded but not used anywhere in this cell.
element_B = pd.read_excel('element.xlsx', sheet_name='B_site_element')

# `data` is an alias of `all_data` (not a copy), so the columns added below
# also appear on `all_data`.
# NOTE(review): the 'A_site_obj' column must already exist; no cell shown in this
# notebook creates it.
data = all_data

# Output column for each position of the tuple returned by calculate_weighted_average.
A_SITE_FEATURE_COLUMNS = [
    'A_site_Z',
    'A_site_ionization_energy',
    'A_site_eletronegativity',
    'A_site_radius',
    'A_site_lewis_acid_strengths',
    'A_site_g',
    'A_site_p',
    'A_site_mass',
    'A_site_Valence_electron',
    'A_site_Polarization_rate',
]

# Evaluate the weighted averages once per row, then fan the tuple out into columns
# (previously the same tuple was recomputed for every one of the ten columns).
a_site_averages = data['A_site_obj'].apply(lambda x: calculate_weighted_average(x, element_A))
for position, column in enumerate(A_SITE_FEATURE_COLUMNS):
    data[column] = a_site_averages.map(lambda averages, i=position: averages[i])


# Define oxygen ion radius
r_O = 1.40
# Fixed B-site radius applied to every row.
# NOTE(review): presumably the Fe ionic radius, in the same units as 'A_site_radius' - confirm.
r_B = 0.645

# Calculate tolerance factor: (r_A + r_O) / (sqrt(2) * (r_B + r_O))
data['Tolerance_factor'] = (data['A_site_radius'] + r_O) / (
        (2 ** 0.5) * (r_B + r_O))

# Output results to a new CSV file
output_file = 'data.csv'
data.to_csv(output_file, index=False)
print(f"Results saved to {output_file}")

Results saved to data.csv


In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans

# === 1. Read data ===
def _add_engineered_features(frame):
    """Return a copy of `frame` with the derived feature columns appended in order."""
    entropy = frame['formula entropy']
    eneg = frame['A_site_eletronegativity']
    radius = frame['A_site_radius']
    mass = frame['A_site_mass']
    ionization = frame['A_site_ionization_energy']
    polarization = frame['A_site_Polarization_rate']
    return frame.assign(**{
        # NOTE(review): 1.83 is presumably the B-site electronegativity - confirm.
        'A_minus_B_electronegativity': eneg - 1.83,
        'polarizability_density': polarization / (radius ** 3),
        'ionic_potential': ionization / radius,
        'radius_times_eneg': radius * eneg,
        'entropy_times_eneg': entropy * eneg,
        'entropy_times_polarization': entropy * polarization,
        'entropy_per_valence': entropy / frame['A_site_Valence_electron'],
        # Same expression as 'ionic_potential' above; both columns are kept.
        'ionization_energy_per_radius': ionization / radius,
        'polarization_per_mass': polarization / mass,
        'lewis_strength_per_radius': frame['A_site_lewis_acid_strengths'] / radius,
        'mass_times_radius': mass * radius,
        'entropy_times_ionization': entropy * ionization,
        'log_mass': np.log(mass),
        'radius_squared': radius ** 2,
        'mass_per_radius3': mass / (radius ** 3),
        'mass_times_ionization': mass * ionization,
    })


# === 2. Feature engineering (add new features) ===
df = _add_engineered_features(pd.read_csv("./data.csv"))

# Drop the formula/site helper columns before saving the feature table.
columns_to_drop = ["formula composition","formula_obj","A_site_obj","B_site_obj"]
df = df.drop(columns=columns_to_drop)
df.to_csv('data_features.csv', index=False)