In [1]:
import pandas as pd
import numpy as np

In [2]:
from mp_api.client import MPRester # Importing mp_api library
api_key = "ZbGxsvtmixirV3kWQRUGTzNaLQqdZQHD" #this is my api_key
mpr = MPRester(api_key=api_key) # creating MPRester object

In [3]:
# Define A-site (alkaline and rare earth metals) and B-site (transition metals)
alkaline_metals = ["Li", "Na", "K", "Rb", "Cs", "Be", "Mg", "Ca", "Sr", "Ba"]
rare_earth_metals = ["La", "Ce", "Pr", "Nd", "Sm", "Eu", "Gd", "Tb", "Dy", "Ho", "Er", "Tm", "Yb", "Lu"]
transition_metals = ["Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn", "Y", "Zr", "Nb", "Mo", "Ru", "Rh", 
                     "Pd", "Ag", "Cd", "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt", "Au"]

# Combine alkaline and rare earth metals for A-site elements
elements_A = alkaline_metals + rare_earth_metals
anions_X = ["O","Cl", "Br", "I", "F", "S", "Se", "Te"]  # Common X-site elements
all_elements = elements_A+anions_X+transition_metals

In [4]:
# Querying the data (oxides)
data = mpr.materials.summary.search(formula="AABCX6", fields= ['formula_pretty','nsites','chemsys','volume','density','density_atomic',
                                                             'energy_per_atom','formation_energy_per_atom','is_stable',
                                                            'band_gap','cbm','vbm','efermi','is_gap_direct',
                                                            'is_magnetic','ordering','total_magnetization',
                                                             'total_magnetization_normalized_vol','total_magnetization_normalized_formula_units',
                                                            'structure','is_metal','symmetry','energy_above_hull','elements'])

Retrieving SummaryDoc documents:   0%|          | 0/4817 [00:00<?, ?it/s]

In [5]:
def is_double_perovskite(row):
    elements = [str(el) for el in row['elements']]  # Convert elements to string format

    # A site should contain alkali, alkaline earth, or rare earth metals
    A_elements = [elem for elem in elements if elem in elements_A]
    
    # B and C sites should contain transition metals
    B_C_elements = [elem for elem in elements if elem in transition_metals]
    
    # X site should be in the anions list (oxides or halides)
    X_elements = [elem for elem in elements if elem in anions_X]
    
    # Check if the structure matches A2BCX6: 2 A elements, 2 B/C elements, 1 X element
    return len(A_elements) >= 1 and len(B_C_elements) >= 2 and len(X_elements) == 1

In [6]:
df = pd.DataFrame([doc.dict() for doc in data])

In [7]:
df['is_double_perovskite'] = df.apply(is_double_perovskite, axis=1)

In [8]:
df_double_perovskites = df[df['is_double_perovskite']]

In [9]:
df_double_perovskites[['formula_pretty','chemsys','elements']]

Unnamed: 0,formula_pretty,chemsys,elements
43,Ba2CdMoO6,Ba-Cd-Mo-O,"[Ba, Cd, Mo, O]"
44,Ba2CdOsO6,Ba-Cd-O-Os,"[Ba, Cd, O, Os]"
45,Ba2CdReO6,Ba-Cd-O-Re,"[Ba, Cd, O, Re]"
71,Ba2CoMoO6,Ba-Co-Mo-O,"[Ba, Co, Mo, O]"
72,Ba2CoMoO6,Ba-Co-Mo-O,"[Ba, Co, Mo, O]"
...,...,...,...
4806,Rb2YAuI6,Au-I-Rb-Y,"[Au, I, Rb, Y]"
4807,Rb2YCuBr6,Br-Cu-Rb-Y,"[Br, Cu, Rb, Y]"
4808,Rb2YCuCl6,Cl-Cu-Rb-Y,"[Cl, Cu, Rb, Y]"
4809,Rb2YCuF6,Cu-F-Rb-Y,"[Cu, F, Rb, Y]"


In [10]:
df_double_perovskites.info()

<class 'pandas.core.frame.DataFrame'>
Index: 920 entries, 43 to 4810
Data columns (total 71 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   builder_meta                                  0 non-null      object 
 1   nsites                                        920 non-null    int64  
 2   elements                                      920 non-null    object 
 3   nelements                                     0 non-null      object 
 4   composition                                   0 non-null      object 
 5   composition_reduced                           0 non-null      object 
 6   formula_pretty                                920 non-null    object 
 7   formula_anonymous                             0 non-null      object 
 8   chemsys                                       920 non-null    object 
 9   volume                                        920 non-null    float6

In [11]:
columns_of_interest = ['nsites','chemsys','volume','density','density_atomic',
                       'energy_per_atom','formation_energy_per_atom','is_stable',
                        'band_gap','cbm','vbm','efermi','is_gap_direct','is_metal',
                        'is_magnetic','ordering','total_magnetization',
                        'total_magnetization_normalized_vol','total_magnetization_normalized_formula_units',
                      'symmetry','structure','energy_above_hull']

In [12]:
refined_perovskite_df = df_double_perovskites[columns_of_interest]

In [13]:
refined_perovskite_df

Unnamed: 0,nsites,chemsys,volume,density,density_atomic,energy_per_atom,formation_energy_per_atom,is_stable,band_gap,cbm,...,is_gap_direct,is_metal,is_magnetic,ordering,total_magnetization,total_magnetization_normalized_vol,total_magnetization_normalized_formula_units,symmetry,structure,energy_above_hull
43,10,Ba-Cd-Mo-O,152.172247,6.318198,15.217225,-7.002457,-2.466738,True,2.5376,4.6264,...,False,False,False,NM,0.000024,1.550874e-07,0.000024,"{'crystal_system': 'Cubic', 'symbol': 'Fm-3m',...","{'@module': 'pymatgen.core.structure', '@class...",0.000000
44,10,Ba-Cd-O-Os,150.605592,7.423540,15.060559,-6.794457,-2.220566,True,0.0000,,...,False,True,True,FM,1.999998,1.327971e-02,1.999998,"{'crystal_system': 'Cubic', 'symbol': 'Fm-3m',...","{'@module': 'pymatgen.core.structure', '@class...",0.000000
45,10,Ba-Cd-O-Re,151.219499,7.349226,15.121950,-7.155684,-2.459718,False,0.0000,,...,False,True,True,FM,1.000072,6.613381e-03,1.000072,"{'crystal_system': 'Cubic', 'symbol': 'Fm-3m',...","{'@module': 'pymatgen.core.structure', '@class...",0.000618
71,20,Ba-Co-Mo-O,276.766068,6.306065,13.838303,-7.613764,-2.460704,True,0.0000,,...,False,True,True,AFM,0.010982,3.968008e-05,0.005491,"{'crystal_system': 'Cubic', 'symbol': 'Fm-3m',...","{'@module': 'pymatgen.core.structure', '@class...",0.000000
72,20,Ba-Co-Mo-O,277.167172,6.296940,13.858359,-7.604548,-2.451488,False,1.2740,4.2504,...,True,False,True,AFM,0.000002,7.576655e-09,0.000001,"{'crystal_system': 'Tetragonal', 'symbol': 'I4...","{'@module': 'pymatgen.core.structure', '@class...",0.009216
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4806,10,Au-I-Rb-Y,443.643517,4.559802,44.364352,-3.403616,-1.345192,False,1.1678,2.4738,...,False,False,False,NM,0.001489,3.355848e-06,0.001489,"{'crystal_system': 'Cubic', 'symbol': 'Fm-3m',...","{'@module': 'pymatgen.core.structure', '@class...",0.105849
4807,10,Br-Cu-Rb-Y,332.890138,4.004624,33.289014,-4.048370,-1.859617,False,2.3405,3.4050,...,False,False,False,NM,0.000269,8.068728e-07,0.000269,"{'crystal_system': 'Cubic', 'symbol': 'Fm-3m',...","{'@module': 'pymatgen.core.structure', '@class...",0.035860
4808,10,Cl-Cu-Rb-Y,283.652711,3.138430,28.365271,-4.496769,-2.130710,False,2.6201,4.4170,...,False,False,False,NM,0.000190,6.698332e-07,0.000190,"{'crystal_system': 'Cubic', 'symbol': 'Fm-3m',...","{'@module': 'pymatgen.core.structure', '@class...",0.042300
4809,10,Cu-F-Rb-Y,179.308176,4.050474,17.930818,-5.549890,-3.146066,False,0.9338,3.3825,...,False,False,False,NM,0.000227,1.267092e-06,0.000227,"{'crystal_system': 'Cubic', 'symbol': 'Fm-3m',...","{'@module': 'pymatgen.core.structure', '@class...",0.084484


In [14]:
refined_perovskite_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 920 entries, 43 to 4810
Data columns (total 22 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   nsites                                        920 non-null    int64  
 1   chemsys                                       920 non-null    object 
 2   volume                                        920 non-null    float64
 3   density                                       920 non-null    float64
 4   density_atomic                                920 non-null    float64
 5   energy_per_atom                               920 non-null    float64
 6   formation_energy_per_atom                     920 non-null    float64
 7   is_stable                                     920 non-null    bool   
 8   band_gap                                      920 non-null    float64
 9   cbm                                           594 non-null    float6

In [15]:
# Now, as we see above, the structure and symmetry columns don't look good
# We need to extract the information useful for us from those columns and make separate columns

# Extracting crystal_system
refined_perovskite_df['crystal_system'] = refined_perovskite_df['symmetry'].apply(lambda x: x['crystal_system'])

# Extracting abc and angles
def extract_abc_angles(structure):
    abc = structure['lattice']['a'], structure['lattice']['b'], structure['lattice']['c']
    angles = structure['lattice']['alpha'], structure['lattice']['beta'], structure['lattice']['gamma']
    return pd.Series({'a': abc[0], 'b': abc[1], 'c': abc[2], 'alpha': angles[0], 'beta': angles[1], 'gamma': angles[2]})

refined_perovskite_df[['a', 'b', 'c', 'alpha', 'beta', 'gamma']] = refined_perovskite_df['structure'].apply(extract_abc_angles)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  refined_perovskite_df['crystal_system'] = refined_perovskite_df['symmetry'].apply(lambda x: x['crystal_system'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  refined_perovskite_df[['a', 'b', 'c', 'alpha', 'beta', 'gamma']] = refined_perovskite_df['structure'].apply(extract_abc_angles)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.ht

In [16]:
refined_perovskite_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 920 entries, 43 to 4810
Data columns (total 29 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   nsites                                        920 non-null    int64  
 1   chemsys                                       920 non-null    object 
 2   volume                                        920 non-null    float64
 3   density                                       920 non-null    float64
 4   density_atomic                                920 non-null    float64
 5   energy_per_atom                               920 non-null    float64
 6   formation_energy_per_atom                     920 non-null    float64
 7   is_stable                                     920 non-null    bool   
 8   band_gap                                      920 non-null    float64
 9   cbm                                           594 non-null    float6

In [17]:
# We can observe that we have successfully extracted the crystal_system, abc and angles
# Now we have to extract data from the chemsys column. We need to separate the elements and store in separate columns

# Extracting chemsys data
# Split the chemsys column into separate elements
split_chemsys = refined_perovskite_df['chemsys'].str.split('-', expand=True)

# Assign new columns for each element
refined_perovskite_df['element_1'] = split_chemsys[0]
refined_perovskite_df['element_2'] = split_chemsys[1]
refined_perovskite_df['element_3'] = split_chemsys[2]
refined_perovskite_df['element_4'] = split_chemsys[3]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  refined_perovskite_df['element_1'] = split_chemsys[0]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  refined_perovskite_df['element_2'] = split_chemsys[1]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  refined_perovskite_df['element_3'] = split_chemsys[2]
A value is trying to be set on a copy of a 

In [18]:
refined_perovskite_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 920 entries, 43 to 4810
Data columns (total 33 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   nsites                                        920 non-null    int64  
 1   chemsys                                       920 non-null    object 
 2   volume                                        920 non-null    float64
 3   density                                       920 non-null    float64
 4   density_atomic                                920 non-null    float64
 5   energy_per_atom                               920 non-null    float64
 6   formation_energy_per_atom                     920 non-null    float64
 7   is_stable                                     920 non-null    bool   
 8   band_gap                                      920 non-null    float64
 9   cbm                                           594 non-null    float6

In [19]:
# From the above output, we have successfully extracted the elements in the perovskite
# But, ML models need numbers to work with
# We can encode the elements coulmns with their respective atomic numbers

# Predefined dictionary for atomic numbers
atomic_numbers = {
    'H': 1, 'He': 2, 'Li': 3, 'Be': 4, 'B': 5, 'C': 6, 'N': 7, 'O': 8, 'F': 9, 'Ne': 10,
    'Na': 11, 'Mg': 12, 'Al': 13, 'Si': 14, 'P': 15, 'S': 16, 'Cl': 17, 'Ar': 18, 'K': 19, 'Ca': 20,
    'Sc': 21, 'Ti': 22, 'V': 23, 'Cr': 24, 'Mn': 25, 'Fe': 26, 'Co': 27, 'Ni': 28, 'Cu': 29, 'Zn': 30,
    'Ga': 31, 'Ge': 32, 'As': 33, 'Se': 34, 'Br': 35, 'Kr': 36, 'Rb': 37, 'Sr': 38, 'Y': 39, 'Zr': 40,
    'Nb': 41, 'Mo': 42, 'Tc': 43, 'Ru': 44, 'Rh': 45, 'Pd': 46, 'Ag': 47, 'Cd': 48, 'In': 49, 'Sn': 50,
    'Sb': 51, 'Te': 52, 'I': 53, 'Xe': 54, 'Cs': 55, 'Ba': 56, 'La': 57, 'Ce': 58, 'Pr': 59, 'Nd': 60,
    'Pm': 61, 'Sm': 62, 'Eu': 63, 'Gd': 64, 'Tb': 65, 'Dy': 66, 'Ho': 67, 'Er': 68, 'Tm': 69, 'Yb': 70,
    'Lu': 71, 'Hf': 72, 'Ta': 73, 'W': 74, 'Re': 75, 'Os': 76, 'Ir': 77, 'Pt': 78, 'Au': 79, 'Hg': 80,
    'Tl': 81, 'Pb': 82, 'Bi': 83, 'Po': 84, 'At': 85, 'Rn': 86, 'Fr': 87, 'Ra': 88, 'Ac': 89, 'Th': 90,
    'Pa': 91, 'U': 92, 'Np': 93, 'Pu': 94, 'Am': 95, 'Cm': 96, 'Bk': 97, 'Cf': 98, 'Es': 99, 'Fm': 100,
    'Md': 101, 'No': 102, 'Lr': 103, 'Rf': 104, 'Db': 105, 'Sg': 106, 'Bh': 107, 'Hs': 108, 'Mt': 109,
    'Ds': 110, 'Rg': 111, 'Cn': 112, 'Nh': 113, 'Fl': 114, 'Mc': 115, 'Lv': 116, 'Ts': 117, 'Og': 118
}

# Function to extract elements and atomic numbers
def extract_elements_atomic_numbers(row):
    elements = row['chemsys'].split('-')
    atomic_nums = [atomic_numbers[element] for element in elements]
    return pd.Series(atomic_nums)

refined_perovskite_df[['atomic_num1', 'atomic_num2', 'atomic_num3','atomic_num4']] = refined_perovskite_df.apply(extract_elements_atomic_numbers, axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  refined_perovskite_df[['atomic_num1', 'atomic_num2', 'atomic_num3','atomic_num4']] = refined_perovskite_df.apply(extract_elements_atomic_numbers, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  refined_perovskite_df[['atomic_num1', 'atomic_num2', 'atomic_num3','atomic_num4']] = refined_perovskite_df.apply(extract_elements_atomic_numbers, axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: 

In [21]:
refined_perovskite_df

Unnamed: 0,nsites,chemsys,volume,density,density_atomic,energy_per_atom,formation_energy_per_atom,is_stable,band_gap,cbm,...,beta,gamma,element_1,element_2,element_3,element_4,atomic_num1,atomic_num2,atomic_num3,atomic_num4
43,10,Ba-Cd-Mo-O,152.172247,6.318198,15.217225,-7.002457,-2.466738,True,2.5376,4.6264,...,60.000000,60.000000,Ba,Cd,Mo,O,56,48,42,8
44,10,Ba-Cd-O-Os,150.605592,7.423540,15.060559,-6.794457,-2.220566,True,0.0000,,...,60.000000,60.000000,Ba,Cd,O,Os,56,48,8,76
45,10,Ba-Cd-O-Re,151.219499,7.349226,15.121950,-7.155684,-2.459718,False,0.0000,,...,60.000000,60.000000,Ba,Cd,O,Re,56,48,8,75
71,20,Ba-Co-Mo-O,276.766068,6.306065,13.838303,-7.613764,-2.460704,True,0.0000,,...,90.021108,119.962715,Ba,Co,Mo,O,56,27,42,8
72,20,Ba-Co-Mo-O,277.167172,6.296940,13.858359,-7.604548,-2.451488,False,1.2740,4.2504,...,119.353138,73.335321,Ba,Co,Mo,O,56,27,42,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4806,10,Au-I-Rb-Y,443.643517,4.559802,44.364352,-3.403616,-1.345192,False,1.1678,2.4738,...,60.000000,60.000000,Au,I,Rb,Y,79,53,37,39
4807,10,Br-Cu-Rb-Y,332.890138,4.004624,33.289014,-4.048370,-1.859617,False,2.3405,3.4050,...,60.000000,60.000000,Br,Cu,Rb,Y,35,29,37,39
4808,10,Cl-Cu-Rb-Y,283.652711,3.138430,28.365271,-4.496769,-2.130710,False,2.6201,4.4170,...,60.000000,60.000000,Cl,Cu,Rb,Y,17,29,37,39
4809,10,Cu-F-Rb-Y,179.308176,4.050474,17.930818,-5.549890,-3.146066,False,0.9338,3.3825,...,60.000000,60.000000,Cu,F,Rb,Y,29,9,37,39


In [22]:
# We have successfully extracted the atomic numbers also
# Now we can drop the symmetry and structure columns as we have the data

refined_perovskite_df = refined_perovskite_df.drop(columns=['symmetry','structure'])

In [24]:
refined_perovskite_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 920 entries, 43 to 4810
Data columns (total 35 columns):
 #   Column                                        Non-Null Count  Dtype  
---  ------                                        --------------  -----  
 0   nsites                                        920 non-null    int64  
 1   chemsys                                       920 non-null    object 
 2   volume                                        920 non-null    float64
 3   density                                       920 non-null    float64
 4   density_atomic                                920 non-null    float64
 5   energy_per_atom                               920 non-null    float64
 6   formation_energy_per_atom                     920 non-null    float64
 7   is_stable                                     920 non-null    bool   
 8   band_gap                                      920 non-null    float64
 9   cbm                                           594 non-null    float6

In [25]:
from mendeleev import element

# Step 1: Extract unique elements from your DataFrame
unique_elements = pd.concat([refined_perovskite_df['element_1'], 
                             refined_perovskite_df['element_2'], 
                             refined_perovskite_df['element_3'],
                             refined_perovskite_df['element_4']]).unique()

# Step 2: Fetch properties for each unique element
element_properties = {}

In [28]:
## The below properties are extracted from pymatgen ##

from pymatgen.core import Element

# List of properties to extract
properties = ['group', 'row', 'electronegativity', 'atomic_radius', 'atomic_radius_calculated', 
              'van_der_waals_radius', 'mendeleev_no', 'molar_volume', 
              'electron_affinity', 'ionization_energy', 'average_ionic_radius', 
              'coefficient_of_linear_thermal_expansion', 'density_of_solid', 
              'boiling_point', 'melting_point', 'thermal_conductivity']

for elem in unique_elements:
    try:
        element = Element(elem)
        element_properties[elem] = {
            'group': element.group,
            'row': element.row,
            'electronegativity': element.X,
            'atomic_radius': element.atomic_radius,
            'atomic_radius_calculated': element.atomic_radius_calculated,
            'van_der_waals_radius': element.van_der_waals_radius,
            'mendeleev_no': element.mendeleev_no,
            'molar_volume': element.molar_volume,
            'electron_affinity': element.electron_affinity,
            'ionization_energy': element.ionization_energies[0] if element.ionization_energies else np.nan,
            'average_ionic_radius': element.average_ionic_radius,
            'coefficient_of_linear_thermal_expansion': element.coefficient_of_linear_thermal_expansion,
            'density_of_solid': element.density_of_solid,
            'boiling_point': element.boiling_point,
            'melting_point': element.melting_point,
            'thermal_conductivity': element.thermal_conductivity
        }
    except Exception as e:
        # If data is missing, fill with NaN
        element_properties[elem] = {prop: np.nan for prop in properties}
        print(f"Data not available for element {elem}: {e}")

# Now mapping the properties to the DataFrame
for i in range(1, 5):  # element_1, element_2, element_3, element_4
    for prop in properties:
        refined_perovskite_df[f'element_{i}_{prop}'] = refined_perovskite_df[f'element_{i}'].map(lambda x: element_properties[x][prop])

# After this, the periodic table properties for each element will be added to the dataframe.



In [30]:
refined_perovskite_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 920 entries, 43 to 4810
Data columns (total 99 columns):
 #   Column                                             Non-Null Count  Dtype  
---  ------                                             --------------  -----  
 0   nsites                                             920 non-null    int64  
 1   chemsys                                            920 non-null    object 
 2   volume                                             920 non-null    float64
 3   density                                            920 non-null    float64
 4   density_atomic                                     920 non-null    float64
 5   energy_per_atom                                    920 non-null    float64
 6   formation_energy_per_atom                          920 non-null    float64
 7   is_stable                                          920 non-null    bool   
 8   band_gap                                           920 non-null    float64
 9   cbm          

In [31]:
from mendeleev import element
for elem in unique_elements:
    try:
        el = element(elem)
        element_properties[elem] = {
            'covalent_radius': el.covalent_radius,
            'fusion_heat': el.fusion_heat,  # Heat of fusion
            'atomic_mass': el.atomic_weight,
            'specific_heat': el.specific_heat,  # Specific heat
            'evaporation_heat': el.evaporation_heat,  # Heat of vaporization
            'dipole_polarizability': el.dipole_polarizability,  # Dipole polarizability
            'density': el.density
        }
    except Exception as e:
        element_properties[elem] = {key: None for key in [
            'covalent_radius', 'fusion_heat', 'atomic_mass', 'specific_heat', 
            'evaporation_heat', 'dipole_polarizability', 'density'
        ]}
        print(f"Data not available for element {elem}: {e}")

# Step 3: Map properties back to the DataFrame
for i in range(1, 5):  # element_1, element_2, element_3 columns
    refined_perovskite_df[f'element_{i}_covalent_radius'] = refined_perovskite_df[f'element_{i}'].map(lambda x: element_properties[x]['covalent_radius'])
    refined_perovskite_df[f'element_{i}_fusion_heat'] = refined_perovskite_df[f'element_{i}'].map(lambda x: element_properties[x]['fusion_heat'])
    refined_perovskite_df[f'element_{i}_atomic_mass'] = refined_perovskite_df[f'element_{i}'].map(lambda x: element_properties[x]['atomic_mass'])
    refined_perovskite_df[f'element_{i}_specific_heat'] = refined_perovskite_df[f'element_{i}'].map(lambda x: element_properties[x]['specific_heat'])
    refined_perovskite_df[f'element_{i}_evaporation_heat'] = refined_perovskite_df[f'element_{i}'].map(lambda x: element_properties[x]['evaporation_heat'])
    refined_perovskite_df[f'element_{i}_dipole_polarizability'] = refined_perovskite_df[f'element_{i}'].map(lambda x: element_properties[x]['dipole_polarizability'])
    refined_perovskite_df[f'element_{i}_density'] = refined_perovskite_df[f'element_{i}'].map(lambda x: element_properties[x]['density'])


  refined_perovskite_df[f'element_{i}_specific_heat'] = refined_perovskite_df[f'element_{i}'].map(lambda x: element_properties[x]['specific_heat'])
  refined_perovskite_df[f'element_{i}_evaporation_heat'] = refined_perovskite_df[f'element_{i}'].map(lambda x: element_properties[x]['evaporation_heat'])
  refined_perovskite_df[f'element_{i}_dipole_polarizability'] = refined_perovskite_df[f'element_{i}'].map(lambda x: element_properties[x]['dipole_polarizability'])
  refined_perovskite_df[f'element_{i}_density'] = refined_perovskite_df[f'element_{i}'].map(lambda x: element_properties[x]['density'])
  refined_perovskite_df[f'element_{i}_covalent_radius'] = refined_perovskite_df[f'element_{i}'].map(lambda x: element_properties[x]['covalent_radius'])
  refined_perovskite_df[f'element_{i}_fusion_heat'] = refined_perovskite_df[f'element_{i}'].map(lambda x: element_properties[x]['fusion_heat'])
  refined_perovskite_df[f'element_{i}_atomic_mass'] = refined_perovskite_df[f'element_{i}'].map(lambda

In [34]:
refined_perovskite_df

Unnamed: 0,nsites,chemsys,volume,density,density_atomic,energy_per_atom,formation_energy_per_atom,is_stable,band_gap,cbm,...,element_3_evaporation_heat,element_3_dipole_polarizability,element_3_density,element_4_covalent_radius,element_4_fusion_heat,element_4_atomic_mass,element_4_specific_heat,element_4_evaporation_heat,element_4_dipole_polarizability,element_4_density
43,10,Ba-Cd-Mo-O,152.172247,6.318198,15.217225,-7.002457,-2.466738,True,2.5376,4.6264,...,590.0,87.0,10.200000,63.0,,15.99900,0.918,,5.3,0.001308
44,10,Ba-Cd-O-Os,150.605592,7.423540,15.060559,-6.794457,-2.220566,True,0.0000,,...,,5.3,0.001308,129.0,31.7,190.23000,0.130,738.0,57.0,22.587200
45,10,Ba-Cd-O-Re,151.219499,7.349226,15.121950,-7.155684,-2.459718,False,0.0000,,...,,5.3,0.001308,131.0,34.0,186.20700,0.137,704.0,62.0,20.800000
71,20,Ba-Co-Mo-O,276.766068,6.306065,13.838303,-7.613764,-2.460704,True,0.0000,,...,590.0,87.0,10.200000,63.0,,15.99900,0.918,,5.3,0.001308
72,20,Ba-Co-Mo-O,277.167172,6.296940,13.858359,-7.604548,-2.451488,False,1.2740,4.2504,...,590.0,87.0,10.200000,63.0,,15.99900,0.918,,5.3,0.001308
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4806,10,Au-I-Rb-Y,443.643517,4.559802,44.364352,-3.403616,-1.345192,False,1.1678,2.4738,...,75.8,319.8,1.530000,163.0,11.5,88.90584,0.298,367.0,162.0,4.470000
4807,10,Br-Cu-Rb-Y,332.890138,4.004624,33.289014,-4.048370,-1.859617,False,2.3405,3.4050,...,75.8,319.8,1.530000,163.0,11.5,88.90584,0.298,367.0,162.0,4.470000
4808,10,Cl-Cu-Rb-Y,283.652711,3.138430,28.365271,-4.496769,-2.130710,False,2.6201,4.4170,...,75.8,319.8,1.530000,163.0,11.5,88.90584,0.298,367.0,162.0,4.470000
4809,10,Cu-F-Rb-Y,179.308176,4.050474,17.930818,-5.549890,-3.146066,False,0.9338,3.3825,...,75.8,319.8,1.530000,163.0,11.5,88.90584,0.298,367.0,162.0,4.470000


In [35]:
# Later, while doing the data analysis, some columns will be dropped and some will be added
# We should save this data in csv format so that all we need not to repeat all these steps

csv_file = 'MaterialsProject_Double_Perovskite_data.csv'
refined_perovskite_df.to_csv(csv_file, index=False) # Converting the extracted data into a csv and saving it!