In [1]:
import pandas as pd
import numpy as np
import re
import warnings
warnings.filterwarnings("ignore")

In [2]:
# DFT Calculations
dft_calc = pd.read_csv("data/dft_calc.csv").replace(" ", 0)

# Elemental Properties
elemental_prop = pd.read_csv("data/elemental_properties.csv")
elemental_prop = elemental_prop.replace(" ", 0)

df = dft_calc.copy()

In [3]:
dft_calc.head()

Unnamed: 0,COMPOSITION,A_SITE_1,A_SITE_2,A_SITE_3,B_SITE_1,B_SITE_2,B_SITE_3,X_SITE,NUM_ELEMS,ENERGY_ABOVE_HULL,FORMATION_ENERGY
0,Ba1Sr7V8O24,Ba,Sr,,V,,,O,4,29.747707,-2.113335
1,Ba2Bi2Pr4Co8O24,Ba,Bi,Pr,Co,,,O,5,106.702335,-1.311863
2,Ba2Ca6Fe8O24,Ba,Ca,,Fe,,,O,4,171.608093,-1.435607
3,Ba2Cd2Pr4Ni8O24,Ba,Cd,Pr,Ni,,,O,5,284.89819,-0.868639
4,Ba2Dy6Fe8O24,Ba,Dy,,Fe,,,O,4,270.007913,-1.746806


In [4]:
elemental_prop.head()

Unnamed: 0,SYMBOL,IONIC_RADIUS,MOD_OF_ELASTICITY,BP,MP,DENSITY,AT_WT,BCC_EFF_LAT_CNT,BCC_ENERGY,BCC_ENERGY_DIFF,...,IS_NONMETAL,ND_UNFILLED,ND_VALENCE,NF_UNFILLED,NF_VALENCE,NP_UNFILLED,NP_VALENCE,NS_UNFILLED,NS_VALENCE,N_UNFILLED
0,H,1.54,,20.28,13.81,0.0899,1.00797,3.589268,-2.135811,1.19548,...,1,0,0,0,0,0,0,1,1,1
1,He,0.0,,4.216,0.95,0.1785,4.0026,5.373995,-2.000673,-2.001808,...,1,0,0,0,0,0,0,0,2,0
2,Li,0.76,10.0,1615.0,453.7,0.53,6.941,6.416364,-1.865535,0.004352,...,0,0,0,0,0,0,0,1,1,1
3,Be,0.45,301.0,3243.0,1560.0,1.85,9.01218,4.997332,-3.655272,0.099767,...,0,0,0,0,0,0,0,0,2,0
4,B,0.23,441.0,4275.0,2365.0,2.34,10.811,4.60667,-4.966431,1.711267,...,0,0,0,0,0,5,1,0,2,5


In [5]:
df.head()

Unnamed: 0,COMPOSITION,A_SITE_1,A_SITE_2,A_SITE_3,B_SITE_1,B_SITE_2,B_SITE_3,X_SITE,NUM_ELEMS,ENERGY_ABOVE_HULL,FORMATION_ENERGY
0,Ba1Sr7V8O24,Ba,Sr,,V,,,O,4,29.747707,-2.113335
1,Ba2Bi2Pr4Co8O24,Ba,Bi,Pr,Co,,,O,5,106.702335,-1.311863
2,Ba2Ca6Fe8O24,Ba,Ca,,Fe,,,O,4,171.608093,-1.435607
3,Ba2Cd2Pr4Ni8O24,Ba,Cd,Pr,Ni,,,O,5,284.89819,-0.868639
4,Ba2Dy6Fe8O24,Ba,Dy,,Fe,,,O,4,270.007913,-1.746806


In [6]:
suffixes = "A1 A2 A3 B1 B2 B3".split()
site_names = list(dft_calc.columns[1:7])
symbol_names = [i+"_SYMBOL" for i in suffixes]
elemental_prop_names = [i+"_elemental_prop" for i in suffixes]
elemental_prop_col_names = []

In [7]:
for i in range(6):
    placeholder_df = elemental_prop.copy()
    placeholder_df.columns = suffixes[i] + "_" + placeholder_df.columns.values
    dft_calc = pd.merge(dft_calc, placeholder_df, how="left", left_on=site_names[i], right_on=symbol_names[i])
    
    placeholder_df = placeholder_df.drop(columns=[suffixes[i] + "_SYMBOL"])
    elemental_prop_col_names.append(placeholder_df.columns)

In [8]:
def num_of_sites(site):
    nums =[]
    
    for i in range(dft_calc.shape[0]):
        matches = re.findall(r"(\D*)(\d*)", dft_calc["COMPOSITION"].iloc[i])

        if type(dft_calc[site].iloc[i]) == str:
            
            for j in range(len(matches)):
                if matches[j][0] == dft_calc[site].iloc[i]:
                    nums.append(matches[j][1])
                    continue
                else:
                    continue
        else:
            nums.append(np.nan)
    return nums 

In [9]:
num_sites = []

for i in site_names:
    num_sites.append(num_of_sites(i))

In [10]:
num_col_names = ["NUM_" + i for i in suffixes]
num_col_names

['NUM_A1', 'NUM_A2', 'NUM_A3', 'NUM_B1', 'NUM_B2', 'NUM_B3']

In [11]:
for i in range(6):
    dft_calc[num_col_names[i]] = num_sites[i]

In [12]:
ionic_radius_names = [i for i in dft_calc if "_IONIC_RADIUS" in i]
ionic_radius_names

['A1_IONIC_RADIUS',
 'A2_IONIC_RADIUS',
 'A3_IONIC_RADIUS',
 'B1_IONIC_RADIUS',
 'B2_IONIC_RADIUS',
 'B3_IONIC_RADIUS']

In [13]:
np.argmax(list(dft_calc[ionic_radius_names[0:3]].astype(float).iloc[0].dropna()))


0

In [14]:
list(dft_calc[ionic_radius_names[0:3]].astype(float).iloc[0].dropna())

[1.42, 1.26]

In [15]:
# # clean version

# gs = []
# of = []
# ab = []
# ao = []
# bo = []
# a_max = []
# b_max = []
# #a_min = []
# #b_min = []

# for i in range(dft_calc.shape[0]):
    
#     ionic_radii_list_a = list(dft_calc[ionic_radius_names[0:3]].astype(float).iloc[i].dropna())
#     a_max.append(np.argmax(ionic_radii_list_a))
#     num_list_a = list(dft_calc[num_col_names[0:3]].astype(float).iloc[i].dropna())
    

#     ionic_radii_list_b = list(dft_calc[ionic_radius_names[3:6]].astype(float).iloc[i].dropna())
#     b_max.append(np.argmax(ionic_radii_list_b))
#     num_list_b = list(dft_calc[num_col_names[3:6]].astype(float).iloc[i].dropna())
    
#     a_sum = 0
    
#     sam_list_a = len(ionic_radii_list_a)
#     sam_list_b = len(ionic_radii_list_b)
    
#     for j in range(sam_list_a):
#         a = ionic_radii_list_a[j] * num_list_a[j]
#         a_sum = a_sum + a
    
#     b_sum = 0
#     for k in range(sam_list_b):
#         b = ionic_radii_list_b[k] * num_list_b[k]
#         b_sum = b_sum + b
#         of_ = (b_sum/8)/1.4

#     gs_tf = ((a_sum/8) + 1.4)/(np.sqrt(2)*((b_sum/8)+1.4))    
    
#     gs.append(gs_tf)
#     of.append(of_)
#     ab.append((a_sum/8)+(b_sum/8))
#     ao.append((a_sum/8)+1.4)
#     bo.append((b_sum/8)+1.4)
    
# dft_calc["GOLDSCHMIDT_TF"] = gs
# dft_calc["OCTAHEDRAL_FACTOR"] = of
# dft_calc["A_B"] = ab
# dft_calc["A_O"] = ao
# dft_calc["B_O"] = bo
# dft_calc["A_MAX"] = a_max
# dft_calc["B_MAX"] = b_max

In [16]:
site_names

['A_SITE_1', 'A_SITE_2', 'A_SITE_3', 'B_SITE_1', 'B_SITE_2', 'B_SITE_3']

In [17]:
gs = []
of = []
ab = []
ao = []
bo = []
a_max = []
b_max = []
a_wt_avg = []
b_wt_avg = []

for i in range(dft_calc.shape[0]):
    
    ionic_radii_list_a = list(dft_calc[ionic_radius_names[0:3]].astype(float).iloc[i].dropna())
    num_list_a = list(dft_calc[num_col_names[0:3]].astype(float).iloc[i].dropna())
    a_max.append(dft_calc[site_names[np.argmax(num_list_a)]].iloc[i])
    a_wt_avg.append(num_list_a)
    
    ionic_radii_list_b = list(dft_calc[ionic_radius_names[3:6]].astype(float).iloc[i].dropna())
    num_list_b = list(dft_calc[num_col_names[3:6]].astype(float).iloc[i].dropna())
    b_max.append(dft_calc[site_names[np.argmax(num_list_b)+3]].iloc[i])
    b_wt_avg.append(num_list_b)
    
    a_sum = 0
    
    sam_list_a = len(ionic_radii_list_a)
    sam_list_b = len(ionic_radii_list_b)
    
    for j in range(sam_list_a):
        a = ionic_radii_list_a[j] * num_list_a[j]
        a_sum = a_sum + a
    
    b_sum = 0
    for k in range(sam_list_b):
        b = ionic_radii_list_b[k] * num_list_b[k]
        b_sum = b_sum + b
        of_ = (b_sum/8)/1.4

    gs_tf = ((a_sum/8) + 1.4)/(np.sqrt(2)*((b_sum/8)+1.4))    
    
    gs.append(gs_tf)
    of.append(of_)
    ab.append((a_sum/8)+(b_sum/8))
    ao.append((a_sum/8)+1.4)
    bo.append((b_sum/8)+1.4)
    
dft_calc["GOLDSCHMIDT_TF"] = gs
dft_calc["OCTAHEDRAL_FACTOR"] = of
dft_calc["A_B"] = ab
dft_calc["A_O"] = ao
dft_calc["B_O"] = bo
dft_calc["A_MAX"] = a_max
dft_calc["B_MAX"] = b_max

In [18]:
df["GOLDSCHMIDT_TF"] = gs
df["OCTAHEDRAL_FACTOR"] = of
df["A_B"] = ab
df["A_O"] = ao
df["B_O"] = bo

In [19]:
# gs = []
# of = []
# ab = []
# ao = []
# bo = []
# a_max = []
# b_max = []

# for i in range(dft_calc.shape[0]):
    
#     ionic_radii_list_a = list(dft_calc[ionic_radius_names[0:3]].astype(float).iloc[i].dropna())
#     num_list_a = list(dft_calc[num_col_names[0:3]].astype(float).iloc[i].dropna())
#     a_max.append(dft_calc[site_names[np.argmax(num_list_a)]].iloc[i])
    

#     ionic_radii_list_b = list(dft_calc[ionic_radius_names[3:6]].astype(float).iloc[i].dropna())
#     num_list_b = list(dft_calc[num_col_names[3:6]].astype(float).iloc[i].dropna())
#     b_max.append(dft_calc[site_names[np.argmax(num_list_b)+3]].iloc[i])
    
#     a_sum = 0
    
#     sam_list_a = len(ionic_radii_list_a)
#     sam_list_b = len(ionic_radii_list_b)
    
#     for j in range(sam_list_a):
#         a = ionic_radii_list_a[j] * num_list_a[j]
#         a_sum = a_sum + a
    
#     b_sum = 0
#     for k in range(sam_list_b):
#         b = ionic_radii_list_b[k] * num_list_b[k]
#         b_sum = b_sum + b
#         of_ = (b_sum/8)/1.4

#     gs_tf = ((a_sum/8) + 1.4)/(np.sqrt(2)*((b_sum/8)+1.4))    
    
#     gs.append(gs_tf)
#     of.append(of_)
#     ab.append((a_sum/8)+(b_sum/8))
#     ao.append((a_sum/8)+1.4)
#     bo.append((b_sum/8)+1.4)
    
# dft_calc["GOLDSCHMIDT_TF"] = gs
# dft_calc["OCTAHEDRAL_FACTOR"] = of
# dft_calc["A_B"] = ab
# dft_calc["A_O"] = ao
# dft_calc["B_O"] = bo
# dft_calc["A_MAX"] = a_max
# dft_calc["B_MAX"] = b_max

In [20]:
df["A_MAX"] = a_max
df["B_MAX"] = b_max

In [21]:
placeholder_df = elemental_prop.copy()
placeholder_df.columns = "A_MAX_" + placeholder_df.columns.values
df = pd.merge(df, placeholder_df, how="inner", left_on="A_MAX", right_on="A_MAX_SYMBOL")
df = df.drop(columns=["A_MAX", "A_MAX_SYMBOL"])

In [22]:
df.head()

Unnamed: 0,COMPOSITION,A_SITE_1,A_SITE_2,A_SITE_3,B_SITE_1,B_SITE_2,B_SITE_3,X_SITE,NUM_ELEMS,ENERGY_ABOVE_HULL,...,A_MAX_IS_NONMETAL,A_MAX_ND_UNFILLED,A_MAX_ND_VALENCE,A_MAX_NF_UNFILLED,A_MAX_NF_VALENCE,A_MAX_NP_UNFILLED,A_MAX_NP_VALENCE,A_MAX_NS_UNFILLED,A_MAX_NS_VALENCE,A_MAX_N_UNFILLED
0,Ba1Sr7V8O24,Ba,Sr,,V,,,O,4,29.747707,...,0,0,0,0,0,0,0,0,2,0
1,Ba2Sr6Co8O24,Ba,Sr,,Co,,,O,4,106.745759,...,0,0,0,0,0,0,0,0,2,0
2,Ba2Sr6Fe4Co4O24,Ba,Sr,,Fe,Co,,O,5,109.488956,...,0,0,0,0,0,0,0,0,2,0
3,Ba2Sr6Fe4Ni4O24,Ba,Sr,,Fe,Ni,,O,5,183.271284,...,0,0,0,0,0,0,0,0,2,0
4,Ba2Sr6Fe6Co2O24,Ba,Sr,,Fe,Co,,O,5,105.72218,...,0,0,0,0,0,0,0,0,2,0


In [23]:
placeholder_df = elemental_prop.copy()
placeholder_df.columns = "B_MAX_" + placeholder_df.columns.values
df = pd.merge(df, placeholder_df, how="inner", left_on="B_MAX", right_on="B_MAX_SYMBOL")
df = df.drop(columns=["B_MAX", "B_MAX_SYMBOL"])

In [24]:
df.head()

Unnamed: 0,COMPOSITION,A_SITE_1,A_SITE_2,A_SITE_3,B_SITE_1,B_SITE_2,B_SITE_3,X_SITE,NUM_ELEMS,ENERGY_ABOVE_HULL,...,B_MAX_IS_NONMETAL,B_MAX_ND_UNFILLED,B_MAX_ND_VALENCE,B_MAX_NF_UNFILLED,B_MAX_NF_VALENCE,B_MAX_NP_UNFILLED,B_MAX_NP_VALENCE,B_MAX_NS_UNFILLED,B_MAX_NS_VALENCE,B_MAX_N_UNFILLED
0,Ba1Sr7V8O24,Ba,Sr,,V,,,O,4,29.747707,...,0,7,3,0,0,0,0,0,2,7
1,Ba2Sr6V8O24,Ba,Sr,,V,,,O,4,42.133507,...,0,7,3,0,0,0,0,0,2,7
2,Sr4Ca4V8O24,Sr,Ca,,V,,,O,4,66.990941,...,0,7,3,0,0,0,0,0,2,7
3,Sr4Dy4V8O24,Sr,Dy,,V,,,O,4,183.475519,...,0,7,3,0,0,0,0,0,2,7
4,Sr4Gd4V8O24,Sr,Gd,,V,,,O,4,290.221571,...,0,7,3,0,0,0,0,0,2,7


In [25]:
df.iloc[:, 11:50]

Unnamed: 0,GOLDSCHMIDT_TF,OCTAHEDRAL_FACTOR,A_B,A_O,B_O,A_MAX_IONIC_RADIUS,A_MAX_MOD_OF_ELASTICITY,A_MAX_BP,A_MAX_MP,A_MAX_DENSITY,...,A_MAX_ELECT_AFF,A_MAX_AT_RAD,A_MAX_AT_VOL,A_MAX_MEN_NUM,A_MAX_N_WS_THIRD,A_MAX_1_ION_POT,A_MAX_2_ION_POT,A_MAX_3_ION_POT,A_MAX_CTE,A_MAX_SP_HEAT_CAP
0,0.976828,0.385714,1.820,2.680,1.94,1.26,15.0,1655,1042,2.54,...,-168,2.45,33.7,8.0,0.84,5.695,11.03,43.6,22.5,0.3
1,0.984118,0.385714,1.840,2.700,1.94,1.26,15.0,1655,1042,2.54,...,-168,2.45,33.7,8.0,0.84,5.695,11.03,43.6,22.5,0.3
2,0.922155,0.385714,1.670,2.530,1.94,1.26,15.0,1655,1042,2.54,...,-168,2.45,33.7,8.0,0.84,5.695,11.03,43.6,22.5,0.3
3,0.927622,0.385714,1.685,2.545,1.94,1.26,15.0,1655,1042,2.54,...,-168,2.45,33.7,8.0,0.84,5.695,11.03,43.6,22.5,0.3
4,0.931267,0.385714,1.695,2.555,1.94,1.26,15.0,1655,1042,2.54,...,-168,2.45,33.7,8.0,0.84,5.695,11.03,43.6,22.5,0.3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1921,0.977471,0.457143,2.060,2.820,2.04,1.42,13.0,2078,1002,3.59,...,-52,2.76,39.24,9.0,0.81,5.212,10.004,0,20.6,0.204
1922,0.982286,0.450000,2.050,2.820,2.03,1.42,13.0,2078,1002,3.59,...,-52,2.76,39.24,9.0,0.81,5.212,10.004,0,20.6,0.204
1923,1.033182,0.378571,1.950,2.820,1.93,1.42,13.0,2078,1002,3.59,...,-52,2.76,39.24,9.0,0.81,5.212,10.004,0,20.6,0.204
1924,0.963305,0.478571,2.090,2.820,2.07,1.42,13.0,2078,1002,3.59,...,-52,2.76,39.24,9.0,0.81,5.212,10.004,0,20.6,0.204


In [26]:
placeholder_df = elemental_prop.iloc[:, 1:40].copy()
placeholder_df.columns = "AB_AVG_" + placeholder_df.columns.values
ab_avg_list_names = placeholder_df.columns

for i in range(len(ab_avg_list_names)):
    df[ab_avg_list_names[i]] = (df.iloc[:, 11+i].astype(float) + df.iloc[:, 92+i].astype(float))/2

In [27]:
placeholder_df = elemental_prop.iloc[:, 1:40].copy()
placeholder_df.columns = "DIFF_" + placeholder_df.columns.values
ab_diff_list_names = placeholder_df.columns

for i in range(len(ab_diff_list_names)):
    df[ab_diff_list_names[i]] = abs(df.iloc[:, 11+i].astype(float) - df.iloc[:, 92+i].astype(float))

In [28]:
placeholder_df = elemental_prop.iloc[:, 1:40].copy()
placeholder_df.columns = "DIFF_" + placeholder_df.columns.values
ratio_list_names = placeholder_df.columns

for i in range(len(ratio_list_names)):
    df[ratio_list_names[i]] = df.iloc[:, 11+i].astype(float) / df.iloc[:, 92+i].astype(float)

In [29]:
dft_calc[["A1_IONIC_RADIUS", "A2_IONIC_RADIUS", "A3_IONIC_RADIUS"]].iloc[0].dropna()

A1_IONIC_RADIUS    1.42
A2_IONIC_RADIUS    1.26
Name: 0, dtype: object

In [30]:
a_prop_names = []
b_prop_names = []

a_wt_avg_names = []
b_wt_avg_names = []

for i in elemental_prop.columns[1:82]:
    triplet_props_a = [j + "_" + i for j in suffixes[0:3]]
    triplet_props_b = [j + "_" + i for j in suffixes[3:6]]
    
    a_prop_names.append(triplet_props_a)
    b_prop_names.append(triplet_props_b)
    
    a_wt_avg_names.append("A_WT_AVG_" + i)
    b_wt_avg_names.append("B_WT_AVG_" + i)
    

In [31]:
a_max_all_names = ["ALL_MAX_A_" + i for i in elemental_prop.columns[1:82]]
b_max_all_names = ["ALL_MAX_B_" + i for i in elemental_prop.columns[1:82]]

a_min_all_names = ["ALL_MIN_A_" + i for i in elemental_prop.columns[1:82]]
b_min_all_names = ["ALL_MIN_B_" + i for i in elemental_prop.columns[1:82]]

a_range_names = ["RANGE_A_" + i for i in elemental_prop.columns[1:82]]
b_range_names= ["RANGE_B_" + i for i in elemental_prop.columns[1:82]]

In [32]:
for j in range(81):
    
    a_vals = []
    b_vals = []
    a_max_alls = []
    b_max_alls = []
    a_min_alls = []
    b_min_alls = []
    a_ranges = []
    b_ranges = []
    
    for i in range(dft_calc.shape[0]):
        
        num_list_a = list(dft_calc[num_col_names[0:3]].astype(float).iloc[i].dropna())
        num_list_b = list(dft_calc[num_col_names[3:6]].astype(float).iloc[i].dropna())
        
        len_a = len(num_list_a)
        len_b = len(num_list_b)

        a_properties = list(dft_calc[a_prop_names[j]].astype(float).iloc[i].dropna())
        b_properties = list(dft_calc[b_prop_names[j]].astype(float).iloc[i].dropna())
        
        a_max_all = max(a_properties)
        b_max_all = max(b_properties)
        
        a_min_all = min(a_properties)
        b_min_all = min(b_properties)
        
        a_range = a_max_all - a_min_all
        b_range = b_max_all - b_min_all
        
        a = np.sum(np.multiply(a_properties, num_list_a))/(8)
        b = np.sum(np.multiply(b_properties, num_list_b))/(8)
        
        a_vals.append(a)
        b_vals.append(b)
        a_max_alls.append(a_max_all)
        b_max_alls.append(b_max_all)
        a_min_alls.append(a_min_all)
        b_min_alls.append(b_min_all)
        a_ranges.append(a_range)
        b_ranges.append(b_range)
        
    df[a_wt_avg_names[j]] = a_vals
    df[b_wt_avg_names[j]] = b_vals
    
    df[a_max_all_names[j]] = a_max_alls
    df[b_max_all_names[j]] = b_max_alls
    
    df[a_min_all_names[j]] = a_min_alls
    df[b_min_all_names[j]] = b_min_alls
    
    df[a_min_all_names[j]] = a_ranges
    df[b_min_all_names[j]] = b_ranges