In [1]:
import pandas as pd
import numpy as np
import re
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the two CSV inputs. In both tables, any cell that is exactly a single
# blank (" ") is replaced by 0.

# DFT calculations
dft_calc = pd.read_csv("data/dft_calc.csv")
dft_calc = dft_calc.replace(" ", 0)

# Elemental properties
elemental_prop = pd.read_csv("data/elemental_properties.csv").replace(" ", 0)

# Independent working copy; later cells add engineered features to `df`
# while `dft_calc` receives the per-site merges.
df = dft_calc.copy()

In [3]:
# Preview the first rows of the DFT table.
dft_calc.head()

Unnamed: 0,COMPOSITION,A_SITE_1,A_SITE_2,A_SITE_3,B_SITE_1,B_SITE_2,B_SITE_3,X_SITE,NUM_ELEMS,ENERGY_ABOVE_HULL,FORMATION_ENERGY
0,Ba1Sr7V8O24,Ba,Sr,,V,,,O,4,29.747707,-2.113335
1,Ba2Bi2Pr4Co8O24,Ba,Bi,Pr,Co,,,O,5,106.702335,-1.311863
2,Ba2Ca6Fe8O24,Ba,Ca,,Fe,,,O,4,171.608093,-1.435607
3,Ba2Cd2Pr4Ni8O24,Ba,Cd,Pr,Ni,,,O,5,284.89819,-0.868639
4,Ba2Dy6Fe8O24,Ba,Dy,,Fe,,,O,4,270.007913,-1.746806


In [4]:
# Column dtypes and non-null counts of the DFT table.
dft_calc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1926 entries, 0 to 1925
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   COMPOSITION        1926 non-null   object 
 1   A_SITE_1           1926 non-null   object 
 2   A_SITE_2           1159 non-null   object 
 3   A_SITE_3           34 non-null     object 
 4   B_SITE_1           1926 non-null   object 
 5   B_SITE_2           1247 non-null   object 
 6   B_SITE_3           33 non-null     object 
 7   X_SITE             1926 non-null   object 
 8   NUM_ELEMS          1926 non-null   int64  
 9   ENERGY_ABOVE_HULL  1926 non-null   float64
 10  FORMATION_ENERGY   1926 non-null   float64
dtypes: float64(2), int64(1), object(8)
memory usage: 165.6+ KB


In [5]:
# Column dtypes and non-null counts of the elemental-property table.
# (The previous statement assigned to a function call, which is a SyntaxError;
# the recorded output below this cell is the info() summary of a 110-row,
# 82-column frame that contains SYMBOL, i.e. elemental_prop.)
elemental_prop.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 110 entries, 0 to 109
Data columns (total 82 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   SYMBOL                  110 non-null    object 
 1   IONIC_RADIUS            110 non-null    object 
 2   MOD_OF_ELASTICITY       81 non-null     float64
 3   BP                      110 non-null    object 
 4   MP                      110 non-null    object 
 5   DENSITY                 110 non-null    object 
 6   AT_WT                   110 non-null    object 
 7   BCC_EFF_LAT_CNT         110 non-null    float64
 8   BCC_ENERGY              110 non-null    float64
 9   BCC_ENERGY_DIFF         110 non-null    float64
 10  BCC_FERMI               110 non-null    float64
 11  BCC_MAG_MOM             110 non-null    float64
 12  BCC_VOLUME_PA           110 non-null    float64
 13  BCC_VOLUME_DIFF         110 non-null    float64
 14  GS_BANDGAP              110 non-null    fl

In [None]:
# Preview the working copy of the DFT table.
df.head()

In [None]:
# Short tags for the three A-site and three B-site slots.
suffixes = ["A1", "A2", "A3", "B1", "B2", "B3"]
# The six site columns of dft_calc (positions 1..6), in the same order as `suffixes`.
site_names = dft_calc.columns[1:7].tolist()
# Per-slot derived names.
symbol_names = [f"{tag}_SYMBOL" for tag in suffixes]
elemental_prop_names = [f"{tag}_elemental_prop" for tag in suffixes]
# Filled by the merge cell with the prefixed property column labels of each slot.
elemental_prop_col_names = list()

In [None]:
# For every site slot, left-join the elemental-property table onto dft_calc,
# matching the slot's element symbol, with all property columns prefixed by
# the slot tag (e.g. "A1_").
for slot_tag, site_col, symbol_col in zip(suffixes, site_names, symbol_names):
    slot_props = elemental_prop.copy()
    slot_props.columns = slot_tag + "_" + slot_props.columns.values
    dft_calc = pd.merge(
        dft_calc,
        slot_props,
        how="left",
        left_on=site_col,
        right_on=symbol_col,
    )

    # Record the property column labels of this slot (symbol column excluded).
    slot_props = slot_props.drop(columns=[slot_tag + "_SYMBOL"])
    elemental_prop_col_names.append(slot_props.columns)

In [None]:
# Labels of every object-dtype column of the merged dft_calc frame.
objects_cols = [i for i in dft_calc.columns if dft_calc[i].dtypes == "object"]

In [None]:
def num_of_sites(site, frame=None):
    """Look up, for every row, the count attached to the row's `site` element.

    Each COMPOSITION string is split into (non-digit run, digit run) tokens;
    the digit run of the first token whose non-digit part equals the element
    symbol stored in column `site` is that row's count.

    Parameters
    ----------
    site : str
        Name of the column holding the element symbol to look up.
    frame : pandas.DataFrame, optional
        Frame providing the "COMPOSITION" and `site` columns. Defaults to the
        notebook-level `dft_calc`.

    Returns
    -------
    list
        Exactly one entry per row of `frame`: the matched digit string, or
        np.nan when the site value is not a string or when no token of the
        composition matches the symbol.
    """
    if frame is None:
        frame = dft_calc

    token_pattern = re.compile(r"(\D*)(\d*)")
    nums = []

    for composition, symbol in zip(frame["COMPOSITION"], frame[site]):
        if not isinstance(symbol, str):
            # Empty site slot (NaN / non-string placeholder).
            nums.append(np.nan)
            continue

        count = np.nan
        for element, digits in token_pattern.findall(composition):
            if element == symbol:
                count = digits
                # Stop at the first match so a row never contributes twice.
                break

        # Always append exactly once, so the result stays aligned with the rows.
        nums.append(count)

    return nums

In [None]:
# One list of per-row counts for each of the six site columns.
num_sites = [num_of_sites(site_col) for site_col in site_names]

In [None]:
# Column labels for the per-slot counts: NUM_A1 ... NUM_B3.
num_col_names = [f"NUM_{tag}" for tag in suffixes]
num_col_names

In [None]:
# Store each slot's per-row counts under its NUM_* column of dft_calc.
for count_col, slot_counts in zip(num_col_names, num_sites):
    dft_calc[count_col] = slot_counts

In [None]:
# Every dft_calc column whose label contains "_IONIC_RADIUS", in frame order.
ionic_radius_names = [col for col in dft_calc.columns if "_IONIC_RADIUS" in col]
ionic_radius_names

In [None]:
# Geometric descriptors per composition, built from the count-weighted ionic
# radii of the A-site and B-site elements:
#   GOLDSCHMIDT_TF    = (rA + rX) / (sqrt(2) * (rB + rX))
#   OCTAHEDRAL_FACTOR = rB / rX
#   A_B, A_O, B_O     = rA + rB, rA + rX, rB + rX
#   A_MAX / B_MAX     = symbol of the A/B element with the largest count
# where rA, rB are the weighted radius sums divided by SITE_COUNT.

SITE_COUNT = 8       # divisor for the weighted sums; presumably the A (and B) atoms per formula unit — TODO confirm
X_SITE_RADIUS = 1.4  # radius added to rA/rB; presumably the X-site (oxygen) ionic radius — TODO confirm

gs = []
of = []
ab = []
ao = []
bo = []
a_max = []
b_max = []
a_wt_avg = []
b_wt_avg = []

# Cast once, outside the row loop (the casts do not depend on the row).
radii_a = dft_calc[ionic_radius_names[0:3]].astype(float)
counts_a = dft_calc[num_col_names[0:3]].astype(float)
radii_b = dft_calc[ionic_radius_names[3:6]].astype(float)
counts_b = dft_calc[num_col_names[3:6]].astype(float)

for i in range(dft_calc.shape[0]):

    ionic_radii_list_a = list(radii_a.iloc[i].dropna())
    num_list_a = list(counts_a.iloc[i].dropna())
    a_max.append(dft_calc[site_names[np.argmax(num_list_a)]].iloc[i])
    a_wt_avg.append(num_list_a)

    ionic_radii_list_b = list(radii_b.iloc[i].dropna())
    num_list_b = list(counts_b.iloc[i].dropna())
    # +3 shifts the B-slot position into site_names (A slots come first).
    b_max.append(dft_calc[site_names[np.argmax(num_list_b)+3]].iloc[i])
    b_wt_avg.append(num_list_b)

    # Count-weighted radius sums; radii and counts are paired by position.
    a_sum = 0
    for j in range(len(ionic_radii_list_a)):
        a_sum = a_sum + ionic_radii_list_a[j] * num_list_a[j]

    b_sum = 0
    for k in range(len(ionic_radii_list_b)):
        b_sum = b_sum + ionic_radii_list_b[k] * num_list_b[k]

    a_mean = a_sum / SITE_COUNT
    b_mean = b_sum / SITE_COUNT

    # The octahedral factor is derived from the finished B sum, after the loop,
    # so it can never carry over a stale value from a previous row.
    of_ = b_mean / X_SITE_RADIUS
    gs_tf = (a_mean + X_SITE_RADIUS) / (np.sqrt(2) * (b_mean + X_SITE_RADIUS))

    gs.append(gs_tf)
    of.append(of_)
    ab.append(a_mean + b_mean)
    ao.append(a_mean + X_SITE_RADIUS)
    bo.append(b_mean + X_SITE_RADIUS)

dft_calc["GOLDSCHMIDT_TF"] = gs
dft_calc["OCTAHEDRAL_FACTOR"] = of
dft_calc["A_B"] = ab
dft_calc["A_O"] = ao
dft_calc["B_O"] = bo
dft_calc["A_MAX"] = a_max
dft_calc["B_MAX"] = b_max

In [None]:
# Mirror the geometric descriptors onto the working frame `df`.
for feature_label, feature_values in (
    ("GOLDSCHMIDT_TF", gs),
    ("OCTAHEDRAL_FACTOR", of),
    ("A_B", ab),
    ("A_O", ao),
    ("B_O", bo),
):
    df[feature_label] = feature_values

In [None]:
# Add the symbols of the highest-count A-site and B-site elements to `df`.
for label, symbols in (("A_MAX", a_max), ("B_MAX", b_max)):
    df[label] = symbols

In [None]:
# Attach the elemental properties of the highest-count A-site element (A_MAX)
# to `df`, every property column prefixed with "A_MAX_". The join is an inner
# join, so rows whose A_MAX symbol has no match in elemental_prop are dropped.
placeholder_df = elemental_prop.add_prefix("A_MAX_")
df = (
    pd.merge(df, placeholder_df, how="inner", left_on="A_MAX", right_on="A_MAX_SYMBOL")
    .drop(columns=["A_MAX", "A_MAX_SYMBOL"])
)
# Prefixed property labels, without the join key.
a_max_names = [label for label in placeholder_df.columns if label != "A_MAX_SYMBOL"]

In [None]:
# Attach the elemental properties of the highest-count B-site element (B_MAX)
# to `df`, every property column prefixed with "B_MAX_". The join is an inner
# join, so rows whose B_MAX symbol has no match in elemental_prop are dropped.
placeholder_df = elemental_prop.add_prefix("B_MAX_")
df = (
    pd.merge(df, placeholder_df, how="inner", left_on="B_MAX", right_on="B_MAX_SYMBOL")
    .drop(columns=["B_MAX", "B_MAX_SYMBOL"])
)
# Prefixed property labels, without the join key.
b_max_names = [label for label in placeholder_df.columns if label != "B_MAX_SYMBOL"]

In [None]:
# AB_AVG_<prop>: mean of the A_MAX and B_MAX value of each property, for the
# property columns at positions 1..39 of elemental_prop.
placeholder_df = elemental_prop.iloc[:, 1:40].add_prefix("AB_AVG_")
ab_avg_list_names = placeholder_df.columns

# zip stops after the AB_AVG labels, pairing each with the A/B column of the
# same position.
for avg_col, a_col, b_col in zip(ab_avg_list_names, a_max_names, b_max_names):
    a_values = df[a_col].astype(float)
    b_values = df[b_col].astype(float)
    df[avg_col] = (a_values + b_values)/2

In [None]:
# DIFF_<prop>: absolute difference between the A_MAX and B_MAX value of each
# property, for the property columns at positions 1..39 of elemental_prop.
placeholder_df = elemental_prop.iloc[:, 1:40].add_prefix("DIFF_")
ab_diff_list_names = placeholder_df.columns

for diff_col, a_col, b_col in zip(ab_diff_list_names, a_max_names, b_max_names):
    a_values = df[a_col].astype(float)
    b_values = df[b_col].astype(float)
    df[diff_col] = abs(a_values - b_values)

In [None]:
# RATIO_<prop>: A_MAX value divided by B_MAX value of each property, for the
# property columns at positions 1..39 of elemental_prop. A zero B_MAX value
# produces inf (or NaN for 0/0) in the result.
placeholder_df = elemental_prop.iloc[:, 1:40].add_prefix("RATIO_")
ratio_list_names = placeholder_df.columns

for ratio_col, a_col, b_col in zip(ratio_list_names, a_max_names, b_max_names):
    numerator = df[a_col].astype(float)
    denominator = df[b_col].astype(float)
    df[ratio_col] = numerator / denominator

In [None]:
# Scratch inspection: the A_MAX_-prefixed property labels.
a_max_names

In [None]:
# Scratch inspection: column labels of the elemental-property table.
elemental_prop.columns

In [None]:
# Scratch inspection of one A_MAX property column (numerator of RATIO_BCC_ENERGY_DIFF).
df["A_MAX_BCC_ENERGY_DIFF"]

In [None]:
# Scratch inspection of one B_MAX property column (denominator of RATIO_BCC_ENERGY_DIFF).
df["B_MAX_BCC_ENERGY_DIFF"]

In [None]:
"B1_BCC_ENERGY_DIFF" in df.columns

In [None]:
# NOTE(review): the visible cells only add slot-prefixed property columns
# (e.g. "A1_BCC_ENERGY_DIFF") to dft_calc, so this unprefixed lookup looks like
# it raises KeyError on a fresh run — confirm, then fix or delete this cell.
dft_calc["BCC_ENERGY_DIFF"]

In [None]:
# Property labels taken from elemental_prop (positions 1..81; position 0 is skipped).
property_labels = elemental_prop.columns[1:82]

# For each property, the three slot-prefixed column names on the A side / B side.
a_prop_names = [[tag + "_" + prop for tag in suffixes[0:3]] for prop in property_labels]
b_prop_names = [[tag + "_" + prop for tag in suffixes[3:6]] for prop in property_labels]

# Output labels for the count-weighted averages.
a_wt_avg_names = ["A_WT_AVG_" + prop for prop in property_labels]
b_wt_avg_names = ["B_WT_AVG_" + prop for prop in property_labels]
    

In [None]:
def _with_prefix(prefix):
    """Return `prefix` + label for each elemental_prop column at positions 1..81."""
    return [prefix + label for label in elemental_prop.columns[1:82]]


# Output labels for the per-row maximum, minimum and range of each property.
a_max_all_names = _with_prefix("ALL_MAX_A_")
b_max_all_names = _with_prefix("ALL_MAX_B_")

a_min_all_names = _with_prefix("ALL_MIN_A_")
b_min_all_names = _with_prefix("ALL_MIN_B_")

a_range_names = _with_prefix("RANGE_A_")
b_range_names = _with_prefix("RANGE_B_")

In [None]:
# For every elemental property, derive per-row features over the A-site and
# B-site elements and store them on `df`:
#   *_WT_AVG_*  sum(property * count) / 8
#   ALL_MAX_*   largest property value among the present elements
#   ALL_MIN_*   smallest property value among the present elements
#   RANGE_*     ALL_MAX - ALL_MIN
#
# NOTE(review): property values and counts are NaN-filtered independently, so
# a missing property value for a present element changes how the two lists
# pair up in np.multiply (and an all-missing row makes max() fail) — confirm
# the property table is complete for every element used.

# The count columns do not depend on the property: cast them once.
a_counts = dft_calc[num_col_names[0:3]].astype(float)
b_counts = dft_calc[num_col_names[3:6]].astype(float)

for j in range(len(a_prop_names)):

    # Cast this property's slot columns once per property, not once per row.
    a_props = dft_calc[a_prop_names[j]].astype(float)
    b_props = dft_calc[b_prop_names[j]].astype(float)

    a_vals = []
    b_vals = []
    a_max_alls = []
    b_max_alls = []
    a_min_alls = []
    b_min_alls = []
    a_ranges = []
    b_ranges = []

    for i in range(dft_calc.shape[0]):

        num_list_a = list(a_counts.iloc[i].dropna())
        num_list_b = list(b_counts.iloc[i].dropna())

        a_properties = list(a_props.iloc[i].dropna())
        b_properties = list(b_props.iloc[i].dropna())

        a_max_all = max(a_properties)
        b_max_all = max(b_properties)

        a_min_all = min(a_properties)
        b_min_all = min(b_properties)

        a_range = a_max_all - a_min_all
        b_range = b_max_all - b_min_all

        a = np.sum(np.multiply(a_properties, num_list_a))/(8)
        b = np.sum(np.multiply(b_properties, num_list_b))/(8)

        a_vals.append(a)
        b_vals.append(b)
        a_max_alls.append(a_max_all)
        b_max_alls.append(b_max_all)
        a_min_alls.append(a_min_all)
        b_min_alls.append(b_min_all)
        a_ranges.append(a_range)
        b_ranges.append(b_range)

    df[a_wt_avg_names[j]] = a_vals
    df[b_wt_avg_names[j]] = b_vals

    df[a_max_all_names[j]] = a_max_alls
    df[b_max_all_names[j]] = b_max_alls

    df[a_min_all_names[j]] = a_min_alls
    df[b_min_all_names[j]] = b_min_alls

    # Ranges go to their own RANGE_* columns (they used to overwrite the
    # ALL_MIN_* columns written just above).
    df[a_range_names[j]] = a_ranges
    df[b_range_names[j]] = b_ranges

---

In [None]:
# Text columns (composition string and site symbols) that are excluded from
# the numeric conversion below.
uglies = "COMPOSITION A_SITE_1 A_SITE_2 A_SITE_3 B_SITE_1 B_SITE_2 B_SITE_3 X_SITE".split()

In [None]:
# Object-dtype columns of `df`, leaving out the text columns listed in `uglies`.
object_columns = [
    col
    for col in df.columns
    if df[col].dtypes == "object" and col not in uglies
]

In [None]:
# Convert the selected object-dtype columns to float in one step.
df[object_columns] = df[object_columns].astype(float)

In [None]:
# Recompute the object-dtype columns after the cast (this rebinds
# `object_columns`, now without the `uglies` exclusion).
object_columns = [i for i in df.columns if df[i].dtypes == "object"]

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split


In [None]:
# Columns that are not model inputs: the two energy columns and the text
# columns (composition string and site symbols).
non_feature_columns = [
    "ENERGY_ABOVE_HULL",
    "FORMATION_ENERGY",
    "COMPOSITION",
    "A_SITE_1",
    "A_SITE_2",
    "A_SITE_3",
    "B_SITE_1",
    "B_SITE_2",
    "B_SITE_3",
    "X_SITE",
]

# Feature matrix and regression target.
X = df.drop(columns=non_feature_columns)
y = df["FORMATION_ENERGY"]

In [None]:
import math

In [None]:
# Positions of every infinite entry of the feature matrix, as
# [row_position, column_position] pairs in row-major order (the same order a
# nested row/column scan would produce). Vectorised, and without printing a
# pair of lines per scanned cell.
inf_mask = np.isinf(X.to_numpy(dtype=float))
inf_indices = [[int(row), int(col)] for row, col in np.argwhere(inf_mask)]

In [None]:
# Scratch inspection of the feature column at position 254.
X.iloc[:,254]

In [None]:
# Scratch inspection of one RATIO_ feature (A_MAX value / B_MAX value).
df["RATIO_BCC_ENERGY_DIFF"]

In [None]:
# Show the [row, column] positions collected by the infinity scan.
inf_indices

In [None]:
# Scratch inspection of the feature column at position 254 (same expression as an earlier cell).
X.iloc[:,254]

In [None]:
# Scratch inspection: whatever value was last bound to the loop variable `i`.
i

In [None]:
# Scratch inspection of a single cell (row position 0, column position 6).
X.iloc[0,6]

In [None]:
# Scratch inspection: label of the feature column at position 207.
X.columns[207]

In [None]:
# Show the collected infinity positions again.
inf_indices

In [None]:
# Number of infinite entries found by the scan. (`indexNum` was never defined
# in this notebook, so the previous expression raised NameError;
# NOTE(review): `inf_indices` is assumed to be the intended list — confirm.)
len(inf_indices)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [None]:
max = []

for i in [j for j in df.columns if df[j].dtypes != "object"]:
    print(i)
    print(df[i].max())
    max.append(df[i].max())
    

In [None]:
max

In [None]:
np.argmax(max)

In [None]:
# Display the training features before scaling.
X_train

In [None]:
# Standardising transformer; fitted on the training features in the next cell.
scaler = StandardScaler()

In [None]:
# Fit the scaler on the training features and replace X_train by the result.
# NOTE(review): rebinding X_train makes this cell non-idempotent — running it
# twice refits the scaler on already-transformed data.
X_train = scaler.fit_transform(X_train)

In [None]:
# Scratch inspection: dtype of every column of `df`.
df.dtypes

In [None]:
# Display the full engineered frame (the notebook front end truncates the rendering).
df