In [13]:
# Imports
import ast
import os
import typing
from functools import partial
from pprint import pprint

import pandas as pd
from pypif import pif

In [14]:
# Load Data
# Filter to comp
# Parse Vector
# Upload to citrination

In [15]:
# Input CSV with the raw training data and output CSV for the processed table,
# both located in the local 'data' directory.
load_path, save_path = (
    os.path.join('data', file_name)
    for file_name in ('training_data.csv', 'processed_data.csv')
)

In [16]:
# Read the raw training table from load_path into a DataFrame.
df = pd.read_csv(load_path)

In [17]:
# Snapshot the column labels as loaded, before the per-percent stability
# columns are added; used later as the id_vars when melting.
original_cols = df.columns

In [18]:
# Column labels '0_percent', '10_percent', ..., '100_percent' (11 in total).
cols = [f'{percent}_percent' for percent in range(0, 101, 10)]

In [19]:
def debug(func):
    """Decorator that prints the row's ``stabilityVec`` before calling ``func``.

    The wrapped function must receive the row (any object indexable by the
    key ``'stabilityVec'``) as its first positional argument.

    Args:
        func: Callable to wrap.

    Returns:
        A wrapper that prints ``row['stabilityVec']`` and then returns
        whatever ``func`` returns for the same arguments.
    """
    def debug_wrapper(*args, **kwargs):
        row = args[0]
        print(row['stabilityVec'])
        # Propagate the result: without this return, a decorated function
        # would always yield None (e.g. when passed to DataFrame.apply).
        return func(*args, **kwargs)
    return debug_wrapper

In [20]:
def vec_to_stability(row: pd.Series, cols: list) -> pd.Series:
    """Expand the row's serialized stability vector into one integer column per entry.

    Args:
        row: Row whose ``'stabilityVec'`` field holds the string form of a
            list of numbers, e.g. ``"[1.0,0.0,1.0]"``.
        cols: Column labels, paired positionally with the vector entries.
            Pairing stops at the shorter of the two sequences.

    Returns:
        A copy of ``row`` with each paired label set to ``int(entry)``.

    Raises:
        ValueError, SyntaxError: If ``'stabilityVec'`` is not a valid Python
            literal.
    """
    # literal_eval only accepts Python literals, so text read from the CSV
    # cannot execute arbitrary code the way eval() would.
    vec = ast.literal_eval(row['stabilityVec'])

    # Work on a copy: mutating the object handed to an apply() callback is
    # discouraged by pandas, and the caller's row stays untouched.
    expanded = row.copy()
    for element, col in zip(vec, cols):
        expanded[col] = int(element)

    return expanded

In [21]:
# Bind cols so the resulting callable takes only a row, which is the shape
# DataFrame.apply(axis=1) expects.
vtf = partial(vec_to_stability, cols=cols)

In [22]:
# Run vtf on every row (axis=1), adding the per-percent stability columns.
df = df.apply(vtf, axis=1)

In [243]:
# Write the two formula columns plus the per-percent stability columns to save_path.
# NOTE(review): the final export cell of this notebook also writes to save_path,
# so on a top-to-bottom run this file is overwritten — confirm which output is intended.
df[['formulaA', 'formulaB']+cols].to_csv(save_path, index=False)

In [27]:
# Preview the first rows, which now include the per-percent stability columns.
df.head()

Unnamed: 0,formulaA,formulaB,formulaA_elements_AtomicVolume,formulaB_elements_AtomicVolume,formulaA_elements_AtomicWeight,formulaB_elements_AtomicWeight,formulaA_elements_BoilingT,formulaB_elements_BoilingT,formulaA_elements_BulkModulus,formulaB_elements_BulkModulus,...,10_percent,20_percent,30_percent,40_percent,50_percent,60_percent,70_percent,80_percent,90_percent,100_percent
0,Ne,He,37.232186,37.236036,20.1791,4.002602,26.92,4.07,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
1,Cs,He,117.456016,37.236036,132.905452,4.002602,944.0,4.07,1.6,0.0,...,0,0,0,0,0,0,0,0,0,1
2,K,He,75.847865,37.236036,39.0983,4.002602,1032.0,4.07,3.1,0.0,...,0,0,0,0,0,0,0,0,0,1
3,Ba,He,64.969282,37.236036,137.327,4.002602,2143.0,4.07,9.6,0.0,...,0,0,0,0,0,0,0,0,0,1
4,Sr,He,55.323131,37.236036,87.62,4.002602,1655.0,4.07,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1


## Pivot each row and convert the stability vector into a chemical formula 
Pivot the stability vector: melt the per-percent columns so that each row holds one composition and its stability flag.

In [44]:
# Reshape from wide to long: every column that is not in original_cols (the
# per-percent stability columns) becomes its own row, with the column label in
# 'weight_fraction_element_b' and the cell value in 'stable'.
binary_df = pd.melt(
    df,
    id_vars=original_cols,
    var_name='weight_fraction_element_b',
    value_name='stable',
)

In [51]:
# Turn labels such as '10_percent' into the number 10.0 (values run 0-100).
# str.strip() treats its argument as a set of characters rather than a suffix,
# so remove the literal '_percent' text instead of relying on that accident.
binary_df['weight_fraction_element_b'] = (
    binary_df['weight_fraction_element_b']
    .str.replace('_percent', '', regex=False)
    .astype(float)
)

In [52]:
# Preview the first rows of the long-format table.
binary_df.head()

Unnamed: 0,formulaA,formulaB,formulaA_elements_AtomicVolume,formulaB_elements_AtomicVolume,formulaA_elements_AtomicWeight,formulaB_elements_AtomicWeight,formulaA_elements_BoilingT,formulaB_elements_BoilingT,formulaA_elements_BulkModulus,formulaB_elements_BulkModulus,...,formulaB_elements_ShearModulus,formulaA_elements_SpaceGroupNumber,formulaB_elements_SpaceGroupNumber,avg_coordination_A,avg_coordination_B,avg_nearest_neighbor_distance_A,avg_nearest_neighbor_distance_B,stabilityVec,weight_fraction_element_b,stable
0,Ne,He,37.232186,37.236036,20.1791,4.002602,26.92,4.07,0.0,0.0,...,0.0,225,225,12.0,8.0,3.04326,2.73717,"[1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0]",0.0,1
1,Cs,He,117.456016,37.236036,132.905452,4.002602,944.0,4.07,1.6,0.0,...,0.0,229,225,8.0,8.0,5.32395,2.73717,"[1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0]",0.0,1
2,K,He,75.847865,37.236036,39.0983,4.002602,1032.0,4.07,3.1,0.0,...,0.0,229,225,8.0,8.0,4.57083,2.73717,"[1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0]",0.0,1
3,Ba,He,64.969282,37.236036,137.327,4.002602,2143.0,4.07,9.6,0.0,...,0.0,229,225,8.0,8.0,4.35637,2.73717,"[1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0]",0.0,1
4,Sr,He,55.323131,37.236036,87.62,4.002602,1655.0,4.07,0.0,0.0,...,0.0,225,225,12.0,8.0,4.26546,2.73717,"[1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0]",0.0,1


Convert `weight_fraction_element_b` to a formula string

In [65]:
def row_to_formula(row: pd.Series):
    """Build the formula string for one row of the long-format table.

    Reads ``'weight_fraction_element_b'``, ``'formulaA'`` and ``'formulaB'``
    from ``row``.

    Returns:
        ``formulaA`` when the weight value is 0, ``formulaB`` when it is 100,
        and otherwise ``"<formulaA><100.0 - weight> <formulaB><weight>"``.
    """
    share_b = row['weight_fraction_element_b']
    element_a = row['formulaA']
    element_b = row['formulaB']

    # Pure end members are reported as the bare element formula.
    if share_b == 0:
        return element_a
    if share_b == 100:
        return element_b

    share_a = 100.0 - share_b
    return "{}{} {}{}".format(element_a, share_a, element_b, share_b)
             

In [66]:
# Compute the formula string for every row (axis=1) and store it in a new column.
binary_df['formula'] = binary_df.apply(row_to_formula, axis=1)

Export data

In [74]:
# Write only the formula and its stability flag to save_path.
# NOTE(review): an earlier cell writes the wide-format table to this same
# save_path, so this call replaces that file — confirm that is intended.
binary_df[['formula','stable']].to_csv(save_path, index=False)