In [57]:
import pandas as pd
import numpy as np
import json

In [58]:
# Read the raw training table and preview its first five rows.
df_train = pd.read_csv(filepath_or_buffer='../data/raw/train.csv')

df_train.head(n=5)

Unnamed: 0,MOFname,volume [A^3],weight [u],surface_area [m^2/g],void_fraction,void_volume [cm^3/g],functional_groups,metal_linker,organic_linker1,organic_linker2,topology,CO2/N2_selectivity,heat_adsorption_CO2_P0.15bar_T298K [kcal/mol],CO2_working_capacity [mL/g]
0,mof_unit_1,1116.667429,875.2406,0.0,0.07899,0.0607,COOH-OEt,3,4,11,pcu,22.864166,6.786041,105.284502
1,mof_unit_2,2769.503842,2211.697211,603.61,0.13794,0.104,F-OMe,10,44,57,etb,33.61678,7.147286,101.224774
2,mof_unit_3,1089.818728,773.68796,788.5,0.14874,0.1262,OMe-COOH,2,22,24,pcu,19.263726,6.347967,118.987011
3,mof_unit_4,2205.198301,1304.63872,1441.53,0.21814,0.222,H-SO3H,9,17,24,sra,25.701377,6.190085,187.626004
4,mof_unit_5,1137.800963,901.73612,0.0,0.07778,0.0591,NHMe-OH,2,1,22,pcu,30.001838,6.478063,79.210001


In [59]:
# Read the raw test table and preview its first five rows.
df_test = pd.read_csv(filepath_or_buffer='../data/raw/test.csv')

df_test.head(n=5)

Unnamed: 0,MOFname,volume [A^3],weight [u],surface_area [m^2/g],void_fraction,void_volume [cm^3/g],functional_groups,metal_linker,organic_linker1,organic_linker2,topology,CO2/N2_selectivity,heat_adsorption_CO2_P0.15bar_T298K [kcal/mol]
0,mof_unit_68614,1208.301332,797.70936,586.54,0.11392,0.1039,OEt,2,5,26,pcu,36.639791,7.00564
1,mof_unit_68615,4126.414623,3733.65779,852.49,0.21367,0.1422,H-I,4,6,17,acs,18.390691,5.119399
2,mof_unit_68616,1602.148373,747.21048,3155.73,0.33883,0.4375,CN-OH,3,11,17,pcu,13.06285,5.0454
3,mof_unit_68617,2436.629312,995.80232,3521.09,0.40464,0.5963,OMe,2,1,28,pcu,9.601198,5.106238
4,mof_unit_68618,3123.418006,1337.538,2678.46,0.38959,0.5479,NO2-Pr,3,8,19,pcu,12.974954,5.287639


In [60]:
# Raw CSV headers carry units and punctuation; each pair below is
# (raw header, short name used from here on).
# NOTE(review): 'Co2N2_selectivity' is cased differently from
# 'CO2_working_capacity'; kept as-is because other code may rely on it.
map_old_new_colname = dict([
    ('heat_adsorption_CO2_P0.15bar_T298K [kcal/mol]', 'heat_adsorption'),
    ('CO2/N2_selectivity', 'Co2N2_selectivity'),
    ('CO2_working_capacity [mL/g]', 'CO2_working_capacity'),
    ('volume [A^3]', 'volume'),
    ('weight [u]', 'weight'),
    ('surface_area [m^2/g]', 'surface_area'),
    ('void_volume [cm^3/g]', 'void_volume'),
])

# Training Dataset

In [61]:
# Apply the short column names; headers not present in the mapping are left unchanged.
df_train = df_train.rename(map_old_new_colname, axis='columns')

## Topology

In [62]:
# Load the JSON lookup used to encode the `topology` column.
with open('../data/processed/mapping/topology.json') as topology_file:
    map_topology = json.load(topology_file)

In [63]:
# Encode every topology label through the lookup (a label missing from the
# lookup raises KeyError), then store the result as a pandas categorical.
df_train['topology'] = (
    df_train['topology']
    .apply(lambda label: map_topology[label])
    .astype('category')
)

## Functional Groups

In [64]:
# Load the JSON lookup used to encode functional-group labels.
with open('../data/processed/mapping/func_group.json') as func_group_file:
    map_func_group = json.load(func_group_file)

In [65]:
# Print the functional-group lookup so the label -> code pairs are visible in the output.
print(map_func_group)

{'NO2': 0, 'OMe': 1, 'SO3H': 2, 'OPr': 3, 'HCO': 4, 'OH': 5, 'Et': 6, 'NHMe': 7, 'Me': 8, 'OEt': 9, 'COOH': 10, 'H': 11, 'NH2': 12, 'Cl': 13, 'Pr': 14, 'F': 15, 'CN': 16, 'Br': 17, 'I': 18, 'Ph': 19}


In [66]:
def try_map_func_group(value, index, mapping=None):
    """Return the integer code of the functional group at position ``index``.

    Parameters
    ----------
    value : sequence of str, or a missing value
        The split functional-group labels of one row, e.g. ``['COOH', 'OEt']``.
        Anything that does not support ``len``/indexing (such as a float NaN)
        is treated as "no group".
    index : int
        Position of the label to encode.
    mapping : dict, optional
        Label -> code lookup. Defaults to the notebook-level ``map_func_group``.

    Returns
    -------
    int
        ``mapping[label]`` for a real label, or the sentinel ``len(mapping)``
        when there is no group at that position: empty sequence, ``index``
        past the end of the sequence, the label ``'H'`` (mapped to the
        sentinel even when the lookup has an entry for it), or a non-sequence
        ``value``.

    Raises
    ------
    KeyError
        If the label at ``index`` is not ``'H'`` and is absent from ``mapping``.
    """
    if mapping is None:
        mapping = map_func_group
    no_group_code = len(mapping)

    try:
        size = len(value)
        # Any position past the end means "no group there", not only the
        # single-label/second-position case.
        if size == 0 or index >= size:
            return no_group_code
        label = value[index]
        if label == 'H':
            return no_group_code
        return mapping[label]
    except TypeError:
        # Missing values (e.g. NaN floats) have no len() and cannot be indexed.
        return no_group_code

In [67]:
# Split into a separate Series rather than overwriting `functional_groups`.
# Overwriting the column with lists makes this cell non-idempotent: on a second
# run `.str.split` yields NaN for the list values and every row would silently
# be encoded as "no group".
train_func_groups_split = df_train['functional_groups'].str.split('-')

# First and second functional group, each encoded and stored as a categorical.
df_train['functional_groups1'] = (
    train_func_groups_split
    .apply(lambda item: try_map_func_group(item, 0))
    .astype('category')
)

df_train['functional_groups2'] = (
    train_func_groups_split
    .apply(lambda item: try_map_func_group(item, 1))
    .astype('category')
)

## Dropping

In [68]:
# Columns removed before export: the MOF name, the raw functional-group text,
# and five numeric descriptor columns.
col_to_drop = [
    'MOFname', 'functional_groups',
    'volume', 'weight', 'surface_area', 'void_fraction', 'void_volume',
]

df_train = df_train.drop(columns=col_to_drop)

In [69]:
# Treat +/-inf as missing values. `to_replace` is passed as a list, which is a
# documented input type, and the result is reassigned instead of mutating in place.
df_train = df_train.replace([np.inf, -np.inf], np.nan)
# Number of rows whose heat_adsorption is missing after the replacement.
df_train['heat_adsorption'].isna().sum()

2089

In [70]:
# Keep only the rows that have a heat_adsorption value.
df_train_drop_nan = df_train[df_train['heat_adsorption'].notna()]

## Export Training Processed

In [71]:
# Ask for the output name (without extension). Surrounding whitespace is
# stripped and an empty name is rejected, because it would otherwise silently
# create a file literally called '.csv'.
train_save_filename = input('Enter Train processed filename: ').strip()
if not train_save_filename:
    raise ValueError('Train processed filename must not be empty.')

# Write the NaN-filtered training frame without the index column.
df_train_drop_nan.to_csv(f'../data/processed/train/{train_save_filename}.csv', index=False)

# Testing Dataset

In [72]:
# Apply the short column names; headers not present in the mapping are left unchanged.
df_test = df_test.rename(map_old_new_colname, axis='columns')

In [73]:
# Encode every topology label through the lookup (a label missing from the
# lookup raises KeyError), then store the result as a pandas categorical.
df_test['topology'] = (
    df_test['topology']
    .apply(lambda label: map_topology[label])
    .astype('category')
)

In [74]:
# Split into a separate Series rather than overwriting `functional_groups`.
# Overwriting the column with lists makes this cell non-idempotent: on a second
# run `.str.split` yields NaN for the list values and every row would silently
# be encoded as "no group".
test_func_groups_split = df_test['functional_groups'].str.split('-')

# First and second functional group, each encoded and stored as a categorical.
df_test['functional_groups1'] = (
    test_func_groups_split
    .apply(lambda item: try_map_func_group(item, 0))
    .astype('category')
)

df_test['functional_groups2'] = (
    test_func_groups_split
    .apply(lambda item: try_map_func_group(item, 1))
    .astype('category')
)

In [75]:
# Columns removed before export: the MOF name, the raw functional-group text,
# and five numeric descriptor columns.
col_to_drop = [
    'MOFname', 'functional_groups',
    'volume', 'weight', 'surface_area', 'void_fraction', 'void_volume',
]

df_test = df_test.drop(columns=col_to_drop)

In [76]:
# Column counts of the processed test and train frames, in that order.
df_test.shape[1], df_train.shape[1]

(8, 9)

In [77]:
# Ask for the output name (without extension). Surrounding whitespace is
# stripped and an empty name is rejected, because it would otherwise silently
# create a file literally called '.csv'.
test_save_filename = input('Enter Test processed filename: ').strip()
if not test_save_filename:
    raise ValueError('Test processed filename must not be empty.')
# Write the processed test frame without the index column.
df_test.to_csv(f'../data/processed/test/{test_save_filename}.csv', index=False)