In [33]:
import pandas as pd
import numpy as np
import json

In [34]:
# Load the raw training CSV and preview its first rows.
df_train = pd.read_csv('../data/raw/train.csv')

df_train.head()

Unnamed: 0,MOFname,volume [A^3],weight [u],surface_area [m^2/g],void_fraction,void_volume [cm^3/g],functional_groups,metal_linker,organic_linker1,organic_linker2,topology,CO2/N2_selectivity,heat_adsorption_CO2_P0.15bar_T298K [kcal/mol],CO2_working_capacity [mL/g]
0,mof_unit_1,1116.667429,875.2406,0.0,0.07899,0.0607,COOH-OEt,3,4,11,pcu,22.864166,6.786041,105.284502
1,mof_unit_2,2769.503842,2211.697211,603.61,0.13794,0.104,F-OMe,10,44,57,etb,33.61678,7.147286,101.224774
2,mof_unit_3,1089.818728,773.68796,788.5,0.14874,0.1262,OMe-COOH,2,22,24,pcu,19.263726,6.347967,118.987011
3,mof_unit_4,2205.198301,1304.63872,1441.53,0.21814,0.222,H-SO3H,9,17,24,sra,25.701377,6.190085,187.626004
4,mof_unit_5,1137.800963,901.73612,0.0,0.07778,0.0591,NHMe-OH,2,1,22,pcu,30.001838,6.478063,79.210001


In [35]:
# Load the raw test CSV and preview its first rows.
df_test = pd.read_csv('../data/raw/test.csv')

df_test.head()

Unnamed: 0,MOFname,volume [A^3],weight [u],surface_area [m^2/g],void_fraction,void_volume [cm^3/g],functional_groups,metal_linker,organic_linker1,organic_linker2,topology,CO2/N2_selectivity,heat_adsorption_CO2_P0.15bar_T298K [kcal/mol]
0,mof_unit_68614,1208.301332,797.70936,586.54,0.11392,0.1039,OEt,2,5,26,pcu,36.639791,7.00564
1,mof_unit_68615,4126.414623,3733.65779,852.49,0.21367,0.1422,H-I,4,6,17,acs,18.390691,5.119399
2,mof_unit_68616,1602.148373,747.21048,3155.73,0.33883,0.4375,CN-OH,3,11,17,pcu,13.06285,5.0454
3,mof_unit_68617,2436.629312,995.80232,3521.09,0.40464,0.5963,OMe,2,1,28,pcu,9.601198,5.106238
4,mof_unit_68618,3123.418006,1337.538,2678.46,0.38959,0.5479,NO2-Pr,3,8,19,pcu,12.974954,5.287639


In [36]:
# Raw CSV header -> short column name used in the rest of the notebook.
# Written as ordered (old, new) pairs; headers not listed here are left untouched by rename().
map_old_new_colname = dict([
    ('heat_adsorption_CO2_P0.15bar_T298K [kcal/mol]', 'heat_adsorption'),
    ('CO2/N2_selectivity', 'Co2N2_selectivity'),
    ('CO2_working_capacity [mL/g]', 'CO2_working_capacity'),
    ('volume [A^3]', 'volume'),
    ('weight [u]', 'weight'),
    ('surface_area [m^2/g]', 'surface_area'),
    ('void_volume [cm^3/g]', 'void_volume'),
])

# Training Dataset

In [37]:
# Replace the unit-laden raw headers with the short names from map_old_new_colname.
df_train = df_train.rename(columns=map_old_new_colname)

## Categorical feature: topology

In [38]:
# Load the stored topology mapping.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) so the read does not
# depend on the platform's default locale encoding.
with open('../data/processed/mapping/topology.json', 'r', encoding='utf-8') as f:
    map_topology = json.load(f)

In [39]:
#df_train['topology'] = df_train['topology'].apply(lambda item: map_topology[item])
#df_train['topology'] = df_train['topology'].astype('category')

In [40]:
# One-hot encode topology: one indicator column per topology value present in df_train.
df_train = pd.get_dummies(df_train, columns=['topology'])

## Categorical feature: functional groups

In [41]:
# Load the stored functional-group mapping.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) so the read does not
# depend on the platform's default locale encoding.
with open('../data/processed/mapping/func_group.json', 'r', encoding='utf-8') as f:
    map_func_group = json.load(f)

In [42]:
# Show the loaded mapping (functional-group name -> integer code).
print(map_func_group)

{'NO2': 0, 'OMe': 1, 'SO3H': 2, 'OPr': 3, 'HCO': 4, 'OH': 5, 'Et': 6, 'NHMe': 7, 'Me': 8, 'OEt': 9, 'COOH': 10, 'H': 11, 'NH2': 12, 'Cl': 13, 'Pr': 14, 'F': 15, 'CN': 16, 'Br': 17, 'I': 18, 'Ph': 19}


In [43]:
def try_map_func_group(value, index):
    """Pick the functional group at position ``index`` from a split label.

    Parameters
    ----------
    value : sequence of str, or a missing value
        Result of splitting a functional-group label on '-'.
    index : int
        Position of the group to extract.

    Returns
    -------
    str or float
        The group name, or ``np.nan`` when
        * ``value`` has exactly one element (for every ``index``),
        * the selected group is the placeholder ``'H'``, or
        * ``value`` does not support ``len()`` / indexing (TypeError), e.g. a float NaN.

    NOTE(review): a single-element list yields NaN even for ``index == 0``, so a
    lone group such as ['OEt'] is encoded as missing -- confirm this is intended.
    """
    try:
        if len(value) == 1:
            return np.nan
        group = value[index]
        return np.nan if group == 'H' else group
    except TypeError:
        # Non-sequence input (missing label) is treated as "no group".
        return np.nan

In [44]:
# Turn the dash-joined label (e.g. 'COOH-OEt') into a list of group names.
df_train['functional_groups'] = df_train['functional_groups'].str.split(pat='-')

# Extract the first and second group into their own columns; `args` forwards the
# list position to try_map_func_group.
df_train['functional_groups1'] = df_train['functional_groups'].apply(try_map_func_group, args=(0,))
#df_train['functional_groups1'] = df_train['functional_groups1'].astype('category')

df_train['functional_groups2'] = (
    df_train['functional_groups']
    .apply(try_map_func_group, args=(1,))
    .astype('category')
)

In [45]:
# One-hot encode both functional-group columns; dummy_na=True adds a separate indicator
# column for missing values in each of them.
df_train = pd.get_dummies(df_train, columns=['functional_groups1', 'functional_groups2'], dummy_na=True)

## Metal Linker & Organic Linker

In [46]:
# The linker ids are integers in the raw data; one-hot encode them so they are treated
# as categories rather than as ordered numbers.
df_train = pd.get_dummies(df_train, columns=['organic_linker1', 'organic_linker2', 'metal_linker'])

In [47]:
# Preview the frame after the one-hot encoding steps.
df_train.head()

Unnamed: 0,MOFname,volume,weight,surface_area,void_fraction,void_volume,functional_groups,Co2N2_selectivity,heat_adsorption,CO2_working_capacity,...,organic_linker2_57,organic_linker2_58,organic_linker2_59,metal_linker_1,metal_linker_2,metal_linker_3,metal_linker_4,metal_linker_9,metal_linker_10,metal_linker_12
0,mof_unit_1,1116.667429,875.2406,0.0,0.07899,0.0607,"[COOH, OEt]",22.864166,6.786041,105.284502,...,0,0,0,0,0,1,0,0,0,0
1,mof_unit_2,2769.503842,2211.697211,603.61,0.13794,0.104,"[F, OMe]",33.61678,7.147286,101.224774,...,1,0,0,0,0,0,0,0,1,0
2,mof_unit_3,1089.818728,773.68796,788.5,0.14874,0.1262,"[OMe, COOH]",19.263726,6.347967,118.987011,...,0,0,0,0,1,0,0,0,0,0
3,mof_unit_4,2205.198301,1304.63872,1441.53,0.21814,0.222,"[H, SO3H]",25.701377,6.190085,187.626004,...,0,0,0,0,0,0,0,1,0,0
4,mof_unit_5,1137.800963,901.73612,0.0,0.07778,0.0591,"[NHMe, OH]",30.001838,6.478063,79.210001,...,0,0,0,0,1,0,0,0,0,0


## Dropping columns

In [48]:
# Columns removed before export: the identifier, the split functional-group list
# (already one-hot encoded), and the five geometric descriptors.
col_to_drop = [
    'MOFname', 'functional_groups',
    'volume', 'weight', 'surface_area', 'void_fraction', 'void_volume',
]

df_train = df_train.drop(col_to_drop, axis='columns')

In [49]:
# Treat +/-inf as missing values so they are caught by the NaN filter that follows.
# `to_replace` is passed as a list (a documented input type for DataFrame.replace) instead
# of a set, and the result is re-bound rather than mutated in place.
df_train = df_train.replace([np.inf, -np.inf], np.nan)
# Number of rows whose heat_adsorption is missing after the replacement.
df_train['heat_adsorption'].isna().sum()

2089

In [50]:
# Keep only the rows that have a heat_adsorption value.
df_train_drop_nan = df_train.loc[df_train['heat_adsorption'].notna()]

In [51]:
# Display the filtered training frame (rows with a non-missing heat_adsorption).
df_train_drop_nan

Unnamed: 0,Co2N2_selectivity,heat_adsorption,CO2_working_capacity,topology_acs,topology_bcu,topology_etb,topology_fof,topology_nbo,topology_pcu,topology_pts,...,organic_linker2_57,organic_linker2_58,organic_linker2_59,metal_linker_1,metal_linker_2,metal_linker_3,metal_linker_4,metal_linker_9,metal_linker_10,metal_linker_12
0,22.864166,6.786041,105.284502,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
1,33.616780,7.147286,101.224774,0,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
2,19.263726,6.347967,118.987011,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
3,25.701377,6.190085,187.626004,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,30.001838,6.478063,79.210001,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66519,5.867674,4.485481,7.602105,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0
66520,4.060772,3.605688,2.675231,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
66521,4.313411,3.361233,-1.686092,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,0,0,0
66522,3.447440,2.781566,-7.546805,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0


## Export CSV processed

In [52]:
# Ask for the output file name (without extension).
# NOTE(review): input() blocks an unattended "Run All"; consider a constant instead.
train_save_filename = input('Enter Train processed filename: ')

In [53]:
# Write the filtered training frame to ../data/processed/train/<name>.csv, omitting the index.
df_train_drop_nan.to_csv(f'../data/processed/train/{train_save_filename}.csv', index=False)

# Testing Dataset

In [54]:
# Apply the same header renaming to the test frame; mapping keys absent from it are ignored.
df_test = df_test.rename(columns=map_old_new_colname)

In [55]:
# Inspect dtypes and non-null counts of the renamed test frame.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17000 entries, 0 to 16999
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   MOFname            17000 non-null  object 
 1   volume             17000 non-null  float64
 2   weight             17000 non-null  float64
 3   surface_area       17000 non-null  float64
 4   void_fraction      17000 non-null  float64
 5   void_volume        17000 non-null  float64
 6   functional_groups  17000 non-null  object 
 7   metal_linker       17000 non-null  int64  
 8   organic_linker1    17000 non-null  int64  
 9   organic_linker2    17000 non-null  int64  
 10  topology           17000 non-null  object 
 11  Co2N2_selectivity  17000 non-null  float64
 12  heat_adsorption    17000 non-null  float64
dtypes: float64(7), int64(3), object(3)
memory usage: 1.7+ MB


## Functional Groups

In [56]:
def try_map_func_group(value, index):
    """Pick the functional group at position ``index`` from a split label.

    Parameters
    ----------
    value : sequence of str, or a missing value
        Result of splitting a functional-group label on '-'.
    index : int
        Position of the group to extract.

    Returns
    -------
    str or float
        The group name, or ``np.nan`` when
        * ``value`` has exactly one element (for every ``index``),
        * the selected group is the placeholder ``'H'``, or
        * ``value`` does not support ``len()`` / indexing (TypeError), e.g. a float NaN.

    NOTE(review): a single-element list yields NaN even for ``index == 0``, so a
    lone group such as ['OEt'] is encoded as missing -- confirm this is intended.
    """
    try:
        if len(value) == 1:
            return np.nan
        group = value[index]
        return np.nan if group == 'H' else group
    except TypeError:
        # Non-sequence input (missing label) is treated as "no group".
        return np.nan

In [57]:
# Turn the dash-joined label (e.g. 'CN-OH') into a list of group names.
df_test['functional_groups'] = df_test['functional_groups'].str.split(pat='-')

# Extract the first and second group into their own columns; `args` forwards the
# list position to try_map_func_group.
df_test['functional_groups1'] = df_test['functional_groups'].apply(try_map_func_group, args=(0,))
#df_test['functional_groups1'] = df_test['functional_groups1'].astype('category')

df_test['functional_groups2'] = (
    df_test['functional_groups']
    .apply(try_map_func_group, args=(1,))
    .astype('category')
)

In [58]:
# Categorical columns that are one-hot encoded without a missing-value indicator.
col_cat = ['topology', 'organic_linker1', 'organic_linker2', 'metal_linker']

# Two encoding passes: the plain categoricals first, then the functional-group
# columns with an extra NaN indicator each (dummy_na=True).
df_test = (
    df_test
    .pipe(pd.get_dummies, columns=col_cat)
    .pipe(pd.get_dummies, columns=['functional_groups1', 'functional_groups2'], dummy_na=True)
)

## Dropping columns

In [59]:
# Columns removed before export: the identifier, the split functional-group list
# (already one-hot encoded), and the five geometric descriptors.
col_to_drop = [
    'MOFname', 'functional_groups',
    'volume', 'weight', 'surface_area', 'void_fraction', 'void_volume',
]

df_test = df_test.drop(col_to_drop, axis='columns')

In [60]:
# Sanity check: column counts of the two encoded frames (df_train still holds CO2_working_capacity).
len(df_test.columns), len(df_train.columns)

(172, 175)

In [61]:
# Align the test frame with the training feature columns (target excluded): same set, same order.
# Columns that exist only in train are one-hot indicators for values that never occur in the
# test split, so they are filled with 0; the default would fill them with NaN.
# Columns that exist only in test are dropped by the reindex.
df_test = df_test.reindex(
    columns=df_train.drop(columns='CO2_working_capacity').columns,
    fill_value=0,
)

In [62]:
# Ask for the output file name (without extension) and write the aligned test frame to
# ../data/processed/test/<name>.csv, omitting the index.
# NOTE(review): input() blocks an unattended "Run All"; consider a constant instead.
test_save_filename = input('Enter Test processed filename: ')
df_test.to_csv(f'../data/processed/test/{test_save_filename}.csv', index=False)