In [1]:
from pathlib import Path
import math
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import PandasTools

import sys
sys.path.insert(1, '../src/stproject')
from utils import *

import os
# Directory holding the Graphviz binaries (path of a local Windows install).
GRAPHVIZ_BIN_DIR = 'C:/Program Files/Graphviz 2.44.1/bin/'


def add_to_path(directory):
    """Append `directory` to the PATH environment variable unless it is already listed.

    The membership test makes the call idempotent, so re-running this cell does not
    keep growing PATH with duplicate entries. A missing PATH variable is treated as empty.
    """
    current = os.environ.get("PATH", "")
    # Compare on separator-delimited boundaries instead of splitting: the directory
    # itself may contain the separator character (e.g. "C:" where os.pathsep is ":").
    if (os.pathsep + directory + os.pathsep) in (os.pathsep + current + os.pathsep):
        return
    os.environ["PATH"] = current + os.pathsep + directory if current else directory


add_to_path(GRAPHVIZ_BIN_DIR)

# pandas display options: wide output, no truncation of rows or columns
pd.set_option('display.width', 500)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# all input and output CSV files of this notebook live in this directory
data_dir_raw = Path('../data')

## Feature engineering 0
Features (each is a per-molecule count):<br>
'C' (# of C atoms)<br>
'H' (# of H atoms)<br>
'C=C' (# of C-C double bonds)<br>
'C#C' (# of C-C triple bonds)<br>
'Ar' (# of benzene rings)<br>
'O-alc' (# of hydroxyl's O atoms)<br>
'O-eth' (# of ether's O atoms)<br>
'O-ald' (# of aldehyde's O atoms)<br>
'O-ket' (# of ketone's O atoms)<br>
'O-acid' (# of carboxylic acid's O atoms)<br>
'O-ester' (# of ester's O atoms)<br>
'R3' (# of 3-membered rings)<br>
'R4' (# of 4-membered rings)<br>
'R5' (# of 5-membered rings)<br>
'R6' (# of 6-membered rings)<br>
'R7' (# of 7-membered rings)<br>
'R8' (# of 8-membered rings)

### fe0 - liquid surface tension dataset

In [2]:
df_stliq_clean = pd.read_csv(data_dir_raw / 'df_stliq_clean.csv', index_col=0)

# adding column 'rdkmol' hosting the RDKit molecule object built from the 'smiles' column
PandasTools.AddMoleculeColumnToFrame(df_stliq_clean, 'smiles', 'rdkmol', includeFingerprints=True)

# feature engineering; the index of the source table is restored on the result
# (set_index returns a new frame, so no in-place mutation is needed)
df_fe0 = construct_features(df_stliq_clean['rdkmol'], count_frags_0)
df_fe0 = df_fe0.set_index(df_stliq_clean.index)

# appending measured_st, molecule name and density from df_stliq_clean
df_fe0['measured_st'] = df_stliq_clean['measured_st']
df_fe0['molecule'] = df_stliq_clean['molecule']
df_fe0['density'] = df_stliq_clean['density']

# rows whose molecule name matches any of these patterns are dropped before export
excluded_names = 'anhydride|carbonate|Formic acid|Furan'
# na=False: a missing molecule name counts as "no match" (the row is kept) instead of
# producing NaN in the mask, which would make the `~` negation / boolean indexing fail
is_excluded = df_fe0['molecule'].str.contains(excluded_names, na=False)
df_fe0 = df_fe0[~is_excluded]

# exporting data
df_fe0.to_csv(data_dir_raw / 'df_fe0.csv')

### fe0 - diols and acids reference table

In [3]:
def _load_reference_fe0(ref_csv, fe0_csv):
    """Read a reference table, compute its fe0 features and export them to CSV.

    Parameters
    ----------
    ref_csv : str
        File name, inside `data_dir_raw`, of the reference table. It is read with its
        first column as index and must provide a 'smiles' column.
    fe0_csv : str
        File name, inside `data_dir_raw`, the feature table is written to.

    Returns
    -------
    tuple
        (reference table including the added 'rdkmol' column,
         feature table carrying the reference table's index)
    """
    ref_table = pd.read_csv(data_dir_raw / ref_csv, index_col=0)

    # RDKit molecule objects built from the 'smiles' column go into 'rdkmol'
    PandasTools.AddMoleculeColumnToFrame(ref_table, 'smiles', 'rdkmol', includeFingerprints=True)

    # features from feature engineering 0, re-indexed like the reference table
    fe0_table = construct_features(ref_table['rdkmol'], count_frags_0)
    fe0_table = fe0_table.set_index(ref_table.index)

    fe0_table.to_csv(data_dir_raw / fe0_csv)
    return ref_table, fe0_table


# DIOLS
df_diols, df_diols_fe0 = _load_reference_fe0('diols_ref.csv', 'df_diols_fe0.csv')

# ACIDS
df_acids, df_acids_fe0 = _load_reference_fe0('acids_ref.csv', 'df_acids_fe0.csv')

### fe0 - polymer molar volumes dataset

In [6]:
df_mvpolym = pd.read_csv(data_dir_raw / 'df_mvpolym.csv')

# For repeating units, smiles and molar ratios in turn: collect every column whose
# name matches the pattern and store the row-wise values as one list-valued column.
# The order ('ru', then 'smiles', then 'molfrac') is kept as-is, because each newly
# added column is part of the frame that the following filters run on.
for column_family in ('ru', 'smiles', 'molfrac'):
    df_mvpolym[column_family] = df_mvpolym.filter(regex=column_family).values.tolist()

# constructing list of mols from list of smiles
def construct_rdkmol_list(df):
    """Build the list of RDKit molecules for a single row.

    Despite the parameter name, this is applied row-wise
    (`DataFrame.apply(..., axis=1)`), so `df` is one row, not a whole frame.

    Parameters
    ----------
    df : row-like
        Must provide a 'smiles' entry holding an iterable of SMILES values.

    Returns
    -------
    list
        One `Chem.MolFromSmiles` result per string entry, in the original order.
        Non-string entries are skipped (presumably NaN placeholders for unused
        smiles columns; confirm against the CSV).
    """
    # isinstance (rather than an exact type comparison) also accepts str subclasses.
    # NOTE(review): RDKit returns None for a SMILES it cannot parse; such entries are
    # passed through unchanged here; confirm the downstream feature code handles them.
    return [Chem.MolFromSmiles(smiles) for smiles in df['smiles'] if isinstance(smiles, str)]

# one list of RDKit molecules per row, built from that row's smiles list
rdkmol_lists = df_mvpolym.apply(construct_rdkmol_list, axis=1)
df_mvpolym['rdkmol'] = rdkmol_lists

# constructing average features from the molecule lists and their molar ratios
# (per the original note, the result preserves the indices)
df_mvpolym_fe0 = construct_avg_features(
    df_mvpolym['rdkmol'],
    df_mvpolym['molfrac'],
    count_frags_0,
)

# appending polymer type
polymer_type = df_mvpolym['type']
df_mvpolym_fe0['type'] = polymer_type

# correcting features based on the type of polymer
# can be thought of as atoms/fragments lost from polymerization (e.g. H2O loss for condensation)
