In [2]:
from pathlib import Path
import math
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import PandasTools

import sys
sys.path.insert(1, '../src/stproject')
from utils import *

import os
# Make the Graphviz executables discoverable by appending their directory to PATH.
# The entry is added only when it is not already present, so re-running this cell
# does not keep growing PATH (the previous unconditional `+=` did).
# NOTE(review): machine-specific Windows install location — confirm, or make it configurable.
GRAPHVIZ_BIN = 'C:/Program Files/Graphviz 2.44.1/bin/'
_current_path = os.environ.get("PATH", "")
# Exact-entry match that still works when the entry itself contains os.pathsep
# (the drive-letter colon on POSIX systems).
if (os.pathsep + GRAPHVIZ_BIN + os.pathsep) not in (os.pathsep + _current_path + os.pathsep):
    os.environ["PATH"] = _current_path + os.pathsep + GRAPHVIZ_BIN if _current_path else GRAPHVIZ_BIN

# Wide, untruncated DataFrame display (None removes the row/column limits).
pd.set_option('display.width', 500)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

# Directory that holds both the input CSV files and the exported feature tables.
data_dir_raw = Path('../data')

## Feature engineering 1
features:<br>
'C' (# of 4 coord C)<br>
'CH' (# of 3 coord C)<br>
'CH2' (# of 2 coord C)<br>
'CH3' (# of 1 coord C)<br>
'C=C' (# of C=C)<br>
'C#C' (# of C#C)<br>
'Ar' (# of aryl (C6))<br>
'OH' (# of hydroxyl)<br>
'C-O-C' (# of ether)<br>
'CHO' (# of aldehyde)<br>
'CO' (# of ketone)<br>
'COOH' (# of carboxylic acid)<br>
'COOR' (# of ester)<br>

### fe1 - liquid surface tension dataset

In [3]:
# Load the cleaned liquid surface-tension table; the first CSV column is used as the index.
df_stliq_clean = pd.read_csv(data_dir_raw / 'df_stliq_clean.csv', index_col=0)

# Add an 'rdkmol' column holding RDKit molecule objects built from the 'smiles' column
# (this call modifies df_stliq_clean in place).
PandasTools.AddMoleculeColumnToFrame(df_stliq_clean, 'smiles', 'rdkmol', includeFingerprints=True)

# Build the fe1 fragment features, put the original index back, and carry over the
# measured surface tension, molecule name and density from the source table.
df_fe1 = (
    construct_features(df_stliq_clean['rdkmol'], count_frags_1)
    .set_index(df_stliq_clean.index)
    .assign(
        measured_st=df_stliq_clean['measured_st'],
        molecule=df_stliq_clean['molecule'],
        density=df_stliq_clean['density'],
    )
)

# Drop molecules whose name matches any of the excluded patterns (case-sensitive regex).
# na=False keeps rows with a missing name instead of producing a NaN in the mask,
# which would make the `~` negation fail.
is_excluded = df_fe1['molecule'].str.contains('anhydride|carbonate|Formic acid|Furan', na=False)
df_fe1 = df_fe1[~is_excluded]

# Export the feature table.
df_fe1.to_csv(data_dir_raw / 'df_fe1.csv')

### fe1 - diols and acids reference tables

In [4]:
def _build_reference_fe1(source_csv, target_csv):
    """Build and export the fe1 feature table for one reference CSV.

    Reads ``source_csv`` from ``data_dir_raw`` (first column as index), adds an
    'rdkmol' column of RDKit molecules built from its 'smiles' column, computes
    the features with ``construct_features(..., count_frags_1)``, restores the
    original index and writes the result to ``target_csv`` in ``data_dir_raw``.

    Returns:
        (reference table with the added 'rdkmol' column, fe1 feature table)
    """
    df_ref = pd.read_csv(data_dir_raw / source_csv, index_col=0)

    # adding column hosting RDKit molecule object from smiles
    PandasTools.AddMoleculeColumnToFrame(df_ref, 'smiles', 'rdkmol', includeFingerprints=True)

    # adding features from feature engineering 1, then recovering the original index
    df_ref_fe1 = construct_features(df_ref['rdkmol'], count_frags_1).set_index(df_ref.index)

    df_ref_fe1.to_csv(data_dir_raw / target_csv)
    return df_ref, df_ref_fe1


# DIOLS
df_diols, df_diols_fe1 = _build_reference_fe1('diols_ref.csv', 'df_diols_fe1.csv')

# ACIDS
df_acids, df_acids_fe1 = _build_reference_fe1('acids_ref.csv', 'df_acids_fe1.csv')

### fe1 - polymer molar volumes dataset
This section still needs to be corrected, particularly the C=C correction after polymerization.

In [8]:
# Load the polymer molar-volume table (default integer index).
df_mvpolym = pd.read_csv(data_dir_raw / 'df_mvpolym.csv')

# For each column family (repeating units, smiles, molar ratios), gather every column
# whose name matches the family pattern into a single column holding one list per row.
# The families are processed in this order; each new list column is named after its pattern.
for _family in ('ru', 'smiles', 'molfrac'):
    _family_columns = df_mvpolym.filter(regex=_family)
    df_mvpolym[_family] = _family_columns.values.tolist()

# constructing list of mols from list of smiles
def construct_rdkmol_list(df):
    """Build the list of RDKit molecules for the SMILES strings of one row.

    Meant to be used with ``DataFrame.apply(..., axis=1)``: despite its name,
    ``df`` is then a single row whose 'smiles' entry is a list.

    Args:
        df: mapping-like row; ``df['smiles']`` is an iterable of SMILES entries.
            Entries that are not strings are skipped (presumably NaN placeholders
            for unused repeating-unit slots — verify against the CSV).

    Returns:
        list with one ``Chem.MolFromSmiles`` result per string entry, in order.
        NOTE(review): RDKit returns None for SMILES it cannot parse, so the list
        may contain None; callers are not protected against that here.
    """
    # isinstance (rather than an exact type comparison) also accepts str subclasses.
    return [Chem.MolFromSmiles(smiles) for smiles in df['smiles'] if isinstance(smiles, str)]

# One list of RDKit molecules per polymer row, built from that row's SMILES list.
df_mvpolym['rdkmol'] = df_mvpolym.apply(construct_rdkmol_list, axis=1)

# Averaged fe1 features computed from the molecule lists and the molar-fraction lists.
# Per the original author's note, the result keeps the index of df_mvpolym.
df_mvpolym_fe1 = construct_avg_features(
    df_mvpolym['rdkmol'],
    df_mvpolym['molfrac'],
    count_frags_1,
)

# Carry over polymer type, polymer name, measured molar volume and the repeating-unit list.
for _column in ('type', 'polymer', 'measured_mv', 'ru'):
    df_mvpolym_fe1[_column] = df_mvpolym[_column]

# Adjust the fragment counts according to the polymer type, i.e. account for the
# atoms/fragments lost on polymerization: acid and OH loss for condensation,
# CH2-ring for polyethers and C=C for radical polymers.
is_polyester = df_mvpolym_fe1['type'] == 'polyester'
is_radical = df_mvpolym_fe1['type'] == 'radical'

# Polyesters: one hydroxyl and one carboxylic acid are replaced by one ester.
df_mvpolym_fe1.loc[is_polyester, 'OH'] -= 1
df_mvpolym_fe1.loc[is_polyester, 'COOH'] -= 1
df_mvpolym_fe1.loc[is_polyester, 'COOR'] += 1

# Radical polymers: one C=C is removed.
df_mvpolym_fe1.loc[is_radical, 'C=C'] -= 1

# TODO: C/CH/CH2 needs to be added for radical polymers.
# TODO: C-ring/CH-ring/CH2-ring needs to be subtracted and the respective C/CH/CH2 added.
# Both of these are currently done manually on the exported csv file.

# Disabled manual correction for poly(methylene oxide), kept for reference
# (it refers to df_mvpolym_fe0, which is not defined in this section):
# df_mvpolym_fe1.loc[df_mvpolym_fe0['polymer'] == 'poly(methylene oxide)', ['O-eth', 'O-ester', 'R3']] = [1, 0, 0]

# Scale the measured molar volume by the number of repeating units of each polymer.
# Each 'ru' entry is a list with one slot per repeating-unit column of the source table,
# so its plain length is the same for every row; only the non-missing slots are counted
# here so that unused (NaN) slots do not inflate the divisor.
# NOTE(review): assumes unused repeating-unit slots are NaN in the csv — confirm.
n_repeat_units = df_mvpolym_fe1['ru'].apply(
    lambda units: sum(1 for unit in units if pd.notna(unit))
)
df_mvpolym_fe1['measured_mv_scaled'] = df_mvpolym_fe1['measured_mv'] / n_repeat_units

# Export the polymer feature table.
df_mvpolym_fe1.to_csv(data_dir_raw / 'df_mvpolym_fe1.csv')