# Testing Segmentation

In [1]:
import re

In [2]:
import json

with open('../data/raw/bag-of-words.json', 'r') as f:
    bag_of_words = json.load(f)
    
bag_of_words.values()

dict_values([['meth', 'eth', 'prop', 'but', 'pent', 'hex', 'hept', 'oct', 'non', 'dec', 'undec', 'dodec', 'tridec', 'tetradec', 'pentadec', 'hexadec', 'heptadec', 'octadec', 'nonadec', 'icos'], ['ane', 'ene', 'diene', 'yne', 'diyne', 'enyne', 'an', 'en', 'dien', 'yn', 'yl'], ['oic acid', 'oic anyhydride', 'oate', 'oyl', 'amide', 'nitrile', 'al', 'one', 'ol', 'thiol', 'amine', 'iminie'], ['carboxylic acid', 'carboxylic anyhydride', 'carboxylate', 'carbonyl', 'carboxamide', 'carbonitrile', 'carbaldehyde'], ['carboxy', 'alkoxycarbonyl', 'halocarbonyl', 'carbamoyl', 'cyano', 'oxo', 'hydroxy', 'mercapto', 'amino', 'iminio'], ['bicyclo', 'cyclo', 'sprio'], [',', '-', '.', '(', ')', '[', ']'], ['di', 'tri', 'tetra', 'bis', 'tris', 'tetrakis'], ['chloro', 'fluoro', 'nitro']])

In [3]:
list_of_bag_words = []
for values in bag_of_words.values():
    list_of_bag_words.extend(values)

list_of_bag_words.sort(key=len, reverse=True)

In [4]:
list_of_bag_words

['carboxylic anyhydride',
 'carboxylic acid',
 'oic anyhydride',
 'alkoxycarbonyl',
 'carbonitrile',
 'carbaldehyde',
 'halocarbonyl',
 'carboxylate',
 'carboxamide',
 'carbamoyl',
 'tetradec',
 'pentadec',
 'heptadec',
 'oic acid',
 'carbonyl',
 'mercapto',
 'tetrakis',
 'hexadec',
 'octadec',
 'nonadec',
 'nitrile',
 'carboxy',
 'hydroxy',
 'bicyclo',
 'tridec',
 'iminie',
 'iminio',
 'chloro',
 'fluoro',
 'undec',
 'dodec',
 'diene',
 'diyne',
 'enyne',
 'amide',
 'thiol',
 'amine',
 'cyano',
 'amino',
 'cyclo',
 'sprio',
 'tetra',
 'nitro',
 'meth',
 'prop',
 'pent',
 'hept',
 'icos',
 'dien',
 'oate',
 'tris',
 'eth',
 'but',
 'hex',
 'oct',
 'non',
 'dec',
 'ane',
 'ene',
 'yne',
 'oyl',
 'one',
 'oxo',
 'tri',
 'bis',
 'an',
 'en',
 'yn',
 'yl',
 'al',
 'ol',
 'di',
 ',',
 '-',
 '.',
 '(',
 ')',
 '[',
 ']']

In [35]:
def split_str_from_words(l, s):
    m = re.split('({0})'.format('|'.join(l)), s)
    return [i for i in m if i] # removes empty strings (improvements are welcome)


In [39]:
iupac = "1,1,1,5,5,5-hexafluoropentane-2,4-dione"

split_str_from_words(list_of_bag_words, iupac)

['1',
 ',',
 '1',
 ',',
 '1',
 ',',
 '5',
 ',',
 '5',
 ',',
 '5',
 '-',
 'hex',
 'a',
 'fluoro',
 'pent',
 'ane',
 '-',
 '2',
 ',',
 '4',
 '-',
 'di',
 'one']

In [40]:
%timeit split_str_from_words(bag_of_words, iupac)

1.27 µs ± 4.7 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)


# Main

In [2]:
"""This file contains the tokenize function to segment the IUPAC name."""
import json
import re
from typing import List


class KMESegmentation(object):
    """Segmentation the Chemical IUPAC name using regular expression."""

    def __init__(self, path: str = None) -> None:
        """Initialize the class with the path to the bag of words file.

        Args:
            path (str): Path to bag of words json file. Defaults to None.
        """
        self.list_of_bag_words = []

        with open(path, 'r') as bag_words_file:
            bag_of_words = json.load(bag_words_file)

        for words in bag_of_words.values():
            self.list_of_bag_words.extend(words)

        self.list_of_bag_words.sort(key=len, reverse=True)

    def segment(self, iupac_name: str) -> List[str]:
        """Segment the iupac_name string using the bag of words.

        Args:
            iupac_name (str): Chemical IUPAC name that need to be segmented.

        Returns:
            List[str]: Results of the segmentation.
        """
        segmented_word = []
        joined_bag_words = '|'.join(self.list_of_bag_words)
        format_bag_words = '({0})'.format(joined_bag_words)
        raw_output = re.split(format_bag_words, iupac_name)

        for word in raw_output:
            if word != '' and word is not None:
                segmented_word.append(word)

        return segmented_word


In [3]:
tokenizer = KMESegmentation('../data/raw/bag-of-words.json')

In [4]:
tokenizer.segment('1,1,1,5,5,5-hexafluoropentane-2,4-dione')

['1',
 ',',
 '1',
 ',',
 '1',
 ',',
 '5',
 ',',
 '5',
 ',',
 '5',
 '-',
 'hex',
 'a',
 'fluoro',
 'pent',
 'ane',
 '-',
 '2',
 ',',
 '4',
 '-',
 'di',
 'one']

In [5]:
import pandas as pd
df = pd.read_csv('../data/interim/propane_iupac.csv')

df.head()

Unnamed: 0,cid,iupacname
0,69628,"1-prop-2-enoxy-2,2-bis(prop-2-enoxymethyl)butane"
1,73706,"1,1,1,5,5,5-hexafluoropentane-2,4-dione"
2,109722,"N'-(3-tetradecoxypropyl)propane-1,3-diamine"
3,111007,"3-(hexadecylamino)propane-1,2-diol"
4,264852,1-methyl-4-[2-(4-methylphenyl)propan-2-yl]benzene


In [None]:
df['iupacname'].apply(tokenizer.segment)