# Testing Segmentation

In [8]:
import re

In [9]:
import json

# Read the bag-of-words vocabulary from its JSON file.
with open('../data/raw/bag-of-words.json', 'r') as vocab_file:
    bag_of_words = json.load(vocab_file)

# Show the word lists stored under each key.
bag_of_words.values()

dict_values([['meth', 'eth', 'prop', 'but', 'pent', 'hex', 'hept', 'oct', 'non', 'dec', 'undec', 'dodec', 'tridec', 'tetradec', 'pentadec', 'hexadec', 'heptadec', 'octadec', 'nonadec', 'icos'], ['ane', 'ene', 'diene', 'yne', 'diyne', 'enyne', 'an', 'en', 'dien', 'yn', 'yl'], ['oic acid', 'oic anyhydride', 'oate', 'oyl', 'amide', 'nitrile', 'al', 'one', 'ol', 'thiol', 'amine', 'iminie'], ['carboxylic acid', 'carboxylic anyhydride', 'carboxylate', 'carbonyl', 'carboxamide', 'carbonitrile', 'carbaldehyde'], ['carboxy', 'alkoxycarbonyl', 'halocarbonyl', 'carbamoyl', 'cyano', 'oxo', 'hydroxy', 'mercapto', 'amino', 'iminio'], ['bicyclo', 'cyclo', 'sprio'], [',', '-', '.', '(', ')', '[', ']'], ['di', 'tri', 'tetra', 'bis', 'tris', 'tetrakis'], ['chloro', 'fluoro', 'nitro']])

In [10]:
# Flatten every word list into a single list ordered longest-first, so that
# longer words are tried before the shorter words they contain when the list
# is later joined into a regex alternation. The sort is stable: words of equal
# length keep the order in which they appear in the file.
list_of_bag_words = sorted(
    (word for values in bag_of_words.values() for word in values),
    key=len,
    reverse=True,
)

In [11]:
# Inspect the flattened vocabulary (sorted by length, longest first).
list_of_bag_words

['carboxylic anyhydride',
 'carboxylic acid',
 'oic anyhydride',
 'alkoxycarbonyl',
 'carbonitrile',
 'carbaldehyde',
 'halocarbonyl',
 'carboxylate',
 'carboxamide',
 'carbamoyl',
 'tetradec',
 'pentadec',
 'heptadec',
 'oic acid',
 'carbonyl',
 'mercapto',
 'tetrakis',
 'hexadec',
 'octadec',
 'nonadec',
 'nitrile',
 'carboxy',
 'hydroxy',
 'bicyclo',
 'tridec',
 'iminie',
 'iminio',
 'chloro',
 'fluoro',
 'undec',
 'dodec',
 'diene',
 'diyne',
 'enyne',
 'amide',
 'thiol',
 'amine',
 'cyano',
 'amino',
 'cyclo',
 'sprio',
 'tetra',
 'nitro',
 'meth',
 'prop',
 'pent',
 'hept',
 'icos',
 'dien',
 'oate',
 'tris',
 'eth',
 'but',
 'hex',
 'oct',
 'non',
 'dec',
 'ane',
 'ene',
 'yne',
 'oyl',
 'one',
 'oxo',
 'tri',
 'bis',
 'an',
 'en',
 'yn',
 'yl',
 'al',
 'ol',
 'di',
 ',',
 '-',
 '.',
 '(',
 ')',
 '[',
 ']']

In [12]:
def split_str_from_words(l, s):
    """Split a string into vocabulary words and the text left between them.

    Every word is matched literally. Words are tried in the order given, so
    pass the vocabulary sorted longest-first if longer words must win over
    the shorter words they contain.

    Args:
        l: Iterable of vocabulary words to split on.
        s: String to segment, e.g. an IUPAC name.

    Returns:
        The matched words and the unmatched text between them, in their
        original order. Empty pieces are dropped.
    """
    # Escape each word: entries such as '.', '(' or '[' are regex
    # metacharacters, and an unescaped '.' would match any character and
    # break unknown runs (e.g. the locant "10") into single characters.
    alternatives = [re.escape(word) for word in l if word]
    if not alternatives:
        # Nothing to split on; an empty pattern would match at every position.
        return [s] if s else []

    # The capturing group makes re.split keep the matched words in the result.
    pieces = re.split(f"({'|'.join(alternatives)})", s)
    # Adjacent matches produce empty strings between them; drop those.
    return [piece for piece in pieces if piece]


In [13]:
# Example IUPAC name used to try out the splitter.
iupac = "1,1,1,5,5,5-hexafluoropentane-2,4-dione"

# Segment the example with the length-sorted vocabulary.
split_str_from_words(list_of_bag_words, iupac)

['1',
 ',',
 '1',
 ',',
 '1',
 ',',
 '5',
 ',',
 '5',
 ',',
 '5',
 '-',
 'hex',
 'a',
 'fluoro',
 'pent',
 'ane',
 '-',
 '2',
 ',',
 '4',
 '-',
 'di',
 'one']

In [14]:
%timeit split_str_from_words(bag_of_words, iupac)

1.17 µs ± 7.57 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)


# Main

In [17]:
import json
import re
from typing import List

class KMESegmentation:
    """Segment a chemical IUPAC name into vocabulary tokens with a regular expression.

    The vocabulary is read from a JSON file that maps keys to lists of words.
    All words are flattened into ``list_of_bag_words`` (longest first) and
    compiled once into a single pattern when the instance is created.
    """
    def __init__(self, path:str = None) -> None:
        """Load the bag of words and build the segmentation pattern.

        Args:
            path (str): Path to bag of words json file. Defaults to None.

        Raises:
            ValueError: If ``path`` is None.
        """
        if path is None:
            raise ValueError("Path is not specified")

        with open(path, 'r', encoding='utf-8') as f:
            bag_of_words = json.load(f)

        self.list_of_bag_words = [
            word for values in bag_of_words.values() for word in values
        ]
        # Longest first, so longer words are tried before the shorter words
        # they contain (regex alternation takes the first alternative that
        # matches).
        self.list_of_bag_words.sort(key=len, reverse=True)

        # Escape each word: entries such as '.', '(' or '[' are regex
        # metacharacters, and an unescaped '.' would match any character and
        # break unknown runs (e.g. the locant "10") into single characters.
        alternatives = [re.escape(word) for word in self.list_of_bag_words if word]
        # Compile once here rather than on every segment() call. With no
        # usable words there is nothing to split on, so no pattern is built.
        self._pattern = (
            re.compile(f"({'|'.join(alternatives)})") if alternatives else None
        )

    def segment(self, input: str) -> List[str]:
        """Segment the input string using the bag of words.

        Args:
            input (str): Chemical IUPAC name that needs to be segmented.

        Returns:
            List[str]: Matched vocabulary words and the unmatched text between
            them, in their original order. Empty pieces are dropped.
        """
        if self._pattern is None:
            return [input] if input else []

        # The capturing group in the pattern makes split() keep the matched
        # words; adjacent matches leave empty strings that are filtered out.
        return [piece for piece in self._pattern.split(input) if piece]

In [18]:
# Build the tokenizer from the bag-of-words JSON file.
tokenizer = KMESegmentation('../data/raw/bag-of-words.json')

In [19]:
# Segment an example IUPAC name with the class-based tokenizer.
tokenizer.segment('1,1,1,5,5,5-hexafluoropentane-2,4-dione')

['1',
 ',',
 '1',
 ',',
 '1',
 ',',
 '5',
 ',',
 '5',
 ',',
 '5',
 '-',
 'hex',
 'a',
 'fluoro',
 'pent',
 'ane',
 '-',
 '2',
 ',',
 '4',
 '-',
 'di',
 'one']