# Word tokenizer

<div class="alert alert-info">

This tutorial is available as an IPython notebook at [Malaya/example/tokenizer-word](https://github.com/huseinzol05/Malaya/tree/master/example/tokenizer-word).
    
</div>

In [1]:
%%time
import malaya

CPU times: user 6.01 s, sys: 1.2 s, total: 7.21 s
Wall time: 8.42 s


In [2]:
string1 = 'xjdi ke, y u xsuke makan HUSEIN kt situ tmpt, i hate it. pelikle, pada'
string2 = 'i mmg2 xske mknn HUSEIN kampng tmpat, i love them. pelikle saye'
string3 = 'perdana menteri ke11 sgt suka makn ayam, harganya cuma rm15.50'
string4 = 'pada 10/4, kementerian mengumumkan, 1/100'
string5 = 'Husein Zolkepli (911223-06-2305), dapat tempat ke-12 lumba lari hari ni'
string6 = 'Husein Zolkepli (2011 - 2019) adalah ketua kampng di kedah sekolah King Edward ke-IV'
string7 = '2jam 30 minit aku tunggu kau, 60.1 kg kau ni, suhu harini 31.2c, aku dahaga minum 600ml'
string8 = 'online & desktop: regexr.com or download the desktop version for Mac'
string9 = 'belajaq unity di google.us.edi?34535/534534?dfg=g&fg unity'
string10 = 'Gambar ni membantu. Gambar tutorial >>. facebook. com/story. story_fbid=10206183032200965&id=1418962070'

### Load word tokenizer

```python
class Tokenizer:
    def __init__(self, **kwargs):
        """
        Load Tokenizer object.
        Check the supported regex patterns at
        https://github.com/huseinzol05/Malaya/blob/master/malaya/text/regex.py#L85

        Every option below is passed as a keyword argument.

        Parameters
        ----------
        emojis: bool, optional (default=False)
            True to keep emojis.
        urls: bool, optional (default=True)
            True to keep urls.
        urls_improved: bool, optional (default=True)
            True to keep urls, better version.
        tags: bool, optional (default=True)
            True to keep tags: <tag>.
        emails: bool, optional (default=True)
            True to keep emails.
        users: bool, optional (default=True)
            True to keep user handles: @cbaziotis.
        hashtags: bool, optional (default=True)
            True to keep hashtags.
        phones: bool, optional (default=True)
            True to keep phones.
        percents: bool, optional (default=True)
            True to keep percents.
        money: bool, optional (default=True)
            True to keep money expressions.
        date: bool, optional (default=True)
            True to keep date expressions.
        time: bool, optional (default=True)
            True to keep time expressions.
        time_pukul: bool, optional (default=True)
            True to keep time `pukul` expressions.
        acronyms: bool, optional (default=True)
            True to keep acronyms.
        emoticons: bool, optional (default=True)
            True to keep emoticons.
        censored: bool, optional (default=True)
            True to keep censored words: f**k.
        emphasis: bool, optional (default=True)
            True to keep words with emphasis: *very* good.
        numbers: bool, optional (default=True)
            True to keep numbers.
        temperature: bool, optional (default=True)
            True to keep temperatures.
        distance: bool, optional (default=True)
            True to keep distances.
        volume: bool, optional (default=True)
            True to keep volumes.
        duration: bool, optional (default=True)
            True to keep durations.
        weight: bool, optional (default=True)
            True to keep weights.
        hypen: bool, optional (default=True)
            True to keep hyphenated words: sememang-memangnya.
        ic: bool, optional (default=True)
            True to keep Malaysian IC.
        title: bool, optional (default=True)
            True to keep titles with a trailing dot: Dr. ayam -> ['Dr.', 'ayam'].
        """
```

In [3]:
tokenizer = malaya.tokenizer.Tokenizer()

### Tokenize

```python
def tokenize(self, string: str, lowercase: bool = False):
    """
    Tokenize string into words.

    Parameters
    ----------
    string : str
        Input text to tokenize.
    lowercase: bool, optional (default=False)

    Returns
    -------
    result: List[str]
        Tokens extracted from `string`.
    """
```

In [4]:
tokenizer.tokenize(string1)

['xjdi',
 'ke',
 ',',
 'y',
 'u',
 'xsuke',
 'makan',
 'HUSEIN',
 'kt',
 'situ',
 'tmpt',
 ',',
 'i',
 'hate',
 'it',
 '.',
 'pelikle',
 ',',
 'pada']

In [5]:
tokenizer.tokenize(string2)

['i',
 'mmg2',
 'xske',
 'mknn',
 'HUSEIN',
 'kampng',
 'tmpat',
 ',',
 'i',
 'love',
 'them',
 '.',
 'pelikle',
 'saye']

In [6]:
tokenizer.tokenize(string3)

['perdana',
 'menteri',
 'ke11',
 'sgt',
 'suka',
 'makn',
 'ayam',
 ',',
 'harganya',
 'cuma',
 'rm15.50']

In [7]:
tokenizer.tokenize(string4)

['pada',
 '10',
 '/',
 '4',
 ',',
 'kementerian',
 'mengumumkan',
 ',',
 '1',
 '/',
 '100']

In [8]:
tokenizer.tokenize(string5)

['Husein',
 'Zolkepli',
 '(',
 '911223-06-2305',
 ')',
 ',',
 'dapat',
 'tempat',
 'ke-12',
 'lumba',
 'lari',
 'hari',
 'ni']

In [9]:
tokenizer.tokenize(string6)

['Husein',
 'Zolkepli',
 '(',
 '2011',
 '-',
 '2019',
 ')',
 'adalah',
 'ketua',
 'kampng',
 'di',
 'kedah',
 'sekolah',
 'King',
 'Edward',
 'ke-IV']

In [10]:
tokenizer.tokenize(string7)

['2jam',
 '30 minit',
 'aku',
 'tunggu',
 'kau',
 ',',
 '60.1 kg',
 'kau',
 'ni',
 ',',
 'suhu',
 'harini',
 '31.2c',
 ',',
 'aku',
 'dahaga',
 'minum',
 '600ml']

In [11]:
tokenizer.tokenize(string8)

['online',
 '&',
 'desktop',
 ':',
 'regexr.com',
 'or',
 'download',
 'the',
 'desktop',
 'version',
 'for',
 'Mac']

In [12]:
tokenizer.tokenize(string9)

['belajaq', 'unity', 'di', 'google.us.edi?34535/534534?dfg=g&fg', 'unity']

#### urls

In [13]:
tokenizer.tokenize('website saya http://huseinhouse.com')

['website', 'saya', 'http://huseinhouse.com']

In [14]:
tokenizer.tokenize('website saya huseinhouse.com')

['website', 'saya', 'huseinhouse.com']

In [15]:
tokenizer.tokenize('website saya huseinhouse.com/pelik?a=1')

['website', 'saya', 'huseinhouse.com/pelik?a=1']

#### tags

In [16]:
tokenizer.tokenize('panggil saya <husein>')

['panggil', 'saya', '<husein>']

In [17]:
tokenizer.tokenize('panggil saya <husein >')

['panggil', 'saya', '<', 'husein', '>']

#### emails

In [18]:
tokenizer.tokenize('email saya husein@rumah.com')

['email', 'saya', 'husein@rumah.com']

In [19]:
tokenizer.tokenize('email saya husein@rumah.com.my')

['email', 'saya', 'husein@rumah.com.my']

#### users

In [20]:
tokenizer.tokenize('twitter saya @husein123zolkepli')

['twitter', 'saya', '@husein123zolkepli']

In [21]:
tokenizer.tokenize('twitter saya @ husein123zolkepli')

['twitter', 'saya', '@', 'husein123zolkepli']

#### hashtags

In [22]:
tokenizer.tokenize('panggil saya #huseincomel')

['panggil', 'saya', '#huseincomel']

In [23]:
tokenizer.tokenize('panggil saya # huseincomel')

['panggil', 'saya', '#', 'huseincomel']

#### phones

In [24]:
tokenizer.tokenize('call sye di 013-1234567')

['call', 'sye', 'di', '013-1234567']

In [25]:
tokenizer.tokenize('call sye di 013- 1234567')

['call', 'sye', 'di', '013', '-', '1234567']

#### percents

In [26]:
tokenizer.tokenize('saya sokong 100%')

['saya', 'sokong', '100%']

In [27]:
tokenizer.tokenize('saya sokong 100 %')

['saya', 'sokong', '100', '%']

#### money

In [28]:
tokenizer.tokenize('saya tinggal rm100')

['saya', 'tinggal', 'rm100']

In [29]:
tokenizer.tokenize('saya tinggal rm100k')

['saya', 'tinggal', 'rm100k']

In [30]:
tokenizer.tokenize('saya tinggal rm100M')

['saya', 'tinggal', 'rm100M']

In [31]:
tokenizer.tokenize('saya tinggal rm100.123M')

['saya', 'tinggal', 'rm100.123M']

In [32]:
tokenizer.tokenize('saya tinggal 40 sen')

['saya', 'tinggal', '40 sen']

In [33]:
tokenizer.tokenize('saya tinggal 21 ringgit 50 sen')

['saya', 'tinggal', '21 ringgit', '50 sen']

In [34]:
tokenizer.tokenize('saya tinggal 21 juta ringgit')

['saya', 'tinggal', '21 juta ringgit']

In [35]:
tokenizer.tokenize('saya tinggal rm 2ribu')

['saya', 'tinggal', 'rm 2ribu']

In [36]:
tokenizer.tokenize('saya tinggal rm2 juta')

['saya', 'tinggal', 'rm2 juta']

#### date

In [37]:
tokenizer.tokenize('tarikh perjumpaan 10/11/2011')

['tarikh', 'perjumpaan', '10/11/2011']

In [38]:
tokenizer.tokenize('tarikh perjumpaan 10-11-2011')

['tarikh', 'perjumpaan', '10-11-2011']

In [39]:
tokenizer.tokenize('tarikh perjumpaan 12 mei 2011')

['tarikh', 'perjumpaan', '12 mei 2011']

In [40]:
tokenizer.tokenize('tarikh perjumpaan mei 12 2011')

['tarikh', 'perjumpaan', 'mei 12 2011']

#### time

In [41]:
tokenizer.tokenize('jumpa 3 am')

['jumpa', '3 am']

In [42]:
tokenizer.tokenize('jumpa 3.30am')

['jumpa', '3.30am']

In [43]:
tokenizer.tokenize('jumpa 22:00')

['jumpa', '22:00']

In [44]:
tokenizer.tokenize('jumpa pukul 2')

['jumpa', 'pukul 2']

In [45]:
tokenizer.tokenize('jumpa pukul 2.30')

['jumpa', 'pukul 2.30']

In [46]:
tokenizer.tokenize('jumpa 2.30 pagi')

['jumpa', '2.30 pagi']

In [47]:
tokenizer.tokenize('jumpa 2.30 ptg')

['jumpa', '2.30 ptg']

In [48]:
tokenizer.tokenize('jumpa 2.30 malam')

['jumpa', '2.30 malam']

In [49]:
tokenizer.tokenize('jumpa 2.30 tngahari')

['jumpa', '2.30 tngahari']

In [50]:
tokenizer.tokenize('jumpa 2:30:00 tngahari')

['jumpa', '2:30:00 tngahari']

In [51]:
tokenizer.tokenize('jumpa pukul 2:30:00 tngahari')

['jumpa', 'pukul 2:30:00', 'tngahari']

#### censored

In [52]:
tokenizer.tokenize('f**k lah')

['f**k', 'lah']

#### emphasis

In [53]:
tokenizer.tokenize('*damn* good weih')

['*damn*', 'good', 'weih']

#### numbers

In [54]:
tokenizer.tokenize('no saya 123')

['no', 'saya', '123']

#### temperature

In [55]:
tokenizer.tokenize('sejuk harini, 31.1c')

['sejuk', 'harini', ',', '31.1c']

In [56]:
tokenizer.tokenize('sejuk harini, 31.1C')

['sejuk', 'harini', ',', '31.1C']

#### distance

In [57]:
tokenizer.tokenize('nak sampai lagi 31km')

['nak', 'sampai', 'lagi', '31km']

In [58]:
tokenizer.tokenize('nak sampai lagi 31 km')

['nak', 'sampai', 'lagi', '31 km']

#### volume

In [59]:
tokenizer.tokenize('botol ni 400ml')

['botol', 'ni', '400ml']

In [60]:
tokenizer.tokenize('botol ni 400 l')

['botol', 'ni', '400 l']

#### duration

In [61]:
tokenizer.tokenize('aku dah tunggu kau 2jam kut')

['aku', 'dah', 'tunggu', 'kau', '2jam', 'kut']

In [62]:
tokenizer.tokenize('aku dah tunggu kau 2 jam kut')

['aku', 'dah', 'tunggu', 'kau', '2 jam', 'kut']

In [63]:
tokenizer.tokenize('lagi 10 minit 3 jam')

['lagi', '10 minit', '3 jam']

#### weight

In [64]:
tokenizer.tokenize('berat kau 60 kg')

['berat', 'kau', '60 kg']

In [65]:
tokenizer.tokenize('berat kau 60kg')

['berat', 'kau', '60kg']

#### hyphen

In [66]:
tokenizer.tokenize('sememang-memangnya kau sakai')

['sememang-memangnya', 'kau', 'sakai']

In [67]:
tokenizer.tokenize('sememang- memangnya kau sakai')

['sememang', '-', 'memangnya', 'kau', 'sakai']

#### IC

In [68]:
tokenizer.tokenize('sememang-memangnya kau sakai, 911223-06-2305')

['sememang-memangnya', 'kau', 'sakai', ',', '911223-06-2305']

#### titles

In [69]:
tokenizer.tokenize('dr. syed sakai gile.')

['dr.', 'syed', 'sakai', 'gile', '.']

In [70]:
tokenizer.tokenize('dr. phd. syed sakai gile.')

['dr.', 'phd.', 'syed', 'sakai', 'gile', '.']

In [71]:
tokenizer.tokenize('ybhg. dr. syed.')

['ybhg.', 'dr.', 'syed', '.']