### Multi Word Segmenter

The code below adopts Peter Norvig's approach to word segmentation, described here:  
https://nbviewer.jupyter.org/url/norvig.com/ipython/How%20to%20Do%20Things%20with%20Words.ipynb

1) Import Dependencies

In [1]:
from __future__ import division
%pylab inline
import os
import regex as re
import math
import string
from collections import Counter
import pandas as pd
import numpy as np
import nltk
from tqdm import tqdm_notebook as tqdm

#!pip install -U symspellpy
from symspellpy.symspellpy import SymSpell, Verbosity  # import the module

Populating the interactive namespace from numpy and matplotlib


2) Import Custom Dictionary

In [2]:
# Read the custom frequency dictionary as raw text, keeping the line breaks.
# The previous version removed every newline and then tried to re-split the
# glued text with r'(\d+)(\w+)'.  Because \w also matches digits, that chopped
# the last digit off any count not directly followed by a letter (always the
# final entry) and merged digit-leading words into the preceding count.
# NOTE(review): assumes "<word> <count>" entries separated by whitespace,
# normally one per line -- confirm against custom_dict_bg.txt.
with open('custom_dict_bg.txt', 'r') as myfile:
    DICT = myfile.read()

# Lower-case once so that every key stored in C is lower-case.
DICT2 = DICT.lower()

# Collect (word, count) string pairs one line at a time, so a malformed line
# cannot bleed into its neighbours.  (?!\w) makes sure the count is a whole
# token rather than the leading digits of something longer.
d_tup = [pair
         for line in DICT2.splitlines()
         for pair in re.findall(r'(\w+)\s+(\d+)(?!\w)', line)]

# Word -> count lookup used by Pword.  If a word occurs more than once, the
# last count seen wins (assignment, not accumulation), as before.
C = Counter()
for word, count in d_tup:
    C[word] = int(count)

3) Implement Word Segmentation Functions

In [3]:
def Pword(token, total=30e9):
    """Return the estimated probability of `token` as a single word.

    The estimate is the count stored for `token` in the module-level
    Counter `C`, divided by `total`.  A token that is absent from `C` has a
    count of 0 (Counter returns 0 for missing keys), so its probability is 0.

    Parameters
    ----------
    token : hashable
        Key looked up in `C`; normally a lower-case word.
    total : number, optional
        Denominator of the estimate.  Defaults to 30e9, the value that used
        to be hard-coded here, so existing callers are unaffected.
        NOTE(review): presumably the total token count of the reference
        corpus -- confirm it suits the custom dictionary.

    Returns
    -------
    float
        C[token] / total.
    """
    return C[token] / total

def Pwords(words):
    """Return the joint probability of a sequence of words.

    Each word is scored on its own with Pword and the scores are multiplied
    together with product, i.e. the words are treated as independent.
    """
    word_probs = map(Pword, words)
    return product(word_probs)

def product(nums):
    """Return all items of `nums` multiplied together.

    Works like the built-in `sum`, but with multiplication; an empty
    iterable gives 1.
    """
    running = 1
    for factor in nums:
        running *= factor
    return running

In [4]:
def memo(f):
    """Return a memoized version of `f`.

    Results are stored in a dict keyed by the tuple of positional arguments,
    so every argument must be hashable.  Keyword arguments are not supported.
    The dict is exposed as the `cache` attribute of the returned function,
    which makes it possible to inspect or clear it.

    The wrapper keeps the name and docstring of `f`, so a decorated function
    still identifies itself correctly in help() and tracebacks.
    """
    # Local import so this cell does not depend on the notebook's import cell.
    from functools import wraps

    cache = {}

    @wraps(f)
    def fmemo(*args):
        # Compute each distinct argument tuple once, then reuse the result.
        if args not in cache:
            cache[args] = f(*args)
        return cache[args]

    fmemo.cache = cache
    return fmemo

In [5]:
def splits(text, start=0, L=20):
    """Return every (first, rest) split of `text` as a list of pairs.

    The length of `first` runs from `start` up to `L`, or up to len(text)
    when the text is shorter than `L`; `rest` is whatever remains.
    """
    longest_first = min(len(text), L)
    pairs = []
    for cut in range(start, longest_first + 1):
        pairs.append((text[:cut], text[cut:]))
    return pairs

In [6]:
def wordlist(wlist):
    """Join the strings in `wlist` into one string separated by single spaces.

    An empty `wlist` gives the empty string.  Uses str.join rather than
    repeated `+=` concatenation followed by trimming the trailing space; the
    result is the same, without rebuilding the string on every iteration.
    """
    return ' '.join(wlist)

In [7]:
@memo
def segment(text):
    """Return the most probable segmentation of `text` as a list of words.

    Every way of cutting off a non-empty first piece (see splits) is
    combined with the best segmentation of the remainder, and the candidate
    with the highest Pwords score is returned.  Empty text gives an empty
    list.  Results are memoized, so each distinct remainder is solved once.
    """
    if text:
        options = ([head] + segment(tail)
                   for (head, tail) in splits(text, 1))
        return max(options, key=Pwords)
    return []

In [8]:
def seg_text(text):
    """Segment `text` and return the words as one space-separated string."""
    words = segment(text)
    return wordlist(words)

In [9]:
def seg_paragraph(text):
    """Segment every token of `text` and return the result as one string.

    The text is split into tokens with nltk.word_tokenize, each token is
    segmented separately with seg_text, and the segmented tokens are joined
    with single spaces.  Text that yields no tokens gives the empty string.

    Uses str.join instead of repeated `+=` concatenation followed by
    trimming the trailing space; the output is the same.
    """
    return ' '.join(seg_text(token) for token in nltk.word_tokenize(text))

4) Test it out

In [10]:
# Peter Norvig's example: the opening of the Declaration of Independence
# with all spaces and punctuation removed.
decl = ''.join([
    'wheninthecourseofhumaneventsitbecomesnecessaryforonepeople',
    'todissolvethepoliticalbandswhichhaveconnectedthemwithanother',
    'andtoassumeamongthepowersoftheearththeseparateandequalstation',
    'towhichthelawsofnatureandofnaturesgodentitlethem',
])

# Show the segmentation as a list of words.
print(segment(decl))

['when', 'in', 'the', 'course', 'of', 'human', 'events', 'it', 'becomes', 'necessary', 'for', 'one', 'people', 'to', 'dissolve', 'the', 'political', 'bands', 'which', 'have', 'connected', 'them', 'with', 'another', 'and', 'to', 'assume', 'among', 'the', 'powers', 'of', 'the', 'earth', 'the', 'separate', 'and', 'equal', 'station', 'to', 'which', 'the', 'laws', 'of', 'nature', 'and', 'of', 'natures', 'god', 'entitle', 'them']


In [11]:
# Test a long run-on of special words concatenated from the custom dictionary.
# Note: this is not intended to make as much sense as Peter's example above.
#
# Expected segmentation (exactly the words present in the input string):
#   'shri', 'purohit', 'bhagavad', 'gita', 'dnyana', 'mahabaratha', 'pandavas',
#   'dhritarashtra', 'pandu', 'duryodhana', 'sanjaya', 'bheeshma',
#   'kurukshetra', 'drona', 'drupada', 'bheema', 'virata', 'soubhadra',
#   'droupadi', 'karna', 'kuru', 'conches', 'kunti', 'viwaswana', 'manu',
#   'apana', 'om', 'narada', 'vyasa'
# The earlier version of this note also listed 'knowest', which does not
# occur in the input below and therefore cannot appear in the output.
seg_text('shripurohitbhagavadgitadnyanamahabarathapandavasdhritarashtrapanduduryodhanasanjayabheeshmakurukshetradronadrupadabheemaviratasoubhadradroupadikarnakuruconcheskuntiviwaswanamanuapanaomnaradavyasa')

'shri purohit bhagavad gita dnyana mahabaratha pandavas dhritarashtra pandu duryodhana sanjaya bheeshma kurukshetra drona drupada bheema virata soubhadra droupadi karna kuru conches kunti viwaswana manu apana om narada vyasa'