In [2]:
import pandas as pd
import numpy
import spacy

In [6]:
# Path of the training text that the cells below read through load_file().
data_file = 'data/wikitext-2/train.txt'
# spaCy pipeline object; tokenize() calls it on every line of the file.
# NOTE(review): the "en" shortcut name presumably needs an older spaCy release — confirm the pinned version.
nlp = spacy.load("en")

In [9]:
# def tokenize(x):
#     return x.strip().split(' ')

def tokenize(x):
    """Run the notebook-level spaCy pipeline ``nlp`` over the string ``x``.

    ``parse``, ``tag`` and ``entity`` are all passed as ``False`` —
    presumably to skip those pipeline stages and only tokenize (confirm
    against the installed spaCy version).

    Returns whatever ``nlp`` returns; callers iterate it and read ``.text``
    from each item.
    """
    doc = nlp(x, parse=False, tag=False, entity=False)
    return doc

def load_file(file_name):
    """Lazily yield the token texts of each line of a text file.

    Parameters
    ----------
    file_name : str or path-like
        Path of the file to read.

    Yields
    ------
    list of str
        The ``.text`` of every token that ``tokenize`` produces for one
        line, in order; one list per line of the file.
    """
    # Open the path that was passed in. Previously the notebook-level
    # `data_file` global was opened and this argument was silently ignored.
    with open(file_name) as d:
        for line in d:
            yield [x.text for x in tokenize(line)]

def get_tokens(file):
    """Count how often each token occurs across an iterable of sentences.

    Parameters
    ----------
    file : iterable of iterables
        Sentences, each an iterable of hashable tokens.

    Returns
    -------
    dict
        Maps every distinct token to its number of occurrences; keys are in
        first-seen order.
    """
    tokens = {}
    for sent in file:
        for token in sent:
            # dict.get replaces the former bare `except:`, which treated any
            # exception (even KeyboardInterrupt) as "first occurrence".
            tokens[token] = tokens.get(token, 0) + 1
    return tokens
            

In [10]:
%%time 
# Stream the training file through the tokenizer (load_file is a generator)
# and count the occurrences of every distinct token.
f = load_file(data_file)
tokens = get_tokens(f)
# Number of distinct tokens found.
print(len(tokens))

33242
CPU times: user 5.63 s, sys: 1.09 ms, total: 5.63 s
Wall time: 5.64 s


In [11]:
# Keep only the distinct token strings: iterating the count dict yields its
# keys, so the counts are dropped here.
# NOTE: this rebinds `tokens` from a dict to a list; later cells slice it.
tokens = list(tokens)

In [12]:
%%time
from collections import Counter
# Character statistics over the vocabulary. Keys are tuples:
#   (c,)     -> how often character c occurs
#   (pc, c)  -> how often pc is immediately followed by c
# Each word is wrapped as "*word#" so the boundaries count as characters.
pairs = Counter()
for token in tokens:
    token = token.strip()
    # Only words longer than four characters (after stripping) contribute.
    if len(token) <= 4:
        continue
    token = f"*{token}#"
    # The start marker never appears as the second item of a pair, so it is
    # counted once up front.
    pairs[('*',)] += 1
    for pc, c in zip(token, token[1:]):
        pairs[(pc, c)] += 1
        pairs[(c,)] += 1
        

CPU times: user 197 ms, sys: 0 ns, total: 197 ms
Wall time: 196 ms


In [46]:
import math
def calc_assocs(pairs):
    """Score every 2-tuple key of ``pairs`` with two directional deltas.

    Parameters
    ----------
    pairs : Counter or dict
        Counts keyed by tuples. 1-tuples ``(x,)`` are the marginal counts of
        single units; 2-tuples ``(x, y)`` are the counts of ``x`` immediately
        followed by ``y``.

    Returns
    -------
    Counter
        Maps each scored ``(x, y)`` to ``((delta_p + delta_p_r) / 2, delta_p_r)``
        where, with ``total`` the sum of *all* values in ``pairs``::

            delta_p   = c(x,y)/c(y) - (c(x) - c(x,y)) / (total - c(y))
            delta_p_r = c(x,y)/c(x) - (c(y) - c(x,y)) / (total - c(x))

        A Counter is returned so that looking up an unscored pair gives ``0``
        instead of raising.

    Notes
    -----
    Pairs for which any of the four denominators is zero are left out of the
    result. Previously a missing marginal count was printed and then divided
    by, which raised ``ZeroDivisionError``.
    """
    total = sum(pairs.values())
    assocs = Counter()
    for p in pairs:
        if len(p) < 2:
            # 1-tuples are marginal counts, not pairs to score.
            continue
        x, y = p
        c_xy = pairs[(x, y)]
        # .get never inserts a key, so iterating `pairs` stays safe even when
        # it is a plain dict or a defaultdict.
        c_x = pairs.get((x,), 0)
        c_y = pairs.get((y,), 0)
        not_x = total - c_x
        not_y = total - c_y
        if 0 in (c_x, c_y, not_x, not_y):
            # The score is undefined without both marginals and a non-empty
            # complement; skip rather than divide by zero.
            continue

        p_x_given_y = c_xy / c_y
        p_x_given_not_y = (c_x - c_xy) / not_y

        p_y_given_x = c_xy / c_x
        p_y_given_not_x = (c_y - c_xy) / not_x

        delta_p = p_x_given_y - p_x_given_not_y
        delta_p_r = p_y_given_x - p_y_given_not_x
        assocs[(x, y)] = (delta_p + delta_p_r) / 2, delta_p_r
    return assocs
        

In [14]:
%%time
assocs = calc_assocs(pairs)

CPU times: user 4.25 ms, sys: 0 ns, total: 4.25 ms
Wall time: 4.05 ms


In [102]:
def calc_sequence(token, assocs):
    """Pair up neighbouring characters of ``token`` with their scores.

    The token is wrapped as ``"*token#"`` and every adjacent character pair
    is looked up in ``assocs``. A lookup result that is not a plain tuple
    (for example the ``0`` a Counter gives for a missing key) is replaced by
    ``(-10, -10)``.

    Returns a list of ``(previous_char, char, score)`` triples in order.
    """
    marked = f"*{token}#"
    scored = []
    for left, right in zip(marked, marked[1:]):
        score = assocs[(left, right)]
        if type(score) is not tuple:
            score = (-10, -10)
        scored.append((left, right, score))
    return scored

In [152]:
#def format_seq_item(seq_item):
    

def break_sequence(token, assocs):
    """Split a token into runs of character pairs with non-decreasing score.

    Parameters
    ----------
    token : str
        The word to segment (boundary markers are added by calc_sequence).
    assocs : mapping
        Pair scores as accepted by calc_sequence.

    Returns
    -------
    list of list
        Segments in order; each segment is a list of
        ``(previous_char, char, score)`` triples. A new segment starts
        whenever the first score component of a pair is lower than that of
        the pair before it.
    """
    seq_scores = calc_sequence(token, assocs)
    segments = [[seq_scores[0]]]

    for pair in seq_scores[1:]:
        # Compare against the most recent pair of the current segment.
        prev_score = segments[-1][-1][-1][0]
        current_score = pair[-1][0]

        # Only the first score component drives the split; the second one
        # used to be read into locals that were never used.
        if current_score >= prev_score:
            segments[-1].append(pair)
        else:
            segments.append([pair])
    return segments

def get_segments(segments):
    """Collapse scored segments into plain strings.

    Each segment is a sequence of ``(previous, current, score)`` items. Its
    string is the concatenation of the ``current`` parts; the very first
    segment is additionally prefixed with the ``previous`` part of its first
    item, so no leading piece is lost.

    Returns a list with one string per segment.
    """
    pieces = [''.join(item[1] for item in seg) for seg in segments]
    pieces[0] = segments[0][0][0] + pieces[0]
    return pieces

In [153]:
# Peek at the first ten distinct tokens; whitespace-only entries are present.
tokens[:10]

[' \n', ' ', '=', 'Valkyria', 'Chronicles', 'III', '\n', 'Senjō', 'no', '3']

In [154]:
from pprint import pprint
# Segment one example word at the character level and show the scored
# (previous_char, char, score) triples grouped per segment.
segments = break_sequence('pictures', assocs)
pprint(segments)
# Cell output: the same segments collapsed to strings.
get_segments(segments)

[[('*', 'p', (0.1311015515781312, 0.042354210131006466))],
 [('p', 'i', (0.031185753925970575, 0.04821829069895062)),
  ('i', 'c', (0.11520417482608117, 0.0728002665488222))],
 [('c', 't', (0.05615470179520681, 0.07153683733698775))],
 [('t', 'u', (0.033010876453544405, 0.020970869601242517)),
  ('u', 'r', (0.09340248135713808, 0.1320753958047427)),
  ('r', 'e', (0.11807387017001868, 0.14048887076092767)),
  ('e', 's', (0.1261080069497572, 0.0983210116511411)),
  ('s', '#', (0.269699683397255, 0.3456234344006285))]]


['*p', 'ic', 't', 'ures#']

In [155]:
%%time
from collections import Counter
# Same statistics as the character-level counts, one level up: the units are
# now the character-level segments of each word.
#   (seg,)       -> how often a segment occurs
#   (pseg, seg)  -> how often pseg is immediately followed by seg
c_pairs = Counter()
for token in tokens:
    token = token.strip()
    # Same filter as the character-level pass: longer than four characters.
    if len(token) <= 4:
        continue
    segments = get_segments(break_sequence(token, assocs))
    # The first segment never appears as the second item of a pair, so it is
    # counted separately.
    c_pairs[(segments[0],)] += 1
    for pc, c in zip(segments, segments[1:]):
        c_pairs[(pc, c)] += 1
        c_pairs[(c,)] += 1
        

CPU times: user 342 ms, sys: 7.28 ms, total: 350 ms
Wall time: 348 ms


In [156]:
# Score adjacent-segment pairs with the same function used for character pairs.
c_assocs = calc_assocs(c_pairs)

In [157]:
def c_calc_sequence(token, assocs, c_assocs):
    """Pair up neighbouring character-level segments of ``token`` with scores.

    The token is first segmented with ``break_sequence``/``get_segments``
    using ``assocs``; every adjacent pair of segment strings is then looked
    up in ``c_assocs``. A lookup result that is not a plain tuple is replaced
    by ``(-10, -10)``.

    Returns a list of ``(previous_segment, segment, score)`` triples. When
    there is no adjacent pair to score, the list of segment strings itself is
    returned instead.
    """
    pieces = get_segments(break_sequence(token, assocs))
    scored = []
    for left, right in zip(pieces, pieces[1:]):
        score = c_assocs[(left, right)]
        if type(score) is not tuple:
            score = (-10, -10)
        scored.append((left, right, score))
    return scored if scored else pieces


def c_break_sequence(token, assocs, c_assocs):
    """Merge character-level segments of ``token`` into larger segments.

    Parameters
    ----------
    token : str
        The word to segment.
    assocs : mapping
        Character-pair scores, used for the first segmentation pass.
    c_assocs : mapping
        Segment-pair scores, used to decide which neighbours are merged.

    Returns
    -------
    list of list
        Groups of ``(previous_segment, segment, score)`` triples, suitable
        for ``get_segments``. A pair joins the current group only when its
        first score is not lower, and its second score is strictly higher,
        than those of the pair before it; otherwise it starts a new group.
    """
    seq_scores = c_calc_sequence(token, assocs, c_assocs)

    if isinstance(seq_scores[0], str):
        # c_calc_sequence found no adjacent pair and handed back the bare
        # segment strings. Wrap the single segment in a well-formed triple
        # with an empty right-hand side so get_segments rebuilds the whole
        # word; previously it was reduced to its first two characters.
        return [[(seq_scores[0], '', (-10, -10))]]

    segments = [[seq_scores[0]]]

    for pair in seq_scores[1:]:
        prev_seg = segments[-1][-1]
        prev_score = prev_seg[-1][0]
        current_score = pair[-1][0]
        prev_score_r = prev_seg[-1][1]
        current_score_r = pair[-1][1]

        if current_score >= prev_score and current_score_r > prev_score_r:
            segments[-1].append(pair)
        else:
            segments.append([pair])
    return segments


In [158]:
# The output below is a bare list of segment strings rather than scored
# triples: c_calc_sequence returns that when there is no adjacent pair to score.
c_calc_sequence('apples', assocs, c_assocs)

['*apples#']

In [162]:
# Second-level segmentation of the example word: scored segment pairs
# grouped by c_break_sequence, then collapsed to strings as the cell output.
segs = c_break_sequence('pictures', assocs, c_assocs)
pprint(segs)
get_segments(segs)

[[('*p', 'ic', (0.006277062600799739, 0.0027433061082693683)),
  ('ic', 't', (0.01768971830260924, 0.030021624374684374))],
 [('t', 'ures#', (0.08493006426974567, 0.002659853322716279))]]


['*pict', 'ures#']

In [160]:
# Segment every token among the first 3000 whose raw length exceeds five.
# NOTE(review): the count-building loops strip each token and keep those longer
# than four characters; this filter uses the unstripped length and a threshold
# of five — confirm the mismatch is intended.
all_segs = {x : get_segments(c_break_sequence(x, assocs, c_assocs)) for x in tokens[:3000] if len(x) > 5}

In [161]:
# Print the whole token -> segments mapping (pprint shows dict keys sorted).
pprint(all_segs)

{'Abraham': ['*Abra', 'ham', '#'],
 'According': ['*Ac', 'cor', 'ding#'],
 'Action': ['*Action', '#'],
 'Adaptations': ['*Ad', 'aptation', 's#'],
 'Administration': ['*Ad', 'minist', 'ration', '#'],
 'Advertiser': ['*Ad', 'ver', 'ti', 's', 'er#'],
 'Africa': ['*Afric', 'a', '#'],
 'African': ['*Afric', 'an', '#'],
 'Allies': ['*Alli', 'es#'],
 'Alongside': ['*Al', 'ong', 'side#'],
 'Alphabet': ['*Al', 'phab', 'e', 't#'],
 'Although': ['*Al', 'though', '#'],
 'Always': ['*Al', 'way', 's#'],
 'American': ['*Americ', 'an#'],
 'Americans': ['*Americ', 'ans#'],
 'Andrew': ['*And', 're', 'w#'],
 'Andrews': ['*And', 're', 'ws#'],
 'Angeles': ['*Ang', 'e', 'les#'],
 'Anglican': ['*Ang', 'lican#'],
 'Another': ['*An', 'other#'],
 'Antiquities': ['*Anti', 'qu', 'ities#'],
 'Archipelago': ['*Ar', 'chi', 'pelag', 'o#'],
 'Arkadelphia': ['*Ar', 'kade', 'l', 'phi', 'a#'],
 'Arkansas': ['*Ar', 'kan', 's', 'as#'],
 'Armored': ['*Ar', 'mored#'],
 'Arniel': ['*Ar', 'ni', 'el#'],
 'Arsenal': ['*Ar', 's',