In [1]:
import pandas as pd
import numpy
import spacy

In [2]:
# Path to the WikiText-2 training text, relative to the notebook's working directory.
data_file = 'data/wikitext-2/train.txt'
# spaCy pipeline object; tokenize() calls it on every input line.
# NOTE(review): the "en" shortcut name is presumably only resolvable on older
# spaCy releases — confirm against the installed spaCy version.
nlp = spacy.load("en")

In [3]:
! wc -l data/wikitext-2/train.txt

36718 data/wikitext-2/train.txt


In [4]:
# def tokenize(x):
#     return x.strip().split(' ')

def tokenize(x):
    """Run the notebook-level spaCy pipeline ``nlp`` over the text ``x``.

    The pipeline is called with ``parse``, ``tag`` and ``entity`` all set to
    ``False`` (presumably so that only tokenisation is performed — this is the
    spaCy 1.x-style call signature; TODO confirm against the installed version).

    Returns whatever ``nlp`` returns; callers iterate over it and read
    ``.text`` from each element.
    """
    pipeline_flags = {"parse": False, "tag": False, "entity": False}
    return nlp(x, **pipeline_flags)

def load_file(file_name):
    """Lazily tokenise a text file, one line at a time.

    Parameters
    ----------
    file_name : str or path-like
        File to read. (The previous version ignored this argument and always
        opened the global ``data_file``.)

    Yields
    ------
    list of str
        The ``.text`` of every token that ``tokenize`` produces for one line of
        the file. Lines are passed to ``tokenize`` unstripped, so the trailing
        newline is part of the input.
    """
    # Explicit encoding: the corpus contains non-ASCII text (e.g. 'Senjō'), and
    # the platform default encoding is not guaranteed to be UTF-8.
    with open(file_name, encoding="utf-8") as handle:
        for line in handle:
            yield [tok.text for tok in tokenize(line)]

def get_tokens(file):
    """Count how often each token occurs.

    Parameters
    ----------
    file : iterable of iterables of hashable
        Sentences, each an iterable of tokens (e.g. the generator returned by
        ``load_file``).

    Returns
    -------
    dict
        Maps each distinct token to its number of occurrences, in order of
        first appearance.
    """
    tokens = {}
    for sent in file:
        for token in sent:
            # dict.get replaces the former bare ``except:``, which would also
            # have swallowed unrelated errors such as KeyboardInterrupt.
            tokens[token] = tokens.get(token, 0) + 1
    return tokens
            

In [17]:
%%time 
# Stream the tokenised training file (load_file is a generator) and count
# every distinct token string.
f = load_file(data_file)
tokens = get_tokens(f)
# Number of distinct tokens.
print(len(tokens))

33242
CPU times: user 5.45 s, sys: 4.01 ms, total: 5.45 s
Wall time: 5.48 s


In [18]:
# Keep only the distinct token strings (the dict's keys); the counts are dropped.
# NOTE: this rebinds `tokens` from a dict to a list, so the frequencies computed
# in the previous cell are no longer reachable after this cell runs.
tokens = list(tokens)

In [19]:
%%time
from collections import Counter
# Counts of adjacent character pairs, and of the character contexts around them,
# accumulated over the distinct tokens (each token contributes once, regardless
# of its corpus frequency, because `tokens` is a list of distinct strings).
pairs = Counter()
contexts = Counter()

#tokens = ['apples']

# Maximum number of characters taken on each side of a pair as its context.
CONTEXT_NGRAM_SIZE = 2

for token in tokens:
    token = token.strip()
    # Only tokens longer than 4 characters (after stripping whitespace) contribute.
    if len(token) > 4:
        # Wrap the token in start (^) and end ($) markers.
        token = f"^{token}$"        
        
        # ci runs over the positions of the original characters except the last,
        # so every counted pair consists of two original characters; the pairs
        # (^, first char) and (last char, $) are never counted.
        for ci in range(1, len(token)-2):
            pair = token[ci], token[ci+1]
            # Up to CONTEXT_NGRAM_SIZE characters following the pair (may include $).
            right_context = token[ci+2:ci+2+CONTEXT_NGRAM_SIZE]
            min_left = ci - CONTEXT_NGRAM_SIZE
            # Clamp at the start of the string; a negative slice start would
            # otherwise count from the end.
            if min_left < 0:
                min_left = 0
                
            # Up to CONTEXT_NGRAM_SIZE characters preceding the pair (may include ^).
            left_context = token[min_left:ci]
            
            pairs[pair] += 1
            # Left and right contexts share one counter, told apart by the 'l'/'r' tag.
            contexts[(left_context, 'l')] += 1
            contexts[(right_context, 'r')] += 1
        
        

CPU times: user 305 ms, sys: 0 ns, total: 305 ms
Wall time: 304 ms


In [22]:
len(pairs), len(contexts)

(1575, 2603)

In [56]:
import math
def to_probs(counts):
    """Convert raw counts into rounded natural-log relative frequencies.

    Parameters
    ----------
    counts : mapping
        Maps keys to numeric counts.

    Returns
    -------
    Counter
        Same keys as ``counts``; each value is ``log(count / sum of all counts)``
        rounded to two decimal places. Because the result is a ``Counter``,
        looking up a key that is absent yields ``0``.
    """
    total = sum(list(counts.values()))
    return Counter({
        key: round(math.log(count / total), 2)
        for key, count in counts.items()
    })

In [57]:
# Rounded log relative frequencies for the character pairs and for the contexts.
# Each counter is normalised by its own total; left ('l') and right ('r')
# contexts live in the same counter and therefore share one total.
pair_probs, context_probs = to_probs(pairs), to_probs(contexts)

In [74]:
def calc_scores(token, pair_probs, context_probs):
    """Print a score line for every character span of ``token``.

    The token is wrapped as ``^token$``. For each start position and each end
    position at or after it (the last character of the token is never inside a
    span), one line is printed containing:

    * the left context (up to ``CONTEXT_NGRAM_SIZE`` characters before the
      span start) and its value in ``context_probs`` under ``(context, 'l')``;
    * the span and the running sum of ``pair_probs`` over the adjacent
      character pairs starting inside it;
    * the right context (up to ``CONTEXT_NGRAM_SIZE`` characters starting two
      positions after the span's last character) and its value in
      ``context_probs`` under ``(context, 'r')``;
    * the sum of the three values.

    Nothing is returned. ``pair_probs`` and ``context_probs`` are indexed with
    ``[]``, so they need to tolerate missing keys (e.g. ``Counter``).
    """
    wrapped = f"^{token}$"
    stop = len(wrapped) - 2
    for start in range(1, stop):
        # The left context depends only on where the span starts.
        left = wrapped[max(start - CONTEXT_NGRAM_SIZE, 0):start]
        running = 0
        for end in range(start, stop):
            running = running + pair_probs[(wrapped[end], wrapped[end + 1])]
            right = wrapped[end + 2:end + 2 + CONTEXT_NGRAM_SIZE]

            left_score = context_probs[(left, 'l')]
            right_score = context_probs[(right, 'r')]
            overall = running + left_score + right_score

            span = wrapped[start:end + 1]
            print(f"{left}:{left_score} {span}:{running} {right}:{right_score} {overall}")
            
#             print(pair, pair_score)
#             print(left_context, left_context_score)
#             print(right_context, right_context_score)
#             print('Total Score', sum_score)
#             print('-')

In [75]:
calc_scores('apples', pair_probs, context_probs)

^:-2.61 a:-5.97 pl:-7.03 -15.61
^:-2.61 ap:-12.5 le:-5.45 -20.56
^:-2.61 app:-18.53 es:-5.0 -26.14
^:-2.61 appl:-23.12 s$:-4.16 -29.89
^:-2.61 apple:-27.270000000000003 $:-2.61 -32.49
^a:-5.74 p:-6.53 le:-5.45 -17.72
^a:-5.74 pp:-12.56 es:-5.0 -23.3
^a:-5.74 ppl:-17.15 s$:-4.16 -27.05
^a:-5.74 pple:-21.299999999999997 $:-2.61 -29.65
ap:-6.76 p:-6.03 es:-5.0 -17.79
ap:-6.76 pl:-10.620000000000001 s$:-4.16 -21.540000000000003
ap:-6.76 ple:-14.770000000000001 $:-2.61 -24.14
pp:-7.29 l:-4.59 s$:-4.16 -16.04
pp:-7.29 le:-8.74 $:-2.61 -18.64
pl:-6.81 e:-4.15 $:-2.61 -13.57


In [49]:
import math
def calc_assocs(pairs):
    """Compute two directional delta-P association scores for every joint entry.

    ``pairs`` holds two kinds of keys:

    * joint keys ``(x, y, (context_left, context_right))`` — length 3;
    * marginal keys ``(unit, (left_neighbour, right_neighbour))`` — length 2.

    For each joint key the marginals looked up are
    ``(x, (context_left, y))`` for ``x`` and ``(y, (x, context_right))`` for
    ``y``. ``total`` is the sum of *all* values in ``pairs``, marginal entries
    included.

    Parameters
    ----------
    pairs : mapping
        Counts keyed as described above (a ``Counter`` or a plain ``dict``).

    Returns
    -------
    Counter
        Maps each scored joint key to ``(delta_p, delta_p_r)`` where
        ``delta_p = P(x|y) - P(x|not y)`` and
        ``delta_p_r = P(y|x) - P(y|not x)``.
        Joint keys whose marginal count is missing/zero, or whose marginal
        equals ``total``, are skipped because a ratio would divide by zero
        (previously this raised ``ZeroDivisionError`` after a debug print).
    """
    total = sum(pairs.values())
    assocs = Counter()
    for key in pairs:
        # Keys of length <= 2 are marginal counts, not joint observations.
        if len(key) <= 2:
            continue

        x, y, context = key
        context_left, context_right = context[0], context[1]

        c_xy = pairs[key]
        c_x = pairs.get((x, (context_left, y)), 0)
        c_y = pairs.get((y, (x, context_right)), 0)

        not_x = total - c_x
        not_y = total - c_y
        # Every one of these is a denominator below.
        if c_x == 0 or c_y == 0 or not_x == 0 or not_y == 0:
            continue

        p_x_given_y = float(c_xy) / c_y
        p_x_given_not_y = float(c_x - c_xy) / float(not_y)

        p_y_given_x = float(c_xy) / c_x
        p_y_given_not_x = float(c_y - c_xy) / float(not_x)

        delta_p = p_x_given_y - p_x_given_not_y
        delta_p_r = p_y_given_x - p_y_given_not_x
        assocs[key] = delta_p, delta_p_r
    return assocs
        

In [9]:
%%time
# Character-level association scores.
# NOTE(review): calc_assocs only scores keys of length 3, i.e.
# (x, y, (context_left, context_right)), and looks up 2-tuple marginal keys of
# the form (unit, (left, right)). The `pairs` counter built in cell In [19] is
# keyed by plain (char, char) tuples, so the `pairs` used here presumably came
# from a different (earlier, no longer shown) cell — confirm before re-running.
assocs = calc_assocs(pairs)

CPU times: user 69.3 ms, sys: 3.77 ms, total: 73 ms
Wall time: 72.7 ms


In [123]:
assocs

Counter({('*', 'V', ('~', 'a')): (0.9997555910980412, 0.2647058823529412),
         ('V', 'a', ('*', 'l')): (0.9332727234809086, 0.31110915583989546),
         ('a', 'l', ('V', 'k')): (0.03843416568142205, 0.06661778775342087),
         ('l', 'k', ('a', 'y')): (0.3332844555668517, 0.038457628064378975),
         ('k', 'y', ('l', 'r')): (0.9999960897939713, 0.3333333333333333),
         ('y', 'r', ('k', 'i')): (0.043478260869565216, 0.9999569877336837),
         ('r', 'i', ('y', 'a')): (0.0388056131524167, 0.34743895990846324),
         ('i', 'a', ('r', '#')): (0.1805146631217811, 0.16474733710267558),
         ('a', '#', ('i', '~')): (0.19125127161749747, 0.9984451246345065),
         ('*', 'C', ('~', 'h')): (0.9984119192112793, 0.17647058823529413),
         ('C', 'h', ('*', 'r')): (0.9997086759812145, 0.14367816091954022),
         ('h', 'r', ('C', 'o')): (0.06662560476683033, 0.15989050909361083),
         ('r', 'o', ('h', 'n')): (0.10517710795839423, 0.2664007419820123),
         (

In [28]:
def calc_sequence(token, assocs):
    """Look up the association score of every adjacent character pair in a token.

    The token is wrapped as ``*token#``. For each adjacent pair ``(pc, c)`` the
    context is ``(character before pc, character after c)``, with ``'~'`` used
    where the context would fall outside the wrapped string.

    Parameters
    ----------
    token : str
    assocs : mapping
        Indexed with ``(pc, c, context)``; must tolerate missing keys
        (e.g. ``Counter``). Any value that is not exactly a ``tuple`` is
        replaced by ``(-10, -10)``.

    Returns
    -------
    list of tuple
        ``(pc, c, context, score)`` for each adjacent pair, in order.
    """
    wrapped = f"*{token}#"
    # Padding both ends with '~' yields the out-of-range context marker for free.
    padded = '~' + wrapped + '~'

    scored = []
    for left, pc, c, right in zip(padded, padded[1:], padded[2:], padded[3:]):
        context = (left, right)
        score = assocs[(pc, c, context)]
        if type(score) is not tuple:
            score = (-10, -10)
        scored.append((pc, c, context, score))
    return scored

In [29]:
#def format_seq_item(seq_item):
    

def break_sequence(token, assocs):
    """Split a token's scored character pairs into segments.

    The pairs come from ``calc_sequence(token, assocs)``. Walking left to
    right, a pair stays in the current segment while its first score
    (``score[0]``) is greater than or equal to the first score of the pair
    before it; a drop in that score starts a new segment.

    Parameters
    ----------
    token : str
    assocs : mapping
        Passed through to ``calc_sequence``.

    Returns
    -------
    list of list of tuple
        Segments, each a non-empty list of ``(pc, c, context, score)`` tuples.
    """
    seq_scores = calc_sequence(token, assocs)
    segments = [[seq_scores[0]]]
    for pair in seq_scores[1:]:
        # Compare against the most recently placed pair.
        prev_score = segments[-1][-1][-1][0]
        current_score = pair[-1][0]

        if current_score >= prev_score:
            segments[-1].append(pair)
        else:
            segments.append([pair])
    return segments

def get_segments(segments):
    """Turn segments of scored pairs into their string form.

    Each segment is a list of items whose element ``[0]`` is the left unit and
    element ``[1]`` the right unit of a pair (the tuples produced by
    ``break_sequence``). A segment's string is the concatenation of the right
    units of its pairs; the very first segment is additionally prefixed with
    the left unit of its first pair, which would otherwise be lost.

    Parameters
    ----------
    segments : list of list of tuple

    Returns
    -------
    list of str
        One string per segment; ``[]`` for empty input (previously an
        ``IndexError``).
    """
    if not segments:
        return []

    strings = [''.join(item[1] for item in seg) for seg in segments]
    strings[0] = segments[0][0][0] + strings[0]
    return strings

In [30]:
tokens[:10]

[' \n', ' ', '=', 'Valkyria', 'Chronicles', 'III', '\n', 'Senjō', 'no', '3']

In [32]:
from pprint import pprint
# Demo: character-level segmentation of a single word — first the raw segments
# of scored pairs, then their string form.
segments = break_sequence('cats', assocs)
pprint(segments)
get_segments(segments)

[[('*', 'c', ('~', 'a'), (0.9969485418529002, 0.1390728476821192))],
 [('c', 'a', ('*', 't'), (0.1732558846935741, 0.13064527907475618))],
 [('a', 't', ('c', 's'), (0.024020700532907387, 0.005184924865925573)),
  ('t', 's', ('a', '#'), (0.048281749988297734, 0.8279582712142167)),
  ('s', '#', ('t', '~'), (0.12052730696798493, 0.9899428128407785))]]


['*c', 'a', 'ts#']

In [137]:
#%%time
from collections import Counter
# Second-level counts: the units are now the character-level segments of each
# token rather than single characters. Key shapes match what calc_assocs reads:
#   (x, y, (context_left, context_right))  -> joint count of adjacent segments
#   (unit, (left_neighbour, right_neighbour)) -> marginal count of one segment
c_pairs = Counter()
for token in tokens:
    token = token.strip()
    # Same length filter as the character-level pass: more than 4 characters.
    if len(token) > 4:
        segments = get_segments(break_sequence(token,assocs))
        # A token that did not split has no adjacent segments to count.
        if len(segments) >= 2:
            # Marginal for the first segment: it has no left neighbour ('~') and
            # is never the right-hand unit of a pair, so the loop below would
            # not count it.
            context = ('~', segments[1])
            #pairs[('*', context)] += 1
            c_pairs[(segments[0],context)] += 1
            for ci in range(len(segments)-1):
                pc, c = segments[ci], segments[ci+1]
                # '~' marks a context position outside the segment list.
                context_left = segments[ci - 1] if ci -1 >= 0 else '~'
                context_right = segments[ci + 2] if ci + 2 <= len(segments) - 1 else '~'
                context = (context_left, context_right)
                # Joint count for the adjacent segments (pc, c) in this context.
                c_pairs[(pc,c,context)] += 1
                # Marginal for the right-hand segment, keyed by its own neighbours.
                c_pairs[(c, (pc, context_right))] += 1
                #c_pairs[(c,)] += 1
                #c_pairs[(c,)] += 1


In [138]:
# Segment-level association scores, computed from the segment-level counts.
c_assocs = calc_assocs(c_pairs)

In [139]:
def c_calc_sequence(token, assocs, c_assocs):
    """Score every adjacent pair of character-level segments of a token.

    The token is first segmented with ``get_segments(break_sequence(token,
    assocs))``. For each adjacent segment pair ``(pc, c)`` the context is
    ``(segment before pc, segment after c)``, with ``'~'`` where no such
    segment exists.

    Parameters
    ----------
    token : str
    assocs : mapping
        Character-level scores, passed to ``break_sequence``.
    c_assocs : mapping
        Segment-level scores, indexed with ``(pc, c, context)``; must tolerate
        missing keys. Any value that is not exactly a ``tuple`` is replaced by
        ``(-10, -10)``.

    Returns
    -------
    list
        ``(pc, c, context, score)`` tuples, one per adjacent segment pair.
        If there are no adjacent pairs (fewer than two segments), the list of
        segment strings itself is returned instead.
    """
    pieces = get_segments(break_sequence(token, assocs))
    last = len(pieces) - 1

    scored = []
    for idx, (pc, c) in enumerate(zip(pieces, pieces[1:])):
        left = pieces[idx - 1] if idx > 0 else '~'
        right = pieces[idx + 2] if idx + 2 <= last else '~'
        context = (left, right)

        score = c_assocs[(pc, c, context)]
        if type(score) is not tuple:
            score = (-10, -10)
        scored.append((pc, c, context, score))

    return scored if scored else pieces


def c_break_sequence(token, assocs, c_assocs, threshold=0.4):
    """Merge a token's character-level segments into larger segments.

    The scored segment pairs come from ``c_calc_sequence``. Walking left to
    right, a pair is appended to the current group when its first score
    (``score[0]``) is at least ``threshold``; otherwise it starts a new group.

    Parameters
    ----------
    token : str
    assocs : mapping
        Character-level scores, passed to ``c_calc_sequence``.
    c_assocs : mapping
        Segment-level scores, passed to ``c_calc_sequence``.
    threshold : float, optional
        Minimum first score for a pair to stay in the current group.
        Defaults to ``0.4``, the value that used to be hard-coded.

    Returns
    -------
    list of list of tuple
        Groups of ``(pc, c, context, score)`` tuples, suitable for
        ``get_segments``.
    """
    seq_scores = c_calc_sequence(token, assocs, c_assocs)
    if not seq_scores:
        return []

    # c_calc_sequence returns plain segment strings when the token did not
    # split into at least two segments. Treating such a string as a scored pair
    # made get_segments index into its characters and emit a truncated result.
    # Wrap each string as a placeholder pair with an empty left unit so that
    # get_segments reproduces it unchanged.
    if not isinstance(seq_scores[0], tuple):
        return [[('', seg, ('~', '~'), (-10, -10))] for seg in seq_scores]

    segments = [[seq_scores[0]]]
    for pair in seq_scores[1:]:
        current_score = pair[-1][0]
        if current_score >= threshold:
            segments[-1].append(pair)
        else:
            segments.append([pair])
    return segments


In [140]:
c_calc_sequence('computerize', assocs, c_assocs)

[('*c', 'om', ('~', 'pu'), (0.9996466614705118, 0.06422018348623854)),
 ('om', 'pu', ('*c', 't'), (0.875, 0.9999965358967697)),
 ('pu', 't', ('om', 'e'), (0.7999861436830784, 0.49999653588476967)),
 ('t', 'e', ('pu', 'r'), (0.01407064755664833, 0.1997575144538706)),
 ('e', 'r', ('t', 'iz'), (0.9997575178137806, 0.014084507042253521)),
 ('r', 'iz', ('e', 'e#'), (0.25, 0.9999896079063049)),
 ('iz', 'e#', ('r', '~'), (0.14285714285714285, 0.9999168623864652))]

In [145]:
# Demo: segment-level grouping of a single word — first the groups of scored
# segment pairs, then their string form.
segs = c_break_sequence('Battalions', assocs, c_assocs)
pprint(segs)
get_segments(segs)

[[('*B', 'at', ('~', 't'), (0.9999757516973812, 0.2222222222222222)),
  ('at', 't', ('*B', 'a'), (1.0, 1.0))],
 [('t', 'a', ('at', 'l'), (0.0273972602739726, 0.9997540529305806)),
  ('a', 'l', ('t', 'io'), (0.9997540529305806, 0.0273972602739726))],
 [('l', 'io', ('a', 'ns#'), (0.25, 0.9999792157406124))],
 [('io', 'ns#', ('l', '~'), (0.07920792079207921, 0.9996778372835794))]]


['*Batt', 'al', 'io', 'ns#']

In [143]:
# For the first 3000 distinct tokens longer than 5 characters, map the token to
# (segment-level segmentation, character-level segmentation) for comparison.
# NOTE: the filter here is len(x) > 5 on the unstripped token, whereas the
# counting cells use len(token.strip()) > 4.
all_segs = {x : (get_segments(c_break_sequence(x, assocs, c_assocs)), get_segments(break_sequence(x, assocs))) for x in tokens[:3000] if len(x) > 5}

In [144]:
pprint(all_segs)

{'Abraham': (['*Abra', 'h', 'a', 'm#'], ['*A', 'br', 'a', 'h', 'a', 'm#']),
 'According': (['*Acco', 'r', 'ding#'], ['*A', 'cc', 'o', 'r', 'ding#']),
 'Action': (['*Action#'], ['*A', 'ction#']),
 'Adaptations': (['*Adapt', 'atio', 'ns#'], ['*A', 'dap', 't', 'atio', 'ns#']),
 'Administration': (['*Adm', 'i', 'n', 'i', 's', 'tr', 'ation#'],
                    ['*A', 'dm', 'i', 'n', 'i', 's', 't', 'r', 'ation#']),
 'Advertiser': (['*Adve', 'r', 'ti', 's', 'e', 'r#'],
                ['*A', 'dve', 'r', 't', 'i', 's', 'e', 'r#']),
 'Africa': (['*Afri', 'ca#'], ['*A', 'fr', 'i', 'c', 'a#']),
 'African': (['*Afri', 'ca', 'n#'], ['*A', 'fr', 'i', 'c', 'a', 'n#']),
 'Allies': (['*Al', 'l', 'i', 'es#'], ['*A', 'l', 'l', 'i', 'es#']),
 'Alongside': (['*Alongsid', 'e#'], ['*A', 'lo', 'n', 'g', 'sid', 'e#']),
 'Alphabet': (['*Alph', 'a', 'be', 't#'],
              ['*A', 'lp', 'h', 'a', 'b', 'e', 't#']),
 'Although': (['*Al', 't', 'h', 'o', 'ugh#'],
              ['*A', 'l', 't', 'h', 'o', 'ug', '