In [125]:
from sklearn.ensemble import RandomForestClassifier
import glob
from nltk import word_tokenize
from nltk import sent_tokenize
import pickle
import codecs

In [185]:
PATH_TO_CORPUS = '../segmenter/training-files/*.txt'  # corpus before it has been tokenized/tagged
files = glob.glob(PATH_TO_CORPUS)
fin_pun = {'.': 0, '!': 1, '?': 3, 'OTHER': 4}

In [127]:
def segment(file):
    """Segment a text file into paragraphs, ignoring those that are too short (1 sentence only)"""
    with codecs.open(file, "r",encoding='utf-8', errors='ignore') as f:
        text = f.read()
        paragraphs = text.split('\n')
    return [sentences for sentences in text.split('\n') if  ('.' in sentences and sentences.strip()[-1] != '.') or sentences.count('.') > 1]

In [186]:
def _get_vec(paragraph):
    """Get training vector for a single paragraph"""
    sentences = sent_tokenize(paragraph)
    ret = []
    cumm_Ds = 0
    cumm_Dw = 0
    Fin_Pun = 0
    for sentence in sentences:
        delta_s = 1
        delta_w = len(word_tokenize(sentence))
        fp = sentence.strip()[-1]
        if fp in fin_pun:
            fp = fin_pun[fp]
        else:
            fp = fin_pun['OTHER']
        ret.append({
            'D_s': cumm_Ds + delta_s,
            'D_w': cumm_Dw + delta_w,
            'Length': delta_w,
            'EndOfP': 0,
            'FinPun': fp
        })
        cumm_Ds += delta_s
        cumm_Dw += delta_w
    ret[-1]['EndOfP'] = 1
    return ret

In [187]:
def make_training_vectors(paras):
    """
    Convert a list of paragraphs to a training vector
    
    The features come from http://homepages.inf.ed.ac.uk/mlap/Papers/emnlp04.pdf.
    """
    #total_length = sum([len(sent_tokenize(para)) for para in paras])
    X = []
    Y = []
    for para in paras:
        vecs = _get_vec(para)
        for vec in vecs:
            X.append([vec['D_s'], vec['D_w'], vec['Length'], vec['FinPun']])
            Y.append([vec['EndOfP']])
    return (X, Y)

In [80]:
def flatten(l):
    return [item for sublist in l for item in sublist]

In [229]:
X_train = []
Y_train = []
X_val = []
Y_val = []
for i in range(600):
    paras = segment(files[i])
    x, y = make_training_vectors(paras)
    if i % 5 != 0:
        X_train += x
        Y_train.append(y)
    else:
        X_val += x
        Y_val.append(y)
Y_train = flatten(flatten(Y_train))
Y_val = flatten(flatten(Y_val))


In [230]:
clf = RandomForestClassifier(n_estimators=10)
clf = clf.fit(X_train, Y_train)

In [231]:
# Validation Accuracy
scores = clf.predict(X_val)
accuracy = sum(Y_val[i]==scores[i] for i in range(len(Y_val)))/len(Y_val)
# Within 1 sentence for paragraph split
p_split_accuracy = sum(scores[i-1] + scores[i] + scores[i+1] > 0 for i in range(len(Y_val)-1) if Y_val[i]==1)/(len([1 for val in Y_val if val ==1])-2)
print(accuracy)
print(p_split_accuracy)

0.750519441971
0.601703940362


In [233]:
with open('paragraph_model.pickle', 'wb') as handle:
    pickle.dump(clf, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [235]:
def segment_text(model, paras, max_splits=4):
    """Recursively segment the text into paragraphs."""
    if max_splits == 0:
        return paras
    for i in range(len(paras)):
        x_pred, _ = make_training_vectors([paras[i]])
        splits = model.predict(x_pred)
        # Only retain first split
        if 1 in splits[:-1]:
            # Not terminating
            sentences = sent_tokenize(paras[i])
            split_ix = list(splits).index(1)
            new_p = [' '.join(sentences[:split_ix]), ' '.join(sentences[split_ix:])]
            new_ps = [p for p in paras[:i]] + new_p + [p for p in paras[i+1:]]
            return segment_text(model, new_ps, max_splits-1)
    return paras
    

In [227]:
test_text = """Motorola (which has been known as Lenovo Moto, but is now back to Motorola again) is the only brand left embracing a modular design. Need more battery? Snap on a Moto Mod battery pack. Want to up your entertainment game? Try on the speaker or projector Mod. With Google's Project Ara folded and LG's G5 experiment failed, the modular Moto Z could help decide Motorola's fate as a brand. The Moto Z family currently consists of the Moto Z, Moto Z Play, and Moto Z Force. In addition to the phones, Motorola has pledged to keep releasing Moto Mods at a rate of at least 12 a year. A next-gen Moto Z2 would help keep the Mods alive. Besides the rumored name, we haven't heard much else about the Moto Z2. The original Moto Z debuted last July, so we could be learn more about the phone in the next few months if the Moto Z follows an annual release schedule. In the meantime, there are plenty of new Moto Mods to keep your current Moto Z feeling fresh."""

In [238]:
segment_text(clf, [test_text], 8)

['Motorola (which has been known as Lenovo Moto, but is now back to Motorola again) is the only brand left embracing a modular design.',
 'Need more battery? Snap on a Moto Mod battery pack. Want to up your entertainment game?',
 "Try on the speaker or projector Mod. With Google's Project Ara folded and LG's G5 experiment failed, the modular Moto Z could help decide Motorola's fate as a brand. The Moto Z family currently consists of the Moto Z, Moto Z Play, and Moto Z Force. In addition to the phones, Motorola has pledged to keep releasing Moto Mods at a rate of at least 12 a year. A next-gen Moto Z2 would help keep the Mods alive. Besides the rumored name, we haven't heard much else about the Moto Z2. The original Moto Z debuted last July, so we could be learn more about the phone in the next few months if the Moto Z follows an annual release schedule. In the meantime, there are plenty of new Moto Mods to keep your current Moto Z feeling fresh."]

In [244]:
import copy
a = {'a': {'b': 1}}
b = copy.deepcopy(a)
b['a']['b'] = 2
a

{'a': {'b': 1}}