In [25]:
## Imports
import csv
import sys
import numpy as np
import pickle
from time import time
import re
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import SelectPercentile, SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.svm import LinearSVC
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn import metrics, utils

In [2]:
# Main functions

def generate_data(file_name, features, continuous_feats):
    break_label = 'word_tobi_break_index'
    break_set = set(["4", "4-", "4p"])
    x_data = []
    y_data = []

    with open(file_name, 'r') as f:
        reader = csv.DictReader(f) # DictReader fixes off-by-one error from before
        for i, l in enumerate(reader):
            feats = {feat: l[feat] for feat in features}
            # convert some to continuous features
            for feat in continuous_feats:
                feats[feat] = float(feats[feat])
            x_data.append(feats)
            label = l[break_label] in break_set
            y_data.append(label)
    return x_data, y_data

def classify_my_model(pipeline, param_grid, X, y, model_name, scorer='f1'):
    print('#'*35, model_name, '#'*35)
    folds = StratifiedKFold(n_splits=3, shuffle=True, random_state=int(time()))
#     param_grid['selector__percentile'] = [10, 25, 50, 90, 95, 100]
    
    gs = GridSearchCV(pipeline,
                      param_grid,
                      scoring=scorer,
                      cv=5,
                      n_jobs=-1,
                      verbose=1)
    t0 = time()
    gs.fit(X, y)
    train_time = time() - t0
    print("Train time: %0.3fs" % train_time)
#     print("Real train time: %0.3fs" % (train_time * (TOTAL_COUNT/DEV_COUNT)))
    print("Best score: %0.3f" % gs.best_score_)
    best_params = gs.best_estimator_.get_params()
    for param_name in sorted(param_grid.keys()):
        print("\t%s: %r" % (param_name, best_params[param_name]))
    return gs.best_score_, gs.best_estimator_
    
#     with open(model_name+'_best_'+type(clf).__name__+".pkl", 'wb') as handle:
#         pickle.dump(best_model, handle)

In [3]:
# list of classifier models

from sklearn.metrics import fbeta_score, make_scorer
ftwo_scorer = make_scorer(fbeta_score, beta=2)
dict_vectorizer = DictVectorizer()
# select_percentile = SelectPercentile(percentile=100)

clf_map = [
    (
        MultinomialNB(),
        {
            'clf__alpha': [.001, .01, .1, 1],
            'clf__fit_prior': [True, False],
        }
    ),
    (
        LinearSVC(),
        {
            'clf__C': [.1, 1, 10, 100],
            'clf__penalty': ['l2'],
            'clf__loss': ['hinge', 'squared_hinge'],
        }
    ),
    (
        LogisticRegression(),
        {
            'clf__penalty': ['l1','l2'],
            'clf__fit_intercept': [True, False],
            'clf__C':[.1, 1, 10, 100],
        }
    ),
    (
        RandomForestClassifier(),
        {
            'clf__n_estimators': [10, 20],
            'clf__max_features': ["auto", "log2",None]
        }
    )
        

]


# Adding POS features

In [4]:
def add_dist_end_turn(x_data):
    # distance from end of turn
    DIST_END_TURN = "DIST_END_TURN"
    for i in range(len(x_data)):
        x_data[i][DIST_END_TURN] = int(x_data[i]['total_number_of_words_in_turn']) - int(x_data[i]['word_number_in_turn'])

In [5]:
def add_pos_bigram(x_data, left=True, right=True):
    POS_TURN_BIGRAM_LEFT = "POS_TURN_BIGRAM_LEFT"
    POS_TURN_BIGRAM_RIGHT = "POS_TURN_BIGRAM_RIGHT"
    
    def add_left_bigram(x_data):
        if x_data[i]['word_number_in_turn'] == '1':
            x_data[i][POS_TURN_BIGRAM_LEFT] = 'BEGIN/'+x_data[i]['Stanford_PoS']
        else:
            x_data[i][POS_TURN_BIGRAM_LEFT] = x_data[i-1]['Stanford_PoS']+"/"+x_data[i]['Stanford_PoS']
    def add_right_bigram(x_data):
        if x_data[i]['word_number_in_turn'] == x_data[i]['total_number_of_words_in_turn']:
            x_data[i][POS_TURN_BIGRAM_RIGHT] = x_data[i]['Stanford_PoS']+"/END"
        else:
            x_data[i][POS_TURN_BIGRAM_RIGHT] = x_data[i]['Stanford_PoS']+"/"+x_data[i+1]['Stanford_PoS']
            
    if not left: add_left_bigram = lambda x: None
    if not right: add_right_bigram = lambda x: None
        
    for i in range(len(x_data)):
        add_left_bigram(x_data)
        add_right_bigram(x_data)

In [6]:
def add_is_stutter(x_data):
    IS_STUTTER = "IS_STUTTER"
    for i in range(len(x_data)):
        x_data[i][IS_STUTTER] = x_data[i]['word'][-1] == '-'

In [7]:
def add_pos_trigram(x_data):
    POS_TURN_TRIGRAM = "POS_TURN_TRIGRAM"
    for i in range(len(x_data)):
        # to the left
        left = "BEGIN"
        if x_data[i]['word_number_in_turn'] != '1':
            left = x_data[i-1]['Stanford_PoS']
        # to the right
        right = "END"
        if x_data[i]['word_number_in_turn'] != x_data[i]['total_number_of_words_in_turn']:
            right = x_data[i+1]['Stanford_PoS']
        x_data[i][POS_TURN_TRIGRAM] = left+"/"+x_data[i]['Stanford_PoS']+"/"+right

In [43]:
features = [
    'word',
#     'word_pos_tag',
#     'word_pos_tag_simplified',
    'word_number_of_syllables',
    'word_number_in_turn',
    'word_number_in_task',
    'total_number_of_words_in_turn',
    'total_number_of_words_in_task',
    'Stanford_PoS',
    'syntactic_function',
    'Most_Recent_Mention_Syntactic_Function',
    'Recent_Explicit_Mention_Syntactic_Function',
    'Recent_Implicit_Mention_Syntactic_Function',
    'Most_Recent_Mention_PoS',
    'Recent_Explicit_Mention_PoS',
    'Recent_Implicit_Mention_PoS',
#     'parse_tree'
]

continuous_feats = []

x_data, y_data = generate_data("../info-status/games-data-20180323.csv", features, continuous_feats)

# ptree = ParentedTree.fromstring(tree)

In [28]:
def balanced_subsample(x,y,subsample_size=1.0):
    x, y = np.array(x), np.array(y)
    class_xs = []
    min_elems = None

    for yi in np.unique(y):
        elems = x[(y == yi)]
        class_xs.append((yi, elems))
        if min_elems == None or elems.shape[0] < min_elems:
            min_elems = elems.shape[0]

    use_elems = min_elems
    if subsample_size < 1:
        use_elems = int(min_elems*subsample_size)

    xs = []
    ys = []

    for ci,this_xs in class_xs:
        if len(this_xs) > use_elems:
            utils.shuffle(this_xs, random_state=2557)

        x_ = this_xs[:use_elems]
        y_ = np.empty(use_elems)
        y_.fill(ci)

        xs.append(x_)
        ys.append(y_)

    xs = np.concatenate(xs)
    ys = np.concatenate(ys)

    return xs,ys


In [40]:
x_new, y_new = balanced_subsample(x_data, y_data, 1)
print(np.unique(y_new, return_counts=True))
print(np.unique(y_data, return_counts=True))
x_data, y_data = x_new, y_new

False
True
(array([0., 1.]), array([12245, 12245]))
(array([False,  True]), array([58081, 12245]))


In [None]:
add_dist_end_turn(x_data)
add_pos_bigram(x_data)
add_is_stutter(x_data)
add_pos_trigram(x_data)
models = {}
X_train, X_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=2557)
for (clf, param_grid) in clf_map:
    pipeline = Pipeline([
        ('dictvec', dict_vectorizer),
    #         ('selector', select_percentile),
        ('clf', clf)
    ])

    best_score, best_model = classify_my_model(pipeline, param_grid, X_train, y_train, clf.__class__.__name__)
    y_pred = best_model.predict(X_test)
    models[clf.__class__.__name__] = best_model
    print(metrics.classification_report(y_test, y_pred))

################################### MultinomialNB ###################################
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   59.9s finished


Train time: 63.724s
Best score: 0.582
	clf__alpha: 0.1
	clf__fit_prior: True
             precision    recall  f1-score   support

      False       0.94      0.84      0.89     11618
       True       0.49      0.73      0.59      2448

avg / total       0.86      0.82      0.83     14066

################################### LinearSVC ###################################
Fitting 5 folds for each of 8 candidates, totalling 40 fits


Process ForkPoolWorker-80:
Traceback (most recent call last):
  File "/usr/lib/python3.4/multiprocessing/process.py", line 254, in _bootstrap
    self.run()
  File "/usr/lib/python3.4/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.4/multiprocessing/pool.py", line 108, in worker
    task = get()
  File "/usr/local/lib/python3.4/dist-packages/sklearn/externals/joblib/pool.py", line 362, in get
    return recv()
  File "/usr/lib/python3.4/multiprocessing/connection.py", line 250, in recv
    buf = self._recv_bytes()
  File "/usr/lib/python3.4/multiprocessing/connection.py", line 420, in _recv_bytes
    return self._recv(size)
  File "/usr/lib/python3.4/multiprocessing/connection.py", line 383, in _recv
    chunk = read(handle, remaining)
KeyboardInterrupt


#  Adding syntactic features

In [242]:
tree = ParentedTree.fromstring('''
(ROOT
  (VP
    (ADJP
      (JJ okay)
      (SBAR
        (S
          (NP (PRP it))
          (VP
            (VBZ 's)
            (NP
              (NP (DT a) (DT an) (NN iron))
              (PP
                (IN on)
                (NP
                  (NP (JJ top))
                  (PP (IN with) (NP (DT a) (JJ green) (NN lime))))))))))
    (PP
      (IN on)
      (NP
        (NP (DT the) (JJ bottom) (NN left))
        (CC and)
        (NP (DT a) (NN nail))))))''')
row = x_data[110]
i = 20
for i in range(0,19):
    tree_location = tree.leaf_treeposition(i)
    print(tree[tree_location[:-1]])

(JJ okay)
(PRP it)
(VBZ 's)
(DT a)
(DT an)
(NN iron)
(IN on)
(JJ top)
(IN with)
(DT a)
(JJ green)
(NN lime)
(IN on)
(DT the)
(JJ bottom)
(NN left)
(CC and)
(DT a)
(NN nail)


In [227]:
from nltk.tree import ParentedTree

def parse_tree_depth(index, tree):
    '''
    :param int index: word number in turn
    :param ParentedTree tree: parse tree
    :returns: depth of current node
    '''
    tree_location = tree.leaf_treeposition(int(index) -1)
    return len(tree_location)

def parent_pos(index, tree):    
    tree_location = tree.leaf_treeposition(int(index) -1)
    parent_tree = tree[tree_location[:-2]] # 2 levels up, since 1 level up is the tag
    return parent_tree.label()

def get_index(row, tree):
    word = row['word']
    leaves = tree.leaves()
    if leaves.count(word) == 1: # no need to disambiguate
        return leaves.index('word')
    

def add_word_depth(x_data):
    for row in x_data:
        i = row['word_number_in_turn']
        tree_string = row['parse_tree']
        if not tree_string: continue
        tree = ParentedTree.fromstring(tree_string)
        row['parse_tree_depth'] = parse_tree_depth(i, tree)
        row['parent_pos'] = parent_pos(i, tree)
add_word_depth(x_data)
print(x_data[0])
# for i in range(1,27):
#     print(parent_pos(i, ptree))
#     print(x_data[i]['Stanford_PoS'])
#     print()

(ROOT (NP (NN yup)))
(ROOT
  (FRAG
    (ADJP (JJ okay))
    (S
      (VP
        (VBG looking)
        (S
          (VP
            (VBG looking)
            (PP
              (IN alright)
              (NP
                (NP (DT the) (JJS closest))
                (SBAR
                  (S
                    (NP (PRP I))
                    (VP
                      (VBD got)
                      (S (NP (POS 's)))
                      (NP
                        (DT the)
                        (JJ lemon)
                        (JJ alright)
                        (NN match))
                      (SBAR
                        (IN that)
                        (S
                          (NP (CD one) (NN aw))
                          (VP
                            (VBD got)
                            (S
                              (NP (PRP it))
                              (ADJP (JJ wrong))
                              (SBAR
                                (WHNP (WP what

IndexError: index must be less than or equal to len(self)

In [None]:
print

In [29]:
SIZE_SUBTREE = "SIZE_SUBTREE"
NUM_SIBLINGS = "NUM_SIBLINGS"
# Because we can't assume that every turn contains a fully connected dependency parse, we can treat distance as either 1, 2, or 3 (meaning 2 or more)
# dist = 0 when this term is last term
# dist = 1 when next term is head or a child of current term
# else dist = 2 when there is overlap between (current term's head and children) and (next term's head and children)
# else dist = 3
ARC_DIST_NEXT = "ARC_DIST_NEXT"
FUNC = "FUNC"
def subtree_size(data):
    return sum([subtree_size(child) for child in token.children])
def num_siblings(token):
    return len(list(token.head.children))
def arc_dist(token, next_token):
    if not next_token: return 0
    elif next_token.text == token.head.text or next_token.text in [a.text for a in token.children]: return 1
    elif any([a.text in [b.text for b in list(token.children)+[token.head]] for a in list(next_token.children)+[next_token.head]]): return 2
    else: return 3
def get_func(token):
    dep = token.dep_.upper()
    if "SUBJ" in dep or "OBJ" in dep:
        return dep
    else: return ''

def add_syntactic_features(x_data):
    # set these features to 0 by default
    for i in range(len(x_data)):
        x_data[i][SIZE_SUBTREE] = '-1'
        x_data[i][NUM_SIBLINGS] = '-1'
        x_data[i][ARC_DIST_NEXT] = '-1'
        x_data[i][FUNC] = ''
    
    t0 = time()
    # # w = [0,2,3,9,12,22,23,49,54,65,70,96,98]
    # turns = []
    # for i in range(23,60):
    for i in range(len(x_data)):
        if x_data[i]['word_number_in_turn'] == '1':
            len_turn = int(x_data[i]['total_number_of_words_in_turn'])

            # figure out the current turn
            turn = ' '.join([word['word'] for word in x_data[i:i+len_turn]])
            doc = nlp(turn)
            for j, token in enumerate(doc):
    #             print(token)
                next_token = doc[i+1] if i+1<len(doc) else None
                x_data[i+j][SIZE_SUBTREE] = str(subtree_size(token))
                x_data[i+j][NUM_SIBLINGS] = str(num_siblings(token))
                x_data[i+j][ARC_DIST_NEXT] = str(arc_dist(token, next_token))
                x_data[i+j][FUNC] = str(get_func(token))

    train_time = time() - t0
    print("Time to generate syntactic feature data: %0.3fs" % train_time)

In [92]:
from spacy import displacy

doc = nlp(turns[2])
for token in doc:
    print(token.text, token.dep_, token.head.text, token.head.pos_,
          [child for child in token.children])
displacy.render(doc, style='dep', jupyter=True)

okay ROOT okay INTJ []
so advmod have VERB []
I nsubj have VERB []
have ROOT have VERB [so, I, carrots]
a det carrots NOUN []
shoe compound carrots NOUN []
carrots dobj have VERB [a, shoe, and, racket]
and cc carrots NOUN []
a det racket NOUN []
tennis compound racket NOUN []
racket conj carrots NOUN [a, tennis]


In [30]:
features = [
    'word',
#     'word_pos_tag',
#     'word_pos_tag_simplified',
    'word_number_of_syllables',
    'word_number_in_turn',
    'word_number_in_task',
    'total_number_of_words_in_turn',
    'total_number_of_words_in_task',
    'Stanford_PoS',
    'syntactic_function',
    'Most_Recent_Mention_Syntactic_Function',
    'Recent_Explicit_Mention_Syntactic_Function',
    'Recent_Implicit_Mention_Syntactic_Function',
    'Most_Recent_Mention_PoS',
    'Recent_Explicit_Mention_PoS',
    'Recent_Implicit_Mention_PoS',
    'parse_tree'
]

continuous_feats = []

x_data, y_data = generate_data("big-table-PoS.csv", features, continuous_feats)
add_dist_end_turn(x_data)
add_is_stutter(x_data)
add_pos_bigram(x_data)
add_pos_trigram(x_data)
add_syntactic_features(x_data)

Time to generate syntactic feature data: 103.587s


In [36]:
l = set([])
for i, x in enumerate(x_data):
    if x['FUNC'] == '-1':
        print(i, x['word'])
print(l)

196 number
197 nine
315 alright
320 the
321 closest
322 I
323 got's
324 the
325 l-
326 lemon
327 alright
328 match
329 that
330 one
331 aw
332 got
333 it
334 wrong
335 what
336 was
337 on
338 the
339 top
340 a
341 blue
342 li-
371 it
374 on
375 top
376 and
377 the
378 lemon
379 on
380 the
381 left
382 and
383 the
384 eye
385 on
386 the
387 right
408 and
409 a
410 lemon
447 the
448 bottom
449 right
450 I
451 guess
467 eye
468 on
469 the
470 right
952 red
953 airplane
959 yep
1017 sheep
1018 a
1019 lamb
1020 him
1021 on
1022 top
1023 and
1048 moon
1049 and
1050 an
1051 iron
1052 to
1053 a
1054 knee
1055 yeah
1056 under
1077 moon
1151 moon
1152 now
1175 and
1176 a
1177 yellow
1178 moon
1179 rhinoceros
1180 on
1189 top
1190 and
1191 a
1192 ruler
1203 uh
1204 underneath
1205 it
1206 to
1207 the
1208 right
1209 and
1210 that's
1211 it
1304 and
1305 a
1306 ruler
1325 that
1326 is
1327 ear
1328 ruler
1431 right
1469 do
1470 better
1481 uh?
1482 huh?
1483 huh?
1563 past
1564 one
1646 bottom
182

26624 the
26625 the
26643 image
26644 o-
26645 of
26646 the
26647 iron
26648 is
26683 ear
26684 on
26685 the
26686 right
26687 the
26695 off
26696 center
26697 you
26698 know
26738 of
26739 the
26740 cord
26757 is
26758 in
26759 line
26760 with
26761 the-
26763 with
26764 the
26765 right
26766 end
26771 of
26772 the
26773 mermaid
26815 in
26816 line
26817 wi-
26818 with
26819 the
26820 left
26821 end
26826 of
26827 the
26828 mermaid
26855 parallel
26856 to
26857 the
26858 M
26859 and
26860 M
27013 of
27014 that
27015 moon
27017 lime
27022 is
27023 between
27063 our
27064 right
27093 you
27094 know
27095 that-
27096 that
27097 little
27118 touching
27119 that
27120 and
27121 to
27122 the
27123 left
27124 of
27126 the
27127 onion
27128 parallel
27130 with
27131 the
27132 onion
27241 w-
27242 w-
27243 left
27244 edge
27245 of
27246 the
27247 nuns
27248 habit
27265 on
27266 the
27267 picture
27268 frame
27276 in
27277 that
27278 o-
27279 it's
27280 o-
27322 frame
27490 um
27494 above
27495

In [19]:

X_train, X_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=2557)
for (clf, param_grid) in clf_map:
    pipeline = Pipeline([
        ('dictvec', dict_vectorizer),
        ('clf', clf)
    ])

    best_score, best_model = classify_my_model(pipeline, param_grid, X_train, y_train, clf.__class__.__name__)
    y_pred = best_model.predict(X_test)
    print(metrics.classification_report(y_test, y_pred))

Time to generate syntactic feature data: 91.291s
################################### MultinomialNB ###################################
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   12.9s finished


Train time: 15.182s
Best score: 0.703
	clf__alpha: 0.01
	clf__fit_prior: True
             precision    recall  f1-score   support

      False       0.91      0.86      0.89      6787
       True       0.66      0.77      0.71      2341

avg / total       0.85      0.84      0.84      9128

################################### LinearSVC ###################################
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  1.2min finished


Train time: 81.181s
Best score: 0.753
	clf__C: 0.1
	clf__loss: 'squared_hinge'
	clf__penalty: 'l2'
             precision    recall  f1-score   support

      False       0.90      0.96      0.93      6787
       True       0.84      0.69      0.76      2341

avg / total       0.88      0.89      0.88      9128

################################### LogisticRegression ###################################
Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   21.3s
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:  1.3min finished


Train time: 80.035s
Best score: 0.754
	clf__C: 1
	clf__fit_intercept: True
	clf__penalty: 'l1'
             precision    recall  f1-score   support

      False       0.90      0.95      0.92      6787
       True       0.83      0.69      0.75      2341

avg / total       0.88      0.88      0.88      9128



In [16]:
# Find most significant features
model = pickle.load(open('syntactic_best_LogisticRegression.pkl', 'rb'))


FileNotFoundError: [Errno 2] No such file or directory: 'syntactic_best_LogisticRegression.pkl'

In [None]:
model.steps

In [None]:
def show_most_informative_features(vectorizer, clf, n=20):
    feature_names = vectorizer.get_feature_names()
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (coef_1, fn_1, coef_2, fn_2))
        
def show_least_informative_features(vectorizer, clf, n=20):
    feature_names = vectorizer.get_feature_names()
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names), key=lambda x: abs(x[0]))
    top = coefs_with_fns[:n]
    for (coef_1, fn_1) in top:
        print("\t%.4f\t%-15s" % (coef_1, fn_1))

In [None]:
show_most_informative_features(model.steps[0][1], model.steps[2][1])

In [None]:
show_least_informative_features(model.steps[0][1], model.steps[2][1], n=100)

In [None]:
len(model.steps[2][1].coef_[0])

In [None]:
from collections import defaultdict
def show_feature_importance(vectorizer, clf):
    feature_names = vectorizer.get_feature_names()
    coefs_with_fns = zip(clf.coef_[0], feature_names)
    feat_count = defaultdict(list)
    for (coef_1, fn_1) in coefs_with_fns:
        feat = fn_1.split('=')[0]
        feat_count[feat].append(abs(coef_1))
    for feat in feat_count.keys():
        print(feat, sum(feat_count[feat])/len(feat_count[feat]))
    return feat_count

In [None]:
feat_count = show_feature_importance(model.steps[0][1], model.steps[2][1])

In [None]:
import matplotlib.pyplot as plt

In [None]:
for feat in feat_count.keys():
    hist, bins = np.histogram(feat_count[feat], bins=int(len(feat_count[feat])**0.5))
    width = 0.7 * (bins[1] - bins[0])
    center = (bins[:-1] + bins[1:]) / 2
    plt.bar(center, hist, align='center', width=width)
    plt.title(feat)
    plt.show()