In [1]:
## Imports
import csv
import sys
import numpy as np
import pickle
from time import time
import re
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import SelectPercentile, SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.cross_validation import StratifiedKFold
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.grid_search import GridSearchCV
from sklearn import metrics
import spacy
nlp = spacy.load('en')



In [13]:
best_score = 0
best_model = None

dict_vectorizer = DictVectorizer()
select_percentile = SelectPercentile(percentile=100)
clf_map = [
    (
        MultinomialNB(),
        {
            'clf__alpha': [.001, .01, .1, 1],
            'clf__fit_prior': [True, False],
        }
    ),
    (
        LinearSVC(),
        {
            'clf__C': [1, 10, 100],
            'clf__penalty': ['l2'],
            'clf__loss': ['hinge', 'squared_hinge'],
        }
    ),
    (
        LogisticRegression(),
        {
            'clf__penalty': ['l1','l2'],
            'clf__fit_intercept': [True, False],
            'clf__C':[1, 10, 100],
        }
    ),

]

def classify_my_model(clf, param_grid, model_name):
    global best_score, best_model
    print('###################################',type(clf),'#########################################')
    folds = StratifiedKFold(y_data, n_folds=3, shuffle=True, random_state=int(time()))
    pipeline = Pipeline([
        ('dictvec', dict_vectorizer),
        ('selector', select_percentile),
        ('clf', clf)
    ])
    param_grid['selector__percentile'] = [90, 95, 100]

    gs = GridSearchCV(pipeline,
                      param_grid,
                      scoring='f1',
                      cv=folds,
                      n_jobs=-1,
                      verbose=1)
    t0 = time()
    gs.fit(x_data, y_data)
    train_time = time() - t0
    print("Train time: %0.3fs" % train_time)
#     print("Real train time: %0.3fs" % (train_time * (TOTAL_COUNT/DEV_COUNT)))
    print("Best score: %0.3f" % gs.best_score_)
    best_params = gs.best_estimator_.get_params()
    for param_name in sorted(param_grid.keys()):
        print("\t%s: %r" % (param_name, best_params[param_name]))
    best_score = gs.best_score_
    best_model = gs.best_estimator_
    with open(model_name+'_best_'+type(clf).__name__+".pkl", 'wb') as handle:
        pickle.dump(best_model, handle)

In [36]:
# Best one was LogReg with only num of syllables as a continuous feature
features = [
    'word',
    'word_pos_tag',
    'word_pos_tag_simplified',
    'word_number_of_syllables',
    'word_number_in_turn',
    'word_number_in_task',
    'total_number_of_words_in_turn',
    'total_number_of_words_in_task'
]
feat_indices = [18, 19, 20, 22, 11, 12, 14, 15]
label_index = 27
continuous_feats = [
    'word_number_of_syllables'
]
## Read the file
file_name = "big-table-PoS.csv"
x_data = []
y_data = []
labels = []
with open(file_name, 'r') as f:
    for i, l in enumerate(csv.reader(f)):
        if i == 0: continue
#         elif i == 2: print x_data, y_data
        feats = {feat: l[i] for feat, i in zip(features,feat_indices)}
        # convert some to continuous features
        for feat in continuous_feats:
            feats[feat] = float(feats[feat])
        x_data.append(feats)
        label = l[label_index] == "4" or l[label_index] == "4-" or l[label_index] == "4p"
        y_data.append(label)
        labels.append(l[label_index])

In [37]:
# distance from end of ipu and turn
DIST_END_TURN = "DIST_END_TURN"
for i in range(len(x_data)):
    x_data[i][DIST_END_TURN] = int(x_data[i]['total_number_of_words_in_turn']) - int(x_data[i]['word_number_in_turn'])

In [38]:
# Using turn pos bigrams
POS_TURN_BIGRAM_LEFT = "POS_TURN_BIGRAM_LEFT"
POS_TURN_BIGRAM_RIGHT = "POS_TURN_BIGRAM_RIGHT"
for i in range(len(x_data)):
    # to the left
    if x_data[i]['word_number_in_turn'] == '1':
        x_data[i][POS_TURN_BIGRAM_LEFT] = 'BEGIN/'+x_data[i]['word_pos_tag']
    else:
        x_data[i][POS_TURN_BIGRAM_LEFT] = x_data[i-1]['word_pos_tag']+"/"+x_data[i]['word_pos_tag']
    # to the right
    if x_data[i]['word_number_in_turn'] == x_data[i]['total_number_of_words_in_turn']:
        x_data[i][POS_TURN_BIGRAM_RIGHT] = x_data[i]['word_pos_tag']+"/END"
    else:
        x_data[i][POS_TURN_BIGRAM_RIGHT] = x_data[i]['word_pos_tag']+"/"+x_data[i+1]['word_pos_tag']

In [59]:
# POS Trigram
POS_IPU_TRIGRAM = "POS_TURN_TRIGRAM"
for i in range(len(x_data)):
    # to the left
    left = "BEGIN"
    if x_data[i]['word_number_in_turn'] != '1':
        left = x_data[i-1]['word_pos_tag']
    # to the right
    right = "END"
    if x_data[i]['word_number_in_turn'] != x_data[i]['total_number_of_words_in_turn']:
        right = x_data[i+1]['word_pos_tag']
    x_data[i][POS_IPU_TRIGRAM] = left+"/"+x_data[i]['word_pos_tag']+"/"+right

In [46]:
IS_STUTTER = "IS_STUTTER"
for i in range(len(x_data)):
    x_data[i][IS_STUTTER] = x_data[i]['word'][-1] == '-'

In [47]:
import regex as re
from spacy.tokenizer import Tokenizer
nlp.tokenizer = Tokenizer(nlp.vocab, token_match=re.compile(r"'").match)

In [62]:
SIZE_SUBTREE = "SIZE_SUBTREE"
NUM_SIBLINGS = "NUM_SIBLINGS"
# Because we can't assume that every turn contains a fully connected dependency parse, we can treat distance as either 1, 2, or 3 (meaning 2 or more)
# dist = 0 when this term is last term
# dist = 1 when next term is head or a child of current term
# else dist = 2 when there is overlap between (current term's head and children) and (next term's head and children)
# else dist = 3
ARC_DIST_NEXT = "ARC_DIST_NEXT"
FUNC = "FUNC"
def subtree_size(token):
    return sum([subtree_size(child) for child in token.children])
def num_siblings(token):
    return len(list(token.head.children))
def arc_dist(token, next_token):
    if not next_token: return 0
    elif next_token.text == token.head.text or next_token.text in [a.text for a in token.children]: return 1
    elif any([a.text in [b.text for b in list(token.children)+[token.head]] for a in list(next_token.children)+[next_token.head]]): return 2
    else: return 3
def get_func(token):
    if "SUBJ" in token.dep_: return 0
    elif "DOBJ" in token.dep_: return 1
    elif "POBJ" in token.dep_: return 2
    else: return 3
# set these features to 0 by default
for i in range(len(x_data)):
    x_data[i][SIZE_SUBTREE] = '-1'
    x_data[i][NUM_SIBLINGS] = '-1'
    x_data[i][ARC_DIST_NEXT] = '-1'
    x_data[i][FUNC] = '-1'
for i in range(len(x_data)):
    if x_data[i]['word_number_in_turn'] == '1':
        # figure out the current turn
        turn = ' '.join([word['word'] for word in x_data[i:i+int(x_data[i]['total_number_of_words_in_turn'])]])
#         print(turn)
        doc = nlp(turn)
        for j, token in enumerate(doc):
#             print(token)
            next_token = doc[i+1] if i+1<len(doc) else None
            x_data[i+j][SIZE_SUBTREE] = str(subtree_size(token))
            x_data[i+j][NUM_SIBLINGS] = str(num_siblings(token))
            x_data[i+j][ARC_DIST_NEXT] = str(arc_dist(token, next_token))
            x_data[i+j][FUNC] = str(get_func(token))

In [63]:
for (clf, param_grid) in clf_map:
    classify_my_model(clf, param_grid, 'syntactic')

################################### <class 'sklearn.naive_bayes.MultinomialNB'> #########################################
Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   39.8s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:  1.1min finished


Train time: 68.143s
Best score: 0.704
	clf__alpha: 0.01
	clf__fit_prior: True
	selector__percentile: 90
################################### <class 'sklearn.svm.classes.LinearSVC'> #########################################
Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  3.3min
[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed:  4.1min finished


Train time: 263.686s
Best score: 0.746
	clf__C: 1
	clf__loss: 'squared_hinge'
	clf__penalty: 'l2'
	selector__percentile: 90
################################### <class 'sklearn.linear_model.logistic.LogisticRegression'> #########################################
Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 108 out of 108 | elapsed:  5.4min finished


Train time: 325.306s
Best score: 0.754
	clf__C: 1
	clf__fit_intercept: True
	clf__penalty: 'l2'
	selector__percentile: 90


In [64]:
# now we are going to use phrase breaks of 4 to segment utterances instead of the turn labels. 
indices_of_4 = set()
with open(file_name, 'r') as f:
    for i, l in enumerate(csv.reader(f)):
        if i == 0: continue
        if l[label_index] == "4":# or l[label_index] == "4-" or l[label_index] == "4p"
            indices_of_4.add(i-1)


In [65]:
# set these features to 0 by default
for i in range(len(x_data)):
    x_data[i][SIZE_SUBTREE] = '-1'
    x_data[i][NUM_SIBLINGS] = '-1'
    x_data[i][ARC_DIST_NEXT] = '-1'
    x_data[i][FUNC] = '-1'
sentence = []
start_index = 0
for i in range(len(x_data)):    
    sentence.append(x_data[i]['word'])
    if i-1 in indices_of_4:
        #we've reached the end of a sentence
        doc = nlp(' '.join(sentence))
        for j, token in enumerate(doc):
            next_token = doc[start_index+1] if start_index+1<len(doc) else None
            x_data[start_index+j][SIZE_SUBTREE] = str(subtree_size(token))
            x_data[start_index+j][NUM_SIBLINGS] = str(num_siblings(token))
            x_data[start_index+j][ARC_DIST_NEXT] = str(arc_dist(token, next_token))
            x_data[start_index+j][FUNC] = str(get_func(token))
        start_index = i
        sentence = []

In [66]:
for (clf, param_grid) in clf_map:
    classify_my_model(clf, param_grid, 'syntactic_punct')

################################### <class 'sklearn.naive_bayes.MultinomialNB'> #########################################
Fitting 3 folds for each of 24 candidates, totalling 72 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:   34.2s
[Parallel(n_jobs=-1)]: Done  72 out of  72 | elapsed:   56.4s finished


Train time: 57.815s
Best score: 0.715
	clf__alpha: 0.01
	clf__fit_prior: True
	selector__percentile: 90
################################### <class 'sklearn.svm.classes.LinearSVC'> #########################################
Fitting 3 folds for each of 18 candidates, totalling 54 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  2.3min
[Parallel(n_jobs=-1)]: Done  54 out of  54 | elapsed:  3.0min finished


Train time: 190.287s
Best score: 0.799
	clf__C: 1
	clf__loss: 'hinge'
	clf__penalty: 'l2'
	selector__percentile: 100
################################### <class 'sklearn.linear_model.logistic.LogisticRegression'> #########################################
Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 108 out of 108 | elapsed:  4.6min finished


Train time: 283.761s
Best score: 0.805
	clf__C: 1
	clf__fit_intercept: True
	clf__penalty: 'l1'
	selector__percentile: 100


# Analyze this new model, since it seems to do somewhat better 

In [67]:
# Find most significant features
model = pickle.load(open('syntactic_punct_best_LogisticRegression.pkl', 'rb'))
def show_most_informative_features(vectorizer, clf, n=20):
    feature_names = vectorizer.get_feature_names()
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (coef_1, fn_1, coef_2, fn_2))
        
def show_least_informative_features(vectorizer, clf, n=20):
    feature_names = vectorizer.get_feature_names()
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names), key=lambda x: abs(x[0]))
    top = coefs_with_fns[:n]
    for (coef_1, fn_1) in top:
        print("\t%.4f\t%-15s" % (coef_1, fn_1))

In [68]:
show_most_informative_features(model.steps[0][1], model.steps[2][1])

	-3.9139	word=trying    		5.9484	POS_TURN_BIGRAM_RIGHT=VBN/END
	-3.8890	IS_STUTTER     		5.7678	POS_TURN_BIGRAM_RIGHT=NN/END
	-3.4406	word=parallel  		5.4133	POS_TURN_BIGRAM_RIGHT=VBG/END
	-3.2614	word=an        		5.4095	POS_TURN_BIGRAM_RIGHT=PRP/END
	-3.0772	word=gonna     		5.4017	POS_TURN_BIGRAM_RIGHT=VBP/END
	-2.7705	POS_TURN_BIGRAM_RIGHT=NNP/NNP		5.2641	POS_TURN_BIGRAM_RIGHT=RB/END
	-2.7675	word=kind      		5.1970	POS_TURN_BIGRAM_RIGHT=NNS/END
	-2.6617	POS_TURN_BIGRAM_RIGHT=CD/NNS		5.1425	word=it's-     
	-2.5836	POS_TURN_BIGRAM_RIGHT=JJ/NN		5.0438	POS_TURN_BIGRAM_RIGHT=JJR/END
	-2.5472	word=red       		4.9968	POS_TURN_BIGRAM_RIGHT=JJ/END
	-2.5011	POS_TURN_TRIGRAM=VBD/UH/UH		4.9715	POS_TURN_BIGRAM_RIGHT=VB/END
	-2.4398	POS_TURN_TRIGRAM=JJ/JJ/UH		4.8884	POS_TURN_BIGRAM_RIGHT=VBD/END
	-2.4021	word_pos_tag=VB_PRP		4.8864	POS_TURN_BIGRAM_RIGHT=VBZ/END
	-2.3296	POS_TURN_BIGRAM_RIGHT=JJ/TO		4.5564	POS_TURN_BIGRAM_RIGHT=NNP/END
	-2.3002	POS_TURN_BIGRAM_RIGHT=DT/CD		4.5437	POS_TURN_BIGRAM

In [57]:
model_old = pickle.load(open('syntactic_best_LogisticRegression.pkl', 'rb'))

In [58]:
show_most_informative_features(model_old.steps[0][1], model_old.steps[2][1])

	-3.0799	IS_STUTTER     		5.8195	POS_TURN_BIGRAM_RIGHT=VBN/END
	-3.0750	word=kind      		5.6724	POS_TURN_BIGRAM_RIGHT=VBP/END
	-2.7437	POS_TURN_BIGRAM_RIGHT=PRP/MD		5.6706	POS_TURN_BIGRAM_RIGHT=WP/END
	-2.7139	POS_TURN_TRIGRAM=RB/like/END		5.4495	POS_TURN_BIGRAM_RIGHT=NN/END
	-2.6983	POS_TURN_TRIGRAM=RB/there/END		5.2829	POS_TURN_BIGRAM_RIGHT=VBD/END
	-2.6206	POS_TURN_BIGRAM_RIGHT=NNP/NNP		5.2543	POS_TURN_BIGRAM_RIGHT=RB/END
	-2.5483	word=pretty    		5.1698	POS_TURN_BIGRAM_RIGHT=NNS/END
	-2.4947	POS_TURN_BIGRAM_RIGHT=VBP_RB/VB		5.1630	word=it's-     
	-2.3827	word=red       		5.0925	POS_TURN_BIGRAM_RIGHT=VB/END
	-2.3754	word_pos_tag=VB_PRP		5.0757	POS_TURN_BIGRAM_RIGHT=PRP/END
	-2.3681	POS_TURN_BIGRAM_RIGHT=VBZ/RP		4.9856	POS_TURN_BIGRAM_RIGHT=JJ/END
	-2.3253	POS_TURN_BIGRAM_RIGHT=JJ/TO		4.9715	POS_TURN_BIGRAM_RIGHT=VBG/END
	-2.2764	POS_TURN_TRIGRAM=NN/just/END		4.9332	POS_TURN_BIGRAM_RIGHT=VBZ/END
	-2.2459	POS_TURN_BIGRAM_RIGHT=JJ/NN		4.7349	POS_TURN_BIGRAM_RIGHT=JJR/END
	-2.2370	word