In [1]:
## Imports
import csv
import sys
import numpy as np
import pickle
from time import time
import re
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import SelectPercentile, SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.cross_validation import StratifiedKFold
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.grid_search import GridSearchCV
from sklearn import metrics



In [2]:
best_score = 0
best_model = None

dict_vectorizer = DictVectorizer()
select_percentile = SelectPercentile(percentile=100)

def classify_my_model(clf, param_grid):
    global best_score, best_model
    print('###################################',type(clf),'#########################################')
    folds = StratifiedKFold(y_train, n_folds=3, shuffle=True, random_state=int(time()))
    pipeline = Pipeline([
        ('dictvec', dict_vectorizer),
        ('selector', select_percentile),
        ('clf', clf)
    ])
    gs = GridSearchCV(pipeline,
                      param_grid,
                      scoring='f1',
                      cv=folds,
                      n_jobs=-1,
                      verbose=1)
    t0 = time()
    gs.fit(x_train, y_train)
    train_time = time() - t0
    print("Train time: %0.3fs" % train_time)
    print("Best score: %0.3f" % gs.best_score_)
    best_params = gs.best_estimator_.get_params()
    for param_name in sorted(param_grid.keys()):
        print("\t%s: %r" % (param_name, best_params[param_name]))
    best_score = gs.best_score_
    best_model = gs.best_estimator_

In [23]:
# Best one was LogReg with only num of syllables as a continuous feature
features = [
    'word',
    'word_pos_tag',
    'word_pos_tag_simplified',
    'word_number_of_syllables',
    'word_number_in_ipu',
    'word_number_in_turn',
    'word_number_in_task',
    'total_number_of_words_in_ipu',
    'total_number_of_words_in_turn',
    'total_number_of_words_in_task'
]
feat_indices = [18, 19, 20, 22, 10, 11, 12, 13, 14, 15]
label_index = 27
continuous_feats = [
    'word_number_of_syllables'
]
## Read the file
file_name = "big-table-PoS.csv"
x_data = []
y_data = []
labels = []
with open(file_name, 'r') as f:
    for i, l in enumerate(csv.reader(f)):
        if i == 0: continue
#         elif i == 2: print x_data, y_data
        feats = {feat: l[i] for feat, i in zip(features,feat_indices)}
        # convert some to continuous features
        for feat in continuous_feats:
            feats[feat] = float(feats[feat])
        x_data.append(feats)
        label = l[label_index] == "4" or l[label_index] == "4-" or l[label_index] == "4p"
        y_data.append(label)
        labels.append(l[label_index])

In [4]:
import random
data = list(zip(x_data, y_data))
random.shuffle(data)
test_data = data[:4000]
train_data = data[4000:]
x_train, y_train = zip(*train_data)
x_test, y_test = zip(*test_data)

In [5]:
param_grid = {
    'clf__penalty': ['l1','l2'],
    'clf__fit_intercept': [True, False],
    'clf__C':[1, 10, 100],
    'selector__percentile': [90, 95, 100]
}
clf = LogisticRegression()
classify_my_model(clf, param_grid)

################################### <class 'sklearn.linear_model.logistic.LogisticRegression'> #########################################
Fitting 3 folds for each of 36 candidates, totalling 108 fits


[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 108 out of 108 | elapsed:  3.8min finished


Train time: 231.285s
Best score: 0.758
	clf__C: 1
	clf__fit_intercept: False
	clf__penalty: 'l1'
	selector__percentile: 100


# Load best_model from best_LogisticRegression.pkl

In [36]:
with open('best_LogisticRegression.pkl', 'rb') as handle:
    best_model = pickle.load(handle)

# Analyze `best_model` 

In [37]:
# Analyze false positives
fp_terms = []
# Analyze false negatives
fn_terms = []
for i in range(len(x_test)):
    res = best_model.predict_proba(x_test[i])
    res = res[0][1] > res[0][0]
    if res and not y_test[i]:
        fp_terms.append(x_test[i]) 
    elif not res and y_data[i]:
        fn_terms.append(x_test[i]) 

In [38]:
print("False positive rate ::", len(fp_terms)/len(x_test))
print("False negative rate ::", len(fn_terms)/len(x_test))

False positive rate :: 0.0035
False negative rate :: 0.23225


In [39]:
fp_terms[0]

{'total_number_of_words_in_ipu': '3',
 'total_number_of_words_in_task': '12',
 'total_number_of_words_in_turn': '3',
 'word': 'okay',
 'word_number_in_ipu': '1',
 'word_number_in_task': '4',
 'word_number_in_turn': '1',
 'word_number_of_syllables': 2.0,
 'word_pos_tag': 'UH',
 'word_pos_tag_simplified': 'O'}

## let's do a reduce over word to see if some patterns arise

In [9]:
from collections import defaultdict
word_counts = defaultdict(int)
for term in x_test:
    word_counts[term['word']] += 1

In [10]:
fp_word = defaultdict(int)
for term in fp_terms:
    fp_word[term['word']] += 1
most_fp_words = sorted(list(fp_word.keys()), key=lambda x: -fp_word[x])

In [11]:
for word in most_fp_words[:20]:
    print(word, "\t", fp_word[word], "\t", word_counts[word], "\t", fp_word[word]/word_counts[word])

okay 	 7 	 132 	 0.05303030303030303
that 	 6 	 45 	 0.13333333333333333
uh 	 6 	 47 	 0.1276595744680851
right 	 5 	 68 	 0.07352941176470588
yeah 	 5 	 56 	 0.08928571428571429
racket 	 4 	 5 	 0.8
um 	 4 	 54 	 0.07407407407407407
eye 	 4 	 12 	 0.3333333333333333
mermaid 	 3 	 16 	 0.1875
iron 	 3 	 11 	 0.2727272727272727
lawnmower 	 3 	 9 	 0.3333333333333333
middle 	 3 	 18 	 0.16666666666666666
like 	 3 	 75 	 0.04
no 	 3 	 20 	 0.15
moon 	 3 	 21 	 0.14285714285714285
mm 	 3 	 6 	 0.5
lion 	 3 	 17 	 0.17647058823529413
knee 	 2 	 4 	 0.5
oreo 	 2 	 3 	 0.6666666666666666
nail 	 2 	 12 	 0.16666666666666666


In [158]:
# find context of most common fp terms
most_fp_terms = defaultdict(list)
for term in fp_terms:
    most_fp_terms[term['word']].append(term)

In [159]:
fp_terms_context = []
for i in range(len(x_data)):
    term = x_data[i]
    if term in most_fp_terms[term['word']]:
        fp_terms_context.append(x_data[i-2:i+3])

In [160]:
fp_terms_context[0]

[{'total_number_of_words_in_ipu': '20',
  'total_number_of_words_in_task': '30',
  'total_number_of_words_in_turn': '30',
  'word': "it's",
  'word_number_in_ipu': '10',
  'word_number_in_task': '20',
  'word_number_in_turn': '20',
  'word_number_of_syllables': 1.0,
  'word_pos_tag': 'PRP_BES',
  'word_pos_tag_simplified': 'C'},
 {'total_number_of_words_in_ipu': '20',
  'total_number_of_words_in_task': '30',
  'total_number_of_words_in_turn': '30',
  'word': 'a',
  'word_number_in_ipu': '11',
  'word_number_in_task': '21',
  'word_number_in_turn': '21',
  'word_number_of_syllables': 1.0,
  'word_pos_tag': 'DT',
  'word_pos_tag_simplified': 'O'},
 {'total_number_of_words_in_ipu': '20',
  'total_number_of_words_in_task': '30',
  'total_number_of_words_in_turn': '30',
  'word': 'mirror',
  'word_number_in_ipu': '12',
  'word_number_in_task': '22',
  'word_number_in_turn': '22',
  'word_number_of_syllables': 2.0,
  'word_pos_tag': 'NN',
  'word_pos_tag_simplified': 'N'},
 {'total_number_of

In [190]:
fp_same_ipu = [term for term in fp_terms if term['total_number_of_words_in_ipu'] == term['word_number_in_ipu']]
fp_same_task = [term for term in fp_terms if term['total_number_of_words_in_task'] == term['word_number_in_task']]
fp_same_turn = [term for term in fp_terms if term['total_number_of_words_in_turn'] == term['word_number_in_turn']]

In [191]:
print(len(fp_same_ipu)/len(fp_terms))
print(len(fp_same_task)/len(fp_terms))
print(len(fp_same_turn)/len(fp_terms))

0.2389937106918239
0.0
0.07547169811320754


In [194]:
fp_single_utterance = [term for term in fp_terms if term['total_number_of_words_in_ipu'] == term['word_number_in_ipu'] and term['word_number_in_ipu'] == '1']
print(len(fp_single_utterance)/len(fp_terms))

0.1320754716981132


In [161]:
fn_word = defaultdict(int)
for term in fn_terms:
    fn_word[term['word']] += 1
most_fn_words = sorted(list(fn_word.keys()), key=lambda x: -fn_word[x])

In [163]:
for word in most_fn_words[:20]:
    print(word, "\t", fn_word[word], "\t", word_counts[word], "\t", fn_word[word]/word_counts[word])

the 	 116 	 477 	 0.2431865828092243
of 	 33 	 127 	 0.25984251968503935
a 	 29 	 98 	 0.29591836734693877
and 	 24 	 126 	 0.19047619047619047
I 	 23 	 78 	 0.2948717948717949
on 	 16 	 68 	 0.23529411764705882
so 	 15 	 46 	 0.32608695652173914
like 	 15 	 68 	 0.22058823529411764
that 	 15 	 66 	 0.22727272727272727
it's 	 15 	 63 	 0.23809523809523808
you 	 13 	 40 	 0.325
is 	 12 	 68 	 0.17647058823529413
see 	 11 	 29 	 0.3793103448275862
right 	 10 	 68 	 0.14705882352941177
with 	 10 	 29 	 0.3448275862068966
top 	 9 	 34 	 0.2647058823529412
blue 	 8 	 28 	 0.2857142857142857
little 	 7 	 21 	 0.3333333333333333
uh 	 7 	 44 	 0.1590909090909091
it 	 7 	 55 	 0.12727272727272726


In [167]:
# find context of most common fp terms
most_fn_terms = defaultdict(list)
for term in fn_terms:
    most_fn_terms[term['word']].append(term)

In [168]:
fn_terms_context = []
for i in range(len(x_data)):
    term = x_data[i]
    if term in most_fn_terms[term['word']]:
        fn_terms_context.append(x_data[i-2:i+3])

In [173]:
[a[2] for a in fn_terms_context[1:10]]

[{'total_number_of_words_in_ipu': '5',
  'total_number_of_words_in_task': '5',
  'total_number_of_words_in_turn': '5',
  'word': 'I',
  'word_number_in_ipu': '2',
  'word_number_in_task': '2',
  'word_number_in_turn': '2',
  'word_number_of_syllables': 1.0,
  'word_pos_tag': 'PRP',
  'word_pos_tag_simplified': 'O'},
 {'total_number_of_words_in_ipu': '26',
  'total_number_of_words_in_task': '26',
  'total_number_of_words_in_turn': '26',
  'word': 'wine',
  'word_number_in_ipu': '20',
  'word_number_in_task': '20',
  'word_number_in_turn': '20',
  'word_number_of_syllables': 1.0,
  'word_pos_tag': 'NN',
  'word_pos_tag_simplified': 'N'},
 {'total_number_of_words_in_ipu': '2',
  'total_number_of_words_in_task': '2',
  'total_number_of_words_in_turn': '2',
  'word': 'got',
  'word_number_in_ipu': '1',
  'word_number_in_task': '1',
  'word_number_in_turn': '1',
  'word_number_of_syllables': 1.0,
  'word_pos_tag': 'VBD',
  'word_pos_tag_simplified': 'V'},
 {'total_number_of_words_in_ipu': '2

In [187]:
fn_same_ipu = [term for term in fn_terms if term['total_number_of_words_in_ipu'] == term['word_number_in_ipu']]
fn_same_task = [term for term in fn_terms if term['total_number_of_words_in_task'] == term['word_number_in_task']]
fn_same_turn = [term for term in fn_terms if term['total_number_of_words_in_turn'] == term['word_number_in_turn']]

In [189]:
print(len(fn_same_ipu)/len(fn_terms))
print(len(fn_same_task)/len(fn_terms))
print(len(fn_same_turn)/len(fn_terms))

0.07593307593307594
0.003861003861003861
0.023166023166023165


In [195]:
fn_single_utterance = [term for term in fn_terms if term['total_number_of_words_in_ipu'] == term['word_number_in_ipu'] and term['word_number_in_ipu'] == '1']
print(len(fn_single_utterance)/len(fn_terms))

0.007722007722007722


## then pos tag

In [174]:
pos_counts = defaultdict(int)
for term in x_test:
    pos_counts[term['word_pos_tag']] += 1

In [175]:
fp_pos = defaultdict(int)
for term in fp_terms:
    fp_pos[term['word_pos_tag']] += 1
most_fp_pos = sorted(list(fp_pos.keys()), key=lambda x: -fp_pos[x])

In [176]:
for pos in most_fp_pos[:20]:
    print(pos, "\t", fp_pos[pos], "\t", pos_counts[pos], "\t", fp_pos[pos]/pos_counts[pos])

NN 	 67 	 628 	 0.10668789808917198
UH 	 37 	 417 	 0.08872901678657075
RB 	 14 	 308 	 0.045454545454545456
NNS 	 8 	 55 	 0.14545454545454545
JJ 	 5 	 266 	 0.018796992481203006
VBZ 	 5 	 106 	 0.04716981132075472
VBN 	 3 	 29 	 0.10344827586206896
RP 	 3 	 18 	 0.16666666666666666
CC 	 2 	 143 	 0.013986013986013986
NNP 	 2 	 19 	 0.10526315789473684
PRP 	 2 	 189 	 0.010582010582010581
IN 	 2 	 462 	 0.004329004329004329
DT 	 2 	 678 	 0.0029498525073746312
VBP 	 2 	 79 	 0.02531645569620253
WDT 	 1 	 11 	 0.09090909090909091
VBD 	 1 	 41 	 0.024390243902439025
VB 	 1 	 95 	 0.010526315789473684
PRP_BES 	 1 	 72 	 0.013888888888888888
CD 	 1 	 42 	 0.023809523809523808


In [177]:
fn_pos = defaultdict(int)
for term in fn_terms:
    fn_pos[term['word_pos_tag']] += 1
most_fn_pos = sorted(list(fn_pos.keys()), key=lambda x: -fn_pos[x])

In [178]:
for pos in most_fn_pos[:20]:
    print(pos, "\t", fn_pos[pos], "\t", pos_counts[pos], "\t", fn_pos[pos]/pos_counts[pos])

DT 	 168 	 678 	 0.24778761061946902
IN 	 106 	 462 	 0.22943722943722944
NN 	 83 	 628 	 0.1321656050955414
JJ 	 61 	 266 	 0.22932330827067668
RB 	 57 	 308 	 0.18506493506493507
PRP 	 47 	 189 	 0.24867724867724866
CC 	 30 	 143 	 0.2097902097902098
VB 	 29 	 95 	 0.30526315789473685
VBZ 	 24 	 106 	 0.22641509433962265
UH 	 24 	 417 	 0.05755395683453238
VBP 	 22 	 79 	 0.27848101265822783
PRP_BES 	 17 	 72 	 0.2361111111111111
VBG 	 14 	 50 	 0.28
XX 	 9 	 46 	 0.1956521739130435
VBD 	 9 	 41 	 0.21951219512195122
CD 	 8 	 42 	 0.19047619047619047
MD 	 8 	 35 	 0.22857142857142856
NNS 	 8 	 55 	 0.14545454545454545
PRP$ 	 8 	 27 	 0.2962962962962963
PRP_VBP 	 5 	 33 	 0.15151515151515152


## then number of syllables

In [179]:
nos_counts = defaultdict(int)
for term in x_test:
    nos_counts[term['word_number_of_syllables']] += 1

In [180]:
fp_nos = defaultdict(int)
for term in fp_terms:
    fp_nos[term['word_number_of_syllables']] += 1
most_fp_nos = sorted(list(fp_nos.keys()), key=lambda x: -fp_nos[x])

In [181]:
for nos in most_fp_nos[:20]:
    print(nos, "\t", fp_nos[nos], "\t", nos_counts[nos], "\t", fp_nos[nos]/nos_counts[nos])

1.0 	 94 	 3147 	 0.02986971719097553
2.0 	 55 	 746 	 0.07372654155495978
3.0 	 6 	 79 	 0.0759493670886076
4.0 	 2 	 22 	 0.09090909090909091
5.0 	 2 	 6 	 0.3333333333333333


In [182]:
fn_nos = defaultdict(int)
for term in fn_terms:
    fn_nos[term['word_number_of_syllables']] += 1
most_fn_nos = sorted(list(fn_nos.keys()), key=lambda x: -fn_nos[x])

In [183]:
for nos in most_fn_nos[:20]:
    print(nos, "\t", fn_nos[nos], "\t", nos_counts[nos], "\t", fn_nos[nos]/nos_counts[nos])

1.0 	 649 	 3147 	 0.20622815379726725
2.0 	 114 	 746 	 0.15281501340482573
3.0 	 9 	 79 	 0.11392405063291139
4.0 	 4 	 22 	 0.18181818181818182
5.0 	 1 	 6 	 0.16666666666666666


# Can we even find single-word IPUs or turns with no level 4 phrase boundary

In [28]:
single_ipu = [i for i in range(len(x_data)) if x_data[i]['total_number_of_words_in_ipu'] == x_data[i]['word_number_in_ipu'] and x_data[i]['word_number_in_ipu'] == '1']
single_turn = [i for i in range(len(x_data)) if x_data[i]['total_number_of_words_in_turn'] == x_data[i]['word_number_in_turn'] and x_data[i]['word_number_in_turn'] == '1']
single_ipu_no_4 = [i for i in single_ipu if not y_data[i]]
single_turn_no_4 = [i for i in single_turn if not y_data[i]]
print(single_ipu_no_4)
print(single_turn_no_4)

[391, 1692, 1823, 2034, 2035, 2122, 2233, 2427, 2671, 2680, 2688, 2842, 2887, 2888, 2906, 2934, 2965, 2981, 2982, 3067, 3119, 3205, 3245, 3314, 3367, 3408, 3540, 3632, 3633, 3650, 3733, 3761, 3768, 3812, 3840, 3957, 3969, 4015, 4043, 4165, 4317, 4360, 4380, 4812, 4922, 4927, 5035, 5101, 5149, 5241, 5263, 5277, 5437, 5695, 5761, 6228, 6483, 6638, 6961, 7780, 7859, 8263, 8483, 8525, 8526, 8527, 8563, 8605, 8635, 8703, 8715, 8831, 8845, 9010, 9088, 9145, 9401, 9420, 9461, 9462, 9463, 9468, 9513, 9545, 9549, 9599, 9613, 9936, 10221, 10312, 10457, 10511, 10572, 11451, 11452, 11527, 11550, 11558, 11620, 11645, 11696, 11711, 11756, 12002, 12010, 12140, 12203, 12278, 12299, 12376, 12433, 12509, 12523, 12565, 12629, 12655, 12667, 12922, 13987, 14190, 14294, 14438, 14452, 14457, 14936, 15295, 15311, 15436, 15516, 15517, 15519, 15727, 15761, 15766, 15781, 15789, 15833, 15835, 15997, 16089, 16243, 16299, 16453, 16595, 16605, 16628, 16758, 16984, 16985, 17002, 17026, 17078, 18923, 18957, 19075, 191

In [34]:
single_ipu_no_4_breaks = defaultdict(int)
for i in single_ipu_no_4:
    single_ipu_no_4_breaks[labels[i]] += 1
print(single_ipu_no_4_breaks, len(single_ipu_no_4))


defaultdict(<class 'int'>, {'1p': 256, '3p': 90, '2': 33, '2p': 21, '1': 12, '3': 66}) 478


In [35]:
single_turn_no_4_breaks = defaultdict(int)
for i in single_turn_no_4:
    single_turn_no_4_breaks[labels[i]] += 1
print(single_turn_no_4_breaks, len(single_turn_no_4))
    

defaultdict(<class 'int'>, {'1p': 83, '3p': 17, '2': 1, '2p': 2, '3': 6, '1': 1}) 110
