Refer to comparison.xlsx for a summary of the information in this notebook.

In [1]:
## Imports
import csv
import sys
import numpy as np
import pickle
from time import time
import re
from spherecluster import SphericalKMeans

from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import KMeans
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import SelectPercentile, SelectKBest
from sklearn.svm import LinearSVC
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion

from sklearn import metrics

In [60]:
# Main functions

def generate_data(file_name, features, continuous_feats, impute=0):
    """Read a CSV of per-word rows into feature dicts and boolean break labels.

    Args:
        file_name: Path of the CSV file. Its header row must contain every
            name in ``features`` plus ``word_tobi_break_index``.
        features: Column names to copy into each feature dict.
        continuous_feats: Subset of ``features`` whose values are parsed with
            ``int``; every other feature keeps its raw string value.
        impute: Value stored for a continuous feature whose cell is empty.

    Returns:
        ``(x_data, y_data)``: a list with one feature dict per CSV row, and a
        parallel list of booleans that are True when the row's
        ``word_tobi_break_index`` is one of "4", "4-" or "4p".

    Raises:
        KeyError: if a requested column is missing from the CSV header.
        ValueError: if a non-empty continuous cell is not an integer literal.
    """
    break_label = 'word_tobi_break_index'
    break_set = {"4", "4-", "4p"}
    continuous = set(continuous_feats)
    x_data = []
    y_data = []

    # The csv module requires newline='' so that it, not the text layer,
    # interprets line endings (otherwise newlines inside quoted fields are
    # rewritten before the parser sees them).
    with open(file_name, 'r', newline='') as f:
        # DictReader keys every row by the header, which avoids the
        # positional off-by-one error an earlier version had.
        reader = csv.DictReader(f)
        for row in reader:
            feats = {}
            for feat in features:
                value = row[feat]
                if feat in continuous:
                    # Empty cell -> imputed value; otherwise parse as int.
                    feats[feat] = int(value) if value else impute
                else:
                    feats[feat] = value
            x_data.append(feats)
            y_data.append(row[break_label] in break_set)
    return x_data, y_data

def classify_my_model(pipeline, param_grid, X, y, model_name, scorer='f1', save_model=False):
    """Grid-search ``pipeline`` over ``param_grid`` and report the best candidate.

    Runs a 5-fold ``GridSearchCV`` on all available cores, prints the wall-clock
    time of the search, the best cross-validated score and the winning value
    of every parameter named in ``param_grid``.

    Args:
        pipeline: Estimator (here a sklearn ``Pipeline``) to tune.
        param_grid: Dict mapping parameter names to candidate value lists.
        X: Training samples, in whatever form ``pipeline`` accepts.
        y: Training labels.
        model_name: Label used in the printed banner and in the pickle name.
        scorer: ``scoring`` argument forwarded to ``GridSearchCV``.
        save_model: When True, pickle the best estimator to
            ``<model_name>_best.pkl`` in the current directory.

    Returns:
        ``(best_score, best_estimator)`` taken from the fitted search.
    """
    print('#'*35, model_name, '#'*35)

    gs = GridSearchCV(pipeline,
                      param_grid,
                      scoring=scorer,
                      cv=5,
                      n_jobs=-1,
                      verbose=1)
    t0 = time()
    gs.fit(X, y)
    train_time = time() - t0
    print("Train time: %0.3fs" % train_time)
    print("Best score: %0.3f" % gs.best_score_)

    # Bind the winner locally: the save branch below previously referenced a
    # `best_model` name that was never defined inside this function.
    best_model = gs.best_estimator_
    best_params = best_model.get_params()
    for param_name in sorted(param_grid.keys()):
        print("\t%s: %r" % (param_name, best_params[param_name]))

    if save_model:
        file_name = model_name + '_best' + ".pkl"
        print('saving model to ' + file_name)
        with open(file_name, 'wb') as handle:
            pickle.dump(best_model, handle)

    return gs.best_score_, best_model

In [108]:
# list of classifier models

from sklearn.metrics import fbeta_score, make_scorer
# F-beta scorer with beta=0.25 (beta < 1 weights precision more than recall);
# passed as `scorer` to classify_my_model in the experiment cells.
fdot25_scorer = make_scorer(fbeta_score, beta=.25)
# Vectorizer used as the 'dictvec' step of every pipeline built below.
dict_vectorizer = DictVectorizer()
# select_percentile = SelectPercentile(percentile=100)

# list of classifier models
# redefine for quicker running
# Each entry is (estimator, param_grid). Grid keys carry the 'clf__' prefix
# because the estimator is installed as the pipeline step named 'clf'.
clf_map = [
    (
        LinearSVC(),
        {
            'clf__C': [.1],
            'clf__penalty': ['l2'],
            'clf__loss': ['hinge', 'squared_hinge'],
        }
    ),
    (
        LogisticRegression(),
        {
            'clf__penalty': ['l2'],
            'clf__fit_intercept': [True],
            'clf__C':[.1],
        }
    ),
    (
        RandomForestClassifier(random_state=2557),
        {
            'clf__n_estimators': [50],
            'clf__max_features': ["auto", "log2"]
        }
    ),
    
#     (
#         ExtraTreesClassifier(random_state=2557),
#         {
#             'clf__n_estimators': [20],
#             'clf__max_features': ["auto", "log2"]
#         }
#     ),

]


In [5]:
import pandas as pd
# Relative path of the word-level CSV that every experiment cell reads.
FILE_NAME = "../info-status/games-data-20180427.csv"
# The same CSV as a DataFrame; the mention helpers index its columns by row
# number. NOTE(review): assumes rows line up with generate_data output — confirm.
df = pd.read_csv(FILE_NAME)

### Baseline Model

In [7]:
# Baseline feature set: CSV columns copied into each sample's feature dict.
features = [
    'word',
#     'word_pos_tag',
#     'word_pos_tag_simplified',
    'word_number_of_syllables',
    'word_number_in_turn',
    'word_number_in_task',
    'total_number_of_words_in_turn',
    'total_number_of_words_in_task',
    'Stanford_PoS'
]

# Only these columns are parsed as int by generate_data; the rest stay strings.
continuous_feats = ['word_number_of_syllables',]

x_data, y_data = generate_data(FILE_NAME, features, continuous_feats)
# Hold out 20% of the rows for testing; fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=2557)


In [8]:
# Majority-class baseline: predict the negative class (0 == False) for every
# test sample. No features are used, so the True class is never predicted and
# its precision, recall and F1 are all 0.00.
y_pred = [0] * len(X_test)
print(metrics.classification_report(y_test, y_pred))
print(metrics.accuracy_score(y_test, y_pred))

             precision    recall  f1-score   support

      False       0.83      1.00      0.90     11618
       True       0.00      0.00      0.00      2448

avg / total       0.68      0.83      0.75     14066

0.8259633157969573


  'precision', 'predicted', average, warn_for)


In [9]:
# Tune and evaluate every classifier in clf_map on the baseline features.
for (clf, param_grid) in clf_map:
    # Step names matter: the grids in clf_map address the estimator as 'clf'.
    pipeline = Pipeline([
        ('dictvec', dict_vectorizer),
    #         ('selector', select_percentile),
        ('clf', clf)
    ])

    # Model selection uses the F0.25 scorer on the training split only.
    best_score, best_model = classify_my_model(pipeline, param_grid, X_train, y_train, clf.__class__.__name__, scorer=fdot25_scorer)
    # Report per-class metrics of the selected model on the held-out split.
    y_pred = best_model.predict(X_test)
    print(metrics.classification_report(y_test, y_pred))

################################### LinearSVC ###################################
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    3.1s remaining:    2.1s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    4.0s finished


Train time: 5.657s
Best score: 0.668
	clf__C: 0.1
	clf__loss: 'squared_hinge'
	clf__penalty: 'l2'
             precision    recall  f1-score   support

      False       0.88      0.97      0.92     11618
       True       0.70      0.38      0.49      2448

avg / total       0.85      0.86      0.85     14066

################################### LogisticRegression ###################################
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    1.7s remaining:    2.6s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.1s finished


Train time: 3.465s
Best score: 0.678
	clf__C: 0.1
	clf__fit_intercept: True
	clf__penalty: 'l2'
             precision    recall  f1-score   support

      False       0.88      0.97      0.92     11618
       True       0.73      0.34      0.46      2448

avg / total       0.85      0.86      0.84     14066

################################### RandomForestClassifier ###################################
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:   23.7s remaining:   15.8s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   39.0s finished


Train time: 56.350s
Best score: 0.666
	clf__max_features: 'log2'
	clf__n_estimators: 20
             precision    recall  f1-score   support

      False       0.87      0.98      0.92     11618
       True       0.75      0.29      0.42      2448

avg / total       0.85      0.86      0.83     14066

################################### ExtraTreesClassifier ###################################
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:   32.7s remaining:   21.8s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   48.6s finished


Train time: 71.701s
Best score: 0.665
	clf__max_features: 'log2'
	clf__n_estimators: 20
             precision    recall  f1-score   support

      False       0.87      0.98      0.92     11618
       True       0.74      0.33      0.45      2448

avg / total       0.85      0.86      0.84     14066



## Model with syntactic features

In [12]:
# Baseline features plus the 'syntactic_function' column.
features = [
    'word',
#     'word_pos_tag',
#     'word_pos_tag_simplified',
    'word_number_of_syllables',
    'word_number_in_turn',
    'word_number_in_task',
    'total_number_of_words_in_turn',
    'total_number_of_words_in_task',
    'Stanford_PoS',
    'syntactic_function'
]

# Only these columns are parsed as int by generate_data; the rest stay strings.
continuous_feats = ['word_number_of_syllables',]

x_data, y_data = generate_data(FILE_NAME, features, continuous_feats)
# Same 80/20 split and seed as the other experiment cells.
X_train, X_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=2557)

    
# Tune each classifier on the training split, then report held-out metrics.
for (clf, param_grid) in clf_map:
    pipeline = Pipeline([
        ('dictvec', dict_vectorizer),
    #         ('selector', select_percentile),
        ('clf', clf)
    ])

    best_score, best_model = classify_my_model(pipeline, param_grid, X_train, y_train, clf.__class__.__name__, scorer=fdot25_scorer)
    y_pred = best_model.predict(X_test)
    print(metrics.classification_report(y_test, y_pred))

################################### LinearSVC ###################################
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    3.9s remaining:    2.6s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    4.9s finished


Train time: 6.851s
Best score: 0.674
	clf__C: 0.1
	clf__loss: 'squared_hinge'
	clf__penalty: 'l2'
             precision    recall  f1-score   support

      False       0.89      0.96      0.92     11618
       True       0.71      0.42      0.53      2448

avg / total       0.86      0.87      0.86     14066

################################### LogisticRegression ###################################
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    2.2s remaining:    3.3s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    2.8s finished


Train time: 4.463s
Best score: 0.687
	clf__C: 0.1
	clf__fit_intercept: True
	clf__penalty: 'l2'
             precision    recall  f1-score   support

      False       0.88      0.97      0.92     11618
       True       0.72      0.39      0.50      2448

avg / total       0.85      0.87      0.85     14066

################################### RandomForestClassifier ###################################
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:   24.1s remaining:   16.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   34.9s finished


Train time: 51.015s
Best score: 0.680
	clf__max_features: 'log2'
	clf__n_estimators: 20
             precision    recall  f1-score   support

      False       0.87      0.97      0.92     11618
       True       0.73      0.32      0.45      2448

avg / total       0.85      0.86      0.84     14066



## Model with syntactic features and basic mention features


In [18]:
# Syntactic feature set plus the six *_Mention_Syntactic_Function / *_Mention_PoS
# columns, all read straight from the CSV as categorical strings.
features = [
    'word',
    'word_number_of_syllables',
    'word_number_in_turn',
    'word_number_in_task',
    'total_number_of_words_in_turn',
    'total_number_of_words_in_task',
    'Stanford_PoS',
    'syntactic_function',
    'Most_Recent_Mention_Syntactic_Function',
    'Recent_Explicit_Mention_Syntactic_Function',
    'Recent_Implicit_Mention_Syntactic_Function',
    'Most_Recent_Mention_PoS',
    'Recent_Explicit_Mention_PoS',
    'Recent_Implicit_Mention_PoS',
]

# Only these columns are parsed as int by generate_data; the rest stay strings.
continuous_feats = ['word_number_of_syllables',]

x_data, y_data = generate_data(FILE_NAME, features, continuous_feats)
# Same 80/20 split and seed as the other experiment cells.
X_train, X_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=2557)

    
# Tune each classifier on the training split, then report held-out metrics.
for (clf, param_grid) in clf_map:
    pipeline = Pipeline([
        ('dictvec', dict_vectorizer),
    #         ('selector', select_percentile),
        ('clf', clf)
    ])

    best_score, best_model = classify_my_model(pipeline, param_grid, X_train, y_train, clf.__class__.__name__, scorer=fdot25_scorer)
    y_pred = best_model.predict(X_test)
    print(metrics.classification_report(y_test, y_pred))

################################### LinearSVC ###################################
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    6.4s remaining:    4.3s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    7.8s finished


Train time: 10.064s
Best score: 0.684
	clf__C: 0.1
	clf__loss: 'hinge'
	clf__penalty: 'l2'
             precision    recall  f1-score   support

      False       0.87      0.97      0.92     11618
       True       0.73      0.33      0.46      2448

avg / total       0.85      0.86      0.84     14066

################################### LogisticRegression ###################################
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    4.3s remaining:    6.4s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    5.5s finished


Train time: 8.519s
Best score: 0.690
	clf__C: 0.1
	clf__fit_intercept: True
	clf__penalty: 'l2'
             precision    recall  f1-score   support

      False       0.88      0.97      0.92     11618
       True       0.72      0.39      0.51      2448

avg / total       0.85      0.87      0.85     14066

################################### RandomForestClassifier ###################################
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:   28.9s remaining:   19.2s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   41.9s finished


Train time: 60.234s
Best score: 0.676
	clf__max_features: 'log2'
	clf__n_estimators: 20
             precision    recall  f1-score   support

      False       0.87      0.98      0.92     11618
       True       0.73      0.30      0.43      2448

avg / total       0.84      0.86      0.83     14066



In [19]:
# adapted from accent_classifier.ipynb

def get_input_mentioned(X_data, df):
    """Add a boolean 'mentioned' entry to every feature dict, in place.

    Row ``i`` of ``X_data`` gets True when ``df['Most_Recent_Mention'][i]`` is
    non-null and False otherwise. Despite the ``get_`` prefix nothing is
    returned; ``X_data`` itself is modified.
    """
    recent_mentions = df['Most_Recent_Mention']
    for idx, row in enumerate(X_data):
        row['mentioned'] = not pd.isnull(recent_mentions[idx])

def set_input_num_mentions(X_data, df):
    """Copy ``df['Number_Of_Coref_Mentions']`` into every feature dict, in place.

    Row ``i`` of ``X_data`` receives the column value at index ``i`` under the
    key 'Number_Of_Coref_Mentions'; null cells are stored as 0.
    """
    counts = df['Number_Of_Coref_Mentions']
    for idx, row in enumerate(X_data):
        value = counts[idx]
        row['Number_Of_Coref_Mentions'] = 0 if pd.isnull(value) else value

def get_input_far_back_mentioned(X_data, df):
    """Add 'time_between_mentions' to every feature dict, in place.

    For row ``i`` the value is ``df['word_end_time'][i]`` minus
    ``df['Most_Recent_Mention'][i]``. A null 'Most_Recent_Mention' is replaced
    by the row's own 'word_end_time', which makes the difference zero.
    NOTE(review): presumably 'Most_Recent_Mention' holds a timestamp — confirm.
    """
    end_times = df['word_end_time']
    mention_times = df['Most_Recent_Mention']
    for idx, row in enumerate(X_data):
        now = end_times[idx]
        previous = mention_times[idx]
        if pd.isnull(previous):
            previous = now
        row['time_between_mentions'] = now - previous


In [23]:
# Same CSV columns as the previous experiment; the derived mention features
# are added after loading by the helper calls below.
features = [
    'word',
    'word_number_of_syllables',
    'word_number_in_turn',
    'word_number_in_task',
    'total_number_of_words_in_turn',
    'total_number_of_words_in_task',
    'Stanford_PoS',
    'syntactic_function',
    'Most_Recent_Mention_Syntactic_Function',
    'Recent_Explicit_Mention_Syntactic_Function',
    'Recent_Implicit_Mention_Syntactic_Function',
    'Most_Recent_Mention_PoS',
    'Recent_Explicit_Mention_PoS',
    'Recent_Implicit_Mention_PoS',
]

# Only these columns are parsed as int by generate_data; the rest stay strings.
continuous_feats = ['word_number_of_syllables']

x_data, y_data = generate_data(FILE_NAME, features, continuous_feats)
# Each helper mutates x_data in place, adding one key per row:
# 'mentioned', 'Number_Of_Coref_Mentions' and 'time_between_mentions'.
get_input_mentioned(x_data, df)
set_input_num_mentions(x_data, df)
get_input_far_back_mentioned(x_data, df)

# Same 80/20 split and seed as the other experiment cells.
X_train, X_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=2557)

# Tune each classifier on the training split, then report held-out metrics.
for (clf, param_grid) in clf_map:
    pipeline = Pipeline([
        ('dictvec', dict_vectorizer),
        ('clf', clf)
    ])

    best_score, best_model = classify_my_model(pipeline, param_grid, X_train, y_train, clf.__class__.__name__, scorer=fdot25_scorer)
    y_pred = best_model.predict(X_test)
    print(metrics.classification_report(y_test, y_pred))

################################### LinearSVC ###################################
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:   29.5s remaining:   19.7s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   39.8s finished


Train time: 51.161s
Best score: 0.669
	clf__C: 0.1
	clf__loss: 'hinge'
	clf__penalty: 'l2'
             precision    recall  f1-score   support

      False       0.87      0.97      0.92     11618
       True       0.72      0.34      0.46      2448

avg / total       0.85      0.86      0.84     14066

################################### LogisticRegression ###################################
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    5.3s remaining:    8.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    6.6s finished


Train time: 9.999s
Best score: 0.687
	clf__C: 0.1
	clf__fit_intercept: True
	clf__penalty: 'l2'
             precision    recall  f1-score   support

      False       0.88      0.97      0.92     11618
       True       0.72      0.39      0.51      2448

avg / total       0.85      0.87      0.85     14066

################################### RandomForestClassifier ###################################
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:   28.5s remaining:   19.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   37.9s finished


Train time: 55.504s
Best score: 0.684
	clf__max_features: 'auto'
	clf__n_estimators: 20
             precision    recall  f1-score   support

      False       0.88      0.97      0.92     11618
       True       0.72      0.35      0.47      2448

avg / total       0.85      0.86      0.84     14066



# Adding POS features

In [29]:
def add_dist_end_turn(x_data):
    """Add 'DIST_END_TURN' (words remaining after this one in the turn), in place.

    The value is ``total_number_of_words_in_turn - word_number_in_turn``, with
    both inputs converted through ``int`` so string-typed columns work.
    """
    for row in x_data:
        turn_length = int(row['total_number_of_words_in_turn'])
        position = int(row['word_number_in_turn'])
        row["DIST_END_TURN"] = turn_length - position
        
def _add_turn_bigrams(x_data, source_key, left_key, right_key, left, right):
    """Write left/right within-turn bigrams of ``source_key`` into each row.

    Shared worker for add_pos_bigram and add_word_bigram. For every row:
      * ``left_key``  = "<previous>/<current>", with "BEGIN" as the previous
        value on the first word of a turn;
      * ``right_key`` = "<current>/<next>", with "END" as the next value on
        the last word of a turn.
    Turn positions are compared as ints, so the result is the same whether
    the positional columns were loaded as strings or as numbers. When the
    data starts or stops in the middle of a turn there is no neighbouring
    row, so "BEGIN"/"END" is used there as well instead of wrapping around
    to the other end of the list or raising IndexError.
    """
    last_index = len(x_data) - 1
    for i, row in enumerate(x_data):
        current = row[source_key]
        position = int(row['word_number_in_turn'])
        if left:
            if position == 1 or i == 0:
                previous = 'BEGIN'
            else:
                previous = x_data[i - 1][source_key]
            row[left_key] = previous + "/" + current
        if right:
            turn_length = int(row['total_number_of_words_in_turn'])
            if position == turn_length or i == last_index:
                following = 'END'
            else:
                following = x_data[i + 1][source_key]
            row[right_key] = current + "/" + following


def add_pos_bigram(x_data, left=True, right=True):
    """Add within-turn PoS bigram features to every row, in place.

    Args:
        x_data: List of feature dicts in corpus order. Each needs
            'Stanford_PoS', 'word_number_in_turn' and (when ``right`` is
            True) 'total_number_of_words_in_turn'.
        left: Write 'POS_TURN_BIGRAM_LEFT' ("<previous PoS>/<PoS>").
        right: Write 'POS_TURN_BIGRAM_RIGHT' ("<PoS>/<next PoS>").

    Turn boundaries are marked with "BEGIN" / "END". Returns None.
    """
    _add_turn_bigrams(x_data, 'Stanford_PoS',
                      "POS_TURN_BIGRAM_LEFT", "POS_TURN_BIGRAM_RIGHT",
                      left, right)


# not super useful in my experiments, kept for reference
def add_word_bigram(x_data, left=True, right=True):
    """Add within-turn word bigram features to every row, in place.

    Same contract as add_pos_bigram, but built from the 'word' value and
    stored under 'POS_WORD_BIGRAM_LEFT' / 'POS_WORD_BIGRAM_RIGHT'.
    """
    _add_turn_bigrams(x_data, 'word',
                      "POS_WORD_BIGRAM_LEFT", "POS_WORD_BIGRAM_RIGHT",
                      left, right)
        
def add_pos_trigram(x_data):
    """Add 'POS_TURN_TRIGRAM' ("<previous PoS>/<PoS>/<next PoS>") to every row, in place.

    Args:
        x_data: List of feature dicts in corpus order. Each needs
            'Stanford_PoS', 'word_number_in_turn' and
            'total_number_of_words_in_turn'.

    The previous PoS is "BEGIN" on the first word of a turn and the next PoS
    is "END" on the last word of a turn. Positions are compared as ints, so
    string- and number-typed columns behave the same. If the data starts or
    stops in the middle of a turn there is no neighbouring row, so
    "BEGIN"/"END" is used there too rather than wrapping to the other end of
    the list or raising IndexError. Returns None.
    """
    POS_TURN_TRIGRAM = "POS_TURN_TRIGRAM"
    last_index = len(x_data) - 1
    for i, row in enumerate(x_data):
        position = int(row['word_number_in_turn'])
        turn_length = int(row['total_number_of_words_in_turn'])
        # to the left
        left = "BEGIN"
        if position != 1 and i > 0:
            left = x_data[i - 1]['Stanford_PoS']
        # to the right
        right = "END"
        if position != turn_length and i < last_index:
            right = x_data[i + 1]['Stanford_PoS']
        row[POS_TURN_TRIGRAM] = left + "/" + row['Stanford_PoS'] + "/" + right
        
def add_is_stutter(x_data):
    """Add a boolean 'IS_STUTTER' entry to every row, in place.

    A row is flagged when its 'word' ends with '-'. ``str.endswith`` is used
    rather than indexing the last character so an empty word yields False
    instead of raising IndexError. Returns None.
    """
    IS_STUTTER = "IS_STUTTER"
    for row in x_data:
        row[IS_STUTTER] = row['word'].endswith('-')

## With pos bigram features
We no longer include the extra mention features, since they didn't help.

In [35]:
# CSV columns for this experiment; the turn-distance and bigram features are
# derived after loading by the helper calls below.
features = [
    'word',
    'word_number_of_syllables',
    'word_number_in_turn',
    'word_number_in_task',
    'total_number_of_words_in_turn',
    'total_number_of_words_in_task',
    'Stanford_PoS',
    'syntactic_function',
    'Most_Recent_Mention_Syntactic_Function',
    'Recent_Explicit_Mention_Syntactic_Function',
    'Recent_Implicit_Mention_Syntactic_Function',
    'Most_Recent_Mention_PoS',
    'Recent_Explicit_Mention_PoS',
    'Recent_Implicit_Mention_PoS',
]

# Only these columns are parsed as int by generate_data; the rest stay strings.
continuous_feats = ['word_number_of_syllables']

x_data, y_data = generate_data(FILE_NAME, features, continuous_feats)
# Each helper mutates x_data in place: DIST_END_TURN, then the PoS bigrams,
# then the word bigrams.
add_dist_end_turn(x_data)
add_pos_bigram(x_data)
add_word_bigram(x_data)

# Same 80/20 split and seed as the other experiment cells.
X_train, X_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=2557)

    
# Tune each classifier on the training split, then report held-out metrics.
for (clf, param_grid) in clf_map:
    pipeline = Pipeline([
        ('dictvec', dict_vectorizer),
    #         ('selector', select_percentile),
        ('clf', clf)
    ])

    best_score, best_model = classify_my_model(pipeline, param_grid, X_train, y_train, clf.__class__.__name__, scorer=fdot25_scorer)
    y_pred = best_model.predict(X_test)
    print(metrics.classification_report(y_test, y_pred))

################################### LinearSVC ###################################
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:   41.6s remaining:   27.7s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   53.3s finished


Train time: 65.750s
Best score: 0.732
	clf__C: 0.1
	clf__loss: 'hinge'
	clf__penalty: 'l2'
             precision    recall  f1-score   support

      False       0.91      0.96      0.93     11618
       True       0.75      0.53      0.62      2448

avg / total       0.88      0.89      0.88     14066

################################### LogisticRegression ###################################
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    6.8s remaining:   10.2s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    8.5s finished


Train time: 12.776s
Best score: 0.741
	clf__C: 0.1
	clf__fit_intercept: True
	clf__penalty: 'l2'
             precision    recall  f1-score   support

      False       0.90      0.97      0.93     11618
       True       0.77      0.51      0.62      2448

avg / total       0.88      0.89      0.88     14066

################################### RandomForestClassifier ###################################
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:  1.3min remaining:   52.6s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  1.7min finished


Train time: 142.963s
Best score: 0.734
	clf__max_features: 'log2'
	clf__n_estimators: 20
             precision    recall  f1-score   support

      False       0.88      0.98      0.93     11618
       True       0.79      0.38      0.51      2448

avg / total       0.87      0.87      0.86     14066



In [34]:
# Spot-check the first feature dict to see which derived keys were added.
print(x_data[0])

{'word_number_of_syllables': 1, 'word_number_in_turn': '1', 'DIST_END_TURN': 0, 'Recent_Implicit_Mention_PoS': '', 'POS_TURN_BIGRAM_RIGHT': 'NN/END', 'Recent_Explicit_Mention_Syntactic_Function': '', 'Recent_Explicit_Mention_PoS': '', 'POS_WORD_BIGRAM_RIGHT': 'yup/END', 'POS_TURN_BIGRAM_LEFT': 'BEGIN/NN', 'Stanford_PoS': 'NN', 'total_number_of_words_in_turn': '1', 'word_number_in_task': '1', 'syntactic_function': 'ROOT', 'Most_Recent_Mention_Syntactic_Function': '', 'total_number_of_words_in_task': '50', 'Recent_Implicit_Mention_Syntactic_Function': '', 'word': 'yup', 'POS_WORD_BIGRAM_LEFT': 'BEGIN/yup', 'Most_Recent_Mention_PoS': ''}


## With pos bigram features, with is_stutter
Note: pos trigrams were tested, but did not help accuracy. Given both right and left bigrams, trigrams may not capture any additional information.

In [39]:
# Same CSV columns as the bigram experiment; IS_STUTTER is added on top of the
# derived turn-distance and bigram features.
features = [
    'word',
    'word_number_of_syllables',
    'word_number_in_turn',
    'word_number_in_task',
    'total_number_of_words_in_turn',
    'total_number_of_words_in_task',
    'Stanford_PoS',
    'syntactic_function',
    'Most_Recent_Mention_Syntactic_Function',
    'Recent_Explicit_Mention_Syntactic_Function',
    'Recent_Implicit_Mention_Syntactic_Function',
    'Most_Recent_Mention_PoS',
    'Recent_Explicit_Mention_PoS',
    'Recent_Implicit_Mention_PoS',
]

# Only these columns are parsed as int by generate_data; the rest stay strings.
continuous_feats = ['word_number_of_syllables']

x_data, y_data = generate_data(FILE_NAME, features, continuous_feats)
# Each helper mutates x_data in place, adding its derived key(s) to every row.
add_dist_end_turn(x_data)
add_pos_bigram(x_data)
add_word_bigram(x_data)
add_is_stutter(x_data)

# Same 80/20 split and seed as the other experiment cells.
X_train, X_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=2557)

# Tune each classifier on the training split, then report held-out metrics
# (this cell also prints plain accuracy).
for (clf, param_grid) in clf_map:
    pipeline = Pipeline([
        ('dictvec', dict_vectorizer),
    #         ('selector', select_percentile),
        ('clf', clf)
    ])

    best_score, best_model = classify_my_model(pipeline, param_grid, X_train, y_train, clf.__class__.__name__, scorer=fdot25_scorer)
    y_pred = best_model.predict(X_test)
    print(metrics.classification_report(y_test, y_pred))
    print(metrics.accuracy_score(y_test, y_pred))

################################### LinearSVC ###################################
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:   32.0s remaining:   21.3s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   41.3s finished


Train time: 51.409s
Best score: 0.732
	clf__C: 0.1
	clf__loss: 'hinge'
	clf__penalty: 'l2'
             precision    recall  f1-score   support

      False       0.91      0.96      0.93     11618
       True       0.75      0.53      0.62      2448

avg / total       0.88      0.89      0.88     14066

0.8876013081188682
################################### LogisticRegression ###################################
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    7.0s remaining:   10.5s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    8.9s finished


Train time: 13.845s
Best score: 0.740
	clf__C: 0.1
	clf__fit_intercept: True
	clf__penalty: 'l2'
             precision    recall  f1-score   support

      False       0.90      0.97      0.93     11618
       True       0.77      0.52      0.62      2448

avg / total       0.88      0.89      0.88     14066

0.8885966159533627
################################### RandomForestClassifier ###################################
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:  1.5min remaining:   58.1s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  1.9min finished


Train time: 161.314s
Best score: 0.739
	clf__max_features: 'log2'
	clf__n_estimators: 20
             precision    recall  f1-score   support

      False       0.88      0.98      0.93     11618
       True       0.77      0.39      0.52      2448

avg / total       0.86      0.87      0.86     14066

0.8743068391866913


# Adding word embedding features

In [68]:
# Location of the pre-trained GloVe text file (one "<token> <v1> ... <vN>"
# record per line). NOTE(review): absolute, machine-specific path.
EMBEDDINGS_FILE = "/mnt/e/word2vec/glove.twitter.27B.200d.txt"
# EMBEDDING_DIM = 50
# Load the embeddings into a dict mapping token -> float32 vector.
gembeddings_index = {}
# Decode explicitly as UTF-8: the vocabulary contains non-ASCII tokens, and
# the default encoding is locale-dependent, so relying on it can raise
# UnicodeDecodeError or silently corrupt keys on other machines.
with open(EMBEDDINGS_FILE, 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split(' ')
        word = values[0]
        gembedding = np.asarray(values[1:], dtype='float32')
        gembeddings_index[word] = gembedding

print('G Word embeddings:', len(gembeddings_index))

G Word embeddings: 1193514


In [82]:
def add_embeddings(x_data, embeddings_d):
    """Add one 'embedding_dim<k>' feature per embedding dimension, in place.

    Args:
        x_data: List of feature dicts; each needs a 'word' entry.
        embeddings_d: Mapping from token to an indexable vector. All vectors
            are assumed to share one length.

    Each word is looked up as written first and then lower-cased, the form
    add_embedding_cluster uses, so capitalised words can still match a
    lower-cased vocabulary. Words found neither way get an all-zero vector.
    Returns None.

    Raises:
        KeyError: if ``embeddings_d`` is empty, since the vector length
            cannot be determined.
    """
    # Infer the dimensionality from 'the' when present (the original probe),
    # otherwise from any entry, so vocabularies without 'the' still work.
    probe = embeddings_d.get('the')
    if probe is None:
        probe = next(iter(embeddings_d.values()), None)
    if probe is None:
        raise KeyError('embeddings_d is empty; cannot infer the embedding dimension')
    dim_len = len(probe)

    zero_vector = [0] * dim_len
    for row in x_data:
        word = row['word']
        vector = embeddings_d.get(word)
        if vector is None:
            vector = embeddings_d.get(word.lower(), zero_vector)

        for dim in range(dim_len):
            row['embedding_dim{}'.format(dim)] = vector[dim]


def add_embedding_cluster(x_data, embeddings_d, sphere=True, clusters=10):
    """Cluster the rows' word embeddings and store the cluster id on each row.

    Each row's lower-cased ``'word'`` is mapped to its embedding (all zeros
    when the word is unknown), the resulting vectors are clustered, and the
    cluster index is written in place to ``row['embedding_cluster']`` as a
    string.

    :param x_data: list of feature dicts, each with a ``'word'`` key; mutated in place
    :param embeddings_d: mapping word -> sequence of numbers, all of equal length
    :param sphere: if True use ``SphericalKMeans``, otherwise ``KMeans``
    :param clusters: number of clusters to fit
    :return: the fitted clustering estimator
    :raises ValueError: if ``embeddings_d`` is empty
    """
    if not embeddings_d:
        raise ValueError('embeddings_d is empty; cannot determine the embedding dimension')

    # Use 'the' as the reference entry when present (as before), otherwise any
    # entry: the dimension no longer depends on one specific vocabulary item.
    if 'the' in embeddings_d:
        reference = embeddings_d['the']
    else:
        reference = next(iter(embeddings_d.values()))
    dim_len = len(reference)
    zero_vector = [0.0] * dim_len

    # Single lookup per word; unknown words fall back to the zero vector.
    embeddings = [embeddings_d.get(row['word'].lower(), zero_vector) for row in x_data]

    if sphere: # equivalent to cosine similarity
        print('doing spherical kmeans')
        kmeans = SphericalKMeans(n_clusters=clusters, random_state=2557).fit(embeddings)
    else: # Euclidean distance
        kmeans = KMeans(n_clusters=clusters, random_state=2557).fit(embeddings)

    # labels_ is aligned with the rows passed to fit(), i.e. with x_data.
    for row, label in zip(x_data, kmeans.labels_):
        row['embedding_cluster'] = str(label)
    return kmeans

In [70]:
# Columns read from the data file for the embedding-cluster experiment.
features = [
    'word',
    'word_number_of_syllables',
    'word_number_in_turn',
    'word_number_in_task',
    'total_number_of_words_in_turn',
    'total_number_of_words_in_task',
    'Stanford_PoS',
]

# Only the columns listed here are converted to int by generate_data(); every
# other feature keeps its raw string value from the CSV.
continuous_feats = [
    'word_number_of_syllables',
#     'word_number_in_turn',
#     'word_number_in_task',
#     'total_number_of_words_in_turn',
#     'total_number_of_words_in_task',
]

# FILE_NAME is defined in an earlier cell.
x_data, y_data = generate_data(FILE_NAME, features, continuous_feats)
# add_dist_end_turn(x_data)
# add_pos_bigram(x_data)
# add_is_stutter(x_data)

# get_input_mentioned(x_data)
# set_input_num_mentions(x_data)
# get_input_far_back_mentioned(x_data)

# Clustering settings; add_embedding_cluster() writes 'embedding_cluster' into
# every row of x_data in place.
# NOTE(review): the clusters are fit on all rows, before the train/test split below.
NUM_CLUSTERS=50
SPHERE=True
kmeans = add_embedding_cluster(x_data, gembeddings_index, sphere=SPHERE, clusters=NUM_CLUSTERS)

# Fixed random_state keeps the 80/20 split identical across experiment cells.
X_train, X_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=2557)
print(x_data[0])



doing spherical kmeans
{'word_number_of_syllables': 1, 'word_number_in_turn': '1', 'word_number_in_task': '1', 'total_number_of_words_in_task': '50', 'total_number_of_words_in_turn': '1', 'word': 'yup', 'embedding_cluster': 38, 'Stanford_PoS': 'NN'}


In [71]:
# Grid-search every (classifier, param_grid) pair on the training split and
# report held-out performance. clf_map, dict_vectorizer and fdot25_scorer are
# defined in earlier cells.
print("TWITTER 200 CLUSTER-50")
for (clf, param_grid) in clf_map:
    # Feature dicts -> vectorizer -> classifier.
    pipeline = Pipeline([
        ('dictvec', dict_vectorizer),
        ('clf', clf)
    ])

    best_score, best_model = classify_my_model(pipeline, param_grid, X_train, y_train, clf.__class__.__name__, scorer=fdot25_scorer)
    # Evaluate the selected model on the untouched test split.
    y_pred = best_model.predict(X_test)
    print(metrics.classification_report(y_test, y_pred))
    print(metrics.accuracy_score(y_test, y_pred))

TWITTER 200 CLUSTER-50
################################### LinearSVC ###################################
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:   25.1s remaining:   16.7s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   30.0s finished


Train time: 42.392s
Best score: 0.667
	clf__C: 0.1
	clf__loss: 'squared_hinge'
	clf__penalty: 'l2'
             precision    recall  f1-score   support

      False       0.88      0.97      0.92     11618
       True       0.70      0.38      0.49      2448

avg / total       0.85      0.86      0.85     14066

0.8637850135077492
################################### LogisticRegression ###################################
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    5.2s remaining:    7.9s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    7.1s finished


Train time: 11.345s
Best score: 0.678
	clf__C: 0.1
	clf__fit_intercept: True
	clf__penalty: 'l2'
             precision    recall  f1-score   support

      False       0.88      0.97      0.92     11618
       True       0.73      0.34      0.46      2448

avg / total       0.85      0.86      0.84     14066

0.8631451727570028
################################### RandomForestClassifier ###################################
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:   25.9s remaining:   17.3s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   35.5s finished


Train time: 52.696s
Best score: 0.667
	clf__max_features: 'log2'
	clf__n_estimators: 20
             precision    recall  f1-score   support

      False       0.87      0.98      0.92     11618
       True       0.72      0.30      0.42      2448

avg / total       0.84      0.86      0.83     14066

0.857457699417034


## Additional Syntactic Features
Not extensively tested -- future work should begin here

In [67]:
# Columns read from the data file for the syntactic-feature experiment:
# the baseline lexical/positional columns, the mention-related columns, and
# the tree/constituent columns.
features = [
    'word',
    'word_number_of_syllables',
    'word_number_in_turn',
    'word_number_in_task',
    'total_number_of_words_in_turn',
    'total_number_of_words_in_task',
    'Stanford_PoS',
    'syntactic_function',
    'Most_Recent_Mention_Syntactic_Function',
    'Recent_Explicit_Mention_Syntactic_Function',
    'Recent_Implicit_Mention_Syntactic_Function',
    'Most_Recent_Mention_PoS',
    'Recent_Explicit_Mention_PoS',
    'Recent_Implicit_Mention_PoS',
    
    'tree_depth',
    'tree_width',
    'word_depth',
    'constituent_width',
    'constituent_label',
    'constituent_forward_position',
    'constituent_backward_position'
]

# Columns converted to int by generate_data(); empty cells in these columns
# are replaced by the `impute` value passed below.
continuous_feats = [
    'word_number_of_syllables',
    'tree_depth',
    'tree_width',
    'word_depth',
    'constituent_width',
    'constituent_forward_position',
    'constituent_backward_position'
]
# impute=-1 marks missing numeric values; FILE_NAME is defined in an earlier cell.
x_data, y_data = generate_data(FILE_NAME, features, continuous_feats, impute=-1)
# Feature helpers defined in earlier cells; each is called for its effect on x_data.
add_dist_end_turn(x_data)
add_pos_bigram(x_data)
add_word_bigram(x_data)
add_is_stutter(x_data)

# Fixed random_state keeps the 80/20 split identical across experiment cells.
X_train, X_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=2557)

print(x_data[0])
# Grid-search every (classifier, param_grid) pair on the training split and
# report held-out performance. clf_map, dict_vectorizer and fdot25_scorer are
# defined in earlier cells.
for (clf, param_grid) in clf_map:
    pipeline = Pipeline([
        ('dictvec', dict_vectorizer),
    #         ('selector', select_percentile),
        ('clf', clf)
    ])

    # save_model=True: the best estimator is written to "<model name>_best.pkl"
    # (see the "saving model to..." lines in this cell's output).
    best_score, best_model = classify_my_model(pipeline, param_grid, X_train, y_train,
                                               clf.__class__.__name__, scorer=fdot25_scorer,
                                               save_model=True)
    # Evaluate the selected model on the untouched test split.
    y_pred = best_model.predict(X_test)
    print(metrics.classification_report(y_test, y_pred))
    print(metrics.accuracy_score(y_test, y_pred))

{'word_number_in_turn': '1', 'POS_WORD_BIGRAM_RIGHT': 'yup/END', 'tree_depth': 4, 'constituent_backward_position': 0, 'Recent_Explicit_Mention_PoS': '', 'Stanford_PoS': 'NN', 'constituent_width': 1, 'constituent_label': 'NP', 'syntactic_function': 'ROOT', 'Most_Recent_Mention_Syntactic_Function': '', 'Most_Recent_Mention_PoS': '', 'Recent_Implicit_Mention_Syntactic_Function': '', 'POS_WORD_BIGRAM_LEFT': 'BEGIN/yup', 'word_number_of_syllables': 1, 'DIST_END_TURN': 0, 'POS_TURN_BIGRAM_RIGHT': 'NN/END', 'total_number_of_words_in_turn': '1', 'Recent_Implicit_Mention_PoS': '', 'constituent_forward_position': 0, 'word_depth': 3, 'Recent_Explicit_Mention_Syntactic_Function': '', 'word_number_in_task': '1', 'POS_TURN_BIGRAM_LEFT': 'BEGIN/NN', 'total_number_of_words_in_task': '50', 'word': 'yup', 'tree_width': 1, 'IS_STUTTER': False}
################################### LinearSVC ###################################
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:   39.0s remaining:   26.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   51.7s finished


Train time: 67.801s
Best score: 0.728
	clf__C: 0.1
	clf__loss: 'hinge'
	clf__penalty: 'l2'
saving model toLinearSVC_best.pkl
             precision    recall  f1-score   support

      False       0.91      0.96      0.93     11618
       True       0.75      0.53      0.62      2448

avg / total       0.88      0.89      0.88     14066

0.8880989620361155
################################### LogisticRegression ###################################
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:   11.3s remaining:   17.0s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:   13.4s finished


Train time: 20.460s
Best score: 0.739
	clf__C: 0.1
	clf__fit_intercept: True
	clf__penalty: 'l2'
saving model toLogisticRegression_best.pkl
             precision    recall  f1-score   support

      False       0.90      0.97      0.93     11618
       True       0.76      0.52      0.62      2448

avg / total       0.88      0.89      0.88     14066

0.887814588369117
################################### RandomForestClassifier ###################################
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:  1.3min remaining:   53.3s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  1.8min finished


Train time: 145.058s
Best score: 0.733
	clf__max_features: 'auto'
	clf__n_estimators: 20
saving model toRandomForestClassifier_best.pkl
             precision    recall  f1-score   support

      False       0.89      0.97      0.93     11618
       True       0.77      0.44      0.56      2448

avg / total       0.87      0.88      0.87     14066

0.8807052466941562


## Accent Classifier functions

In [74]:
# adapted get_labels() from accent_classifier.ipynb

def get_labels(df, x_data=None):
    '''
    Turn the pitch-accent column of df into binary labels.

    A row is labelled 0 when its 'word_tobi_pitch_accent' value is "*?" or "_",
    and 1 otherwise.

    :param df: DataFrame with 'word_tobi_break_index' and 'word_tobi_pitch_accent' columns
    :param x_data: optional list of feature dicts; when non-empty, the label count is
                   printed and each row receives its label under 'accent_label0'
                   (rows are paired with labels in order, stopping at the shorter one)
    :return: numpy array of the 0/1 labels, one per row of df
    '''
    # Both columns are selected up front, so a missing column still fails here.
    accent_columns = df[['word_tobi_break_index', 'word_tobi_pitch_accent']]

    y_labels = [
        0 if (accent == "*?" or accent == "_") else 1
        for accent in accent_columns['word_tobi_pitch_accent']
    ]

    if x_data:
        print(len(y_labels))
        for row, label in zip(x_data, y_labels):
            row["accent_label0"] = label

    return np.array(y_labels)


In [75]:
from itertools import islice

def iter_window(seq, n=2):
    """Yield successive overlapping windows of width n over an iterable.

    s -> (s0, s1, ..., s[n-1]), (s1, s2, ..., sn), ...
    Nothing is yielded when the iterable has fewer than n items.
    """
    stream = iter(seq)
    window = tuple(islice(stream, n))
    if len(window) != n:
        # A short first window means the stream is already exhausted.
        return
    yield window
    for item in stream:
        # Slide by one: drop the oldest item, append the newest.
        window = window[1:] + (item,)
        yield window

def get_label_window(labels, x_data=None, window_size=2, prev=0):
    '''
    Build, for every position in labels, a fixed-width window of neighbouring labels.

    The window for position i covers labels[i - prev : i - prev + window_size];
    positions that fall outside the sequence are filled with 0. Because the
    sequence is zero-padded before slicing, inputs shorter than window_size are
    handled as well.

    :param labels: sequence of binary values generated by get_labels()
    :param x_data: the data list; if given (and non-empty), each row i receives the
                   features accent<-prev> ... accent<window_size - prev - 1>, where the
                   number is the offset of the label relative to the current row
                   (ex. accent-1 is the accent of the row before, accent0 the row itself)
    :param window_size: int size of window (must be >= 1)
    :param prev: int offset of start (ex. prev=1 means looks 1 back; must be >= 0)
    :return: numpy array with one window per row; len(labels) rows whenever
             prev < window_size
    :raises ValueError: if window_size < 1 or prev < 0
    :raises IndexError: if x_data has more rows than there are windows
    '''
    if window_size < 1:
        raise ValueError('window_size must be at least 1')
    if prev < 0:
        raise ValueError('prev must be non-negative')

    labels = list(labels)
    # Zero-pad so every position has a full window: `prev` slots before the
    # sequence, and whatever is needed after it to complete the last window.
    following = max(0, window_size - 1 - prev)
    padded = [0] * prev + labels + [0] * following
    labels_windows = [
        padded[start:start + window_size]
        for start in range(len(padded) - window_size + 1)
    ]

    if x_data:
        for i in range(len(x_data)):
            window = labels_windows[i]
            # Offsets start at -prev, so offset 0 is the row's own label.
            for offset, value in enumerate(window, -prev):
                x_data[i]['accent{}'.format(offset)] = value

    labels_windows = np.array(labels_windows)
    return labels_windows


In [109]:
# Columns read from the data file for the gold pitch-accent experiment; the
# tree/constituent columns are disabled here.
features = [
    'word',
    'word_number_of_syllables',
    'word_number_in_turn',
    'word_number_in_task',
    'total_number_of_words_in_turn',
    'total_number_of_words_in_task',
    'Stanford_PoS',
    'syntactic_function',
    
#     'tree_depth',
#     'tree_width',
#     'word_depth',
#     'constituent_width',
#     'constituent_label',
#     'constituent_forward_position',
#     'constituent_backward_position'
]

# Only the columns listed here are converted to int by generate_data(); every
# other feature keeps its raw string value from the CSV.
continuous_feats = [
    'word_number_of_syllables',
#     'tree_depth',
#     'tree_width',
#     'word_depth',
#     'constituent_width',
#     'constituent_forward_position',
#     'constituent_backward_position'
]

# FILE_NAME is defined in an earlier cell.
x_data, y_data = generate_data(FILE_NAME, features, continuous_feats)
# Feature helpers defined in earlier cells; each is called for its effect on x_data.
add_dist_end_turn(x_data)
add_pos_bigram(x_data)
# add_word_bigram(x_data)
add_is_stutter(x_data)

# get_input_mentioned(x_data, df)
# set_input_num_mentions(x_data, df)
# get_input_far_back_mentioned(x_data, df)

# kmeans = add_embedding_cluster(x_data, gembeddings_index,sphere=True, clusters=50)

# Gold accent labels for the previous, current and next word are added to each
# row as 'accent-1', 'accent0' and 'accent1' (window_size=3, prev=1).
# NOTE(review): `df` is not created in this cell; it is presumably the same
# data file loaded as a DataFrame with rows in the same order as x_data -- confirm.
labels = get_labels(df)
get_label_window(labels, x_data=x_data, window_size=3, prev=1)
# Fixed random_state keeps the 80/20 split identical across experiment cells.
X_train, X_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=2557)
print(x_data[0])

# Grid-search every (classifier, param_grid) pair on the training split and
# report held-out performance. clf_map, dict_vectorizer and fdot25_scorer are
# defined in earlier cells.
print("everything")
for (clf, param_grid) in clf_map:
    pipeline = Pipeline([
        ('dictvec', dict_vectorizer),
        ('clf', clf)
    ])

    # The model name gets a 'pitch_accent_gold' suffix; with save_model=True it
    # also ends up in the saved file name (see this cell's output).
    best_score, best_model = classify_my_model(pipeline, param_grid, X_train, y_train,
                                               clf.__class__.__name__ +'pitch_accent_gold',
                                               scorer=fdot25_scorer, save_model=True)
    # Evaluate the selected model on the untouched test split.
    y_pred = best_model.predict(X_test)
    print(metrics.classification_report(y_test, y_pred))
    print(metrics.accuracy_score(y_test, y_pred))

{'word_number_of_syllables': 1, 'word_number_in_turn': '1', 'DIST_END_TURN': 0, 'accent-1': 0, 'POS_TURN_BIGRAM_RIGHT': 'NN/END', 'accent0': 1, 'POS_TURN_BIGRAM_LEFT': 'BEGIN/NN', 'Stanford_PoS': 'NN', 'total_number_of_words_in_turn': '1', 'word_number_in_task': '1', 'syntactic_function': 'ROOT', 'total_number_of_words_in_task': '50', 'word': 'yup', 'accent1': 1, 'IS_STUTTER': False}
everything
################################### LinearSVCpitch_accent_gold ###################################
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:   26.4s remaining:   17.6s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   32.7s finished


Train time: 42.503s
Best score: 0.741
	clf__C: 0.1
	clf__loss: 'hinge'
	clf__penalty: 'l2'
saving model toLinearSVCpitch_accent_gold_best.pkl
             precision    recall  f1-score   support

      False       0.91      0.97      0.93     11618
       True       0.76      0.52      0.62      2448

avg / total       0.88      0.89      0.88     14066

0.8886677093701123
################################### LogisticRegressionpitch_accent_gold ###################################
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:    5.9s remaining:    8.9s
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:    8.0s finished


Train time: 12.950s
Best score: 0.745
	clf__C: 0.1
	clf__fit_intercept: True
	clf__penalty: 'l2'
saving model toLogisticRegressionpitch_accent_gold_best.pkl
             precision    recall  f1-score   support

      False       0.91      0.97      0.94     11618
       True       0.77      0.52      0.62      2448

avg / total       0.88      0.89      0.88     14066

0.8893786435376084
################################### RandomForestClassifierpitch_accent_gold ###################################
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:   57.1s remaining:   38.1s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  1.4min finished


Train time: 119.580s
Best score: 0.820
	clf__max_features: 'auto'
	clf__n_estimators: 50
saving model toRandomForestClassifierpitch_accent_gold_best.pkl
             precision    recall  f1-score   support

      False       0.91      0.98      0.94     11618
       True       0.87      0.53      0.65      2448

avg / total       0.90      0.90      0.89     14066

0.9035262334707806


## Error Analysis

In [None]:
def show_most_informative_features(vectorizer, clf, n=20):
    """Print the n most negative and n most positive coefficients side by side.

    Each printed line pairs one of the lowest-coefficient features (left) with
    one of the highest-coefficient features (right).

    :param vectorizer: fitted vectorizer exposing ``get_feature_names_out()``
                       (or the legacy ``get_feature_names()``)
    :param clf: fitted linear model; only the first row of ``coef_`` is used
    :param n: number of features to show from each end
    :raises ValueError: if the number of feature names and coefficients differ
    """
    # get_feature_names() was removed from newer scikit-learn releases; prefer
    # the replacement and fall back to the legacy name on old versions.
    get_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
    feature_names = get_names()
    coefs = clf.coef_[0]
    # zip() would silently truncate and pair coefficients with the wrong names.
    if len(feature_names) != len(coefs):
        raise ValueError(
            'vectorizer has %d feature names but clf has %d coefficients; if a '
            'feature-selection step sits between them, mask the names with its '
            'get_support() first' % (len(feature_names), len(coefs)))

    coefs_with_fns = sorted(zip(coefs, feature_names))
    # Lowest n from the front, highest n walking backwards from the end.
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (coef_1, fn_1, coef_2, fn_2))
        
def show_least_informative_features(vectorizer, clf, n=20):
    """Print the n features whose coefficients are closest to zero.

    :param vectorizer: fitted vectorizer exposing ``get_feature_names_out()``
                       (or the legacy ``get_feature_names()``)
    :param clf: fitted linear model; only the first row of ``coef_`` is used
    :param n: number of features to show
    :raises ValueError: if the number of feature names and coefficients differ
    """
    # get_feature_names() was removed from newer scikit-learn releases; prefer
    # the replacement and fall back to the legacy name on old versions.
    get_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
    feature_names = get_names()
    coefs = clf.coef_[0]
    # zip() would silently truncate and pair coefficients with the wrong names.
    if len(feature_names) != len(coefs):
        raise ValueError(
            'vectorizer has %d feature names but clf has %d coefficients; if a '
            'feature-selection step sits between them, mask the names with its '
            'get_support() first' % (len(feature_names), len(coefs)))

    # Ascending by absolute value: smallest-magnitude coefficients first.
    coefs_with_fns = sorted(zip(coefs, feature_names), key=lambda x: abs(x[0]))
    top = coefs_with_fns[:n]
    for (coef_1, fn_1) in top:
        print("\t%.4f\t%-15s" % (coef_1, fn_1))

In [None]:
# NOTE(review): `model` is not defined in the cells visible here -- presumably a
# fitted Pipeline whose step 0 is the vectorizer and step 2 the classifier.
# If step 1 drops features, the vectorizer's names will not line up with coef_ -- confirm.
show_most_informative_features(model.steps[0][1], model.steps[2][1])

In [None]:
# Show the 100 features with the smallest absolute coefficients.
# NOTE(review): relies on `model` (not defined in the cells visible here) having
# the vectorizer at step 0 and the classifier at step 2 -- confirm.
show_least_informative_features(model.steps[0][1], model.steps[2][1], n=100)

In [None]:
# Number of coefficients in the first row of the classifier's coef_.
len(model.steps[2][1].coef_[0])

In [None]:
from collections import defaultdict
def show_feature_importance(vectorizer, clf):
    """Group absolute coefficients by base feature and print each group's mean.

    Vectorized feature names are split on '=' and the part before it is used
    as the group key, so every "<feature>=<value>" column is collected under
    "<feature>"; names without '=' form their own group.

    :param vectorizer: fitted vectorizer exposing ``get_feature_names_out()``
                       (or the legacy ``get_feature_names()``)
    :param clf: fitted linear model; only the first row of ``coef_`` is used
    :return: defaultdict mapping base feature name -> list of absolute coefficients
    :raises ValueError: if the number of feature names and coefficients differ
    """
    # get_feature_names() was removed from newer scikit-learn releases; prefer
    # the replacement and fall back to the legacy name on old versions.
    get_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
    feature_names = get_names()
    coefs = clf.coef_[0]
    # zip() would silently truncate and pair coefficients with the wrong names.
    if len(feature_names) != len(coefs):
        raise ValueError(
            'vectorizer has %d feature names but clf has %d coefficients; if a '
            'feature-selection step sits between them, mask the names with its '
            'get_support() first' % (len(feature_names), len(coefs)))

    feat_count = defaultdict(list)
    for coef_1, fn_1 in zip(coefs, feature_names):
        feat = fn_1.split('=')[0]
        feat_count[feat].append(abs(coef_1))
    for feat, magnitudes in feat_count.items():
        print(feat, sum(magnitudes)/len(magnitudes))
    return feat_count

In [None]:
# Per-feature lists of absolute coefficients, kept for the histograms below.
# NOTE(review): relies on `model` (not defined in the cells visible here) having
# the vectorizer at step 0 and the classifier at step 2 -- confirm.
feat_count = show_feature_importance(model.steps[0][1], model.steps[2][1])

In [None]:
import matplotlib.pyplot as plt

In [None]:
# One histogram per base feature, showing the distribution of its absolute
# coefficient values (as collected in feat_count).
for feat in feat_count.keys():
    # Square-root rule for the number of bins.
    hist, bins = np.histogram(feat_count[feat], bins=int(len(feat_count[feat])**0.5))
    # Bars are 70% of a bin wide and centred on each bin.
    width = 0.7 * (bins[1] - bins[0])
    center = (bins[:-1] + bins[1:]) / 2
    plt.bar(center, hist, align='center', width=width)
    plt.title(feat)
    plt.show()