In [1]:
## Imports
import csv
import sys
import numpy as np
import pickle
from time import time
import re
from spherecluster import SphericalKMeans

from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.cluster import KMeans
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import SelectPercentile, SelectKBest
from sklearn.svm import LinearSVC
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion

from sklearn import metrics

In [2]:
# Main functions

def generate_data(file_name, features, continuous_feats):
    """Load feature dicts and binary break labels from a CSV file.

    Args:
        file_name: Path to a CSV file with a header row.
        features: Column names to copy into each feature dict.
        continuous_feats: Subset of ``features`` whose values are converted
            with ``int``; all other features are kept as the raw strings.

    Returns:
        ``(x_data, y_data)`` where ``x_data`` is a list of ``{feature: value}``
        dicts (one per CSV row) and ``y_data`` is a parallel list of booleans
        that are True when the row's 'word_tobi_break_index' is one of
        "4", "4-" or "4p".

    Raises:
        KeyError: If a requested feature or the label column is missing.
        ValueError: If a continuous feature value cannot be parsed by ``int``.
    """
    break_label = 'word_tobi_break_index'
    break_set = {"4", "4-", "4p"}
    numeric = set(continuous_feats)
    x_data = []
    y_data = []

    # newline='' lets the csv module do its own newline handling, so quoted
    # fields that contain line breaks are read back unchanged.
    with open(file_name, 'r', newline='') as f:
        # DictReader keys each row by header name, avoiding positional
        # column-index mistakes.
        for row in csv.DictReader(f):
            feats = {
                feat: int(row[feat]) if feat in numeric else row[feat]
                for feat in features
            }
            x_data.append(feats)
            y_data.append(row[break_label] in break_set)
    return x_data, y_data

def classify_my_model(pipeline, param_grid, X, y, model_name, scorer='f1', excluded_features=frozenset()):
    """Grid-search ``pipeline`` over ``param_grid`` and report the best model.

    Args:
        pipeline: Estimator (here a scikit-learn Pipeline) to tune.
        param_grid: Parameter grid passed to ``GridSearchCV``.
        X: Training samples as a list of feature dicts.
        y: Training labels, parallel to ``X``.
        model_name: Label printed in the banner line.
        scorer: Scoring argument forwarded to ``GridSearchCV``.
        excluded_features: Feature names to leave out of every sample before
            fitting. The caller's dicts are not modified; filtered copies are
            used instead.

    Returns:
        ``(best_score, best_estimator)`` from the fitted grid search.
    """
    print('#'*35, model_name, '#'*35)

    # Filter into new dicts rather than popping keys from the caller's data,
    # so re-running a cell (or reusing X for another model) sees the same input.
    if excluded_features:
        dropped = set(excluded_features)
        X = [
            {name: value for name, value in entry.items() if name not in dropped}
            for entry in X
        ]

    # cv=5 is what the search has always used; the previously constructed
    # (and time-seeded) StratifiedKFold object was never passed in, so it has
    # been removed.
    gs = GridSearchCV(pipeline,
                      param_grid,
                      scoring=scorer,
                      cv=5,
                      n_jobs=-1,
                      verbose=1)
    t0 = time()
    gs.fit(X, y)
    train_time = time() - t0
    print("Train time: %0.3fs" % train_time)
    print("Best score: %0.3f" % gs.best_score_)
    best_params = gs.best_estimator_.get_params()
    for param_name in sorted(param_grid.keys()):
        print("\t%s: %r" % (param_name, best_params[param_name]))
    return gs.best_score_, gs.best_estimator_
    
#     with open(model_name+'_best_'+type(clf).__name__+".pkl", 'wb') as handle:
#         pickle.dump(best_model, handle)

In [3]:
# list of classifier models

from sklearn.metrics import fbeta_score, make_scorer
# Model-selection metric: F-beta with beta=0.25. Per scikit-learn's
# fbeta_score documentation, beta < 1 gives more weight to precision than recall.
fdot25_scorer = make_scorer(fbeta_score, beta=.25)
# Single DictVectorizer instance used as the 'dictvec' step of every pipeline
# built in the experiment cells below.
dict_vectorizer = DictVectorizer()
# select_percentile = SelectPercentile(percentile=100)

# Each entry is (estimator, param_grid). Grid keys carry the 'clf__' prefix
# because the estimator is registered as the 'clf' step of the pipeline.
# NOTE(review): BernoulliNB and DecisionTreeClassifier are commented out here,
# yet the recorded outputs further down include them — those outputs were
# presumably produced by an earlier version of this list; confirm before citing.
clf_map = [
#     (
#         BernoulliNB(),
#         {
#             'clf__alpha': [.001, .01, .1, 1],
#             'clf__fit_prior': [True, False],
#         }
#     ),
    (
        LinearSVC(),
        {
            'clf__C': [.1, 1, 10, 100],
            'clf__penalty': ['l2'],
            'clf__loss': ['hinge', 'squared_hinge'],
        }
    ),
    (
        LogisticRegression(),
        {
            'clf__penalty': ['l1','l2'],
            'clf__fit_intercept': [True, False],
            'clf__C':[.1, 1, 10, 100],
        }
    ),
    (
        # Fixed random_state so the forest itself is repeatable across runs.
        RandomForestClassifier(random_state=2557),
        {
            'clf__n_estimators': [10, 20],
            'clf__max_features': ["auto", "log2",None]
        }
    ),
#     (
#         DecisionTreeClassifier(random_state=2557),
#         {
#             'clf__criterion': ["gini", "entropy"]
#         }
#     ),

]


In [4]:
# Path (relative to the notebook's working directory) of the CSV that every
# experiment cell passes to generate_data.
FILE_NAME = "../info-status/games-data-20180413.csv"

## Basic Model

In [6]:
# Baseline feature set: the word itself, its syllable count, its position in
# the turn/task, the turn/task lengths, and its Stanford PoS tag.
features = [
    'word',
#     'word_pos_tag',
#     'word_pos_tag_simplified',
    'word_number_of_syllables',
    'word_number_in_turn',
    'word_number_in_task',
    'total_number_of_words_in_turn',
    'total_number_of_words_in_task',
    'Stanford_PoS'
]

# Only these columns are converted to numbers by generate_data; every other
# feature is kept as the raw CSV string.
# NOTE(review): the position/length counts are therefore strings, which
# DictVectorizer presumably treats as categorical values — confirm this is intended.
continuous_feats = ['word_number_of_syllables',]

x_data, y_data = generate_data(FILE_NAME, features, continuous_feats)
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split identical across the experiment cells.
X_train, X_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=2557)


In [7]:
# Majority-class baseline: predict 0 (no break) for every test row and report
# per-class precision/recall/F1 as a reference point for the trained models.
y_pred = [0] * len(X_test)
print(metrics.classification_report(y_test, y_pred))

             precision    recall  f1-score   support

      False       0.83      1.00      0.90     11618
       True       0.00      0.00      0.00      2448

avg / total       0.68      0.83      0.75     14066



  'precision', 'predicted', average, warn_for)


In [7]:
# For each classifier in clf_map: vectorize the feature dicts, grid-search the
# classifier's hyper-parameters on the training split (scored with
# fdot25_scorer), then report the best model's metrics on the held-out test split.
for (clf, param_grid) in clf_map:
    pipeline = Pipeline([
        ('dictvec', dict_vectorizer),
    #         ('selector', select_percentile),
        ('clf', clf)
    ])

    best_score, best_model = classify_my_model(pipeline, param_grid, X_train, y_train, clf.__class__.__name__, scorer=fdot25_scorer)
    y_pred = best_model.predict(X_test)
    print(metrics.classification_report(y_test, y_pred))

################################### BernoulliNB ###################################
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   10.1s finished


Train time: 11.402s
Best score: 0.540
	clf__alpha: 1
	clf__fit_prior: True
             precision    recall  f1-score   support

      False       0.90      0.91      0.90     11618
       True       0.54      0.51      0.52      2448

avg / total       0.84      0.84      0.84     14066

################################### LinearSVC ###################################
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   59.8s finished


Train time: 61.202s
Best score: 0.668
	clf__C: 0.1
	clf__loss: 'squared_hinge'
	clf__penalty: 'l2'
             precision    recall  f1-score   support

      False       0.88      0.97      0.92     11618
       True       0.70      0.38      0.49      2448

avg / total       0.85      0.86      0.85     14066

################################### LogisticRegression ###################################
Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   13.8s
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:  2.2min finished


Train time: 135.619s
Best score: 0.678
	clf__C: 0.1
	clf__fit_intercept: True
	clf__penalty: 'l2'
             precision    recall  f1-score   support

      False       0.88      0.97      0.92     11618
       True       0.73      0.34      0.46      2448

avg / total       0.85      0.86      0.84     14066

################################### DecisionTreeClassifier ###################################
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:   11.8s remaining:    7.9s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   15.6s finished


Train time: 21.193s
Best score: 0.518
	clf__criterion: 'gini'
             precision    recall  f1-score   support

      False       0.89      0.91      0.90     11618
       True       0.53      0.45      0.49      2448

avg / total       0.83      0.83      0.83     14066



## Model with syntactic features

In [9]:
# Baseline feature set plus the word's 'syntactic_function' column.
features = [
    'word',
#     'word_pos_tag',
#     'word_pos_tag_simplified',
    'word_number_of_syllables',
    'word_number_in_turn',
    'word_number_in_task',
    'total_number_of_words_in_turn',
    'total_number_of_words_in_task',
    'Stanford_PoS',
    'syntactic_function'
]

# Only the syllable count is converted to a number by generate_data.
continuous_feats = ['word_number_of_syllables',]

x_data, y_data = generate_data(FILE_NAME, features, continuous_feats)
# Same 80/20 split and seed as the other experiment cells.
X_train, X_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=2557)

    
# Tune and evaluate every classifier in clf_map on this feature set.
for (clf, param_grid) in clf_map:
    pipeline = Pipeline([
        ('dictvec', dict_vectorizer),
    #         ('selector', select_percentile),
        ('clf', clf)
    ])

    best_score, best_model = classify_my_model(pipeline, param_grid, X_train, y_train, clf.__class__.__name__, scorer=fdot25_scorer)
    y_pred = best_model.predict(X_test)
    print(metrics.classification_report(y_test, y_pred))

################################### BernoulliNB ###################################
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   10.3s finished


Train time: 11.434s
Best score: 0.512
	clf__alpha: 1
	clf__fit_prior: True
             precision    recall  f1-score   support

      False       0.91      0.88      0.90     11618
       True       0.52      0.60      0.56      2448

avg / total       0.84      0.83      0.84     14066

################################### LinearSVC ###################################
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   54.8s finished


Train time: 56.338s
Best score: 0.674
	clf__C: 0.1
	clf__loss: 'squared_hinge'
	clf__penalty: 'l2'
             precision    recall  f1-score   support

      False       0.89      0.96      0.92     11618
       True       0.71      0.42      0.53      2448

avg / total       0.86      0.87      0.86     14066

################################### LogisticRegression ###################################
Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   17.0s
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:  2.0min finished


Train time: 122.417s
Best score: 0.687
	clf__C: 0.1
	clf__fit_intercept: True
	clf__penalty: 'l2'
             precision    recall  f1-score   support

      False       0.88      0.97      0.92     11618
       True       0.72      0.39      0.50      2448

avg / total       0.85      0.87      0.85     14066

################################### DecisionTreeClassifier ###################################
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:    8.6s remaining:    5.7s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   12.3s finished


Train time: 18.527s
Best score: 0.527
	clf__criterion: 'gini'
             precision    recall  f1-score   support

      False       0.89      0.91      0.90     11618
       True       0.53      0.47      0.50      2448

avg / total       0.83      0.84      0.83     14066



## Model with syntactic features and current and previous mention information

In [39]:
# Syntactic feature set plus the syntactic function and PoS columns recorded
# for the most recent, recent explicit, and recent implicit mentions.
features = [
    'word',
    'word_number_of_syllables',
    'word_number_in_turn',
    'word_number_in_task',
    'total_number_of_words_in_turn',
    'total_number_of_words_in_task',
    'Stanford_PoS',
    'syntactic_function',
    'Most_Recent_Mention_Syntactic_Function',
    'Recent_Explicit_Mention_Syntactic_Function',
    'Recent_Implicit_Mention_Syntactic_Function',
    'Most_Recent_Mention_PoS',
    'Recent_Explicit_Mention_PoS',
    'Recent_Implicit_Mention_PoS',
]

# Only the syllable count is converted to a number by generate_data.
continuous_feats = ['word_number_of_syllables',]

x_data, y_data = generate_data(FILE_NAME, features, continuous_feats)
# Same 80/20 split and seed as the other experiment cells.
X_train, X_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=2557)

    
# Tune and evaluate every classifier in clf_map on this feature set.
for (clf, param_grid) in clf_map:
    pipeline = Pipeline([
        ('dictvec', dict_vectorizer),
    #         ('selector', select_percentile),
        ('clf', clf)
    ])

    best_score, best_model = classify_my_model(pipeline, param_grid, X_train, y_train, clf.__class__.__name__, scorer=fdot25_scorer)
    y_pred = best_model.predict(X_test)
    print(metrics.classification_report(y_test, y_pred))

################################### BernoulliNB ###################################
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   20.8s finished


Train time: 23.152s
Best score: 0.495
	clf__alpha: 0.001
	clf__fit_prior: True
             precision    recall  f1-score   support

      False       0.91      0.88      0.89     11618
       True       0.50      0.59      0.54      2448

avg / total       0.84      0.83      0.83     14066

################################### LinearSVC ###################################
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  1.3min finished


Train time: 82.325s
Best score: 0.684
	clf__C: 0.1
	clf__loss: 'hinge'
	clf__penalty: 'l2'
             precision    recall  f1-score   support

      False       0.87      0.97      0.92     11618
       True       0.73      0.33      0.46      2448

avg / total       0.85      0.86      0.84     14066

################################### LogisticRegression ###################################
Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   37.5s
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:  2.9min finished


Train time: 179.075s
Best score: 0.690
	clf__C: 0.1
	clf__fit_intercept: True
	clf__penalty: 'l2'
             precision    recall  f1-score   support

      False       0.88      0.97      0.92     11618
       True       0.72      0.39      0.51      2448

avg / total       0.85      0.87      0.85     14066

################################### RandomForestClassifier ###################################
Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  3.5min finished


Train time: 229.150s
Best score: 0.678
	clf__max_features: 'log2'
	clf__n_estimators: 20
             precision    recall  f1-score   support

      False       0.87      0.98      0.92     11618
       True       0.74      0.31      0.44      2448

avg / total       0.85      0.86      0.84     14066

################################### DecisionTreeClassifier ###################################
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:   16.6s remaining:   11.1s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   21.6s finished


Train time: 31.051s
Best score: 0.531
	clf__criterion: 'gini'
             precision    recall  f1-score   support

      False       0.89      0.92      0.90     11618
       True       0.54      0.47      0.50      2448

avg / total       0.83      0.84      0.83     14066



In [5]:
def get_input_mentioned(X_data):
    """Add a boolean 'mentioned' entry to every row, in place.

    The flag is True exactly when the row's 'Most_Recent_Mention' value is
    truthy (e.g. a non-empty string).
    """
    for row in X_data:
        row['mentioned'] = bool(row['Most_Recent_Mention'])

def set_input_num_mentions(X_data):
    """Replace each row's 'Number_Of_Coref_Mentions' with an int, in place.

    A falsy value (such as an empty string) becomes 0; anything else is
    converted with ``int``.
    """
    key = 'Number_Of_Coref_Mentions'
    for row in X_data:
        raw = row[key]
        if raw:
            row[key] = int(raw)
        else:
            row[key] = 0

def get_input_far_back_mentioned(X_data):
    """Add 'time_between_mentions' to every row, in place.

    The value is the row's 'word_end_time' minus its 'Most_Recent_Mention'
    time, both parsed as floats. When 'Most_Recent_Mention' is falsy the word's
    own end time is used as the reference, so the difference is zero.
    """
    for row in X_data:
        word_end = float(row['word_end_time'])
        previous = row['Most_Recent_Mention']
        # No earlier mention recorded: measure against the word itself.
        reference = word_end if not previous else float(previous)
        row['time_between_mentions'] = word_end - reference


In [40]:
# Previous feature set plus the raw columns needed to derive coreference
# features: 'Most_Recent_Mention', 'Number_Of_Coref_Mentions' and 'word_end_time'.
features = [
    'word',
    'word_number_of_syllables',
    'word_number_in_turn',
    'word_number_in_task',
    'total_number_of_words_in_turn',
    'total_number_of_words_in_task',
    'Stanford_PoS',
    'syntactic_function',
    'Most_Recent_Mention_Syntactic_Function',
    'Recent_Explicit_Mention_Syntactic_Function',
    'Recent_Implicit_Mention_Syntactic_Function',
    'Most_Recent_Mention_PoS',
    'Recent_Explicit_Mention_PoS',
    'Recent_Implicit_Mention_PoS',
    'Most_Recent_Mention',
    'Number_Of_Coref_Mentions',
    'word_end_time'
]

# Only the syllable count is converted to a number by generate_data.
continuous_feats = ['word_number_of_syllables']

x_data, y_data = generate_data(FILE_NAME, features, continuous_feats)
# Derived features (each call modifies x_data in place): a 'mentioned' flag,
# an integer mention count, and 'time_between_mentions'.
get_input_mentioned(x_data)
set_input_num_mentions(x_data)
get_input_far_back_mentioned(x_data)

# The raw columns were only needed to compute the derived features above, so
# they are removed before training.
excluded_features = ['Most_Recent_Mention','word_end_time']
for entry in x_data:
    for feature in excluded_features:
        if feature in entry:
            entry.pop(feature)
# Show one sample as a sanity check of the final feature dict.
print(x_data[0])

# Same 80/20 split and seed as the other experiment cells.
X_train, X_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=2557)

    
# Tune and evaluate every classifier in clf_map on this feature set.
for (clf, param_grid) in clf_map:
    pipeline = Pipeline([
        ('dictvec', dict_vectorizer),
    #         ('selector', select_percentile),
        ('clf', clf)
    ])

    best_score, best_model = classify_my_model(pipeline, param_grid, X_train, y_train, clf.__class__.__name__, scorer=fdot25_scorer)
    y_pred = best_model.predict(X_test)
    print(metrics.classification_report(y_test, y_pred))

{'time_between_mentions': 0.0, 'Recent_Explicit_Mention_PoS': '', 'Recent_Explicit_Mention_Syntactic_Function': '', 'word_number_in_task': '1', 'word': 'yup', 'total_number_of_words_in_turn': '1', 'Most_Recent_Mention_PoS': '', 'Recent_Implicit_Mention_Syntactic_Function': '', 'word_number_in_turn': '1', 'total_number_of_words_in_task': '50', 'Recent_Implicit_Mention_PoS': '', 'Number_Of_Coref_Mentions': 0, 'Stanford_PoS': 'NN', 'word_number_of_syllables': 1.0, 'mentioned': False, 'syntactic_function': 'ROOT', 'Most_Recent_Mention_Syntactic_Function': ''}
################################### BernoulliNB ###################################
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   21.8s finished


Train time: 24.116s
Best score: 0.488
	clf__alpha: 0.001
	clf__fit_prior: True
             precision    recall  f1-score   support

      False       0.91      0.88      0.89     11618
       True       0.49      0.56      0.52      2448

avg / total       0.83      0.82      0.83     14066

################################### LinearSVC ###################################
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  2.3min finished


Train time: 147.904s
Best score: 0.665
	clf__C: 0.1
	clf__loss: 'hinge'
	clf__penalty: 'l2'
             precision    recall  f1-score   support

      False       0.87      0.97      0.92     11618
       True       0.73      0.33      0.46      2448

avg / total       0.85      0.86      0.84     14066

################################### LogisticRegression ###################################
Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   41.4s
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:  2.6min finished


Train time: 160.188s
Best score: 0.687
	clf__C: 0.1
	clf__fit_intercept: True
	clf__penalty: 'l2'
             precision    recall  f1-score   support

      False       0.88      0.97      0.92     11618
       True       0.72      0.39      0.51      2448

avg / total       0.85      0.87      0.85     14066

################################### RandomForestClassifier ###################################
Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  3.6min finished


Train time: 228.650s
Best score: 0.682
	clf__max_features: 'auto'
	clf__n_estimators: 20
             precision    recall  f1-score   support

      False       0.88      0.97      0.92     11618
       True       0.74      0.36      0.49      2448

avg / total       0.85      0.87      0.85     14066

################################### DecisionTreeClassifier ###################################
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:   14.5s remaining:    9.7s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   19.1s finished


Train time: 27.965s
Best score: 0.539
	clf__criterion: 'gini'
             precision    recall  f1-score   support

      False       0.89      0.92      0.90     11618
       True       0.54      0.47      0.51      2448

avg / total       0.83      0.84      0.83     14066



## Adding POS n-gram, word-bigram, turn-position and stutter features

In [8]:
def add_dist_end_turn(x_data):
    """Add 'DIST_END_TURN' to every row, in place.

    The value is the number of words remaining in the turn after this word:
    'total_number_of_words_in_turn' minus 'word_number_in_turn', both parsed
    as ints.
    """
    DIST_END_TURN = "DIST_END_TURN"
    for row in x_data:
        turn_length = int(row['total_number_of_words_in_turn'])
        position = int(row['word_number_in_turn'])
        row[DIST_END_TURN] = turn_length - position
        
def add_pos_bigram(x_data, left=True, right=True):
    """Add within-turn part-of-speech bigram features to every row, in place.

    Rows are assumed to be in corpus order, so the neighbouring list entries
    are the neighbouring words of the same turn.

    Args:
        x_data: List of feature dicts containing 'Stanford_PoS',
            'word_number_in_turn' and (when ``right`` is true)
            'total_number_of_words_in_turn'.
        left: When true, set 'POS_TURN_BIGRAM_LEFT' to "<prev PoS>/<PoS>",
            using "BEGIN" as the previous tag for the first word of a turn.
        right: When true, set 'POS_TURN_BIGRAM_RIGHT' to "<PoS>/<next PoS>",
            using "END" as the next tag for the last word of a turn.

    If the list starts or ends in the middle of a turn, the missing neighbour
    is treated as BEGIN/END. Previously the first row silently wrapped around
    to the last row and the last row raised IndexError.
    """
    POS_TURN_BIGRAM_LEFT = "POS_TURN_BIGRAM_LEFT"
    POS_TURN_BIGRAM_RIGHT = "POS_TURN_BIGRAM_RIGHT"

    last_index = len(x_data) - 1
    for i, row in enumerate(x_data):
        pos = row['Stanford_PoS']
        if left:
            # i == 0 guards against x_data[-1] wrapping to the end of the list.
            if row['word_number_in_turn'] == '1' or i == 0:
                row[POS_TURN_BIGRAM_LEFT] = 'BEGIN/' + pos
            else:
                row[POS_TURN_BIGRAM_LEFT] = x_data[i - 1]['Stanford_PoS'] + "/" + pos
        if right:
            word_number = int(row['word_number_in_turn'])
            total_word_number = int(row['total_number_of_words_in_turn'])
            # i == last_index guards against indexing past the end of the list.
            if word_number == total_word_number or i == last_index:
                row[POS_TURN_BIGRAM_RIGHT] = pos + "/END"
            else:
                row[POS_TURN_BIGRAM_RIGHT] = pos + "/" + x_data[i + 1]['Stanford_PoS']

def add_word_bigram(x_data, left=True, right=True):
    """Add within-turn word bigram features to every row, in place.

    Rows are assumed to be in corpus order, so the neighbouring list entries
    are the neighbouring words of the same turn.

    Args:
        x_data: List of feature dicts containing 'word',
            'word_number_in_turn' and (when ``right`` is true)
            'total_number_of_words_in_turn'.
        left: When true, set 'POS_WORD_BIGRAM_LEFT' to "<prev word>/<word>",
            using "BEGIN" as the previous word for the first word of a turn.
        right: When true, set 'POS_WORD_BIGRAM_RIGHT' to "<word>/<next word>",
            using "END" as the next word for the last word of a turn.

    If the list starts or ends in the middle of a turn, the missing neighbour
    is treated as BEGIN/END. Previously the first row silently wrapped around
    to the last row and the last row raised IndexError.
    """
    # The stored feature names keep their historical "POS_" prefix so existing
    # feature dicts and models remain compatible.
    WORD_BIGRAM_LEFT = "POS_WORD_BIGRAM_LEFT"
    WORD_BIGRAM_RIGHT = "POS_WORD_BIGRAM_RIGHT"

    last_index = len(x_data) - 1
    for i, row in enumerate(x_data):
        word = row['word']
        if left:
            # i == 0 guards against x_data[-1] wrapping to the end of the list.
            if row['word_number_in_turn'] == '1' or i == 0:
                row[WORD_BIGRAM_LEFT] = 'BEGIN/' + word
            else:
                row[WORD_BIGRAM_LEFT] = x_data[i - 1]['word'] + "/" + word
        if right:
            word_number = int(row['word_number_in_turn'])
            total_word_number = int(row['total_number_of_words_in_turn'])
            # i == last_index guards against indexing past the end of the list.
            if word_number == total_word_number or i == last_index:
                row[WORD_BIGRAM_RIGHT] = word + "/END"
            else:
                row[WORD_BIGRAM_RIGHT] = word + "/" + x_data[i + 1]['word']
        
def add_pos_trigram(x_data):
    """Add a within-turn part-of-speech trigram feature to every row, in place.

    Sets 'POS_TURN_TRIGRAM' to "<prev PoS>/<PoS>/<next PoS>", where the
    previous tag is "BEGIN" for the first word of a turn and the next tag is
    "END" for the last word of a turn. Rows are assumed to be in corpus order.

    The end-of-turn check compares the parsed integers, matching the bigram
    helpers. If the list starts or ends in the middle of a turn, the missing
    neighbour is treated as BEGIN/END. Previously the first row silently
    wrapped around to the last row and the last row raised IndexError.
    """
    POS_TURN_TRIGRAM = "POS_TURN_TRIGRAM"
    last_index = len(x_data) - 1
    for i, row in enumerate(x_data):
        # Neighbour to the left (i > 0 prevents wrapping to x_data[-1]).
        left = "BEGIN"
        if row['word_number_in_turn'] != '1' and i > 0:
            left = x_data[i - 1]['Stanford_PoS']
        # Neighbour to the right (i < last_index prevents running off the end).
        right = "END"
        word_number = int(row['word_number_in_turn'])
        total_word_number = int(row['total_number_of_words_in_turn'])
        if word_number != total_word_number and i < last_index:
            right = x_data[i + 1]['Stanford_PoS']
        row[POS_TURN_TRIGRAM] = left + "/" + row['Stanford_PoS'] + "/" + right
        
def add_is_stutter(x_data):
    """Add a boolean 'IS_STUTTER' entry to every row, in place.

    A word counts as a stutter when its text ends with a hyphen. An empty word
    yields False; indexing the last character, as before, raised IndexError
    on empty strings.
    """
    IS_STUTTER = "IS_STUTTER"
    for row in x_data:
        row[IS_STUTTER] = row['word'].endswith('-')

## With bigram features, without coref mention

In [46]:
# Syntactic + mention PoS/function feature set; the raw coreference columns are
# left commented out, so no coref-derived features are used in this experiment.
features = [
    'word',
    'word_number_of_syllables',
    'word_number_in_turn',
    'word_number_in_task',
    'total_number_of_words_in_turn',
    'total_number_of_words_in_task',
    'Stanford_PoS',
    'syntactic_function',
    'Most_Recent_Mention_Syntactic_Function',
    'Recent_Explicit_Mention_Syntactic_Function',
    'Recent_Implicit_Mention_Syntactic_Function',
    'Most_Recent_Mention_PoS',
    'Recent_Explicit_Mention_PoS',
    'Recent_Implicit_Mention_PoS',
#     'Most_Recent_Mention',
#     'Number_Of_Coref_Mentions',
#     'word_end_time'
]

# Only the syllable count is converted to a number by generate_data.
continuous_feats = ['word_number_of_syllables']

x_data, y_data = generate_data(FILE_NAME, features, continuous_feats)
# Derived features (added in place): distance to the end of the turn and the
# left/right PoS bigrams. These must be computed before the shuffle/split below
# because they look at neighbouring rows.
add_dist_end_turn(x_data)
add_pos_bigram(x_data)


# Same 80/20 split and seed as the other experiment cells.
X_train, X_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=2557)

    
# Tune and evaluate every classifier in clf_map on this feature set.
for (clf, param_grid) in clf_map:
    pipeline = Pipeline([
        ('dictvec', dict_vectorizer),
    #         ('selector', select_percentile),
        ('clf', clf)
    ])

    best_score, best_model = classify_my_model(pipeline, param_grid, X_train, y_train, clf.__class__.__name__, scorer=fdot25_scorer)
    y_pred = best_model.predict(X_test)
    print(metrics.classification_report(y_test, y_pred))

################################### BernoulliNB ###################################
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   42.5s finished


Train time: 46.236s
Best score: 0.521
	clf__alpha: 1
	clf__fit_prior: True
             precision    recall  f1-score   support

      False       0.93      0.86      0.90     11618
       True       0.52      0.69      0.59      2448

avg / total       0.86      0.83      0.84     14066

################################### LinearSVC ###################################
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  3.6min finished


Train time: 230.678s
Best score: 0.725
	clf__C: 0.1
	clf__loss: 'hinge'
	clf__penalty: 'l2'
             precision    recall  f1-score   support

      False       0.90      0.96      0.93     11618
       True       0.75      0.50      0.60      2448

avg / total       0.88      0.88      0.88     14066

################################### LogisticRegression ###################################
Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:  3.8min finished


Train time: 234.669s
Best score: 0.737
	clf__C: 0.1
	clf__fit_intercept: True
	clf__penalty: 'l2'
             precision    recall  f1-score   support

      False       0.90      0.97      0.93     11618
       True       0.77      0.50      0.61      2448

avg / total       0.88      0.89      0.88     14066

################################### RandomForestClassifier ###################################
Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  6.5min finished


Train time: 414.595s
Best score: 0.730
	clf__max_features: 'log2'
	clf__n_estimators: 20
             precision    recall  f1-score   support

      False       0.88      0.98      0.93     11618
       True       0.78      0.38      0.51      2448

avg / total       0.86      0.87      0.85     14066

################################### DecisionTreeClassifier ###################################
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:   23.4s remaining:   15.6s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   31.5s finished


Train time: 45.519s
Best score: 0.593
	clf__criterion: 'gini'
             precision    recall  f1-score   support

      False       0.90      0.92      0.91     11618
       True       0.59      0.53      0.56      2448

avg / total       0.85      0.86      0.85     14066



## With bigram features, with coref mention

In [45]:
# Experiment: POS-bigram and coreference-mention features, without IS_STUTTER.
# Columns requested from the CSV (same order as before, several per line).
features = [
    'word', 'word_number_of_syllables',
    'word_number_in_turn', 'word_number_in_task',
    'total_number_of_words_in_turn', 'total_number_of_words_in_task',
    'Stanford_PoS', 'syntactic_function',
    # *_Mention_Syntactic_Function columns
    'Most_Recent_Mention_Syntactic_Function',
    'Recent_Explicit_Mention_Syntactic_Function',
    'Recent_Implicit_Mention_Syntactic_Function',
    # *_Mention_PoS columns
    'Most_Recent_Mention_PoS',
    'Recent_Explicit_Mention_PoS',
    'Recent_Implicit_Mention_PoS',
    # 'Most_Recent_Mention' and 'word_end_time' are popped again further down
    'Most_Recent_Mention', 'Number_Of_Coref_Mentions', 'word_end_time',
]

continuous_feats = ['word_number_of_syllables']

x_data, y_data = generate_data(FILE_NAME, features, continuous_feats)

# The helpers are called on x_data in this fixed order; their return values are unused.
add_dist_end_turn(x_data)
add_pos_bigram(x_data)
get_input_mentioned(x_data)
set_input_num_mentions(x_data)
get_input_far_back_mentioned(x_data)

# Remove the excluded columns from every entry before splitting.
excluded_features = ['Most_Recent_Mention','word_end_time']
for entry in x_data:
    for feature in excluded_features:
        entry.pop(feature, None)  # keys that are absent are simply skipped
print(x_data[0])

X_train, X_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    test_size=0.2,
    random_state=2557,
)

# Grid-search every (classifier, parameter grid) pair on the training split and
# report the best model's metrics on the held-out split.
for clf, param_grid in clf_map:
    pipeline = Pipeline(steps=[
        ('dictvec', dict_vectorizer),
        # ('selector', select_percentile),  # selection step currently disabled
        ('clf', clf),
    ])

    best_score, best_model = classify_my_model(
        pipeline,
        param_grid,
        X_train,
        y_train,
        type(clf).__name__,
        scorer=fdot25_scorer,
    )
    y_pred = best_model.predict(X_test)
    print(metrics.classification_report(y_test, y_pred))

{'Recent_Explicit_Mention_PoS': '', 'word_number_in_task': '1', 'word': 'yup', 'total_number_of_words_in_turn': '1', 'total_number_of_words_in_task': '50', 'Stanford_PoS': 'NN', 'syntactic_function': 'ROOT', 'POS_TURN_BIGRAM_RIGHT': 'NN/END', 'Recent_Explicit_Mention_Syntactic_Function': '', 'time_between_mentions': 0.0, 'Recent_Implicit_Mention_PoS': '', 'POS_TURN_BIGRAM_LEFT': 'BEGIN/NN', 'Most_Recent_Mention_PoS': '', 'Recent_Implicit_Mention_Syntactic_Function': '', 'Most_Recent_Mention_Syntactic_Function': '', 'word_number_in_turn': '1', 'Number_Of_Coref_Mentions': 0, 'DIST_END_TURN': 0, 'mentioned': False, 'word_number_of_syllables': 1.0}
################################### BernoulliNB ###################################
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   33.8s finished


Train time: 37.017s
Best score: 0.519
	clf__alpha: 0.001
	clf__fit_prior: True
             precision    recall  f1-score   support

      False       0.93      0.87      0.90     11618
       True       0.52      0.68      0.59      2448

avg / total       0.86      0.83      0.84     14066

################################### LinearSVC ###################################
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  3.3min finished


Train time: 220.400s
Best score: 0.720
	clf__C: 0.1
	clf__loss: 'hinge'
	clf__penalty: 'l2'
             precision    recall  f1-score   support

      False       0.90      0.97      0.93     11618
       True       0.75      0.49      0.59      2448

avg / total       0.87      0.88      0.87     14066

################################### LogisticRegression ###################################
Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:  4.0min finished


Train time: 245.104s
Best score: 0.739
	clf__C: 0.1
	clf__fit_intercept: False
	clf__penalty: 'l2'
             precision    recall  f1-score   support

      False       0.90      0.97      0.93     11618
       True       0.77      0.50      0.61      2448

avg / total       0.88      0.89      0.88     14066

################################### RandomForestClassifier ###################################
Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  6.3min finished


Train time: 401.336s
Best score: 0.733
	clf__max_features: 'log2'
	clf__n_estimators: 20
             precision    recall  f1-score   support

      False       0.88      0.98      0.93     11618
       True       0.79      0.38      0.51      2448

avg / total       0.87      0.87      0.86     14066

################################### DecisionTreeClassifier ###################################
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:   24.8s remaining:   16.5s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   32.5s finished


Train time: 46.967s
Best score: 0.598
	clf__criterion: 'gini'
             precision    recall  f1-score   support

      False       0.90      0.92      0.91     11618
       True       0.60      0.53      0.56      2448

avg / total       0.85      0.86      0.85     14066



In [48]:
# Data preparation: POS-bigram, IS_STUTTER and coreference-mention features.
# Columns requested from the CSV (same order as before, several per line).
features = [
    'word', 'word_number_of_syllables',
    'word_number_in_turn', 'word_number_in_task',
    'total_number_of_words_in_turn', 'total_number_of_words_in_task',
    'Stanford_PoS', 'syntactic_function',
    # *_Mention_Syntactic_Function columns
    'Most_Recent_Mention_Syntactic_Function',
    'Recent_Explicit_Mention_Syntactic_Function',
    'Recent_Implicit_Mention_Syntactic_Function',
    # *_Mention_PoS columns
    'Most_Recent_Mention_PoS',
    'Recent_Explicit_Mention_PoS',
    'Recent_Implicit_Mention_PoS',
    # 'Most_Recent_Mention' and 'word_end_time' are popped again further down
    'Most_Recent_Mention', 'Number_Of_Coref_Mentions', 'word_end_time',
]

continuous_feats = ['word_number_of_syllables']

x_data, y_data = generate_data(FILE_NAME, features, continuous_feats)

# The helpers are called on x_data in this fixed order; their return values are unused.
add_dist_end_turn(x_data)
add_pos_bigram(x_data)
add_is_stutter(x_data)

get_input_mentioned(x_data)
set_input_num_mentions(x_data)
get_input_far_back_mentioned(x_data)

# Remove the excluded columns from every entry before splitting.
excluded_features = ['Most_Recent_Mention','word_end_time']
for entry in x_data:
    for feature in excluded_features:
        entry.pop(feature, None)  # keys that are absent are simply skipped
print(x_data[0])

X_train, X_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    test_size=0.2,
    random_state=2557,
)


{'Recent_Explicit_Mention_PoS': '', 'word_number_in_task': '1', 'word': 'yup', 'total_number_of_words_in_turn': '1', 'total_number_of_words_in_task': '50', 'Stanford_PoS': 'NN', 'syntactic_function': 'ROOT', 'POS_TURN_BIGRAM_RIGHT': 'NN/END', 'Recent_Explicit_Mention_Syntactic_Function': '', 'time_between_mentions': 0.0, 'Recent_Implicit_Mention_PoS': '', 'POS_TURN_BIGRAM_LEFT': 'BEGIN/NN', 'Most_Recent_Mention_PoS': '', 'Recent_Implicit_Mention_Syntactic_Function': '', 'Most_Recent_Mention_Syntactic_Function': '', 'IS_STUTTER': False, 'Number_Of_Coref_Mentions': 0, 'DIST_END_TURN': 0, 'word_number_of_syllables': 1.0, 'mentioned': False, 'word_number_in_turn': '1'}


In [50]:

# Grid-search every (classifier, parameter grid) pair on the current training
# split and report the best model's metrics on the held-out split.
for clf, param_grid in clf_map:
    pipeline = Pipeline(steps=[
        ('dictvec', dict_vectorizer),
        # ('selector', select_percentile),  # selection step currently disabled
        ('clf', clf),
    ])

    best_score, best_model = classify_my_model(
        pipeline,
        param_grid,
        X_train,
        y_train,
        type(clf).__name__,
        scorer=fdot25_scorer,
    )
    y_pred = best_model.predict(X_test)
    print(metrics.classification_report(y_test, y_pred))

################################### BernoulliNB ###################################
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   27.2s finished


Train time: 30.654s
Best score: 0.520
	clf__alpha: 0.001
	clf__fit_prior: True
             precision    recall  f1-score   support

      False       0.93      0.87      0.90     11618
       True       0.52      0.68      0.59      2448

avg / total       0.86      0.83      0.84     14066

################################### LinearSVC ###################################
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  3.3min finished


Train time: 212.602s
Best score: 0.720
	clf__C: 0.1
	clf__loss: 'hinge'
	clf__penalty: 'l2'
             precision    recall  f1-score   support

      False       0.90      0.96      0.93     11618
       True       0.73      0.51      0.60      2448

avg / total       0.87      0.88      0.87     14066

################################### LogisticRegression ###################################
Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:  3.7min finished


Train time: 228.341s
Best score: 0.738
	clf__C: 0.1
	clf__fit_intercept: True
	clf__penalty: 'l2'
             precision    recall  f1-score   support

      False       0.90      0.97      0.93     11618
       True       0.77      0.51      0.61      2448

avg / total       0.88      0.89      0.88     14066

################################### RandomForestClassifier ###################################
Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  5.4min finished


Train time: 341.995s
Best score: 0.736
	clf__max_features: 'log2'
	clf__n_estimators: 20
             precision    recall  f1-score   support

      False       0.88      0.98      0.93     11618
       True       0.79      0.39      0.53      2448

avg / total       0.87      0.88      0.86     14066



## Now with trigram

In [53]:
# Experiment: POS bigram + trigram and IS_STUTTER features, no coreference helpers.
# Columns requested from the CSV (same order as before, several per line).
features = [
    'word', 'word_number_of_syllables',
    'word_number_in_turn', 'word_number_in_task',
    'total_number_of_words_in_turn', 'total_number_of_words_in_task',
    'Stanford_PoS', 'syntactic_function',
    # *_Mention_Syntactic_Function columns
    'Most_Recent_Mention_Syntactic_Function',
    'Recent_Explicit_Mention_Syntactic_Function',
    'Recent_Implicit_Mention_Syntactic_Function',
    # *_Mention_PoS columns
    'Most_Recent_Mention_PoS',
    'Recent_Explicit_Mention_PoS',
    'Recent_Implicit_Mention_PoS',
]

continuous_feats = ['word_number_of_syllables']

x_data, y_data = generate_data(FILE_NAME, features, continuous_feats)

# The helpers are called on x_data in this fixed order; their return values are unused.
add_dist_end_turn(x_data)
add_pos_bigram(x_data)
add_pos_trigram(x_data)
add_is_stutter(x_data)

X_train, X_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    test_size=0.2,
    random_state=2557,
)

# Grid-search every (classifier, parameter grid) pair on the training split and
# report the best model's metrics and accuracy on the held-out split.
for clf, param_grid in clf_map:
    pipeline = Pipeline(steps=[
        ('dictvec', dict_vectorizer),
        # ('selector', select_percentile),  # selection step currently disabled
        ('clf', clf),
    ])

    best_score, best_model = classify_my_model(
        pipeline,
        param_grid,
        X_train,
        y_train,
        type(clf).__name__,
        scorer=fdot25_scorer,
    )
    y_pred = best_model.predict(X_test)
    print(metrics.classification_report(y_test, y_pred))
    print(metrics.accuracy_score(y_test, y_pred))

################################### BernoulliNB ###################################
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   27.2s finished


Train time: 29.672s
Best score: 0.528
	clf__alpha: 1
	clf__fit_prior: True
             precision    recall  f1-score   support

      False       0.93      0.86      0.90     11618
       True       0.52      0.71      0.60      2448

avg / total       0.86      0.84      0.85     14066

0.8361296743921512
################################### LinearSVC ###################################
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  2.4min finished


Train time: 155.162s
Best score: 0.726
	clf__C: 0.1
	clf__loss: 'hinge'
	clf__penalty: 'l2'
             precision    recall  f1-score   support

      False       0.91      0.96      0.93     11618
       True       0.75      0.52      0.62      2448

avg / total       0.88      0.89      0.88     14066

0.8863216266173752
################################### LogisticRegression ###################################
Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   45.9s
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:  3.1min finished


Train time: 188.210s
Best score: 0.735
	clf__C: 0.1
	clf__fit_intercept: True
	clf__penalty: 'l2'
             precision    recall  f1-score   support

      False       0.90      0.97      0.93     11618
       True       0.77      0.51      0.61      2448

avg / total       0.88      0.89      0.88     14066

0.887814588369117
################################### RandomForestClassifier ###################################
Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  6.1min finished


Train time: 390.040s
Best score: 0.728
	clf__max_features: 'log2'
	clf__n_estimators: 20
             precision    recall  f1-score   support

      False       0.89      0.97      0.93     11618
       True       0.77      0.41      0.53      2448

avg / total       0.87      0.88      0.86     14066

0.8757287075216835


In [52]:
# Experiment: POS bigram + trigram, IS_STUTTER and coreference-mention features.
# Columns requested from the CSV (same order as before, several per line).
features = [
    'word', 'word_number_of_syllables',
    'word_number_in_turn', 'word_number_in_task',
    'total_number_of_words_in_turn', 'total_number_of_words_in_task',
    'Stanford_PoS', 'syntactic_function',
    # *_Mention_Syntactic_Function columns
    'Most_Recent_Mention_Syntactic_Function',
    'Recent_Explicit_Mention_Syntactic_Function',
    'Recent_Implicit_Mention_Syntactic_Function',
    # *_Mention_PoS columns
    'Most_Recent_Mention_PoS',
    'Recent_Explicit_Mention_PoS',
    'Recent_Implicit_Mention_PoS',
    # 'Most_Recent_Mention' and 'word_end_time' are popped again further down
    'Most_Recent_Mention', 'Number_Of_Coref_Mentions', 'word_end_time',
]

continuous_feats = ['word_number_of_syllables']

x_data, y_data = generate_data(FILE_NAME, features, continuous_feats)

# The helpers are called on x_data in this fixed order; their return values are unused.
add_dist_end_turn(x_data)
add_pos_bigram(x_data)
add_pos_trigram(x_data)
add_is_stutter(x_data)

get_input_mentioned(x_data)
set_input_num_mentions(x_data)
get_input_far_back_mentioned(x_data)

# Remove the excluded columns from every entry before splitting.
excluded_features = ['Most_Recent_Mention','word_end_time']
for entry in x_data:
    for feature in excluded_features:
        entry.pop(feature, None)  # keys that are absent are simply skipped
print(x_data[0])

X_train, X_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    test_size=0.2,
    random_state=2557,
)

# Grid-search every (classifier, parameter grid) pair on the training split and
# report the best model's metrics and accuracy on the held-out split.
for clf, param_grid in clf_map:
    pipeline = Pipeline(steps=[
        ('dictvec', dict_vectorizer),
        # ('selector', select_percentile),  # selection step currently disabled
        ('clf', clf),
    ])

    best_score, best_model = classify_my_model(
        pipeline,
        param_grid,
        X_train,
        y_train,
        type(clf).__name__,
        scorer=fdot25_scorer,
    )
    y_pred = best_model.predict(X_test)
    print(metrics.classification_report(y_test, y_pred))
    print(metrics.accuracy_score(y_test, y_pred))

{'POS_TURN_TRIGRAM': 'BEGIN/NN/END', 'Recent_Explicit_Mention_PoS': '', 'word_number_in_task': '1', 'word': 'yup', 'total_number_of_words_in_turn': '1', 'total_number_of_words_in_task': '50', 'Stanford_PoS': 'NN', 'syntactic_function': 'ROOT', 'IS_STUTTER': False, 'POS_TURN_BIGRAM_RIGHT': 'NN/END', 'Recent_Explicit_Mention_Syntactic_Function': '', 'time_between_mentions': 0.0, 'Recent_Implicit_Mention_PoS': '', 'POS_TURN_BIGRAM_LEFT': 'BEGIN/NN', 'Most_Recent_Mention_PoS': '', 'Recent_Implicit_Mention_Syntactic_Function': '', 'Most_Recent_Mention_Syntactic_Function': '', 'word_number_in_turn': '1', 'Number_Of_Coref_Mentions': 0, 'DIST_END_TURN': 0, 'mentioned': False, 'word_number_of_syllables': 1.0}
################################### BernoulliNB ###################################
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   30.3s finished


Train time: 33.305s
Best score: 0.524
	clf__alpha: 1
	clf__fit_prior: True
             precision    recall  f1-score   support

      False       0.93      0.86      0.90     11618
       True       0.52      0.69      0.59      2448

avg / total       0.86      0.83      0.84     14066

0.8349210863074079
################################### LinearSVC ###################################
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  2.7min finished


Train time: 175.566s
Best score: 0.711
	clf__C: 0.1
	clf__loss: 'hinge'
	clf__penalty: 'l2'
             precision    recall  f1-score   support

      False       0.90      0.96      0.93     11618
       True       0.74      0.50      0.60      2448

avg / total       0.87      0.88      0.87     14066

0.8827669557798948
################################### LogisticRegression ###################################
Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   52.8s
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:  3.3min finished


Train time: 203.261s
Best score: 0.737
	clf__C: 0.1
	clf__fit_intercept: True
	clf__penalty: 'l2'
             precision    recall  f1-score   support

      False       0.90      0.97      0.93     11618
       True       0.76      0.51      0.61      2448

avg / total       0.88      0.89      0.88     14066

0.8876013081188682
################################### RandomForestClassifier ###################################
Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:  6.3min finished


Train time: 397.991s
Best score: 0.730
	clf__max_features: 'auto'
	clf__n_estimators: 20
             precision    recall  f1-score   support

      False       0.89      0.97      0.93     11618
       True       0.76      0.46      0.57      2448

avg / total       0.87      0.88      0.87     14066

0.8802786861936585


# Adding word embedding features

In [54]:
# NOTE(review): absolute, machine-specific path; consider moving it to a config cell.
EMBEDDINGS_FILE = "/mnt/e/word2vec/glove.twitter.27B.200d.txt"
# EMBEDDING_DIM = 50
# Load the embeddings into a dict mapping word -> float32 numpy vector.
gembeddings_index = {}
# The encoding is given explicitly: the GloVe text distribution is UTF-8, and
# without it open() falls back to the platform's locale encoding, which can
# fail or mis-decode non-ASCII tokens on some systems.
with open(EMBEDDINGS_FILE, 'r', encoding='utf-8') as f:
    for line in f:
        # Each line is "<word> <v1> <v2> ...", separated by single spaces.
        # Drop the trailing newline so the last value is a clean number string.
        values = line.rstrip('\n').split(' ')
        word = values[0]
        gembedding = np.asarray(values[1:], dtype='float32')
        gembeddings_index[word] = gembedding

print('G Word embeddings:', len(gembeddings_index))

G Word embeddings: 1193514


In [27]:
def add_embeddings(x_data, embeddings_d):
    """Add each entry's word embedding as one numeric feature per dimension.

    Every dict in ``x_data`` gains the keys ``embedding_dim0`` ...
    ``embedding_dim{D-1}``, where ``D`` is the length of the vectors in
    ``embeddings_d``. Words without an embedding get an all-zero vector.
    The dicts are modified in place; nothing is returned.

    The word is looked up as written first and lower-cased second, so the
    lookup agrees with ``add_embedding_cluster``, which lower-cases every word.

    Args:
        x_data: list of feature dicts, each holding a ``'word'`` string.
        embeddings_d: non-empty mapping from word to a sequence of numbers;
            all sequences are expected to have the same length.

    Raises:
        ValueError: if ``embeddings_d`` is empty, because the dimensionality
            cannot be determined.
    """
    if not embeddings_d:
        raise ValueError('embeddings_d must contain at least one embedding')

    # Take the dimensionality from any stored vector instead of requiring a
    # specific word (previously 'the') to be part of the vocabulary.
    dim_len = len(next(iter(embeddings_d.values())))
    zero_vector = [0] * dim_len

    for entry in x_data:
        word = entry['word']
        vector = embeddings_d.get(word)
        if vector is None:
            # Fall back to the lower-cased form before giving up.
            vector = embeddings_d.get(word.lower(), zero_vector)

        for dim in range(dim_len):
            entry['embedding_dim{}'.format(dim)] = vector[dim]


def add_embedding_cluster(x_data, embeddings_d, sphere=True, clusters=10):
    """Cluster the word embeddings of all entries and tag each entry with its cluster.

    One vector is collected per entry of ``x_data`` (the embedding of the
    lower-cased word, or a zero vector when the word has no embedding), a
    k-means model is fitted on those vectors, and the resulting label is stored
    in each entry under ``'embedding_cluster'``.

    Args:
        x_data: list of feature dicts, each holding a ``'word'`` string.
        embeddings_d: mapping from word to vector; must contain ``'the'``,
            whose vector length is used as the dimensionality.
        sphere: fit ``SphericalKMeans`` when truthy, plain ``KMeans`` otherwise.
        clusters: number of clusters to fit.

    Returns:
        The fitted clustering model.
    """
    dim_len = len(embeddings_d['the'])

    embeddings = []
    for entry in x_data:
        token = entry['word'].lower()
        if token in embeddings_d:
            embeddings.append(embeddings_d.get(token))
        else:
            # Out-of-vocabulary words are represented by a fresh zero vector.
            embeddings.append([0.0] * dim_len)

    if not sphere:  # Euclidean distance
        kmeans = KMeans(n_clusters=clusters, random_state=2557).fit(embeddings)
    else:  # equivalent to cosine similarity
        print('doing spherical kmeans')
        kmeans = SphericalKMeans(n_clusters=clusters, random_state=2557).fit(embeddings)

    # labels_ has one label per fitted vector, in the same order as x_data.
    for entry, label in zip(x_data, kmeans.labels_):
        entry['embedding_cluster'] = label
    return kmeans

In [11]:
# list of classifier models
# redefine for quicker running
# Each entry is (estimator, parameter grid); grid keys use the 'clf__' prefix
# because the estimator is installed as the 'clf' step of the pipeline.
clf_map = [
    (
        LinearSVC(),
        {
            'clf__C': [.1],
            'clf__penalty': ['l2'],
            'clf__loss': ['hinge', 'squared_hinge'],
        }
    ),
    (
        LogisticRegression(),
        {
            'clf__penalty': ['l2'],
            'clf__fit_intercept': [True],
            'clf__C':[.1],
        }
    ),
    (
        RandomForestClassifier(random_state=2557),
        {
            'clf__n_estimators': [20],
            # "sqrt" is what "auto" meant for forest classifiers; the "auto"
            # spelling is rejected by newer scikit-learn releases, whereas
            # "sqrt" is accepted by old and new versions alike.
            'clf__max_features': ["sqrt", "log2"]
        }
    ),

]


In [55]:
# Embedding-cluster experiment: a reduced set of CSV columns plus one
# 'embedding_cluster' id per entry.
features = [
    'word', 'word_number_of_syllables',
    'word_number_in_turn', 'word_number_in_task',
    'total_number_of_words_in_turn', 'total_number_of_words_in_task',
    'Stanford_PoS',
]

# Only the syllable count is listed as continuous. Currently left out:
# 'word_number_in_turn', 'word_number_in_task',
# 'total_number_of_words_in_turn', 'total_number_of_words_in_task'.
continuous_feats = ['word_number_of_syllables']

x_data, y_data = generate_data(FILE_NAME, features, continuous_feats)

# Disabled for this run:
#   add_dist_end_turn(x_data), add_pos_bigram(x_data), add_is_stutter(x_data),
#   get_input_mentioned(x_data), set_input_num_mentions(x_data),
#   get_input_far_back_mentioned(x_data),
#   add_embeddings(x_data, gembeddings_index),
#   and the removal of 'Most_Recent_Mention' / 'word_end_time' from each entry.
kmeans = add_embedding_cluster(
    x_data,
    gembeddings_index,
    sphere=True,
    clusters=50,
)

X_train, X_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    test_size=0.2,
    random_state=2557,
)
print(x_data[0])



doing spherical kmeans
{'embedding_cluster': 38, 'total_number_of_words_in_turn': '1', 'Stanford_PoS': 'NN', 'word_number_of_syllables': 1, 'word': 'yup', 'total_number_of_words_in_task': '50', 'word_number_in_turn': '1', 'word_number_in_task': '1'}


In [56]:
# Write the fitted cluster labels (kmeans.labels_) to a pickle file.
with open('cluster50.sphere.twitter200d.pkl', mode='wb') as f:
    pickle.dump(kmeans.labels_, f)


In [57]:
print("TWITTER 50 EMBED-FEATURES")
# Grid-search every (classifier, parameter grid) pair on the training split and
# report the best model's metrics and accuracy on the held-out split.
for clf, param_grid in clf_map:
    pipeline = Pipeline(steps=[
        ('dictvec', dict_vectorizer),
        # ('selector', select_percentile),  # selection step currently disabled
        ('clf', clf),
    ])

    best_score, best_model = classify_my_model(
        pipeline,
        param_grid,
        X_train,
        y_train,
        type(clf).__name__,
        scorer=fdot25_scorer,
    )
    y_pred = best_model.predict(X_test)
    print(metrics.classification_report(y_test, y_pred))
    print(metrics.accuracy_score(y_test, y_pred))

TWITTER 50 EMBED-FEATURES
################################### LinearSVC ###################################
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:  5.9min remaining:  3.9min
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:  9.9min finished


Train time: 603.860s
Best score: 0.666
	clf__C: 0.1
	clf__loss: 'squared_hinge'
	clf__penalty: 'l2'
             precision    recall  f1-score   support

      False       0.88      0.96      0.92     11618
       True       0.69      0.39      0.49      2448

avg / total       0.85      0.86      0.85     14066

0.8629318925067538
################################### LogisticRegression ###################################
Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=-1)]: Done   2 out of   5 | elapsed:  2.2min remaining:  3.3min
[Parallel(n_jobs=-1)]: Done   5 out of   5 | elapsed:  5.3min finished


Train time: 324.970s
Best score: 0.668
	clf__C: 0.1
	clf__fit_intercept: True
	clf__penalty: 'l2'
             precision    recall  f1-score   support

      False       0.88      0.97      0.92     11618
       True       0.72      0.37      0.49      2448

avg / total       0.85      0.86      0.85     14066

0.8649225081757429
################################### RandomForestClassifier ###################################
Fitting 5 folds for each of 2 candidates, totalling 10 fits


[Parallel(n_jobs=-1)]: Done   6 out of  10 | elapsed:  6.8min remaining:  4.5min
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed: 11.8min finished


Train time: 743.293s
Best score: 0.618
	clf__max_features: 'log2'
	clf__n_estimators: 20
             precision    recall  f1-score   support

      False       0.88      0.96      0.92     11618
       True       0.66      0.36      0.47      2448

avg / total       0.84      0.86      0.84     14066

0.8570311389165364


In [None]:
def show_most_informative_features(vectorizer, clf, n=20):
    """Print the n lowest and n highest coefficients side by side.

    Coefficients are taken from the first row of ``clf.coef_`` and paired with
    the vectorizer's feature names. Each printed line shows one of the n
    smallest coefficients (ascending) next to one of the n largest (descending).

    Args:
        vectorizer: fitted vectorizer exposing ``get_feature_names_out()`` or
            the older ``get_feature_names()``.
        clf: fitted linear model exposing ``coef_``.
        n: number of rows to print.
    """
    # get_feature_names() no longer exists in recent scikit-learn releases;
    # prefer get_feature_names_out() and fall back for older versions.
    get_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
    feature_names = get_names()
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    # Pair the head of the sorted list with its reversed tail.
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (coef_1, fn_1, coef_2, fn_2))
        
def show_least_informative_features(vectorizer, clf, n=20):
    """Print the n features whose coefficients are closest to zero.

    Coefficients are taken from the first row of ``clf.coef_``, paired with the
    vectorizer's feature names and sorted by absolute value (ascending).

    Args:
        vectorizer: fitted vectorizer exposing ``get_feature_names_out()`` or
            the older ``get_feature_names()``.
        clf: fitted linear model exposing ``coef_``.
        n: number of rows to print.
    """
    # get_feature_names() no longer exists in recent scikit-learn releases;
    # prefer get_feature_names_out() and fall back for older versions.
    get_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
    feature_names = get_names()
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names), key=lambda x: abs(x[0]))
    top = coefs_with_fns[:n]
    for (coef_1, fn_1) in top:
        print("\t%.4f\t%-15s" % (coef_1, fn_1))

In [None]:
# The classifier is the final pipeline step, so index it with -1: the pipelines
# built in this notebook have only two steps (dictvec, clf) and steps[2] would
# raise IndexError for them.
# NOTE(review): `model` is not defined in the visible cells - presumably a
# fitted pipeline such as `best_model`; confirm before running.
show_most_informative_features(model.steps[0][1], model.steps[-1][1])

In [None]:
# The classifier is the final pipeline step, so index it with -1: the pipelines
# built in this notebook have only two steps (dictvec, clf) and steps[2] would
# raise IndexError for them.
# NOTE(review): `model` is not defined in the visible cells - presumably a
# fitted pipeline such as `best_model`; confirm before running.
show_least_informative_features(model.steps[0][1], model.steps[-1][1], n=100)

In [None]:
# Number of coefficients of the final pipeline step (the classifier). Indexed
# with -1 because the two-step pipelines built in this notebook have no steps[2].
# NOTE(review): `model` is not defined in the visible cells - confirm its origin.
len(model.steps[-1][1].coef_[0])

In [None]:
from collections import defaultdict
def show_feature_importance(vectorizer, clf):
    """Group absolute coefficients by source feature and print each group's mean.

    Vectorized feature names of the form ``name=value`` are grouped under
    ``name`` (the text before the first ``'='``); names without ``'='`` form
    their own group. Coefficients come from the first row of ``clf.coef_``.

    Args:
        vectorizer: fitted vectorizer exposing ``get_feature_names_out()`` or
            the older ``get_feature_names()``.
        clf: fitted linear model exposing ``coef_``.

    Returns:
        A ``defaultdict(list)`` mapping each source feature to the list of
        absolute coefficient values of its vectorized columns.
    """
    # get_feature_names() no longer exists in recent scikit-learn releases;
    # prefer get_feature_names_out() and fall back for older versions.
    get_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
    feature_names = get_names()
    feat_count = defaultdict(list)
    for coef_1, fn_1 in zip(clf.coef_[0], feature_names):
        feat = fn_1.split('=')[0]
        feat_count[feat].append(abs(coef_1))
    # Every list has at least one element, so the mean is always defined.
    for feat, magnitudes in feat_count.items():
        print(feat, sum(magnitudes)/len(magnitudes))
    return feat_count

In [None]:
# The classifier is the final pipeline step, so index it with -1: the pipelines
# built in this notebook have only two steps (dictvec, clf) and steps[2] would
# raise IndexError for them.
# NOTE(review): `model` is not defined in the visible cells - presumably a
# fitted pipeline such as `best_model`; confirm before running.
feat_count = show_feature_importance(model.steps[0][1], model.steps[-1][1])

In [None]:
import matplotlib.pyplot as plt

In [None]:
# One bar-chart histogram per key of feat_count, using roughly sqrt(N) bins
# for the N values stored under that key.
for feat in feat_count:
    hist, bins = np.histogram(
        feat_count[feat],
        bins=int(len(feat_count[feat]) ** 0.5),
    )
    # Bars are centred on each bin and drawn at 70% of the bin width.
    center = (bins[1:] + bins[:-1]) / 2
    width = (bins[1] - bins[0]) * 0.7
    plt.bar(center, hist, align='center', width=width)
    plt.title(feat)
    plt.show()