In [1]:
## Imports
import csv
import sys
import numpy as np
import pickle
from time import time
import re
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import SelectPercentile, SelectKBest
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.model_selection import StratifiedKFold, GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn import metrics
import spacy
import en_core_web_md
nlp = en_core_web_md.load()
# nlp = spacy.load('en')

In [2]:
# some variables for processing data

# Columns of the CSV that are handed to the models as features.
features = [
    'word', 'word_pos_tag', 'word_pos_tag_simplified',
    'word_number_of_syllables', 'word_number_in_turn', 'word_number_in_task',
]

# Subset of `features` that generate_data converts to float; every other
# column keeps its raw string value.
continuous_feats = [name for name in features if name.startswith('word_number_')]

In [3]:
file_name = "big-table-PoS.csv"

def generate_data(file_name, features, continuous_feats):
    """Read the word table and build feature dicts plus boolean break labels.

    Parameters
    ----------
    file_name : str
        Path of a CSV file with a header row.
    features : list of str
        Column names copied into each row's feature dict.
    continuous_feats : list of str
        Names (each must also be in ``features``) whose values are converted
        with ``float``; all other values stay strings.

    Returns
    -------
    (x_data, y_data) : (list of dict, list of bool)
        One feature dict per CSV row, and ``True`` where the row's
        ``word_tobi_break_index`` is one of "4", "4-" or "4p".

    Raises
    ------
    KeyError
        If a requested column (or the break-index column) is missing.
    ValueError
        If a continuous feature cannot be parsed as a float.
    """
    break_label = 'word_tobi_break_index'
    break_set = {"4", "4-", "4p"}
    x_data = []
    y_data = []

    # newline='' hands newline handling to the csv module, as its docs require.
    with open(file_name, 'r', newline='') as f:
        # DictReader addresses columns by header name (fixes the off-by-one
        # error of the earlier index-based reader).
        for row in csv.DictReader(f):
            feats = {feat: row[feat] for feat in features}
            # convert some to continuous features
            for feat in continuous_feats:
                feats[feat] = float(feats[feat])
            x_data.append(feats)
            y_data.append(row[break_label] in break_set)
    return x_data, y_data

# Baseline experiment: all six columns, with the three count columns as floats.
x_data, y_data = generate_data("big-table-PoS.csv", features, continuous_feats)
# Hold out 20% of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    x_data, y_data, random_state=2557, test_size=0.2)

In [7]:
print(X_train[0], y_train[0])

{'word_number_of_syllables': 1.0, 'word_pos_tag_simplified': 'O', 'word_number_in_task': 12.0, 'word_pos_tag': 'PRP', 'word_number_in_turn': 2.0, 'word': 'it'} False


In [8]:
def classify_my_model(pipeline, param_grid, X, y, model_name, scorer='f1', cv=5):
    """Grid-search ``pipeline`` over ``param_grid`` and report the winner.

    Parameters
    ----------
    pipeline : estimator
        The estimator (here a Pipeline) handed to GridSearchCV.
    param_grid : dict
        Maps parameter names to the candidate values to try.
    X, y : sequences
        Training samples and their labels.
    model_name : str
        Printed in the banner that precedes the search output.
    scorer : str or callable, default 'f1'
        Passed through as GridSearchCV's ``scoring``.
    cv : int or CV splitter, default 5
        Passed through as GridSearchCV's ``cv``. The default matches the
        value that used to be hard-coded here.

    Returns
    -------
    (best_score, best_estimator)
        ``best_score_`` and ``best_estimator_`` of the fitted search.
    """
    print('#'*35, model_name, '#'*35)

    # The previous version also built a time-seeded StratifiedKFold that was
    # never passed to the search; it had no effect and has been dropped.
    gs = GridSearchCV(pipeline,
                      param_grid,
                      scoring=scorer,
                      cv=cv,
                      n_jobs=-1,
                      verbose=1)
    t0 = time()
    gs.fit(X, y)
    train_time = time() - t0
    print("Train time: %0.3fs" % train_time)
    print("Best score: %0.3f" % gs.best_score_)
    # Report only the parameters that were actually searched over.
    best_params = gs.best_estimator_.get_params()
    for param_name in sorted(param_grid.keys()):
        print("\t%s: %r" % (param_name, best_params[param_name]))
    return gs.best_score_, gs.best_estimator_
    
#     with open(model_name+'_best_'+type(clf).__name__+".pkl", 'wb') as handle:
#         pickle.dump(best_model, handle)

In [9]:
from sklearn.metrics import fbeta_score, make_scorer
# F-beta scorer with beta=2. It is not passed to classify_my_model in the
# cells below (they use the default 'f1'); pass scorer=ftwo_scorer to use it.
ftwo_scorer = make_scorer(fbeta_score, beta=2)
# Turns the per-word feature dicts into a feature matrix.
dict_vectorizer = DictVectorizer()
# select_percentile = SelectPercentile(percentile=100)

# (estimator, hyper-parameter grid) pairs; grid keys are prefixed with the
# pipeline step name 'clf'.
clf_map = [
    (
        MultinomialNB(),
        {
            'clf__alpha': [.001, .01, .1, 1],
            'clf__fit_prior': [True, False],
        }
    ),
    (
        LinearSVC(),
        {
            'clf__C': [.1, 1, 10, 100],
            'clf__penalty': ['l2'],
            'clf__loss': ['hinge', 'squared_hinge'],
        }
    ),
    (
        # The grid includes the 'l1' penalty, which needs a solver that
        # supports it. Pinning liblinear (the historical default) keeps the
        # grid valid on scikit-learn releases whose default solver is lbfgs.
        LogisticRegression(solver='liblinear'),
        {
            'clf__penalty': ['l1','l2'],
            'clf__fit_intercept': [True, False],
            'clf__C':[.1, 1, 10, 100],
        }
    ),

]

# Grid-search every classifier on the training split, then report how the
# best configuration does on the held-out split.
for clf, param_grid in clf_map:
    pipeline = Pipeline(steps=[
        ('dictvec', dict_vectorizer),
        ('clf', clf),
    ])
    model_name = type(clf).__name__
    best_score, best_model = classify_my_model(pipeline, param_grid, X_train, y_train, model_name)
    y_pred = best_model.predict(X_test)
    report = metrics.classification_report(y_test, y_pred)
    print(report)

################################### MultinomialNB ###################################
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    7.2s finished


Train time: 9.482s
Best score: 0.639
	clf__alpha: 0.001
	clf__fit_prior: False
             precision    recall  f1-score   support

      False       0.90      0.80      0.85      6787
       True       0.56      0.74      0.64      2341

avg / total       0.81      0.78      0.79      9128

################################### LinearSVC ###################################
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  1.1min finished


Train time: 75.038s
Best score: 0.590
	clf__C: 0.1
	clf__loss: 'squared_hinge'
	clf__penalty: 'l2'
             precision    recall  f1-score   support

      False       0.89      0.52      0.66      6787
       True       0.37      0.81      0.51      2341

avg / total       0.76      0.60      0.62      9128

################################### LogisticRegression ###################################
Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   13.0s
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:   36.1s finished


Train time: 38.849s
Best score: 0.617
	clf__C: 100
	clf__fit_intercept: False
	clf__penalty: 'l1'
             precision    recall  f1-score   support

      False       0.86      0.93      0.89      6787
       True       0.73      0.55      0.63      2341

avg / total       0.82      0.83      0.82      9128



In [10]:
# Ablation: same columns as before, but only the syllable count is parsed as
# a float; the two position columns stay as strings.
continuous_feats = ['word_number_of_syllables']

x_data, y_data = generate_data("big-table-PoS.csv", features, continuous_feats)
X_train, X_test, y_train, y_test = train_test_split(
    x_data, y_data, random_state=2557, test_size=0.2)

In [11]:
# Re-run the grid search for every classifier on the current split and print
# the held-out classification report.
for clf, param_grid in clf_map:
    pipeline = Pipeline(steps=[
        ('dictvec', dict_vectorizer),
        ('clf', clf),
    ])
    model_name = type(clf).__name__
    best_score, best_model = classify_my_model(pipeline, param_grid, X_train, y_train, model_name)
    y_pred = best_model.predict(X_test)
    report = metrics.classification_report(y_test, y_pred)
    print(report)

################################### MultinomialNB ###################################
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   10.7s finished


Train time: 13.246s
Best score: 0.633
	clf__alpha: 0.1
	clf__fit_prior: False
             precision    recall  f1-score   support

      False       0.90      0.80      0.84      6787
       True       0.56      0.73      0.63      2341

avg / total       0.81      0.78      0.79      9128

################################### LinearSVC ###################################
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   54.2s finished


Train time: 62.130s
Best score: 0.612
	clf__C: 10
	clf__loss: 'squared_hinge'
	clf__penalty: 'l2'
             precision    recall  f1-score   support

      False       0.86      0.93      0.89      6787
       True       0.72      0.55      0.62      2341

avg / total       0.82      0.83      0.82      9128

################################### LogisticRegression ###################################
Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   18.8s
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:  1.4min finished


Train time: 84.955s
Best score: 0.611
	clf__C: 10
	clf__fit_intercept: True
	clf__penalty: 'l2'
             precision    recall  f1-score   support

      False       0.86      0.92      0.89      6787
       True       0.72      0.55      0.62      2341

avg / total       0.82      0.83      0.82      9128



In [13]:
# Ablation: word and part-of-speech columns only; nothing is parsed as a number.
features = ['word', 'word_pos_tag', 'word_pos_tag_simplified']
continuous_feats = list()

x_data, y_data = generate_data("big-table-PoS.csv", features, continuous_feats)
X_train, X_test, y_train, y_test = train_test_split(
    x_data, y_data, random_state=2557, test_size=0.2)

In [15]:
# Re-run the grid search for every classifier on the current split and print
# the held-out classification report.
for clf, param_grid in clf_map:
    pipeline = Pipeline(steps=[
        ('dictvec', dict_vectorizer),
        ('clf', clf),
    ])
    model_name = type(clf).__name__
    best_score, best_model = classify_my_model(pipeline, param_grid, X_train, y_train, model_name)
    y_pred = best_model.predict(X_test)
    report = metrics.classification_report(y_test, y_pred)
    print(report)

################################### MultinomialNB ###################################
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    5.6s finished


Train time: 7.661s
Best score: 0.639
	clf__alpha: 0.1
	clf__fit_prior: False
             precision    recall  f1-score   support

      False       0.90      0.79      0.84      6787
       True       0.56      0.76      0.64      2341

avg / total       0.82      0.78      0.79      9128

################################### LinearSVC ###################################
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:   36.0s finished


Train time: 41.422s
Best score: 0.614
	clf__C: 100
	clf__loss: 'squared_hinge'
	clf__penalty: 'l2'
             precision    recall  f1-score   support

      False       0.86      0.91      0.88      6787
       True       0.69      0.57      0.62      2341

avg / total       0.82      0.82      0.82      9128

################################### LogisticRegression ###################################
Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    9.0s
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:   44.7s finished


Train time: 47.581s
Best score: 0.615
	clf__C: 100
	clf__fit_intercept: False
	clf__penalty: 'l2'
             precision    recall  f1-score   support

      False       0.86      0.93      0.89      6787
       True       0.73      0.54      0.62      2341

avg / total       0.82      0.83      0.82      9128



In [16]:
# Ablation: lexical columns plus syllable count and position within the IPU,
# with both of the added columns parsed as floats.
features = [
    'word', 'word_pos_tag', 'word_pos_tag_simplified',
    'word_number_of_syllables', 'word_number_in_ipu',
]
continuous_feats = ['word_number_of_syllables', 'word_number_in_ipu']

x_data, y_data = generate_data("big-table-PoS.csv", features, continuous_feats)
X_train, X_test, y_train, y_test = train_test_split(
    x_data, y_data, random_state=2557, test_size=0.2)

In [17]:
# Re-run the grid search for every classifier on the current split and print
# the held-out classification report.
for clf, param_grid in clf_map:
    pipeline = Pipeline(steps=[
        ('dictvec', dict_vectorizer),
        ('clf', clf),
    ])
    model_name = type(clf).__name__
    best_score, best_model = classify_my_model(pipeline, param_grid, X_train, y_train, model_name)
    y_pred = best_model.predict(X_test)
    report = metrics.classification_report(y_test, y_pred)
    print(report)

################################### MultinomialNB ###################################
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    6.9s finished


Train time: 8.829s
Best score: 0.634
	clf__alpha: 0.01
	clf__fit_prior: False
             precision    recall  f1-score   support

      False       0.90      0.80      0.84      6787
       True       0.56      0.74      0.64      2341

avg / total       0.81      0.78      0.79      9128

################################### LinearSVC ###################################
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:  1.1min finished


Train time: 70.579s
Best score: 0.614
	clf__C: 1
	clf__loss: 'squared_hinge'
	clf__penalty: 'l2'
             precision    recall  f1-score   support

      False       0.86      0.93      0.89      6787
       True       0.73      0.55      0.62      2341

avg / total       0.82      0.83      0.82      9128

################################### LogisticRegression ###################################
Fitting 5 folds for each of 16 candidates, totalling 80 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   12.8s
[Parallel(n_jobs=-1)]: Done  80 out of  80 | elapsed:   46.2s finished


Train time: 55.538s
Best score: 0.617
	clf__C: 100
	clf__fit_intercept: True
	clf__penalty: 'l1'
             precision    recall  f1-score   support

      False       0.86      0.93      0.89      6787
       True       0.73      0.55      0.62      2341

avg / total       0.82      0.83      0.82      9128



In [18]:
# Ablation: same columns as the previous experiment, but the position within
# the IPU is left as a string; only the syllable count is parsed as a float.
features = [
    'word', 'word_pos_tag', 'word_pos_tag_simplified',
    'word_number_of_syllables', 'word_number_in_ipu',
]
continuous_feats = ['word_number_of_syllables']

x_data, y_data = generate_data("big-table-PoS.csv", features, continuous_feats)
X_train, X_test, y_train, y_test = train_test_split(
    x_data, y_data, random_state=2557, test_size=0.2)

In [None]:
# Re-run the grid search for every classifier on the current split and print
# the held-out classification report.
for clf, param_grid in clf_map:
    pipeline = Pipeline(steps=[
        ('dictvec', dict_vectorizer),
        ('clf', clf),
    ])
    model_name = type(clf).__name__
    best_score, best_model = classify_my_model(pipeline, param_grid, X_train, y_train, model_name)
    y_pred = best_model.predict(X_test)
    report = metrics.classification_report(y_test, y_pred)
    print(report)

################################### MultinomialNB ###################################
Fitting 5 folds for each of 8 candidates, totalling 40 fits


[Parallel(n_jobs=-1)]: Done  40 out of  40 | elapsed:    6.5s finished


Train time: 8.388s
Best score: 0.634
	clf__alpha: 0.01
	clf__fit_prior: False
             precision    recall  f1-score   support

      False       0.90      0.80      0.85      6787
       True       0.56      0.74      0.63      2341

avg / total       0.81      0.78      0.79      9128

################################### LinearSVC ###################################
Fitting 5 folds for each of 8 candidates, totalling 40 fits


In [None]:
# Ablation: lexical columns plus the syllable count (parsed as a float).
features = [
    'word', 'word_pos_tag', 'word_pos_tag_simplified',
    'word_number_of_syllables',
]
continuous_feats = ['word_number_of_syllables']

x_data, y_data = generate_data("big-table-PoS.csv", features, continuous_feats)
X_train, X_test, y_train, y_test = train_test_split(
    x_data, y_data, random_state=2557, test_size=0.2)

In [None]:
# Re-run the grid search for every classifier on the current split and print
# the held-out classification report.
for clf, param_grid in clf_map:
    pipeline = Pipeline(steps=[
        ('dictvec', dict_vectorizer),
        ('clf', clf),
    ])
    model_name = type(clf).__name__
    best_score, best_model = classify_my_model(pipeline, param_grid, X_train, y_train, model_name)
    y_pred = best_model.predict(X_test)
    report = metrics.classification_report(y_test, y_pred)
    print(report)

# Adding POS features

In [None]:
# Best one was LogReg with only num of syllables as a continuous feature
features = [
    'word',
    'word_pos_tag',
    'word_pos_tag_simplified',
    'word_number_of_syllables',
    'word_number_in_turn',
    'word_number_in_task',
    'total_number_of_words_in_turn',
    'total_number_of_words_in_task'
]
continuous_feats = [
    'word_number_of_syllables'
]
# Same label column and break values that generate_data uses.
break_label = 'word_tobi_break_index'
break_set = {"4", "4-", "4p"}

## Read the file
# Columns are looked up by header name instead of by hard-coded position, so
# the rows cannot be shifted by one column (the off-by-one that DictReader
# fixed in generate_data).
# NOTE(review): assumes the CSV header contains every name in `features`
# verbatim, including the two total_number_of_words_* columns -- confirm.
file_name = "big-table-PoS.csv"
x_data = []
y_data = []
labels = []  # raw break-index strings, kept alongside the boolean y_data
with open(file_name, 'r', newline='') as f:
    for row in csv.DictReader(f):
        feats = {feat: row[feat] for feat in features}
        # convert some to continuous features
        for feat in continuous_feats:
            feats[feat] = float(feats[feat])
        x_data.append(feats)
        y_data.append(row[break_label] in break_set)
        labels.append(row[break_label])

In [None]:
# Distance (in words) from the current word to the end of its turn.
DIST_END_TURN = "DIST_END_TURN"
for row in x_data:
    words_in_turn = int(row['total_number_of_words_in_turn'])
    position_in_turn = int(row['word_number_in_turn'])
    row[DIST_END_TURN] = words_in_turn - position_in_turn

## POS Bigram, segmented by IPU (don't use this, since we're not given IPU information from text)

In [None]:
POS_IPU_BIGRAM_LEFT = "POS_IPU_BIGRAM_LEFT"
POS_IPU_BIGRAM_RIGHT = "POS_IPU_BIGRAM_RIGHT"
# Each row must carry 'word_number_in_ipu' and 'total_number_of_words_in_ipu'
# (as strings); a KeyError here means they were not loaded into x_data.
last_index = len(x_data) - 1
for i, row in enumerate(x_data):
    pos_tag = row['word_pos_tag']
    # to the left -- the very first row has no predecessor, so treat it as an
    # IPU start instead of wrapping around to x_data[-1]
    if i == 0 or row['word_number_in_ipu'] == '1':
        row[POS_IPU_BIGRAM_LEFT] = 'BEGIN/'+pos_tag
    else:
        row[POS_IPU_BIGRAM_LEFT] = x_data[i-1]['word_pos_tag']+"/"+pos_tag
    # to the right -- the very last row has no successor, so treat it as an
    # IPU end instead of indexing past the list
    if i == last_index or row['word_number_in_ipu'] == row['total_number_of_words_in_ipu']:
        row[POS_IPU_BIGRAM_RIGHT] = pos_tag+"/END"
    else:
        row[POS_IPU_BIGRAM_RIGHT] = pos_tag+"/"+x_data[i+1]['word_pos_tag']

In [None]:
# With dist features
# x_data was rebuilt and extended above, so split it again (same seed as the
# earlier experiments) before searching.
X_train, X_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=2557)
for (clf, param_grid) in clf_map:
    # classify_my_model expects a pipeline plus the training data and a name.
    pipeline = Pipeline([
        ('dictvec', dict_vectorizer),
        ('clf', clf)
    ])
    best_score, best_model = classify_my_model(pipeline, param_grid, X_train, y_train, clf.__class__.__name__)
    y_pred = best_model.predict(X_test)
    print(metrics.classification_report(y_test, y_pred))

In [None]:
# Remove the IPU-segmented bigram features from every row again.
for row in x_data:
    del row[POS_IPU_BIGRAM_LEFT]
    del row[POS_IPU_BIGRAM_RIGHT]

## POS Bigram, segmented by TURN

In [None]:
# POS bigrams to the left and right of each word, segmented by turn
POS_TURN_BIGRAM_LEFT = "POS_TURN_BIGRAM_LEFT"
POS_TURN_BIGRAM_RIGHT = "POS_TURN_BIGRAM_RIGHT"
last_index = len(x_data) - 1
for i, row in enumerate(x_data):
    pos_tag = row['word_pos_tag']
    # to the left -- the very first row has no predecessor, so treat it as a
    # turn start instead of wrapping around to x_data[-1]
    if i == 0 or row['word_number_in_turn'] == '1':
        row[POS_TURN_BIGRAM_LEFT] = 'BEGIN/'+pos_tag
    else:
        row[POS_TURN_BIGRAM_LEFT] = x_data[i-1]['word_pos_tag']+"/"+pos_tag
    # to the right -- the very last row has no successor, so treat it as a
    # turn end instead of indexing past the list
    if i == last_index or row['word_number_in_turn'] == row['total_number_of_words_in_turn']:
        row[POS_TURN_BIGRAM_RIGHT] = pos_tag+"/END"
    else:
        row[POS_TURN_BIGRAM_RIGHT] = pos_tag+"/"+x_data[i+1]['word_pos_tag']

In [None]:
# Split the current (feature-augmented) x_data with the same seed as before.
X_train, X_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=2557)
for (clf, param_grid) in clf_map:
    # classify_my_model expects a pipeline plus the training data and a name.
    pipeline = Pipeline([
        ('dictvec', dict_vectorizer),
        ('clf', clf)
    ])
    best_score, best_model = classify_my_model(pipeline, param_grid, X_train, y_train, clf.__class__.__name__)
    y_pred = best_model.predict(X_test)
    print(metrics.classification_report(y_test, y_pred))

In [None]:
# Remove the turn-segmented bigram features from every row again.
for row in x_data:
    del row[POS_TURN_BIGRAM_LEFT]
    del row[POS_TURN_BIGRAM_RIGHT]

## Segmenting by IPU works better than segmenting by turn. Next, add a feature marking whether the word ends with a hyphen (which denotes a stutter)

In [None]:
# True when the word ends with a hyphen.
IS_STUTTER = "IS_STUTTER"
for i in range(len(x_data)):
    # endswith() is False for an empty word, where word[-1] would raise IndexError.
    x_data[i][IS_STUTTER] = x_data[i]['word'].endswith('-')

In [None]:
# Split the current (feature-augmented) x_data with the same seed as before.
X_train, X_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=2557)
for (clf, param_grid) in clf_map:
    # classify_my_model expects a pipeline plus the training data and a name.
    pipeline = Pipeline([
        ('dictvec', dict_vectorizer),
        ('clf', clf)
    ])
    best_score, best_model = classify_my_model(pipeline, param_grid, X_train, y_train, clf.__class__.__name__)
    y_pred = best_model.predict(X_test)
    print(metrics.classification_report(y_test, y_pred))

# Segmented by TURN trigram as well?

In [None]:
# The variable is named *_IPU_* but the feature key and the segmentation
# below are both turn-based.
POS_IPU_TRIGRAM = "POS_TURN_TRIGRAM"
last_index = len(x_data) - 1
for i in range(len(x_data)):
    # to the left -- guard i == 0 so the first row never wraps to x_data[-1]
    left = "BEGIN"
    if i > 0 and x_data[i]['word_number_in_turn'] != '1':
        left = x_data[i-1]['word_pos_tag']
    # to the right -- guard the last row so x_data[i+1] cannot go out of range
    right = "END"
    if i < last_index and x_data[i]['word_number_in_turn'] != x_data[i]['total_number_of_words_in_turn']:
        right = x_data[i+1]['word_pos_tag']
    # NOTE(review): the middle element is the word itself, not its POS tag --
    # confirm this is intended for a "POS" trigram.
    x_data[i][POS_IPU_TRIGRAM] = left+"/"+x_data[i]['word']+"/"+right

In [None]:
# Split the current (feature-augmented) x_data with the same seed as before.
X_train, X_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=2557)
for (clf, param_grid) in clf_map:
    # classify_my_model expects a pipeline plus the training data and a name;
    # 'turn_trigram' is kept as a prefix of the printed model name.
    pipeline = Pipeline([
        ('dictvec', dict_vectorizer),
        ('clf', clf)
    ])
    model_name = 'turn_trigram_' + clf.__class__.__name__
    best_score, best_model = classify_my_model(pipeline, param_grid, X_train, y_train, model_name)
    y_pred = best_model.predict(X_test)
    print(metrics.classification_report(y_test, y_pred))

# Adding syntactic features

In [None]:
import regex as re
from spacy.tokenizer import Tokenizer

# Replace the pipeline's tokenizer with a bare Tokenizer that is given only a
# token_match callable (a match on a leading apostrophe) and no other rules.
apostrophe_match = re.compile(r"'").match
nlp.tokenizer = Tokenizer(nlp.vocab, token_match=apostrophe_match)

In [None]:
# Keys under which the dependency-parse features are stored in each row.
SIZE_SUBTREE, NUM_SIBLINGS = "SIZE_SUBTREE", "NUM_SIBLINGS"
# ARC_DIST_NEXT is a coarse distance between a token and the one after it.
# Turns are not guaranteed to parse into one connected tree, so the distance
# is bucketed rather than measured exactly:
#   0 -> there is no next token (current token is the last one)
#   1 -> the next token is the current token's head or one of its children
#   2 -> the head/children of the two tokens overlap
#   3 -> anything else
ARC_DIST_NEXT, FUNC = "ARC_DIST_NEXT", "FUNC"
def subtree_size(token):
    """Return the number of tokens in the subtree rooted at ``token``.

    The count includes ``token`` itself, so a leaf has size 1. (Without the
    ``1 +`` the recursion never counts anything and always yields 0.)
    """
    return 1 + sum(subtree_size(child) for child in token.children)
def num_siblings(token):
    """Count the children of ``token``'s head.

    Every child of the head is counted, so ``token`` itself is included
    whenever it appears among them.
    """
    return sum(1 for _ in token.head.children)
def arc_dist(token, next_token):
    """Bucket the dependency distance between ``token`` and ``next_token``.

    Tokens are compared by their ``text``. Returns
    0 if ``next_token`` is falsy,
    1 if ``next_token`` is ``token``'s head or one of its children,
    2 if the head/children of the two tokens share a text,
    3 otherwise.
    """
    if not next_token:
        return 0

    next_text = next_token.text
    if next_text == token.head.text or next_text in [kid.text for kid in token.children]:
        return 1

    # Texts of everything directly attached to the current token ...
    own_neighbours = [node.text for node in list(token.children) + [token.head]]
    # ... checked against everything directly attached to the next token.
    next_neighbours = list(next_token.children) + [next_token.head]
    if any(node.text in own_neighbours for node in next_neighbours):
        return 2
    return 3
def get_func(token):
    """Map a token's dependency label to a coarse grammatical-function code.

    Returns 0 for subject-like labels (containing "subj"), 1 for direct
    objects ("dobj"), 2 for objects of prepositions ("pobj"), 3 otherwise.

    The label is upper-cased before matching: spaCy reports labels such as
    "nsubj" in lower case, so a case-sensitive test against "SUBJ" would
    never match and every token would get 3.
    """
    dep = token.dep_.upper()
    if "SUBJ" in dep: return 0
    elif "DOBJ" in dep: return 1
    elif "POBJ" in dep: return 2
    else: return 3
# initialise the syntactic features to the placeholder '-1'; any row that the
# parsing loop below does not reach keeps this value
for i in range(len(x_data)):
    x_data[i][SIZE_SUBTREE] = '-1'
    x_data[i][NUM_SIBLINGS] = '-1'
    x_data[i][ARC_DIST_NEXT] = '-1'
    x_data[i][FUNC] = '-1'
for i in range(len(x_data)):
    # a turn starts at every row whose position within the turn is '1'
    if x_data[i]['word_number_in_turn'] == '1':
        # figure out the current turn
        turn = ' '.join([word['word'] for word in x_data[i:i+int(x_data[i]['total_number_of_words_in_turn'])]])
        doc = nlp(turn)
        for j, token in enumerate(doc):
            # The following token is looked up by its position j inside the
            # parsed turn; using the global row index i here would pick an
            # unrelated token (or none at all).
            next_token = doc[j+1] if j+1<len(doc) else None
            # token j of the turn corresponds to row i+j of x_data
            x_data[i+j][SIZE_SUBTREE] = str(subtree_size(token))
            x_data[i+j][NUM_SIBLINGS] = str(num_siblings(token))
            x_data[i+j][ARC_DIST_NEXT] = str(arc_dist(token, next_token))
            x_data[i+j][FUNC] = str(get_func(token))

In [None]:
# Split the current (feature-augmented) x_data with the same seed as before.
X_train, X_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.2, random_state=2557)
for (clf, param_grid) in clf_map:
    # classify_my_model expects a pipeline plus the training data and a name;
    # 'syntactic' is kept as a prefix of the printed model name.
    pipeline = Pipeline([
        ('dictvec', dict_vectorizer),
        ('clf', clf)
    ])
    model_name = 'syntactic_' + clf.__class__.__name__
    best_score, best_model = classify_my_model(pipeline, param_grid, X_train, y_train, model_name)
    y_pred = best_model.predict(X_test)
    print(metrics.classification_report(y_test, y_pred))

In [None]:
# Find most significant features
# NOTE: pickle.load executes arbitrary code from the file -- only load
# pickles you produced yourself.
with open('syntactic_best_LogisticRegression.pkl', 'rb') as handle:
    # the context manager closes the file once the model is loaded
    model = pickle.load(handle)


In [None]:
model.steps

In [None]:
def show_most_informative_features(vectorizer, clf, n=20):
    """Print the ``n`` most negative and ``n`` most positive coefficients.

    Each printed line pairs one of the lowest coefficients (left) with one of
    the highest (right), together with their feature names.

    vectorizer : fitted vectorizer exposing ``get_feature_names_out()`` (or
        the older ``get_feature_names()``).
    clf : fitted linear model; only ``clf.coef_[0]`` is read.
    n : number of features shown on each side.
    """
    # get_feature_names() was removed from recent scikit-learn releases;
    # prefer the replacement and fall back for older versions.
    get_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
    feature_names = get_names()
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    # first n (most negative) paired with last n in reverse (most positive)
    top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
    for (coef_1, fn_1), (coef_2, fn_2) in top:
        print("\t%.4f\t%-15s\t\t%.4f\t%-15s" % (coef_1, fn_1, coef_2, fn_2))
        
def show_least_informative_features(vectorizer, clf, n=20):
    """Print the ``n`` features whose coefficients are closest to zero.

    vectorizer : fitted vectorizer exposing ``get_feature_names_out()`` (or
        the older ``get_feature_names()``).
    clf : fitted linear model; only ``clf.coef_[0]`` is read.
    n : number of features to print.
    """
    # get_feature_names() was removed from recent scikit-learn releases;
    # prefer the replacement and fall back for older versions.
    get_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
    feature_names = get_names()
    # order by absolute coefficient, smallest first
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names), key=lambda x: abs(x[0]))
    top = coefs_with_fns[:n]
    for (coef_1, fn_1) in top:
        print("\t%.4f\t%-15s" % (coef_1, fn_1))

In [None]:
# The vectorizer is the first pipeline step and the classifier the last;
# indexing the last step works for both 2-step and 3-step pipelines.
show_most_informative_features(model.steps[0][1], model.steps[-1][1])

In [None]:
# The vectorizer is the first pipeline step and the classifier the last;
# indexing the last step works for both 2-step and 3-step pipelines.
show_least_informative_features(model.steps[0][1], model.steps[-1][1], n=100)

In [None]:
# Number of coefficients held by the classifier (the last pipeline step).
len(model.steps[-1][1].coef_[0])

In [None]:
from collections import defaultdict
def show_feature_importance(vectorizer, clf):
    """Group absolute coefficients by source feature and print their means.

    Feature names are split on '=' and the part before it is used as the
    group key, so all names sharing a prefix fall into one group.

    vectorizer : fitted vectorizer exposing ``get_feature_names_out()`` (or
        the older ``get_feature_names()``).
    clf : fitted linear model; only ``clf.coef_[0]`` is read.

    Returns a defaultdict mapping each group key to the list of absolute
    coefficients collected for it.
    """
    # get_feature_names() was removed from recent scikit-learn releases;
    # prefer the replacement and fall back for older versions.
    get_names = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
    feature_names = get_names()
    feat_count = defaultdict(list)
    for coef_1, fn_1 in zip(clf.coef_[0], feature_names):
        feat = fn_1.split('=')[0]
        feat_count[feat].append(abs(coef_1))
    # mean absolute coefficient per group
    for feat, magnitudes in feat_count.items():
        print(feat, sum(magnitudes)/len(magnitudes))
    return feat_count

In [None]:
# The vectorizer is the first pipeline step and the classifier the last;
# indexing the last step works for both 2-step and 3-step pipelines.
feat_count = show_feature_importance(model.steps[0][1], model.steps[-1][1])

In [None]:
import matplotlib.pyplot as plt

In [None]:
# One histogram per feature group, showing how its absolute coefficients are
# distributed; the bin count is the integer square root of the group size.
for feat, magnitudes in feat_count.items():
    n_bins = int(len(magnitudes) ** 0.5)
    hist, bins = np.histogram(magnitudes, bins=n_bins)
    bar_width = 0.7 * (bins[1] - bins[0])
    bar_centers = (bins[:-1] + bins[1:]) / 2
    plt.bar(bar_centers, hist, align='center', width=bar_width)
    plt.title(feat)
    plt.show()