## Matrix Multiplication for Dialog Heads

This file perform matrix multiplication learning for dialog heads.

Content:

* Clean data a bit, remove low frequency count.
* Build Coo Matrix.
* Parameter search.

In [1]:
# DATA LOAD
import os
import itertools
import pandas as pd
import numpy as np

import sys
# sys.path.insert(0, '/home/kits-adm/Workspace/play/dodgson_play/implicit_local/implicit') # local compile that has gpu support
import implicit
import numpy as np

from copy import deepcopy
from sklearn import metrics
from scipy.sparse import coo_matrix
from implicit.evaluation import train_test_split, precision_at_k, mean_average_precision_at_k

os.environ["MKL_NUM_THREADS"] = "1"

BASE = '/home/kits-adm/Datasets/dodgsons/dodgson/outputs/no_split/'
RANDOM_STATE = 28145037

def seed(value=RANDOM_STATE):
    np.random.seed(value)

In [2]:
data = pd.read_csv('all_dialogs_heads.csv')
data = data.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)
data

Unnamed: 0,sent_id,show_id,char_name,char_id,head_info,head_pos,token,token_par,position,is_stop,head_text
0,1063785,QueerAsFolk,MICHAEL,l22346,POS,VBZ,be,be,2,True,VBZ|be|be
1,233952,TheSimpsons,Lisa Simpson,a15823,DEP,det,a,girl,8,True,det|a|girl
2,926937,SonsOfAnarchy,CLAY,l24811,DEP,compound,club,problem,5,False,compound|club|problem
3,504583,OnceUponATime,MARY MARGARET,l19393,DEP,punct,.,Nothing,-25,False,punct|.|Nothing
4,508646,OnceUponATime,CINDERELLA,l19407,POS,JJ,entire,entire,77,False,JJ|entire|entire
5,78453,DoctorWho,BARBARA,l8667,DEP,advcl,be,hold,34,True,advcl|be|hold
6,146223,VeronicaMars,VERONICA,l30591,DEP,compound,drug,testing,5,False,compound|drug|testing
7,19636,DoctorWho,ELEVENTH DOCTOR,l8661,POS,.,.,.,66,False,.|.|.
8,717221,GilmoreGirls,RORY,l11613,DEP,ROOT,buy,buy,0,False,ROOT|buy|buy
9,161951,Smallville,CLARK,l24611,DEP,punct,",",believe,4,False,"punct|,|believe"


## Basic cleaning

Basic filtering of all data except the following:


* All punct except "?"
* No ROOT.

In addition, remove all sentence feature that have freq less than 5.

In [3]:
data = data.loc[(data.head_pos!='ROOT') & (data.head_pos!='punct')]
data

Unnamed: 0,sent_id,show_id,char_name,char_id,head_info,head_pos,token,token_par,position,is_stop,head_text
0,1063785,QueerAsFolk,MICHAEL,l22346,POS,VBZ,be,be,2,True,VBZ|be|be
1,233952,TheSimpsons,Lisa Simpson,a15823,DEP,det,a,girl,8,True,det|a|girl
2,926937,SonsOfAnarchy,CLAY,l24811,DEP,compound,club,problem,5,False,compound|club|problem
4,508646,OnceUponATime,CINDERELLA,l19407,POS,JJ,entire,entire,77,False,JJ|entire|entire
5,78453,DoctorWho,BARBARA,l8667,DEP,advcl,be,hold,34,True,advcl|be|hold
6,146223,VeronicaMars,VERONICA,l30591,DEP,compound,drug,testing,5,False,compound|drug|testing
7,19636,DoctorWho,ELEVENTH DOCTOR,l8661,POS,.,.,.,66,False,.|.|.
10,1059921,Charmed1998,ANDY,l6497,DEP,ccomp,be,think,-18,True,ccomp|be|think
11,504657,OnceUponATime,MARY MARGARET,l19393,POS,PRP$,her,her,45,True,PRP$|her|her
12,672740,Alias,JACK,l1275,POS,UH,um,um,9,False,UH|um|um


In [4]:
head_count = data.head_text.value_counts()
print(len(head_count))
head_count

1084813


.|.|.                           576174
,|,|,                           428302
PRP|you|you                     310320
PRP|i|i                         265113
.|?|?                           252663
DT|the|the                      205609
VBZ|be|be                       180086
NNP|PERSON|PERSON               146516
TO|to|to                        146201
VBP|be|be                       145453
RB|not|not                      141320
DT|a|a                          135013
PRP|it|it                       132420
CC|and|and                       92985
WP|what|what                     79302
IN|of|of                         78788
PRP|we|we                        77222
DT|that|that                     74381
VBD|be|be                        60514
IN|in|in                         57542
PRP|me|me                        56727
VBP|do|do                        53599
nsubj|it|be                      53510
DT|this|this                     52876
IN|to|to                         52822
.|!|!                    

In [5]:
sent_count = data.sent_id.value_counts()
print(len(sent_count))
sent_count

929614


831222     170
763046     163
467363     154
831028     150
719532     147
965936     143
241286     135
831108     132
398397     132
715491     132
704331     130
909670     128
731202     128
745303     127
952114     127
967251     127
575390     126
974959     126
900148     125
979734     125
716739     124
974554     123
325812     122
426906     122
663693     121
845686     121
427592     121
840637     121
306281     119
734729     119
          ... 
774933       4
269051       4
340097       4
438969       4
694480       4
641063       4
364214       4
1001061      4
873415       4
741229       4
482523       4
323765       4
350072       4
144686       4
733401       4
683566       4
339627       4
510044       4
319894       4
222081       4
880332       4
718774       4
357302       4
735303       4
683565       4
342747       4
1005511      4
224865       4
761605       4
515802       4
Name: sent_id, Length: 929614, dtype: int64

In [6]:
# -- Filter Inspection --
# BEFORE WE FILTER: inspect percentage of data left after filtering, we do not want to remove too much lol
# we keep at >5 freqency features.

_count = 0
_len = data.shape[0]
for text, freq in data.head_text.value_counts().iteritems():
    if 5 < freq:
        _count += freq

print(_count/_len)  # precentage of data still got left after filtering, got 90% left, pretty good!

0.9053410982539049


In [7]:
# remove all low frequency features and save to file
data = data[data.groupby('head_text')['head_text'].transform('count').ge(5)]
data

Unnamed: 0,sent_id,show_id,char_name,char_id,head_info,head_pos,token,token_par,position,is_stop,head_text
0,1063785,QueerAsFolk,MICHAEL,l22346,POS,VBZ,be,be,2,True,VBZ|be|be
1,233952,TheSimpsons,Lisa Simpson,a15823,DEP,det,a,girl,8,True,det|a|girl
4,508646,OnceUponATime,CINDERELLA,l19407,POS,JJ,entire,entire,77,False,JJ|entire|entire
5,78453,DoctorWho,BARBARA,l8667,DEP,advcl,be,hold,34,True,advcl|be|hold
6,146223,VeronicaMars,VERONICA,l30591,DEP,compound,drug,testing,5,False,compound|drug|testing
7,19636,DoctorWho,ELEVENTH DOCTOR,l8661,POS,.,.,.,66,False,.|.|.
10,1059921,Charmed1998,ANDY,l6497,DEP,ccomp,be,think,-18,True,ccomp|be|think
11,504657,OnceUponATime,MARY MARGARET,l19393,POS,PRP$,her,her,45,True,PRP$|her|her
12,672740,Alias,JACK,l1275,POS,UH,um,um,9,False,UH|um|um
13,144292,VeronicaMars,VERONICA,l30591,POS,VBZ,do,do,0,True,VBZ|do|do


In [8]:
# -- VALIDATE --
# validate the last step (freq filtering) is successful.

head_count = data.head_text.value_counts()
print(len(head_count))
head_count

142447


.|.|.                           576174
,|,|,                           428302
PRP|you|you                     310320
PRP|i|i                         265113
.|?|?                           252663
DT|the|the                      205609
VBZ|be|be                       180086
NNP|PERSON|PERSON               146516
TO|to|to                        146201
VBP|be|be                       145453
RB|not|not                      141320
DT|a|a                          135013
PRP|it|it                       132420
CC|and|and                       92985
WP|what|what                     79302
IN|of|of                         78788
PRP|we|we                        77222
DT|that|that                     74381
VBD|be|be                        60514
IN|in|in                         57542
PRP|me|me                        56727
VBP|do|do                        53599
nsubj|it|be                      53510
DT|this|this                     52876
IN|to|to                         52822
.|!|!                    

## Utilities for Matrix Multiplication

This section defines common functions of MM.

### Note: Approximate Good Params

The following params works well in general, regardless of variations:

conf=20, factors=30, regularization=0.1, iterations=200

We choose to not push too much on accuracy, and use same params for all variations. You can push quiet a lot on accuarcy by perform a grid search. We provide grid search code here.

Acc record for each variations are recorded here (% on top 1 sentence retrival):

* v1 (keep all): 28.975672759668893

In [9]:
# -- UTILITY BLOCK --

# -- process to coo matrix --
def to_coo(df, _obj_col, _feature_col):
    obj_cat = data[_obj_col].astype('category')
    feature_cat = data[_feature_col].astype('category')
#     df[_obj_col] = sent_cat
#     df[_feature_col] = feature_cat

    # create a sparse matrix of all the users/repos
    return (
        coo_matrix((np.ones(df.shape[0]), (feature_cat.cat.codes.copy(), obj_cat.cat.codes.copy()))),
        obj_cat, feature_cat,
    )

# Perform training and optional testing.
# optimal: {'factors': 27, 'regularization': 0.1, 'iterations': 200, 'model': als, 'confidence': 30}
def get_train(df, top_n=1, random_state = 28145037, report_test_acc=False,
              conf=30, factors=27, regularization=0.1, iterations=200):
    _obj_col='sent_id'
    _feature_col='head_text'
    
    obj_cat = df[_obj_col].astype('category')
    feature_cat = df[_feature_col].astype('category')
#     data[_obj_col] = sent_cat
#     data[_feature_col] = feature_cat

    # build coo matrix
    coo = coo_matrix((np.ones(df.shape[0]), (feature_cat.cat.codes.copy(), obj_cat.cat.codes.copy())))
    print(repr(coo))  # for manual sanity check

    # sanity check for accuracy
    if report_test_acc:
        print('--SANITY: Validate accuracy --')
        np.random.seed(random_state)
        train_csr, test_csr = train_test_split(coo, train_percentage=0.7)
        train_csr, test_csr

        _als = implicit.als.AlternatingLeastSquares(factors=factors,
                                                    regularization=regularization, iterations=iterations)
        _als.fit(train_csr * conf)
        prec = precision_at_k(_als, train_csr.T, test_csr.T, K=top_n)
        print('Sent prec: {}', prec*100)
    
    # train include all data, return the model and categories
    print('--TRAIN: train on complete matrix --')
    _als = implicit.als.AlternatingLeastSquares(factors=factors,
                                                regularization=regularization, iterations=iterations)
    _als.fit(coo * conf)
    return _als, obj_cat, feature_cat


# Build refer maps for sentences.
def build_maps(df):
    sent_map = dict(enumerate(df['sent_id'].astype('category').cat.categories))
    # key: sentence id. value: category id of sentence(what the MM model indexing).
    inv_sent_map = {r: i for i, r in sent_map.items()}
    print(len(inv_sent_map), len(sent_map))
    return sent_map, inv_sent_map

In [10]:
# -- PARAMETER SEARCH --
# NOTE: we did not perform a full scale parameter search due to our hardwares.
# We only found the params that are about right. You can play around to get more accurate model.

# this is for link to GPU. GPU build has to perform locally. Check on implicit repo for details.
# NOTE: we did not made this work due to hardware condition.
# GPU version is not validated by us, use by your own risk.
# sys.path.insert(0, '/home/kits-adm/Workspace/play/dodgson_play/implicit_local') 

# -- perform an approximately grid search --
# Warning: this this a pretty complete search, cost a lot of times (several days on dencent GPU) 
# _searches = {'factors': [10, 20, 30, 40, 50, 80, 120],
#              'regularization': [0.0, 1e-5, 1e-3, 1e-1, 1e1, 1e2],
#              'iterations': [1, 10, 15, 50, 100, 500, 1000, 3000],
#              'model': [implicit.als.AlternatingLeastSquares, implicit.bpr.BayesianPersonalizedRanking],
#              'confidence': [1, 2, 3, 4, 5, 10, 20],
#             }

# best guess
# this paramters are in general works well for implicit, regardless of dataset
# We recommand to use it if you running out of patience for the last one.
_searches = {'factors': [27, 30, 33, 36],
             'regularization': [0.0, 1e-3, 1e-1],
             'iterations': [200, 500,],
             'model': [implicit.als.AlternatingLeastSquares],
             'confidence': [10, 20, 30],
            }

# best guess with gpu (factor by 32)
# however we did not made this work due to py37 is not compatable with GPU version LoL, you can try it, should be fast)
# also a note here: The testing may actually takes longer than training due to on CPU, so...this may not be useful after all 
# _searches = {'factors': [32, 64, 96],
#              'regularization': [0.0, 1e-5, 1e-3, 1e-1, 1e1, 1e2],
#              'iterations': [500,],
#              'model': [implicit.als.AlternatingLeastSquares],
#              'confidence': [10, 20, 30],
#              'use_gpu': [True,],
#             }

# fixed to a specific value
# _searches = {'factors': [24,],
#              'regularization': [0.01,],
#              'iterations': [500,],
#              'model': [implicit.als.AlternatingLeastSquares],
#              'confidence': [20,],
#             }

def grid_search(train_csr, test_csr, param_grid, top_k=10):
    best = {
        'prec': None, 'mprec': None, 'params': None, 'model': None,
    }  # mprec, param, model

    keys, values = zip(*param_grid.items())
    for v in itertools.product(*values):
        params = dict(zip(keys, v))
        print('--------')
        print(params)

        # setup model
        _base_model = params.pop('model')
        _model_init = deepcopy(_base_model)
        _conf_magnify = params.pop('confidence')
        _model = _model_init(**params)
        print(_model)

        # train
        seed()
        _model.fit(train_csr * _conf_magnify)
        prec = precision_at_k(_model, train_csr.T, test_csr.T, K=top_k)
#         mprec = mean_average_precision_at_k(_model, train_csr.T, test_csr.T, K=top_k)

        print(prec)
#         print(mprec)
        if (best['prec'] is None) or prec > best['prec']:
            best['prec'] = prec
#             best['mprec'] = mprec
            best['params'] = params
            best['model'] = _model
            print('...New Best...')
    print('--COMPLETE--')
    return best

# def 
# best_models = {}
# for top_k in [1]: #, 10, 100, 1000]:
#     print('-- start {} --'.format(top_k))
#     best = grid_search(train_csr, test_csr,
#                        _searches, top_k=top_k,)
#     best_models[top_k] = best

# usage:
def search_all(df, random_state=3262832):
    _obj_col='sent_id'
    _feature_col='head_text'
    
    obj_cat = df[_obj_col].astype('category')
    feature_cat = df[_feature_col].astype('category')

    # build coo matrix
    coo = coo_matrix((np.ones(df.shape[0]), (feature_cat.cat.codes.copy(), obj_cat.cat.codes.copy())))
    print(repr(coo))  # for manual sanity check

    # sanity check for accuracy
    print('--SANITY: Validate accuracy --')
    np.random.seed(random_state)
    train_csr, test_csr = train_test_split(coo, train_percentage=0.7)
    best = grid_search(train_csr, test_csr, _searches, top_k=100,)

# search_all(data)

## Utilities for model inspection

### Why

The model requires to be manually inspected. This is due to in semi-supervised setting, accuracy may not be an good indicator of learning, due to:

* model accuracy depends on how data setup. Due to it indicate roughly: "how well the model is guessing the data correctly".
* How data should be setup is exactly what we want to investigate. Therefore based on the last point, the question is circular and can not be solved. Only thing we can do is to manually inspect it.

In addition, the inspection want to answer all of the following question:

* How good is each variations of the model learning? In details, what exactly does it learn? To take apart this question, in details, we want to ask:
    * What exactly does the variations of the model care about? Is it sentence structure (syntactic), content (sematic), or something else?
    * If it is learning something useful, how does it help us normalize data?
    * How well is the variation of the model learning, for whatever it is learning?
* In terms of clustering, what is the optimal sum of determine if it should be centrality?
    * Can median / avg be an good indicator for centrality?
* In terms of the sentence characteristic (simple vs complex sentence, special structure or meanings...) what does sum / avg be good at indicate them?
* What exactly determine a good ranking, or a good confidence value? E.g., if sentence 1 against sentence 2 has confidence of 0.8, and rank 90, what does it mean?


### What

The inspection code follows procedure:

* random draw n=40 (called sample_n) sentences from dataset.
* for each drawed sentences, retrive t=10,000 (called top_n) top ranked result against the sentence. Keep confidence and rank value.
* manual inspect (program print out) some samples of the ranked sentences:
    * Sentence will be sampled by rank ranges, for each range random draw 2 samples from them:
        * [R1] 1-10
        * [R2] 10-20
        * [R3] 20-30
        * [R4] 30-60
        * [R5] 60-120
        * [R6] 120-300
        * [R7] 300-600
        * [R8] 600-1000
        * [R9] 1000-4000
* In addition, all drawed sentence will be print out by its average and sum of top 10,000 value, this value is changeable, we call the value accepable_rank. (Note: the value can not be bigger than 10,000 due to it is based on the last calcuated result).

In [11]:
# --dialogs data need to be loaded --

dialogs = pd.read_csv('all_dialogs.csv')
dialogs

Unnamed: 0,show_id,char_name,char_id,dia1,dia2
0,DoctorWho,FOURTH DOCTOR,l8653,Get me the medical officer. Lieutenant Sulliva...,Human history.
1,DoctorWho,FOURTH DOCTOR,l8653,It's something that happened when we first met.,"I tell you, Brigadier, there's nothing to worr..."
2,DoctorWho,FOURTH DOCTOR,l8653,"This the patient, sir?",And stupid. If the square on the hypotenuse eq...
3,DoctorWho,FOURTH DOCTOR,l8653,"There you are. Now come along, Doctor, you're ...",Am I? Don't you mean the infirmary?
4,DoctorWho,FOURTH DOCTOR,l8653,"No, I do not mean the infirmary. I mean the si...",Not fit? I'm the Doctor.
5,DoctorWho,FOURTH DOCTOR,l8653,"No, Doctor, I'm the doctor and I say that you'...","You may be a doctor, but I'm the Doctor. The d..."
6,DoctorWho,FOURTH DOCTOR,l8653,"Look here, Doctor. You're not fit",Not fit? Not fit? Of course I'm fit. All syste...
7,DoctorWho,FOURTH DOCTOR,l8653,"I say, I don't think that can be right.","Both a bit fast, are they?"
8,DoctorWho,FOURTH DOCTOR,l8653,"Well, I","Still, must be patient. A new body's like a ne..."
9,DoctorWho,FOURTH DOCTOR,l8653,"Well, I really don't know.","Well, of course you don't. Why should you? You..."


In [12]:
import random

class ModelInspection:
    RANKS = [1, 10, 20, 30, 60, 120, 300, 600, 1000, 4000, 10000]
    CONFS = np.flip(np.arange(-2.0, 2.0, 0.3))
#     ALLOW_SORTING = {'mean', 'median', 'sum'}

    def __init__(self, model, sent_map, invert_sent_map, dialog_df, top_n=100):
        self.model = model
        self.sent_map = sent_map  # key: sent cat, value: sent id
        self.inv_map = invert_sent_map  # invert key value for sent_map.
        assert len(self.sent_map) == len(self.inv_map)  # should equal due to sent cat are unique
        # can not be smaller than max range, otherwise range can not be fetched.
        assert top_n <= self.RANKS[-1]
        self.top_n = top_n
        self.dialog_df = dialog_df
        # store computed results before ranking.
        self._res = {}  # key: sentence id; value: array of ranks, confs, sents. 

    def compute(self, sample_n=40, random_state=19374, overwrite=False):
        """Draw n samples and compute top_n ranking data, save to class before ranking.
        """
        if not overwrite:  # not allow to overwrite last results
            assert not self._res
        self._res = {}
        # cost some memory to store, maybe not a good idea to be bigger than that
        assert sample_n < 1000
        # draw n samples
        random.seed(random_state)
        for s_cat in random.sample(self.sent_map.keys(), sample_n):
            self._res[self.sent_map[s_cat]] = self._query_sent(s_cat)
        print('FINISH COMPUTE...')

    def _query_sent(self, sent_cat):
        """Query most simular sentences to the sentence category by learned matrix multiplication model.

        Params:
            sent_cat: sent cateogry id.
            model: learned implicit MM model.
            top: limites to n top rank of each sentence (more than the rank will be ignored).
            bounds: list of sentence id, boudary of candidates to be considered (other sentences will be ignored).

        Return:
            Array consists of three elements:
                numpy array of sentence ids in acending ranking order (or conf order, the same).
                numpy array of ranks (to sent ids).
                numpy array of confidence value (to sent ids).
        """
        _sims = self.model.similar_users(sent_cat, N=self.top_n)
#         _scale = None
        _rank = 0
        ranks = []
        confs = []
        sents = []
        for scat, conf in _sims:
            _rank += 1
            if scat == sent_cat:  # itself
                continue
            ranks.append(_rank)
            confs.append(conf)
            sents.append(self.sent_map[scat])
#             if fid in bounds:
#                 res.append(sent_cat, conf, bounds[fid], _rank,))
        return [np.array(sents), np.array(ranks), np.array(confs)]

    def _retrive_dialog(self, sid):
#         print(type(self.dialog_df.iloc[sid].dia1))
#         print(sid)
        return [self.dialog_df.loc[sid].dia1, self.dialog_df.loc[sid].dia2]

    def inspect_by_rank(self, smaller=None, random_state=82628):
#         # NOTE: sort_on in this case has SAME order of results for mean and sum, due to fixed size on rank.
#         assert sort_on in ALLOW_SORTING
        if smaller is None:
            smaller = self.top_n  # by default using biggest range.
        assert smaller >= self.top_n
        # -- first compute sorting order --
        _sort_res = {}  # key: sent id. Value: sum.
        _order = []  # list of sorted
        for sent_id, payload in self._res.items():
            _sort_res[sent_id] = payload[2][:smaller].mean()
        for sid in sorted(_sort_res, key=_sort_res.get, reverse=True):
            _order.append(sid)
        # -- report sentence on ranges --
        random.seed(random_state)
        for o, sid in enumerate(_order):
            # summary
            print()
            print('----------SENT {} @{}---------'.format(sid, o))
            print('TEXT: |{}|'.format(self._retrive_dialog(sid)))
            print('MEAN CONF TOP{}: {}'.format(self.top_n, _sort_res[sid]))
            # ranges
            _last_range = 0
            payload = self._res[sid]
            _confs = payload[2]
            _ranks = payload[1]
            _sents = payload[0]
            for ir, r in enumerate(self.RANKS):
                if r > self.top_n:
                    break
                range_confs = _confs[_last_range:r]
                print()
                if not len(range_confs):
                    print('-- {} range D{} {}-{} is empty'.format(sid, ir, _last_range, r))
                    continue
                print('-- {} range D{} {}-{}, mean conf: {}--'.format(sid, ir, _last_range, r, range_confs.mean()))
                # random sample one and print out.
                sampled_index = random.sample(list(range(_last_range, r)), 1)[0]
                print('rank {}, conf: {}'.format(_ranks[sampled_index], _confs[sampled_index]))
                print('sent {}: |{}|'.format(_sents[sampled_index], self._retrive_dialog(_sents[sampled_index])))
                _last_range = r

## Variation V1: Keep ALL

For all above data. keep all variation.

Full filtering manifest:

For DEP (syntactic dependency):
* No punct dependency.
* No ROOT.

For POS (senmatic token): Keep All.

Conclusions:
* If accepatable: Yes but not ideal.
* top 30 is accepable, and more than 0.6 conf.

Hypothesis of conclusions:
* either too much (complex grammer struct) or too little (low frequency head got removed) sythatic head would not lead to accurate prediction of likelyness.

In [13]:
my_model, SENT_CATS, FEATURE_CATS = get_train(data, report_test_acc=False)

<142447x929612 sparse matrix of type '<class 'numpy.float64'>'
	with 13530074 stored elements in COOrdinate format>
--TRAIN: train on complete matrix --


100%|██████████| 200.0/200 [06:46<00:00,  2.23s/it]


In [14]:
SENT_MAP, INV_SENT_MAP = build_maps(data)

929612 929612


In [15]:
mi = ModelInspection(my_model, SENT_MAP, INV_SENT_MAP, dialog_df=dialogs, top_n=1000)
mi.compute()
mi.inspect_by_rank()

FINISH COMPUTE...

----------SENT 291092 @0---------
TEXT: |["Agent Booth? I'm, uh, Jimmy Walpert. I'm sorry that I wasn't here, but I-I got a bit sick.", "Well, that's no surprise there."]|
MEAN CONF TOP1000: 1.3065152168273926

-- 291092 range D0 0-1, mean conf: 1.5529706478118896--
rank 2, conf: 1.5529706478118896
sent 332133: |["No, no, it's just that it's getting late...", "Hey, hey, it's fine. It's totally fine. We've got plenty of margaritas. It's all good."]|

-- 291092 range D1 1-10, mean conf: 1.4870613813400269--
rank 5, conf: 1.4965108633041382
sent 519378: |['Well, why are you surprised? ... now that we got ourselves a vampire.', "Just because he's a vampire doesn't mean he's a murderer!"]|

-- 291092 range D2 10-20, mean conf: 1.4501526355743408--
rank 21, conf: 1.4402787685394287
sent 980941: |["Ohh. No, it's just that I'm really tired and\x97", "Yeah, cool. That's\x97tired."]|

-- 291092 range D3 20-30, mean conf: 1.4315941333770752--
rank 31, conf: 1.4233970642089844
s

In [16]:
# Continue from here only if you want to see other variations.
# Depend on your data distribution, other variations may be more benefitial.

# raise Exception('End of research model.')

## Variation V2: Remove Stop words

Same as V1 except remove stop words, including tokens and heads.

Full manifest:

For DEP (syntactic dependency):
* No punct dependency.
* No ROOT.
* No stop words.

For POS (senmatic token):
* No stop words.

Conclusion:
Not accepable.

In [17]:
data_v2 = data.loc[data.is_stop==False]
data_v2

Unnamed: 0,sent_id,show_id,char_name,char_id,head_info,head_pos,token,token_par,position,is_stop,head_text
4,508646,OnceUponATime,CINDERELLA,l19407,POS,JJ,entire,entire,77,False,JJ|entire|entire
6,146223,VeronicaMars,VERONICA,l30591,DEP,compound,drug,testing,5,False,compound|drug|testing
7,19636,DoctorWho,ELEVENTH DOCTOR,l8661,POS,.,.,.,66,False,.|.|.
12,672740,Alias,JACK,l1275,POS,UH,um,um,9,False,UH|um|um
14,537460,StarTrek,JANEWAY,l25466,POS,NN,deflector,deflector,61,False,NN|deflector|deflector
18,662463,StarTrek,PHLOX,l25490,DEP,nsubj,condition,be,-16,False,nsubj|condition|be
21,238693,TheSimpsons,C. Montgomery Burns,a15869,POS,NN,blood,blood,63,False,NN|blood|blood
22,606411,StarTrek,WORF,l25435,POS,",",",",",",72,False,",|,|,"
24,751240,GilmoreGirls,RICHARD,l11615,POS,JJ,ready,ready,19,False,JJ|ready|ready
32,971419,DawsonsCreek,DAWSON,l7946,DEP,npadvmod,PERSON,know,15,False,npadvmod|PERSON|know


In [18]:
model_v2, SENT_CATS_v2, FEATURE_CATS_v2 = get_train(data_v2, report_test_acc=False)
SENT_MAP_v2, INV_SENT_MAP_v2 = build_maps(data_v2)

<83039x928629 sparse matrix of type '<class 'numpy.float64'>'
	with 5310467 stored elements in COOrdinate format>
--TRAIN: train on complete matrix --


100%|██████████| 200.0/200 [03:10<00:00,  1.02s/it]


928629 928629


In [19]:
mi = ModelInspection(model_v2, SENT_MAP_v2, INV_SENT_MAP_v2, dialog_df=dialogs)
mi.compute()
mi.inspect_by_rank()

FINISH COMPUTE...

----------SENT 578570 @0---------
TEXT: |["Granted. I'm Matthew Ryan, first officer.", "Jonathan Archer, Captain of the Enterprise. Sorry we couldn't meet under better circumstances. What happened here?"]|
MEAN CONF TOP100: 1.1057536602020264

-- 578570 range D0 0-1, mean conf: 1.1178059577941895--
rank 2, conf: 1.1178059577941895
sent 643771: |['This is Lieutenant Lyra, my next in command.', 'Lyra? Wow, what a beautiful name. Did you know that in Greek, Lyra means harp? And nobody plays the harp better than the Harper. I have never seen anyone so']|

-- 578570 range D1 1-10, mean conf: 1.1101062297821045--
rank 5, conf: 1.1106817722320557
sent 408947: |["Hotel security came up on a noise complaint. Found two DBs. Room's registered to a uniform officer, Manny Senteno.", 'One of ours?']|

-- 578570 range D2 10-20, mean conf: 1.1074599027633667--
rank 21, conf: 1.1068904399871826
sent 675571: |["Oh, thank goodness you're still open! I'm Marie Robinson, this is my assoc

## Variation V3: Only accept nouns and verbs as token.

The variation is based on v2 modification, and only keep nouns and verbs as token.

Full manifest:

For DEP (syntactic dependency):
* No punct dependency.
* No ROOT.

For POS (senmatic token):

* Only keep nouns and verbs, remove all others.
    * Noun keeps:
        * NN: Noun, singular or mass
        * NNS: Noun, plural
        * NNP: Proper noun, singular
        * NNPS: Proper noun, plural
    * Verb keeps:
        * VB: Verb, base form
        * VBD: Verb, past tense
        * VBG: Verb, gerund or present participle
        * VBN: Verb, past participle
        * VBP: Verb, non-3rd person singular present
        * VBZ: Verb, 3rd person singular present 
        
Ref to Peen Treebank: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html

Conclusion:
Reject or accept.

In [20]:
_allow_pos = set([
    'NN', 'NNS', 'NNP', 'NNPS',
    'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ',
])

# if POS, have to be in allowed. If not POS, add it anyway.
data_v3 = data.loc[(data.head_info!='POS') | (data.head_pos.isin(_allow_pos))]
data_v3

Unnamed: 0,sent_id,show_id,char_name,char_id,head_info,head_pos,token,token_par,position,is_stop,head_text
0,1063785,QueerAsFolk,MICHAEL,l22346,POS,VBZ,be,be,2,True,VBZ|be|be
1,233952,TheSimpsons,Lisa Simpson,a15823,DEP,det,a,girl,8,True,det|a|girl
5,78453,DoctorWho,BARBARA,l8667,DEP,advcl,be,hold,34,True,advcl|be|hold
6,146223,VeronicaMars,VERONICA,l30591,DEP,compound,drug,testing,5,False,compound|drug|testing
10,1059921,Charmed1998,ANDY,l6497,DEP,ccomp,be,think,-18,True,ccomp|be|think
13,144292,VeronicaMars,VERONICA,l30591,POS,VBZ,do,do,0,True,VBZ|do|do
14,537460,StarTrek,JANEWAY,l25466,POS,NN,deflector,deflector,61,False,NN|deflector|deflector
15,923983,SonsOfAnarchy,JAX,l24810,POS,VBZ,be,be,26,True,VBZ|be|be
18,662463,StarTrek,PHLOX,l25490,DEP,nsubj,condition,be,-16,False,nsubj|condition|be
19,642832,StarTrek,HARPER,l1893,DEP,prep,beyond,be,24,True,prep|beyond|be


In [21]:
model_v3, SENT_CATS_v3, FEATURE_CATS_v3 = get_train(data_v3, report_test_acc=False)
SENT_MAP_v3, INV_SENT_MAP_v3 = build_maps(data_v3)

<137129x924111 sparse matrix of type '<class 'numpy.float64'>'
	with 7917166 stored elements in COOrdinate format>
--TRAIN: train on complete matrix --


100%|██████████| 200.0/200 [03:34<00:00,  1.14s/it]


924111 924111


In [22]:
mi = ModelInspection(model_v3, SENT_MAP_v3, INV_SENT_MAP_v3, dialog_df=dialogs)
mi.compute()
mi.inspect_by_rank()

FINISH COMPUTE...

----------SENT 475992 @0---------
TEXT: |["Don't be a dork! It's what you love to do. If you don't want to, then we'll work together every day, and I'll love it, but if you do, as much as I'll miss spending my days with you, I'll love spending my nights with you.", 'I love you, Millicent Huxtable.']|
MEAN CONF TOP100: 1.352049708366394

-- 475992 range D0 0-1, mean conf: 1.4974424839019775--
rank 2, conf: 1.4974424839019775
sent 216537: |["Marge, I hate to interrupt the ol' solilo-diddly-iloquy, but I just want to thank you for your early Christmas card.", "Christmas card? I haven't even taken the photo yet!"]|

-- 475992 range D1 1-10, mean conf: 1.4503693580627441--
rank 5, conf: 1.4632606506347656
sent 949769: |['I love you, Julie, and I want tobe there for you, and our son.', "She's pregnant with Frank's baby?"]|

-- 475992 range D2 10-20, mean conf: 1.4009217023849487--
rank 21, conf: 1.3864495754241943
sent 975386: |['I suspected as much.', "And he doesn't trus

## Variation V4: Dep only

Same as all versions except only keep dep.

Full manifest:

For DEP (syntactic dependency):
* No punct dependency.
* No ROOT.

For POS (senmatic token): NO POS.

Decision: reject.

In [23]:
data_v4 = data.loc[data.head_info!='POS']
data_v4

Unnamed: 0,sent_id,show_id,char_name,char_id,head_info,head_pos,token,token_par,position,is_stop,head_text
1,233952,TheSimpsons,Lisa Simpson,a15823,DEP,det,a,girl,8,True,det|a|girl
5,78453,DoctorWho,BARBARA,l8667,DEP,advcl,be,hold,34,True,advcl|be|hold
6,146223,VeronicaMars,VERONICA,l30591,DEP,compound,drug,testing,5,False,compound|drug|testing
10,1059921,Charmed1998,ANDY,l6497,DEP,ccomp,be,think,-18,True,ccomp|be|think
18,662463,StarTrek,PHLOX,l25490,DEP,nsubj,condition,be,-16,False,nsubj|condition|be
19,642832,StarTrek,HARPER,l1893,DEP,prep,beyond,be,24,True,prep|beyond|be
23,526675,GreysAnatomy,DEREK,l12263,DEP,mark,if,be,5,True,mark|if|be
27,66679,DoctorWho,GWEN,l9078,DEP,det,no,weapon,9,True,det|no|weapon
31,580866,StarTrek,ARCHER,l25486,DEP,mark,if,agree,5,True,mark|if|agree
32,971419,DawsonsCreek,DAWSON,l7946,DEP,npadvmod,PERSON,know,15,False,npadvmod|PERSON|know


In [24]:
model_v4, SENT_CATS_v4, FEATURE_CATS_v4 = get_train(data_v4, report_test_acc=False)
SENT_MAP_v4, INV_SENT_MAP_v4 = build_maps(data_v4)

<116443x906631 sparse matrix of type '<class 'numpy.float64'>'
	with 5039860 stored elements in COOrdinate format>
--TRAIN: train on complete matrix --


100%|██████████| 200.0/200 [02:01<00:00,  1.58it/s]


906631 906631


In [25]:
mi = ModelInspection(model_v4, SENT_MAP_v4, INV_SENT_MAP_v4, dialog_df=dialogs)
mi.compute()
mi.inspect_by_rank()

FINISH COMPUTE...

----------SENT 773306 @0---------
TEXT: |["And with government agents after us, we had better get our asses in gear, don't you think?", 'Agents?']|
MEAN CONF TOP100: 1.1626336574554443

-- 773306 range D0 0-1, mean conf: 1.1730817556381226--
rank 2, conf: 1.1730817556381226
sent 220583: |["I deserved that. Look, I know we let you down, but me and the boys, we still think you've got a big future in racketeering and extortion.", "Sorry Fat Tony. I used to think your gang was cool, but now I learned that crime doesn't pay."]|

-- 773306 range D1 1-10, mean conf: 1.170893669128418--
rank 5, conf: 1.1716468334197998
sent 957615: |['you got great taste in movies, I think your smart, I think your interesting', 'really']|

-- 773306 range D2 10-20, mean conf: 1.1670458316802979--
rank 21, conf: 1.166184663772583
sent 38894: |["All right, carry on, fine. How far do you think you'll get without\nthis? (the fluid link)", 'Give that to me.']|

-- 773306 range D3 20-30, mean conf

## Variation V5: Keep and only keep all tokens.

Similar to V4 except for POS.

Full manifest:

For DEP (syntactic dependency): NO DEP.

For POS (senmatic token).

Conclusion: Reject

In [26]:
data_v5 = data.loc[data.head_info=='POS']
data_v5

Unnamed: 0,sent_id,show_id,char_name,char_id,head_info,head_pos,token,token_par,position,is_stop,head_text
0,1063785,QueerAsFolk,MICHAEL,l22346,POS,VBZ,be,be,2,True,VBZ|be|be
4,508646,OnceUponATime,CINDERELLA,l19407,POS,JJ,entire,entire,77,False,JJ|entire|entire
7,19636,DoctorWho,ELEVENTH DOCTOR,l8661,POS,.,.,.,66,False,.|.|.
11,504657,OnceUponATime,MARY MARGARET,l19393,POS,PRP$,her,her,45,True,PRP$|her|her
12,672740,Alias,JACK,l1275,POS,UH,um,um,9,False,UH|um|um
13,144292,VeronicaMars,VERONICA,l30591,POS,VBZ,do,do,0,True,VBZ|do|do
14,537460,StarTrek,JANEWAY,l25466,POS,NN,deflector,deflector,61,False,NN|deflector|deflector
15,923983,SonsOfAnarchy,JAX,l24810,POS,VBZ,be,be,26,True,VBZ|be|be
21,238693,TheSimpsons,C. Montgomery Burns,a15869,POS,NN,blood,blood,63,False,NN|blood|blood
22,606411,StarTrek,WORF,l25435,POS,",",",",",",72,False,",|,|,"


In [27]:
model_v5, SENT_CATS_v5, FEATURE_CATS_v5 = get_train(data_v5, report_test_acc=False)
SENT_MAP_v5, INV_SENT_MAP_v5 = build_maps(data_v5)

<26004x929612 sparse matrix of type '<class 'numpy.float64'>'
	with 8490214 stored elements in COOrdinate format>
--TRAIN: train on complete matrix --


100%|██████████| 200.0/200 [05:23<00:00,  1.81s/it]


929612 929612


In [28]:
mi = ModelInspection(model_v5, SENT_MAP_v5, INV_SENT_MAP_v5, dialog_df=dialogs)
mi.compute()
mi.inspect_by_rank()

FINISH COMPUTE...

----------SENT 43012 @0---------
TEXT: |['You are welcome, welcome indeed. Though we know you only as a record in our charts of space and time, yet you seem to us like an old friend.', "Well, that's very kind of you to make me feel so welcome."]|
MEAN CONF TOP100: 2.520731210708618

-- 43012 range D0 0-1, mean conf: 2.6744749546051025--
rank 2, conf: 2.6744749546051025
sent 633894: |['I predict that when these findings are made public, they will become a new model for warp field operation.', 'Sounds interesting.']|

-- 43012 range D1 1-10, mean conf: 2.6253905296325684--
rank 5, conf: 2.631696939468384
sent 769941: |["Taylor, everyone, there will be millions of questions, some of them even legitimate, but the bottom line is, you know me. I've been apart of this town for. . .well, look how big my daughter is - for that long. And opening this inn has been a dream of mine and of Sookie's for most of that time. Along with Michel, we plan to make this community as proud o

## Variation V6: Keep and only keep tokens on nouns and verbs.

Full manifest:

For DEP (syntactic dependency): NO DEP.

For POS (senmatic token):
* Only keep nouns and verbs, remove all others.

REJECT.

In [29]:
data_v6 = data.loc[(data.head_info=='POS') & (data.head_pos.isin(_allow_pos))]
data_v6

Unnamed: 0,sent_id,show_id,char_name,char_id,head_info,head_pos,token,token_par,position,is_stop,head_text
0,1063785,QueerAsFolk,MICHAEL,l22346,POS,VBZ,be,be,2,True,VBZ|be|be
13,144292,VeronicaMars,VERONICA,l30591,POS,VBZ,do,do,0,True,VBZ|do|do
14,537460,StarTrek,JANEWAY,l25466,POS,NN,deflector,deflector,61,False,NN|deflector|deflector
15,923983,SonsOfAnarchy,JAX,l24810,POS,VBZ,be,be,26,True,VBZ|be|be
21,238693,TheSimpsons,C. Montgomery Burns,a15869,POS,NN,blood,blood,63,False,NN|blood|blood
33,866034,HowIMetYourMother,ROBIN,l13772,POS,NN,info,info,63,False,NN|info|info
45,189666,TheSimpsons,Homer Simpson,a15835,POS,NNP,PERSON,PERSON,5,False,NNP|PERSON|PERSON
46,608478,StarTrek,SPOCK,l25411,POS,VB,get,get,30,True,VB|get|get
58,460395,OneTreeHill,LUCAS,l19593,POS,NN,mom,mom,27,False,NN|mom|mom
59,357634,Friends,MONICA,l10692,POS,VBG,play,play,110,False,VBG|play|play


In [30]:
model_v6, SENT_CATS_v6, FEATURE_CATS_v6 = get_train(data_v6, report_test_acc=False)
SENT_MAP_v6, INV_SENT_MAP_v6 = build_maps(data_v6)

<20686x904907 sparse matrix of type '<class 'numpy.float64'>'
	with 2877306 stored elements in COOrdinate format>
--TRAIN: train on complete matrix --


100%|██████████| 200.0/200 [01:37<00:00,  1.99it/s]


904907 904907


In [31]:
mi = ModelInspection(model_v6, SENT_MAP_v6, INV_SENT_MAP_v6, dialog_df=dialogs)
mi.compute()
mi.inspect_by_rank()

FINISH COMPUTE...

----------SENT 519664 @0---------
TEXT: |["Sookie. All of the things that you need to be protected from, all of the trouble you're in, you're in because of me. So you needing to be protected has nothing to de with you or who you are. All of it is my fault. So why don't you just go to sleep and let me be the one to worry about it?", "Bill, all the trouble I'm in, it's mine. I chose it. I chose it when I chose you."]|
MEAN CONF TOP100: 1.3187565803527832

-- 519664 range D0 0-1, mean conf: 1.3921537399291992--
rank 2, conf: 1.3921537399291992
sent 803056: |['Ah. Well, please think about it. And call me. We will do whatever we can to get you to stay.', 'Okay.']|

-- 519664 range D1 1-10, mean conf: 1.3719496726989746--
rank 5, conf: 1.3747535943984985
sent 878413: |["Let's not worry about her anymore.", "I have to sign off. I'll talk to you later."]|

-- 519664 range D2 10-20, mean conf: 1.350116491317749--
rank 21, conf: 1.3364603519439697
sent 52599: |["Don't go to sl

## V7: V like this.

Manufest:
* DEP: all.
* POS: all noun forms only.

Decent but less than V1?

Idea: random draw from top 10 for context correct candidate. 
Reason: to avoid the risk of getting same ranked 1 sentence for various dialog 1s. Protential risk including model give weights too much to centain sentences features which may casuse bias in training set. 

In [32]:
data_v7 = data.loc[(data.head_info=='DEP') | (data.head_pos.isin(set(['NN', 'NNS', 'NNP', 'NNPS',])))]
data_v7

Unnamed: 0,sent_id,show_id,char_name,char_id,head_info,head_pos,token,token_par,position,is_stop,head_text
1,233952,TheSimpsons,Lisa Simpson,a15823,DEP,det,a,girl,8,True,det|a|girl
5,78453,DoctorWho,BARBARA,l8667,DEP,advcl,be,hold,34,True,advcl|be|hold
6,146223,VeronicaMars,VERONICA,l30591,DEP,compound,drug,testing,5,False,compound|drug|testing
10,1059921,Charmed1998,ANDY,l6497,DEP,ccomp,be,think,-18,True,ccomp|be|think
14,537460,StarTrek,JANEWAY,l25466,POS,NN,deflector,deflector,61,False,NN|deflector|deflector
18,662463,StarTrek,PHLOX,l25490,DEP,nsubj,condition,be,-16,False,nsubj|condition|be
19,642832,StarTrek,HARPER,l1893,DEP,prep,beyond,be,24,True,prep|beyond|be
21,238693,TheSimpsons,C. Montgomery Burns,a15869,POS,NN,blood,blood,63,False,NN|blood|blood
23,526675,GreysAnatomy,DEREK,l12263,DEP,mark,if,be,5,True,mark|if|be
27,66679,DoctorWho,GWEN,l9078,DEP,det,no,weapon,9,True,det|no|weapon


In [33]:
model_v7, SENT_CATS_v7, FEATURE_CATS_v7 = get_train(data_v7, report_test_acc=False)
SENT_MAP_v7, INV_SENT_MAP_v7 = build_maps(data_v7)

<129816x921574 sparse matrix of type '<class 'numpy.float64'>'
	with 6259648 stored elements in COOrdinate format>
--TRAIN: train on complete matrix --


100%|██████████| 200.0/200 [02:24<00:00,  1.29it/s]


921574 921574


In [34]:
mi = ModelInspection(model_v7, SENT_MAP_v7, INV_SENT_MAP_v7, dialog_df=dialogs)
mi.compute()
mi.inspect_by_rank()

FINISH COMPUTE...

----------SENT 885975 @0---------
TEXT: |['And it had nothing to do with feeling?', "Not beyond the feeling I possess for any of God's needy creatures... a hungry child..."]|
MEAN CONF TOP100: 1.0700825452804565

-- 885975 range D0 0-1, mean conf: 1.1104716062545776--
rank 2, conf: 1.1104716062545776
sent 2132: |['Yes, but it has nothing to do with clinical pathology.', 'Does that crystalline lattice remind you of anything?']|

-- 885975 range D1 1-10, mean conf: 1.099260687828064--
rank 5, conf: 1.1025006771087646
sent 769898: |['But this has nothing to do with -', "You've always been the head pilgrim girl at the food drive table."]|

-- 885975 range D2 10-20, mean conf: 1.08853018283844--
rank 21, conf: 1.0861375331878662
sent 576130: |['I know nothing of this. Send for someone learned in witchcraft and let him examine him. I will have nothing to do with it.', 'My lord, at least arrange for me to meet Mister Atoz.']|

-- 885975 range D3 20-30, mean conf: 1.07910084

## V8: 

Manifest:
* DEP: everything except det.
* POS: all nouns.

In [35]:
data_v8 = data.loc[((data.head_info=='DEP') & (data.head_pos!='det')) | (data.head_pos.isin(set(['NN', 'NNS', 'NNP', 'NNPS',])))]
data_v8

Unnamed: 0,sent_id,show_id,char_name,char_id,head_info,head_pos,token,token_par,position,is_stop,head_text
5,78453,DoctorWho,BARBARA,l8667,DEP,advcl,be,hold,34,True,advcl|be|hold
6,146223,VeronicaMars,VERONICA,l30591,DEP,compound,drug,testing,5,False,compound|drug|testing
10,1059921,Charmed1998,ANDY,l6497,DEP,ccomp,be,think,-18,True,ccomp|be|think
14,537460,StarTrek,JANEWAY,l25466,POS,NN,deflector,deflector,61,False,NN|deflector|deflector
18,662463,StarTrek,PHLOX,l25490,DEP,nsubj,condition,be,-16,False,nsubj|condition|be
19,642832,StarTrek,HARPER,l1893,DEP,prep,beyond,be,24,True,prep|beyond|be
21,238693,TheSimpsons,C. Montgomery Burns,a15869,POS,NN,blood,blood,63,False,NN|blood|blood
23,526675,GreysAnatomy,DEREK,l12263,DEP,mark,if,be,5,True,mark|if|be
31,580866,StarTrek,ARCHER,l25486,DEP,mark,if,agree,5,True,mark|if|agree
32,971419,DawsonsCreek,DAWSON,l7946,DEP,npadvmod,PERSON,know,15,False,npadvmod|PERSON|know


In [36]:
model_v8, SENT_CATS_v8, FEATURE_CATS_v8 = get_train(data_v8, report_test_acc=False)
SENT_MAP_v8, INV_SENT_MAP_v8 = build_maps(data_v8)

<118938x921300 sparse matrix of type '<class 'numpy.float64'>'
	with 5848314 stored elements in COOrdinate format>
--TRAIN: train on complete matrix --


100%|██████████| 200.0/200 [02:25<00:00,  1.34it/s]


921300 921300


In [37]:
mi = ModelInspection(model_v8, SENT_MAP_v8, INV_SENT_MAP_v8, dialog_df=dialogs)
mi.compute()
mi.inspect_by_rank()

FINISH COMPUTE...

----------SENT 754017 @0---------
TEXT: |['I guess. You know, when I was with Dean, I always knew that no matter what happened, he would be there.', 'Dean was very dependable.']|
MEAN CONF TOP100: 1.0681836605072021

-- 754017 range D0 0-1, mean conf: 1.1625285148620605--
rank 2, conf: 1.1625285148620605
sent 395604: |["Not one word since high school. Listen, when we found that torso in the hole, on the crest of that hill ... I recognized the kid's shirt, and I knew he'd been with Leland, so ...", 'So you assumed your brother was capable of murder.']|

-- 754017 range D1 1-10, mean conf: 1.1162421703338623--
rank 5, conf: 1.1247273683547974
sent 996615: |["parents had two constant arguments, while ah, they were driving, over either how fast my father was going, or how much gas was left in the tank.\xa0 My father had a standard defense for either one of these, it was always, that's because you're looking at it from an angle.\xa0 If you were over here -- it looks from 

## V9: Noun, verbs, and clean DEP

DEP: all DEP but det.
POS: verbs and nouns.

In [38]:
data_v9 = data.loc[((data.head_info=='DEP') & (data.head_pos!='det')) | (data.head_pos.isin(_allow_pos))]
data_v9

Unnamed: 0,sent_id,show_id,char_name,char_id,head_info,head_pos,token,token_par,position,is_stop,head_text
0,1063785,QueerAsFolk,MICHAEL,l22346,POS,VBZ,be,be,2,True,VBZ|be|be
5,78453,DoctorWho,BARBARA,l8667,DEP,advcl,be,hold,34,True,advcl|be|hold
6,146223,VeronicaMars,VERONICA,l30591,DEP,compound,drug,testing,5,False,compound|drug|testing
10,1059921,Charmed1998,ANDY,l6497,DEP,ccomp,be,think,-18,True,ccomp|be|think
13,144292,VeronicaMars,VERONICA,l30591,POS,VBZ,do,do,0,True,VBZ|do|do
14,537460,StarTrek,JANEWAY,l25466,POS,NN,deflector,deflector,61,False,NN|deflector|deflector
15,923983,SonsOfAnarchy,JAX,l24810,POS,VBZ,be,be,26,True,VBZ|be|be
18,662463,StarTrek,PHLOX,l25490,DEP,nsubj,condition,be,-16,False,nsubj|condition|be
19,642832,StarTrek,HARPER,l1893,DEP,prep,beyond,be,24,True,prep|beyond|be
21,238693,TheSimpsons,C. Montgomery Burns,a15869,POS,NN,blood,blood,63,False,NN|blood|blood


In [39]:
model_v9, SENT_CATS_v9, FEATURE_CATS_v9 = get_train(data_v9, report_test_acc=False)
SENT_MAP_v9, INV_SENT_MAP_v9 = build_maps(data_v9)

<126251x923918 sparse matrix of type '<class 'numpy.float64'>'
	with 7505832 stored elements in COOrdinate format>
--TRAIN: train on complete matrix --


100%|██████████| 200.0/200 [03:02<00:00,  1.04it/s]


923918 923918


In [40]:
mi = ModelInspection(model_v9, SENT_MAP_v9, INV_SENT_MAP_v9, dialog_df=dialogs)
mi.compute()
mi.inspect_by_rank()

FINISH COMPUTE...

----------SENT 290211 @0---------
TEXT: |['I want to see if I could find a link between the old witch and the new witch.', "All right, you know what? Go for it. It's all yours."]|
MEAN CONF TOP100: 1.0366235971450806

-- 290211 range D0 0-1, mean conf: 1.1804099082946777--
rank 2, conf: 1.1804099082946777
sent 485533: |["I'm so sorry. I wanted to see you as soon as I found out.", "It's okay. Thank you, though."]|

-- 290211 range D1 1-10, mean conf: 1.0964984893798828--
rank 5, conf: 1.0976508855819702
sent 502236: |["Keep looking for as long as you like. But down here, this is my shop. And you won't find anything unless I want you to.", 'Be careful with your threats.']|

-- 290211 range D2 10-20, mean conf: 1.0747586488723755--
rank 21, conf: 1.0695421695709229
sent 303049: |['Well, what I want to do is I want to find a doctor to find him and pump him with enough morphine to stun an elephant.', "Physicians can't do that, it's against the Hippocratic Oath. First, do 

## V10, V1 - det

DEP: keep all except det.

POS: keep all.

In [41]:
data_v10 = data.loc[data.head_pos!='det']
data_v10

Unnamed: 0,sent_id,show_id,char_name,char_id,head_info,head_pos,token,token_par,position,is_stop,head_text
0,1063785,QueerAsFolk,MICHAEL,l22346,POS,VBZ,be,be,2,True,VBZ|be|be
4,508646,OnceUponATime,CINDERELLA,l19407,POS,JJ,entire,entire,77,False,JJ|entire|entire
5,78453,DoctorWho,BARBARA,l8667,DEP,advcl,be,hold,34,True,advcl|be|hold
6,146223,VeronicaMars,VERONICA,l30591,DEP,compound,drug,testing,5,False,compound|drug|testing
7,19636,DoctorWho,ELEVENTH DOCTOR,l8661,POS,.,.,.,66,False,.|.|.
10,1059921,Charmed1998,ANDY,l6497,DEP,ccomp,be,think,-18,True,ccomp|be|think
11,504657,OnceUponATime,MARY MARGARET,l19393,POS,PRP$,her,her,45,True,PRP$|her|her
12,672740,Alias,JACK,l1275,POS,UH,um,um,9,False,UH|um|um
13,144292,VeronicaMars,VERONICA,l30591,POS,VBZ,do,do,0,True,VBZ|do|do
14,537460,StarTrek,JANEWAY,l25466,POS,NN,deflector,deflector,61,False,NN|deflector|deflector


In [42]:
model_v10, SENT_CATS_v10, FEATURE_CATS_v10 = get_train(data_v10, report_test_acc=False)
SENT_MAP_v10, INV_SENT_MAP_v10 = build_maps(data_v10)

<131569x929612 sparse matrix of type '<class 'numpy.float64'>'
	with 13118740 stored elements in COOrdinate format>
--TRAIN: train on complete matrix --


100%|██████████| 200.0/200 [06:44<00:00,  2.21s/it]


929612 929612


In [43]:
mi = ModelInspection(model_v10, SENT_MAP_v10, INV_SENT_MAP_v10, dialog_df=dialogs)
mi.compute()
mi.inspect_by_rank()

FINISH COMPUTE...

----------SENT 928397 @0---------
TEXT: |["You know, I'm with you on Marks, Jax. But I don't know how much longer I'm gonna be able to keep playing double agent.", 'Niners are the only ones that can get us close to Marks so we can get Bobby back.']|
MEAN CONF TOP100: 1.3698421716690063

-- 928397 range D0 0-1, mean conf: 1.8243440389633179--
rank 2, conf: 1.8243440389633179
sent 924606: |["For every 24 hours August has to wait, he's gonna cut off another part of your guy. I mean, I'm with you, but I don't know how much longer I'm gonna be able to keep playing double agent.", 'Jesus.']|

-- 928397 range D1 1-10, mean conf: 1.46809983253479--
rank 5, conf: 1.4859862327575684
sent 506721: |['I’m not gonna be around much longer unless we find where this belongs.', 'I think that belongs to me.']|

-- 928397 range D2 10-20, mean conf: 1.4162633419036865--
rank 21, conf: 1.4000674486160278
sent 944546: |["ok dad first of all, i'm really sorry an now I know your not gonna be

## Utility: Bench same sentences against different models

Same sentence is benched against all models to determine what the model cares about.

The sentences is normally choosed as a hard/simple sentence to capture.

In [47]:
class Bench:
    MODELS = {
        'v1': [my_model, SENT_MAP, INV_SENT_MAP],
        'v2': [model_v2, SENT_MAP_v2, INV_SENT_MAP_v2],
        'v3': [model_v3, SENT_MAP_v3, INV_SENT_MAP_v3],
        'v4': [model_v4, SENT_MAP_v4, INV_SENT_MAP_v4],
        'v5': [model_v5, SENT_MAP_v5, INV_SENT_MAP_v5],
        'v6': [model_v6, SENT_MAP_v6, INV_SENT_MAP_v6],
        'v7': [model_v7, SENT_MAP_v7, INV_SENT_MAP_v7],
        'v8': [model_v8, SENT_MAP_v8, INV_SENT_MAP_v8],
        'v9': [model_v9, SENT_MAP_v9, INV_SENT_MAP_v9],
        'v10': [model_v10, SENT_MAP_v10, INV_SENT_MAP_v10],
    }

    def __init__(self, dialog_df, models=None):
        self.dialog_df = dialog_df
        self.top_n = 100
        if models is None:
            self.models = self.MODELS.keys()
        else:
            self.models = models

    def _retrive_dialog(self, sid):
        return [self.dialog_df.loc[sid].dia1, self.dialog_df.loc[sid].dia2]
    
    def _query_sent(self, sent_cat, model):
        """Query most simular sentences to the sentence category by learned matrix multiplication model.

        Params:
            sent_cat: sent cateogry id.
            model: learned implicit MM model.
            top: limites to n top rank of each sentence (more than the rank will be ignored).
            bounds: list of sentence id, boudary of candidates to be considered (other sentences will be ignored).

        Return:
            Array consists of three elements:
                numpy array of sentence ids in acending ranking order (or conf order, the same).
                numpy array of ranks (to sent ids).
                numpy array of confidence value (to sent ids).
        """
        _sims = model.similar_users(sent_cat, N=self.top_n)
        _rank = 0
        ranks = []
        confs = []
        sents = []
        for sid, conf in _sims:
            _rank += 1
            if sid == sent_cat:  # itself
                continue
            ranks.append(_rank)
            confs.append(conf)
            sents.append(sid)
        return [np.array(sents), np.array(ranks), np.array(confs)]

    def run(self, sent_id):
#         print('TARGET: {}'.format(self._retrive_dialog(sent_id)))
#         for k, m in self.MODELS.items():
        for k in self.models:
            m = self.MODELS[k]
#             if k not in self.models:
#                 continue
            print()
            print('---------MODEL {}---------'.format(k))
            print('TARGET: {}'.format(self._retrive_dialog(sent_id)))
            sent_cat = m[2][sent_id]
            sents, ranks, confs = self._query_sent(sent_cat, m[0])
            print('MEAN CONF: {}'.format(confs.mean()))
            for i, sid in enumerate(sents[:10]):
                print()
                print('id: {}, conf: {}, rank {}'.format(m[1][sid], confs[i], ranks[i]))
                print(self._retrive_dialog(m[1][sid]))

    def sample_run(self):
        random.seed(1231312312)
        for sid in random.sample(INV_SENT_MAP.keys(), 10):
            print('++++++++++++{}++++++++++\n'.format(sid))
            self.run(sid)

In [51]:
bench = Bench(dialogs, models=['v1', 'v9', 'v10'])
bench.run(556622)#456590)#89799)
# bench.sample_run()


---------MODEL v1---------
TARGET: ["You're going to have to destroy it in order to escape.", 'How much longer before we reach them?']
MEAN CONF: 1.005842924118042

id: 317604, conf: 1.103016972541809, rank 2
['“You must be going through a canyon.”', '“Cordelia, can you hear me?”']

id: 415933, conf: 1.0803239345550537, rank 3
["The shooting review board wants to talk to you. You're going to have to explain your tactical decision for shooting in the direction of a fellow officer.", "I'm ready when they are."]

id: 237701, conf: 1.0758733749389648, rank 4
["Seems almost a shame I'm gonna have to bang you against a wall till you die.", "No! Let's take the eagle to Shelbyville Animal Rescue. Where you took that injured hummingbird you found on the front lawn."]

id: 156934, conf: 1.0756711959838867, rank 5
["There's gonna be millions of eyes on you. You're gonna be under a magnifying glass.", 'I have pulled it off before.']

id: 228662, conf: 1.074569821357727, rank 6
['I wrote a whole s