In [1]:
# DATA LOAD
import os
import itertools
import pandas as pd
import numpy as np
import random
import sys
import implicit
import numpy as np
import math
from collections import Counter
import re

from copy import deepcopy
from sklearn import metrics
from scipy.sparse import coo_matrix
from sklearn.model_selection import KFold
from implicit.evaluation import train_test_split, precision_at_k, mean_average_precision_at_k

from pprint import pprint

os.environ["MKL_NUM_THREADS"] = "1"

In [2]:
# -- simple df inspection function --
def inspect_df(df, col1, col2):
    _count = getattr(df, col1).value_counts()
    print('COUNT {}: {}'.format(col1, len(_count)))
    _count = getattr(df, col2).value_counts()
    print('COUNT {}: {}'.format(col2, len(_count)))

## Load data

This procedure is the same as last pipeline.

Data is loaded:
* Dialogs
* Syntactic Heads for dialogs.
* Character HLA data.

In [3]:
# heads data
heads = pd.read_csv('all_dialogs_heads.csv')
heads = heads.loc[(heads.head_pos!='ROOT') & (heads.head_pos!='punct')]
heads = heads.sample(frac=1, random_state=578153).reset_index(drop=True)  # note: all random state from random.org
heads = heads[heads.groupby('head_text')['head_text'].transform('count').ge(5)]

inspect_df(heads, 'sent_id', 'head_text')

heads

COUNT sent_id: 929612
COUNT head_text: 142447


Unnamed: 0,sent_id,show_id,char_name,char_id,head_info,head_pos,token,token_par,position,is_stop,head_text
0,356464,Friends,MONICA,l10692,POS,RB,just,just,112,True,RB|just|just
1,197222,TheSimpsons,Homer Simpson,a15835,POS,",",",",",",33,False,",|,|,"
3,786268,TheOfficeUS,MICHAEL,l19352,POS,",",",",",",14,False,",|,|,"
4,906740,TheBigBangTheory,PENNY,l4391,DEP,nsubj,that,mean,5,True,nsubj|that|mean
5,394956,CSIVerse,GRISSOM,l7484,POS,IN,at,at,88,True,IN|at|at
...,...,...,...,...,...,...,...,...,...,...,...
14823069,492990,OnceUponATime,EMMA,l19391,POS,NN,something,something,16,True,NN|something|something
14823070,158444,Smallville,CLARK,l24611,DEP,nsubj,i,hold,2,True,nsubj|i|hold
14823071,549904,StarTrek,PICARD,l25431,POS,RB,soon,soon,37,False,RB|soon|soon
14823072,659662,StarTrek,REED,l25489,POS,NNS,ship,ship,70,False,NNS|ship|ship


In [4]:
# --dialogs data need to be loaded --

# Note: dialog can NOT be random swapped! Index matters.
dialogs = pd.read_csv('all_dialogs.csv')

# exclude any characters that has less than 600 lines of dialogs
_exclude = set()
for cid, count in dialogs.char_id.value_counts().iteritems():
    if count < 600:
        _exclude.add(cid)
print('excluded insufficent characters:')
print(_exclude)
dialogs = dialogs.loc[~dialogs.char_id.isin(_exclude)]

# remove any dialogs that are empty
_old_len = len(dialogs)
dialogs = dialogs.loc[~pd.isnull(dialogs.dia1)]
dialogs = dialogs.loc[~pd.isnull(dialogs.dia2)]
print('cleaned: {}/{}'.format(len(dialogs), _old_len))

# remove l20098
# Note: the character Rebekah Mikaelson (l20098) exists in both TheVampireDiaries and TheOriginals
# to avoid possible break the testing/training isolation, we remove this character.
dialogs = dialogs.loc[dialogs.char_id!='l20098']

dialogs

excluded insufficent characters:
{'l11630', 'l23597', 'l28488', 'l19605', 'l19253', 'l19401', 'l7505', 'l19407', 'a13181', 'l24814', 'l12270', 'l20102', 'l28484', 'l30596', 'l19358', 'l18793', 'l1282', 'l19247', 'l23053', 'l7496', 'l5165', 'a15837', 'l23610', 'l19613', 'l19405', 'a15831', 'l15696', 'l24816', 'l30608', 'l19251', 'l23234', 'l22351', 'a3411', 'l15692', 'l30594', 'l19615', 'a3412', 'a15914', 'l25974', 'l24818', 'l24820', 'l11652', 'l26136', 'l24877', 'l19599', 'l19400', 'l19245', 'l24825', 'l1279', 'l1287', 'a15885', 'l27749', 'l23235', 'l27757', 'l26150', 'l20100', 'a15827', 'l15689', 'l7497', 'l24636', 'l5838', 'l24621', 'a3413', 'l6497', 'l24946', 'l20103', 'l23052', 'l23566', 'l23236', 'l27751', 'l19399'}
cleaned: 1043868/1043922


Unnamed: 0,show_id,char_name,char_id,dia1,dia2
0,DoctorWho,FOURTH DOCTOR,l8653,Get me the medical officer. Lieutenant Sulliva...,Human history.
1,DoctorWho,FOURTH DOCTOR,l8653,It's something that happened when we first met.,"I tell you, Brigadier, there's nothing to worr..."
2,DoctorWho,FOURTH DOCTOR,l8653,"This the patient, sir?",And stupid. If the square on the hypotenuse eq...
3,DoctorWho,FOURTH DOCTOR,l8653,"There you are. Now come along, Doctor, you're ...",Am I? Don't you mean the infirmary?
4,DoctorWho,FOURTH DOCTOR,l8653,"No, I do not mean the infirmary. I mean the si...",Not fit? I'm the Doctor.
...,...,...,...,...,...
1072564,QueerAsFolk,DEBBIE,l22350,"Don't worry, I've already have things worked o...",A place of your own? At your age?
1072565,QueerAsFolk,DEBBIE,l22350,Your never a kid.,"I know, I've always be kid to me."
1072566,QueerAsFolk,DEBBIE,l22350,Right.,"Yeah, right. Damn-right."
1072567,QueerAsFolk,DEBBIE,l22350,He had other things to do.,What other things are important to this?


In [5]:
# 
_CHAR_BASE_FOLDER = os.path.expanduser('~/Datasets/dodgsons/formal_aaai/')

_live_chars = pd.read_csv(_CHAR_BASE_FOLDER + 'char_features_live.csv')
_animated_chars = pd.read_csv(_CHAR_BASE_FOLDER +'char_features_animated.csv')
chars = _live_chars.append(_animated_chars, ignore_index = True)
# filtering low count features and characters as noise
chars = chars[chars.groupby('char_id')['feature'].transform('count').ge(5)]
chars = chars[chars.groupby('feature')['char_id'].transform('count').ge(5)]
chars = chars.sample(frac=1, random_state=718281).reset_index(drop=True)

inspect_df(chars, 'char_id', 'feature')
chars

COUNT char_id: 45821
COUNT feature: 12815


Unnamed: 0,feature,char_id,work,char_name
0,XtremeKoolLetterz,l41543,StarWars,Xev Xrexus
1,OneHitKill,l3731,Battlebots,Son of Whyachi (#6)
2,TheLancer,l21770,Primeval,Connor Temple
3,BigBrotherInstinct,l18267,ModernFamily,Manny Delgado
4,FinalDeath,a639,Coco,H?ctor
...,...,...,...,...
945514,MissingMom,a4190,Archer,Malory Archer
945515,ButNowIMustGo,l21681,ThePretender,Jarod
945516,EvenTheGuysWantHim,l27524,Survivor,Ken McNickle
945517,HollywoodGenetics,l24084,ShamelessUS,Liam Gallagher


In [6]:
(len(_live_chars.char_id.unique()) + len(_animated_chars.char_id.unique())) /  chars.shape[0]

0.05862917614558777

In [7]:
_animated_chars

Unnamed: 0,feature,char_id,work,char_name
0,ActionDad,a1,OneHundredAndOneDalmatians,Pongo
1,ActionMom,a1,OneHundredAndOneDalmatians,Pongo
2,AffectionateNickname,a1,OneHundredAndOneDalmatians,Pongo
3,BadassAdorable,a1,OneHundredAndOneDalmatians,Pongo
4,BattleCouple,a1,OneHundredAndOneDalmatians,Pongo
...,...,...,...,...
379480,CatsAreMean,a21176,ZipZip,Fluffy and Nugget
379481,SitcomArchNemesis,a21176,ZipZip,Fluffy and Nugget
379482,Tsundere,a21176,ZipZip,Fluffy and Nugget
379483,ValleyGirl,a21176,ZipZip,Fluffy and Nugget


In [8]:
getattr(chars, 'feature').value_counts()

DeadpanSnarker                  5455
Jerkass                         5188
BigBad                          4367
BerserkButton                   4278
JerkWithAHeartOfGold            3974
                                ... 
TheWestern                         5
MilkmanConspiracy                  5
FriendlyFire                       5
CrossDressingVoice                 5
ExpelledFromEveryOtherSchool       5
Name: feature, Length: 12815, dtype: int64

In [9]:
# -- load manual correction data --
manual_correction = pd.read_csv('/home/kits-adm/Workspace/play/dodgson_play/all_hla_cleaned.csv')
cleaned_hlas = manual_correction[(manual_correction.mark==1) & (manual_correction.conditioned==0)]
cleaned_hlas = set(cleaned_hlas.hla.unique().tolist())

print(len(cleaned_hlas))

709


## Utilities for MM calcuation

Utilities for Convert to COO matrix from dataframe.

Utilities for building reference maps back to original category index.

Utilities for training MM models, and perform validation for fraction of the dataset.

Utilities for break into training/testing here for:
* Removing a specific show from training set, and split into testing set.
* Print out info for training / testing set. 
* Perrform full sanity validation for data:
    * prevent training set mapped into testing set.
    * prevent wrong char id passed in.

Manual checking of print out info is required.

In [10]:
# -- UTILITY BLOCK --

# process to coo matrix
def to_coo(df, _obj_col, _feature_col):
    obj_cat = df[_obj_col].astype('category')
    feature_cat = df[_feature_col].astype('category')
#     df[_obj_col] = sent_cat
#     df[_feature_col] = feature_cat

    # create a sparse matrix of all the users/repos
    return (
        coo_matrix((np.ones(df.shape[0]), (feature_cat.cat.codes.copy(), obj_cat.cat.codes.copy()))),
        obj_cat, feature_cat,
    )

# Build refer maps for sentences.
def build_maps(obj_cat):
    obj_map = dict(enumerate(obj_cat.cat.categories))
    # key: sentence id. value: category id of sentence(what the MM model indexing).
    inv_obj_map = {r: i for i, r in obj_map.items()}
    assert len(obj_map) == len(inv_obj_map)
#     print(len(obj_map), len(inv_obj_map))
    return obj_map, inv_obj_map

# Perform training and optional testing.
def get_train(df, col1='sent_id', col2='head_text',
              top_n=200, report_test_acc=True, test_df=None,
              conf=30, factors=30, regularization=0.1, iterations=200, random_state=None):
    """Get training data with optional accuracy reporting.
    
    Params:
        * df: the training dataframe.
            Note: the df is required to be shuffled and only contain training data.
        * col1: object col name in df.
        * col2: feature col name in df.
        * random_state: a random state used in dividing training and testing, if test_df is not pass in.
        * top_n: top n used in evaluate precision.
        * report_test_acc: bool, default true. If false, precision evaluate will not be performed.
            Recommand to enable this option for sanity checks.
        * test_df: a dataframe is explicitly passed in. The column format is expected to be the same as df.
            Default None. If none, testing data is randomly divided. Otherwise use it as testing data.
        * conf: int or float, default 30. Conf value used in MM training.
            Note: We use simple confidence (multiply to coo) for training data.
        * factors: int, default 30. Factor used for MM training.
        * regularization: float, default 0.1. regularzation used in MM training.
        * iterations: int, default 200. Iterations for MM training.
    """

    coo, obj_cat, feature_cat = to_coo(df, col1, col2)
    print(repr(coo))  # for manual sanity check of dem

    # sanity check for accuracy
    if report_test_acc:
        print('--SANITY: Validate accuracy --')
        if test_df is None:
            # seeding is now a job of controller class, only seed if explicit
            if random_state:
                np.random.seed(random_state)
            train_csr, test_csr = train_test_split(coo, train_percentage=0.7)
            train_csr, test_csr
        else:
#             train_csr = to_coo(df, col1, col2)
            assert len(test_df) > 0
            train_csr = coo
            test_csr = to_coo(test_df, col1, col2)
        _als = implicit.als.AlternatingLeastSquares(factors=factors,
                                                    regularization=regularization, iterations=iterations)
        _als.fit(train_csr * conf)
        prec = precision_at_k(_als, train_csr.T, test_csr.T, K=top_n)
        print('OBJ precision at top {}: {:.5f}%'.format(top_n, prec*100))
    
    # train include all data, return the model and categories
    print('--TRAIN: train on complete matrix --')
    _als = implicit.als.AlternatingLeastSquares(factors=factors,
                                                regularization=regularization, iterations=iterations)
    _als.fit(coo * conf)
    obj_map, inv_obj_map = build_maps(obj_cat)
    return _als, obj_map, inv_obj_map

def break_data(dialog_df, head_df, target_char_id, test_shows):
    """Clean dialog and head data with info and strong sanity validation.

    Return:
        * train_dialogs: df for all dialogs data allow to be used in bert training.
        * train_heads: df for all head data that allow to be used in MM training.
        * test_dialogs: df for all testing dialogs data.
            The testing dialog data include all dialogs in the show, exclude the target.
        * target_dialogs: df for all dialogs of the target character.
    """
    # details: the char id should be and only should be in show for both dialog and head data.
    test_shows = set(test_shows)
    test_dialogs = dialog_df.loc[(dialog_df.char_id!=target_char_id) & (dialog_df.show_id.isin(test_shows))]
    target_dialogs = dialog_df.loc[dialog_df.char_id==target_char_id]
    assert len(test_dialogs) > 0  # testing set not empty
    assert len(target_dialogs) > 0  # target set not empty
    target_row = target_dialogs.iloc[0]
    # validate target character is in test shows
    assert target_row.show_id in test_shows
    print('TARGET: {} from {}, total of {} dialogs'.format(target_row.char_name, target_row.show_id, len(target_dialogs)))

    # validate the target character is not in training.
    train_dialogs = dialog_df.loc[~dialog_df.show_id.isin(test_shows)]
    # check target is not in training set.
    assert len(train_dialogs.loc[train_dialogs.char_id==target_char_id]) == 0
    # double check none of test show is not in training.
    for sid in test_shows:
        assert len(train_dialogs.loc[train_dialogs.show_id==sid]) == 0
    # validate no overlap of characters between train and test
    assert len(set(train_dialogs.char_id.unique()).intersection(set(test_dialogs.char_id.unique()))) == 0
    # validate no overlap of shows between train and test
    assert len(set(train_dialogs.show_id.unique()).intersection(set(test_dialogs.show_id.unique()))) == 0
    # validate only one character is in target set, and is our target
    assert len(set(target_dialogs.char_id.unique())) == 1
    assert list(target_dialogs.char_id.unique())[0] == target_char_id
    
    print('Total training dialogs {};'.format(len(train_dialogs)))
    print('Total testing dialogs {};'.format(len(test_dialogs)))
    print('Total target dialogs {};'.format(len(target_dialogs)))

    # filter heads data on train + target dialogs data
    sents = set(train_dialogs.index.values)
    train_heads = head_df.loc[head_df.sent_id.isin(sents)]
    train_heads[train_heads.groupby('head_text')['head_text'].transform('count').ge(5)]
    print('Total training heads: {}, Avg {:.2f} per sentence'.format(
        len(head_df), float(len(head_df)) / len(train_dialogs)))

    # validate on train heads does not overlap with the testing.
    assert len(train_heads.loc[train_heads.char_id==target_char_id]) == 0
    # none of test shows is in training.
    for sid in test_shows:
        assert len(train_heads.loc[train_heads.show_id==sid]) == 0

    return train_dialogs, train_heads, test_dialogs, target_dialogs


# my_model, SENT_MAP, INV_SENT_MAP = get_train(data, report_test_acc=True)

## Utilties for retriving positive and negative character clusters.


### Objectives

Given a target character Ct, retrive a positive cluster of characters that most close to target, and negative cluster that is least close to target. All characters within two clusters have to have dialog data.

The idea is to use positive characters' dialogs for recovering target's talking style, and negative candidates' dialogs to guide agent away from the styles it is not suppose to talk.


### What
A character MM model is first trained to learn corrlations between characters.

Later, the function returns top n and bottom n prec chracters that most and least corrlated to target character.

We perform the algorithm in a fashion simuliar to community algorithms to increase results' robustness: the algorithm looks at two level corrlations to target, and count overlap in order to reduce bias (for details, see following section: Why expanding to neighbours).

Algo details:
* Step 0: train a matrix multiplication (MM) model to learn characters and their corrlations on Human level attributes (HLAs). The model is trained on row-wise HLAs, and column-wise characters. The objectives is to learn which characters are more corrlated to target characters.
* Step 1: Level one ranking. Rank similar users in terms of confidence value against target character based on learned MM model. The top prec_cutoff is a percentage value to control the range. Of all characters fall in the top range of this percentage are considered to be positive corrlated to target. We call all characters being identified here as "level 1 targets".
* Step 3: For each of level 1 target identified in the last step, expand again with their confidence values. This gives a bigger sets of characters for each level one targets.
* Step 4: Counts overlap of positives to neighbours, and rank their frequncy to acquire the list.

Adjustable parameters:
* prec_cutoff: precentage of top characters in consideration.
* level1_limit: top of those neighbours in positive / negative sets are used to expand.
    The value may be none, which consider all candidates in precentage cutoff.
* level2_limit: for each neighbours, its positive neighbours will be used in counting overlap.
* acceptable_overlap (per set): If a character overlap freqency is bigger than this number, it is considered as a positive character. Smaller, it indicates the character is less likely to be in the same community to the target character, which will be considered as "neutral" or "negative".
    * Note: this is a safety measure to ensure quality of data.
    * E.g., if the number is 5 for positive set, means the candidates has to rank high for at least 5 times in level 1 ranking.

In our experiments with some human inspection, the following paramters in general works well. However, human opinion are highly biased. But this is currently the best we can do. What we do know is that, the algorithm tends to provides relatively stable results (the set of positive characters) for same target character, regardless of parameters (as long as they are not too extreme).

Our experiment condition (best bet):
* prec_cutoff: 5(%).
* level1_limit: None.
* level2_limit: 30.
* Acceptable_overlap: 10.

Note: this setting roughly give us around 140 characters as positives and decent amount of negative characters in general, for any characters.

### Why expanding to neighbours

There are multiple good reasons expand to neighbours' positives is a better idea than simple retrieval of top/bottom neighbours.

* The MM model sometimes cares about specific character HLA, or a set of HLAs of the target characters. This is bad because the neighbours are retrived with high bias to the HLA. In a wrost case, e.g., If target Ct correlated to C1 becuase F1, and C2 because F2, count C1 and C2 means higher biases in training data to F1 and F2. Expanding to neighbours and counting overlap helps to minimize this problem by minimizing the risks of considering multiple neighbours to balance it out with their positives. E.g., in last example, The risk of considering F1 as high bias feature is minimized by minimizing probability of F1 in both sets, not only because it uses F2 as balance, but also due to it is less likely to consider F1 again twice, overlapping with C1 neighbours. 
* In some cases (not very often), the top/bottom neighbours are not enough (not very often, however in some cases, like Rachel from Friend does not have enough negative candidates, most likely because she represents a "norm" characters that happens a lot in training set).

## Utilities of perform HLA reductions

### Objectives

Due to the massive size of HLAs, making it hard for machine to model. The idea of HLA reduction is to dramatically reduce the size of overall HLAs to be smaller.

### How

The overall procedure of HLA reduction follows two steps: division and reduction mapping.

Division: Identify and divide all HLAs into two set: "big" and "small" set. The "big" HLAs represent a general "big" concepts that been close to many HLAs. We measure this by frequency of every HLAs corrlations references to any other in their top-n list. Depend on reduction rate (RR), we took the top RR precent of highest frequency as "big" HLAs.

Reduction mapping: once we acquire the big set, we simply loop through every HLAs and attempt to map to its "bigger" counter part:

* If the HLA is in big set, map to itself.
* If the HLA is in small set (not in big set), map to the first big HLA encountered in its MM corrlation list.
* If the HLA is in small set and fail to locate any big HLA, we count as reduction failure. See below for details.

### Parameters and reduction failures

Two parameters in play. We observed top-n as 30 and reduction rate as 0.3 produce decent results.

* top-n: top n most correlated HLAs that will be consider to be close set.
* reduction rate: precentage of HLAs that you want to left with. Notice a important tradeoff: when this value is small, it overall reduce the diffculity of modeling dialogs. However, the concept mapping becomes more "fuzzy", which rely more on MM model's believe of HLAs. In extreme cases of smaller value, it is impossible for MM model to do a good job. In addition, failure rate would be high, see below.

Reduction failure happens when an HLA is in the smaller set, and fail to locate any "big" HLA in its top n correlation list. In the case of failure, the HLA is mapped to itself.

Too much failures may damage the overall effectiveness of procedure, due to make it overall sparsed, like before. We observed a very small failure rate when choose top=100 at 25% success rate.

In [11]:
# limited to cleaned HLAs (only for this notebook)
# Comment out this and next block to cancel manual cleaning procedure (our main result).
# notice this is code block is the only different one.

chars

Unnamed: 0,feature,char_id,work,char_name
0,XtremeKoolLetterz,l41543,StarWars,Xev Xrexus
1,OneHitKill,l3731,Battlebots,Son of Whyachi (#6)
2,TheLancer,l21770,Primeval,Connor Temple
3,BigBrotherInstinct,l18267,ModernFamily,Manny Delgado
4,FinalDeath,a639,Coco,H?ctor
...,...,...,...,...
945514,MissingMom,a4190,Archer,Malory Archer
945515,ButNowIMustGo,l21681,ThePretender,Jarod
945516,EvenTheGuysWantHim,l27524,Survivor,Ken McNickle
945517,HollywoodGenetics,l24084,ShamelessUS,Liam Gallagher


In [13]:
chars = chars[chars.feature.isin(cleaned_hlas)]

In [14]:
# Note: character (HLA) model does not need to be split. Use the same model for everything.
char_model, CHAR_MAP, INV_CHAR_MAP = get_train(chars, random_state=649128,
                                               factors=36,  regularization=100, iterations=500, conf=20,
                                               top_n=100, col1='char_id', col2='feature')

  0%|          | 0/500 [00:00<?, ?it/s]

<709x44288 sparse matrix of type '<class 'numpy.float64'>'
	with 384353 stored elements in COOrdinate format>
--SANITY: Validate accuracy --


100%|██████████| 500.0/500 [04:37<00:00,  1.80it/s]
100%|██████████| 44288/44288 [00:07<00:00, 5903.46it/s]
  0%|          | 0/500 [00:00<?, ?it/s]

OBJ precision at top 100: 45.09622%
--TRAIN: train on complete matrix --


100%|██████████| 500.0/500 [04:53<00:00,  1.70it/s]


In [15]:
# -- Merge features (Optionally apply) --

class HLAShrinker:
    def __init__(self, model=char_model, char_df=chars):
        self.model = model
        # establish maps
        # for inv: key is feature, value is id.
        self.map, self.inv_map = build_maps(char_df.feature.astype('category'))

    def inspect(self, feature_name=None):
        if feature_name is None:  # inspect all data structure
            pprint(self.inv_map)
        else:
            try:
                _fid = self.inv_map[feature_name]
            except:
                raise Exception('No such HLA: {}'.format(feature_name))
            print('--Inspect close to {}--'.format(feature_name))
            _sims = self.model.similar_items(_fid, N=20)
            for fid, conf in _sims:
                print(self.map[fid], conf)
            print()

    def shrink(self, top=30, to=0.3):
        """Shrink HLAs to smaller set and build map for fast indexing.
        
        Each HLA will be identify as "big" or "small" HLA. In general "big" HLA represent an overall general concept
        of certain HLA concept, and in general measured by appearances of top ranking amoung all others.
        All of big HLA forms the smaller set of mapping. All of small HLA will attempt to map to the most likely big
        HLAs.
        
        Params:
        * top: int, default 30. The top n HLA will be considered as approximate equal for each HLA.
        
        Return:
            HLA map: map of every HLA to its smaller equal.
                Key: HLA. All HLAs is included as key.
                Value: HLA, the smaller and approximate equal of the HLA.
        """
        print('-- Start HLA reduction --')
        assert (to > 0 and to < 1)
        # first, loop through once to establish frequency on each HLA expansion.
        _freq = Counter()  # key: hla id, value: freq.
        for hla_id in self.map.keys():
            for fid, _ in self.model.similar_items(hla_id, N=top):
                if fid != hla_id:  # not measure self
                    _freq[fid] += 1
        _total = len(_freq)
        _reducted = int(_total * to)
        print('Total feature {}, apply reduction to {}'.format(_total, _reducted))
        
        # establish mapping
        bigs = set()  # keep set of all bigs
        maps = {}  # key: small, value big.
        failures = 0  # failures of mapping to big. This is a log info, no real impact
        for hla_id, _ in _freq.most_common(_reducted):
            bigs.add(hla_id)
        for hla_id in self.map.keys():
            hla_name = self.map[hla_id]
            success = False
            if hla_id in bigs:  # the hla is already big
                maps[hla_name] = hla_name  # map to self
            else:
                for fid, _ in self.model.similar_items(hla_id, N=top):
                    if (fid != hla_id) and (fid in bigs):
                        maps[hla_name] = self.map[fid]  # small to big, map to the first big encountered
                        success = True
                        break
                # fail to map to any bigs
                if not success:
                    failures += 1
                    maps[hla_name] = hla_name  # just map to self
        print('total missed in reduction: {}/{}'.format(failures, len(maps)))
        return maps


sh = HLAShrinker()
# sh.inspect()
sh.inspect('AbusiveParents')
# sh.inspect('ActionGirl')
# sh.inspect('TheHeart')
# sh.inspect('BrokenBird')
# sh.inspect('KickTheDog')
# sh.inspect('MadeOfIron')
# sh.inspect('Bookworm')
REDUCTED_HLAs = sh.shrink()  # one time calcuation
pprint(REDUCTED_HLAs)

--Inspect close to AbusiveParents--
AbusiveParents 3.9166937
ParentalNeglect 3.0428889
FreudianExcuse 2.8469195
DomesticAbuse 2.458502
OffingTheOffspring 2.4528542
TheAlcoholic 2.3407366
SelfMadeOrphan 2.1293437
DrowningMySorrows 2.066665
MyBelovedSmother 2.042013
DisappearedDad 2.0347943
UsedToBeASweetKid 2.0004985
ParentalAbandonment 1.9975828
BrokenBird 1.9942929
MissingMom 1.9851972
WellDoneSonGuy 1.9291316
TheUnfavorite 1.8310939
DarkAndTroubledPast 1.7971201
LackOfEmpathy 1.7938548
CallingTheOldManOut 1.7793034
ReallyGetsAround 1.7720815

-- Start HLA reduction --
Total feature 709, apply reduction to 212
total missed in reduction: 2/709
{'AFatherToHisMen': 'TheCaptain',
 'AGodAmI': 'AGodAmI',
 'AbhorrentAdmirer': 'IJustWantToHaveFriends',
 'AbusiveParents': 'SelfMadeOrphan',
 'AcePilot': 'BadassInDistress',
 'AchillesHeel': 'TheJuggernaut',
 'Acrofatic': 'Gasshole',
 'ActionDad': 'BadassInCharge',
 'ActionGirl': 'LadyOfWar',
 'ActionMom': 'BattleCouple',
 'ActionSurvivor': 'Comb

In [16]:
# a little helper for display character names. For human readability.

_CHAR_NOTES_CACHE = {}
def get_char_note(cid, char_df=chars):
    if cid in _CHAR_NOTES_CACHE:
        return _CHAR_NOTES_CACHE[cid]
    select = char_df.loc[char_df.char_id==cid]
    if len(select) == 0:
        return 'Minor character (not enough features).'
        # raise Exception('Can not find character with id {}.'.format(cid))
    first = select.iloc[0]  # all rows has same char_id and work, so get first.
    return '{}|{}'.format(first.work, first.char_name)

In [17]:
CHAR_COUNT = len(CHAR_MAP)

class RetriveCharCluster:
    def __init__(self, target_char_id,
#                  all_dialog_df=dialogs, all_char_features=chars,  # those two for debug info only
                 char_mm=char_model, char_df=chars, char_map=CHAR_MAP, invert_char_map=INV_CHAR_MAP,
                 # Limit to cleaned hlas
                 limit_hlas=None):
        self.char_df = char_df
        self.target = target_char_id
        self.char_mm = char_mm
        print('\n----- LOCK target character {} -----'.format(get_char_note(self.target)))
        # manual checking to see if it is good, should be enough to provide good results.
#         print('Character info. Total {} dialogs, {} features.'.format(
#             len(all_dialog_df.loc[all_dialog_df.char_id==self.target]), ))
        
        # character information
        self.map = char_map  # key: char cat; value: char id.
        self.invert_map = invert_char_map  # key: char id, value: char cat.
        # uniqueness sanity check
        assert (len(self.map) != 0) and (len(self.map) == len(self.invert_map))
        # total amount of unique character, this will be used multiple times later.
        self.count_all = len(self.map.keys())

        # build complete ranking toward target
        self._ranks = self.char_mm.similar_users(self.invert_map[target_char_id], N=self.count_all)
        
        # pass on hlas limits
        self.limit_hlas = limit_hlas
        if limit_hlas:
            # has to be set and not empty
            assert type(limit_hlas) == type(set())
            assert len(limit_hlas) > 0

    def _expand(self, ranks, candidate_map, level1_limit, level2_limit, acceptable_overlap):
        """Expand based on given top rank items. For detail see docs.
        
        top_ranks params requires to be ranked relative to positive/negative and limits to only candidates.
        
        Params:
            * ranks: return results of similar_users. List of two tuple with (confident, )
            * candidate_map: dict, map to candidate that has dialogs.
                If key does not exists, it is not a candidate.
                Key: candidate char cat; value: candidate char id.
            * level1_limit, level2_limit, acceptable_overlap: see documentation.
        
        Returns:
            Return list of elements.
            Each list contain tuple elements. Tuple element format:
                * rank: ranking position.
                * char_id: corrlated character id.
                * conf: confidence value.
                * char_text: tv show name + character name. For manual checking and displaying propose.
        """
        assert len(ranks)  # not empty
        if level1_limit is None:
            level1_limit = self.count_all
        assert level1_limit > 3
        assert level2_limit > 3  # not too low, high bias.
        assert acceptable_overlap > 1  # can not equal to 1.
        # level 2 holder
        # key: candidate category. Value: list of two elements: [frequence, score].
        level2 = {}
        # level 1 candidate holder.
        level1 = []  # list of level 1 char cat.
#         print('Considering all of {} as level1 neighbour candidates.'.format(len(ranks)))
        for r in ranks:
            # input ranking element format: char cat, confidence.
            if len(level1) > level1_limit:
                break
            level1.append(r[0])
        assert len(level1)
#         acceptable_overlap = int(float(len(level1)) / acceptable_overlap_per_set)
        print('Level 1 total {}.'.format(len(level1)))
        # expand to level 2
        for char_cat in level1:
            for rank in self.char_mm.similar_users(char_cat, N=level2_limit):
                if rank[0] in level2:
                    level2[rank[0]] = level2[rank[0]] + 1
                else:
                    level2[rank[0]] = 1
        assert len(level2)
        print('Level 2 total {}'.format(len(level2)))
        print('filtering on {} accepable overlap.'.format(acceptable_overlap))
        pos = {}  # key: freq. value: list of element.
        neu = {}
        for char_cat, freq in level2.items():
            if char_cat in candidate_map:
                holder = None
                if freq >= acceptable_overlap:
                    holder = pos
                else:
                    holder = neu
                cid = self.map[char_cat]
                if freq in holder:
                    holder[freq].append(cid)
                else:
                    holder[freq] = [cid]
#                     res.add((self.map[char_cat], freq))  # add back character id.
        return pos, neu

    def _build_chars(self, list_, log_scale=True):
        res = []
        for freq in sorted(list_.keys(), reverse=True):
            for cid in list_[freq]:
                note = get_char_note(cid)
                if log_scale:
                    weight = math.log(freq)
                else:
                    weight = freq
                res.append((weight, cid, note, freq))
                print('freq {} weight {}: {}'.format(freq, weight, note))
        return res

    def retrive(self, df_dialog_train,
                prec_cutoff=10, level1_limit=None, level2_limit=30, acceptable_overlap=10, log_scale=True):
        """Given a dialog training set, retrive all positive and negative character candidates to target character.
        
        Returns:
            * pos_l: positives character set.
                Each element as format: weight, char_id, note, actual weight.
            * neg_l: negative character list. (Each element follows format of positives).
            * attributes: ranked attributes of the target character.
                The ranking is calcuated from highest cross over freq to lowest in positive set.
        """
        _cands = {}  # map like invert char map, but only for candidiates (chars that have dialogs)
        # calculate possible overall ranking positions that allow to be considered as pos/neg candidates.
        assert 3 < prec_cutoff < 49  # check safe amount of cutoff
        select_amount = int(float(self.count_all)* prec_cutoff / 100)
        print('Considering {} out of {} top ranked characters.'.format(select_amount, self.count_all))
        print(len(df_dialog_train.char_id.unique()))
        for cid in df_dialog_train.char_id.unique():  # for all chars that has training dialogs
            if cid in self.invert_map:  # there is a possibility that cid is filtered out due to low freq count
                _cands[self.invert_map[cid]] = cid  # key: char cat; value: char id.
        pos, neu = self._expand(self._ranks[:select_amount], candidate_map=_cands,
                                level1_limit=level1_limit, level2_limit=level2_limit,
                                acceptable_overlap=acceptable_overlap)
        # return format: dict, with freq as key, list of candidate as value.
#         res = []
#         for freq in sorted(positives.keys(), reverse=True):
#             for cid in positives[freq]:
#                 note = self._get_text(cid)
#                 res.append((freq, cid, note))
#                 print('Weight {}: {}'.format(freq, self._get_text(cid)))
        print('----POSITIVE----')
        pos_l = self._build_chars(pos, log_scale=log_scale)
        print('----NEUTRAL----')
        neu_l = self._build_chars(neu, log_scale=log_scale)
        print('total positive: {}, neutral {}.'.format(len(pos_l), len(neu_l)))
        return pos_l, neu_l

def weight_shock(char_id, positive_chars=None, char_df=chars,
                 most_common_attr=40, apply_shock=False, decouple_attr=False, reduction_map=None, cleaned_hlas=None):
    """Weight the attributes to character then apply a culture shock.
    
    Parameters:
    * char_id: character id.
    * positive_chars: list of positive character.
        The list follow standard format (see retrive).
    * char_df: characte dataframe.
    * most_common_attr: filter the return results to max of this amount.
    """
     # == calcuate common attributes ==
    # Note: the attributes is weighted.
    cnt = Counter()
    positive_weights = Counter()
    for p in positive_chars:
        for hla in (char_df.loc[char_df.char_id == p[1]]).itertuples():
            feature_name = hla.feature
            if cleaned_hlas is not None:
                if feature_name not in cleaned_hlas:
                    continue  # ignore any hla that is not clean
            if reduction_map is not None:
                feature_name = reduction_map[feature_name]
            cnt[feature_name] += 1
            positive_weights[feature_name] += (p[0] + 1)

    # weight and shock: collect all freq on HLA for the positive cluster.
    # note: the attribute is first weighted and then apply culture shock.
    # The attribute only count within if the freqency is bigger than min shock prec (prevent insuffient data)
    weight_freq = {}  # key: attribute name, value: weighted value.
    for attr in cnt.most_common():
        if apply_shock:
            weight_freq[attr[0]] = (positive_weights[attr[0]]) / (math.log(attr[1]) + 1)
        else:
            weight_freq[attr[0]] = (cnt[attr[0]])
    # filter to character attributes
    attrs = []
    target_hlas = set((char_df.loc[char_df.char_id == char_id]).feature.tolist())
    if reduction_map is not None:
        tmp = set()
        for hla in target_hlas:
            tmp.add(reduction_map[hla])
        target_hlas = tmp  # replace it
#     print(weight_freq)
#     print('---------')
#     print(sorted(weight_freq.items(), key=lambda kv: kv[1], reverse=True))
    for attr in sorted(weight_freq.items(), key=lambda kv: kv[1], reverse=True):
        if attr[0] in target_hlas:
            if decouple_attr:
                # break to words
                # credit: https://stackoverflow.com/questions/5020906
                attrs.append(re.sub(r'((?<=[a-z])[A-Z]|(?<!\A)[A-Z](?=[a-z]))', r' \1', attr[0]).lower())
            else:
                attrs.append(attr[0])
        if len(attrs) >= most_common_attr:
            break
    return attrs


# # test, l16826, Don from MadMen. 
# _rc = RetriveCharCluster('l16826')
# _rc.retrive(dialogs.loc[dialogs.show_id!='MadMen'])
# # test, l16829, 
# _rc = RetriveCharCluster('l11612')
# _rc.retrive(dialogs.loc[dialogs.show_id!='GilmoreGirls'])
# test, l16257, Hiccup, how to train your dragon
# _rc = RetriveCharCluster('l16257')
# _rc.retrive(dialogs)

# # test, l10691, Rachel from Friends.
# _rc = RetriveCharCluster('l10691')
# _ps, _ns = _rc.retrive(dialogs.loc[dialogs.show_id!='Friends'])

# Sheldon
_rc = RetriveCharCluster('l4390')
_ps, _ns = _rc.retrive(dialogs.loc[dialogs.show_id!='TheBigBangTheory'])

print('========POSITIVES========')
pprint(_ps)
print('========NEGATIVES========')
pprint(_ns)
print('========attrs=======')
pprint(weight_shock('l4390', _ps, decouple_attr=True))
print('========reducted======')
pprint(weight_shock('l4390', _ps, decouple_attr=True, reduction_map=REDUCTED_HLAs))


----- LOCK target character TheBigBangTheory|Sheldon Cooper from The Big Bang Theory and Young Sheldon -----
Considering 4428 out of 44288 top ranked characters.
320
Level 1 total 4428.
Level 2 total 23469
filtering on 10 accepable overlap.
----POSITIVE----
freq 118 weight 4.770684624465665: Merlin2008|Merlin The Character
freq 114 weight 4.736198448394496: Supernatural|Sam Winchester
freq 106 weight 4.663439094112067: Supernatural|Dean Winchester
freq 101 weight 4.61512051684126: OnceUponATime|Rumplestiltskin/Mr.Gold/Weaver
freq 95 weight 4.553876891600541: Buffyverse|Angel, n? Liam
freq 74 weight 4.30406509320417: TheSimpsons|Lisa Marie Simpson
freq 73 weight 4.290459441148391: Friends|Monica E. Geller
freq 73 weight 4.290459441148391: Merlin2008|Arthur Pendragon
freq 67 weight 4.204692619390966: TheSimpsons|open/close all folders
freq 64 weight 4.1588830833596715: HowIMetYourMother|Barney Stinson
freq 60 weight 4.0943445622221: Supernatural|Castiel
freq 59 weight 4.07753744390572: 

freq 11 weight 2.3978952727983707: TheOC|Marissa Cooper
freq 11 weight 2.3978952727983707: Smallville|Lois Lane
freq 11 weight 2.3978952727983707: OnceUponATime|Princess Snow White/Mary Margaret Nolan
freq 10 weight 2.302585092994046: TheSimpsons|Nedward "Ned" Flanders
freq 10 weight 2.302585092994046: StarTrek|Captain Jonathan Archer (Scott Bakula)
freq 10 weight 2.302585092994046: StarTrek|Captain Kathryn Janeway
freq 10 weight 2.302585092994046: NCIS|Senior Special Agent Timothy "Tim" McGee
freq 10 weight 2.302585092994046: GilmoreGirls|Zach van Gerbig
freq 10 weight 2.302585092994046: Smallville|Lana Lang
----NEUTRAL----
freq 9 weight 2.1972245773362196: Supernatural|Bobby Singer
freq 9 weight 2.1972245773362196: AdventureTime|Princess Bubblegum
freq 9 weight 2.1972245773362196: MyLittlePonyFriendshipIsMagic|Characterization and Background
freq 9 weight 2.1972245773362196: TeenWolf|Lydia
freq 9 weight 2.1972245773362196: StarTrek|Lieutenant Commander Tuvok
freq 9 weight 2.197224577

freq 1 weight 0.0: QueerAsFolk|Debbie Novotny
freq 1 weight 0.0: TheOriginals|Hayley Marshall-Kenner
freq 1 weight 0.0: OneTreeHill|Quinn James
freq 1 weight 0.0: DoctorWho|Jamie McCrimmon
freq 1 weight 0.0: OneTreeHill|Antwon "Skills" Taylor
freq 1 weight 0.0: Charmed1998|Leo Wyatt
freq 1 weight 0.0: DoctorWho|Rani Chandra
freq 1 weight 0.0: Smallville|Jonathan Kent
freq 1 weight 0.0: DoctorWho|Barbara Wright
freq 1 weight 0.0: DoctorWho|Gwen Cooper
freq 1 weight 0.0: NCIS|Director Jenny Shepard
freq 1 weight 0.0: Smallville|Martha Kent
freq 1 weight 0.0: DawsonsCreek|Audrey Liddell
freq 1 weight 0.0: Merlin2008|Gaius
freq 1 weight 0.0: Charmed1998|Phoebe Halliwell
freq 1 weight 0.0: Charmed1998|Paige Matthews
freq 1 weight 0.0: OneTreeHill|Deb Scott
freq 1 weight 0.0: TheOfficeUS|Phyllis Margaret Lapin-Vance
freq 1 weight 0.0: QueerAsFolk|Justin Taylor
freq 1 weight 0.0: TheLWord|Alice Pieszecki
freq 1 weight 0.0: GreysAnatomy|George O'Malley
freq 1 weight 0.0: OneTreeHill|Chase Adam

['hypocrite',
 'butt monkey',
 'adorkable',
 'jerk with a heart of gold',
 'pet the dog',
 'its all about me',
 'jerkass',
 'took a level in jerkass',
 'running gag',
 'freudian excuse',
 'innocently insensitive',
 'foil',
 'morality pet',
 'bunny ears lawyer',
 'brutal honesty',
 'abusive parents',
 'manipulative bastard',
 'took a level in kindness',
 'large ham',
 'man child',
 'cloudcuckoolander',
 'brilliant but lazy',
 'lack of empathy',
 'book dumb',
 'anti hero',
 'honor before reason',
 'never my fault',
 'insufferable genius',
 'tsundere',
 'pride',
 'no social skills',
 'ambiguous disorder',
 'control freak',
 'disappeared dad',
 'bad liar',
 'hollywood nerd',
 'like brother and sister',
 'crazy jealous guy',
 'small name big ego',
 'narcissist']
['its all about me',
 'jerk with a heart of jerk',
 'sugar and ice personality',
 'incorruptible pure pureness',
 'screams like a little girl',
 'mistaken for gay',
 'ditzy genius',
 'used to be a sweet kid',
 'good is not nice',
 '

In [None]:
# raise Exception()

## Simple dialog Training Writing

In [18]:
import re
import math

RE_NL = re.compile(r'\n', re.S)
RE_T = re.compile(r'\t', re.S)
RE_B = re.compile(r'\|', re.S)

def clean_line(line):
    """Remove special char in line
    """
    try:
        line = re.sub(RE_NL, ' ', line)
        line = re.sub(RE_T, ' ', line)
        line = re.sub(RE_B, ' ', line)
    except:
        print('{}:|{}|'.format(type(line), line))
        print(line==np.nan)
        print(math.isnan(line))
        raise
    return line.strip()

clean_line('   There is a greater love. Men out there, young men,\tare\tdying for\nit. Dying so that Atrios might live.   ')

'There is a greater love. Men out there, young men, are dying for it. Dying so that Atrios might live.'

## Training set generation

The script generate training set, and optionally testing set given dialogs, their heads, and character information.

Options:
* Weight: control training repetition in data for each character Ci in training. Choose one of:
    * weight_corr: weight on Ci log scaled corrlation to Ct in cluster for each character Ci, follows by a normalization to quantity of character dialogs.
        * Note: can only be enabled with GT option on positive. Negative characters does not have corrlation weight.
    * weight_char: weight on Ci dialog quantity only.
    * no_weight: do not repeat on training data.
* GT selection: choose what characters' dialogs are selected as training data.
    * all: use all characters as training ground truth.
    * positive: use only characters that are positively corrlated to target.
* Neg option: negative characters for sentence candidate option. Those characters' dialog is used to generate negative sentence candidates. For details about controlling how the sentences are actually drawed, use sentence option (next). 
    * bench: randomly draw a sentence and use it as negative candidate.
        * Note: This approach in general follows traditional way to train an agent (e.g., benchmark). The idea is that if a sentence is randomly drawed, most likely the context is incorrect as response, therefore can be used as negative candidate.  
    * random: randomly draw a character and use its dialog as negative candidates (dialog data is uniformly distributed to characters).
    * positive: same as random, except only draw from positive character set.
    * negative: same as random, except only draw from negative character set.
* Sent option: sentence neighborhood option.
    * close: use MM setence neighborhood to fill in negative sentence candididates (same as persona test).
    * random: randomly withdraw sentences.

The following parameters should be used to build benchmark:
* weight: no_weight.
* gt: all.
* neg: bench.
* sent: random.

The following parameters are our best estimate of a good model that enhance on target HLAs (e.g., recover target persona):
* weight: no_weight.
* gt: all.
* neg: negative.
* sent: close.

Note: Due to our limited hardware conditions, we do not have time to inspect all models and its variations. The above is our best bet.

### Additional details

Additional parameters are used to adjust small details in training.
* fixed_repetition: default true. If the weight option is enabled and negative sentences are randomly drawed, there will be repetition for the negatives for the same ground truth sentence.
    * Note: This variable controls if the repetition is the same or not. If false, the negative candidates will be refreshed (randomly draw again) from training set. Theoritically, set this variable to false would help model to see a wider range of negative candidates.
* relax_validation: default false. This variable allows to "relax" the validation set.
    * Note: From what we observed, due to training set is trained at a very strict condition (strict toward target persona), the validation set is the same strict conditions as training. This may cause the training stopped at early stage before fully converge. Our assumption is that if the validation set follows similar condition as testing set (random draw characters as negative candidates) this helps to continue until converge. Recommand to enable this option.
* run_sent_model_test: default false. This simply enable run a MM test on sentence neighboorhood as a sanity check, have no impact to final results.
* build_folder: default true. This control if the folder need to be built. No impact to final results.

In [19]:
import codecs
import time

class ShuffleMachine:
    """Give a list of index number, able to get one index at a time with random order.
    
    Note: currently only bench condition use this class. Reason: bench condition random draw
        one row from training dialog data, the operation is super slow with pandas function sample.
        The class is setup to speed up the process.
    """
    def __init__(self, df_ref, random_state):
        np.random.seed(random_state)
        inds = df_ref.index.tolist()
        np.random.shuffle(inds)
        self.inds = inds
        self.size = len(inds)
        print('RANDOM MACHINE: total load {} elements.'.format(self.size))
        self.curr = 0  # relative index to data list
        self.ref = df_ref

    def get_one(self):
        """Get a random row.
        """
        self.curr += 1
        if self.curr >= self.size:  # out
            # reset index, start from beginning
            np.random.shuffle(self.inds)  # shuffle again to new order
            self.curr = 0
        _ind = self.inds[self.curr]  # select one index
        return self.ref.loc[_ind]


class HLAMachine:
    def __init__(self, positive_list, char_df=chars,
                 decouple=False, apply_reduction=False, force_random=False, cleaned_hlas=None):
        self.char_df = char_df
        self._cache = {}  # key: char_id, value: weighted shock.
        self.pos = positive_list
        self.decouple = decouple
        self.reduction_map = (REDUCTED_HLAs if apply_reduction else None)
        self.force_random = force_random
        if self.force_random:
            print('!!!! Warning: you used force random. Random HLAs will be drawn as sanity checks. !!!!')
            # use the variable as caching of all HLAs.
            self.force_random = chars.feature.unique().tolist()

    def get(self, char_id, limit=4, apply_shock=False):
        if self.force_random:
            # randomly drawn HLAs regardless of character id for sanity checks
            return random.sample(self.force_random, limit)
        else:
            if char_id not in self._cache:
                self._cache[char_id] = weight_shock(
                    char_id, self.pos, char_df=self.char_df,
                    apply_shock=apply_shock, decouple_attr=self.decouple, reduction_map=self.reduction_map,
                    cleaned_hlas=cleaned_hlas)
            hlas = self._cache[char_id]
            if len(hlas) == 0:
                return []
            return random.sample(hlas, min(limit, len(hlas)))


class TrainWriter:
    TRAIN_F = 'train.txt'
    TEST_F = 'test.txt'
    VALID_F = 'valid.txt'
    LOG_F = 'logs.txt'
    # repetition on training data relative to ground truth
    WEIGHTING_OPTIONS = set(['weight_corr', 'weight_char', 'no_weight'])
    # character used in GT selection optiwon
    GT_SELECTION_OPTIONS = set(['all', 'positive'])
    # character selected for fill in negative candidate sentence
    NEG_SELECTION_OPTIONS = set(['bench', 'random', 'positive', 'negative'])
    # sentence selected for fill in negative candidates
    SENT_NEIGHBOR_OPTIONS = set(['random', 'close'])
    # control concentration of HLA feeding in training.
    # postive indicates HLA fits towards positive community. Target means fits towards one target.
    HLA_CONCENTRATION_OPTIONS = set(['positive', 'target'])

    def __init__(self, target_char_id, test_shows, output_folder,
                 dialog_df=dialogs, char_df=chars, head_df=heads,
                 weight_option='weight_corr', gt_option='positive',
                 neg_option='negative', sent_option='close',
                 fixed_repetition=True, relax_validation=False,
                 run_sent_model_test=False, build_folder=True,
                 # hla feeding configs
                 decouple_hla=False, hla_placeholder=-1, hla_empty=False, hla_concentration='positive',
                 # hla reduction config
                 apply_hla_reduction=False,
                 # HLA sanity check condition
                 hla_force_random=False,
                 # HLA manual cleaning option
                 cleaned_hlas=None,
                ):
        self.cid = target_char_id
        self.test_shows = test_shows
        self.fixed_repetition = fixed_repetition  # if negative content in repetition is fixed
        self.relax_validation = relax_validation
        self.build_hla = (hla_placeholder > 0)
        self.hla_empty = hla_empty
        self.hla_placeholder = hla_placeholder
        self.hla_concentration = hla_concentration
        
        # setup options
        assert weight_option in self.WEIGHTING_OPTIONS
        assert gt_option in self.GT_SELECTION_OPTIONS
        assert neg_option in self.NEG_SELECTION_OPTIONS
        assert sent_option in self.SENT_NEIGHBOR_OPTIONS
        assert hla_concentration in self.HLA_CONCENTRATION_OPTIONS
#         if (weight_option == 'weight_corr') and (gt_option != 'positive'):
#             raise Exception('Only positive gt can use corr weight option. GT option {} on weight option {}.'.format(
#                 gt_option, weight_option,
#             ))
        self.weight_option = weight_option
        self.gt_option = gt_option
        self.neg_option = neg_option
        self.sent_option = sent_option
        if (not self.fixed_repetition) and (self.sent_option == 'close'):
            # meaningless to do different repetition with closed sentence neighboorhood: sentences are fixed.
            raise Exception('Does not support fixed repetition with closed sentence neighboorhood.')

        # tmp state
        self.state = None
        self.fallback_count = 0
        
        # build files
        self.output_base = output_folder
        if build_folder:
            if os.path.exists(self.output_base):
                raise Exception('You directory already exists: {}. Stop operation to prevent overwrite.'.format())
            assert not os.path.exists(self.output_base)  # prevents overwrite
            os.mkdir(self.output_base)
            assert os.path.isdir(self.output_base)

        # hold char id filtering result to speed up.
        self._char_dialog_tmps = {} # key: char id. Value: filtered dataframe.
        
        # validate character
        print('--TARGET {} --'.format(target_char_id))
        print('Character check: {} dialogs, {} features.'.format(
            len(dialog_df.loc[dialog_df.char_id==self.cid]),
            len(char_df.loc[char_df.char_id==self.cid].feature.unique())))
        
        # build and seperate train/test for dialogs and heads data
        train_dialogs, train_heads, test_dialogs, target_dialogs = break_data(
            dialog_df=dialog_df, head_df=head_df, target_char_id=self.cid, test_shows=test_shows)
        # sanity: no target char data in training
        assert len(train_dialogs.loc[train_dialogs.char_id==self.cid]) == 0
        
        # save all train characters for now
        _train_chars = set(train_dialogs.char_id.unique())
        _all_chars_len = len(set(dialog_df.char_id.unique()))
        assert len(_train_chars) < _all_chars_len  # not the same
        print('REPORT: {} out of {} characters are in train.'.format(
            len(_train_chars), _all_chars_len,
        ))
        
        self.train_dialogs = train_dialogs
        self.test_dialogs = test_dialogs
        self.target_dialogs = target_dialogs

        # p and n need to build weight and ratios for character cluster
        # Note: weights and ratios are used to guide training set.
        # weights format: [0] weight value, [1] char_id, [2] note about the character. For each array element.
        # ratio value is the actual repetation needed, adjusted to fit at minimal. Pair to index of weights.
        _pos_chars = set()  # positive characters, caching results
        _neu_chars = set()  # neutral characters
        _rc = RetriveCharCluster(self.cid)
        weights, neutral = _rc.retrive(train_dialogs)
        # record positive sets for later
        weight_map = {}
        for w in weights:
            _pos_chars.add(w[1])
            weight_map[w[1]] = w[0]
        for w in neutral:
            _neu_chars.add(w[1])

        if self.gt_option == 'positive':
            print('Positive Characters Selected As GT:')
            self.weights = weights
            self.ratios = self._build_char_ratios()  # build ratios pair up to weights
        elif self.gt_option == 'all':
            print('All Characters Selected As GT:')
            # build weight 1 to all characters in training set
            self.weights = []
            for cid in _train_chars:
                _w = 1.0  # default weight, if not in positive.
                if cid in weight_map:
                    # note: the weighting is plue 1 due to min one have to be at least as big as negatives.
                    # (The min value of neutral is freqency 1, which after log scale equal to 0)
                    _w = weight_map[cid] + 1
                self.weights.append([_w, cid, get_char_note(cid)])
            self.ratios = self._build_char_ratios()
        else:
            raise Exception('GT option not implemented: {}.'.format(gt_option))
        print('REPORT: {} out of {} train characters as GT with gt option {}.'.format(
            len(self.weights), len(_train_chars), gt_option,
        ))
        
        # build hla machine based on gt group
        self.hla_machine = HLAMachine(positive_list=self.weights, decouple=decouple_hla,
                                      apply_reduction=apply_hla_reduction, force_random=hla_force_random,
                                      cleaned_hlas=cleaned_hlas)
        
        # calcuate characters allows for random dialog selection in training
        # Note: the variable train(test)_draw_chars store characters for negatives sentence candidates.
        if self.neg_option == 'random' or self.neg_option == 'bench':
            print('All Character Selected As negatives:')
            self.train_draw_chars = list(_train_chars)
        elif self.neg_option == 'positive':
            print('Positive Character Selected As negatives:')
            self.train_draw_chars = list(set(_pos_chars))  # set is more like a deep copy
        elif self.neg_option == 'negative':
            print('Negative Character Selected As negatives:')
            # build a set that exclude positive and neutral characters
            _exclude = _pos_chars
            # uncomment the following line to exclude negative as candidates
            # (you may found u does not get much negative chars left if you do that...)
#             _exclude = _pos_chars.union(_neu_chars)  # exclude all characters that are not positive or neutral.
            self.train_draw_chars = []
            for cid in train_dialogs.char_id.unique():
                if cid in _exclude:
                    continue
                self.train_draw_chars.append(cid)
            print('Summary: {} total characters in training, {} positive or neutral excluded'.format(
                len(self.train_dialogs.char_id.unique()), len(_exclude),))
            print('{} negatives in consideration.'.format(len(self.train_draw_chars)))
            assert len(self.train_draw_chars) == (len(self.train_dialogs.char_id.unique()) - len(_exclude))
        else:
            raise Exception('Does not recognize negative char option: {}'.format(self.neg_option))
        
        # in addition, bench option need a shuffle machine.
        if self.neg_option == 'bench':
            self.shuffle_machine = ShuffleMachine(train_dialogs, random_state=33050392)  # from random.org
        
        # calcuate testing characters for negative sentence neighboorhood
        self.test_draw_chars = (test_dialogs.loc[test_dialogs.char_id!=self.cid]).char_id.unique()
        # sanity check: should never draw target character to fill in negative sentence candidates
        assert self.cid not in set(self.test_draw_chars)
        # sanity: no character cross over in train/bench against test
        assert len(set(self.train_draw_chars).intersection(set(self.test_draw_chars))) == 0
        print('Character selection: total of {}/{} train candidates, {} test candidates.'.format(
            len(self.train_draw_chars), len(train_dialogs.char_id.unique()), len(self.test_draw_chars)))
        
        # build sentence neighbourhood model
        # sentence neighbourhood model is build on all heads of sentence in training set.
        # the neighbourhood sentence selection set is on character set of train_draw_chars.
        # a sentence pool is cached to speed up the operation (so do not have to validate character draw everytime).
        if sent_option == 'close':
            # note: this build follows strict hidden of testing data.
            self.sent_model, self.sent_map, self.inv_sent_map = get_train(
                train_heads, report_test_acc=run_sent_model_test)
            assert len(self.sent_map) == len(self.inv_sent_map)
            print('sentence model considering: {}'.format(len(self.sent_map)))
            # Pre-compute sentence pool, a set of sentence category that available for neighboorhood.
            # note: this pool limits to char candidate set, only sentence with char is valid.
            _valid_chars = set(self.train_draw_chars)  # only allow to draw sentence from those characters
            _limit_dialogs = train_dialogs.loc[train_dialogs.char_id.isin(_valid_chars)]
            self.sent_pool = set()  # set of sentence category
            # convert sent id to sent category, so when checking, does not have to do convertion everytime.
            for sid in _limit_dialogs.index.values:
                if sid in self.inv_sent_map:
                    self.sent_pool.add(self.inv_sent_map[sid])
            # 4% of sentence is removed on average
            print('Some sentences may be removed due to not having enough good features.')
            print('Total of {}/{} sentences in consideration.'.format(len(self.sent_pool), len(_limit_dialogs)))

    def _path(self, path):
        return (os.path.join(self.output_base, path))

    def _get_char_dialogs(self, cid):
        """Simply get a character dialogs in training with caching.
        """
        if cid in self._char_dialog_tmps:
            _char_dialogs = self._char_dialog_tmps[cid]
        else:
            dialog_set = self.train_dialogs
            if self.state == 'test':
                dialog_set = self.test_dialogs
            _char_dialogs = dialog_set[dialog_set.char_id==cid]
            self._char_dialog_tmps[cid] = _char_dialogs
        if len(_char_dialogs) == 0:
            raise Exception('Can not find any dialogs with char id: {}'.format(cid))
        assert _char_dialogs.iloc[0].char_id == cid  # validate
        return _char_dialogs

    def _build_char_ratios(self):
        """Character ratio is used in repetations in training data to control learning weight.
        """
        ratios = []
        for w in self.weights:
            weight = w[0]
            cid = w[1]
            dialog_count = len(self._get_char_dialogs(cid))
            if self.weight_option == 'weight_corr':
                ratios.append(float(weight) / dialog_count)
            elif self.weight_option == 'weight_char':
                ratios.append(float(weight))
            elif self.weight_option == 'no_weight':
                ratios.append(float(1))
            else:
                raise Exception('Does not recognize weighting option: {}'.format(self.weighting_option))
        # convert all ratios to ints in approximate ratios (for repetition in minimal file length)
        norm = np.array(ratios)/ min(ratios)
        res = np.rint(norm)
        assert len(res) == len(self.weights)
        return res.astype(int)

    def get_random_dialog2(self, num_of_cands, start_sent):
        """Get a fixed number of random dialog line as training candidate.

        Note that:
        * Each character has equal weight to be selected.
        * No duplicate candidiate are same.
        * Candidates choosed randomly from dialog 2.
        """
        res = set([start_sent])
    #     char_list = chars.char_id.unique()
        while len(res) < num_of_cands:
            # first select a random character (unify distribution of using each char as candidates)
            if self.state == 'train':
                cid = np.random.choice(self.train_draw_chars, 1)[0]
            elif self.state == 'test':
                cid = np.random.choice(self.test_draw_chars, 1)[0]
            else:
                raise Exception('State is unset: {}'.format(self.state))
            _char_dialogs = self._get_char_dialogs(cid)
            cand = _char_dialogs.sample(n=1).iloc[0].dia2
            res.add(cand)
        res = list(res)
        np.random.shuffle(res)
        if len(res) != num_of_cands:
            raise Exception('Mismatch selection length: {} {}'.format(len(res), num_of_cands))
        return res

    def get_close_dialog2(self, num_of_cands, dialog1_id, dialog2,  # notice it pass in sent id rather than raw sentence
                          sent_pool=None, sent_model=None, sent_map=None, inv_sent_map=None,):
        assert self.sent_option == 'close'
        # rank start sentence against dialog 1
        res = set([dialog2])
        
        if sent_pool is None:
            sent_pool = self.sent_pool
        if sent_model is None:
            sent_model = self.sent_model
        if sent_map is None:
            sent_map = self.sent_map
        if inv_sent_map is None:
            inv_sent_map = self.inv_sent_map
        if self.state == 'train':
            _dias = self.train_dialogs
        elif self.state == 'test':
            _dias = self.test_dialogs
        else:
            raise Exception('Unknown state: {}'.format(self.state))
        
        for sent_cat, _ in sent_model.similar_users(inv_sent_map[dialog1_id], N=num_of_cands*200):
            if len(res) >= num_of_cands:
                break
            if sent_cat in sent_pool:  # if allow to consider as sentence candidate
                sid = sent_map[sent_cat]
                res.add(_dias.loc[sid].dia2)
        res = list(res)
        np.random.shuffle(res)
        if len(res) != num_of_cands:
            raise Exception('Mismatch selection length: {} {}'.format(len(res), num_of_cands))
        return res

    def get_bench_dialog2(self, num_of_cands, start_sent):
        """Get a fixed number of random dialog line as training candidate.
        
        Notice the diff between this and get_random_dialog2:
            This function follows strict benchmark condition, which draw a dialog2 regardless of characters.
        """
        assert self.state == 'train'  # only training allow to get bench condition
        res = set([start_sent])
    #     char_list = chars.char_id.unique()
        while len(res) < num_of_cands:
            # select a dialog2 from data
            cand = self.shuffle_machine.get_one().dia2
            res.add(cand)
        res = list(res)
        np.random.shuffle(res)
        if len(res) != num_of_cands:
            raise Exception('Mismatch selection length: {} {}'.format(len(res), num_of_cands))
        return res

    def get_fb_line(self, char_id, diag1, diag2, diag1_id, total_cands=20, reward='', fallback_random=False):
        """Get line for ParlAI format.
        """
        line_cnt = 1
        line = ''
        if self.build_hla:
            if self.hla_empty:  # fill in empty hla
                for _ in range(self.hla_placeholder):
                    line += '{} persona: none.\n'.format(line_cnt)
                    line_cnt += 1
            else:
                for hla in self.hla_machine.get(char_id=char_id, limit=self.hla_placeholder):
                    line += '{} persona: i am {}.\n'.format(line_cnt, hla)
                    line_cnt += 1
        line += '{} {}\t{}\t{}\t'.format(line_cnt, clean_line(diag1), clean_line(diag2), reward)
       
        if self.neg_option == 'bench':  # random draw sentences.
            for cand in self.get_bench_dialog2(num_of_cands=total_cands, start_sent=diag2):
                line += '{}|'.format(clean_line(cand))
        elif self.sent_option == 'random' or self.state=='test' or fallback_random:  # note: test is always random
            for cand in self.get_random_dialog2(num_of_cands=total_cands, start_sent=diag2):
                line += '{}|'.format(clean_line(cand))
        elif self.sent_option == 'close':
            if diag1_id not in self.inv_sent_map:
                # fall back to random option
                self.fallback_count += 1
#                 print('FALL: |{}|'.format(diag1))  # uncomment this to see short sentences...
                return self.get_fb_line(char_id, diag1, diag2, diag1_id, total_cands, reward, fallback_random=True)
            for cand in self.get_close_dialog2(num_of_cands=total_cands, dialog1_id=diag1_id, dialog2=diag2):
                line += '{}|'.format(clean_line(cand))
        else:
            raise Exception('Sentence option not recognized: {}'.format(self.sent_option))
        line = line[:-1] + '\n'  # clip last char, append newline.
        return line

    def _write_lines(self, diag, ratio, f_ptr, relax=False):
        if relax:  # use same condition as standard testing.
            line = self.get_fb_line(diag.char_id, diag.dia1, diag.dia2, diag.Index, total_cands=20, fallback_random=True)
            f_ptr.write(line)
        else:
            if self.fixed_repetition:  # write all the same content
                line = self.get_fb_line(diag.char_id, diag.dia1, diag.dia2, diag.Index, total_cands=20)
                for _ in range(ratio):
                    f_ptr.write(line)
            else:
                for _ in range(ratio):
                    line = self.get_fb_line(diag.char_id, diag.dia1, diag.dia2, diag.Index, total_cands=20)
                    f_ptr.write(line)

    def simple_write(self, train_ratio=0.7, write_test=False, random_state=None):
        print('WRITING...')
        if random_state is not None:
            np.random.seed(random_state)  # for get random lines
        else:
            random_state = int(time.time())
            np.random.seed(random_state)
        
        # record all params before do anything (for quick debug).
        with open(self._path(self.LOG_F), 'a+') as f:
            f.write('random_state={}\n'.format(random_state))
            f.write('train_ratio={}\n'.format(train_ratio))
            f.write('weight_option={}\n'.format(self.weight_option))
            f.write('gt_option={}\n'.format(self.gt_option))
            f.write('neg_option={}\n'.format(self.neg_option))
            f.write('sent_option={}\n'.format(self.sent_option))
            f.write('target={}\n\n'.format(self.cid))
        
        train_ptr = codecs.open(self._path(self.TRAIN_F), 'a+')
        valid_ptr = codecs.open(self._path(self.VALID_F), 'a+')
        log_ptr = open(self._path(self.LOG_F), 'a+')
        
        # write train / valid
        self.state = 'train'
        self.fallback_count = 0
        print('-- Writing {}, {} --'.format(self.TRAIN_F, self.VALID_F))
        for r, w in zip(self.ratios, self.weights):
            char_id = w[1]
            target_dialogs = self._get_char_dialogs(char_id)
            debug_msg = 'Writing total {} from {} with weight {}, ratio {}...'.format(
                len(target_dialogs), w[2], w[0], r)
            print(debug_msg)
            log_ptr.write(debug_msg + '\n')
            for diag in target_dialogs.itertuples():
                if np.random.rand() < train_ratio:
                    self._write_lines(diag=diag, ratio=r, f_ptr=train_ptr)
                else:
                    # valid, allow to relax if required.
                    self._write_lines(diag=diag, ratio=r, f_ptr=valid_ptr, relax=self.relax_validation)
        _msg = 'End operation, total fallback {}/{}'.format(self.fallback_count, len(self.train_dialogs))
        print(_msg)
        log_ptr.write(_msg + '\n')
        
        train_ptr.close()
        valid_ptr.close()
        log_ptr.close()
        print()

        # write test
        if write_test:  # only benchmark is necessary to write test.
            self.state = 'test'
            test_ptr = codecs.open(self._path(self.TEST_F), 'a+')
            print('-- Writing {} --'.format(self.TEST_F))
            self.state = 'test'
            for diag in self.target_dialogs.itertuples():
                line = self.get_fb_line(diag.char_id, diag.dia1, diag.dia2, diag.Index, total_cands=20)
                test_ptr.write(line)
            test_ptr.close()

        self.state = None

## 5 Folds

Shows are divided randomly into 5 folds, and do a 5-fold cross validation. 1 fold is used for testing, 4 for training on each fold.

The idea is that due to the complexity of dialogs, the data may not be uniformally distributed. So prove our model works better in all 5 folds are stronger evidence that it works well.

All characters in testing are used as candidates to fill in testing candidates in both machine and human testing.

In [20]:
global_sent_model, global_sent_map, global_inv_sent_map = get_train(heads, report_test_acc=False,
                                                                    random_state=549178)  # from random.org

<142447x929612 sparse matrix of type '<class 'numpy.float64'>'
	with 13530074 stored elements in COOrdinate format>
--TRAIN: train on complete matrix --


100%|██████████| 200.0/200 [49:06<00:00, 13.32s/it]


In [21]:
class FoldManager:
    def __init__(self, fold_num=5, dialog_df=dialogs, random_state=2565465):
        # NOTE: default random state from random.org
        # do kfold on all shows
        assert fold_num  # can not be zero
        shows = np.array(dialog_df.show_id.unique())
        kf = KFold(n_splits=fold_num, random_state=random_state, shuffle=True)
        self.folds = []
        for train_index, test_index in kf.split(shows):
            self.folds.append({'train': set(shows[train_index]), 'test': set(shows[test_index])})
        assert len(self.folds) == fold_num
        
        self.dialog_df = dialog_df

    def inspect(self, targets=None, char_df=chars):
        for i, f in enumerate(self.folds):
            print('----Fold {}----'.format(i+1))
            print(f)
            print('Summary:')
            total_d = 0
            total_c = 0
            print('Total train shows {}'.format(len(f['train'])))
            for t in f['train']:
                _ping = self.dialog_df.loc[self.dialog_df.show_id==t]
                total_d += _ping.shape[0]
                total_c += len(_ping.char_id.unique())
            print('total train dialog: {}\n train char: {}'.format(total_d, total_c))
            total_d = 0
            total_c = 0
            print('Total test shows {}'.format(len(f['test'])))
            for t in f['test']:
                _ping = self.dialog_df.loc[self.dialog_df.show_id==t]
                total_d += _ping.shape[0]
                total_c += len(_ping.char_id.unique())
            print('total test dialog: {}\n test char: {}'.format(total_d, total_c))
        if targets is not None:
            for t in targets:
                _ping = self.dialog_df.loc[self.dialog_df.char_id==t]
                print('Total target dialog: {}'.format(_ping.shape[0]))
                _ping = char_df.loc[char_df.char_id==t]
                print('Total target HLA: {}'.format(len(_ping.feature.unique())))

    def _sanity_chars(self, char_ids):
        """Check sanity of input characters.
        """
        # sanity check length
        assert len(char_ids) == len(self.folds)
        # check if all character is aligned with fold before do anything
        for cid, fold in zip(char_ids, self.folds):
            char_dias = self.dialog_df.loc[self.dialog_df.char_id==cid]
            assert char_dias.iloc[0].show_id in fold['test']

    def write_persona_test(self, char_ids, base_folder, num_of_cands=20,
                           head_df=heads, report_sent_mm_acc=False, random_state=None,
                           sent_model=global_sent_model, sent_map=global_sent_map, inv_sent_map=global_inv_sent_map):
        """Persona test uses all candidate sentences in testing set.
        """
        assert os.path.isdir(base_folder) and (not os.listdir(base_folder))  # exists and empty
        self._sanity_chars(char_ids)
        
        # for each fold, construct sentence neigbhood against target dialogs.
        fold_count = 1
        log_f = os.path.join(base_folder, 'log.txt')
        for cid, fold in zip(char_ids, self.folds):
            test_f = os.path.join(base_folder, 'test_persona_fold{}_{}.txt'.format(fold_count, cid))
            # use all random (minimal) settings (no need for construct for trainings)
            tw = TrainWriter(target_char_id=cid, test_shows=fold['test'],
                 output_folder=None, build_folder=False,  # cancel folder check (not necessary to use folder)
                 weight_option='no_weight', gt_option='all', neg_option='random', sent_option='random')
            tw.state = 'test'
            
            # -- establish dialog candidates --
            # NOTE: similar to init from TrainWriter, build sentence pool.
            char_cands = tw.test_draw_chars  # character candidates
            # NOTE: this sentence neighboorhood build use all sentences heads
            # (rather than strict hidden of testing dialog heads like TrainWriter).
#             sent_model, sent_map, inv_sent_map = get_train(head_df, report_test_acc=report_sent_mm_acc)
            assert len(sent_map) == len(inv_sent_map)
            print('Testing sentence model considering all {} sentences.'.format(len(sent_map)))
            # Pre-compute sentence pool, a set of sentence category that available for neighboorhood.
            # note: this pool limits to char candidate set, only sentence with char is valid.
            sent_pool = set()  # set of sentence category
            _limit_dialogs = tw.test_dialogs 
            for sid in _limit_dialogs.index.values:  # convert sent id to sent category
                if sid in inv_sent_map:
                    sent_pool.add(inv_sent_map[sid])
            print('Total of {}/{} sentences in consideration.'.format(len(sent_pool), len(_limit_dialogs)))
            
            # -- write test dialogs based on sentence neighborhood 
            print('WRITING PERSONA TEST {}...'.format(cid))
            if random_state is not None:
                np.random.seed(random_state)  # for get random lines
            else:
                random_state = int(time.time())
                np.random.seed(random_state)
                with open(log_f, 'a+') as f:
                    f.write('random_state:fold{}:{}={}\n'.format(fold_count, cid, random_state))

            test_ptr = codecs.open(test_f, 'a+')
            tw.sent_option = 'close'  # do not complian for security check
            fallback_count = 0
            for diag in tw.target_dialogs.itertuples():
                line = '1 {}\t{}\t\t'.format(clean_line(diag.dia1), clean_line(diag.dia2))
                diag1_id = diag.Index
                # rank start sentence against dialog 1
                if diag1_id not in inv_sent_map:
                    # fall back to random option
                    fallback_count += 1
#                         print('FALL: |{}|'.format(diag1))  # uncomment this to see short sentences...
                    res = tw.get_random_dialog2(num_of_cands, diag.dia2)
                else:
                    res = tw.get_close_dialog2(num_of_cands=num_of_cands, dialog1_id=diag1_id, dialog2=diag.dia2,
                                               sent_pool=sent_pool, sent_model=sent_model,
                                               sent_map=sent_map, inv_sent_map=inv_sent_map)
                for cand in res:
                    line += '{}|'.format(clean_line(cand))
                line = line[:-1] + '\n'
                test_ptr.write(line)
            with open(log_f, 'a+') as f:
                f.write('total_fallback:fold{}:{}={}\n'.format(fold_count, cid, fallback_count))

            test_ptr.close()
            fold_count+=1

    def write_all(self, char_ids, base_folder):
        # folder exists and empty
        assert os.path.isdir(base_folder) and (not os.listdir(base_folder))
        
        self._sanity_chars(char_ids)
        
        # write for each fold and its target
        fold_count = 1
        for cid, fold in zip(char_ids, self.folds):
            # write benchmark with none-persona
#             base = os.path.join(base_folder, 'unform_empty')
#             if not os.path.exists(base):
#                 os.mkdir(base)#
#             tw = TrainWriter(target_char_id=cid, test_shows=fold['test'],
#                  output_folder=os.path.join(base, 'fold{}_{}'.format(fold_count, cid)),
#                  weight_option='no_weight', gt_option='all', neg_option='random', sent_option='random',
#                  build_hla=True, hla_placeholder=4)  # hold 4 empty spot in history.
#             tw.simple_write(write_test=True)
#             print('...END WRITE benchmark...')

            # Config1, decoupling
#             base = os.path.join(base_folder, 'sd_config1')
#             if not os.path.exists(base):
#                 os.mkdir(base)
#             tw = TrainWriter(target_char_id=cid, test_shows=fold['test'],
#                              output_folder=os.path.join(base, 'fold{}_{}'.format(fold_count, cid)),
#                              weight_option='no_weight', gt_option='positive', neg_option='negative', sent_option='close',
#                              build_hla=True, decouple_hla=True)
#             tw.simple_write(write_test=True)
#             print('...END WRITE sd_config1...')


            # config 1, all
#             base = os.path.join(base_folder, 'all_longconfig1')
#             if not os.path.exists(base):
#                 os.mkdir(base)
#             tw = TrainWriter(target_char_id=cid, test_shows=fold['test'],
#                              output_folder=os.path.join(base, 'fold{}_{}'.format(fold_count, cid)),
#                              weight_option='no_weight', gt_option='all', neg_option='random', sent_option='random',
#                              hla_placeholder=8, decouple_hla=True)
#             tw.simple_write(write_test=True)
#             print('...END WRITE all_longconfig1...')

#             # config1, decoupled and longer hla (standard configuration described in paper)
#             base = os.path.join(base_folder, 'sdlong_config1')
#             if not os.path.exists(base):
#                 os.mkdir(base)
#             tw = TrainWriter(target_char_id=cid, test_shows=fold['test'],
#                              output_folder=os.path.join(base, 'fold{}_{}'.format(fold_count, cid)),
#                              weight_option='no_weight', gt_option='positive', neg_option='negative', sent_option='close',
#                              hla_placeholder=8, decouple_hla=True)
#             tw.simple_write(write_test=True)
#             print('...END WRITE sdlong_config1...')

            # config 1, with hla reduction
#             base = os.path.join(base_folder, 'sdlong_reducted_config1')
#             if not os.path.exists(base):
#                 os.mkdir(base)
#             tw = TrainWriter(target_char_id=cid, test_shows=fold['test'],
#                              output_folder=os.path.join(base, 'fold{}_{}'.format(fold_count, cid)),
#                              weight_option='no_weight', gt_option='positive', neg_option='negative', sent_option='close',
#                              hla_placeholder=8, decouple_hla=True, apply_hla_reduction=True)
#             tw.simple_write(write_test=True)
#             print('...END WRITE sdlong_reducted_config1...')

#            # config 1, with manual correction
#            base = os.path.join(base_folder, 'sdlong_cleaned_config1')
#            if not os.path.exists(base):
#                os.mkdir(base)
#            tw = TrainWriter(target_char_id=cid, test_shows=fold['test'],
#                             output_folder=os.path.join(base, 'fold{}_{}'.format(fold_count, cid)),
#                             weight_option='no_weight', gt_option='positive', neg_option='negative', sent_option='close',
#                             hla_placeholder=8, decouple_hla=True, cleaned_hlas=cleaned_hlas)
#            tw.simple_write(write_test=True)
#            print('...END WRITE sdlong_cleaned_config1...')
            
            # config 1, with manual correction and all consider all characters
            base = os.path.join(base_folder, 'sdlong_cleanedall_config1')
            if not os.path.exists(base):
                os.mkdir(base)
            tw = TrainWriter(target_char_id=cid, test_shows=fold['test'],
                             output_folder=os.path.join(base, 'fold{}_{}'.format(fold_count, cid)),
                             weight_option='no_weight', gt_option='all', neg_option='negative', sent_option='close',
                             hla_placeholder=8, decouple_hla=True, cleaned_hlas=cleaned_hlas)
            tw.simple_write(write_test=True)
            print('...END WRITE sdlong_cleanedall_config1...')

            # relaxed config 2
#             base = os.path.join(base_folder, 'relaxed_config2')
#             if not os.path.exists(base):
#                 os.mkdir(base)
#             tw = TrainWriter(target_char_id=cid, test_shows=fold['test'],
#                  output_folder=os.path.join(base, 'fold{}_{}'.format(fold_count, cid)),
#                  weight_option='no_weight', gt_option='all', neg_option='negative',
#                  sent_option='close', relax_validation=True)
#             tw.simple_write()
#             print('...END WRITE config relaxed_config2...')

            fold_count+=1

In [22]:
fm = FoldManager()
fm.inspect()

----Fold 1----
{'train': {'NCIS', 'TheSimpsons', 'Salem', 'TheOC', 'Charmed1998', 'AdventureTime', 'MyLittlePonyFriendshipIsMagic', 'TrueBlood', 'StarTrek', 'TheOfficeUS', 'DoctorWho', 'Friends', 'SonsOfAnarchy', 'TeenWolf', 'Supernatural', 'Futurama', 'CSIVerse', 'TheOriginals', 'TheLWord', 'Merlin2008', 'Bones', 'Alias', 'TheSecretLifeOfTheAmericanTeenager', 'OnceUponATime', 'QueerAsFolk', 'Buffyverse', 'Roswell', 'TheMentalist', 'VeronicaMars', 'OneTreeHill'}, 'test': {'GreysAnatomy', 'TheBigBangTheory', 'TheVampireDiaries', 'HowIMetYourMother', 'Smallville', 'Seinfeld', 'GilmoreGirls', 'DawsonsCreek'}}
Summary:
Total train shows 30
total train dialog: 764168
 train char: 256
Total test shows 8
total test dialog: 278479
 test char: 71
----Fold 2----
{'train': {'TheBigBangTheory', 'NCIS', 'TheSimpsons', 'TheOC', 'Charmed1998', 'Smallville', 'Seinfeld', 'GilmoreGirls', 'AdventureTime', 'MyLittlePonyFriendshipIsMagic', 'TheOfficeUS', 'DoctorWho', 'Friends', 'TeenWolf', 'Supernatural', 

In [23]:
# Manually setup characters
# Note: we can, and preferablely randomly choose some characters.
# However, human paticipants are required in next step, so we have to choose well-know characters.

# this list is based on characters in each fold that has very high count of dialogs
# this was originally what we used to play around, not what we used in paper,
# you are welcome to play around with them as well.
SELECTED_TARGETS = [
    'l7947',  # Josephine "Joey" Potter from DawsonsCreek
    'l25431', # picard from star trek
    'l10692', # MONICA from Friends
    'l19235',  # RYAN from TheOC
    'l19352', # MICHAEL from The office
]

# this list is offically what we report on paper
# a list of well-known characters that we used to run human paticipants study.
OFFICAL_TARGETS = [
    'l4390',  # Sheldon from BigBang
    'l25431', # Picard from star trek
    'l10692', # MONICA from Friends
    'l7484',  # GRISSOM from CSI
    'a15821', # Marge from Simpsons
]

In [24]:
fm.inspect(targets=OFFICAL_TARGETS)

----Fold 1----
{'train': {'NCIS', 'TheSimpsons', 'Salem', 'TheOC', 'Charmed1998', 'AdventureTime', 'MyLittlePonyFriendshipIsMagic', 'TrueBlood', 'StarTrek', 'TheOfficeUS', 'DoctorWho', 'Friends', 'SonsOfAnarchy', 'TeenWolf', 'Supernatural', 'Futurama', 'CSIVerse', 'TheOriginals', 'TheLWord', 'Merlin2008', 'Bones', 'Alias', 'TheSecretLifeOfTheAmericanTeenager', 'OnceUponATime', 'QueerAsFolk', 'Buffyverse', 'Roswell', 'TheMentalist', 'VeronicaMars', 'OneTreeHill'}, 'test': {'GreysAnatomy', 'TheBigBangTheory', 'TheVampireDiaries', 'HowIMetYourMother', 'Smallville', 'Seinfeld', 'GilmoreGirls', 'DawsonsCreek'}}
Summary:
Total train shows 30
total train dialog: 764168
 train char: 256
Total test shows 8
total test dialog: 278479
 test char: 71
----Fold 2----
{'train': {'TheBigBangTheory', 'NCIS', 'TheSimpsons', 'TheOC', 'Charmed1998', 'Smallville', 'Seinfeld', 'GilmoreGirls', 'AdventureTime', 'MyLittlePonyFriendshipIsMagic', 'TheOfficeUS', 'DoctorWho', 'Friends', 'TeenWolf', 'Supernatural', 

In [None]:
# fm.write_persona_test(
#     char_ids=SELECTED_TARGETS,
#     base_folder=os.path.expanduser('~/Datasets/dodgsons/formal_aaai/5fold_persona_tests')
# )

In [25]:
fm.write_all(
    char_ids=OFFICAL_TARGETS,
    base_folder=os.path.expanduser('~/Datasets/dodgsons/formal_aaai/pure')
)

--TARGET l4390 --
Character check: 9133 dialogs, 74 features.
TARGET: SHELDON from TheBigBangTheory, total of 9133 dialogs
Total training dialogs 764168;
Total testing dialogs 269346;
Total target dialogs 9133;
Total training heads: 13530074, Avg 17.71 per sentence
REPORT: 256 out of 327 characters are in train.

----- LOCK target character TheBigBangTheory|Sheldon Cooper from The Big Bang Theory and Young Sheldon -----
Considering 4428 out of 44288 top ranked characters.
256
Level 1 total 4428.
Level 2 total 23469
filtering on 10 accepable overlap.
----POSITIVE----
freq 118 weight 4.770684624465665: Merlin2008|Merlin The Character
freq 114 weight 4.736198448394496: Supernatural|Sam Winchester
freq 106 weight 4.663439094112067: Supernatural|Dean Winchester
freq 101 weight 4.61512051684126: OnceUponATime|Rumplestiltskin/Mr.Gold/Weaver
freq 95 weight 4.553876891600541: Buffyverse|Angel, n? Liam
freq 74 weight 4.30406509320417: TheSimpsons|Lisa Marie Simpson
freq 73 weight 4.2904594411483

freq 6 weight 1.791759469228055: Futurama|Hermes Conrad
freq 6 weight 1.791759469228055: TheOC|Ryan Atwood
freq 6 weight 1.791759469228055: TeenWolf|Derek
freq 6 weight 1.791759469228055: TheOfficeUS|Janet "Jan" Levinson (formerly Levinson-Gould)
freq 6 weight 1.791759469228055: TheSimpsons|Nelson Muntz
freq 6 weight 1.791759469228055: DoctorWho|Doctor Who ? Amy Pond
freq 6 weight 1.791759469228055: TheOriginals|Elijah Mikaelson
freq 6 weight 1.791759469228055: StarTrek|Ensign Harry Kim
freq 6 weight 1.791759469228055: Alias|Marshall Flinkman
freq 6 weight 1.791759469228055: OneTreeHill|Peyton Sawyer
freq 6 weight 1.791759469228055: CSIVerse|Sara Sidle
freq 5 weight 1.6094379124341003: MyLittlePonyFriendshipIsMagic|Characterization and Background
freq 5 weight 1.6094379124341003: NCIS|Special Agent Ziva David
freq 5 weight 1.6094379124341003: OnceUponATime|Henry Mills
freq 5 weight 1.6094379124341003: MyLittlePonyFriendshipIsMagic|Scootaloo
freq 5 weight 1.6094379124341003: TheSimpsons

100%|██████████| 200.0/200 [36:17<00:00,  9.93s/it]


sentence model considering: 664931
Some sentences may be removed due to not having enough good features.
Total of 321239/367100 sentences in consideration.
WRITING...
-- Writing train.txt, valid.txt --
Writing total 2404 from DoctorWho|Doctor Who ? Brigadier Lethbridge-Stewart with weight 1.0, ratio 1...
Writing total 2196 from DoctorWho|Josephine "Jo" Grant with weight 1.0, ratio 1...
Writing total 8160 from Friends|Rachel Karen Green with weight 4.555348061489413, ratio 1...
Writing total 772 from OnceUponATime|Belle/Lacey with weight 3.70805020110221, ratio 1...
Writing total 2308 from OnceUponATime|Princess Snow White/Mary Margaret Nolan with weight 3.3978952727983707, ratio 1...
Writing total 2873 from DoctorWho|Jamie McCrimmon with weight 1.0, ratio 1...
Writing total 9344 from StarTrek|Captain James Tiberius Kirk with weight 3.8903717578961645, ratio 1...
Writing total 3247 from CSIVerse|Captain Jim Brass with weight 1.0, ratio 1...
Writing total 1013 from TheMentalist|Kimball C

Writing total 1003 from Alias|Marcus Dixon with weight 1.0, ratio 1...
Writing total 4758 from Charmed1998|Prudence 'Prue' Halliwell with weight 1.0, ratio 1...
Writing total 2732 from DoctorWho|Ian Chesterton with weight 1.0, ratio 1...
Writing total 1263 from MyLittlePonyFriendshipIsMagic|Apple Bloom with weight 3.8903717578961645, ratio 1...
Writing total 797 from Salem|Mary Sibley with weight 1.0, ratio 1...
Writing total 1368 from TheSimpsons|Milhouse Van Houten with weight 4.367295829986475, ratio 1...
Writing total 1902 from OnceUponATime|Prince David "Charming"/David Nolan with weight 3.8903717578961645, ratio 1...
Writing total 888 from TheOfficeUS|Phyllis Margaret Lapin-Vance with weight 1.0, ratio 1...
Writing total 4880 from DoctorWho|Tropes associated with the television series with weight 4.091042453358316, ratio 1...
Writing total 7107 from Friends|Friends Joey Tribbiani with weight 4.688879454113936, ratio 1...
Writing total 3076 from Bones|Angela Montenegro with weight

Writing total 1770 from OneTreeHill|Dan Scott with weight 1.0, ratio 1...
Writing total 7490 from DoctorWho|Doctor Who ? Sarah Jane Smith with weight 1.0, ratio 1...
Writing total 625 from AdventureTime|Ice King with weight 1.0, ratio 1...
Writing total 4874 from DoctorWho|First Doctor with weight 1.0, ratio 1...
Writing total 634 from TheOfficeUS|Stanley James Hudson with weight 1.0, ratio 1...
Writing total 1730 from OnceUponATime|Captain Killian "Hook" Jones with weight 4.218875824868201, ratio 1...
Writing total 7908 from Friends|Friends Ross Geller with weight 5.007333185232471, ratio 1...
Writing total 2090 from Roswell|Michael Guerin with weight 1.0, ratio 1...
Writing total 4297 from VeronicaMars|Veronica Mars (Kristen Bell) with weight 3.3978952727983707, ratio 1...
Writing total 1940 from TheSimpsons|Moe Szyslak with weight 1.0, ratio 1...
Writing total 736 from Merlin2008|Lady Morgana with weight 4.850147601710058, ratio 1...
Writing total 19683 from TheSimpsons|open/close a

freq 26 weight 3.258096538021482: DoctorWho|Fourth Doctor
freq 25 weight 3.2188758248682006: OnceUponATime|The Evil Queen/Regina Mills/Roni
freq 24 weight 3.1780538303479458: DoctorWho|Tropes associated with the television series
freq 24 weight 3.1780538303479458: HowIMetYourMother|Barney Stinson
freq 24 weight 3.1780538303479458: GreysAnatomy|Derek Shepherd
freq 23 weight 3.1354942159291497: Buffyverse|Wesley
freq 23 weight 3.1354942159291497: MyLittlePonyFriendshipIsMagic|Characterization and Background
freq 23 weight 3.1354942159291497: OnceUponATime|Princess Snow White/Mary Margaret Nolan
freq 22 weight 3.091042453358316: DoctorWho|Ian Chesterton
freq 21 weight 3.044522437723423: NCIS|Special Agent in Charge Leroy Jethro Gibbs
freq 21 weight 3.044522437723423: Charmed1998|Chris Perry Halliwell
freq 20 weight 2.995732273553991: Alias|Marcus Dixon
freq 19 weight 2.9444389791664403: OnceUponATime|Prince David "Charming"/David Nolan
freq 18 weight 2.8903717578961645: OnceUponATime|Emma

freq 2 weight 0.6931471805599453: Seinfeld|Jerry Seinfeld
freq 2 weight 0.6931471805599453: TheOfficeUS|Phyllis Margaret Lapin-Vance
freq 2 weight 0.6931471805599453: TheSimpsons|Lenny
freq 2 weight 0.6931471805599453: Smallville|Pete Ross
freq 2 weight 0.6931471805599453: TheOriginals|Marcel Gerard
freq 2 weight 0.6931471805599453: CSIVerse|Warrick Brown
freq 2 weight 0.6931471805599453: QueerAsFolk|Justin Taylor
freq 2 weight 0.6931471805599453: MyLittlePonyFriendshipIsMagic|Apple Bloom
freq 2 weight 0.6931471805599453: MyLittlePonyFriendshipIsMagic|Sweetie Belle
freq 2 weight 0.6931471805599453: TheOC|Ryan Atwood
freq 2 weight 0.6931471805599453: Friends|Monica E. Geller
freq 2 weight 0.6931471805599453: GreysAnatomy|Preston Burke
freq 1 weight 0.0: Futurama|Bender Bending Rodriguez (Bending Unit 22)
freq 1 weight 0.0: Alias|Syndey Bristow
freq 1 weight 0.0: AdventureTime|Adventure Time Jake
freq 1 weight 0.0: Charmed1998|Billie Jenkins
freq 1 weight 0.0: DoctorWho|Jamie McCrimmon
f

100%|██████████| 200.0/200 [39:01<00:00, 10.72s/it]


sentence model considering: 720807
Some sentences may be removed due to not having enough good features.
Total of 537624/627398 sentences in consideration.
WRITING...
-- Writing train.txt, valid.txt --
Writing total 2404 from DoctorWho|Doctor Who ? Brigadier Lethbridge-Stewart with weight 3.6390573296152584, ratio 1...
Writing total 2196 from DoctorWho|Josephine "Jo" Grant with weight 1.0, ratio 1...
Writing total 8160 from Friends|Rachel Karen Green with weight 1.0, ratio 1...
Writing total 772 from OnceUponATime|Belle/Lacey with weight 3.302585092994046, ratio 1...
Writing total 2308 from OnceUponATime|Princess Snow White/Mary Margaret Nolan with weight 4.13549421592915, ratio 1...
Writing total 2873 from DoctorWho|Jamie McCrimmon with weight 1.0, ratio 1...
Writing total 3068 from GilmoreGirls|Richard Gilmore with weight 1.0, ratio 1...
Writing total 3247 from CSIVerse|Captain Jim Brass with weight 1.0, ratio 1...
Writing total 4099 from GilmoreGirls|Sookie St. James with weight 1.0

Writing total 1890 from Roswell|Isabel Evans with weight 1.0, ratio 1...
Writing total 1774 from TheBigBangTheory|Bernadette Rostenkowski with weight 1.0, ratio 1...
Writing total 4561 from TheBigBangTheory|Howard Wolowitz with weight 1.0, ratio 1...
Writing total 3315 from Futurama|Bender Bending Rodriguez (Bending Unit 22) with weight 1.0, ratio 1...
Writing total 12949 from Bones|Agent Seeley Booth with weight 1.0, ratio 1...
Writing total 891 from Alias|Will Tippin with weight 1.0, ratio 1...
Writing total 2414 from DoctorWho|Gwen Cooper with weight 1.0, ratio 1...
Writing total 5302 from TheOC|Seth Cohen with weight 1.0, ratio 1...
Writing total 1003 from Alias|Marcus Dixon with weight 3.995732273553991, ratio 1...
Writing total 4758 from Charmed1998|Prudence 'Prue' Halliwell with weight 1.0, ratio 1...
Writing total 2732 from DoctorWho|Ian Chesterton with weight 4.091042453358316, ratio 1...
Writing total 1263 from MyLittlePonyFriendshipIsMagic|Apple Bloom with weight 1.0, ratio 

Writing total 1331 from Smallville|Lionel Luthor with weight 3.302585092994046, ratio 1...
Writing total 1730 from OnceUponATime|Captain Killian "Hook" Jones with weight 1.0, ratio 1...
Writing total 7908 from Friends|Friends Ross Geller with weight 1.0, ratio 1...
Writing total 2090 from Roswell|Michael Guerin with weight 1.0, ratio 1...
Writing total 1940 from TheSimpsons|Moe Szyslak with weight 1.0, ratio 1...
Writing total 19683 from TheSimpsons|open/close all folders with weight 3.5649493574615367, ratio 1...
Writing total 4365 from TheOfficeUS|Pamela Morgan "Pam" Beesly-Halpert with weight 1.0, ratio 1...
Writing total 660 from Charmed1998|Billie Jenkins with weight 1.0, ratio 1...
Writing total 12127 from Bones|Temperance Brennan with weight 1.0, ratio 1...
Writing total 2954 from OnceUponATime|The Evil Queen/Regina Mills/Roni with weight 4.218875824868201, ratio 1...
Writing total 7370 from Friends|Chandler Muriel Bing with weight 1.0, ratio 1...
Writing total 1369 from CSIVers

freq 31 weight 3.4339872044851463: GilmoreGirls|Emily Gilmore
freq 30 weight 3.4011973816621555: TheSimpsons|Marjorie "Marge" Jacqueline Simpson (n?e Bouvier)
freq 28 weight 3.332204510175204: Merlin2008|Guinevere (Gwen)
freq 27 weight 3.295836866004329: Futurama|Amy Wong, Ph.D
freq 27 weight 3.295836866004329: TheVampireDiaries|Elena Gilbert
freq 26 weight 3.258096538021482: HowIMetYourMother|Ted Mosby
freq 26 weight 3.258096538021482: TheVampireDiaries|Bonnie Mc Cullough
freq 26 weight 3.258096538021482: TheOC|Marissa Cooper
freq 26 weight 3.258096538021482: TheMentalist|Teresa Lisbon
freq 26 weight 3.258096538021482: OneTreeHill|Lucas Scott
freq 25 weight 3.2188758248682006: GilmoreGirls|Lorelai "Rory" Leigh Gilmore
freq 25 weight 3.2188758248682006: TheOfficeUS|The Office US Michael Scott
freq 25 weight 3.2188758248682006: OneTreeHill|Alex Dupre
freq 24 weight 3.1780538303479458: TheBigBangTheory|Bernadette Rostenkowski
freq 24 weight 3.1780538303479458: MyLittlePonyFriendshipIsMag

freq 5 weight 1.6094379124341003: Roswell|Isabel Evans
freq 5 weight 1.6094379124341003: MyLittlePonyFriendshipIsMagic|Scootaloo
freq 5 weight 1.6094379124341003: TheLWord|Jenny Schecter
freq 5 weight 1.6094379124341003: DoctorWho|Gwen Cooper
freq 5 weight 1.6094379124341003: Seinfeld|Cosmo Kramer
freq 4 weight 1.3862943611198906: TeenWolf|Derek
freq 4 weight 1.3862943611198906: DoctorWho|Doctor Who ? Rose Tyler
freq 4 weight 1.3862943611198906: Andromeda|Trance Gemini
freq 4 weight 1.3862943611198906: TheOfficeUS|Pamela Morgan "Pam" Beesly-Halpert
freq 4 weight 1.3862943611198906: Smallville|Martha Kent
freq 4 weight 1.3862943611198906: CSIVerse|Dr. Gilbert "Gil" Grissom
freq 4 weight 1.3862943611198906: TheLWord|Bette Porter
freq 4 weight 1.3862943611198906: DawsonsCreek|Audrey Liddell
freq 4 weight 1.3862943611198906: DawsonsCreek|Andrea "Andie" McPhee
freq 4 weight 1.3862943611198906: StarTrek|Lieutenant (later Lieutenant Commander) Worf
freq 4 weight 1.3862943611198906: TheBigBang

100%|██████████| 200.0/200 [39:50<00:00, 10.96s/it]


sentence model considering: 739289
Some sentences may be removed due to not having enough good features.
Total of 361608/414021 sentences in consideration.
WRITING...
-- Writing train.txt, valid.txt --
Writing total 2404 from DoctorWho|Doctor Who ? Brigadier Lethbridge-Stewart with weight 1.0, ratio 1...
Writing total 2196 from DoctorWho|Josephine "Jo" Grant with weight 3.302585092994046, ratio 1...
Writing total 9344 from StarTrek|Captain James Tiberius Kirk with weight 1.0, ratio 1...
Writing total 4027 from StarTrek|Lieutenant Junior Grade B'Elanna Torres with weight 1.0, ratio 1...
Writing total 3068 from GilmoreGirls|Richard Gilmore with weight 1.0, ratio 1...
Writing total 2873 from DoctorWho|Jamie McCrimmon with weight 1.0, ratio 1...
Writing total 2787 from GilmoreGirls|Paris Eustace Geller with weight 4.465735902799727, ratio 1...
Writing total 3247 from CSIVerse|Captain Jim Brass with weight 1.0, ratio 1...
Writing total 1013 from TheMentalist|Kimball Cho with weight 1.0, rat

Writing total 1890 from Roswell|Isabel Evans with weight 1.0, ratio 1...
Writing total 1774 from TheBigBangTheory|Bernadette Rostenkowski with weight 4.178053830347945, ratio 1...
Writing total 1734 from Andromeda|Tyr Anasazi with weight 1.0, ratio 1...
Writing total 4561 from TheBigBangTheory|Howard Wolowitz with weight 3.8903717578961645, ratio 1...
Writing total 3315 from Futurama|Bender Bending Rodriguez (Bending Unit 22) with weight 3.70805020110221, ratio 1...
Writing total 891 from Alias|Will Tippin with weight 1.0, ratio 1...
Writing total 2414 from DoctorWho|Gwen Cooper with weight 1.0, ratio 1...
Writing total 5302 from TheOC|Seth Cohen with weight 3.5649493574615367, ratio 1...
Writing total 1342 from OneTreeHill|Clayton "Clay" Evans with weight 1.0, ratio 1...
Writing total 1003 from Alias|Marcus Dixon with weight 1.0, ratio 1...
Writing total 2732 from DoctorWho|Ian Chesterton with weight 1.0, ratio 1...
Writing total 1263 from MyLittlePonyFriendshipIsMagic|Apple Bloom wit

Writing total 1456 from SonsOfAnarchy|Gemma Teller Morrow (n?e Madock) with weight 1.0, ratio 1...
Writing total 795 from TheOfficeUS|Janet "Jan" Levinson (formerly Levinson-Gould) with weight 3.4849066497880004, ratio 1...
Writing total 1284 from GilmoreGirls|Michel Gerard with weight 1.0, ratio 1...
Writing total 1060 from TheOfficeUS|Darryl Mathias Philbin with weight 1.0, ratio 1...
Writing total 916 from Futurama|John A. Zoidberg, M.D. with weight 1.0, ratio 1...
Writing total 912 from TheMentalist|Wayne Rigsby with weight 1.0, ratio 1...
Writing total 2013 from MyLittlePonyFriendshipIsMagic|Characterization and Background with weight 3.70805020110221, ratio 1...
Writing total 891 from MyLittlePonyFriendshipIsMagic|Sweetie Belle with weight 3.70805020110221, ratio 1...
Writing total 5581 from DoctorWho|Third Doctor with weight 1.0, ratio 1...
Writing total 2563 from StarTrek|Doctor (Lieutenant Commander) Leonard Horatio "Bones" McCoy with weight 1.0, ratio 1...
Writing total 1482 

Writing total 1078 from TheSimpsons|Krusty the Clown with weight 1.0, ratio 1...
Writing total 1390 from TheSecretLifeOfTheAmericanTeenager|Richard "Ricky" Underwood with weight 1.0, ratio 1...
End operation, total fallback 113513/852802

-- Writing test.txt --
...END WRITE sdlong_cleanedall_config1...
--TARGET l7484 --
Character check: 6100 dialogs, 18 features.
TARGET: GRISSOM from CSIVerse, total of 6100 dialogs
Total training dialogs 929025;
Total testing dialogs 107522;
Total target dialogs 6100;
Total training heads: 13530074, Avg 14.56 per sentence
REPORT: 275 out of 327 characters are in train.

----- LOCK target character CSIVerse|Dr. Gilbert "Gil" Grissom -----
Considering 4428 out of 44288 top ranked characters.
275
Level 1 total 4428.
Level 2 total 22419
filtering on 10 accepable overlap.
----POSITIVE----
freq 77 weight 4.343805421853684: Merlin2008|Merlin The Character
freq 57 weight 4.04305126783455: TheBigBangTheory|Sheldon Cooper from The Big Bang Theory and Young Sheld

freq 6 weight 1.791759469228055: GreysAnatomy|Callie Torres
freq 6 weight 1.791759469228055: OneTreeHill|Haley James-Scott
freq 6 weight 1.791759469228055: Bones|Camille Saroyan
freq 6 weight 1.791759469228055: Futurama|John A. Zoidberg, M.D.
freq 6 weight 1.791759469228055: Smallville|Oliver Queen / Green Arrow (Justin Hartley)
freq 6 weight 1.791759469228055: DoctorWho|Doctor Who ? Jack Harkness
freq 6 weight 1.791759469228055: DoctorWho|Seventh Doctor
freq 6 weight 1.791759469228055: QueerAsFolk|Theodore (Ted) Schmidt
freq 6 weight 1.791759469228055: Futurama|Hermes Conrad
freq 5 weight 1.6094379124341003: Charmed1998|Leo Wyatt
freq 5 weight 1.6094379124341003: Andromeda|Rommie
freq 5 weight 1.6094379124341003: StarTrek|Doctor Phlox (John Billingsley)
freq 5 weight 1.6094379124341003: AdventureTime|Princess Bubblegum
freq 5 weight 1.6094379124341003: GreysAnatomy|Cristina Yang
freq 5 weight 1.6094379124341003: Merlin2008|Uther Pendragon
freq 5 weight 1.6094379124341003: TheOfficeUS|

100%|██████████| 200.0/200 [43:03<00:00, 11.78s/it]


sentence model considering: 806186
Some sentences may be removed due to not having enough good features.
Total of 455006/525545 sentences in consideration.
WRITING...
-- Writing train.txt, valid.txt --
Writing total 2404 from DoctorWho|Doctor Who ? Brigadier Lethbridge-Stewart with weight 3.3978952727983707, ratio 1...
Writing total 2196 from DoctorWho|Josephine "Jo" Grant with weight 1.0, ratio 1...
Writing total 8160 from Friends|Rachel Karen Green with weight 3.3978952727983707, ratio 1...
Writing total 772 from OnceUponATime|Belle/Lacey with weight 1.0, ratio 1...
Writing total 2308 from OnceUponATime|Princess Snow White/Mary Margaret Nolan with weight 1.0, ratio 1...
Writing total 2873 from DoctorWho|Jamie McCrimmon with weight 1.0, ratio 1...
Writing total 9344 from StarTrek|Captain James Tiberius Kirk with weight 4.465735902799727, ratio 1...
Writing total 4027 from StarTrek|Lieutenant Junior Grade B'Elanna Torres with weight 1.0, ratio 1...
Writing total 1013 from TheMentalist|

Writing total 2642 from DawsonsCreek|Jack McPhee with weight 1.0, ratio 1...
Writing total 6166 from TheOfficeUS|Dwight Kurt Schrute III with weight 1.0, ratio 1...
Writing total 1957 from QueerAsFolk|Theodore (Ted) Schmidt with weight 1.0, ratio 1...
Writing total 1542 from DawsonsCreek|Andrea "Andie" McPhee with weight 1.0, ratio 1...
Writing total 11566 from Charmed1998|Phoebe Halliwell with weight 1.0, ratio 1...
Writing total 1774 from TheBigBangTheory|Bernadette Rostenkowski with weight 1.0, ratio 1...
Writing total 1734 from Andromeda|Tyr Anasazi with weight 1.0, ratio 1...
Writing total 4561 from TheBigBangTheory|Howard Wolowitz with weight 1.0, ratio 1...
Writing total 3315 from Futurama|Bender Bending Rodriguez (Bending Unit 22) with weight 1.0, ratio 1...
Writing total 2414 from DoctorWho|Gwen Cooper with weight 1.0, ratio 1...
Writing total 1342 from OneTreeHill|Clayton "Clay" Evans with weight 1.0, ratio 1...
Writing total 4758 from Charmed1998|Prudence 'Prue' Halliwell wi

Writing total 679 from Minor character (not enough features). with weight 1.0, ratio 1...
Writing total 4256 from DawsonsCreek|Jennifer "Jen" Lindley with weight 1.0, ratio 1...
Writing total 10329 from Supernatural|Dean Winchester with weight 5.02535169073515, ratio 1...
Writing total 1456 from SonsOfAnarchy|Gemma Teller Morrow (n?e Madock) with weight 1.0, ratio 1...
Writing total 795 from TheOfficeUS|Janet "Jan" Levinson (formerly Levinson-Gould) with weight 1.0, ratio 1...
Writing total 1284 from GilmoreGirls|Michel Gerard with weight 1.0, ratio 1...
Writing total 1060 from TheOfficeUS|Darryl Mathias Philbin with weight 1.0, ratio 1...
Writing total 916 from Futurama|John A. Zoidberg, M.D. with weight 1.0, ratio 1...
Writing total 912 from TheMentalist|Wayne Rigsby with weight 1.0, ratio 1...
Writing total 2563 from StarTrek|Doctor (Lieutenant Commander) Leonard Horatio "Bones" McCoy with weight 4.13549421592915, ratio 1...
Writing total 5581 from DoctorWho|Third Doctor with weight

Total training heads: 13530074, Avg 17.17 per sentence
REPORT: 246 out of 327 characters are in train.

----- LOCK target character TheSimpsons|Marjorie "Marge" Jacqueline Simpson (n?e Bouvier) -----
Considering 4428 out of 44288 top ranked characters.
246
Level 1 total 4428.
Level 2 total 23007
filtering on 10 accepable overlap.
----POSITIVE----
freq 75 weight 4.31748811353631: TheBigBangTheory|Sheldon Cooper from The Big Bang Theory and Young Sheldon
freq 69 weight 4.23410650459726: Friends|Monica E. Geller
freq 61 weight 4.110873864173311: Supernatural|Sam Winchester
freq 56 weight 4.02535169073515: Supernatural|Dean Winchester
freq 56 weight 4.02535169073515: Merlin2008|Merlin The Character
freq 53 weight 3.970291913552122: Merlin2008|Arthur Pendragon
freq 52 weight 3.9512437185814275: OnceUponATime|The Evil Queen/Regina Mills/Roni
freq 51 weight 3.9318256327243257: Merlin2008|Lady Morgana
freq 48 weight 3.871201010907891: Buffyverse|Joyce
freq 46 weight 3.828641396489095: Friends|

freq 3 weight 1.0986122886681098: Smallville|Jonathan Kent
freq 3 weight 1.0986122886681098: GilmoreGirls|Lucas "Luke" Danes
freq 3 weight 1.0986122886681098: Smallville|Martha Kent
freq 3 weight 1.0986122886681098: OneTreeHill|James "Jamie" Scott
freq 3 weight 1.0986122886681098: Charmed1998|Paige Matthews
freq 3 weight 1.0986122886681098: DawsonsCreek|Andrea "Andie" McPhee
freq 3 weight 1.0986122886681098: Bones|Angela Montenegro
freq 3 weight 1.0986122886681098: Roswell|Liz Parker
freq 3 weight 1.0986122886681098: Charmed1998|Prudence 'Prue' Halliwell
freq 3 weight 1.0986122886681098: GreysAnatomy|Izzie Stevens
freq 3 weight 1.0986122886681098: SonsOfAnarchy|Jackson "Jax" Teller
freq 3 weight 1.0986122886681098: TheOC|Ryan Atwood
freq 3 weight 1.0986122886681098: StarTrek|Lieutenant Commander Data
freq 3 weight 1.0986122886681098: StarTrek|Lieutenant Malcolm Reed (Dominic Keating)
freq 3 weight 1.0986122886681098: TrueBlood|Sookie
freq 3 weight 1.0986122886681098: GilmoreGirls|Zach 

100%|██████████| 200.0/200 [37:21<00:00, 10.26s/it]


sentence model considering: 682855
Some sentences may be removed due to not having enough good features.
Total of 397567/454285 sentences in consideration.
WRITING...
-- Writing train.txt, valid.txt --
Writing total 8160 from Friends|Rachel Karen Green with weight 4.828641396489095, ratio 1...
Writing total 772 from OnceUponATime|Belle/Lacey with weight 3.302585092994046, ratio 1...
Writing total 9344 from StarTrek|Captain James Tiberius Kirk with weight 1.0, ratio 1...
Writing total 4027 from StarTrek|Lieutenant Junior Grade B'Elanna Torres with weight 1.0, ratio 1...
Writing total 2308 from OnceUponATime|Princess Snow White/Mary Margaret Nolan with weight 4.367295829986475, ratio 1...
Writing total 3068 from GilmoreGirls|Richard Gilmore with weight 1.0, ratio 1...
Writing total 3692 from HowIMetYourMother|Marshall Eriksen with weight 3.302585092994046, ratio 1...
Writing total 3247 from CSIVerse|Captain Jim Brass with weight 1.0, ratio 1...
Writing total 1013 from TheMentalist|Kimbal

Writing total 1003 from Alias|Marcus Dixon with weight 1.0, ratio 1...
Writing total 4758 from Charmed1998|Prudence 'Prue' Halliwell with weight 1.0, ratio 1...
Writing total 1263 from MyLittlePonyFriendshipIsMagic|Apple Bloom with weight 3.3978952727983707, ratio 1...
Writing total 1355 from GilmoreGirls|Kirk Gleason with weight 1.0, ratio 1...
Writing total 797 from Salem|Mary Sibley with weight 1.0, ratio 1...
Writing total 1902 from OnceUponATime|Prince David "Charming"/David Nolan with weight 1.0, ratio 1...
Writing total 3135 from HowIMetYourMother|Lily Aldrin with weight 1.0, ratio 1...
Writing total 7107 from Friends|Friends Joey Tribbiani with weight 3.4849066497880004, ratio 1...
Writing total 3076 from Bones|Angela Montenegro with weight 1.0, ratio 1...
Writing total 2353 from Andromeda|Rommie with weight 1.0, ratio 1...
Writing total 640 from TheOriginals|Hayley Marshall-Kenner with weight 1.0, ratio 1...
Writing total 724 from TheOC|Jimmy Cooper with weight 3.3025850929940

Writing total 660 from Charmed1998|Billie Jenkins with weight 1.0, ratio 1...
Writing total 12127 from Bones|Temperance Brennan with weight 4.13549421592915, ratio 1...
Writing total 2954 from OnceUponATime|The Evil Queen/Regina Mills/Roni with weight 4.951243718581427, ratio 1...
Writing total 7370 from Friends|Chandler Muriel Bing with weight 3.3978952727983707, ratio 1...
Writing total 1369 from CSIVerse|Greg Sanders with weight 1.0, ratio 1...
Writing total 5983 from OneTreeHill|Brooke Davis with weight 3.995732273553991, ratio 1...
Writing total 1587 from OneTreeHill|James "Jamie" Scott with weight 1.0, ratio 1...
Writing total 707 from TheMentalist|Grace Van Pelt with weight 1.0, ratio 1...
Writing total 3461 from CSIVerse|Sara Sidle with weight 1.0, ratio 1...
Writing total 996 from Smallville|Oliver Queen / Green Arrow (Justin Hartley) with weight 1.0, ratio 1...
Writing total 1268 from GreysAnatomy|Preston Burke with weight 1.0, ratio 1...
Writing total 2890 from StarTrek|Neel

In [26]:
raise Exception('END, ALL NORMAL')

Exception: END, ALL NORMAL

In [None]:
dialogs

In [None]:
# write a fixed candidate file (for demo propose only)
def dump_candidate_files(path, n=80000):
    with open(path, 'a+') as _f:
        for line in dialogs.dia2.sample(n=n, random_state=714861):
            _f.write('{}\n'.format(clean_line(line)))

dump_candidate_files('fixed_80k.txt')

In [None]:
dialogs.char_id.value_counts()

In [None]:
chars

In [None]:
for row in dialogs.itertuples():
#     print getattr(row, "c1"), getattr(row, "c2")
    if row.dia2 == 'Yeah. But I think the thing that makes me maddest is that he\'s right.':
        print(row.Index, row.show_id, row.char_id, '\n', row.dia1, '\n', row.dia2)
        break

In [None]:
chars.loc[chars.char_id == 'l4390']

In [None]:
chars.feature.value_counts()

In [None]:
REDUCTED_HLAs

In [None]:
_total = 0
_p = 0
for k, v in REDUCTED_HLAs.items():
    _total += 1
    if k == v:
        _p += 1

print(float(_p) / float(_total))  # varify reducted to around 30%. 

In [None]:
chars.feature.unique().tolist()