In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [2]:
from itertools import chain

import nltk
import numpy as np
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

import sklearn_crfsuite
from sklearn_crfsuite import scorers
from sklearn_crfsuite import metrics

In [3]:
nltk.corpus.conll2002.fileids()

['esp.testa', 'esp.testb', 'esp.train', 'ned.testa', 'ned.testb', 'ned.train']

In [4]:
%%time
train_sents = list(nltk.corpus.conll2002.iob_sents('esp.train'))
test_sents = list(nltk.corpus.conll2002.iob_sents('esp.testb'))

CPU times: user 6.98 s, sys: 366 ms, total: 7.35 s
Wall time: 10.9 s


In [5]:
train_sents[0]

[('Melbourne', 'NP', 'B-LOC'),
 ('(', 'Fpa', 'O'),
 ('Australia', 'NP', 'B-LOC'),
 (')', 'Fpt', 'O'),
 (',', 'Fc', 'O'),
 ('25', 'Z', 'O'),
 ('may', 'NC', 'O'),
 ('(', 'Fpa', 'O'),
 ('EFE', 'NC', 'B-ORG'),
 (')', 'Fpt', 'O'),
 ('.', 'Fp', 'O')]

In [6]:
# define features 

def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    
    features = {
        'bias': 1.0,
        'word.lower()': word.lower(),
        'word[-3:]': word[-3:],
        'word[-2:]': word[-2:],
        'word.isupper()': word.isupper(),
        'word.istitle()': word.istitle(),
        'word.isdigit()': word.isdigit(),
        'postag': postag,
        'postag[:2]': postag[:2],        
    }
    if i > 0:
        word1 = sent[i-1][0]
        postag1 = sent[i-1][1]
        features.update({
            '-1:word.lower()': word1.lower(),
            '-1:word.istitle()': word1.istitle(),
            '-1:word.isupper()': word1.isupper(),
            '-1:postag': postag1,
            '-1:postag[:2]': postag1[:2],
        })
    else:
        features['BOS'] = True
        
    if i < len(sent)-1:
        word1 = sent[i+1][0]
        postag1 = sent[i+1][1]
        features.update({
            '+1:word.lower()': word1.lower(),
            '+1:word.istitle()': word1.istitle(),
            '+1:word.isupper()': word1.isupper(),
            '+1:postag': postag1,
            '+1:postag[:2]': postag1[:2],
        })
    else:
        features['EOS'] = True
        
    features['num-1'] = np.log(3.99 * ((i+1) **2))
    features['num-2'] = 0.28 * i             
        
    return features


def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label for token, postag, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

In [7]:
# a feature set is a dictionary

sent2features(train_sents[0])[0] # sequence of dictionaries

{'bias': 1.0,
 'word.lower()': 'melbourne',
 'word[-3:]': 'rne',
 'word[-2:]': 'ne',
 'word.isupper()': False,
 'word.istitle()': True,
 'word.isdigit()': False,
 'postag': 'NP',
 'postag[:2]': 'NP',
 'BOS': True,
 '+1:word.lower()': '(',
 '+1:word.istitle()': False,
 '+1:word.isupper()': False,
 '+1:postag': 'Fpa',
 '+1:postag[:2]': 'Fp',
 'num-1': 1.3837912309017721,
 'num-2': 0.0}

In [8]:
%%time
X_train = [sent2features(s) for s in train_sents]
y_train = [sent2labels(s) for s in train_sents]

X_test = [sent2features(s) for s in test_sents]
y_test = [sent2labels(s) for s in test_sents]   # list of lists

print("> dim(X_train): {}, dim(X_test):{} ".format(len(X_train), len(X_test)))
print("> dtype | type(X_train[0]): {}, type(X_train[0][0]): {}".format(type(X_train[0]), type(X_train[0][0])))
print("> y_test: {}".format(len(y_test)))

> dim(X_train): 8323, dim(X_test):1517 
> dtype | type(X_train[0]): <class 'list'>, type(X_train[0][0]): <class 'dict'>
> y_test: 1517
CPU times: user 5.68 s, sys: 425 ms, total: 6.11 s
Wall time: 9.04 s


In [9]:
%%time
crf = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    c1=0.1, 
    c2=0.1, 
    max_iterations=100, 
    all_possible_transitions=True
)

print("> X_train[0][0]: {}".format(X_train[0][0]))
print("> X_train[0]: {}".format(X_train[0]))  # list of dictionary
print("> y_train[0]: {}".format(y_train[0]))  # list of strings

crf.fit(X_train, y_train)

> X_train[0][0]: {'bias': 1.0, 'word.lower()': 'melbourne', 'word[-3:]': 'rne', 'word[-2:]': 'ne', 'word.isupper()': False, 'word.istitle()': True, 'word.isdigit()': False, 'postag': 'NP', 'postag[:2]': 'NP', 'BOS': True, '+1:word.lower()': '(', '+1:word.istitle()': False, '+1:word.isupper()': False, '+1:postag': 'Fpa', '+1:postag[:2]': 'Fp', 'num-1': 1.3837912309017721, 'num-2': 0.0}
> X_train[0]: [{'bias': 1.0, 'word.lower()': 'melbourne', 'word[-3:]': 'rne', 'word[-2:]': 'ne', 'word.isupper()': False, 'word.istitle()': True, 'word.isdigit()': False, 'postag': 'NP', 'postag[:2]': 'NP', 'BOS': True, '+1:word.lower()': '(', '+1:word.istitle()': False, '+1:word.isupper()': False, '+1:postag': 'Fpa', '+1:postag[:2]': 'Fp', 'num-1': 1.3837912309017721, 'num-2': 0.0}, {'bias': 1.0, 'word.lower()': '(', 'word[-3:]': '(', 'word[-2:]': '(', 'word.isupper()': False, 'word.istitle()': False, 'word.isdigit()': False, 'postag': 'Fpa', 'postag[:2]': 'Fp', '-1:word.lower()': 'melbourne', '-1:word.i

In [10]:
labels = list(crf.classes_)
labels.remove('O')
print(labels)

# dir(crf)
crf.state_features_   # Dict with state feature coefficients – {(attr_name, label) -- coef}
# crf.transition_features_

['B-LOC', 'B-ORG', 'B-PER', 'I-PER', 'B-MISC', 'I-ORG', 'I-LOC', 'I-MISC']


{('bias', 'B-LOC'): -0.298796,
 ('bias', 'O'): 1.532135,
 ('bias', 'B-ORG'): -0.035727,
 ('bias', 'B-PER'): -0.207561,
 ('bias', 'I-PER'): -0.702335,
 ('bias', 'B-MISC'): -0.064028,
 ('bias', 'I-ORG'): -0.408596,
 ('bias', 'I-LOC'): -0.017751,
 ('bias', 'I-MISC'): -0.083637,
 ('word.lower():melbourne', 'B-LOC'): 1.113998,
 ('word.lower():melbourne', 'I-MISC'): 0.407021,
 ('word[-3:]:rne', 'B-LOC'): 0.623061,
 ('word[-3:]:rne', 'B-ORG'): 0.000644,
 ('word[-3:]:rne', 'I-MISC'): 0.376198,
 ('word[-2:]:ne', 'B-LOC'): 6.3e-05,
 ('word[-2:]:ne', 'O'): 0.695846,
 ('word[-2:]:ne', 'B-ORG'): 0.232472,
 ('word[-2:]:ne', 'B-PER'): 0.683416,
 ('word[-2:]:ne', 'I-PER'): 0.003849,
 ('word[-2:]:ne', 'I-ORG'): -0.020744,
 ('word[-2:]:ne', 'I-MISC'): -0.025966,
 ('word.isupper()', 'B-LOC'): 0.304402,
 ('word.isupper()', 'O'): -5.749025,
 ('word.isupper()', 'B-ORG'): 2.659295,
 ('word.isupper()', 'B-PER'): 0.270291,
 ('word.isupper()', 'I-PER'): -0.176889,
 ('word.isupper()', 'B-MISC'): 2.25353,
 ('word

In [11]:
y_pred = crf.predict(X_test)
score = metrics.flat_f1_score(y_test, y_pred, 
                      average='weighted', labels=labels)

print('> flat f1 score: {}'.format(score))
print("> y_pred: {}".format(y_pred[:10]))

> flat f1 score: 0.7965347522966183
> y_pred: [['B-LOC', 'I-LOC', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O'], ['O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'O', 'O', 'B-LOC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'I-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'I-LOC', 'O'], ['B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-MISC', 'O'], ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-LOC', 'I-LOC', 'O'], ['B-PER', 'O', 'B-LOC', 'O', 'O', 'O'

In [21]:
from sklearn.preprocessing import MinMaxScaler

def standardize(X, scaler=None): 
    """
    X: numpy array, or 
       a list of (list of dictionaries) for sequence model (e.g. CRF)
       

    """
    from sklearn.preprocessing import MinMaxScaler

    scaler = {}  # vdict
    if isinstance(X[0][0], dict):  # X[0] is a list (of dictionaries)
        N = len(X)
        fset = set()
        for d in X[0]: 
            fset.update( d.keys() )

        # ... fit
        scaler = {f: {'min': np.inf , 'max': -np.inf} for f in fset}  # value dictionary
        for f in fset: 
            
            for j in range(N): 
                dseq = X[j]
                fv = [di[f] for di in dseq if f in di]  # this variable across all feature dict 
                min_j, max_j = min(fv), max(fv)

                if min_j < scaler[f]['min']: scaler[f]['min'] = min_j
                if max_j > scaler[f]['max']: scaler[f]['max'] = max_j
        # each feature now has its min, and max

        # ... transform
        for f in fset: 
            for j in range(N): 
                dseq = X[j]
                for i, di in enumerate(dseq): 
                    if f in di: 
                        di[f] = (di[f]-scaler[f]['min'])/(scaler[f]['max']-scaler[f]['min']+0.0)
                        # dseq[i] = di
    else: 
        if scaler is None: scaler = MinMaxScaler()
        X = scaler.fit_transform(X)
        
    return X, scaler

def transform(X, scaler=None): 
    if scaler is None or not scaler: 
        # no-op
        return X

    if isinstance(X[0][0], dict):  # X[0] is a list (of dictionaries)
        assert isinstance(scaler, dict)

        fset = list(scaler.keys())
        N = len(X)

        # transform
        for f in fset: 
            for j in range(N): 
                dseq = X[j]
                for i, di in enumerate(dseq): 
                    if f in di: 
                        di[f] = (di[f]-scaler[f]['min'])/(scaler[f]['max']-scaler[f]['min']+0.0)
                        # dseq[i] = di
    else: 
        X = scaler.transform(X)
    
    return X

In [22]:
import sklearn as sk
from sklearn.linear_model import LogisticRegression
import pandas as pd
import os

prefix = '/Users/pleiades/work/data/diabetes'
# os.chdir('/Users/stevenhurwitt/Documents/Blog/Classification')
fpath = os.path.join(prefix, 'diabetes.csv')
assert os.path.exists(fpath)
df = pd.read_csv(fpath, sep=',',header=0)
print(df.head())

k = 'class'
print("> unique values of {}: {}".format(k, df[k].unique()))

# creating a dict file  
labels = {'tested_negative': 0,'tested_positive': 1} 
df[k] = [labels[v] for v in df[k]]

y = df['class'].values
X = df.drop('class', axis=1).values

print("> values(y): {}".format(np.unique(y)))
print("> dim(X): {}, dim(y): {}".format(X.shape, y.shape))

LR = LogisticRegression(random_state=0, solver='lbfgs', multi_class='ovr').fit(X, y)
y_proba = LR.predict_proba(X)
print(y_proba[:, 1])

   preg  plas  pres  skin  insu  mass   pedi  age            class
0     6   148    72    35     0  33.6  0.627   50  tested_positive
1     1    85    66    29     0  26.6  0.351   31  tested_negative
2     8   183    64     0     0  23.3  0.672   32  tested_positive
3     1    89    66    23    94  28.1  0.167   21  tested_negative
4     0   137    40    35   168  43.1  2.288   33  tested_positive
> unique values of class: ['tested_positive' 'tested_negative']
> values(y): [0 1]
> dim(X): (768, 8), dim(y): (768,)
[0.71097486 0.05553948 0.82575251 0.0450562  0.95208769 0.15461324
 0.07054015 0.63181601 0.63291146 0.04538278 0.20439439 0.89175845
 0.84839091 0.57971155 0.6322854  0.45062654 0.34578912 0.20218305
 0.3175605  0.24564533 0.40207693 0.30226711 0.9282186  0.28544346
 0.63233778 0.40399456 0.69516431 0.06031307 0.5027941  0.27104115
 0.40428383 0.61432906 0.05851874 0.04228352 0.43524466 0.20691687
 0.66001291 0.40302963 0.17918789 0.63728229 0.72050958 0.70386856
 0.11186769



In [23]:
from pandas import read_csv
def read_fold(path, fold, shuffle=False, test_ratio=None, fold_count=5, random_state=None):
    if shuffle: 
        from sklearn.model_selection import train_test_split

        if random_state is None: random_state = int(time.time()+random.randint(1, 1000)+fold)

        # aggregate and then reshuffle
        dfs = []
        train_df = read_csv('%s/validation-%i.csv.gz' % (path, fold), index_col = [0, 1], compression = 'gzip')
        test_df = read_csv('%s/predictions-%i.csv.gz' % (path, fold), index_col = [0, 1], compression = 'gzip')
        train_size = train_df.shape[0]
        test_size = test_df.shape[0]
        dfs.extend([train_df, test_df])

        # aggregate 
        df = concat(dfs, axis = 0)

        labels = df.index.get_level_values('label').values
        # print('... ids: {ids}'.format(ids=df.index.get_level_values('id').values[:100]))
        
        ### train-test split
        if test_ratio is None or test_ratio <= 0: test_ratio = 1/(fold_count+0.0)

        train_df, test_df, train_labels, test_labels = train_test_split(df, labels, test_size=test_ratio, 
            shuffle=True, stratify=labels, random_state=random_state)  # random_state
        # assert all(train_df.index.get_level_values('label').values == train_labels)
    else:
        train_df        = read_csv('%s/validation-%i.csv.gz' % (path, fold), index_col = [0, 1], compression = 'gzip')
        test_df         = read_csv('%s/predictions-%i.csv.gz' % (path, fold), index_col = [0, 1], compression = 'gzip')
        train_labels    = train_df.index.get_level_values('label').values
        test_labels     = test_df.index.get_level_values('label').values
    return train_df, train_labels, test_df, test_labels

path = '/Users/pleiades/work/data/pf2'
train_df, train_labels, test_df, test_labels = read_fold(path, 1)
R = train_df.values.T
T = test_df.values.T 

print("> dim(R): {}, dim(T): {}".format(R.shape, T.shape))

> dim(R): (50, 3183), dim(T): (50, 796)


In [24]:
# numerical prediction
import collections

def estimateLabelMatrix(R, L=[], p_th=[], pos_label=1, neg_label=0, ratio_small_class=0.01):
    Lh = np.zeros(R.shape).astype(int)
    
    for i in range(R.shape[0]):  # foreach user/classifeir
        cols_pos = R[i] >= p_th[i]
        Lh[i, cols_pos] = pos_label

    return Lh

def correctness_matrix(X, L, p_th, target_label=None): 
    """
    Compute a binary matrix in which 1 represents a correct prediction (i.e. TP or TN), 
    and 0 represents a false prediction (i.e. FP or FN). Predicted labels (Lh) are determined by the given probability threshold (p_th). 
    
    Lh(X, p_th)

    Lh are compared with the "ground truth" L to determine the correctness. 
    """
    Lh = estimateLabelMatrix(X, p_th=p_th)
    
    if target_label is not None: 
        return ((Lh == L[None, :]) & (Lh == target_label)).astype(int)
    return (Lh == L[None, :]).astype(int), Lh

def polarity_matrix(X, L, p_th, reduced_negative=-1, pos_label=1, neg_label=0): 
    Mc, Lh = correctness_matrix(X, L, p_th)
    return to_polarity(Mc), Lh

def color_matrix(X, L, p_th, reduced_negative=False, pos_label=1, neg_label=0): 
    sample_types = ['tp', 'tn'] + ['fp', 'fn']
    codes = {'tp': 2, 'tn': 1, 'fp': -2, 'fn': -1, 
            'unk': 0, 't': 3, 'f': -3}

    Mc, Lh = correctness_matrix(X, L, p_th)  # Mc is a (0, 1)-matrix
    n_users = X.shape[0]

    predict_pos = (Lh == pos_label)  # Given BP's prediction Lh, select entries ~ target label
    predict_neg = (Lh == neg_label)

    cells_tp = (Mc == pos_label) & predict_pos   # estimated
    cells_tn = (Mc == pos_label) & predict_neg
    cells_fp = (Mc == neg_label) & predict_pos
    cells_fn = (Mc == neg_label) & predict_neg

    Pc = np.zeros(X.shape)
    Pc[cells_tp] = codes['tp']
    Pc[cells_tn] = codes['tn']

    if reduced_negative: 
        Pc[cells_fp | cells_fn] = -1
    else: 
        Pc[cells_fp] = codes['fp']
        Pc[cells_fn] = codes['fn']

    return Pc, Lh

def to_polarity(M, verify=False): 
    # from preference matrix to polarity matrix

    # if verify: 
    #     vmin, vmax = np.min(M), np.max(M)

    import scipy.sparse as sparse
    P = np.ones(M.shape)  
    if sparse.issparse(M):      
        P[M.toarray() == 0] = -1    # preference 0 ~ negative polarity 
        P = sparse.csr_matrix(P)
    else: 
        P[M == 0] = -1 
    return P

pos_label, neg_label = 1, 0 

X = np.hstack([R, T]) # np.random.random(dim)
Nu, Ni = X.shape 
dim = (Nu, Ni)

Lh = np.zeros(X.shape)
Lh[X>=0.5] = 1

p_th = [0.5] * X.shape[0]

# create labels 
L = np.zeros(Lh.shape[1])
for j in range(X.shape[1]): 
    # majority vote of the user/classifiers for data j is True => positive
    if collections.Counter(X[:, j] >= p_th).most_common(1)[0][0]: # if it's that majory vote says positive
         L[j] = pos_label

print("> X: {}, L: {}".format(X[:10, :10], Lh[:10, :10]))

n_train = int(0.7 * Ni)
Lr, Lt = L[:n_train], L[n_train:]
y_train, y_test = Lr, Lt    # <<< 

Mc, Lh = correctness_matrix(X, L, p_th)
Lhr, Lht = Lh[:, :n_train], Lh[:, n_train:]

R, T = X[:, :n_train], X[:, n_train:]
Mcr, Mct = Mc[:, :n_train], Mc[:, n_train:]
assert Lhr.shape == R.shape
assert Lht.shape == T.shape
assert Mcr.shape == R.shape, "dim(Mcr): {}, dim(R): {}".format(Mcr.shape, R.shape)
print('> dim(R): {}, dim(T): {}'.format(R.shape, T.shape))


> X: [[2.70000e-04 0.00000e+00 1.67009e-01 5.66828e-01 9.10968e-01 8.41993e-01
  9.99994e-01 9.93500e-03 1.67000e-04 3.63000e-04]
 [2.05263e-01 2.64447e-01 9.98612e-01 9.99422e-01 7.12024e-01 9.18862e-01
  6.53757e-01 4.60000e-05 4.49200e-03 0.00000e+00]
 [3.83500e-03 1.16000e-04 5.24148e-01 8.59905e-01 9.94505e-01 8.26940e-02
  9.98994e-01 1.63100e-03 3.11000e-04 4.38470e-02]
 [2.06500e-03 0.00000e+00 9.98423e-01 9.86084e-01 3.34956e-01 9.85210e-01
  9.91423e-01 0.00000e+00 2.50000e-04 7.44184e-01]
 [1.00000e-06 1.00000e-06 8.55388e-01 8.60899e-01 9.79106e-01 7.35852e-01
  9.99289e-01 2.79300e-03 8.69100e-03 3.70110e-02]
 [1.09200e-03 0.00000e+00 9.35765e-01 8.96329e-01 9.97751e-01 1.31276e-01
  9.97744e-01 4.72880e-02 1.44000e-04 1.75000e-03]
 [6.05490e-01 0.00000e+00 8.03895e-01 9.95544e-01 4.97894e-01 5.97560e-02
  9.99955e-01 9.16805e-01 1.14500e-03 0.00000e+00]
 [6.04367e-01 3.61919e-01 2.92312e-01 9.93239e-01 9.72625e-01 1.53160e-02
  9.97028e-01 5.14100e-03 2.21900e-03 6.48510e

In [25]:
class Polarity(object): 
    sample_types = ['tp', 'tn'] + ['fp', 'fn']
    codes = {'tp': 2, 'tn': 1, 'fp': -2, 'fn': -1, 
            'unk': 0, 't': 3, 'f': -3}

sample_types = ['tp', 'tn'] + ['fp', 'fn']
codes = {'tp': 2, 'tn': 1, 'fp': -2, 'fn': -1, 
        'unk': 0, 't': 3, 'f': -3}

predict_pos = (Lhr == pos_label)  # Given BP's prediction Lh, select entries ~ target label
predict_neg = (Lhr == neg_label)
cells_tp = (Mcr == 1) & predict_pos   # estimated
cells_tn = (Mcr == 1) & predict_neg
cells_fp = (Mcr == 0) & predict_pos
cells_fn = (Mcr == 0) & predict_neg

scopes = {st: {} for st in sample_types}   # scope['tp'][0]: to be true positive, 0th classifier must have this proba range
for i in range(R.shape[0]):  # foreach classifier
    scopes['tp'][i] = {}

    # TPs
    v = R[i, :][cells_tp[i, :]]
    if len(v) > 0: 
        scopes['tp'][i] = {'min': np.min(v), 'max': np.max(v), 'mean': np.mean(v), 'median': np.median(v), 'sample': np.sort(v)}   # min, max, mean, median

    # TNs 
    v2 = R[i, :][cells_tn[i, :]]
    if len(v2) > 0: 
        scopes['tn'][i] = {'min': np.min(v2), 'max': np.max(v2), 'mean': np.mean(v2), 'median': np.median(v2), 'sample': np.sort(v)}   

    # ... positive polarity candidates 
    assert scopes['tp'][i]['median'] != scopes['tn'][i]['median'] 

    # FPs ~ TPs
    v3 = R[i, :][cells_fp[i, :]]
    if len(v3) > 0: 
        scopes['fp'][i] = {'min': np.min(v3), 'max': np.max(v3), 'mean': np.mean(v3), 'median': np.median(v3), 'sample': np.sort(v)}

    # FNs ~ TNs
    v4 = R[i, :][cells_fn[i, :]]
    if len(v4) > 0: 
        scopes['fn'][i] = {'min': np.min(v4), 'max': np.max(v4), 'mean': np.mean(v4), 'median': np.median(v4), 'sample': np.sort(v)}   
    # ... negative polarity candidates

In [26]:

def get_feature_sequence(R, j, p_th, Rm=None, C=None, Lh=None, p_model={}, name='', index=0, verbose=False, wsize=20): 
    Nu, Ni = R.shape
    fdx = []
    for i in range(Nu):  # foreach user/classifier index while holding item index fixed 
        # one feature dictionary per entry
        fd = get_vars_hstats(R, i, j, p_th=p_th, Rm=Rm, C=C, Lh=Lh, p_model=p_model, name=name, wsize=wsize, verbose=verbose, index=index, to_dict=True) 
        # fdv = get_vars_vstats(R, i, j, p_th=p_th, Rm=Rm, C=C, Lh=Lh, name=name, wsize=wsize, verbose=verbose, index=index, to_dict=True)
        # fd.update(fdv) # merge two dictionaries 

        fdx.append(fd)

    # output: a list of feature dictionaries, one per entry/classifier/user while holding column(j) fixed
    return fdx    

def get_vars_hstats(R, i, j, p_th, Rm=None, C=None, Lh=None, p_model={}, r_min=0.1, name='', index=0, verbose=False, wsize=20, to_dict=False):  
    # get BP prediction vector statistics as variables
    from scipy.stats import kurtosis, skew, ks_2samp

    # sample_types = ['tp', 'tn'] + ['fp', 'fn']
    # codes = {'tp': 2, 'tn': 1, 'fp': -2, 'fn': -1, 
    #         'unk': 0, 't': 3, 'f': -3}
    sample_types = Polarity.sample_types
    codes = Polarity.codes

    msg = ""
    # query point 
    q = pt_q = R[i, j]   # q

    fv = []  # features 
    fvn = []  # feature names
    
    # values 
    fv.append(q)
    fvn.append('value')
    
    # origina data index 
#     fv.append(j)
#     fvn.append('index')

    max_gap = 1.0

    N = R.shape[1]
    rk = -1   # rank of the query point
    
    wsize_min, wsize_max = N//100, 20 
    wsize = min(wsize_max, max(wsize_min, wsize))

    vn = 'delta_pth'
    delta = pt_q - p_th[i]
    # fv.append( delta ); fvn.append(vn)
    # ... case q > p_th, likely TP if L = 1, or FP if L = 0
    # ... case q <= p_th, likely FN if L = 1, or TN if L = 0

    ### rank?  can also use q-
    if Rm is not None and Lh is not None:
        vn = 'rank'   # label-specific rank

        if delta >= 0: 
            pts = Rm[i, :][Lh[i, :] == pos_label]
            rk = np.searchsorted(pts, pt_q, side='left')
        else: 
            # negative rank
            pts = Rm[i, :][Lh[i, :] == neg_label]
            n_pts = len(pts)
            rkc = np.searchsorted(pts, pt_q, side='left')
            rk = -((n_pts+1)-rkc)

        # N = Rm.shape[1]
        # Rm: either a sorted array (or a rank array)
        # r = np.searchsorted(R[i, :], q, side='left')  
        fv.append( rk ); fvn.append(vn) 

    # --- (raw) confidence score ---
    vn = 'c-score'
    if C is not None: 
        fv.append(C[i, j])
        fvn.append(vn)

    # assert len(fv.shape) == 1
    assert len(fv) == len(fvn), "dim(fv): {} <> dim(fvn): {}".format(len(fv), len(fvn))
    if verbose: 
        # for vn, v in zip(fvn, fv): 
        msg += "(get_vars_hstats) vars name values ({}):\n... {}\n".format(name, list(zip(fvn, fv)))
        # print("... q: {}, topk_th: {} | r_min: {}".format(q, topk_th, r_min))   # ... ok
        print(msg)

    if to_dict: 
        return dict(zip(fvn, fv))

    return np.array(fv)

In [27]:
# generate features 
def numeric_to_str(labels, codes={}): 
    if len(codes) == 0: codes = Polarity.codes
    
    # inverse the codes 
    inv_codes = {num: stype for stype, num in codes.items()}
    return [inv_codes[l] for i, l in enumerate(labels)]
def str_to_numeric(labels, codes={}): # 
    if len(codes) == 0: codes = Polarity.codes
    return [codes[l] for i, l in enumerate(labels)]
    
def to_numeric(Yh, codes={}): 
    if len(codes) == 0: codes = Polarity.codes
    
    Yhn = []
    for y in Yh: # foreach label sequence/list
        Yhn.append( [codes[e] for e in y] )
    return Yhn

import scipy.stats as stats

tMulticlass = True
gamma = 2

n_users = nbp = R.shape[0]
n_train = n_items = R.shape[1]
Rs = np.sort(R, axis=1)

# subsample the negative to match sample size of the positive
Lr = L[:n_train]
Mcr, Lhr = correctness_matrix(R, Lr, p_th)

pos_sample = np.where(Lr == pos_label)[0]  # <<< 
neg_sample = np.where(Lr == neg_label)[0]
n_neg = len(neg_sample)
n_pos = len(pos_sample)

neg_sample = np.random.choice(neg_sample, min(n_pos * gamma, n_neg), replace=False)
# neg_sample = neg_sample[:n_pos * gamma]

print("> neg_sample: {}".format(neg_sample[:10]))
print("> pos_sample: {}".format(pos_sample[:10]))

Xset, yset = [], []
iset = []
    
for j in neg_sample: # foreach negative-class example 

    # get feature repr for j-th item by varying user (i) while holding item (j) fixed
    fseq = get_feature_sequence(R, j, p_th, Rm=Rs, Lh=Lhr, p_model=scopes, name='TN-FP-seq', verbose=False, wsize=20) # feature sequence
    # ... a list of feature dictionaries
    Xset.append(fseq)

    # each element of the label sequence is either a TN or an FP
    ls_j = []  # label sequence for j-th item
    for i in range(n_users):  # foreach user/classifeir
        if Mcr[i, j] == 1: 
            polarity = 'pos'
            label = 'tn' #  codes['tn'] if tMulticlass else polarity
        elif Mcr[i, j] == 0: 
            polarity = 'neg'
            label = 'fp'  # codes['fp'] if tMulticlass else polarity
        else: 
            raise ValueError
        ls_j.append(label)

    assert len(fseq) == len(ls_j)
    yset.append(ls_j)
    iset.append(j)

#########################

for j in pos_sample:  # foreach positive-class example

    fseq = get_feature_sequence(R, j, p_th, Rm=Rs, Lh=Lhr, p_model=scopes, name='TP-FN-seq', verbose=False, wsize=20) # feature sequence
    Xset.append(fseq)

    # each element of the label sequence is either a TP or an FN
    ls_j = []  # label sequence for j-th item
    for i in range(n_users):  # foreach user/classifeir
        if Mcr[i, j] == 1: 
            polarity = 'pos'
            label = 'tp' # codes['tp'] if tMulticlass else polarity
        elif Mcr[i, j] == 0: 
            polarity = 'neg'
            label = 'fn' # codes['fn'] if tMulticlass else polarity
        else: 
            raise ValueError
        ls_j.append(label)

    assert len(fseq) == len(ls_j)
    yset.append(ls_j)
    iset.append(j)
        

> neg_sample: [1207 1142 1809 1283 1647 1547 2031 1258 1058 2523]
> pos_sample: [ 2  3  4  6 15 20 23 24 28 32]


In [28]:
# now train a sequence classifier
model = sklearn_crfsuite.CRF(
    algorithm='lbfgs', 
    c1=0.1, 
    c2=0.1, 
    max_iterations=100, 
    all_possible_transitions=True
)

print("> prior to standardizing ...")
print("> Xset[0][0]: {}".format(Xset[0][0]))

j = 192
j_eff = iset[j] # Xset[j][0]['index']

# j-th row vector corresponds to ieff-th column originally
print("> Xset[j]: {}, R[j_eff]: {} | j_eff: {}".format(Xset[j], R[:, j_eff], j_eff))
print("> yset[j]: {}".format(yset[j]))

Pc, Lhr2 = color_matrix(R, Lr, p_th)
assert np.all(Lhr == Lhr2)

sl = numeric_to_str(Pc[:, j_eff])
print("> polarity: {}".format(sl)) 
print("> polarity: {} =?= {}".format( str_to_numeric(sl), Pc[:, j_eff]))


> prior to standardizing ...
> Xset[0][0]: {'value': 0.017227000000000003, 'rank': -1200}
> Xset[j]: [{'value': 0.149773, 'rank': -949}, {'value': 0.925999, 'rank': 972}, {'value': 0.0, 'rank': -1658}, {'value': 0.99724, 'rank': 1074}, {'value': 0.988092, 'rank': 994}, {'value': 0.958353, 'rank': 949}, {'value': 0.940753, 'rank': 1024}, {'value': 0.097081, 'rank': -960}, {'value': 0.534, 'rank': 700}, {'value': 0.000637, 'rank': -1390}, {'value': 1.0, 'rank': 815}, {'value': 1.0, 'rank': 771}, {'value': 0.0, 'rank': -1744}, {'value': 0.0, 'rank': -1813}, {'value': 0.0, 'rank': -1740}, {'value': 0.0, 'rank': -1732}, {'value': 0.0, 'rank': -1701}, {'value': 0.0, 'rank': -1740}, {'value': 0.0, 'rank': -1723}, {'value': 0.0, 'rank': -1777}, {'value': 0.988338, 'rank': 962}, {'value': 0.172652, 'rank': -988}, {'value': 0.006876999999999999, 'rank': -1492}, {'value': 0.185341, 'rank': -903}, {'value': 0.5168470000000001, 'rank': 636}, {'value': 0.185975, 'rank': -974}, {'value': 0.096616, 'r

In [30]:
msg = ''

Xset, scaler = standardize(Xset)

model.fit(Xset, yset) # remember to take transpose
polarity_labels = list(model.classes_)
print("> labels: {}".format(polarity_labels)) 


> labels: ['tn', 'fp', 'fn', 'tp']


In [31]:
Mc, Lh = correctness_matrix(X, L, p_th)
Lhr, Lht = Lh[:, :n_train], Lh[:, n_train:]
Mcr, Mct = Mc[:, :n_train], Mc[:, n_train:]

T = np.zeros(T.shape)
Ts = np.sort(T, axis=1)
Xset, yset = [], []
for j in range(T.shape[1]):

    # get feature repr for j-th item by varying user (i) while holding item (j) fixed
    fseq = get_feature_sequence(T, j, p_th, Rm=Ts, Lh=Lht, p_model=scopes, name='predict-T', verbose=False, wsize=20) # feature sequence
    # ... a list of feature dictionaries
    Xset.append(fseq)  # Xset[j] -> feature sequence for the j-th column/item

    # [test]
    ls_j = []  # label sequence for j-th item
    for i in range(n_users):  # foreach user/classifeir
        ################################################################################
        if Lht[i, j] == pos_label and Mct[i, j] == 1: 
            ls_j.append('tp' if tMulticlass else '+' )
        elif Lht[i, j] == pos_label and Mct[i, j] == 0:
            ls_j.append('fp' if tMulticlass else '-' )  # 'fp'
        elif Lht[i, j] == neg_label and Mct[i, j] == 1:
            ls_j.append('tn' if tMulticlass else '+'  )
        elif Lht[i, j] == neg_label and Mct[i, j] == 0:
            ls_j.append('fn' if tMulticlass else '-' ) # 'fn'
        ################################################################################
    # [test]
    assert len(fseq) == len(ls_j) == T.shape[0], \
        "size(feature seq): {}, size(label seq): {}, n_classifiers: {}".format(len(fseq), len(ls_j), T.shape[0])
    yset.append(ls_j)

Xset = transform(Xset, scaler)
y_pred = model.predict( Xset )  # transform(Xset, scaler)

# convert to np.array format 
M = np.zeros(T.shape)
for j, yj in enumerate(y_pred): 
    if j == 0: assert len(yj) == T.shape[0]
        
    # yj: sequence/list of strings as labels
    M[:, j] = str_to_numeric(yj)

f1 = metrics.flat_f1_score(yset, y_pred, 
              average='weighted', labels=polarity_labels)
msg += "(polarity_modeling) flat F1 score on T: {}\n".format(f1)

print(msg)

(polarity_modeling) flat F1 score on T: 0.3940633926196925



  'precision', 'predicted', average, warn_for)


In [None]:
y_test = yset

# group B and I results
sorted_labels = sorted(
    polarity_labels, 
    key=lambda name: (name[1:], name[0])
)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

In [None]:
### model selection

def optimize_crf_params(X, y, model, labels, max_size=5000): 
    import scipy.stats as stats
    from sklearn.metrics import make_scorer
    # import sklearn_crfsuite
    from sklearn_crfsuite import scorers
    from sklearn_crfsuite import metrics

    params_space = {
        'c1': stats.expon(scale=0.5),
        'c2': stats.expon(scale=0.05),
    }

    # use f1 for evaluation
    f1_scorer = make_scorer(metrics.flat_f1_score, 
                            average='weighted', labels=labels)

    # search
    rs = RandomizedSearchCV(model, params_space, 
                            cv=3, 
                            verbose=1, 
                            n_jobs=-1, 
                            n_iter=50, 
                            scoring=f1_scorer)

    X_train, y_train = X, y 
    N = len(X_train)
    if N > max_size: 
        indices = np.random.choice(range(N), max_size)
        X_train = list( np.asarray(X_train)[indices] )
        y_train = list( np.asarray(y_train)[indices] )

    rs.fit(X_train, y_train)

    print('best params:', rs.best_params_)
    print('best CV score:', rs.best_score_)
    print('model size: {:0.2f}M'.format(rs.best_estimator_.size_ / 1000000))

#     validate_crf_params(rs, output_path=None, dpi=300)

    return rs.best_estimator_


# Utility function to report best scores
def report(results, n_top=3):
    for i in range(1, n_top + 1):
        candidates = np.flatnonzero(results['rank_test_score'] == i)
        for candidate in candidates:
            print("Model with rank: {0}".format(i))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                  results['mean_test_score'][candidate],
                  results['std_test_score'][candidate]))
            print("Parameters: {0}".format(results['params'][candidate]))
            print("")



In [None]:
# Xset = transform(Xset, scaler)

model = optimize_crf_params(Xset, yset, model, labels=polarity_labels, max_size=5000)

In [None]:
# crf = model.best_estimator_
X_test, y_test = Xset, yset
y_pred = model.predict( X_test)
print(metrics.flat_classification_report(
    y_test, y_pred, labels=sorted_labels, digits=3
))

f1 = metrics.flat_f1_score(y_test, y_pred, 
              average='weighted', labels=polarity_labels)
msg += "(polarity_modeling) flat F1 score on T: {} AFTER model selection\n".format(f1)

print(msg)