# Examples

This notebook contains two example workflows, both based on matching two versions of S&P 500 companies. Data can be downloaded with the code in the notebook `downloads.ipynb` in the same folder.

Currently, only core functions are implemented in the library, so the examples below contain a lot of administrative boilerplate code. This code will eventually move into higher-level interfaces in the library itself.

In [None]:
%matplotlib inline

from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder

from ptfidf import utils as ut
from ptfidf.train.aggregation import get_group_statistics, compress_group_statistics
from ptfidf.train.inference import map_estimate
from ptfidf.core import get_log_proba, get_log_prior, get_proba

In [None]:
def label_training_data(names, target, matched_symbols):
    """
    Label training data.

    Names whose symbol is in matched_symbols are assigned to the same
    entity (one per symbol), named after the target source. All other
    names keep an entity named after their own source.

    Parameters
    ----------
    names : pandas.DataFrame
        columns ['source', 'symbol', ...]
    target : str
        value of source that is to be considered target list
    matched_symbols : sequence or int
        if sequence, these symbols are matched. If int (Python or numpy
        integer), draw that many distinct symbols randomly.

    Returns
    -------
    pandas.Series
        named 'entity' and indexed like names. Values are
        '{source}:{symbol}' where source equals target for
        matched symbols.
    """
    # numpy integers (e.g. taken from array sizes) are not instances of int
    if isinstance(matched_symbols, (int, np.integer)):
        matched_symbols = np.random.choice(
            np.unique(names['symbol']), replace=False, size=int(matched_symbols))
    is_matched = names['symbol'].isin(matched_symbols)
    # keep the own source for unmatched names, substitute target for matched ones;
    # building the labels in one pass avoids filling a Series created without dtype
    source = names['source'].where(~is_matched, target)
    return (source + ':' + names['symbol']).rename('entity')


def get_evaluation_data(proba, correct_assignments):
    """
    Build a table of scored candidate pairs flagged as correct or not.

    Parameters
    ----------
    proba : sparse matrix
        match scores; converted to long format by ut.sparse_to_frame
        (presumably with columns ['row', 'col', 'data'] - confirm
        against ptfidf.utils).
    correct_assignments : sequence of (row, col) pairs
        positions in proba that correspond to correct matches.

    Returns
    -------
    pandas.DataFrame
        the frame returned by ut.sparse_to_frame with column 'data'
        renamed to 'score' and an integer column 'y' that is 1 for
        correct assignments and 0 otherwise.
    """
    correct = pd.DataFrame(correct_assignments, columns=['row', 'col']).assign(y=1)
    # left join keeps every scored pair; pairs that are not correct get NaN -> 0
    auc_data = ut.sparse_to_frame(proba).merge(correct, on=['row', 'col'], how='left')
    auc_data['y'] = auc_data['y'].fillna(0).astype(int)
    return auc_data.rename(columns={'data': 'score'})

# Loading / Preprocessing

In [None]:
# load both company lists and stack them into one table; each row is tagged
# with the list it came from (file name up to the first dot)
datadir = Path('../data').resolve()
filenames = ['wikipedia.csv', 'slickcharts.csv']
frames = [
    pd.read_csv(datadir / fn).assign(source=fn.split('.')[0])
    for fn in filenames
]
names = pd.concat(frames, axis=0)
names = names.sort_values(['symbol', 'source']).reset_index(drop=True)

# remove symbols that cannot be matched correctly with the current preprocessing

# Share classes: names differ only by an appended "Class <X>", and the
# single letter <X> is dropped in the preprocessing
duplicate_companies = ['GOOGL', 'NWSA', 'DISCA', 'FOXA', 'UAA']
# Spelling variations leave no (or almost no) token overlap
impossible_symbols = ['VFC', 'PHM', 'UNH', 'LLL']
excluded_symbols = duplicate_companies + impossible_symbols
names = names.loc[~names['symbol'].isin(excluded_symbols)]

# binary bag-of-words representation of the company names
vec = CountVectorizer(binary=True)
vec.fit(names['name'])
X = vec.transform(names['name'])
# column index -> token, the inverse of the fitted vocabulary
vocabulary_inv = {column: token for token, column in vec.vocabulary_.items()}

# Online Training

For online training, companies are labeled correctly one by one (here in batches of 20). After each batch is labeled by the user, the predictions for the next batch are based on parameters updated from all preceding examples.

The figures below show the change of prediction quality effected through online training. For this small example, most predictions are pretty good without training. For most examples with low initial scores, there is a noticeable improvement. While effects vary depending on the order of batches, the overall effect is very stable.

In [None]:
# setup
symbols = np.random.permutation(np.unique(names['symbol']))
prior = (np.log(.2), 1.5)
batch_size = 20
log_odds_in_list = np.log(10.)

encoder = LabelEncoder()
pi, s = None, None
# DataFrame.append was removed in pandas 2.0: collect the per-batch frames
# and concatenate them once after the loop
batch_scores = []
for train_size in range(0, symbols.size, batch_size):
    # the first train_size symbols (in random order) count as already labeled
    labels = label_training_data(names, 'wikipedia', symbols[:train_size])

    y = encoder.fit_transform(labels.values)
    counts, n_observations = get_group_statistics(X, y)
    # entities still prefixed with 'slickcharts' have not been matched yet
    is_test = np.array([label.startswith('slickcharts') for label in encoder.classes_])
    is_target = ~is_test
    # the estimates of the previous batch are passed on as initial values
    pi, s = map_estimate(*compress_group_statistics(counts, n_observations), *prior, s_init=s, pi_init=pi)

    log_proba = get_log_proba(counts[is_test], counts[is_target], n_observations[is_target], pi, s)
    log_prior = get_log_prior(counts[is_test], pi)
    proba = get_proba(log_proba, log_prior, log_odds=log_odds_in_list)

    # symbol -> position within the test / target subset of entities
    test_classes, target_classes = [{
        label.split(':')[-1]: i for i, label in enumerate(encoder.classes_[idx])}
        for idx in [is_test, is_target]]

    # predicted probability of the correct target for every unmatched symbol
    score = pd.Series({k: proba[test_classes[k], target_classes[k]] for k in test_classes}).sort_values()
    batch_scores.append(
        score.reset_index().rename(columns={'index': 'symbol', 0: 'proba'}).assign(train_size=train_size))
res = pd.concat(batch_scores)
online_scores = res.sort_values(['symbol', 'train_size'])

In [None]:
fig, axes = plt.subplots(ncols=2, figsize=(14, 4))
trajectory_ax, change_ax = axes

# left panel: one faint line per symbol, tracing its score over the batches
for _, symbol_scores in online_scores.groupby('symbol'):
    trajectory_ax.plot(
        symbol_scores['train_size'].values,
        symbol_scores['proba'].values,
        'k-',
        alpha=.2,
    )

trajectory_ax.set_ylim((-.05, 1.05))
trajectory_ax.grid()
trajectory_ax.set_xlabel('matched examples')
trajectory_ax.set_ylabel('predicted probability')
trajectory_ax.set_title('prediction quality vs training set size\nup to prediction time')

# right panel: first vs last recorded score per symbol; the dotted diagonal
# marks "no change"
online_change = online_scores.groupby('symbol')['proba'].agg(['first', 'last'])
change_ax.plot(online_change['first'], online_change['last'], '.', alpha=.7)
change_ax.plot([0, 1], [0, 1], 'k:')
change_ax.grid()
change_ax.set_title('predicted probability of correct match\nwhen example is labeled')
change_ax.set_xlabel('before training')
change_ax.set_ylabel('after online training')
# trailing statement keeps the notebook from echoing the last return value
pass

# Unsupervised Learning

Instead of manually labeling examples, in unsupervised learning we use Expectation-Maximization to train parameters. We iteratively assign names randomly from the predictive distribution and update parameters using the resulting averaged entity statistics.

Here, we use a poor man's implementation that does not update entity statistics for simplicity. Instead, only global word-level parameters are maintained. We use a single matching sample to obtain the word-level statistics.

While results are not as good as for online learning (as expected), there are still marked improvements without manual input.

In [None]:
# setup
encoder = LabelEncoder()
prior = (np.log(.2), 1.5)
log_odds_in_list = np.log(10.)
learning_rate = .4
unsupervised_steps = 20

# names from slickcharts are matched against all remaining names
is_train = (names['source'] == 'slickcharts').values
is_target = ~is_train
# row positions in X of the names to be matched (constant across steps)
idx_train = np.where(is_train)[0]

# symbol -> position within the respective subset; used to look up the
# score of the correct match
test_classes = {symbol: i for i, symbol in enumerate(names[is_train]['symbol'])}
target_classes = {symbol: i for i, symbol in enumerate(names[is_target]['symbol'])}
correct_assignments = [(test_classes[k], target_classes[k]) for k in test_classes]

# every target name forms an entity with a single observation
n_observations = np.ones(is_target.sum(), dtype=int)


# init with no assignments
pi, s = map_estimate(*compress_group_statistics(X, np.ones(X.shape[0], dtype=int)), *prior)

# DataFrame.append was removed in pandas 2.0: collect the per-step frames
# and concatenate them once after the loop
step_scores = []
for step in range(unsupervised_steps + 1):
    # compute random assignments with current parameters
    log_proba = get_log_proba(X[is_train], X[is_target], n_observations, pi, s)
    log_prior = get_log_prior(X[is_train], pi)
    proba = get_proba(log_proba, log_prior, log_odds_in_list)

    # record scores
    if step == 0:
        initial_evaluation_data = get_evaluation_data(proba, correct_assignments)
    score = pd.Series({k: proba[test_classes[k], target_classes[k]] for k in test_classes})
    step_scores.append(
        score.reset_index().rename(columns={'index': 'symbol', 0: 'proba'}).assign(step=step))

    # create entity labels
    # a name is treated as "in the target list" with probability equal to its row sum
    idx_in_list = np.random.rand(proba.shape[0]) < np.array(proba.sum(axis=1)).ravel()
    y = np.empty(X.shape[0], dtype=int)
    for i in np.where(idx_in_list)[0]:
        # sample one target among the stored entries of row i (proba is read in CSR layout)
        lo, hi = proba.indptr[i], proba.indptr[i + 1]
        y[idx_train[i]] = np.random.choice(proba.indices[lo:hi], p=proba.data[lo:hi] / proba.data[lo:hi].sum())
    # names not in the list become new entities numbered after the targets
    y[idx_train[~idx_in_list]] = proba.shape[1] + np.arange(np.sum(~idx_in_list))
    y[is_target] = np.arange(is_target.sum())

    # update parameters
    counts, nobs = get_group_statistics(X, y)
    pi_new, s_new = map_estimate(*compress_group_statistics(counts, nobs), *prior, s_init=s, pi_init=pi)
    # decaying step size: blend the new estimate into the running parameters in place
    _learning_rate = learning_rate / (step + 1.)
    pi *= (1. - _learning_rate)
    pi += _learning_rate * pi_new
    s *= (1. - _learning_rate)
    s += _learning_rate * s_new

res = pd.concat(step_scores)
unsupervised_scores = res.sort_values(['symbol', 'step'])
# proba still holds the scores computed in the last step
final_evaluation_data = get_evaluation_data(proba, correct_assignments)

In [None]:
fig, axes = plt.subplots(ncols=2, figsize=(14, 4))

# left panel: one faint line per symbol, tracing its score over the EM steps
ax = axes[0]
for _, df in unsupervised_scores.groupby('symbol'):
    ax.plot(df['step'].values, df['proba'].values, 'k-', alpha=.2)

ax.set_ylim((-.05, 1.05))
ax.grid()
ax.set_xlabel('EM step')
ax.set_ylabel('score')
ax.set_title('evolution of scores with unsupervised training')

# right panel: score at the first vs the last step per symbol; the dotted
# diagonal marks "no change"
ax = axes[1]
unsupervised_change = unsupervised_scores.groupby('symbol')['proba'].agg(['first', 'last'])
# both coordinates come from unsupervised_change (the original referenced an undefined name)
ax.plot(unsupervised_change['first'], unsupervised_change['last'], '.', alpha=.7)
ax.plot([0, 1], [0, 1], 'k:')
ax.grid()
ax.set_title('predicted probability of correct match')
ax.set_xlabel('before training')
ax.set_ylabel('after unsupervised training')
# trailing statement keeps the notebook from echoing the last return value
pass

In [None]:
fig, axes = plt.subplots(nrows=2, figsize=(7, 6), sharex=True, sharey=True)

# y is 1 for correct assignments and 0 otherwise; label by value, not by
# position, so a missing class cannot shift the labels
class_labels = {0: 'negative', 1: 'positive'}
score_bins = np.linspace(-.01, 1.01, 21)

# one panel per snapshot, comparing score distributions of both classes
for ax, df, title in zip(axes, [initial_evaluation_data, final_evaluation_data], ['before training', 'after training']):
    for _y, _df in df.groupby('y'):
        ax.hist(_df['score'].values, bins=score_bins, alpha=.5, density=True, log=True, label=class_labels[_y])
    # draw the panel title once per axes rather than once per class
    ax.text(.5, .95, title, ha='center', va='top', transform=ax.transAxes, fontsize=12)

axes[0].legend(loc='upper left')
# trailing statement keeps the notebook from echoing the last return value
pass

In [None]:
# put the first/last scores of both approaches side by side, one row per symbol
comparison = unsupervised_change.join(
    online_change,
    lsuffix='_unsupervised',
    rsuffix='_online',
)
# sanity check: the initial scores of both approaches are expected to coincide
assert np.allclose(comparison['first_unsupervised'], comparison['first_online'])

lim = (-.05, 1.05)

fig, ax = plt.subplots(figsize=(6, 6))
ax.set_title('scores for\nunsupervised learning vs online training')
ax.set_xlabel('unsupervised')
ax.set_ylabel('online')

# final score per symbol: unsupervised on x, online on y
ax.plot(
    comparison['last_unsupervised'],
    comparison['last_online'],
    '.',
    alpha=.7,
)
ax.set_aspect(1)
ax.set_xlim(lim)
ax.set_ylim(lim)
# dotted diagonal: both approaches score the symbol equally
ax.plot(lim, lim, 'k:')
ax.grid()
# trailing statement keeps the notebook from echoing the last return value
pass