In [None]:
%matplotlib inline

import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm

Set up `matplotlib` formatting options.

In [None]:
import matplotlib.ticker as plticker

# Global plot typography: large text for every figure, LaTeX rendering disabled.
# 'sans-serif' is a valid generic family; the previous value 'normal' is not a
# font family matplotlib can resolve, which makes it emit "findfont" warnings
# before falling back to its default font.
font = {'family' : 'sans-serif',
        'weight' : 'normal',
        'size'   : 25}
matplotlib.rc('font', **font)
matplotlib.rc('text', usetex=False)

In [None]:
from data import *
from wknn import *
from search import *
from metrics import *

Data generation functions from the distribution described in section 5:

Visualize the generated data to check that it looks roughly the way we want.

In [None]:
# Draw a large sample for the sanity-check plot in the next cell; `n` is reused
# there to scale the curves returned by etas().
# NOTE(review): no seed is set before this call (the first np.random.seed in the
# notebook comes later) — confirm whether generate_data uses NumPy's global RNG.
n = 100000
x, y = generate_data(n)

In [None]:
# Per-class histograms of x on a fine grid over [0, 1], overlaid with the
# curves returned by etas() evaluated at the bin centers.
interval = 0.005
bins = np.arange(0, 1 + interval, interval)
centers = (bins[:-1] + bins[1:]) / 2

hists = []
for label in range(3):
    # Weighting by the 0/1 class indicator counts only the samples with this label.
    indicator = (y == label).astype('float64')
    counts = np.histogram(x.squeeze(), bins=bins, weights=indicator, density=False)[0]
    hists.append(counts)

for hist in hists:
    plt.scatter(centers, hist, s=2)

# Each curve is scaled by n / (number of bins) to put it on the same scale as the counts.
for eta_val in etas(centers):
    plt.plot(centers, eta_val * n / centers.shape[0])

In [None]:
def run_trial(train_X, train_y, wknn, granularity=10000, search='coordinate', **kwargs):
    """Fit ``wknn``, run a weight search, and score every weighting it visited.

    Args:
        train_X, train_y: training data passed to ``wknn.fit`` and to the search.
        wknn: classifier exposing ``fit`` and ``set_weights``.
        granularity: forwarded to ``true_f1_score``.
        search: name resolved through ``Searcher.search_dispatch``.
        **kwargs: forwarded unchanged to the selected search routine.

    Returns:
        ``(emp_f1s, true_f1s, wts)``: the empirical F1 values and weightings
        yielded by the search (as tuples), and a list with the true F1 score
        of each weighting.

    Note:
        ``wknn`` is left configured with the last weighting in ``wts``.
    """
    wknn.fit(train_X, train_y)

    # The search yields (empirical F1, weights) pairs; consume it fully before scoring.
    search_fn = Searcher.search_dispatch(search)
    emp_f1s, wts = zip(*search_fn(wknn, train_X, train_y, **kwargs))

    def _true_f1_for(weights):
        # Scoring requires the classifier to carry the weighting being evaluated.
        wknn.set_weights(weights)
        return true_f1_score(wknn, granularity=granularity)

    true_f1s = [_true_f1_for(weights) for weights in wts]
    return emp_f1s, true_f1s, wts

In [None]:
# Initialize the weighted kNN classifier with the starting weights that the
# later cells reset it to before each experiment.
init_weights = np.array([0.3, 0.3, 0.4])
wknn = WeightedKNN(wknn_weights=init_weights)

Comparing effect of different step sizes:

In [None]:
# Compare coordinate-search trajectories for two step sizes on one training set.
np.random.seed(326)

train_n = 1000
train_X, train_y = generate_data(train_n)
steps = 20  # also read by the sample-size experiment further down
step_sizes = [0.01, 0.05]
results = []
for step_size in step_sizes:
    # Restart every run from the same initial weights so the runs are comparable.
    wknn.set_weights(init_weights)
    emp_f1s, true_f1s, _ = run_trial(train_X, train_y, wknn, steps=steps, step_size=step_size)
    # One record per (metric, step); empirical records first, then true ones.
    for metric, scores in (('empirical F1', emp_f1s), ('true F1', true_f1s)):
        results.extend({'step size': step_size, 't': idx, 'metric': metric, 'Score': val}
                       for idx, val in enumerate(scores))
# No trailing ';' is needed here: a for loop produces no cell output, and a
# semicolon on a line of its own is not a valid Python statement.

In [None]:
# Plot empirical vs. true F1 against the search step for each step size.
df = pd.DataFrame.from_records(results)
# One hue/style key per (step size, metric) combination.
df['prod'] = df.apply(lambda row: 'step=' + str(row['step size']) + ', ' + row['metric'], axis=1)

plt.figure(figsize=(12, 10), facecolor='white')
blue, orange = sns.color_palette(n_colors=2)
# Four palette entries, one per 'prod' value: two blues followed by two oranges.
sns.lineplot(
    data=df, x='t', y='Score',
    hue='prod', style='prod',
    palette=[blue, blue, orange, orange],
    linewidth=4,
)

ax = plt.gca()
loc = plticker.MultipleLocator(base=2)  # major x ticks at multiples of 2
ax.xaxis.set_major_locator(loc)
ax.set_xlim(0, 20)

handles, old_labels = ax.get_legend_handles_labels()
labels = [
    r'$\alpha=0.01, \widehat{\mathrm{F1}}(q_t)$', r'$\alpha=0.01, \mathrm{F1}(q_t)$',
    r'$\alpha=0.05, \widehat{\mathrm{F1}}(q_t)$', r'$\alpha=0.05, \mathrm{F1}(q_t)$',
]
for handle in handles:
    handle.set_linewidth(4)
# The first handle is dropped from the legend.
# NOTE(review): presumably a legend-title entry added by the seaborn version in use — verify.
ax.legend(handles=handles[1:], labels=labels, loc='center right', bbox_to_anchor=(1, 0.7))

In [None]:
from multiprocessing import Pool
from itertools import product

In [None]:
# Accumulator for the result records of the sample-size experiment.
# NOTE: the experiment cell only extends this list, so re-run this cell before
# re-running the experiment to avoid duplicated records.
n_results = []

In [None]:
np.random.seed(322)
trials = 50

weights_list = weight_gen(3, 100)
def task(data):
    """Run one (sample size, trial, algorithm) combination and return its records.

    Args:
        data: tuple ``(train_n, (i, (train_X, train_y)), search)`` where ``i`` is
            the trial index and ``search`` is 'coordinate', 'grid' or 'none'.

    Returns:
        A list of three dicts sharing the keys 'Trial', 'n' and 'Algorithm':
        the empirical score, the true score, and the final weighting stored as
        ``Weight_<idx>`` entries.

    Relies on the notebook globals ``wknn``, ``steps`` and ``weights_list``.
    """
    train_n, (i, (train_X, train_y)), search = data
    if search == 'none':
        # Baseline without any search: uniform weights that sum exactly to 1.
        weighting = np.ones(3) / 3
        weighting[2] = 1 - weighting[0] - weighting[1]
        # Fit on this trial's data first. run_trial does this for the search
        # branches; without it the baseline would predict with whatever
        # training set the classifier last held.
        wknn.fit(train_X, train_y)
        wknn.set_weights(weighting)
        emp_f1, true_f1 = (f1_eval(wknn.predict(train_X), train_y), true_f1_score(wknn))
    else:
        kwargs = {'steps': steps, 'step_size': 0.01} if search == 'coordinate' else {'weights_list': weights_list}
        emp_f1s, true_f1s, weightings = run_trial(train_X, train_y, wknn, granularity=10000, search=search, **kwargs)
        # Only the final state of the search is reported.
        emp_f1, true_f1, weighting = emp_f1s[-1], true_f1s[-1], weightings[-1]

    index_dict = {'Trial': i, 'n': train_n, 'Algorithm': search}
    return [
        {**index_dict, 'Metric': 'empirical', 'Score': emp_f1},
        {**index_dict, 'Metric': 'true', 'Score': true_f1},
        {**index_dict, 'Metric': 'true', **{f'Weight_{idx}': wt for idx, wt in enumerate(weighting)}},
    ]

# Sweep the training-set size; every size gets `trials` fresh datasets, and each
# dataset is evaluated with all three algorithms in a worker pool.
for train_n in tqdm([50] + list(range(100, 2600, 100)), desc='Training n', leave=False):
    wknn.set_weights(init_weights)
    datas = [generate_data(train_n) for i in range(trials)]
    with Pool(trials) as p:
        jobs = product([train_n], enumerate(datas), ['coordinate', 'grid', 'none'])
        n_results.extend(entry for entries in p.map(task, jobs) for entry in entries)

In [None]:
import os

In [None]:
# Collect all experiment records into one frame and persist it.
n_df = pd.DataFrame.from_records(n_results)
# exist_ok=True makes this a no-op when the directory is already there and
# avoids the race between checking for it and creating it.
os.makedirs('results', exist_ok=True)
n_df.to_pickle('results/synthetic_macro.pkl')

In [None]:
# Average every numeric column over the trials of each (n, algorithm, metric) cell.
disp_df = n_df.groupby(['n', 'Algorithm', 'Metric'], as_index=False).mean()
# Combined key used as hue/style when plotting; vectorised string concatenation
# replaces a row-wise apply and also works on an empty frame.
disp_df['algometric'] = disp_df['Algorithm'] + ', ' + disp_df['Metric']

In [None]:
def set_legend_labels(ax, labels, linewidth):
    """Rebuild the legend of ``ax`` with custom labels and a uniform line width.

    Every existing legend handle has its line width set to ``linewidth``; the
    legend is then recreated from all handles except the first one, paired
    with ``labels``.
    """
    legend_handles, _unused_labels = ax.get_legend_handles_labels()
    for artist in legend_handles:
        artist.set_linewidth(linewidth)
    kept_handles = legend_handles[1:]
    ax.legend(handles=kept_handles, labels=labels)

In [None]:
# Mean true F1 as a function of the training-set size, one line per algorithm.
plt.figure(figsize=(12, 8), facecolor='white')
# Both conditions are combined into a single mask; chaining two boolean
# selections would apply a mask built on the full frame to an already filtered one.
true_scores = disp_df[(disp_df['n'] <= 2500) & (disp_df['Metric'] == 'true')]
sns.lineplot(data=true_scores,
             x='n', y='Score', hue='algometric', style='algometric',
             legend='brief')

In [None]:
# Keep only the rows of the search-based algorithms (everything except the 'none' baseline).
diff_df = disp_df[disp_df['Algorithm'] != 'none'].copy()
def abs_diff(x):
    """Return a one-row frame with the group's ``n`` and the absolute gap in
    'Score' between its first two rows.

    The rows are addressed by position, so the group must contain at least two rows.
    """
    first_row, second_row = x.iloc[0], x.iloc[1]
    gap = np.abs(first_row['Score'] - second_row['Score'])
    return pd.DataFrame({'n': [first_row['n']], 'Diff': [gap]}, columns=['n', 'Diff'])
# For every n, compare the mean true score of the remaining algorithms;
# abs_diff reads the first two rows of each group by position.
res_df = diff_df[diff_df['Metric'] == 'true'].groupby(['n'], as_index=False)[['n', 'Score']].apply(abs_diff)

In [None]:
# Absolute gap in true F1 between the two search algorithms as a function of n.
plt.figure(figsize=(12, 8), facecolor='white')

# A single series is drawn, so no palette/dashes are passed (they only apply
# together with hue/style). The width is given literally, matching the step-size
# plot, because no `linewidth` variable is defined anywhere in the notebook.
sns.lineplot(data=res_df,
             x='n', y='Diff',
             linewidth=4,
             legend='brief')
# Raw string: '\m' is not a valid escape sequence in a normal string literal.
plt.gca().legend(handles=plt.gca().lines[::(len(res_df) + 1)],
                 labels=[r"$|\mathrm{F1}(q_{\mathrm{grid}}) - \mathrm{F1}(q^{(T)})|$"])

plt.ylabel('Difference')

In [None]:
# No figure is opened here: the plotting code at the end of this cell creates
# its own figure, so an extra one would only be rendered as an empty output.
def nearest_to_mean(x):
    """Return the first row of ``x`` whose 'Score' is closest to the mean 'Score'.

    The result is a one-row DataFrame; ties are resolved in favour of the row
    that appears first in ``x``.
    """
    dist_to_mean = np.abs(x['Score'] - x['Score'].mean())
    closest = x[dist_to_mean == dist_to_mean.min()]
    return closest.head(1)

def l2_wt_dist(x):
    """Return a one-row frame with the group's ``n`` and the Euclidean distance
    between the weight vectors stored in its first two rows.

    The weight vector of a row is (Weight_0, Weight_1, Weight_2).
    """
    wt_matrix = x[[f'Weight_{k}' for k in range(3)]].to_numpy()
    gap = wt_matrix[0] - wt_matrix[1]
    distance = np.linalg.norm(gap, 2)
    group_n = x['n'].iloc[0]
    return pd.DataFrame([[group_n, distance]], columns=['n', 'L2 Distance'])

# L2 distance between the mean weightings of the two search algorithms per n.
# NOTE(review): diff_df derives from disp_df, which is already averaged per
# (n, Algorithm, Metric); its 'Trial' column therefore holds a mean, and grouping
# by it does not separate individual trials — confirm this is intended.
o_df = diff_df[diff_df['Metric'] == 'true'].groupby(['Algorithm', 'n', 'Trial'], as_index=False).max()

# numeric_only=True: the frame still carries string columns ('Metric',
# 'algometric') that cannot be averaged.
o_df = o_df.groupby(['Algorithm', 'n'], as_index=False).mean(numeric_only=True)
o_df = o_df.groupby('n', as_index=False)[['n', 'Weight_0', 'Weight_1', 'Weight_2']].apply(l2_wt_dist)

plt.figure(figsize=(12, 8), facecolor='white')
# The width is given literally, matching the step-size plot, because no
# `linewidth` variable is defined anywhere in the notebook.
sns.lineplot(data=o_df, x='n', y='L2 Distance', linewidth=4)