## Validating the pairwise constraints with Metric scores for t-SNE

In [1]:
import numpy as np
import pandas as pd
import pickle

from scipy.spatial.distance import cosine

# folder containing the pre-calculated metrics and negative log likelihood for the constrained pairs
data_folder = 'output2_tsne'

In [2]:
# using plotly for notebook in offline mode
import plotly.plotly as py
import plotly.graph_objs as go

from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

In [3]:
# using some gui elements (widgets) of ipython notebook: selection-box, check-box, slider
import ipywidgets as widgets
from ipywidgets import interact, interactive_output

### Util functions for loading data and plotting

In [4]:
def load_df(dataset_name):
    """Load data from pickle file
        The result is a list of object (dict), which then be converted into pandas dataframe
    """
    if dataset_name == "": return None
    
    in_name = '{}/tsne_{}.pkl'.format(data_folder, dataset_name)
    pkl_data = pickle.load(open(in_name, 'rb'))
    embeddeds = pkl_data['results']
    df = pd.DataFrame.from_records(embeddeds, exclude=["embedding"])
    return df

# global dataframe
g_df = load_df('MNIST-SMALL')

In [5]:
def _plot_surface(pivot_data, z_title='', chart_title=''):
    """Using plotly to plot the 3d surface of cost function, or metric scores
    """
    perps = pivot_data.columns
    lrs = pivot_data.transpose().columns
    data = [
        go.Surface(
            x = perps,
            y = lrs,
            z=pivot_data.as_matrix(),
        )
    ]
    layout = go.Layout(
        title=chart_title,
        autosize=True,
        width=600,
        height=600,
        margin=dict(
            l=65,
            r=50,
            b=65,
            t=90
        ),
        scene = dict(
            xaxis = dict(
                title='Perplexity'),
            yaxis = dict(
                title='Learning rate'),
            zaxis = dict(
                title=z_title),
        ),
    )
    fig = go.Figure(data=data, layout=layout)
    iplot(fig, filename='./plotly_output/test-3d-surface.html')

In [6]:
def _plot_lines_by_key(key_name, key_val, use_log_x, use_log_y, show_inverse, show_neg):
    """Plot multiple lines, each line is a column in filtered dataframe (by input `key_name`)
    """
    global g_df
    if g_df is None: return
    
    if key_name == 'perplexity':
        df_by_key = g_df[g_df[key_name]==key_val]
        x_data = np.log(df_by_key['learning_rate']) if use_log_x else df_by_key['learning_rate']
    else: # learning_rate
        df_by_key = g_df[g_df[key_name]==key_val]
        x_data = np.log(df_by_key['perplexity']) if use_log_x else df_by_key['perplexity']
    
    # customized functions that can be applied onto the y-values
    _inverse = lambda y: np.divide(1.0, y, out=np.zeros_like(y), where=(y != 0))
    _negative = lambda y: -y
    
    line_names = [
        # (line_name, display_name, show_by_default, can_modify)
        ('q_link', '[] -log(links) in low dim.', True, False),
        ('p_link', '[] -log(links) in high dim.', False, False),
        ('loss', '[] KL loss', False, False),
        ('auc_rnx', '[*] logRNX', True, True),
        ('cca_stress', '[*] CCA Stress', False, True),
        ('mds_isotonic', '[*] MDS Stress', False, True),
        ('pearsonr', '[*] Corr. Coef.', True, True),
        ('sammon_nlm', '[*] Sammon NLM', True, True)
    ]
    
    plot_data = []
    for line_name, display_name, show_by_default, can_modify in line_names:
        y = df_by_key[line_name]
        y_data = np.log(y) if use_log_y else y
        if can_modify:
            if show_inverse: y_data = _inverse(y_data)
            if show_neg: y_data = _negative(y_data)
            
        trace_i = go.Scatter(
            x = x_data,
            y = y_data,
            name = display_name,
            mode = 'lines+markers',
            line = {'shape': 'spline'},
            visible= '' if show_by_default else 'legendonly'
        )
        plot_data.append(trace_i)
    
    
    layout = dict(
        title = 'Metric values / NLL [{}  = {}]'.format(key_name, key_val),
        xaxis = dict(title = '{} {}'.format(
          'Log of ' if use_log_x else '',
          'Learning rate' if key_name == 'perplexity' else 'Perplexity')),
        yaxis = dict(title = '{} {}'.format(
          'Log of ' if use_log_y else '', 'Metrics value / negative LL')),
        autosize=False
    )
        
    fig = go.Figure(data=plot_data, layout=layout)
    iplot(fig)

In [7]:
# UI controls

datasetX = widgets.Dropdown(
    options={
        "Select dataset": "",
        "MNIST mini": "MNIST-SMALL",
        "COIL-20": "COIL20",
        "MNIST 2000 samples": "MNIST-2000",
        "Country Indicators 1999": "COUNTRY1999",
        "Country Indicators 2013": "COUNTRY2013",
        "Country Indicators 2014": "COUNTRY2014",
        "Country Indicators 2015": "COUNTRY2015",
        "Cars and Trucks 2004": "CARS04",
        "Breast Cancer Wisconsin (Diagnostic)": "BREAST-CANCER95",
        "Pima Indians Diabetes": "DIABETES",
        "Multidimensional Poverty Measures": "MPI"
    },
    value='',
    description='Dataset:',
)

pivotKeyX = widgets.Dropdown(
    options={
        'Negative log likelihood in low dim.': 'q_link',
        'Negative log likelihood in high dim.': 'p_link',
        'KL-loss': 'loss',
        'AUC R_NX': 'auc_rnx',
        'CorrCoef': 'pearsonr',
        'MDS Stress': 'mds_isotonic',
        'CCA Stress': 'cca_stress',
        'Sammon NLM': 'sammon_nlm'

    },
    value='q_link',
    description='Surface: ',
)

lrX = widgets.SelectionSlider(
    options=g_df['learning_rate'].unique(),
    value=100.0,
    description='LR ($\lambda$): '
)

perpX = widgets.SelectionSlider(
    options=g_df['perplexity'].unique(),
    value=50.0,
    description='Perplexity: '
)

useLogXAxisX = widgets.Checkbox(
    description='Log scale in xAxis',
    value=False,
)

useLogYAxisX = widgets.Checkbox(
    description='Log scale in yAxis',
    value=False,
)

showNegativeX = widgets.Checkbox(
    description='Negative stress func. [*]',
    value=True,
)

showInverseX = widgets.Checkbox(
    description='Inverse stress func. [*]',
    value=False,
)

useSameAxisX = widgets.Checkbox(
    description='Use same yAxis',
    value=False,
)

line1NameX = widgets.Dropdown(
    options=g_df.columns,
    description='Line1: '
)

line2NameX = widgets.Dropdown(
    options=g_df.columns,
    description='Line2: '
)

line3NameX = widgets.Dropdown(
    options=g_df.columns,
    description='Line3: '
)

### 3D surface of KL-loss, NLL or Metric scores

In [8]:
# interactive plot for plotting the 3d surface

@interact(dataset_name=datasetX)
def select_dataset(dataset_name):   
    global g_df
    g_df = load_df(dataset_name)
    
    @interact(key_to_pivot=pivotKeyX)
    def plot_surface_by_name(key_to_pivot='q_link'):
        global g_df
        if g_df is None: return

        pivot_df = g_df[['learning_rate', 'perplexity', key_to_pivot]].pivot(
            index='learning_rate', columns='perplexity', values=key_to_pivot)

        _plot_surface(pivot_data=pivot_df, chart_title=key_to_pivot)

A Jupyter Widget

### Compare NLL and all metric scores

In [9]:
# interactive plot all lines containint NLL and metric scores
def plot_lines_by_key(dataset_name, lr, perp, use_log_x, use_log_y, show_inverse, show_neg):
    global g_df
    g_df = load_df(dataset_name)
    
    _plot_lines_by_key('learning_rate', lr, use_log_x, use_log_y, show_inverse, show_neg)
    _plot_lines_by_key('perplexity', perp, use_log_x, use_log_y, show_inverse, show_neg)
    
ui = widgets.VBox([
    widgets.HBox([datasetX, lrX, perpX]),
    widgets.HBox([useLogXAxisX, useLogYAxisX, showInverseX, showNegativeX])
])

out = widgets.interactive_output(plot_lines_by_key, 
               {'dataset_name': datasetX,
                'lr':lrX,
                'perp':perpX,
                'use_log_x': useLogXAxisX,
                'use_log_y': useLogYAxisX,
                'show_inverse': showInverseX,
                'show_neg': showNegativeX})

display(ui, out)

A Jupyter Widget

A Jupyter Widget

### Plot running time and the number of iterations 

In [10]:
def _plot_runningtime(key_name, key_val):
    global g_df
    if g_df is None: return
    
    if key_name == 'perplexity':
        df_by_key = g_df[g_df[key_name]==key_val]
        x_data = df_by_key['learning_rate']
    else: # learning_rate
        df_by_key = g_df[g_df[key_name]==key_val]
        x_data = df_by_key['perplexity']
    
    trace1 = go.Scatter(
        x=x_data,
        y=df_by_key['running_time'],
        name='Runnning time'
    )
    trace2 = go.Bar(
        x=x_data,
        y=df_by_key['n_iter'],
        name='# of iterations',
        yaxis='y2'
    )
    data = [trace1, trace2]
    layout = go.Layout(
        title='Running time and number of iterations [{} = {}]'.format(key_name, key_val),
        xaxis = dict(title = '{}'.format('Learning rate' if key_name == 'perplexity' else 'Perplexity')),
        yaxis=dict(
            title='Running time (s)'
        ),
        yaxis2=dict(
            title='Number of iterations',
            titlefont=dict(
                color='orange'
            ),
            tickfont=dict(
                color='orange'
            ),
            overlaying='y',
            side='right'
        ),
        legend=dict(orientation="h"),
        autosize=False
    )
    fig = go.Figure(data=data, layout=layout)
    # iplot(fig,image='svg')

In [11]:
def plot_runningtime(dataset_name, lr, perp):
    global g_df
    g_df = load_df(dataset_name)
    
    _plot_runningtime('learning_rate', lr)
    _plot_runningtime('perplexity', perp)
    
ui2 = widgets.VBox([
    widgets.HBox([datasetX, lrX, perpX]),
])

out2 = widgets.interactive_output(plot_runningtime, 
               {'dataset_name': datasetX,
                'lr':lrX,
                'perp':perpX})

display(ui2, out2)

A Jupyter Widget

A Jupyter Widget

### Compare two lines in the same graph
+ We fix the value of `learning_rate`, e.g. choose `learning_rate = 100.0`, because it does not influence too much the final result.

In [12]:
def _normalize_0_1(arr):
    return (arr - arr.max()) / -np.ptp(arr)

def _standardize(arr):
    return (arr - arr.mean()) / arr.std()

In [13]:
def _plot_two_lines(line1_name, line2_name, line3_name, use_log_x, use_log_y,
                    use_same_yaxis, show_inverse, show_neg, lr=100.0):
    """Util function for plotting two line in the same plot with different `yAxis`
    """
    global g_df
    if g_df is None: return
    
    _inverse = lambda y: np.divide(1.0, y, out=np.zeros_like(y), where=(y != 0))
    
    df_by_key = g_df[g_df['learning_rate']==lr]
    
    x_data = df_by_key['perplexity'] if not use_log_x else np.log(df_by_key['perplexity'])
    y1_data = df_by_key[line1_name] if not use_log_y else np.log(df_by_key[line1_name])
    y2_data = df_by_key[line2_name] if not use_log_y else np.log(df_by_key[line2_name])
    
    if show_inverse:
        y2_data = _inverse(y2_data)
        
    if show_neg:
        y2_data = - y2_data
    
    trace1 = go.Scatter(
        x=x_data,
        y=y1_data,
        name=line1_name,
        xaxis='x',
        yaxis='y',
    )
    trace2 = go.Scatter(
        x=x_data,
        y=y2_data,
        name=line2_name,
        xaxis='x',
        yaxis='y' if use_same_yaxis else 'y2'
    )
    
    y1_normalized = _standardize(y1_data)
    y2_normalized = _standardize(y2_data)
    print("Cosine similarity: ", 1 - cosine(y1_normalized, y2_normalized))
    
    trace3 = go.Bar(
        x=x_data,
        y=(y1_normalized - y2_normalized),
        name='diff',
        yaxis='y3',
        xaxis = 'x',
        marker=dict(color='#d62728'),
    )
    
    data = [trace1, trace2, trace3]
        
    layout = go.Layout(
        title='Compare the shape of {} and {} [learning_rate = {}]'.format(line1_name, line2_name, lr),
        xaxis=dict(
            tickvals=np.log(df_by_key['perplexity']),
            ticktext=df_by_key['perplexity'],
            #title='Perplexity in log-scale'
        ) if use_log_x else dict(),
        yaxis=dict(
            title=line1_name,
            domain=[0.3, 1],
        ),
        yaxis2=dict(
            title=line2_name,
            titlefont=dict(
                color='orange'
            ),
            tickfont=dict(
                color='orange'
            ),
            overlaying='y',
            side='right',
        ),
        yaxis3=dict(
            title='Diff',
            titlefont=dict(
                color='#d62728'
            ),
            tickfont=dict(
                color='#d62728'
            ),
            domain=[0, 0.2],
        ),
        legend=dict(orientation="h"),
        autosize=False
    )
    fig = go.Figure(data=data, layout=layout)
    print("[*] Inverse function and Negative function are applied only for the `Line2`")
    iplot(fig)

In [14]:
def plot_two_lines(dataset_name, line1_name, line2_name, line3_name, use_log_x, use_log_y,
                   use_same_yaxis, show_inverse, show_neg):
    global g_df
    g_df = load_df(dataset_name)
    
    _plot_two_lines(line1_name, line2_name, line3_name, use_log_x, use_log_y,
                    use_same_yaxis, show_inverse, show_neg, lr=100.0)
    
ui3 = widgets.VBox([
    widgets.HBox([datasetX, line1NameX, line2NameX, line3NameX]),
    widgets.HBox([useLogXAxisX, useLogYAxisX, useSameAxisX]),
    widgets.HBox([showInverseX, showNegativeX])
])

out3 = widgets.interactive_output(plot_two_lines, {
    'dataset_name': datasetX,
    'line1_name': line1NameX,
    'line2_name': line2NameX,
    'line3_name': line3NameX,
    'use_log_x': useLogXAxisX,
    'use_log_y': useLogYAxisX,
    'use_same_yaxis': useSameAxisX,
    'show_inverse': showInverseX,
    'show_neg': showNegativeX
})

display(ui3, out3)

A Jupyter Widget

A Jupyter Widget

### Compare metric scores and negative log likelihood of constrained points

In [15]:
def _compare_lines(base_line, other_lines=[()], use_log_x=False, lr=100):
    """Util function for plotting the difference between
        each line in `other_lines` and `base_line`
    """
    global g_df
    if g_df is None: return
        
    df_by_key = g_df[g_df['learning_rate']==lr]
    df_by_key = df_by_key[(df_by_key['perplexity']<1000) & (df_by_key['perplexity']>=1.0)]
    x_data = df_by_key['perplexity'] if not use_log_x else np.log(df_by_key['perplexity'])
    
    base_data = _standardize(df_by_key[base_line])    
    
    traces = []
    for line_name, is_neg in other_lines:
        line_data = df_by_key[line_name] * (-1 if is_neg else 1)
        line_data = _standardize(line_data)
        # trace_data = (base_data - line_data).abs()
        trace_data = line_data
        
        trace = go.Scatter(
            x=x_data,
            y=trace_data,
            name=line_name,
            line = {'shape': 'spline'},
        )
        traces.append(trace)

    for line_name in ['q_link', 'q_ml', 'q_cl']:
        line_data = df_by_key[line_name]
        line_data = _standardize(line_data)
        # trace_data = (base_data - line_data).abs()
        trace_data = line_data
        
        trace = go.Scatter(
            x=x_data,
            y=trace_data,
            name=line_name,
            line = {'shape': 'spline'},
        )
        traces.append(trace)
        
    layout = go.Layout(
        # title='Difference between NLL and metric scores [learning_rate = {}]'.format(lr),
        autosize=False,
        xaxis=dict(
            tickvals=np.log(df_by_key['perplexity']),
            ticktext=df_by_key['perplexity'],
            title='Perplexity in log-scale'
        ) if use_log_x else dict(title='Perplexity'),
    )
    fig = go.Figure(data=traces, layout=layout)
    iplot(fig)
    # iplot(fig,image='svg')

In [16]:
@interact(dataset_name=datasetX, use_log_x=useLogXAxisX)
def plot_comparing_lines(dataset_name, use_log_x):
    global g_df
    g_df = load_df(dataset_name)
    
    base_line = 'q_link'
    other_lines = [
        ('auc_rnx', True),
        ('cca_stress', False),
        ('mds_isotonic', False),
        ('pearsonr', True),
        ('sammon_nlm', False)
    ]
    _compare_lines(base_line, other_lines, use_log_x, lr=100.0)

A Jupyter Widget

In [17]:
def sum_lines(lr=100.0):
    datasets = [
        'MNIST-SMALL',
        # 'MNIST-2000',
        'COIL20',
        'COUNTRY1999',
        # 'COUNTRY2013',
        'COUNTRY2014',
        # 'COUNTRY2015',
        'CARS04',
        'BREAST-CANCER95',
        'DIABETES',
        'MPI'
    ]
    lines = [
        ('auc_rnx', True),
        ('cca_stress', False),
        ('mds_isotonic', False),
        ('pearsonr', True),
        ('sammon_nlm', False)
    ]
    traces_data = {line_name: [] for line_name, _ in lines}
    sims = {line_name: [] for line_name, _ in lines}
    for dataset in datasets:
        df = load_df(dataset)
        df_by_key = df[df['learning_rate']==lr]
        df_by_key = df_by_key[(df_by_key['perplexity']<1000) & (df_by_key['perplexity']>=1.0)]
        
        q_links = _standardize(df_by_key['q_link'])
        q_mls = _standardize(df_by_key['q_ml'])
        q_cls = _standardize(df_by_key['q_cl'])
        
        for line_name, is_neg in lines:
            line_data = df_by_key[line_name] * (-1 if is_neg else 1)
            line_data = _standardize(line_data)
            mean_diff = np.sum(np.abs(q_links - line_data))           
            sim_qall = cosine(q_links, line_data)
            sim_qml = cosine(q_mls, line_data)
            sim_qcl = cosine(q_cls, line_data)
            
            traces_data[line_name].append(mean_diff)
            sims[line_name].append(sim_qml) # sim_qall
            

    traces = []
    for line_name, _ in lines:
        trace1 = go.Bar(
            x=datasets,
            y=traces_data[line_name],
            name=line_name,
        )
        trace2 = go.Bar(
            x=datasets,
            y=sims[line_name],
            name=line_name,
        )
        traces += [trace1]
    fig = go.Figure(data=traces)
    iplot(fig)
    
sum_lines(lr=100.0)

FileNotFoundError: [Errno 2] No such file or directory: 'output2_tsne/tsne_CARS04.pkl'

### Scatter2d of the embedded points with pairwise constraints

In [18]:
@interact(dataset_name=datasetX)
def plot_scatter(dataset_name):
    if dataset_name == "": return
    
    in_name = '{}/tsne_{}.pkl'.format(data_folder, dataset_name)
    pkl_data = pickle.load(open(in_name, 'rb'))
    embeddeds = pkl_data['results']
    mustlinks = pkl_data['mustlinks']
    cannotlinks = pkl_data['cannotlinnks']
    target_labels = pkl_data['target']
    print(np.unique(target_labels))
    
    def pad_then_flatten(a):
        b = [(x1, x2, None) for x1, x2 in a]
        c = list(sum(b, ()))
        return c
    
    
    @interact(lr=lrX, perp=perpX)
    def _plot_scatter(lr, perp):
        target = next((item for item in embeddeds if item['learning_rate'] == lr and item['perplexity'] == perp))
        X_embedded = target['embedding']
        
        ml_pos = [X_embedded[mustlinks][:, :, 0], X_embedded[mustlinks][:, :, 1]]
        cl_pos = [X_embedded[cannotlinks][:, :, 0], X_embedded[cannotlinks][:, :, 1]]
                
        trace_scatter = go.Scattergl(
            x = X_embedded[:,0],
            y = X_embedded[:,1],
            name = 'X_embedded',
            mode='markers',
            marker=dict(
                size='6',
                color = target_labels,
                colorscale='Viridis',
                opacity=0.7,
                # showscale=True
            )
        )
        
        trace_ml = go.Scattergl(
            x = pad_then_flatten(ml_pos[0]),
            y = pad_then_flatten(ml_pos[1]),
            connectgaps=False,
            name = 'Mustlink',
            line = dict(
                color = ('rgba(22, 96, 167, 0.4)'),
                width = 2,
                dash = 'dash'
            )
            
        )
        
        trace_cl = go.Scattergl(
            x = pad_then_flatten(cl_pos[0]),
            y = pad_then_flatten(cl_pos[1]),
            text=list(map(str, range(len(cl_pos)))),
            connectgaps=False,
            name = 'Cannotlink',
            line = dict(
                color = ('rgba(205, 12, 24, 0.4)'),
                width = 2,
                dash = 'dash'
            )
        )
            
        layout = go.Layout(
            autosize=True,
            width=800,
            height=600,
            xaxis=dict(
                autorange=True,
                showgrid=False,
                zeroline=False,
                showline=False,
                autotick=True,
                ticks='',
                showticklabels=False
            ),
            yaxis=dict(
                autorange=True,
                showgrid=False,
                zeroline=False,
                showline=False,
                autotick=True,
                ticks='',
                showticklabels=True
            )
        )
        fig = go.Figure(data=[trace_scatter, trace_ml, trace_cl], layout=layout)
#         fig = go.Figure(data=[trace_scatter], layout=layout)
        iplot(fig)

A Jupyter Widget

In [19]:
from scipy.spatial.distance import pdist
from scipy.spatial.distance import squareform

MACHINE_EPSILON = np.finfo(np.double).eps

def compute_Q(X_embedded):
    degrees_of_freedom = 1
    X_embedded = X_embedded.reshape(-1, 2)

    dist = pdist(X_embedded, "sqeuclidean")
    dist /= degrees_of_freedom
    dist += 1.
    dist **= (degrees_of_freedom + 1.0) / -2.0
    Q = np.maximum(dist / (2.0 * np.sum(dist)), MACHINE_EPSILON)

    return squareform(Q)

In [20]:
dataset_name='COUNTRY2015'
in_name = '{}/tsne_{}.pkl'.format(data_folder, dataset_name)
pkl_data = pickle.load(open(in_name, 'rb'))
embeddeds = pkl_data['results']

lr=100
perp=50
target = next((item for item in embeddeds if item['learning_rate'] == lr and item['perplexity'] == perp))
X_embedded = target['embedding']
        
mustlinks = pkl_data['mustlinks']
cannotlinks = pkl_data['cannotlinnks']
target_labels = pkl_data['target']
print(np.unique(target_labels))

Q = compute_Q(X_embedded)
a,b = np.unique(Q, return_counts=True)
data = [go.Histogram(x=a,y=b)]
iplot(data)

[-1  0  1  2  3  4]


In [21]:
print(np.min(Q))
print(np.percentile(Q, 25))
print(np.percentile(Q, 50))
print(np.percentile(Q, 75))
print(np.percentile(Q, 90))
print(np.max(Q))

0.0
9.12481589876e-07
3.5748365955e-06
1.39552873888e-05
5.21978108779e-05
0.000220511035881
