In [67]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
import json


In [68]:
current_dir = os.path.abspath('.')
relevance_dir = Path(current_dir).resolve().parent

processed_data_path = relevance_dir / 'qualitative-analysis' / 'data' / 'processed_data_with_stimuli.csv'
output_md = relevance_dir / 'qualitative-analysis' / 'extreme_examples.md'

d = pd.read_csv(processed_data_path)

# Helper function to get stimulus from RowID
def get_stimulus(row_id, measures):
    row = d[d['RowID'] == row_id]
    text = row.stimulus.values[0]
    # grab ranks for each input measure
    measure_ranks = [row[f'{measure}_rank'].values[0] for measure in measures]
    # format the measures as comma-separated text
    measure_ranks_text = ', '.join([f'{measure}_rank: {measure_rank}' for measure, measure_rank in zip(measures, measure_ranks)])
    # put it all together
    fulltext = f'{measure_ranks_text}\n\n{text}'
    return fulltext



For each `measure`, precompute the top `k` rows with respect to `rank(measure) - rank(relevance)`.

In [69]:
measures = ['bfu', 'klu', 'ech', 'bch', 'beta_bfu', 'beta_klu', 'beta_ech', '2ord_bch']
ks = np.arange(100,1501,100)
for measure in measures:
    for k in ks:
        d[f'{measure}_rank_diff_top{k}'] = d['RowID'].isin(d.nlargest(k, [f'{measure}_rank_diff'], keep='all')['RowID'])
        d[f'{measure}_rank_diff_bot{k}'] = d['RowID'].isin(d.nsmallest(k, [f'{measure}_rank_diff'], keep='all')['RowID'])
d


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented fr

Unnamed: 0,submission_id,StimID,AnswerCertainty,AnswerPolarity,ContextType,pos,pri,rel,bch,ech,...,2ord_bch_rank_diff_top1100,2ord_bch_rank_diff_bot1100,2ord_bch_rank_diff_top1200,2ord_bch_rank_diff_bot1200,2ord_bch_rank_diff_top1300,2ord_bch_rank_diff_bot1300,2ord_bch_rank_diff_top1400,2ord_bch_rank_diff_bot1400,2ord_bch_rank_diff_top1500,2ord_bch_rank_diff_bot1500
0,5794,2,non_answer,positive,neutral,0.70,0.70,0.04,0.00,0.00,...,True,False,True,False,True,False,True,False,True,True
1,5839,2,non_answer,positive,neutral,0.50,0.70,0.03,0.20,0.12,...,True,False,True,False,True,False,True,False,True,True
2,5857,2,non_answer,positive,neutral,0.70,0.70,0.00,0.00,0.00,...,True,False,True,False,True,False,True,False,True,False
3,5866,2,non_answer,positive,neutral,0.75,0.75,0.00,0.00,0.00,...,True,False,True,False,True,False,True,False,True,False
4,5906,2,non_answer,positive,neutral,0.55,0.55,0.00,0.00,0.00,...,True,False,True,False,True,False,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2335,5927,6,exhaustive,negative,negative,0.00,0.05,1.00,0.05,0.29,...,True,False,True,False,True,True,True,True,True,True
2336,5932,6,exhaustive,negative,negative,0.00,0.09,1.00,0.09,0.44,...,False,True,False,True,True,True,True,True,True,True
2337,6013,6,exhaustive,negative,negative,0.02,0.08,1.00,0.06,0.26,...,False,True,False,True,False,True,False,True,False,True
2338,6037,6,exhaustive,negative,negative,0.00,0.10,1.00,0.10,0.47,...,False,True,False,True,False,True,False,True,False,True


# Plots

Plot the overlap between top `k` for all measures.

In [126]:
firstorder = ['bfu', 'klu', 'ech']
secondorder = ['beta_bfu', 'beta_klu', 'beta_ech']
ms = firstorder + secondorder
topk = '_rank_diff_top200'
mstopk = [m + topk for m in ms]
all_query = ' and '.join([f'`{m}`' for m in mstopk])
any_query = ' or '.join([f'`{m}`' for m in mstopk])



In [127]:
import plotly.express as px
import plotly.graph_objects as go


In [128]:

px.parallel_categories(d.query(any_query).sort_values(mstopk), mstopk)

In [129]:
def gen_query(col_tuples, connective = 'and'):
    query_parts = []
    for colname, switch in col_tuples:
        # if switch is zero, negate the column, otherwise don't
        if not switch:
            query_parts.append(f'~`{colname}`')
        else:
            query_parts.append(f'`{colname}`')
    query = f' {connective} '.join(query_parts)
    return query
gen_query([('col1', 0), ('col2', 1)])

'~`col1` and `col2`'

In [162]:
def gen_parcats(measures, k=100, top_or_bot='top', highlight_switches=[]):
    # build 'topk' column names from measure names (e.g. 'bfu' -> 'bfu_rank_diff_top100')
    type = '_rank_diff_' + top_or_bot + str(k)
    measure_cols = {m: m + type for m in measures}
    colnames = list(measure_cols.values())
    # filter the data so that only trials that are topk
    # for any measure are included
    any_query = ' or '.join([f'`{m}`' for m in colnames])
    df = d.query(any_query)[colnames]
    # use color to highlight particular cross-sections of data
    df['color'] = 0
    for i, switch in enumerate(highlight_switches):
        switch_query = gen_query(zip(colnames, switch))
        df.loc[df.eval(switch_query), 'color'] = i + 1
    # make the plot
    dimensions = [go.parcats.Dimension(values=df[colname], label=m.upper(), 
                                       categoryarray=[True, False], ticktext=['', ''], categoryorder='array') 
                  for m, colname in measure_cols.items()]
    dimensions[0].update(ticktext = [top_or_bot + str(k), 'not ' + top_or_bot + str(k)])
    fig = go.Figure(
        data = [go.Parcats(
        dimensions=dimensions,
        line={'color': df.color,},
        labelfont={'size': 16,},
        tickfont={'size': 16,},
        arrangement='freeform')]) 
    return fig


In [166]:

firstorder_rank_diff_comparison_top200 = gen_parcats(firstorder, k=500, highlight_switches=[
        [1, 0, 0],
        [1, 1, 0],
        [1, 0, 1],
        [1, 1, 1],
    ])
firstorder_rank_diff_comparison_top200.update_layout(
    title=dict(text='Comparing First Order Rank Differences', 
               x=0.5, xanchor='center', yanchor='top'))
firstorder_rank_diff_comparison_top200

In [165]:
firstorder_rank_diff_comparison_bot200 = gen_parcats(firstorder, top_or_bot='bot', k=500, highlight_switches=[
        [1, 0, 0],
        [1, 1, 0],
        [1, 0, 1],
        [1, 1, 1],
    ])
firstorder_rank_diff_comparison_bot200.update_layout(
    title=dict(text='Comparing First Order Rank Differences', 
               x=0.5, xanchor='center', yanchor='top'))
firstorder_rank_diff_comparison_bot200

# Getting text output for interesting examples

This code makes a markdown file with the top and bottom `k` examples for each measure.
(Top and bottom are relative to the rank of the `rank(measure) - rank(relevance)` for each `measure`.)

Set `k` here. Also pick which measures to collect text for.

In [8]:

k = 25
chosen_measures_for_text_output = ['bfu']
# Make sure we computed it previously
assert(k in ks)


Put it all in a big string `output` and dump it into the MD file.

In [9]:

output = ''
# collect stim text from top k and bottom k for each chosen measure
for measure in chosen_measures_for_text_output:
    topk_rowids = d[d[f'{measure}_rank_diff_top{k}'] == True]['RowID'].tolist()
    # print(topk_rowids)
    botk_rowids = d[d[f'{measure}_rank_diff_bot{k}'] == True]['RowID'].tolist()
    output += f'\n\n# Rank diff: {measure} - relevance: Top {k}\n\n'
    for row_id in topk_rowids:
        stimtext = get_stimulus(row_id, ['rel', measure])
        output += f'## {row_id}\n\n{stimtext}\n\n'

# Write to markdown file
with open(output_md, 'w') as f:
    f.write(output)