In [2]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
import json


In [3]:
# Locate the project: `relevance_dir` is the parent of the current working
# directory, and both the input CSV and the markdown report live under its
# `qualitative-analysis` folder.
current_dir = os.path.abspath('.')
relevance_dir = Path(current_dir).resolve().parents[0]

processed_data_path = relevance_dir.joinpath(
    'qualitative-analysis', 'data', 'processed_data_with_stimuli.csv'
)
output_md = relevance_dir.joinpath('qualitative-analysis', 'extreme_examples.md')

# Load the processed data; the following cells read it through the name `d`.
d = pd.read_csv(filepath_or_buffer=processed_data_path)

In [4]:
# Preview the group id next to the mean-rank columns (shown as the cell output).
d.loc[:, [
    'GroupID',
    '2ord_bch_rank_mean',
    'rel_rank_mean',
    '2ord_bch_rank_diff_mean',
]]

In [5]:
def _extreme_row_ids(frame, column, n=10):
    """Return the RowIDs with the largest and smallest values of ``column``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain ``column`` and a ``RowID`` column.
    column : str
        Name of the column to rank by.
    n : int, default 10
        Number of rows to keep at each extreme.

    Returns
    -------
    dict
        ``{'top10': [...], 'bot10': [...]}``. Both lists are ordered by
        ascending ``column`` value, so the most extreme 'top10' entry is last
        and the most extreme 'bot10' entry is first.
    """
    # Drop rows with a missing value first: sort_values() places NaN last by
    # default, so tail(n) would otherwise report them as the "top" examples.
    # Sorting once (instead of once per extreme) is enough for both slices.
    ranked_ids = frame.dropna(subset=[column]).sort_values(column)['RowID']
    return {
        'top10': ranked_ids.tail(n).tolist(),
        'bot10': ranked_ids.head(n).tolist(),
    }


# Top 10 and bottom 10 RowIDs by `<measure>_rank_diff`, for each measure.
examples = {
    col: _extreme_row_ids(d, f'{col}_rank_diff')
    for col in ['bfu', 'klu', 'ech']
}
examples

In [6]:
def get_stimulus(row_id, data=None):
    """Return the ``stimulus`` value of the first row whose ``RowID`` equals ``row_id``.

    Parameters
    ----------
    row_id
        Value compared against the ``RowID`` column with ``==``.
    data : pandas.DataFrame, optional
        Frame with ``RowID`` and ``stimulus`` columns. Defaults to the
        notebook-level frame ``d``, which keeps existing one-argument calls
        working unchanged.

    Returns
    -------
    The ``stimulus`` entry of the first matching row.

    Raises
    ------
    IndexError
        If no row has the requested ``RowID``.
    """
    frame = d if data is None else data
    matches = frame.loc[frame['RowID'] == row_id, 'stimulus']
    if matches.empty:
        # Same exception type as indexing an empty array, but with a message
        # that says which id was not found.
        raise IndexError(f'no row with RowID == {row_id!r}')
    return matches.values[0]

In [7]:
# Spot-check the lookup helper on a single RowID; the result is the cell output.
get_stimulus(row_id='12high_certaintynegativenegative5837')

In [8]:
# Build one markdown document: for every measure, a "Bottom 10" section
# followed by a "Top 10" section, each listing the stimulus texts under
# numbered `## <i>` sub-headings (numbering starts at 0).
output = ''
example_text = {}
for measure in examples.keys():
    # Stimulus text for both extremes of this measure, keyed like `examples`.
    example_text = {
        bucket: ['## ' + str(i) + '\n\n' + get_stimulus(row_id)
                 for i, row_id in enumerate(examples[measure][bucket])]
        for bucket in ('top10', 'bot10')
    }
    output += f'\n\n# Rank diff: {measure} - relevance: Bottom 10 \n\n' + '\n\n'.join(example_text['bot10'])
    output += f'\n\n# Rank diff: {measure} - relevance: Top 10 \n\n' + '\n\n'.join(example_text['top10'])

# Write with an explicit encoding: the platform default may not be able to
# represent non-ASCII characters in the stimulus text.
with open(output_md, 'w', encoding='utf-8') as f:
    f.write(output)