In [62]:
import pandas as pd
import numpy as np
import os
from pathlib import Path
import json


In [63]:
# Locate the project relative to the notebook's working directory:
# `relevance_dir` is the parent of the directory the kernel was started in.
current_dir = os.path.abspath(os.curdir)
relevance_dir = Path(current_dir).resolve().parent

# Input table and the markdown report this notebook writes.
processed_data_path = relevance_dir.joinpath(
    'qualitative-analysis', 'data', 'processed_data_with_stimuli.csv'
)
output_md = relevance_dir.joinpath('qualitative-analysis', 'extreme_examples.md')

d = pd.read_csv(processed_data_path)

In [64]:
# Preview the group id next to three of the rank columns.
d.loc[:, ['GroupID', '2ord_bch_rank_mean', 'rel_rank_mean', '2ord_bch_rank_diff_mean']]

Unnamed: 0,GroupID,2ord_bch_rank_mean,rel_rank_mean,2ord_bch_rank_diff_mean
0,2non_answerpositiveneutral,977.42,639.42,338.00
1,2non_answerpositiveneutral,977.42,639.42,338.00
2,2non_answerpositiveneutral,977.42,639.42,338.00
3,2non_answerpositiveneutral,977.42,639.42,338.00
4,2non_answerpositiveneutral,977.42,639.42,338.00
...,...,...,...,...
2335,6exhaustivenegativenegative,1502.25,2134.00,-631.75
2336,6exhaustivenegativenegative,1502.25,2134.00,-631.75
2337,6exhaustivenegativenegative,1502.25,2134.00,-631.75
2338,6exhaustivenegativenegative,1502.25,2134.00,-631.75


In [65]:
# For each measure, order the rows by "<measure>_rank_diff" and keep the RowIDs
# of the 10 largest ('top10') and 10 smallest ('bot10') differences.
# Both lists are in ascending order of the difference, so the most extreme
# 'top10' entry is last and the most extreme 'bot10' entry is first.
examples = {}
for col in ['bfu', 'klu', 'ech']:
    diff_col = f'{col}_rank_diff'
    # sort_values places NaN at the end by default, so rows with a missing
    # difference would otherwise be picked up by tail(10) as the "top 10".
    # Drop them before ranking; the frame is also sorted only once per measure.
    ranked_ids = d.dropna(subset=[diff_col]).sort_values(diff_col)['RowID']
    examples[col] = {
        'top10': ranked_ids.tail(10).tolist(),
        'bot10': ranked_ids.head(10).tolist(),
    }
examples

{'bfu': {'top10': ['12high_certaintypositiveneutral5806',
   '6low_certaintypositivenegative5863',
   '1low_certaintynegativepositive5958',
   '3low_certaintynegativepositive5806',
   '2exhaustivenegativenegative5848',
   '12high_certaintynegativenegative5848',
   '6low_certaintypositivenegative5901',
   '6non_answerpositiveneutral5955',
   '2non_answerpositivenegative5850',
   '12non_answerpositiveneutral5955'],
  'bot10': ['12exhaustivenegativeneutral6030',
   '12exhaustivenegativenegative6061',
   '12low_certaintynegativenegative6052',
   '12exhaustivenegativenegative5908',
   '12high_certaintynegativenegative5975',
   '12exhaustivenegativenegative6026',
   '12high_certaintynegativenegative5922',
   '12high_certaintynegativenegative5837',
   '8high_certaintynegativenegative5953',
   '5low_certaintypositivepositive5850']},
 'klu': {'top10': ['7high_certaintynegativepositive5844',
   '8high_certaintypositivenegative5923',
   '12high_certaintypositiveneutral5806',
   '1low_certaintyneg

In [66]:
def get_stimulus(row_id):
    """Return the 'stimulus' value of the first row of `d` whose 'RowID' equals `row_id`.

    Reads the module-level DataFrame `d`. Raises IndexError when no row matches.
    """
    matches = d.loc[d['RowID'] == row_id, 'stimulus']
    return matches.iloc[0]

In [67]:
# Spot-check: display the stimulus text stored for a single RowID.
get_stimulus('12high_certaintynegativenegative5837')

"GroupID: **12high_certaintynegativenegative**\n\nAnswerCertainty: **`high_certainty`**\n\nAnswerPolarity: **`negative`**\n\nContextType: **`negative`**\n\nBefore bed last night, you saw it was supposed to snow overnight. You wake up at 6:00am and look out the window, and see that it's sunny and there's no snow on the ground.\n\nYou ask your mother: **Is school canceled?**\n\nYour mother responds: **A school bus is coming down the road now.**"

In [68]:
# Build one markdown report with a "Bottom 10" and a "Top 10" section for every
# measure in `examples`, then write it to `output_md`.
example_text = {}
sections = []
for measure in examples:
    # Collect the stimulus text for the top 10 and bottom 10 of this measure,
    # each under its own numbered sub-heading.
    for key in ('top10', 'bot10'):
        example_text[key] = [
            '## ' + str(i) + '\n\n' + get_stimulus(row_id)
            for i, row_id in enumerate(examples[measure][key])
        ]
    # Append rather than assign: assigning inside the loop discarded every
    # measure except the last one.
    sections.append(
        f'# Rank diff: {measure} - relevance: Bottom 10 \n\n'
        + '\n\n'.join(example_text['bot10'])
    )
    sections.append(
        f'# Rank diff: {measure} - relevance: Top 10 \n\n'
        + '\n\n'.join(example_text['top10'])
    )

# A blank line between sections keeps each heading on its own line instead of
# being glued to the end of the preceding stimulus text.
output = '\n\n'.join(sections)

# Explicit encoding so the report does not depend on the platform default.
with open(output_md, 'w', encoding='utf-8') as f:
    f.write(output)