# Critic Feedback and Model Evaluation
This notebook provides the necessary scripts to filter generated explanations using an automated critic based on ground-truth references.

Additionally, it includes code to calculate scores for evaluation on validation and test splits

The subsequent code assumes the following file structure for ILLUME training. The epoch directories of each iteration contain the csvs generated on the validation split for different training epochs, whereas the ```train``` directory contains multiple subdirectories for csvs generated at different temperatures on the train split.
If one evaluation (i.e. one leaf directory) is parallelized and split into multiple csv files, it is expected that all files, when sorted alphabetically, are in the order of their respective split of the data. This is automatically taken care of when using the ```--split``` argument of ```vqax_magma_eval.py```.

```
workspace
│       
└─── repositories 
│   │
│   └─── ILLUME
│       │
│       └─── results
│           │
│           └─── vqax
│               │
│               └─── it0
│               │   │
│               │   └─── train
│               │       │                  
│               │       └─── 0.01
│               │       └─── 0.1
│               │       └─── ...
│               │
│               └─── it1 
│               │   │ 
│               │   └─── epoch1
│               │   └─── epoch2
│               │   └─── epoch...
│               │   └─── train
│               │       │                  
│               │       └─── 0.01
│               │       └─── 0.1
│               │       └─── ...
│               │
│               └─── it...
└─── datasets 
    │
    └─── COCO
        │
        └─── VQA-X
            └─── train.json
            └─── val.json
            └─── test.json
        
```

## Feedback on generated samples
Generate any number of explanations beforehand using ```vqax_magma_eval.py``` at various temperatures. 
If the file structure differs from the one discussed above this might require adjustments of the code below.

### 1) Load generated explanations into training data

In [2]:
######################
# Set this parameter to the iteration in question (use 0 for the first iteration).
# The generated explanations are read from '../results/vqax/it<iteration>/train'.
iteration = 0
#####################

import pandas as pd
import numpy as np
import os
from nlgeval import NLGEval
import re

def isfloat(str_):
    """Tell whether ``str_`` is a decimal literal with a fractional part.

    The accepted form is an optional leading minus sign, one or more digits,
    a dot and one or more digits (e.g. ``'0.3'`` or ``'-1.25'``). Strings
    without a fractional part, such as ``'1'``, are rejected.
    """
    matched = re.match(r'^-?\d+(?:\.\d+)$', str_)
    return matched is not None



# Metric scorer; the GloVe and skip-thought based metrics are switched off.
nlgeval = NLGEval(no_glove=True, no_skipthoughts=True)

# Running index over all generated-explanation columns added to df_train
expl_gen_cnt = 0


df_train = pd.read_json('/workspace/datasets/COCO/VQA-X/train.json')
df_train.reset_index(inplace=True)

root = '../results/vqax/'
it_root = os.path.join(root, f'it{iteration}', 'train')

if not os.path.isdir(it_root):
    raise FileNotFoundError(f'Directory with generated training explanations not found: {it_root}')

# Every sub-directory whose name parses as a number is treated as one sampling
# temperature. Sorting by numeric value makes the numbering of the
# gen_explanation<i> columns reproducible across machines and runs.
temps = sorted(
    (entry for entry in os.listdir(it_root)
     if os.path.isdir(os.path.join(it_root, entry)) and (entry.isnumeric() or isfloat(entry))),
    key=float,
)

# Maps the index i of column gen_explanation<i> to the temperature it was generated at
mappings = {}

for temp in temps:
    print(temp)
    temp_path = os.path.join(it_root, temp)

    # One evaluation may be split over several csv files; their alphabetical
    # order is expected to match the order of the data splits.
    file_paths = sorted(
        os.path.join(temp_path, name)
        for name in os.listdir(temp_path)
        if os.path.isfile(os.path.join(temp_path, name))
    )
    if not file_paths:
        print(f'Skipping temperature {temp}: no files found in {temp_path}')
        continue

    # Same handling for one or many files: concatenate, renumber rows from 0 and
    # make sure the rows line up one-to-one with the training data.
    df_tmp = pd.concat([pd.read_csv(path) for path in file_paths], ignore_index=True)
    assert len(df_tmp) == len(df_train), \
        f'{temp_path}: expected {len(df_train)} generated rows, found {len(df_tmp)}'
    df_tmp = df_tmp.fillna('')

    for expl_column in df_tmp.columns[df_tmp.columns.str.startswith('gen_explanation')]:
        df_train[f'gen_explanation{expl_gen_cnt}'] = df_tmp[expl_column].apply(lambda x: str(x).lower().strip())
        mappings[expl_gen_cnt] = float(temp)
        expl_gen_cnt += 1

    # NOTE: gen_answer / overlap are overwritten for every temperature, so the
    # values of the last (i.e. highest) temperature are the ones that remain.
    # Casting to str guards against answers that read_csv parsed as numbers.
    df_train['gen_answer'] = df_tmp['gen_answer'].astype(str)
    # A generated answer overlaps if it contains, or is contained in, the ground
    # truth answer. Empty generated answers never count as overlap (same rule as
    # in the validation / test evaluation).
    df_train['overlap'] = df_train.apply(
        lambda x: (max(x['multiple_choice_answer'].count(x['gen_answer']),
                       x['gen_answer'].count(x['multiple_choice_answer'])) > 0)
        and len(x['gen_answer'].strip()) > 0,
        axis=1,
    )




0.3


### 2) Calculate per sample Rouge-L score
Depending on the number of generated samples this may run for a few minutes

In [7]:
from tqdm import tqdm

# Lower-cased, stripped ground-truth explanation; used as the only reference per sample
df_train['explanation_reference'] = df_train.explanation.apply(lambda x: x.strip().lower())

# The set of generated-explanation columns does not change while scoring,
# so it is looked up once instead of once per row.
hyp_columns = list(df_train.columns[df_train.columns.str.startswith('gen_explanation')])

# scores[r][i] is the ROUGE-L score of the i-th generated explanation of row r
scores = list()

for index, e in tqdm(df_train.iterrows(), total=len(df_train)):
    # Drop empty references
    refs_clean = [ref for ref in [e.explanation_reference] if len(ref) > 0]
    scores_sample = list()
    for hyp_column in hyp_columns:
        hyp = e[hyp_column]
        # Very short hypotheses, or samples without any usable reference, score 0
        if len(hyp) < 3 or not refs_clean:
            score = 0.0
        else:
            score = nlgeval.compute_individual_metrics(refs_clean, hyp)['ROUGE_L']
        scores_sample.append(score)
    scores.append(scores_sample)

# DataFrame.insert raises if the column already exists; dropping a stale
# 'scores' column first keeps this cell re-runnable.
if 'scores' in df_train.columns:
    df_train.drop(columns='scores', inplace=True)
df_train.insert(min(12, len(df_train.columns)), 'scores', scores)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 29459/29459 [02:31<00:00, 194.33it/s]


### 3) Load the training data of the previous iteration
Include already filtered samples from previous iterations. There won't be any for iteration 0

In [8]:
# The training samples of the current iteration (written at the end of the
# previous one) are stored next to the 'train' directory.
prev_samples_path = os.path.join(it_root, '..', 'train_samples.json')
if os.path.isfile(prev_samples_path):
    # A file that exists but cannot be parsed is a real problem and must not be
    # silently treated as "no previous samples", so errors propagate here.
    df_prev = pd.read_json(prev_samples_path)
else:
    df_prev = None
    print('No previous training samples detected')

No previous training samples detected


### 4) Apply critic

Filter for explanations with a ROUGE-L score at or above the threshold

In [10]:
########################
# Set critic threshold
threshold = 0.7
#######################

from tqdm import tqdm

# Explanations kept in earlier iterations (rows flagged with `st`), grouped by
# question_id so that the per-row lookup below is a dictionary access.
# `df_prev` is None when no earlier samples exist; a DataFrame must be compared
# against None explicitly because its truth value is ambiguous.
prev_explanations = {}
if df_prev is not None:
    prev_st = df_prev.loc[df_prev.st]
    for prev_question_id, prev_explanation in zip(prev_st.question_id, prev_st.explanation_sample):
        prev_explanations.setdefault(prev_question_id, []).append(prev_explanation)

it_samples = []
# Number of training rows with at least one newly accepted explanation
cnt = 0
for idx, item in tqdm(df_train.iterrows(), total=len(df_train)):
    image_id = item['image_id']
    question = item['question']
    question_id = item['question_id']
    answer = item['multiple_choice_answer']

    # Position i in 'scores' belongs to column gen_explanation<i>
    sample_scores = np.array(item['scores'])
    accepted = np.flatnonzero(sample_scores >= threshold)
    if len(accepted) > 0:
        cnt += 1

    exps = [item[f'gen_explanation{ind}'] for ind in accepted]
    exps.extend(prev_explanations.get(question_id, []))

    # dict.fromkeys removes duplicates but keeps first-seen order, which makes
    # the order of the produced samples reproducible (a set would not).
    for exp in dict.fromkeys(exps):
        it_samples.append({'image_id': image_id, 'question_id':question_id, 'question': question, 'answer': answer, 'explanation_sample': exp})

print(f'Found at least one fitting explanation for {cnt} samples')

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 29459/29459 [00:04<00:00, 7226.44it/s]

Found at least one fitting explanation for 1184 samples





### 5) Prepare training data for next iteration
Draw additional VQA-only samples for more robust training

In [13]:
# One row per accepted (question, explanation) pair. Naming the columns
# explicitly keeps the frame well-formed (e.g. `question_id` stays addressable)
# even when the critic did not accept a single explanation.
df_it = pd.DataFrame(
    it_samples,
    columns=['image_id', 'question_id', 'question', 'answer', 'explanation_sample'],
)
# Flag these rows as samples that carry a self-generated explanation
df_it['st'] = True
print(f'Number of self-generated samples: {len(df_it)}')

Number of self-generated samples: 1367


In [16]:
########################
# Set number of additional VQA-Only Samples (We recommend roughly 10x the number of explanations samples)
vqa_samples = 10000
seed = 42
#######################

# True for every question that has no accepted explanation; only those are
# candidates for the additional VQA-only samples.
df_train['pos_sample'] = ~df_train.question_id.isin(df_it.question_id)
df_candidates = df_train.loc[df_train.pos_sample]

# Draw at most `vqa_samples` candidates (fewer if not enough are available)
df_sample = df_candidates.sample(n=min(vqa_samples, len(df_candidates)), random_state=seed)
print(f'Added {len(df_sample)} addtional VQA samples')
df_sample['st'] = False

# pd.concat replaces DataFrame.append, which is deprecated and no longer
# available in pandas 2.x.
df_st = pd.concat([df_it, df_sample])
df_st.reset_index(inplace=True, drop=True)
df_st['sample_id'] = df_st.index

print(f'Total number of training samples: {len(df_st)}')

Added 10000 addtional VQA samples
Total number of training samples: 11367


  df_st = df_it.append(df_sample)


### 6) Store training data for next iteration
This assumes the file structure mentioned above. Change if necessary

In [18]:
# Directory of the following iteration (it<iteration+1>)
next_root = f'../results/vqax/it{iteration+1}'
# Create the directory if it is missing; an already existing one is reused
os.makedirs(next_root, exist_ok=True)
# Written as 'train_samples.json' inside the directory of the next iteration
df_st.to_json(os.path.join(next_root, 'train_samples.json'))

## Evaluation on validation split

As stated above, this assumes a file structure that contains ```epoch...``` directories, which are evaluated in alphabetical order.

In [19]:
######################
# Set this parameter to the iteration in question
iteration = 1
#####################

import pandas as pd
from nlgeval import NLGEval
import os
nlgeval = NLGEval(no_glove=True, no_skipthoughts=True)


root = '../results/vqax/'
it_root = os.path.join(root, f'it{iteration}')

if not os.path.isdir(it_root):
    raise FileNotFoundError(f'Iteration directory not found: {it_root}')

# Every sub-directory with 'epoch' in its name holds the csvs of one training epoch
epochs = [x for x in os.listdir(it_root)
          if 'epoch' in x and os.path.isdir(os.path.join(it_root, x))]

# The reference data is identical for all epochs, so it is read only once and
# copied per epoch.
df_val_ref = pd.read_json('/workspace/datasets/COCO/VQA-X/val.json')
df_val_ref.reset_index(inplace=True)

print(f"\033[1mEpoch | B-1        | B-2         | B-3        | B-4         | METEOR     | ROUGE_L    | CIDEr       | Q/A Acc\033[0m")

# NOTE: plain alphabetical order, i.e. 'epoch10' is evaluated before 'epoch2'
for epoch in sorted(epochs):
    epoch_path = os.path.join(it_root, epoch)

    # One evaluation may be split over several csv files; their alphabetical
    # order is expected to match the order of the data splits.
    file_paths = sorted(
        os.path.join(epoch_path, name)
        for name in os.listdir(epoch_path)
        if os.path.isfile(os.path.join(epoch_path, name))
    )
    if not file_paths:
        print(f'Skipping epoch {epoch}')
        continue

    # Same handling for one or many files: concatenate, renumber rows from 0 and
    # make sure the rows line up one-to-one with the validation data.
    df_tmp = pd.concat([pd.read_csv(path) for path in file_paths], ignore_index=True)
    assert len(df_tmp) == len(df_val_ref), \
        f'{epoch_path}: expected {len(df_val_ref)} generated rows, found {len(df_tmp)}'
    df_tmp = df_tmp.fillna('')

    df_val = df_val_ref.copy()

    df_val['gen_answer'] = df_tmp['gen_answer'].astype(str)
    # A non-empty generated answer counts as correct if it contains, or is
    # contained in, the ground truth answer.
    df_val['overlap'] = df_val.apply(
        lambda x: (max(x['multiple_choice_answer'].count(str(x['gen_answer'])),
                       str(x['gen_answer']).count(x['multiple_choice_answer'])) > 0)
        and len(x['gen_answer'].strip()) > 0,
        axis=1,
    )

    # Copy all generated-explanation columns as gen_explanation0, gen_explanation1, ...
    expl_gen_cnt = 0
    for expl_column in df_tmp.columns[df_tmp.columns.str.startswith('gen_explanation')]:
        df_val[f'gen_explanation{expl_gen_cnt}'] = df_tmp[expl_column].apply(lambda x: str(x).lower().strip())
        expl_gen_cnt += 1

    df_val['explanation_reference'] = df_val.explanation.apply(lambda x: x.strip().lower())
    df_val['explanation_reference_2'] = df_val.explanation_2.apply(lambda x: x.strip().lower())
    df_val['explanation_reference_3'] = df_val.explanation_3.apply(lambda x: x.strip().lower())

    # NOTE(review): this scores the SECOND column whose name starts with
    # 'gen_explanation' (numbering starts at 0) - confirm against the csv schema
    # written by vqax_magma_eval.py that this is the intended column.
    hypothesis = list(df_val.gen_explanation1.values)
    references = [list(df_val.explanation_reference.values),
                  list(df_val.explanation_reference_2.values),
                  list(df_val.explanation_reference_3.values)]
    metrics_dict = nlgeval.compute_metrics(ref_list=references, hyp_list=hypothesis)
    print(f"{epoch}| {metrics_dict['Bleu_1']*100:0.2f}      | {metrics_dict['Bleu_2']*100:0.2f}       | {metrics_dict['Bleu_3']*100:0.2f}      | {metrics_dict['Bleu_4']*100:0.2f}       | {metrics_dict['METEOR']*100:0.2f}      | {metrics_dict['ROUGE_L']*100:0.2f}      | {metrics_dict['CIDEr']*100:0.2f}       | {len(df_val.loc[df_val.overlap])/len(df_val)*100:0.2f} |")



[1mEpoch | B-1        | B-2         | B-3        | B-4         | METEOR     | ROUGE_L    | CIDEr       | Q/A Acc[0m
epoch1| 42.54      | 30.44       | 20.73      | 14.06       | 14.96      | 39.52      | 44.57       | 80.66 |
[1mEpoch | B-1        | B-2         | B-3        | B-4         | METEOR     | ROUGE_L    | CIDEr       | Q/A Acc[0m
epoch2| 43.06      | 30.73       | 21.16      | 14.53       | 14.82      | 39.60      | 46.05       | 82.85 |


## Evaluation on test split

This assumes that the explanations for a single (best) epoch are in the given directory.

In [25]:
######################
# Directory of explanations generated on test split. Can contain one or multiple csv files
test_eval_path = '../results/test/ILLUME/'
#####################

import os
import pandas as pd
from nlgeval import NLGEval

df_test = pd.read_json('/workspace/datasets/COCO/VQA-X/test.json')

nlgeval = NLGEval(no_glove=True, no_skipthoughts=True)

if not os.path.isdir(test_eval_path):
    raise FileNotFoundError(f'Directory with generated test explanations not found: {test_eval_path}')

# One evaluation may be split over several csv files; their alphabetical order
# is expected to match the order of the data splits.
file_paths = sorted(
    os.path.join(test_eval_path, name)
    for name in os.listdir(test_eval_path)
    if os.path.isfile(os.path.join(test_eval_path, name))
)
if not file_paths:
    # Without any generated explanations there is nothing to evaluate
    raise FileNotFoundError(f'No files with generated explanations found in {test_eval_path}')

# Same handling for one or many files: concatenate, renumber rows from 0 and
# make sure the rows line up one-to-one with the test data.
df_tmp = pd.concat([pd.read_csv(path) for path in file_paths], ignore_index=True)
assert len(df_tmp) == len(df_test), \
    f'{test_eval_path}: expected {len(df_test)} generated rows, found {len(df_tmp)}'
df_tmp = df_tmp.fillna('')


df_test['gen_answer'] = df_tmp['gen_answer'].astype(str)
# A non-empty generated answer counts as correct if it contains, or is
# contained in, the ground truth answer.
df_test['overlap'] = df_test.apply(
    lambda x: (max(x['multiple_choice_answer'].count(str(x['gen_answer'])),
                   str(x['gen_answer']).count(x['multiple_choice_answer'])) > 0)
    and len(x['gen_answer'].strip()) > 0,
    axis=1,
)

# Copy all generated-explanation columns as gen_explanation0, gen_explanation1, ...
expl_gen_cnt = 0
for expl_column in df_tmp.columns[df_tmp.columns.str.startswith('gen_explanation')]:
    df_test[f'gen_explanation{expl_gen_cnt}'] = df_tmp[expl_column].apply(lambda x: str(x).lower().strip())
    expl_gen_cnt += 1

df_test['explanation_reference'] = df_test.explanation.apply(lambda x: x.strip().lower())
df_test['explanation_reference_2'] = df_test.explanation_2.apply(lambda x: x.strip().lower())
df_test['explanation_reference_3'] = df_test.explanation_3.apply(lambda x: x.strip().lower())


# NOTE(review): this scores the SECOND column whose name starts with
# 'gen_explanation' (numbering starts at 0) - confirm against the csv schema
# written by vqax_magma_eval.py that this is the intended column.
hypothesis = list(df_test.gen_explanation1.values)
references = [list(df_test.explanation_reference.values),
              list(df_test.explanation_reference_2.values),
              list(df_test.explanation_reference_3.values)]
metrics_dict = nlgeval.compute_metrics(ref_list=references, hyp_list=hypothesis)
print(f"\033[1m B-1       | B-2         | B-3        | B-4         | METEOR     | ROUGE_L    | CIDEr       | Q/A Acc\033[0m")
print(f"{metrics_dict['Bleu_1']*100:0.2f}      | {metrics_dict['Bleu_2']*100:0.2f}       | {metrics_dict['Bleu_3']*100:0.2f}      | {metrics_dict['Bleu_4']*100:0.2f}       | {metrics_dict['METEOR']*100:0.2f}      | {metrics_dict['ROUGE_L']*100:0.2f}      | {metrics_dict['CIDEr']*100:0.2f}       | {len(df_test.loc[df_test.overlap])/len(df_test)*100:0.2f} |")


[1m B-1       | B-2         | B-3        | B-4         | METEOR     | ROUGE_L    | CIDEr       | Q/A Acc[0m
50.32      | 37.20       | 26.62      | 19.01       | 16.52      | 44.24      | 60.18       | 85.48 |
