# Quantitative and Qualitative Analysis of the Outputs

## Table of Contents

1. [Imports](#imports)
2. [Helper Functions](#helper-functions)
3. [Dataset Statistics](#data)
4. [Output_Analysis](#output-analysis)
5. [Prompting Analysis](#prompting-analysis)


## 1. Imports <a name="imports"></a>

In [1]:
import json
import numpy as np
import os
import pandas as pd
import re
#import seaborn as sns

import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
%matplotlib inline

## 2. Helper Functions <a name="helper-functions"></a>

In [2]:
# empty outputs
def contains_only_white_spaces(text):
    # Define a regular expression pattern to match only white spaces
    whitespace_pattern = re.compile(r'^\s+$')

    # Search for white spaces in the text
    match = whitespace_pattern.search(text)
    return match is not None

In [3]:
# check patterns from instruction-tuning
pattern1 = 'based on the information provided'
pattern2 = 'explanation:'
pattern3 = 'logical reasoning'
pattern4 = 'reasoning:'
pattern5 = 'plausible'
pattern_unsure = 'not sure'
pattern_unrel = 'question is unrelated'

# very simple approach, does not catch orthographic mistakes
def check_pattern(pattern, text):
    return pattern in text.lower()

In [4]:
# does not catch all emojis
def count_emojis(text):
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    
    emoji_count = len(emoji_pattern.findall(text))
    return emoji_count

In [5]:
def contains_non_ascii(text):
    # checks for diacritics and non-latin characters
    return not text.isascii()

## 3. Dataset Statistics <a name="data"></a>
Show the number of instances per subcorpus and category.

In [6]:
def split_and_count_by_source_category(df):
    try:
        sub_dataframes = dict(tuple(df.groupby('Category')))
    except Exception:
        sub_dataframes = dict(tuple(df.groupby('Source')))
    for category, sub_df in sub_dataframes.items():
        # add 5 per category from shot document
        # comparable to the values reported in the TRAM paper
        yield category, len(sub_df.index) + 5

In [7]:
# dataset statistics
data_dir = 'data/'
data_stat = dict()

for o in os.listdir(data_dir):
    mcq = f'{data_dir}{o}/{o}_mcq.csv'
    df_mcq = pd.read_csv(mcq, engine='python', encoding_errors='ignore')
    data_stat[o] = {'mcq': dict()}
    for cat, num in split_and_count_by_source_category(df_mcq):
        data_stat[o]['mcq'][cat] = num
    # saq exists
    if os.path.exists(f'{data_dir}{o}/{o}_saq.csv'):
        saq = f'{data_dir}{o}/{o}_saq.csv'
        data_stat[o]['saq'] = dict()
        df_saq = pd.read_csv(saq, engine='python', encoding_errors='ignore')
        for cat, num in split_and_count_by_source_category(df_saq):
            data_stat[o]['saq'][cat] = num
        assert data_stat[o]['saq'] == data_stat[o]['mcq']

In [8]:
data_stat

{'ambiguity_resolution': {'mcq': {'Interpretation': 395,
   'Shift - Calendar': 300,
   'Shift - LT': 600,
   'Shift - MT': 1354,
   'Shift - ST': 1000}},
 'duration': {'mcq': {'Analogy Inference': 800,
   'Commonsense': 315,
   'Computation': 1500,
   'Direct Comparison': 2000,
   'Facts': 135,
   'Multi-Step Comparison': 1500,
   'Reading Comprehension': 982},
  'saq': {'Analogy Inference': 800,
   'Commonsense': 315,
   'Computation': 1500,
   'Direct Comparison': 2000,
   'Facts': 135,
   'Multi-Step Comparison': 1500,
   'Reading Comprehension': 982}},
 'arithmetic': {'mcq': {'Application': 2042,
   'Date Computation': 6000,
   'Hour Adjustment (12h)': 1500,
   'Hour Adjustment (24h)': 1500,
   'Month Shift': 140,
   'Time Computation': 980,
   'Time Zone Conversion': 500,
   'Week Identification': 1497,
   'Year Shift': 1470},
  'saq': {'Application': 2042,
   'Date Computation': 6000,
   'Hour Adjustment (12h)': 1500,
   'Hour Adjustment (24h)': 1500,
   'Month Shift': 140,
   '

## 4. Output Analysis <a name="output-analysis"></a>



In [9]:
output_dir = 'output/timellama-7b/'
# features to compute
features = ['name',
            'subcorpus',
            'mcq',
            '5-shot',
            'category',
            'num_instances',
            'average_len',
            'whitespace',
            'emoji',
            'non_ascii',
            'pattern1',
            'pattern2',
            'pattern3',
            'pattern4',
            'pattern5',
            'pattern_unsure',
            'pattern_unrel']
# dataframe for all outputs
output = pd.DataFrame(columns=features)

# subcorpora names
outputs = dict()
for o in os.listdir(output_dir):
    subcorpus = o.split('_')[0]
    with open(f'{output_dir}{o}', 'r', encoding='utf-8') as f:
        txt = f.read()
    d = json.loads(txt)
    if 'mcq' in o:
        mcq = True
    else:
        mcq = False
    if '5shot' in o:
        fshot = True
    else:
        fshot = False
    new_row = [o, subcorpus, mcq, fshot]
    generations = []
    num_instances = 0
    current_category = list(d.items())[0][1]['Category']
    for n, (i, instance) in enumerate(d.items()):
        # group generations together
        generations += instance['Outputs']
        num_instances += 1
        # next category
        if n == len(d)-1 or list(d.items())[n+1][1]['Category'] != current_category:
            avg_len = np.mean([len(g) for g in generations])
            # count number of instances that contain whitespaces only
            whitespace = sum(contains_only_white_spaces(g) for g in generations)
            # count how many of the generations contain emojis
            emoji = sum([1 for g in generations if count_emojis(g) > 0])
            non_latin = sum(contains_non_ascii(g) for g in generations)
            p1 = sum(check_pattern(pattern1, g) for g in generations)
            p2 = sum(check_pattern(pattern2, g) for g in generations)
            p3 = sum(check_pattern(pattern3, g) for g in generations)
            p4 = sum(check_pattern(pattern4, g) for g in generations)
            p5 = sum(check_pattern(pattern5, g) for g in generations)
            p_unsure = sum(check_pattern(pattern_unsure, g) for g in generations)
            p_unrel = sum(check_pattern(pattern_unrel, g) for g in generations)
            new_row += [current_category, num_instances, avg_len, whitespace, emoji,
                        non_latin, p1, p2, p3, p4, p5, p_unsure, p_unrel]
            assert len(new_row) == len(features)
            # append new row
            output = output.append(pd.Series(new_row, index=output.columns), ignore_index=True)
            new_row = [o, subcorpus, mcq, fshot]
            generations = []
            num_instances = 0
            if n != len(d)-1:  # only if not last instance in file
                current_category = list(d.items())[n+1][1]['Category']

output

Unnamed: 0,name,subcorpus,mcq,5-shot,category,num_instances,average_len,whitespace,emoji,non_ascii,pattern1,pattern2,pattern3,pattern4,pattern5,pattern_unsure,pattern_unrel
0,typical_time_saq_5shot_nc.json,typical,False,True,Commonsense,5,182.76,0,3,3,0,0,0,0,0,8,2
1,typical_time_saq_5shot_nc.json,typical,False,True,Comparison,5,329.32,0,0,0,4,3,0,2,2,1,0
2,typical_time_saq_5shot_nc.json,typical,False,True,Facts,5,258.88,0,3,12,0,0,0,0,0,11,0
3,typical_time_saq_5shot_nc.json,typical,False,True,Reading Comprehension,1,188.00,0,0,1,0,0,0,0,0,0,0
4,typical_time_mcq_0shot_nc.json,typical,True,False,Commonsense,5,260.48,0,0,3,0,9,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132,nli_saq_5shot_nc.json,nli,False,True,SNLI,5,385.76,0,1,2,9,2,0,2,6,2,0
133,causality_mcq_5shot_nc.json,causality,True,True,Cause,5,224.40,0,0,0,1,0,0,0,0,1,0
134,causality_mcq_5shot_nc.json,causality,True,True,Effect,5,272.92,0,0,2,0,0,0,0,0,0,0
135,nli_mcq_0shot_nc.json,nli,True,False,MNLI,5,304.00,0,1,6,2,0,0,0,1,2,0


In [68]:
output.to_csv('output/statistics.csv')

In [65]:
subframes = outputs.group_by('subcorpus')

Unnamed: 0,average_len
average_len,1.0


In [20]:
# transform to float
output.num_instances = output.num_instances.astype('float', errors='ignore')
output.whitespace = output.whitespace.astype('float', errors='ignore')
output.emoji = output.emoji.astype('float', errors='ignore')
output.non_ascii = output.non_ascii.astype('float', errors='ignore')
output.pattern1 = output.pattern1.astype('float', errors='ignore')
output.pattern2 = output.pattern2.astype('float', errors='ignore')
output.pattern3 = output.pattern3.astype('float', errors='ignore')
output.pattern4 = output.pattern4.astype('float', errors='ignore')
output.pattern5 = output.pattern5.astype('float', errors='ignore')
output.pattern_unsure = output.pattern_unsure.astype('float', errors='ignore')
output.pattern_unrel = output.pattern_unrel.astype('float', errors='ignore')

In [21]:
output.corr()

Unnamed: 0,num_instances,average_len,whitespace,emoji,non_ascii,pattern1,pattern2,pattern3,pattern4,pattern5,pattern_unsure,pattern_unrel
num_instances,1.0,0.112156,0.01572,0.07814,0.103484,0.062058,0.062873,0.016689,0.036241,0.028392,0.069428,0.038208
average_len,0.112156,1.0,-0.268308,0.197171,0.122067,0.307685,0.168906,-0.029962,0.206497,0.451731,-0.12146,-0.155008
whitespace,0.01572,-0.268308,1.0,-0.167055,-0.23985,0.08259,-0.134416,-0.035679,0.009171,-0.022377,0.016159,-0.000739
emoji,0.07814,0.197171,-0.167055,1.0,0.601194,-0.168562,0.035008,-0.140953,-0.063303,-0.16688,-0.106135,-0.050011
non_ascii,0.103484,0.122067,-0.23985,0.601194,1.0,-0.418628,-0.006733,-0.096576,-0.175396,-0.247785,-0.206724,-0.169
pattern1,0.062058,0.307685,0.08259,-0.168562,-0.418628,1.0,-0.173185,0.079111,0.246236,0.49014,0.277739,0.248343
pattern2,0.062873,0.168906,-0.134416,0.035008,-0.006733,-0.173185,1.0,0.25628,0.135699,0.156824,-0.26334,-0.196627
pattern3,0.016689,-0.029962,-0.035679,-0.140953,-0.096576,0.079111,0.25628,1.0,-0.082256,0.049474,-0.084192,-0.086721
pattern4,0.036241,0.206497,0.009171,-0.063303,-0.175396,0.246236,0.135699,-0.082256,1.0,0.284129,-0.038637,0.02629
pattern5,0.028392,0.451731,-0.022377,-0.16688,-0.247785,0.49014,0.156824,0.049474,0.284129,1.0,-0.049927,-0.106267


In [69]:
# subcorpora with whitespace-only outputs
output[output['whitespace'] != 0]

Unnamed: 0,name,subcorpus,mcq,5-shot,category,num_instances,average_len,whitespace,emoji,non_ascii,pattern1,pattern2,pattern3,pattern4,pattern5,pattern_unsure,pattern_unrel
16,frequency_mcq_5shot_nc.json,frequency,True,True,Commonsense,5,214.96,2,0,0,6,0,0,0,0,6,2
39,duration_mcq_5shot_nc.json,duration,True,True,Analogy Inference,5,248.84,1,0,1,4,0,0,2,0,2,0
79,storytelling_mcq_5shot_nc.json,storytelling,True,True,SCT,5,151.48,2,0,0,0,0,0,0,0,0,0
89,typical_time_mcq_5shot_nc.json,typical,True,True,Commonsense,5,173.24,1,0,3,0,0,0,0,1,3,0
109,relation_mcq_5shot_nc.json,relation,True,True,TempEval-3,5,109.0,1,0,0,0,0,0,0,0,3,0


In [23]:
# not all instances of a category processed
output[output['num_instances'] != 5]

Unnamed: 0,name,subcorpus,mcq,5-shot,category,num_instances,average_len,whitespace,emoji,non_ascii,pattern1,pattern2,pattern3,pattern4,pattern5,pattern_unsure,pattern_unrel
3,typical_time_saq_5shot_nc.json,typical,False,True,Reading Comprehension,1.0,188.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
output.describe()

Unnamed: 0,num_instances,average_len,whitespace,emoji,non_ascii,pattern1,pattern2,pattern3,pattern4,pattern5,pattern_unsure,pattern_unrel
count,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0,137.0
mean,4.970803,250.482336,0.051095,1.948905,6.094891,1.408759,1.430657,0.036496,0.255474,0.226277,2.576642,0.576642
std,0.341743,47.946583,0.279736,2.146547,4.237236,1.95371,1.958374,0.18821,0.606693,0.685916,3.194026,1.298883
min,1.0,109.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,5.0,218.76,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,5.0,252.2,0.0,1.0,6.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0
75%,5.0,277.68,0.0,3.0,10.0,2.0,2.0,0.0,0.0,0.0,4.0,1.0
max,5.0,385.76,2.0,11.0,17.0,9.0,9.0,1.0,3.0,6.0,17.0,7.0


In [None]:
# analyse per category

## 5. Prompting Analysis <a name="prompting-analysis"></a>

Compare the effect of different question types and prompting scenarios.

In [12]:
# compare 0-shot and 5-shot scenarios
shots = dict(tuple(output.groupby('5-shot')))
shots

{False:                                name     subcorpus    mcq 5-shot  \
 4    typical_time_mcq_0shot_nc.json       typical   True  False   
 5    typical_time_mcq_0shot_nc.json       typical   True  False   
 6    typical_time_mcq_0shot_nc.json       typical   True  False   
 7    typical_time_mcq_0shot_nc.json       typical   True  False   
 20   storytelling_mcq_0shot_nc.json  storytelling   True  False   
 ..                              ...           ...    ...    ...   
 128           nli_saq_0shot_nc.json           nli  False  False   
 129      ordering_saq_0shot_nc.json      ordering  False  False   
 130      ordering_saq_0shot_nc.json      ordering  False  False   
 135           nli_mcq_0shot_nc.json           nli   True  False   
 136           nli_mcq_0shot_nc.json           nli   True  False   
 
                   category num_instances  average_len whitespace emoji  \
 4              Commonsense             5       260.48          0     0   
 5               Comparis

In [None]:
# compare multiple-choice (mcq) and short-answer (saq) questions
qtypes = dict(tuple(output.groupby('mcq')))