#### Importing packages

In [9]:
# Load python packages

import copy
import json
import jsonlines
import krippendorff

import matplotlib.pyplot as plt
import nltk
import numpy as np
import os
import pandas as pd
import pymongo
import scipy.stats as stats
import seaborn as sns
import statsmodels.api as sm

from collections import Counter
from datetime import datetime
from scipy.stats import chi2_contingency, kendalltau, pearsonr, spearmanr
from statsmodels.formula.api import ols
from statsmodels.stats.inter_rater import fleiss_kappa


In [6]:
import logging

logger = logging.getLogger()
logger.setLevel(logging.INFO)
logging.info("test")


INFO:root:test


## Overview 
This notebook calculates the worker agreement scores and correlations. 


1. <strong>Load annotations for a task</strong> 

2. <strong>Run analysis for agreement among workers</strong> 

3. <strong>Run analysis for correlations among collected data</strong> 


Set the following variables first:


In [7]:
# Set one of the following options: "table_annotation", "adjusted_claim_annotation"
task_type = "chart_filtering"

# Load annotations into pd.Dataframe 'df' 
df = pd.read_pickle("")

# Set path to PubHealthTab dataset (dataset.jsonl) 
path_pubhealthtab = ""


FileNotFoundError: [Errno 2] No such file or directory: ''

-------------------------------

#### Preprocess dataset

In [96]:
# Set the task to be postprocessed: "claim_explanation_verification", "chart_filtering"
task = "claim_explanation_verification"

# Load file with annotations into pandas.DataFrame OR if annotations saved in DB, create client and load data, example below 
df = pd.DataFrame()

# PROJECT_PATH = r"/Users/user/Library/CloudStorage/OneDrive-King'sCollegeLondon/PycharmProjects/chartfc_dataset_wikicommons"
# PATH_MONGODB_CREDENTIALS = os.path.join(PROJECT_PATH, 'config/mongodb_credentials.json')
# PATH_BANNED_WORKERS = os.path.join(PROJECT_PATH, 'config/banlist.json')

# PATH_BANNED_WORKERS = os.path.join(PROJECT_PATH, 'config/banlist_claim_explanation_generation.json')
PATH_BANNED_WORKERS = os.path.join(PROJECT_PATH, 'config/banlist_claim_explanation_verification.json')



--------

In [98]:
# Load annotation data from MongoDB 

with open(PATH_MONGODB_CREDENTIALS,'r') as f:
    mongodb_credentials = json.load(f)

db_client = pymongo.MongoClient(mongodb_credentials["connection_string"])
db = db_client['chartfc']

if task == "chart_filtering":
    hit_result_collection = db.hit_results
elif task == "claim_explanation_verification":
    hit_result_collection = db.hit_results_claim_explanation_verification

cursor = hit_result_collection.find({})
df = pd.DataFrame(list(cursor))

print(f"Number of samples in 'hit_result_collection' is {len(df)}")


Number of samples in 'hit_result_collection' is 1869


In [99]:
df.head(3)

Unnamed: 0,_id,batch_id,type,references,taskSet_id,hit,timestamp,answers
0,3THR0FZ96WRCINKMS5SPWCYMLAXLON,4e982d1c-42e8-48d5-a3cd-e893899dbd2c,claim_explanation_verification,"[{'db_id': '3J94SKDELPN7C1QCA9Y24TQQVRJD5T', '...",085847c3-da58-46fa-8fd0-3ce78f6dcfda,"{'HITId': '3THR0FZ96WRCINKMS5SPWCYMLAXLON', 'H...",2023-04-27 12:47:33.500,[]
1,3YOAVL4CB7FSJJR8OM4PQT3FJSXZ42,ec997789-b359-467c-8e94-42074979d084,claim_explanation_verification,"[{'db_id': '34OWYT6U43F7LZ8A5DCU76B59XHI98', '...",e971fd49-3e31-4b2b-99d7-96f4f2761063,"{'HITId': '3YOAVL4CB7FSJJR8OM4PQT3FJSXZ42', 'H...",2023-04-28 10:35:33.226,[]
2,3QGTX7BCIW0AF2V82RNZVL9TJOSZ57,ec997789-b359-467c-8e94-42074979d084,claim_explanation_verification,"[{'db_id': '30UZJB2PPOA9U9OFHKSQRK4E4D653X', '...",58238466-7d19-45c2-8f54-85a1e0beb9dc,"{'HITId': '3QGTX7BCIW0AF2V82RNZVL9TJOSZ57', 'H...",2023-04-28 10:35:34.507,"[{'worker_id': 'A22IXLFA45IJJ2', 'assignment_i..."


### Create dataframe

In [7]:
# load list of banned workers to exclude them
with open(PATH_BANNED_WORKERS,'r') as f:
    banlist = json.load(f)

# load list of rejected assignments to exclude them (if any exist)
# rejected_assignments = pd.read_excel("filled_answer_df.xlsx")
# rejected_assignments = list(rejected_assignments[rejected_assignments["reject"]==1].assignment_id)

print(f"Number of banned workers is {len(banlist)}.")
# print(f"Number of rejected assignments is {len(rejected_assignments)}.")


Number of banned workers is 12.


In [100]:
cols = list(df.columns)
cols.remove("type")
cols.remove("timestamp")
# cols.remove("answers")
cols.extend(['worker_id', 'outputs', 'times', 'events', 'feedback'])

annotations_df = pd.DataFrame(columns = cols)
counter_skip = 0
index = 0
for i, row in df.iterrows(): 
    if task != "claim_generation" and (type(row["answers"])!=list or len(row["answers"]) < 3): # we only consider df entries with completed assignments 3/3
        counter_skip += 1
        continue 

#     if type(row["answers"])!=list or len(row["answers"]) == 0: # we only consider df entries with completed assignments 3/3
#         counter_skip += 1
#         continue 
    
    for worker_answer in row["answers"]:
        #if worker_answer["worker_id"] in banlist or worker_answer["assignment_id"] in rejected_assignments:
        #    print("Skipped because worker is banned or assignment has been rejected.")
        #    counter_skip += 1
        #    continue
        
        row["worker_id"] = worker_answer["worker_id"]
        annotations_df.at[index, "_id"] = row["_id"]
        annotations_df.at[index, "batch_id"] = row["batch_id"]
        annotations_df.at[index, "references"] = row["references"]
        annotations_df.at[index, "taskSet_id"] = row["taskSet_id"]
        annotations_df.at[index, "hit"] = row["hit"]
        
        annotations_df.at[index, "worker_id"] = worker_answer["worker_id"]
        annotations_df.at[index, "assignment_id"] = worker_answer["assignment_id"]
        annotations_df.at[index, "outputs"] = worker_answer["values"]["outputs"]
        annotations_df.at[index, "times"] = worker_answer["values"]["times"]
        annotations_df.at[index, "events"] = worker_answer["values"]["events"]
        annotations_df.at[index, "feedback"] = worker_answer["values"]["feedback"]
        index += 1

print(len(annotations_df))
counter_skip


5328


93

In [101]:
annotations_df.head(3)

Unnamed: 0,_id,batch_id,references,taskSet_id,hit,answers,worker_id,outputs,times,events,feedback,assignment_id
0,3HEA4ZVWWKKC8OBJV3GJ6H2YIVF55D,83418db6-f464-48d4-b513-755f6b6df424,"[{'db_id': '37Y5RYYI1W3MG9T4D5CMRB3APX9XSV', '...",f80e02a2-38db-4285-937d-647672503dd1,"{'HITId': '3HEA4ZVWWKKC8OBJV3GJ6H2YIVF55D', 'H...",,ASLGN9PS93UYR,"[{'label_claim': 0, 'label_explanation': 0, 'h...","[12691, 23636, 114491, 5881, 5550, 5160, 7269]","[{'timestamp': '2023-05-05T15:01:51.375Z', 'ty...",GOOD,32VNZTT0AF2M3C3B47FESAX6KKFR4G
1,3HEA4ZVWWKKC8OBJV3GJ6H2YIVF55D,83418db6-f464-48d4-b513-755f6b6df424,"[{'db_id': '37Y5RYYI1W3MG9T4D5CMRB3APX9XSV', '...",f80e02a2-38db-4285-937d-647672503dd1,"{'HITId': '3HEA4ZVWWKKC8OBJV3GJ6H2YIVF55D', 'H...",,A32YOD79DL837P,"[{'label_claim': 0, 'label_explanation': 0, 'h...","[6947, 3974, 4021, 4389, 3470, 5455, 16126]","[{'timestamp': '2023-05-06T02:36:30.258Z', 'ty...",good,3EG49X3512AKFVT1WUELEQ38HV8X6B
2,3HEA4ZVWWKKC8OBJV3GJ6H2YIVF55D,83418db6-f464-48d4-b513-755f6b6df424,"[{'db_id': '37Y5RYYI1W3MG9T4D5CMRB3APX9XSV', '...",f80e02a2-38db-4285-937d-647672503dd1,"{'HITId': '3HEA4ZVWWKKC8OBJV3GJ6H2YIVF55D', 'H...",,A2Z0OH990725SA,"[{'label_claim': 0, 'label_explanation': 0, 'h...","[11045, 5807, 4454, 4311, 4621, 3710, 7438]","[{'timestamp': '2023-05-06T04:35:29.926Z', 'ty...",GOOD,304SM51WAB2IPQOXYBP6QY88HI5SBA


In [104]:
# | worker | claim id | label |

if task in ["chart_filtering"]:
    output_df = pd.DataFrame(columns=["HIT_id", "batch_id", "taskSet_id", "worker_id", "chart_id", "chart_img", "caption",
                                      "label", "header", "events"])

    index = 0
    for i, row in annotations_df.iterrows():
        for j in range(len(row["references"])):
            if row["references"][j]["g_id"]!=-1:
                continue

            output_df.at[index, "HIT_id"] = row["_id"]
            output_df.at[index, "batch_id"] = row["batch_id"]
            output_df.at[index, "taskSet_id"] = row["taskSet_id"]
            output_df.at[index, "worker_id"] = row["worker_id"]

            output_df.at[index, "chart_id"] = row["references"][j]["db_id"]
            output_df.at[index, "chart_img"] = row["references"][j]["chart_img"]
            output_df.at[index, "caption"] = row["references"][j]["caption"]

            output_df.at[index, "label"] = row["outputs"][j]["label"]
            output_df.at[index, "header"] = row["outputs"][j]["header"]
            output_df.at[index, "events"] = row["events"][j]
            index += 1

    print(len(output_df))
    
elif task == "claim_explanation_generation": 
    
    output_df = pd.DataFrame(columns=["HIT_id", "batch_id", "taskSet_id", "worker_id", "chart_id", "chart_img", "caption",
                                      "claim_text_support", "explanation_claim_text_support", "claim_text_refute",
                                      "explanation_claim_text_refute", "events"])
    index = 0
    for i, row in annotations_df.iterrows():
        for j in range(len(row["references"])):
            try:
                output_df.at[index, "HIT_id"] = row["_id"]
                output_df.at[index, "batch_id"] = row["batch_id"]
                output_df.at[index, "taskSet_id"] = row["taskSet_id"]
                output_df.at[index, "worker_id"] = row["worker_id"]

                output_df.at[index, "chart_id"] = row["references"][j]["db_id"]
                output_df.at[index, "chart_img"] = row["references"][j]["chart_img"]
                output_df.at[index, "caption"] = row["references"][j]["caption"]

                output_df.at[index, "claim_text_support"] = row["outputs"][j]["claim_text_support"]
                output_df.at[index, "explanation_claim_text_support"] = row["outputs"][j]["explanation_claim_text_support"]
                output_df.at[index, "claim_text_refute"] = row["outputs"][j]["claim_text_refute"]
                output_df.at[index, "explanation_claim_text_refute"] = row["outputs"][j]["explanation_claim_text_refute"]

                output_df.at[index, "events"] = row["events"][j]

                index += 1  
            except Exception: 
                print(f"Error for index {i}")
                continue 

    print(f"Length of dataset: {len(output_df)}")
    print(f"Number of claims: {len(output_df)*2}")
    
elif task == "claim_explanation_verification": 
    output_df = pd.DataFrame(columns=["HIT_id", "batch_id", "taskSet_id", "worker_id", "chart_id", 
                                      "chart_img", "caption", "claim_original", "explanation_original", 
                                      "claim_rewritten", "explanation_rewritten", "label_claim", 
                                      "label_explanation", "events"])
    index = 0
    for i, row in annotations_df.iterrows():
        for j in range(len(row["references"])):
            try:
                if row["references"][j]["g_id_claim"]>-1: 
                    continue
                
                output_df.at[index, "HIT_id"] = row["_id"]
                output_df.at[index, "batch_id"] = row["batch_id"]
                output_df.at[index, "taskSet_id"] = row["taskSet_id"]
                output_df.at[index, "worker_id"] = row["worker_id"]

                output_df.at[index, "chart_id"] = row["references"][j]["db_id"]
                output_df.at[index, "chart_img"] = row["references"][j]["chart_img"]
                output_df.at[index, "caption"] = row["references"][j]["caption"]

                output_df.at[index, "claim_original"] = row["references"][j]["claim"]
                output_df.at[index, "explanation_original"] = row["references"][j]["explanation"]
                
                output_df.at[index, "label"] = row["references"][j]["label"]

                output_df.at[index, "claim_rewritten"] = row["outputs"][j]["claim_rewritten"]
                output_df.at[index, "explanation_rewritten"] = row["outputs"][j]["explanation_rewritten"]
                output_df.at[index, "label_claim"] = row["outputs"][j]["label_claim"]
                output_df.at[index, "label_explanation"] = row["outputs"][j]["label_explanation"]

                output_df.at[index, "events"] = row["events"][j]
                index += 1  
                
            except Exception as e: 
                print(f"Error {e} for index {i}")
                continue 

    print(f"Length of dataset verification: {len(output_df)}")
#     print(f"Number of claims: {len(output_df)*2}")
    


Error 'g_id_claim' for index 0
Error 'g_id_claim' for index 0
Error 'g_id_claim' for index 0
Error 'g_id_claim' for index 0
Error 'g_id_claim' for index 0
Error 'g_id_claim' for index 0
Error 'g_id_claim' for index 0
Error 'g_id_claim' for index 1
Error 'g_id_claim' for index 1
Error 'g_id_claim' for index 1
Error 'g_id_claim' for index 1
Error 'g_id_claim' for index 1
Error 'g_id_claim' for index 1
Error 'g_id_claim' for index 1
Error 'g_id_claim' for index 2
Error 'g_id_claim' for index 2
Error 'g_id_claim' for index 2
Error 'g_id_claim' for index 2
Error 'g_id_claim' for index 2
Error 'g_id_claim' for index 2
Error 'g_id_claim' for index 2
Error 'g_id_claim' for index 3
Error 'g_id_claim' for index 3
Error 'g_id_claim' for index 3
Error 'g_id_claim' for index 3
Error 'g_id_claim' for index 3
Error 'g_id_claim' for index 3
Error 'g_id_claim' for index 3
Error 'g_id_claim' for index 4
Error 'g_id_claim' for index 4
Error 'g_id_claim' for index 4
Error 'g_id_claim' for index 4
Error 'g

In [105]:
len(output_df)

26550

## Agreement scores

Agreement scores: 
* Krippendorf's alpha: works with nominal, ordinal, and interval data by
* Fleiss' kappa: categorical data
* Randolph's kappa: also categorical data; BUT to avoid the "high agreement, low kappa paradox" [2], Fleiss' kappa is known to be prone to when the true class distribution of the data is unbalanced [1]


Other agreement scores: 
* Scott's π => is equivalent to Fleiss' Kappa but for more than two judges 


In [137]:
def reliability_matrix_for_kripp_alpha(df: pd.DataFrame, label_column):
    """Creates reliability matrix for calculation of Krippendorf's alpha"""
    df = df[['worker_id', 'chart_id', label_column]].groupby(['worker_id', 'chart_id']).agg(np.max).reset_index()
    df = df.pivot(index = 'worker_id', columns = 'chart_id', values = label_column).fillna(np.nan)

    return df

def reliability_matrix_for_kripp_alpha_task_three(df: pd.DataFrame, label_column, index_column):
    """Creates reliability matrix for calculation of Krippendorf's alpha"""
    df = df[['worker_id', index_column, label_column]].groupby(['worker_id', index_column]).agg(np.max).reset_index()
    df = df.pivot(index = 'worker_id', columns = index_column, values = label_column).fillna(np.nan)

    return df

def reliability_matrix_for_fleiss_kappa(df: pd.DataFrame, label_column):
    """Creates reliability matrix for calculation of Fleiss kappa"""
    
    df = df[['chart_id', label_column]]
    df['count'] = 1
    df = df.groupby(['chart_id', label_column]).sum().reset_index()
    df = df.pivot(index = 'chart_id', columns = label_column, values = 'count').fillna(0)
    df = df[df.apply(lambda x : sum(x) == 3.0, axis=1)]

    return df

def reliability_matrix_for_fleiss_kappa_task_three(df: pd.DataFrame, label_column, index_column):
    """Creates reliability matrix for calculation of Fleiss kappa"""
    
    df = df[[index_column, label_column]]
    df['count'] = 1
    df = df.groupby([index_column, label_column]).sum().reset_index()
    df = df.pivot(index = index_column, columns = label_column, values = 'count').fillna(0)
    df = df[df.apply(lambda x : sum(x) == 3.0, axis=1)]

    return df


### F-Kappa

##### TASK 1

In [93]:
fleiss_df = reliability_matrix_for_fleiss_kappa(output_df.copy(), label_column="label")

fleiss_kappa_val = fleiss_kappa(fleiss_df.values, method = 'fleiss')
print('Fleiss\' kappa is {}.'.format(fleiss_kappa_val))


Fleiss' kappa is 0.30348701951423984.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['count'] = 1


##### TASK 3 (claim)

In [131]:
fleiss_df = reliability_matrix_for_fleiss_kappa_task_three(output_df.copy(), index_column="claim_original",
                                                           label_column = "label_claim")
fleiss_kappa_val = fleiss_kappa(fleiss_df.values, method = 'fleiss')
print('Fleiss\' kappa is {}.'.format(fleiss_kappa_val))


Fleiss' kappa is 0.3221175382619831.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['count'] = 1


##### TASK 3 (explanation)

In [133]:
fleiss_df = reliability_matrix_for_fleiss_kappa_task_three(output_df.copy(), index_column="explanation_original",
                                                           label_column = "label_explanation")
fleiss_kappa_val = fleiss_kappa(fleiss_df.values, method = 'fleiss')
print('Fleiss\' kappa is {}.'.format(fleiss_kappa_val))


Fleiss' kappa is 0.289801706908303.


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['count'] = 1


### R-Kappa

Arguments for Randolph's kappa additionally to Fleiss lappa: 
* Avoid the high agreement, low kappa paradox [2]
* I.e. a high value of observed agreement p, can be drastically lowered by a substantial imbalance of classes in the dataset
* Although raters have a high agreement => can result in low Fleiss kappa 
* Fleiss kappa makes assumptions about the distribution of classes => problematic if imbalance given [2]


##### TASK 1

In [94]:
randolph_df = reliability_matrix_for_fleiss_kappa(output_df.copy())

randolph_kappa_val = fleiss_kappa(randolph_df.values, method = 'randolph')
print('Randolph\'s kappa is {}'.format(randolph_kappa_val))


Randolph's kappa is 0.6623767026773133


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['count'] = 1


##### TASK 3 (claim)

In [135]:
randolph_df = reliability_matrix_for_fleiss_kappa_task_three(output_df.copy(), index_column="claim_original",
                                                             label_column = "label_claim")
randolph_kappa_val = fleiss_kappa(randolph_df.values, method = 'randolph')
print('Randolph\'s kappa is {}'.format(randolph_kappa_val))


Randolph's kappa is 0.6145181476846058


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['count'] = 1


##### TASK 3 (explanation)

In [136]:
randolph_df = reliability_matrix_for_fleiss_kappa_task_three(output_df.copy(), index_column="explanation_original",
                                                             label_column = "label_explanation")
randolph_kappa_val = fleiss_kappa(randolph_df.values, method = 'randolph')
print('Randolph\'s kappa is {}'.format(randolph_kappa_val))


Randolph's kappa is 0.5138699408822194


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['count'] = 1


### K-Alpha [4]
* Perfect agreement if K-alpha = 1
* Alpha = 0 if observed disagreement is equal to disagreement which would result if labels are chosen randomly 
* K-alpha applicable to: 
 - Any number of observers, not just two
 - Any number of categories, scale values, or measures
 - Any metric or level of measurement (nominal, ordinal, interval, ratio, and more)
 - Incomplete or missing data
 - Large and small sample sizes alike, not requiring a minimum
 
Arguments for Krippendorf's alpha additionally to Fleiss Kappa: 
* Can handle missing/incomplete data!
* Can handle dataset of different size


##### TASK 1

In [95]:
kripp_df = reliability_matrix_for_kripp_alpha(output_df.copy())
kalpha = krippendorff.alpha(kripp_df.values, level_of_measurement='nominal')
print('Krippendorff\'s alpha  {}'.format(kalpha))

kripp_df.head(3)


Krippendorff's alpha  0.30353196132161164


chart_id,6390685299c9ed38aa87098d,6390685299c9ed38aa87098e,6390685299c9ed38aa87098f,6390685299c9ed38aa870990,6390685299c9ed38aa870991,6390685299c9ed38aa870992,6390685299c9ed38aa870993,6390685299c9ed38aa870994,6390685299c9ed38aa870995,6390685299c9ed38aa870996,...,6390687199c9ed38aa871291,6390687199c9ed38aa871292,6390687199c9ed38aa871293,6390687199c9ed38aa871294,6390687199c9ed38aa871295,6390687199c9ed38aa871297,6390687199c9ed38aa871298,6390687199c9ed38aa871299,6390687199c9ed38aa87129a,6390687199c9ed38aa87129b
worker_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A102RSV009OCUW,,,,,,,,,,,...,,,,,,,,,,
A10AJ59UPJFRYL,,,,,,,,,,,...,,,,,,,,,,
A10KXF6PCBWDJN,,,,,,,,,,,...,,,,,,,,,,


##### TASK 3 (claim)

In [140]:
kripp_df = reliability_matrix_for_kripp_alpha_task_three(output_df.copy(), index_column="claim_original",
                                                         label_column = "label_claim")
kalpha = krippendorff.alpha(kripp_df.values, level_of_measurement='nominal')
print('Krippendorff\'s alpha  {}'.format(kalpha))


Krippendorff's alpha  0.32331952981374545


##### TASK 3 (explanation)

In [141]:
kripp_df = reliability_matrix_for_kripp_alpha_task_three(output_df.copy(), index_column="explanation_original",
                                                         label_column = "label_explanation")
kalpha = krippendorff.alpha(kripp_df.values, level_of_measurement='nominal')
print('Krippendorff\'s alpha  {}'.format(kalpha))


Krippendorff's alpha  0.2905870255024222


## Correlation 

"Correlation" measures used should depend on the type of variables being investigated:
* continuous variable v continuous variable: use "traditional" correlation - e.g. Spearman's rank correlation or Pearson's linear correlation.
* continuous variable v categorical variable: use an ANOVA F-test / difference of means
* categorical variable v categorical variable: use Chi-square / Cramer's V


### Correlation discrete variables
* Pair-wise correlation: Pearson's r, Kendall's τ, or Spearman's \rho 


In [None]:
# Load final dataset

dataset = []
with jsonlines.open(path_pubhealthtab) as reader:
    for line in reader: 
        dataset.append(line)
    
print(f"{len(dataset)} total entries in dataset.")

# convert dataset into pd.DataFrame
dataset_df = pd.DataFrame(dataset)
dataset_df.head(3)


In [None]:
corr_df = pd.DataFrame(columns = ["claim_len", "table_len"])
# corr_df = pd.DataFrame(columns = ["claim_len", "header_rationale_len", "table_len"])

corr_df['claim_len'] = [len(nltk.word_tokenize(x)) for x in dataset_df['claim']]
# corr_df['header_rationale_len'] = [len(x) for x in dataset_df['header_rationale']]
corr_df['table_len'] = [len(x["rows"]) for x in dataset_df['table']]

corr_df.head(3)


#### Pearson

In [None]:
# Correlation matrix

correlation_mat = corr_df.corr() # default method = pearson's
sns.heatmap(correlation_mat, annot = True)
plt.show()


In [None]:
# Test for significance 

for col in list(corr_df.columns):
    p_val = round(pearsonr(corr_df["claim_len"], corr_df[col])[1], 3)
    
    if p_val < 0.05: 
        print(f"The correlation coeff. between 'claim_len' and '{col}' is stat. significant (p-value = {p_val}).")
    else: 
        print(f"The correlation coeff. between 'claim_len' and '{col}' is NOT stat. significant (p-value = {p_val}).")
    

#### Kendall

In [None]:
correlation_mat = corr_df.corr(method="kendall")
sns.heatmap(correlation_mat, annot = True)
plt.show()


In [None]:
# Test for significance 

for col in list(corr_df.columns):
    p_val = round(kendalltau(corr_df["claim_len"], corr_df[col])[1], 2)
    
    if p_val < 0.05: 
        print(f"The correlation coeff. between 'claim_len' and '{col}' is stat. significant (p-value = {p_val}).")
    else: 
        print(f"The correlation coeff. between 'claim_len' and '{col}' is NOT stat. significant (p-value = {p_val}).")


#### Spearman

In [None]:
correlation_mat = corr_df.corr(method="spearman")
sns.heatmap(correlation_mat, annot = True)
plt.show()


In [None]:
# Test for significance 

for col in list(corr_df.columns):
    p_val = round(spearmanr(corr_df["claim_len"], corr_df[col])[1], 2)
    
    if p_val < 0.05: 
        print(f"The correlation coeff. between 'claim_len' and '{col}' is stat. significant (p-value = {p_val}).")
    else: 
        print(f"The correlation coeff. between 'claim_len' and '{col}' is NOT stat. significant (p-value = {p_val}).")


### Correlation categorical variables
* Chi-square test (2 categorical variables)


In [None]:
corr_df['label'] = dataset_df["label"]
corr_df['has_table_caption'] = [1 if x["caption"] else 0 for x in dataset_df['table']]
corr_df['has_table_header'] = [1 if (x["header_horizontal"] and len(x["header_horizontal"])>0) or (x["header_vertical"] and len(x["header_vertical"])>0) 
                               else 0 for x in dataset_df['table']]

corr_df.head(3)


#### Chi-square test

In [None]:
# label and has_caption

cont_table = pd.crosstab(corr_df["label"], corr_df["has_table_caption"]) 
print(chi2_contingency(cont_table)[1])
cont_table


In [None]:
# label and has_header

cont_table = pd.crosstab(corr_df["label"], corr_df["has_table_header"]) 
print(chi2_contingency(cont_table)[1])
cont_table


### Correlation categorical (e.g. label) and discrete variables

* Using __ANOVA F-test__ (1 continuous and 1 categorical variable)

* <font color=blue>__Null-hypothesis__</font>: label values (SUPPORTS, REFUTES, NEI) is equally distributed across the 2nd variable, e.g. claim length

* If <font color=blue>p-value is less 0.05</font>, we reject the null-hypothesis and can say there is a __stat. significant relation__ between label and 2nd variable [5]

<br>


In [None]:
model = ols('claim_len ~ label', data = corr_df).fit()
anova_result = sm.stats.anova_lm(model, typ=2)
print(f"P-value is {round(anova_result.iloc[0,3], 3)}")

corr_df[['label', 'claim_len']].boxplot(by='label')


In [None]:
model = ols('table_len ~ label', data = corr_df).fit()
anova_result = sm.stats.anova_lm(model, typ=2)
print(f"P-value is {round(anova_result.iloc[0,3], 3)}")

corr_df[['label', 'table_len']].boxplot(by='label')


In [None]:
corr_df.head(3)

In [None]:
model = ols('claim_len ~ has_table_caption', data = corr_df).fit()
anova_result = sm.stats.anova_lm(model, typ=2)
print(f"P-value is {round(anova_result.iloc[0,3], 3)}")

corr_df[['has_table_caption', 'claim_len']].boxplot(by='has_table_caption')


In [None]:
model = ols('claim_len ~ has_table_header', data = corr_df).fit()
anova_result = sm.stats.anova_lm(model, typ=2)
print(f"P-value is {round(anova_result.iloc[0,3], 3)}")

corr_df[['has_table_header', 'claim_len']].boxplot(by='has_table_header')


### References

    [1] https://files.eric.ed.gov/fulltext/ED490661.pdf
    
    [2] https://reader.elsevier.com/reader/sd/pii/089543569090158L?token=68830E1F9765B027D7AC8E0260BEF9640E96046B99C8C264BC3222EAB0FD1D41B9C7E24EC24E99C4003168D13B3B48DA&originRegion=eu-west-1&originCreation=20210718072311
    
    [3] http://up.csail.mit.edu/other-pubs/soylent.pdf
    
    [4] https://repository.upenn.edu/cgi/viewcontent.cgi?article=1043&context=asc_papers
    
    [5] https://support.minitab.com/en-us/minitab-express/1/help-and-how-to/modeling-statistics/anova/how-to/one-way-anova/interpret-the-results/key-results/
    
    