#### Importing packages

In [None]:
# Imports
import csv 
import json 
import jsonlines
import os
import pandas as pd
import re
import spacy
import string
import zipfile

from collections import Counter
from nltk import word_tokenize
from nltk.corpus import stopwords
from statistics import median, stdev


## Overview

1. <strong>Load the following datasets: PubHealthTab, InfoTabs, TabFact and FEVEROUS </strong>


2. <strong>Execute code below to compare datasets. </strong>

Set the following variables first:



In [None]:
# Location of the PubHealthTab dataset (opened below as a JSON Lines file).
path_pubhealthtab = ""

# TabFact tables and claims, taken from the TabFact GitHub repository
# (https://github.com/wenhuchen/Table-Fact-Checking).
path_tabfact_tables = ""
path_tabfact_claims = ""

# InfoTabs tables and claims, taken from the InfoTabs GitHub repository
# (https://github.com/infotabs/infotabs).
path_infotabs_tables = ""
path_infotabs_claims = ""

# Tables extracted from the Wikipedia dump used for the FEVEROUS claims
# (https://github.com/Raldir/FEVEROUS).
path_feverous_tables = ""


--------------

#### Load PubHealthTab Dataset

In [None]:
# Read every record of the PubHealthTab file into an in-memory list.
dataset = []
with jsonlines.open(path_pubhealthtab) as reader:
    dataset.extend(reader)
    

In [None]:
# Tabular view of the loaded records; head() previews the first rows.
dataset_df = pd.DataFrame(dataset)
dataset_df.head()

Number of veracity labels

In [None]:
# Frequency of each value found in the "label" column.
Counter(dataset_df["label"])

Number of tables (general and unique)

In [None]:
# Collect every distinct table body once; two tables count as the same
# when their "rows" content compares equal.
rows = []
for table in dataset_df["table"]:
    if table["rows"] in rows:
        continue
    rows.append(table["rows"])

len(rows)


In [None]:
# "website" value of every table, and the distinct values among them.
table_html = []
for entry in dataset_df["table"]:
    table_html.append(entry["website"])
table_html_unique = list(set(table_html))

# First the total count, then the number of distinct websites.
print(len(table_html))
print(len(table_html_unique))


Number of websites (general and unique) 

#### TabFact data

In [None]:
# Load every TabFact table file as a list of rows, each row being a list
# of cell strings.
tabfact_dataset = []

for filename in os.listdir(path_tabfact_tables):
    # Join with the directory that was listed so each file is actually found.
    f = os.path.join(path_tabfact_tables, filename)

    # newline='' is what the csv module asks for when given a file object.
    with open(f, 'r', newline='') as read_obj:
        # Cells are separated by "#". Parsing with that delimiter directly
        # keeps cells that contain commas intact (a comma-delimited parse
        # followed by split("#") would drop everything after the first comma).
        csv_reader = csv.reader(read_obj, delimiter="#")
        # Blank lines come back as empty rows; they carry no cells, so skip them.
        table_data = [row for row in csv_reader if row]

    tabfact_dataset.append(table_data)
    

In [None]:
# Inspect the first loaded TabFact table.
tabfact_dataset[0]

In [None]:
# Parse the TabFact claims JSON file into `data`.
tabfact_claims = []

# The claims live in their own file; path_tabfact_tables is the directory of
# table files and cannot be opened as a JSON document.
with open(path_tabfact_claims, 'r') as file:
    data = json.load(file)
    

In [None]:
# Inspect element 0 of the first value in the claims JSON.
list(data.values())[0][0]

In [None]:
# Flatten element 0 of every JSON value into one list of claims.
tabfact_claims = []
for entry in data.values():
    tabfact_claims.extend(entry[0])
len(tabfact_claims)

#### InfoTabs Data

In [None]:
# Collect the claim text (third tab-separated column) from every InfoTabs
# claims file.
infotabs_claims = []

for filename in os.listdir(path_infotabs_claims):
    # Join with the directory that was listed so each file is actually found.
    f = os.path.join(path_infotabs_claims, filename)

    with open(f, 'r') as read_obj:
        csv_reader = csv.reader(read_obj, delimiter="\t")
        # Skip the header row of *each* file. A single global flag would only
        # skip the first file's header and count the others as claims.
        next(csv_reader, None)
        infotabs_claims.extend(row[2] for row in csv_reader)


In [None]:
# Load every InfoTabs table (one JSON document per file).
infotabs_tables = []

for filename in os.listdir(path_infotabs_tables):
    # Join with the directory that was listed so each file is actually found.
    f = os.path.join(path_infotabs_tables, filename)
    with open(f, 'r') as file:
        t = json.load(file)
    infotabs_tables.append(t)

# infotabs_tables

In [None]:
# Number of InfoTabs tables loaded.
len(infotabs_tables)

In [None]:
# Number of loaded entries per collection, printed in the order
# PubHealthTab, TabFact, InfoTabs.
for tables in (dataset, tabfact_dataset, infotabs_tables):
    print(len(tables))

In [None]:
# PubHealthTab: tables that carry a non-blank caption.
caption_ds = []
for entry in dataset:
    caption = entry["table"]["caption"]
    if caption and caption.strip() != "":
        caption_ds.append(1)

print(len(caption_ds))  # number of PubHealthTab tables with a caption

print(round(len(caption_ds) / len(dataset), 3))  # share of PubHealthTab tables with a caption

In [None]:
# InfoTabs: a table counts as captioned when it has a "title" key.
caption_ds = []
for entry in infotabs_tables:
    if "title" in entry:
        caption_ds.append(1)

print(len(caption_ds))  # number of InfoTabs tables with a title

print(round(len(caption_ds) / len(infotabs_tables), 3))  # share of InfoTabs tables with a title


In [None]:
# PubHealthTab: tables that have a horizontal or a vertical header.
# A plain truthiness test covers both None and an empty value; calling
# .strip() on the vertical header (as if it were a string) would fail on a list.
header_ds = [
    1
    for entry in dataset
    if entry["table"]["header_horizontal"] or entry["table"]["header_vertical"]
]

print(sum(header_ds))  # number of PubHealthTab tables with a header

print(round(sum(header_ds) / len(dataset), 3))  # share of PubHealthTab tables with a header


In [None]:
# InfoTabs: a table counts as having headers when none of its keys is blank.
header_ds = []
for entry in infotabs_tables:
    if all(k.strip() != "" for k in entry.keys()):
        header_ds.append(1)

print(len(header_ds))  # number of InfoTabs tables whose keys are all non-blank

print(round(len(header_ds) / len(infotabs_tables), 3))  # share of such InfoTabs tables


In [None]:
# Row-count distribution for PubHealthTab and TabFact.
# A TabFact table stores its header as the first row, so every TabFact
# threshold below is one higher than the PubHealthTab one.

# Share of tables with fewer than 5 rows
count_ds = [1 for entry in dataset if len(entry["table"]["rows"]) <= 4]
print(round(len(count_ds) / len(dataset), 3))  # PubHealthTab

count_tabfact = [1 for entry in tabfact_dataset if len(entry) <= 5]
print(round(len(count_tabfact) / len(tabfact_dataset), 3))  # TabFact

# Share of tables with 5 to 9 rows
count_ds = [1 for entry in dataset if 5 <= len(entry["table"]["rows"]) <= 9]
print(round(len(count_ds) / len(dataset), 3))  # PubHealthTab

count_tabfact = [1 for entry in tabfact_dataset if 6 <= len(entry) <= 10]
print(round(len(count_tabfact) / len(tabfact_dataset), 3))  # TabFact

# Share of tables with 10 or more rows
count_ds = [1 for entry in dataset if len(entry["table"]["rows"]) >= 10]
print(round(len(count_ds) / len(dataset), 3))  # PubHealthTab

count_tabfact = [1 for entry in tabfact_dataset if len(entry) >= 11]
print(round(len(count_tabfact) / len(tabfact_dataset), 3))  # TabFact


In [None]:
# Row-count distribution for InfoTabs.
# The first entry of a table is treated as its title, so each threshold is
# applied to the number of entries shifted by one.

# Share of tables with fewer than 5 rows
count_info = [1 for entry in infotabs_tables if len(entry) <= 5]
print(round(len(count_info) / len(infotabs_tables), 3))

# Share of tables with 5 to 9 rows
count_info = [1 for entry in infotabs_tables if 6 <= len(entry) <= 10]
print(round(len(count_info) / len(infotabs_tables), 3))

# Share of tables with 10 or more rows
count_info = [1 for entry in infotabs_tables if len(entry) >= 11]
print(round(len(count_info) / len(infotabs_tables), 3))


In [None]:
# Table column counts 

def get_col_len(rows: list):
    """Return the length of the longest row in ``rows`` (0 when there are none)."""
    return max((len(r) for r in rows), default=0)

# Share of tables with at most 5 columns.

# PubHealthTab: width is the longest row of the table.
count_ds = [1 for entry in dataset if get_col_len(entry["table"]["rows"]) <= 5]
print(round(len(count_ds) / len(dataset), 3))

# TabFact: width is taken from the first row of the table.
count_tabfact = [1 for entry in tabfact_dataset if len(entry[0]) <= 5]
print(round(len(count_tabfact) / len(tabfact_dataset), 3))


In [None]:
# Cell-level statistics for PubHealthTab: average stripped cell length,
# share of cells containing a digit, share of purely alphabetical cells.
len_cells = []
count_num_cells = 0
count_non_alpha = 0

for entry in dataset:
    for row in entry["table"]["rows"]:
        for cell in row:
            len_cells.append(len(cell.strip()))

            has_digit = re.search(r'\d+', cell) is not None
            # A run of non-word characters only counts when it is not pure whitespace.
            has_symbol = any(run.strip() != "" for run in re.findall(r'\W+', cell))

            if has_digit:
                count_num_cells += 1
            if has_symbol or has_digit:
                count_non_alpha += 1

total_cells = len(len_cells)
print(sum(len_cells)/total_cells) # avg length of cells
print(round(count_num_cells/total_cells, 3)) # ratio of cells with numerical values
print(round(1-count_non_alpha/total_cells, 3)) # ratio of cells with only alphabetical characters

# Cell-level statistics for TabFact: average stripped cell length,
# share of cells containing a digit, share of purely alphabetical cells.
len_cells = []
count_num_cells = 0
count_non_alpha = 0

for entry in tabfact_dataset:
    for row in entry:
        for cell in row:
            len_cells.append(len(cell.strip()))

            has_digit = re.search(r'\d+', cell) is not None
            # A run of non-word characters only counts when it is not pure whitespace.
            has_symbol = any(run.strip() != "" for run in re.findall(r'\W+', cell))

            if has_digit:
                count_num_cells += 1
            if has_symbol or has_digit:
                count_non_alpha += 1

total_cells = len(len_cells)
print(sum(len_cells)/total_cells) # avg length of cells
print(round(count_num_cells/total_cells, 3)) # ratio of cells with numerical values
print(round(1-count_non_alpha/total_cells, 3)) # ratio of cells with only alphabetical characters



In [None]:
# Inspect the values of the first InfoTabs table.
list(infotabs_tables[0].values())

In [None]:
# Cell-level statistics for InfoTabs. Only element 0 of each table value
# is looked at.
len_cells = []
count_num_cells = 0
count_non_alpha = 0

for entry in infotabs_tables:
    for row in entry.values():
        first_value = row[0]
        len_cells.append(len(first_value.strip()))

        has_digit = re.search(r'\d+', first_value) is not None
        # A run of non-word characters only counts when it is not pure whitespace.
        has_symbol = any(run.strip() != "" for run in re.findall(r'\W+', first_value))

        if has_digit:
            count_num_cells += 1
        if has_symbol or has_digit:
            count_non_alpha += 1

total_cells = len(len_cells)
print(sum(len_cells)/total_cells) # avg length of cells
print(round(count_num_cells/total_cells, 3)) # ratio of cells with numerical values
print(round(1-count_non_alpha/total_cells, 3)) # ratio of cells with only alphabetical characters


In [None]:
# Extract the FEVEROUS Wikipedia archive.
# NOTE(review): both "..." paths look like placeholders - set them before running.
with zipfile.ZipFile("...", mode='r') as zip_ref:
    zip_ref.extractall(path="...")
    

In [None]:
# Collect every table found in the FEVEROUS Wikipedia pages.
feverous_tables = []

# path_feverous_tables is the only FEVEROUS location configured in this
# notebook; the previously used name `directory` was never defined.
for filename in os.listdir(path_feverous_tables):
    f = os.path.join(path_feverous_tables, filename)
    print(f)

    with jsonlines.open(f) as reader:
        for line in reader:
            # "order" lists the element ids of a page; ids containing
            # "table_" are keys of `line` whose values are tables.
            for elem in line["order"]:
                if "table_" in elem:
                    feverous_tables.append(line[elem])


In [None]:
# FEVEROUS: tables that contain a "caption" key.
counter = sum(1 for table in feverous_tables if "caption" in table)

print(f"Number of tables with captions is {counter}.")
print(f"Ratio of tables with captions is {round(counter/len(feverous_tables), 3)}.")
        

In [None]:
# FEVEROUS: tables in which at least one cell is flagged as a header.
counter = 0

for table in feverous_tables:
    # Stops at the first header cell, scanning rows in order.
    if any(cell["is_header"] for row in table['table'] for cell in row):
        counter += 1

print(f"Number of tables with header cells is {counter}.")
print(f"Ratio of tables with header cells is {round(counter/len(feverous_tables), 3)}.")
        

In [None]:
# FEVEROUS: tables with fewer than 5 rows.
# len_tables holds the row count of every table.
len_tables = []
for table in feverous_tables:
    len_tables.append(len(table['table']))

tables_less_five = [1 for entry in len_tables if entry <= 4]

print(f"Number of tables with less than 5 rows is {len(tables_less_five)}.")
print(f"Ratio of tables with less than 5 rows is {round(len(tables_less_five)/len(feverous_tables), 3)}.")
        

In [None]:
# FEVEROUS: tables with 5 to 10 rows (both bounds included).
tables_between_five_ten = [1 for entry in len_tables if 5 <= entry <= 10]

print(f"Number of tables with row length between 5 and 10 is {len(tables_between_five_ten)}.")
print(f"Ratio of tables with row length between 5 and 10 is {round(len(tables_between_five_ten)/len(feverous_tables), 3)}.")


In [None]:
# FEVEROUS: tables with more than 10 rows.
tables_more_ten = [1 for entry in len_tables if entry >= 11]

print(f"Number of tables with row length more than 10 is {len(tables_more_ten)}.")
print(f"Ratio of tables with row length more than 10 is {round(len(tables_more_ten)/len(feverous_tables), 3)}.")


In [None]:
# FEVEROUS cell content statistics:
#   - total number of cells
#   - cells containing any digit
#   - cells containing a digit or a non-whitespace, non-word character
count_num_cells = 0
count_non_alpha = 0
len_rows = []

for table in feverous_tables:
    # Number of cells in this table.
    len_rows.append(sum(len(row) for row in table["table"]))

    for row in table['table']:
        for cell in row:
            value = cell['value']
            has_digit = re.search(r'\d+', value) is not None
            # A run of non-word characters only counts when it is not pure whitespace.
            has_symbol = any(run.strip() != "" for run in re.findall(r'\W+', value))

            if has_symbol or has_digit:
                count_non_alpha += 1
            if has_digit:
                count_num_cells += 1

total_cells = sum(len_rows)
print(total_cells) # total number of cells
print(count_num_cells) # number of cells with numerical values
print(count_non_alpha) # number of cells with non-alphabetical characters

print(round(count_num_cells/total_cells, 3)) # ratio of cells with numerical values
print(round(1-count_non_alpha/total_cells, 3)) # ratio of cells with only alphabetical characters


In [None]:
# FEVEROUS: average number of characters per cell (whitespace-stripped).
cell_len = []
for table in feverous_tables:
    for row in table["table"]:
        for cell in row:
            cell_len.append(len(cell['value'].strip()))

print(f"Average cell length is {round(sum(cell_len)/len(cell_len), 3)}.")
