In [18]:
# Number of leading lines to copy from the full dump into the sample file.
l = 3000
# Copy the first `l` lines of tables.json into tables_<l>.json; the source is
# streamed line by line, so the full dump is never held in memory.
with open("../data/tables.json", "r") as infile, open(f"../data/tables_{l}.json", "w") as outfile:
    # zip() stops as soon as range(l) is exhausted (or the input runs out first).
    for _, line in zip(range(l), infile):
        outfile.write(line)

In [19]:
import pandas as pd
import numpy as np 
import torch 
import matplotlib.pyplot as plt 
import collections
import nltk 
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk import ngrams
import copy
from collections import defaultdict
from functools import cache
import cProfile
import pstats
import time
from tabulate import tabulate

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [20]:
# Utility functions
def get_clock_time_in_milli_sec():
    """Return the current ``time.time()`` value converted to whole milliseconds.

    The fractional millisecond part is truncated by ``int``.
    """
    seconds_now = time.time()
    return int(seconds_now * 1000)

def print_time(milli_sec):
    """Print a duration given in milliseconds together with a readable breakdown.

    The output has the form ``<milli_sec>[<parts>]`` where ``<parts>`` are the
    non-zero hour/minute/second/millisecond components joined by ``:``,
    e.g. ``3723004[1h:2m:3s:4ms]``.

    In:
        milli_sec: Duration in milliseconds.
    Out:
        None (the text is written to stdout).
    """
    hours, remainder = divmod(milli_sec, 1000 * 60 * 60)
    minutes, remainder = divmod(remainder, 1000 * 60)
    seconds, milli_seconds = divmod(remainder, 1000)

    labelled_parts = (
        (hours, "h"),
        (minutes, "m"),
        (seconds, "s"),
        (milli_seconds, "ms"),
    )
    # Zero-valued components are omitted to keep the output short.
    components = [f"{amount}{unit}" for amount, unit in labelled_parts if amount]

    # A zero duration would otherwise be rendered with empty brackets ("0[]").
    if not components:
        components.append("0ms")

    print(f"{milli_sec}[{':'.join(components)}]")


In [21]:
def create_table_list(dfs):
    """Extract the cell texts of every table stored under ``dfs['tableData']``.

    In:
        dfs: Mapping-like object (e.g. a DataFrame chunk) whose ``'tableData'``
             entry iterates over tables; each table is an iterable of rows and
             each row an iterable of cell dicts.
    Out:
        Nested list ``tables -> rows -> cell text``. A cell without a
        ``'text'`` key contributes an empty string.
    """
    tables = []
    for table_data in dfs['tableData']:
        rows = []
        for row in table_data:
            rows.append([cell.get('text', '') for cell in row])
        tables.append(rows)
    return tables

In [22]:
@cache
def process_words(text:str)->list: 
    """
    Normalise a text and split it into word tokens.

    The text is lower-cased, every character listed in ``string.punctuation``
    is removed, and the remainder is tokenized with ``word_tokenize``.
    Results are memoized per distinct ``text``; the returned list is the
    cached object itself, so callers should not modify it in place.
    """
    punctuation_remover = str.maketrans("", "", string.punctuation)
    normalized = text.lower().translate(punctuation_remover)
    return word_tokenize(normalized)

In [23]:
# Module-level Porter stemmer instance; stem_cached() calls ps.stem() on it.
ps = PorterStemmer()

In [24]:
@cache
def stem_cached(word):
    """
    Return the stem of ``word`` as produced by the module-level stemmer ``ps``.

    Results are memoized per distinct word, so each word is stemmed only once.
    """
    stem = ps.stem(word)
    return stem

In [25]:
def create_projections(table_list,
                       start_table_index,
                       projections,
                       stemmer, 
                       stop_words: list = None, 
                       min_word_len = 0, 
                       create_ngrams:bool = False, 
                       ngrams_size:int = 2
                       ) -> dict: 
    """
    Add the cells of a chunk of tables to the inverted index ``projections``.

    In:
        Table_List: Nested list ``tables -> rows -> cell text`` (strings).
        Start_Table_Index: Global id of the first table in ``table_list``;
                           the following tables are numbered consecutively.
        Projections: Dict that is updated in place.
                     Form: Key: stem (str) or n-gram (tuple of stems)
                           -> Value: set of (Table_ID, Row_ID, Column_ID)
        Stemmer: Kept for interface compatibility. Stemming is done through
                 the module-level ``stem_cached`` helper so that indexed terms
                 and query terms are normalised the same way.
        Stop_Words: Optional words whose stems are excluded from the index.
        Min_Word_Len: Stems shorter than this are excluded from the index.
        Create_Ngrams: If True, n-grams over the kept stems of a cell are
                       indexed in addition to the single stems.
        Ngrams_Size: Length of the n-grams.
    Out:
        The same ``projections`` dict that was passed in.
    """
    # Stop words are stemmed with the same helper as the cell words so the
    # membership test below compares like with like.
    if stop_words:
        stemmed_stopwords = {stem_cached(w) for w in stop_words}
    else:
        stemmed_stopwords = set()

    # Number tables from start_table_index so that ids stay unique when the
    # index is built chunk by chunk.
    for table_index, table in enumerate(table_list, start=start_table_index):
        for row_index, row in enumerate(table):
            for column_index, cell in enumerate(row):
                position = (table_index, row_index, column_index)

                # Kept stems in reading order; n-grams depend on word order,
                # so this has to be a list rather than a set.
                kept_stems = []

                for word in process_words(cell):
                    stemmed_word = stem_cached(word)

                    if stemmed_word in stemmed_stopwords or len(stemmed_word) < min_word_len:
                        continue

                    projections.setdefault(stemmed_word, set()).add(position)
                    kept_stems.append(stemmed_word)

                if create_ngrams:
                    for created_ngram in ngrams(kept_stems, ngrams_size):
                        projections.setdefault(created_ngram, set()).add(position)

    return projections

In [33]:
def create_tables_df(table_chunk, start_table_index, tables_df): 
    """
    Build the metadata rows for one chunk of tables and append them to ``tables_df``.

    In:
        Table_Chunk: Chunk of tables; its ``pgTitle``, ``sectionTitle`` and
                     ``tableCaption`` entries are copied (a missing entry
                     falls back to an empty string).
        Start_Table_Index: Id assigned to the first table of the chunk; the
                           remaining tables get consecutive ids.
        Tables_Df: Metadata collected so far (may be an empty DataFrame).
    Out:
        The metadata of this chunk if ``tables_df`` is empty, otherwise
        ``tables_df`` with the new rows appended and the index renumbered.
        Every new row gets a ``weight`` of 0.5.
    """
    chunk_len = len(table_chunk)

    columns = {
        "id": range(start_table_index, start_table_index + chunk_len),
        "pgTitle": table_chunk.get('pgTitle', ''),
        "sectionTitle": table_chunk.get('sectionTitle', ''),
        "tableCaption": table_chunk.get('tableCaption', ''),
        "weight": [0.5] * chunk_len
    }
    new_tables_df = pd.DataFrame(columns)

    # Nothing collected yet: the chunk's rows are the whole result.
    if tables_df.empty:
        return new_tables_df

    return pd.concat([tables_df, new_tables_df], ignore_index=True)

In [27]:
def cleaner(value, create_ngrams:bool = False, ngram_size:int = 2) -> list: # TODO: rename
    """
    Return the tokenized and stemmed version of a value.

    In:
        Value: Text to normalise.
        Create_Ngrams: If True, the n-grams over the stemmed tokens are
                       appended after the single stems.
        Ngram_Size: Length of the n-grams.
    Out:
        List of stems, optionally followed by n-gram tuples.
    """
    stemmed_words = []
    for word in process_words(value):
        stemmed_words.append(stem_cached(word))

    if create_ngrams:
        # The n-grams are materialised into a list first, so the list being
        # extended is not modified while ngrams() is still reading from it.
        stemmed_words += list(ngrams(stemmed_words, ngram_size))

    return stemmed_words

In [28]:
def indexing(cleaned_values:list, projections:dict)->dict: 
    """
    Look up every cleaned value in the projections and group the hits by row.

    In: 
        Cleaned_Values: List of the stemmed terms of one example
        Projections: Dict mapping a term to a set of
                     (Table_ID, Row_ID, Column_ID) positions
    Out: 
        Index_Dict: Dict of all positions where any of the terms was found
                    Form: Key: (Table_ID, Row_ID) -> Value: set of Column_IDs
    """
    index_dict = {}

    for value in cleaned_values:
        positions = projections.get(value)

        # Unknown terms (and terms with no recorded position) contribute nothing.
        if not positions:
            continue

        for table_id, row_id, col_id in positions:
            index_dict.setdefault((table_id, row_id), set()).add(col_id)

    return index_dict

In [29]:
def compareit(dictx:dict, dicty:dict, sub_key_mode:bool = False)-> dict: 
    """
    Compare two index dicts and return the column pairings of the rows they share.

    In: 
        Dictx: Dict of the Form Key: (Table_ID, Row_ID) -> Value: set of Column_IDs
        Dicty: Dict of the Form Key: (Table_ID, Row_ID) -> Value: set of Column_IDs
        Sub_Key_Mode: If True, the result keeps the (Table_ID, Row_ID) keys so
                      that it can be passed to another call of compareit
    Out: 
        A dict built from the keys present in both inputs.
        Sub_Key_Mode == False:
            Key: Table_ID -> Value: set of (Row_ID, (XColumn_ID, YColumn_ID))
        Sub_Key_Mode == True:
            Key: (Table_ID, Row_ID) -> Value: set of (XColumn_ID, YColumn_ID)
    """
    result = dict()

    # Only rows that occur in both dicts can contain both values.
    for key in set(dictx) & set(dicty):
        table_id, row_id = key

        # Every x column is paired with every y column of the shared row.
        for x_col_id in dictx[key]:
            for y_col_id in dicty[key]:
                if sub_key_mode:
                    result.setdefault(key, set()).add((x_col_id, y_col_id))
                else:
                    result.setdefault(table_id, set()).add((row_id, (x_col_id, y_col_id)))

    return result

In [30]:
def querry_thunel(example_pairs:list, projections:dict, tau:int=1):
    """
    Perform the complete querying operation over the indexed web tables.

    In: 
        Example_Pairs: List of (X, Y) examples of the semantic mapping. X or Y
                       may itself be a tuple of two values (multi-key); both
                       values must then occur in the same row.
        Projections: Dict of projections of all given tables
        Tau: Number of example pairs that must be found in a table for the
             table to count as relevant
    Out: 
        Tables: Dict of all relevant tables
                Form: Key: Table_ID -> Value: set of (Row_ID, (XColumn_ID, YColumn_ID))
                The matches of all example pairs found in a table are merged.
    Raises:
        ValueError: If fewer example pairs than ``tau`` are given.
    """

    if len(example_pairs) < tau: 
        raise ValueError(f"The Cardinality of given Example_Pairs {len(example_pairs)} must be greater or qual then Tau: {tau}!")

    possible_tables = dict()
    # Number of example pairs found in each table; this is what tau is
    # compared against. Repeated identical pairs are counted once per entry.
    examples_per_table = dict()

    for tup in example_pairs:
        list_of_keys = list()
        for val in tup:

            if isinstance(val, tuple):
                # Multi-key: index both parts and keep only rows containing both.
                list_of_subkeys = [indexing(cleaner(sub_key), projections) for sub_key in val]
                key_val = compareit(*list_of_subkeys, sub_key_mode=True)
            else:
                key_val = indexing(cleaner(val), projections)

            list_of_keys.append(key_val)

        compared_things = compareit(*list_of_keys)

        # Merge instead of dict.update(): update() would replace the matches
        # that earlier example pairs produced for the same table.
        for table_id, matches in compared_things.items():
            possible_tables.setdefault(table_id, set()).update(matches)
            examples_per_table[table_id] = examples_per_table.get(table_id, 0) + 1

    tables = {
        table_id: matches
        for table_id, matches in possible_tables.items()
        if examples_per_table[table_id] >= tau
    }

    return tables 

In [31]:
def iter_json(path, csize):
    """
    Yield the line-delimited JSON file at ``path`` as successive DataFrame chunks.

    In:
        Path: Location of the JSON-lines file.
        Csize: Number of lines per chunk.
    Out:
        Generator of DataFrames holding up to ``csize`` rows each.
    """
    # The reader is used as a context manager so the underlying file handle
    # is closed once the generator is exhausted or closed.
    with pd.read_json(path, lines=True, chunksize=csize) as iter_chunk:
        yield from iter_chunk

In [34]:
if __name__ == "__main__":

    # Build the index chunk by chunk, time and profile the build, then run two
    # example queries against it.
    csize = 300
    # NOTE(review): the sampling cell at the top of the notebook writes
    # ../data/tables_{l}.json with l = 3000, while this cell reads
    # tables_1000.json -- confirm which sample file is intended.
    path = "../data/tables_1000.json"

    # Global id of the first table in the next chunk.
    start_table_index = 0

    projections = dict()
    tables_df = pd.DataFrame()

    c = get_clock_time_in_milli_sec()

    profiler = cProfile.Profile()
    profiler.enable()

    for table_chunk in iter_json(path, csize):
        
        table_list = create_table_list(table_chunk)

        projections = create_projections(table_list, start_table_index, projections, ps, create_ngrams=True, min_word_len=3)

        tables_df = create_tables_df(table_chunk, start_table_index, tables_df)

        # Advance by the number of tables actually read; create_tables_df
        # sizes its id range by len(table_chunk) as well.
        start_table_index += len(table_chunk)

    print(tabulate(tables_df, headers='keys', tablefmt='psql'))

    # Elapsed wall-clock time (includes printing the table above).
    c = get_clock_time_in_milli_sec() - c

    print("Time=", end="")
    print_time(c)
    print()

    profiler.disable()
    stats = pstats.Stats(profiler)
    stats.strip_dirs()
    stats.sort_stats("time")
    stats.print_stats()

    # Single-key example: X = '1929', Y = 'Robert Crawford'.
    indexing_example = [('1929', 'Robert Crawford')]
    # Multi-key example: X consists of two values that must share a row.
    multikey_indexing_example = [(('1929', 'Robert Crawford'), 'Ulster Unionist')]
    tau = 1

    final_list_single_key = querry_thunel(indexing_example, projections, tau)
    final_list_muli_key = querry_thunel(multikey_indexing_example, projections, tau)

    print(final_list_single_key)
    print(final_list_muli_key)

+-----+------+------------------------------------------------------------------------------+--------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------+----------+
|     |   id | pgTitle                                                                      | sectionTitle                                                                         | tableCaption                                                                                             |   weight |
|-----+------+------------------------------------------------------------------------------+--------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------+----------|
|   0 |    0 | Mid Antrim (Northern Ireland Parliament constituency)                        | Members o