In [1]:
import pandas as pd
import numpy as np 
import torch 
import matplotlib.pyplot as plt 
import collections
import nltk 
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import string
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk import ngrams
import copy
from collections import defaultdict
from functools import cache
import cProfile
import pstats
import time
from tabulate import tabulate
import itertools

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/christophhalberstadt/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/christophhalberstadt/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [2]:
data = pd.read_json('/Users/christophhalberstadt/Documents/TU Berlin/LSDIPro/tables.json', 
                    lines=True, nrows=100, )

In [3]:
class Config: 

    batch_size = 3000
    stop_words = None 
    stemmer = PorterStemmer()

    create_ngrams = False 
    ngram_size = 2
    min_word_len = 0 

    read_path = '/Users/christophhalberstadt/Documents/TU Berlin/LSDIPro/tables.json'
    

In [4]:
class Tokenizer: 

    def __init__(self, stemmer, stemmed_stopwords: set = None, min_word_len: int = 0): 
        self.stemmer = stemmer 
        self.stopwords = stemmed_stopwords or [] ######Hier stemmen? 
        self.min_word_len = min_word_len

    def __call__(self, value:str) -> str: 
        """
        Hier sollte ihr DocString stehen 
        """
        words = self.process_words(value)

        stemmed_words = []
        for word in words: 
            stemmed_word = self.stem_cached(word)

            if len(stemmed_word) >= self.min_word_len and stemmed_word not in self.stopwords: 
                stemmed_words.append(stemmed_word)

        #stemmed_words = [self.stem_cached(word) for word in words if len(word)>=self.min_word_len] ####stopwords hier noch 
        if len(stemmed_words) == 0: 
            return None
        joined_words = " ".join(stemmed_words)
            
        return joined_words           

    @cache
    def stem_cached(self, word:str):
        """
        Returns the stemmed version of the input word. 

        In: 
            Word: str
        
        Out: 
            The stemmed version of the word: str
        """
        return self.stemmer.stem(word)
    
    @cache
    def process_words(self, text:str)->list: 
        """
        Processes the input text into a lowercase, punctuation free and tokenized list of strings. 

        In: 
            text: str

        Out: 
            words: list(str)
        """
        sentence = text.lower().translate(str.maketrans("", "", string.punctuation))            ####Konfigurierbar in der Init machen 
        words = word_tokenize(sentence)
        return words 

In [5]:
def create_table_list(dfs):
    """
    Creates a list of tables. 
    Indexing Rank: Table_ID -> Row_ID -> Col_ID 

    In: 
        dfs:pd.dataframe? 
    Out: 
        Table_List: list
    """
    
    return [[[cell.get('text', '') for cell in row] for row in table_data] for table_data in dfs['tableData']]

In [6]:
def create_projections(table_list,
                       tokenizer, 
                       ) -> dict: 
    


    ######Trie Indexing for Fuzzy Matching fehlt 

    projections = dict()

    for table_index, table in enumerate(table_list): 
        for row_index, row in enumerate(table): 
            for cell_index, cell in enumerate(row): 
                stemmed_cell = tokenizer(cell)

                if stemmed_cell: 
                    projections.setdefault(stemmed_cell, set()).add((table_index, row_index, cell_index))

    return projections

In [7]:
def indexing(tokenized_value:list, projections:dict, key_id:int = None)->dict:
    """
    Return a dict of all the Examples found in the projections
    In: 
        Cleaned_Values: A List of all the Stemped Versions of one Example given
        Projections: A Dict of Projections of all given Tables
    Out: 
        Index_Dict: A dict of all the positions where the Example was found
                    Form: Key: (Table_ID, Row_ID) -> Value: (Col_ID)
    """ 
    key_id = key_id+1
    index_dict = dict() 
    
    value_index = projections.get(tokenized_value, None)

    if value_index: 
        for table_id, row_id, col_id in value_index: 

            if key_id: 
                index_dict.setdefault((table_id, row_id), set()).add((key_id-1, col_id))
            else: 
                index_dict.setdefault((table_id, row_id), set()).add((col_id))
    
    return index_dict

In [8]:
stemmer = PorterStemmer()

In [9]:
my_tokenizer = Tokenizer(stemmer)

In [10]:
table_list = create_table_list(data)
print(len(table_list))

100


In [11]:
projections = create_projections(table_list, my_tokenizer)

In [12]:
print(len(projections))

2130


In [17]:
def find_direct_tables(Examples:set,  
                       projections:dict, 
                       tau: int, 
                       tokenizer:Tokenizer):

    evidence = dict()
    E = len(Examples)
    K = None

    

    if tau > E: 
        raise ValueError(f"At least Tau: {tau} examples must be given!")
    

    for Example in Examples: 

        if not K: 
            K = len(Example)                
        else: 
            if K != len(Example): 
                raise ValueError(f"All Examples must be of the same Size!")
        

        

        idx_list = list()               
        for key_id, example_key in enumerate(Example):               
            tokens_of_example_key = tokenizer(example_key)

            dict_of_idx = indexing(tokens_of_example_key, projections, key_id)  
            idx_list.append(dict_of_idx)


        key_sets = [set(d.keys()) for d in idx_list]
        unique_shared_keys = set.intersection(*key_sets)
        
        if not unique_shared_keys: 
            continue 

        for key in unique_shared_keys:

            all_mappings_for_this_row = []
            for dict_of_idx in idx_list:

                mappings = dict_of_idx[key] 
                all_mappings_for_this_row.append(mappings)

            
            for candidate_mapping in itertools.product(*all_mappings_for_this_row):

                table_id, row_id = key

                evidence.setdefault((table_id, candidate_mapping), set()).add(Example)

    
    relevant_tables = dict()

    for key, value in evidence.items(): 
        if len(value) >= tau: 
            table_id, candidate_mapping = key
            relevant_tables.setdefault(table_id, list()).append(dict(candidate_mapping))

    return relevant_tables

In [14]:
indexing_example = [('1929', 'Robert Crawford'), ('1938', 'John Patrick')]
tau = 2

In [18]:
indexing_example = [( 'Robert Crawford' ,'Ulster Unionist', '1929')]
tau = 1

In [19]:
relevant_tables = find_direct_tables(indexing_example, projections, tau, my_tokenizer)


In [20]:
print(relevant_tables)

{0: [{0: 2, 1: 3, 2: 1}]}


In [28]:
Querries = [('Robert Crawford', 'Ulster Unionist'), ('John Patrick', 'Ulster Unionist')]


In [41]:
table = table_list[0]
my_list = list()
for row in table: 
    my_list.append(my_tokenizer(row[2]))

print(my_list)

['robert crawford', 'robert crawford', 'john patrick', 'john patrick', 'robert nichol wilson', 'robert simpson', 'robert simpson', 'robert simpson', 'robert simpson', 'robert simpson', 'constitu abolish']


In [48]:
my_list = np.array(my_list)

In [49]:
d = my_tokenizer(Querries[0][0]) 
print(d)
print(type(d))

robert crawford
<class 'str'>


In [None]:
c = np.where(d == my_list, 1, 0)
print(c)    

[1 1 0 0 0 0 0 0 0 0 0]


In [30]:
anz_querries = len(Querries)
cols = list()
for relevant_table, possible_mappings in relevant_tables.items(): 
    table = table_list[relevant_table]
    for possible_mapping in possible_mappings: 
        for Ev_Col, Tab_col in possible_mapping.items(): 
            #Ev_Col, Tab_col = mapping.items()
            tokenized_querries = []
            print(Ev_Col, Tab_col)
            print(type(Ev_Col))
            col = list()
            for row in table: 
                col.append(row[Tab_col])

            np_col = np.array(col)
            for i in range(anz_querries):
                x = Querries[i]
                print(x)
                print(Ev_Col)
                print(x[Ev_Col])
                tokenized_x = my_tokenizer(x[Ev_Col])
                index = np.where(tokenized_x ==np_col)[0]
                print(index)
            
            cols.append(col)
for col in cols: 
    print(col)

0 2
<class 'int'>
('Robert Crawford', 'Ulster Unionist')
0
Robert Crawford
[]
('John Patrick', 'Ulster Unionist')
0
John Patrick
[]
1 3
<class 'int'>
('Robert Crawford', 'Ulster Unionist')
1
Ulster Unionist
[]
('John Patrick', 'Ulster Unionist')
1
Ulster Unionist
[]
2 1
<class 'int'>
('Robert Crawford', 'Ulster Unionist')
2


IndexError: tuple index out of range

In [31]:
a = np.arange(0, 10) -2
print(a)

[-2 -1  0  1  2  3  4  5  6  7]


In [34]:
b = np.where(a>0, 1, 0)
print(b)

[0 0 0 1 1 1 1 1 1 1]


IIIC: 

Einstieg vor/bei/nach dem Vergleich zwischen X und Y bei direct_table_join. 
Ich nehme die menge aller hier (Table_ID; Row_ID) Tuple meiner Tabelle, die dann nur ein View sind, und ziehe all die ab, die für meinen direct join genutzt werden. 

Ich nehme alle E.X werte dieser Tabelle und generiere eher den View (Table_ID; Col_ID): set(Row_ID). 
Wenn len(set(Row_ID)) == 1 ist, dann ist eine funktionale Beziehung zu allen Elementen in der Zeile gegeben. Also nehme ich all diese Werte als Zwischenwerte. 

Wenn >1, dann nehme ich mir all die Zeilen, wo E.X drin ist und itteriere über alle Spalten. Ich sammle die Elemente dort in einem set() und wenn dieses wieder len(set())==1 für eine Saplte ist, dann steht dieser Wert in einer funktional abhängigen Beziehung zu E.x.
Wenn nicht, dann verwerfe ich diese Werte. 

Funktionelle Abhängigkeit wird nur für E.x geprüft, nicht für ein x in Q. 
x in Q folgen nur dem "Pfad", der durch E.x festgelegt wurde. 