In [1]:
import pandas as pd
from IPython.display import display
import sys
import os
# Run the notebook from the repository root so that the relative paths used by
# later cells ("discovery/", "integration/") resolve.
# The root can be overridden with the GEN_T_DEMO_ROOT environment variable;
# when it is unset, the original checkout location is used unchanged.
os.chdir(os.environ.get('GEN_T_DEMO_ROOT', '/home/gfan/gen-t-demo/'))
os.getcwd()

'/home/gfan/gen-t-demo'

In [2]:
# Parameters
# Active configuration: the "t2d_gold" benchmark.
#   Source tables: table_471, table_471_1 (with nulls injected), table_594, table_640
benchmark = "t2d_gold"
source_table_name = "table_471_1.csv"
# Alternative configuration: the "tptr" benchmark (uncomment to use).
#   Source tables: psql_0_c_oj_asia, psql_8_s_ij_ps
# benchmark = "tptr"
# source_table_name = "psql_0_c_oj_asia.csv"
BENCHMARKPATH = f'/home/gfan/Datasets/{benchmark}/'
sim_threshold = 0.2  # for set similarity
integration_timeout = 3600  # for integration

In [3]:
# Find Candidate Tables
# Register the discovery modules with an absolute path: a relative sys.path
# entry depends on the working directory, which this notebook changes later.
# The membership check keeps re-runs of this cell from appending duplicates.
discovery_dir = os.path.abspath("discovery/")
if discovery_dir not in sys.path:
    sys.path.append(discovery_dir)
from discovery_utils import get_lake, get_starmie_candidates
from discover_candidates import CandidateTables

# Set to True to seed the search with the candidates returned by
# get_starmie_candidates; when False the seed list stays empty.
runStarmie = False
lake_dfs, all_lake_table_cols = get_lake(benchmark)
source_candidates = []
if runStarmie:
    starmie_candidates = get_starmie_candidates(benchmark)
    # The Starmie result is indexed by the source table name without ".csv".
    source_candidates = starmie_candidates[source_table_name.replace('.csv', '')]

# Call CandidateTables to find candidates
candidate_table_finder = CandidateTables(benchmark, sim_threshold, lake_dfs, all_lake_table_cols, source_candidates)
# find_candidates returns a pair; only the first element is used in this notebook.
candidateTablesFound, _ = candidate_table_finder.find_candidates(source_table_name)
print(f"{len(candidateTablesFound)} Candidate Tables found for Source Table {source_table_name}")
print(list(candidateTablesFound.keys()))

12 Candidate Tables found for Source Table table_471_1.csv
['table_536.csv', 'table_50.csv', 'table_187.csv', 'table_267.csv', 'table_594.csv', 'table_25.csv', 'table_244.csv', 'table_95.csv', 'table_640.csv', 'table_292.csv', 'table_674.csv', 'table_760.csv']


In [4]:

# Prune Candidate Tables to find the set of Originating Tables
from prune_candidates import OriginatingTables

# Build the pruner over the candidate tables, then run it for the source table.
originating_tables_finder = OriginatingTables(benchmark, candidateTablesFound)
originating_tables, matrix_traversal_runtimes = originating_tables_finder.find_originating_tables(
    source_table_name
)
# The finder exposes the source table and its primary key as attributes.
source_df = originating_tables_finder.source_table
primary_key = originating_tables_finder.primary_key
print(f"From {len(candidateTablesFound)} candidate tables, found {len(originating_tables)} originating tables")
print(originating_tables)
# Keep only the candidate entries that belong to an originating table
# (an empty dict when no originating tables were found).
origin_tables_matched_cols = (
    {table_name: candidateTablesFound[table_name] for table_name in originating_tables}
    if originating_tables
    else {}
)


From 12 candidate tables, found 6 originating tables
['table_50.csv', 'table_267.csv', 'table_95.csv', 'table_674.csv', 'table_25.csv', 'table_244.csv']


In [5]:
# Move into integration/ (presumably so the integration modules imported by
# the following cell resolve from the working directory).
# Guarded on the current directory name so that re-running this cell does not
# attempt to enter a non-existent integration/integration/ and raise
# FileNotFoundError.
if os.path.basename(os.getcwd()) != 'integration':
    os.chdir('integration/')
os.getcwd()

'/home/gfan/gen-t-demo/integration'

In [6]:
# Integrate set of originating tables to reproduce source table
# NOTE(review): these two imports presumably resolve from the current working
# directory (integration/) — confirm against the repository layout.
from targeted_integration import TableIntegration
import integration_utils as utils 
# The working directory is now integration/, so discovery/ is reached one level up.
sys.path.append('../discovery/')
from evaluatePaths import setTDR

# Build the integrator from the originating tables' matched columns and the configured timeout.
table_integrator = TableIntegration(benchmark, origin_tables_matched_cols, integration_timeout)
# NOTE(review): timed_out, noCandidates and numOutputVals are not checked in the
# visible cells; the result attributes below are read unconditionally.
timed_out, noCandidates, numOutputVals = table_integrator.integrate_tables(source_table_name)
integration_result = table_integrator.reproducedSourceTable
table_ops = table_integrator.tableOpsUsed
# Score the integration result against the source table before it is reordered for display.
TDR_recall, TDR_precision = setTDR(table_integrator.source_df, integration_result)
# print(f"Final Integration Result has Recall {TDR_recall:.3f} and Precision {TDR_precision:.3f}")
# Reorder rows/columns, then wrap with overlap highlighting for the rendered output.
integration_result = table_integrator.order_rows_cols(integration_result)
integration_result_highlighted = table_integrator.highlight_overlap_values(integration_result)
display(integration_result_highlighted)

-----x---------x--------x---
Source Table has 100 rows, 5 columns
Resulting Table has 115 rows, 5 columns
5 overlapping columns: ['Title', "Fans' Rank", 'Year', 'Director(s)', 'Overall Rank']
-----x---------x--------x---


Unnamed: 0,Title,Fans' Rank,Year,Director(s),Overall Rank
0,Pulp Fiction,2,1994,Quentin Tarantino,15
1,Citizen Kane,3,1941,Orson Welles,2
2,Star Wars,5,1977,George Lucas,8


In [7]:
# Expand the integration result. expand_tables returns three values, unpacked
# below as the expanded table plus (judging by the names) the added columns
# and added tuples.
expansion = table_integrator.expand_tables('all', size=3)
expanded_integ_result, new_cols, new_tuples = expansion
expanded_integration_result_highlighted = table_integrator.highlight_overlap_values(
    expanded_integ_result
)
display(expanded_integration_result_highlighted)


Integrating raw Originating tables produces Table of shape (514, 8), with 4744 values
Integrated Table has 3 new columns: ['Religion', 'Actor', 'Rank']
Integrated Table has 414 new tuples


Unnamed: 0,Title,Fans' Rank,Year,Director(s),Overall Rank,Religion,Actor
0,Pulp Fiction,2,1994,Quentin Tarantino,15,Atheist,John Travolta
1,Citizen Kane,3,1941,Orson Welles,2,,Orson Welles
2,Star Wars,5,1977,George Lucas,8,,
3,Kill Bill: Vol. 2,224,2004,Quentin Tarantino,701,,
4,Night of the Living Dead,229,1968,George A. Romero,139,Christian,
5,The Piano,307,1993,Jane Campion,247,Christian,


In [8]:
# Rank the expanded result using the 'Overall Rank' column, then apply overlap
# highlighting; ranked_df ends up holding the highlighted object.
ranked_df = table_integrator.highlight_overlap_values(
    table_integrator.rank_tuples(expanded_integ_result, rankCol='Overall Rank')
)
display(ranked_df)

Unnamed: 0,Title,Fans' Rank,Year,Director(s),Overall Rank,Religion,Actor
0,Citizen Kane,3,1941,Orson Welles,2,,Orson Welles
1,Star Wars,5,1977,George Lucas,8,,
2,Pulp Fiction,2,1994,Quentin Tarantino,15,Atheist,John Travolta
3,Night of the Living Dead,229,1968,George A. Romero,139,Christian,
4,The Piano,307,1993,Jane Campion,247,Christian,
5,Kill Bill: Vol. 2,224,2004,Quentin Tarantino,701,,


In [9]:
# Filter specification handed to filter_tuples: column name -> list of values.
filter_dict = {
    'Director(s)': ['Quentin Tarantino', 'Steven Spielberg'],
    'Overall Rank': list(range(1000)),  # the integers 0 through 999
}
# Filter the expanded result, then apply overlap highlighting;
# filtered_df ends up holding the highlighted object.
filtered_df = table_integrator.highlight_overlap_values(
    table_integrator.filter_tuples(expanded_integ_result, filter_dict)
)
display(filtered_df)

Unnamed: 0,Title,Fans' Rank,Year,Director(s),Overall Rank,Religion,Actor
0,Pulp Fiction,2,1994,Quentin Tarantino,15,Atheist,John Travolta
1,Kill Bill: Vol. 2,224,2004,Quentin Tarantino,701,,


In [10]:
# Outer join list of candidate tables retrieved from data lake
candidate_dfs = list(originating_tables_finder.candidate_table_dfs.values())  # candidate tables
# Join, reorder rows/columns, then apply overlap highlighting;
# joined_df ends up holding the highlighted object.
joined_df = table_integrator.highlight_overlap_values(
    table_integrator.order_rows_cols(utils.outerjoin(candidate_dfs))
)
display(joined_df)

Unnamed: 0,Title,Fans' Rank,Year,Director(s),Overall Rank,Religion,Actor
0,Pulp Fiction,,1994,,,,
1,Pulp Fiction,2.0,1994,Quentin Tarantino,15.0,Atheist,
2,Pulp Fiction,,1994,Quentin Tarantino,15.0,,John Travolta
3,Citizen Kane,3.0,1941,Orson Welles,2.0,,
4,Citizen Kane,,1941,,,,
5,Citizen Kane,,1941,Orson Welles,2.0,,Orson Welles
6,Star Wars,,1977,George Lucas,8.0,,
7,Star Wars,5.0,1977,George Lucas,8.0,,
8,The Piano,307.0,1993,Jane Campion,247.0,Christian,
9,Night of the Living Dead,229.0,1968,George A. Romero,139.0,Christian,
