In [None]:
import os

from main import init_system
from api.apiutils import Relation
# Location of the serialized discovery model. Set the DISCOVERY_MODEL_PATH
# environment variable to point the notebook at your own copy; when it is not
# set we fall back to the original author's local path, so existing setups
# keep working unchanged.
# NOTE(review): the ".pickle" extension suggests the model is unpickled by
# init_system -- only point this at files you trust, since unpickling
# untrusted data can execute arbitrary code.
MODEL_PATH = os.environ.get(
    "DISCOVERY_MODEL_PATH",
    "/Users/ra-mit/development/discovery_proto/test/mitdwh.pickle",
)

# init_system hands back the two objects every later cell relies on:
# `api` (discovery queries) and `reporting` (statistics about the network).
api, reporting = init_system(MODEL_PATH)

In [None]:
# Render matplotlib figures inline, in the output area of the cell that draws them.
%matplotlib inline

## Discovery Functions

### Similar Tables

In [None]:
# The table we want to find look-alikes for.
reference_table = "Fac_building.csv"

# Get the DRS representation of that table; the api calls below take a DRS.
reference_drs = api.drs_from_table(reference_table)

# Here "similar" means: similar content to the reference table.
similar_tables = api.similar_content_to(reference_drs)

In [None]:
# We want to look at tables, so put the DRS in table mode before iterating it.
similar_tables.set_table_mode()

# Print every element of the result, one per line.
for table_hit in similar_tables:
    print(table_hit)

For convenience, DRS objects have the methods "print_tables()" and "print_columns()", which we use from now on.

In [None]:
# Print the tables in the result using the DRS convenience method.
similar_tables.print_tables()

In [None]:
# Column-level view of the same result.
# NOTE(review): presumably a formatted variant of print_columns() -- confirm in the DRS API.
similar_tables.pretty_print_columns()

**We want to tune our definition of "similar table": both the content and the schema names must be similar**

In [None]:
# The table we want to find look-alikes for, and its DRS representation.
reference_table = "Fac_building.csv"
reference_drs = api.drs_from_table(reference_table)

# Two independent notions of similarity with respect to the reference table:
# similar content, and similar schema (attribute) names.
content_similar = api.similar_content_to(reference_drs)
schema_similar = api.similar_schema_name_to(reference_drs)

# A table counts as similar only when it shows up in both results.
similar_tables = api.intersection(content_similar, schema_similar)

In [None]:
# Tables that are similar to the reference in both content and schema names.
similar_tables.print_tables()

**A table is similar to the reference table when both content and schema are similar and there is also a PKFK relationship involved**

In [None]:
# The table we want to find look-alikes for, and its DRS representation.
reference_table = "Fac_building.csv"
reference_drs = api.drs_from_table(reference_table)

# Three criteria, each evaluated against the reference table.
content_similar = api.similar_content_to(reference_drs)      # similar content
schema_similar = api.similar_schema_name_to(reference_drs)   # similar attribute names
pkfk_similar = api.pkfk_of(reference_drs)                    # some PKFK relationship involved

# Combine them pairwise: first content AND schema, then AND the PKFK result.
content_and_schema_similar = api.intersection(content_similar, schema_similar)
similar_tables = api.intersection(content_and_schema_similar, pkfk_similar)

In [None]:
# Tables that satisfy all three criteria: content, schema names and PKFK.
similar_tables.print_tables()

In [None]:
# Visualize the provenance of this result.
# NOTE(review): presumably a graph of how the result was derived -- confirm in the DRS docs.
similar_tables.visualize_provenance()

**Trying it out with a different table: "Fclt_building_hist_1.csv"**

In [None]:
# Same three-criteria query as before, now against a different reference table.
reference_table = "Fclt_building_hist_1.csv"
reference_drs = api.drs_from_table(reference_table)

# Three criteria, each evaluated against the reference table.
content_similar = api.similar_content_to(reference_drs)      # similar content
schema_similar = api.similar_schema_name_to(reference_drs)   # similar attribute names
pkfk_similar = api.pkfk_of(reference_drs)                    # some PKFK relationship involved

# Combine them pairwise: first content AND schema, then AND the PKFK result.
content_and_schema_similar = api.intersection(content_similar, schema_similar)
similar_tables = api.intersection(content_and_schema_similar, pkfk_similar)

In [None]:
# Tables matching the content, schema and PKFK criteria for Fclt_building_hist_1.csv.
similar_tables.print_tables()

**Trying it out with a different table: "Iap_subject_detail.csv"**

In [None]:
# The table we want to find look-alikes for.
reference_table = "Iap_subject_detail.csv"

# Get the DRS representation of that table; the api calls below take a DRS.
reference_drs = api.drs_from_table(reference_table)

# Content-only definition of "similar".
similar_tables = api.similar_content_to(reference_drs)

In [None]:
# Tables whose content is similar to Iap_subject_detail.csv.
similar_tables.print_tables()

In [None]:
# Same reference table, stricter definition: content AND schema names.
reference_table = "Iap_subject_detail.csv"
reference_drs = api.drs_from_table(reference_table)

# Evaluate each criterion against the reference table.
content_similar = api.similar_content_to(reference_drs)
schema_similar = api.similar_schema_name_to(reference_drs)

# Keep only what appears in both results.
similar_tables = api.intersection(content_similar, schema_similar)

In [None]:
# Tables similar to Iap_subject_detail.csv in both content and schema names.
similar_tables.print_tables()

### Schema Complement

Given a reference table, I want to know which attributes I can extend it with.

In [None]:
# The table we want to extend, and its DRS representation.
reference_table = "short_cis_course_catalog.csv"
reference_drs = api.drs_from_table(reference_table)

# What has a PKFK relationship with the reference is what we can join with.
pkfk_neighbours = api.pkfk_of(reference_drs)

# Expand that result to table level to collect the candidate attributes.
candidate_attributes = api.drs_expand_to_table(pkfk_neighbours)

# Take the difference with the reference table's own DRS; what is left is new.
attrs_to_extend = api.difference(candidate_attributes, reference_drs)

In [None]:
# List the attributes (columns) that could be used to extend the reference table.
attrs_to_extend.print_columns()

### Join Path

Given two reference tables, I want to know whether I can join them directly or through some other tables.

In [None]:
# First endpoint of the path search.
start_table_name = "Drupal_employee_directory.csv"
start_drs = api.drs_from_table(start_table_name)

# Second endpoint of the path search.
end_table_name = "Employee_directory.csv"
end_drs = api.drs_from_table(end_table_name)

# Look for paths between the two tables over the PKFK relation.
paths = api.paths_between(start_drs, end_drs, Relation.PKFK)

In [None]:
# Fetch the paths from the result and print each one on its own line.
# print() already converts its argument with str(), so no explicit cast is needed.
res = paths.paths()
for join_path in res:
    print(join_path)

In [None]:
# Visualize the provenance of the PKFK path result.
paths.visualize_provenance()

In [None]:
# First endpoint of the path search.
start_table_name = "Drupal_employee_directory.csv"
start_drs = api.drs_from_table(start_table_name)

# Second endpoint of the path search.
end_table_name = "Employee_directory.csv"
end_drs = api.drs_from_table(end_table_name)

# Same two tables as the PKFK search, but over the content-similarity relation.
paths = api.paths_between(start_drs, end_drs, Relation.CONTENT_SIM)

In [None]:
# Print the tables contained in the content-similarity path result.
paths.print_tables()

In [None]:
# Visualize the provenance of the content-similarity path result.
paths.visualize_provenance()

## Reporting

### Visualizations

In [None]:
# Keyword search for "Madden"; the result is reused by the next cells.
# NOTE(review): what gets searched (values, attribute names, ...) depends on the API's defaults -- confirm.
res = api.keyword_search("Madden")

In [None]:
# Show the columns in the keyword-search result.
res.print_columns()

In [None]:
# Visualize the provenance of the search result, this time with labels enabled.
res.visualize_provenance(labels=True)

### Data about the network

In [None]:
# Number of columns in the network (shown as the cell's output).
reporting.num_columns

In [None]:
# Number of tables in the network (shown as the cell's output).
reporting.num_tables

In [None]:
# Number of content-similarity relations in the network (shown as the cell's output).
reporting.num_content_sim_relations

In [None]:
# Number of schema-similarity relations in the network (shown as the cell's output).
reporting.num_schema_sim_relations

In [None]:
# Number of PKFK relations in the network (shown as the cell's output).
reporting.num_pkfk_relations