# DIALITE: Discover, Align and Integrate open Data Tables

## Demo 1: DIALITE use case

In [1]:
# Import the necessary libraries
import pandas as pd
import dialite_server as dialite
import json
import time
import glob

## Step 1: Discover
The first step of DIALITE is to search open data repositories for tables related to the query table. DIALITE offers state-of-the-art table search techniques for finding joinable, unionable, or otherwise related tables in these repositories.

In [2]:
# Load the query table that the discovery step below searches with.
QUERY_TABLE_PATH = "data/query/covid19_t1.csv"
query_table = dialite.upload_query_table(QUERY_TABLE_PATH)

Query table name: covid19_t1.csv
Query table uploaded successfully.
   Country        City  Vaccination Rate
0  Germany      Berlin                63
1  England  Manchester                78
2    Spain   Barcelona                82


In [3]:
# Run table discovery for the query table and build the integration set.
# Note: this call is interactive -- it prompts for the index of the intent
# column and the index of the query column.
discovery_algorithms = ['SANTOS', 'JOSIE']
integration_set = dialite.discover_tables(
    query_table,
    algorithm=discovery_algorithms,
    k=1,
)

Enter index of intent column:
1
Enter index of query column:
1
Integration set after table discovery:
covid19_t3.csv
covid19_t1.csv
covid19_t2.csv


In [4]:
# Integrate all tables in the integration set into a single table
# using the ALITE integration algorithm.
integrated_table = dialite.integrate_tables(
    integration_set,
    algorithm="ALITE",
)

Successfully integrated 3 tables using ALITE.
Integrated table:
   Country         City  Vaccination Rate  Total Cases  Death Rate
0  Germany       Berlin              63.0    1411749.0       147.0
1  England   Manchester              78.0          NaN         NaN
2    Spain    Barcelona              82.0    2676188.0       275.0
3   Canada      Toronto              83.0          NaN         NaN
4   Mexico  Mexico City               NaN          NaN         NaN
5      USA       Boston              62.0     263978.0       335.0
6      NaN    New Delhi               NaN    2006680.0       158.0


In [5]:
# Downstream analysis: run a SQL query over the integrated table.
# This example looks up the city with the lowest known vaccination rate.

# NOTE(review): this name is derived from query_table (not integrated_table)
# and is not used in this cell -- confirm whether a later cell needs it.
integrated_table_name = dialite.get_table_name(query_table)

# The integrated table has rows with no vaccination rate. SQL engines disagree
# on whether missing values sort first or last under ORDER BY ... ASC, so they
# are filtered out explicitly to keep the answer engine-independent.
sql = (
    "SELECT City FROM integrated_table "
    "WHERE `Vaccination Rate` IS NOT NULL "
    "ORDER BY `Vaccination Rate` ASC LIMIT 1"
)
dialite.analyze_sql(integrated_table, query=sql)

     City
0  Boston


In [6]:
# Pairwise correlation between the numeric attributes, computed directly on
# the pandas DataFrame. Country and City are text columns: since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns silently and raises
# instead, so restrict the frame to numeric columns first (this form also
# works on older pandas versions).
integrated_table.select_dtypes(include="number").corr()

Unnamed: 0,Vaccination Rate,Total Cases,Death Rate
Vaccination Rate,1.0,0.899881,0.160787
Total Cases,0.899881,1.0,-0.285772
Death Rate,0.160787,-0.285772,1.0
