In [1]:
import pandas as pd
import pymongo
import matplotlib.pyplot as plt
from tqdm import tqdm
import numpy as np
import sys
import os
import itertools
import json

In [2]:
DB_URI = "mongodb://127.0.0.1"
METRICS = "containment_min_score"
SET_SIMILARITY_THRESHOLD = 0.0
UNION_THRESHOLD = 0.0
CHUNK_SIZE = 1000

In [3]:
mongo_client = pymongo.MongoClient(DB_URI)
db = mongo_client['opencanada']
setoverlapresults_collection = db["setoverlapresults"]
inferredstats_collection = db["inferredstats"]
inferredcolumnstats_collection = db["inferredcolumnstats"]
metadata_collection = db["metadata"]
keyjoinscores_collection = db["keyjoinscores"]
usecasediscoveries_collection = db["usecasediscoveries"]

In [4]:
inferredstats_fields = {}
for i in inferredstats_collection.find({}):
    fields = []
    for f in i['schema']['fields']:
        fields.append(f['name'].lower())
    fields.sort()
    inferredstats_fields[i['uuid']] = fields

In [5]:
combs = list(itertools.combinations(sorted(list(inferredstats_fields.keys())), 2))
results = []
for t1, t2 in tqdm(combs):
    fields_t1 = inferredstats_fields[t1]
    fields_t2 = inferredstats_fields[t2]
    is_same_schema = fields_t1 == fields_t2
    if is_same_schema:
        results.append({
            'table_1': t1,
            'table_2': t2,
            'is_same_schema': is_same_schema
        })
schemas_hash = {}
for (k, v) in inferredstats_fields.items():
    hash_key = json.dumps(v)
    if hash_key not in schemas_hash:
        schemas_hash[hash_key] = []
    schemas_hash[hash_key].append(k)
groups = list(schemas_hash.values())
groups_more_than_one = [g for g in groups if len(g) > 1]

100%|██████████| 112267620/112267620 [01:02<00:00, 1787976.35it/s]


In [6]:
resources_hash = {}
for i in metadata_collection.aggregate([{'$match': {'resources.format': 'CSV'}}, {'$unwind': '$resources'}, {'$match': {'resources.format': 'CSV'}}]):
    resources_hash[i['resources']['id']] = i['resources']

In [7]:
def is_same_language(uuid1, uuid2):
    res_1 = resources_hash[uuid1]
    res_2 = resources_hash[uuid2]
    # get intersection of languages
    languages_1 = set(res_1['language'])
    languages_2 = set(res_2['language'])
    intersection = languages_1.intersection(languages_2)
    return len(intersection) > 0

In [8]:
def is_same_schema(uuid1, uuid2):
    fields_1 = inferredstats_fields[uuid1]
    fields_2 = inferredstats_fields[uuid2]
    return fields_1 == fields_2

In [9]:
groups_joinable = []
for group in tqdm(groups_more_than_one):
    joinable_dict = {}
    for uuid in group:
        joinables = []
        for joinable in keyjoinscores_collection.find({
            'query_uuid': uuid, 
            METRICS: {'$gte': SET_SIMILARITY_THRESHOLD}}):
            if not is_same_language(uuid, joinable['target_uuid']):
                continue
            if is_same_schema(uuid, joinable['target_uuid']):
                continue
            joinables.append(joinable)
        if len(joinables) == 0:
            continue
        joinable_scores_dict = {j['target_uuid']: j[METRICS] for j in joinables}
        joinable_uuids = [j['target_uuid'] for j in joinables]

        curr_joinable_dict = {}
        for target_uuid in joinable_uuids:
            fields = inferredstats_fields[target_uuid]
            curr_joinable_dict[target_uuid] = {
                'fields': fields, 
                'score': joinable_scores_dict[target_uuid]}
        joinable_dict[uuid] = curr_joinable_dict
    if(len(joinable_dict) == 0):
        continue
    groups_joinable.append(joinable_dict)

100%|██████████| 1409/1409 [09:57<00:00,  2.36it/s] 


In [10]:
output = []
for group in tqdm(groups_joinable):
    schemas_groups_dict = {}
    for query_uuid, target_schemas in group.items():
        for target_uuid, target_schema in target_schemas.items():
            hash_key = json.dumps(target_schema['fields'])
            if hash_key not in schemas_groups_dict:
                schemas_groups_dict[hash_key] = {}
            if query_uuid not in schemas_groups_dict[hash_key] or schemas_groups_dict[hash_key][query_uuid]['score'] < target_schema['score']:
                schemas_groups_dict[hash_key][query_uuid] = {
                    'target_uuid': target_uuid, 'score': target_schema['score']}
    for schema_group in schemas_groups_dict.values():
        if len(schema_group) < 2:
            continue
        joinable_percentage = len(schema_group) / len(group)
        if joinable_percentage < UNION_THRESHOLD:
            continue
        output_dict = {"union": [], "joinable_percentage": joinable_percentage}
        for query_uuid, target in schema_group.items():
            target_uuid = target['target_uuid']
            score = target['score']
            joinable_pair = keyjoinscores_collection.find(
                {"query_uuid": query_uuid, "target_uuid": target_uuid}).sort([(METRICS, -1)]).limit(1).next()
            output_dict["union"].append({
                "join": {
                    "query_uuid": query_uuid,
                    "target_uuid": target_uuid,
                    "query_index": joinable_pair["query_index"],
                    "target_index": joinable_pair["target_index"],
                },
                "score": joinable_pair[METRICS]
            })
        if(len(output_dict["union"])) == 0:
            continue
        output.append(output_dict)


100%|██████████| 727/727 [00:38<00:00, 18.72it/s] 


In [11]:
usecasediscoveries_collection.delete_many({})
usecasediscoveries_collection.insert_many(output)

<pymongo.results.InsertManyResult at 0x2dea0fb00>