In [None]:
import re
import pickle

import pymongo
import polars as pl
from tabulate import tabulate

from dltf.utils import tables
from dltf.utils import misc
from dltf.gsa.josie.josie import JOSIEGS
from dltf.utils.datalake import MongoDBDataLakeHandler

import importlib
importlib.reload(misc)

<module 'dltf.utils.misc' from '/home/nanni/datalake-table-finder/dltf/utils/misc.py'>

### Open the connection to MongoDB

In [2]:
mc = pymongo.MongoClient()
collection = mc.sloth.latest_snapshot_tables

### Function to extract results from JOSIE CSV results file

In [3]:
def get_result_ids(s):
    return list(map(int, re.findall(r'\d+', s)[::2]))

def get_result_overlaps(s):
    return list(map(int, re.findall(r'\d+', s)[1::2]))

### Setup for further inspections

In [4]:
data_path                   = f'./data'
mode                        = 'bag'
blacklist                   = set(['–', '—', '-', '●', '&nbsp', '&nbsp;', '&nbsp; &nbsp;', 'yes', 'no', 'n/a', 'none', '{{y}}', '{{n}}', '{{yes}}', '{{no}}', '{{n/a}}'] + list(map(str, range(1000))))
string_translators          = ['whitespace', 'lowercase']
string_patterns             = [] # [[re.compile(r'^.*\|([^}]*)\}\}$'),  r'\1']]

k                           = 10
min_w                       = 3
min_h                       = 10

# Set up the DataLake handler
datalake_name               = 'demo'
datalake_location           = 'mongodb'
datasets                    = ['sloth.latest_snapshot_tables']
dlh                         = MongoDBDataLakeHandler(datalake_location, datalake_name, datasets)


# JOSIE (global search tool) parameters
force_sampling_cost         = False # force JOSIE to do cost sampling before querying
token_table_on_memory       = False # build the token table used by JOSIE directly on disk
results_directory           = f'{data_path}'
tokens_bidict_file          = f'{data_path}/josie-tokens-bidict.pickle'
results_file                = f'{data_path}/results/result.csv'
log_file                    = f'{data_path}/.log'

# connection info for the JOSIE inverted index
db_config = {
    'drivername': 'postgresql',
    'database'  : 'DEMODB',
    'port'      :  5442,
    'host'      : 'localhost',
    'username'  : 'demo',
    'password'  : 'demo',
}

# Instatiate JOSIE
josie = JOSIEGS(
    mode=mode,
    blacklist=blacklist,
    datalake_handler=dlh,
    string_translators=string_translators,
    string_patterns=string_patterns,
    dbstatfile=None,
    tokens_bidict_file=tokens_bidict_file,
    josie_db_connection_info=db_config,
    spark_config=None
)

In [11]:
# Load the bidictionary between the JOSIE tokens IDs and the correspondent original string
with open(tokens_bidict_file, 'rb') as fr:
    tokens_bidict = pickle.load(fr)

In [12]:
def prepare_query(qdoc, mode, blacklist, string_translators, string_patterns, tokens_bidict):    
    # Extract a bag of tokens from the document's content
    query_sets = [
        [
            doc['_id_numeric'],
            tables.table_to_tokens(
                table=doc['content'], 
                valid_columns=doc['valid_columns'], 
                mode=mode, 
                blacklist=blacklist,
                string_translators=string_translators,
                string_patterns=string_patterns
            )
        ]
        for doc in [qdoc]
    ]
    
    # Map each token in the sets with its correspondent token ID for JOSIE
    query_sets = [
        [
            query_id, 
            sorted([
                tokens_bidict.inverse[misc.clean_string(token, string_translators, string_patterns)]
                for token in query_set 
                if misc.clean_string(token, string_translators, string_patterns) in tokens_bidict.inverse
            ])
        ]
        for query_id, query_set in query_sets
    ]

    # Transform the list <ID, tokens[]> into a dictionary <ID: tokens[]>
    query_sets = dict(query_sets)
    return query_sets

## Inspect results

In [23]:
query_id = 558808

In [24]:
qdoc = collection.find_one({'_id_numeric': query_id})
num_header_rows = qdoc['num_header_rows']
headers = qdoc['content'][:num_header_rows][0] if num_header_rows > 0 else []
content = [[misc.clean_string(cell, string_translators, string_patterns) for cell in row] for row in qdoc['content']]

print(f'Table {qdoc["_id_numeric"]}')
# print(f'Keeping columns: {keep_columns}, filtered tokens: {blacklist}')
print(f"Num header rows: {qdoc['num_header_rows']}, valid columns: {qdoc['valid_columns']}")
print(f'Context: {" - ".join(qdoc["context"])}')
print(tabulate(content[num_header_rows:], headers=headers, tablefmt='simple_outline'))

Table 558808
Num header rows: 1, valid columns: [0, 1, 1, 1, 0, 0, 0, 0, 1]
Context: List of country subdivisions by GDP over 200 billion USD - List
┌────────┬─────────────────────────────────┬───────────────────────┬───────────────┬───────┬────────┬───────┬────┬────────────────┐
│   Rank │ Subdivision                     │ Country               │ Continent     │ Nominal
GDP
(billions
USD)       │   Year │       Population
(millions) │    Nominal
GDP per
Capita
(thousands
USD) │ Largest
city                │
├────────┼─────────────────────────────────┼───────────────────────┼───────────────┼───────┼────────┼───────┼────┼────────────────┤
│      1 │ {{flag|california}}             │ {{usa}}               │ north america │ 3,020 │   2018 │  39.6 │ 76 │ los angeles    │
│      2 │ kanto region                    │ {{jpn}}               │ asia          │ 2,500 │   2014 │  43.3 │ 60 │ tokyo          │
│      3 │ {{flag|england}}                │ {{gbr}}               │ europe        │ 2,080

In [25]:
# Search results for the query sets
r = josie.query(
    results_file=results_file, 
    k=k, 
    queries=prepare_query(qdoc, mode, blacklist, string_translators, string_patterns, tokens_bidict)
)

qdoc['content'] = [[misc.clean_string(cell, string_translators, string_patterns) for cell in row] for row in qdoc['content']]

In [26]:
df = pl.read_csv(results_file + '.raw')
josie_res = df.select('query_id', 'results').rows()[0][1]
sloth_res = []

if josie_res is not None:
    josie_res = [(i, rid, jov) for i, (rid, jov) in enumerate(zip(get_result_ids(josie_res), get_result_overlaps(josie_res)))]
    print(f'Number of results: {len(josie_res)}')
    for i, rid, jov in josie_res:
        rdoc = collection.find_one({'_id_numeric': rid})
        rdoc['content'] = [[misc.clean_string(cell, string_translators, string_patterns) for cell in row] for row in rdoc['content']]
        
        try:
            sloth_overlap = misc.largest_overlap_sloth(
                qdoc['content'][qdoc['num_header_rows']:], 
                rdoc['content'][rdoc['num_header_rows']:], 
                qdoc['valid_columns'], 
                rdoc['valid_columns'], 
                blacklist=blacklist, 
                min_w=min_w,
                min_h=min_h,
                verbose=False
            )[0]
            sloth_res.append([i, rid, sloth_overlap])
        except KeyboardInterrupt: 
            sloth_res.append([i, rid, -1])
            
        print(f"{i}\t{rid}\t{jov}:  {' - '.join(rdoc['context'])}")
else:
    print('No results')

Number of results: 10
0	48147	216:  List of cities by GDP - 
1	1737281	135:  List of metropolitan areas by population - List
2	393853	128:  List of largest cities - Largest cities
3	488489	125:  Into the Wild Tour - Tour dates
4	922693	121:  Junkie Tour - Tour dates
5	541043	105:  Hot Fuss Tour - Tour dates
6	1918462	102:  List of cities by international visitors - 
7	781281	100:  Beacon World Tour - Tour dates
8	672464	99:  High Flying Birds Tour - Tour dates
9	189762	99:  Megacity - Largest cities


In [27]:
sloth_res.sort(key=lambda x: x[2], reverse=True)
sloth_res

[[0, 48147, 93],
 [9, 189762, 69],
 [1, 1737281, 0],
 [2, 393853, 0],
 [3, 488489, 0],
 [4, 922693, 0],
 [5, 541043, 0],
 [6, 1918462, 0],
 [7, 781281, 0],
 [8, 672464, 0]]

In [17]:
j = 0

josie_rank, rid, tabov = sloth_res[j]
rdoc = collection.find_one({'_id_numeric': rid})
rdoc['content'] = [[misc.clean_string(cell, string_translators, string_patterns) for cell in row] for row in rdoc['content']]

print(f'Table {rdoc["_id_numeric"]}, JOSIE rank: {josie_rank}, SLOTH rank: {j}, overlap {tabov}')
print(f'Num header rows: {rdoc["num_header_rows"]}, valid columns: {rdoc["valid_columns"]}')
print(f'Context: {" - ".join(rdoc["context"])}')
print(tabulate(rdoc['content'], headers='firstrow', tablefmt='simple_outline'))

r = misc.largest_overlap_sloth(
    qdoc['content'][qdoc['num_header_rows']:], rdoc['content'][rdoc['num_header_rows']:], qdoc['valid_columns'], rdoc['valid_columns'],
    verbose=True,
    min_h=min_h,
    min_w=min_w,
    blacklist=blacklist,
    )

Table 229843, JOSIE rank: 0, SLOTH rank: 0, overlap 90
Num header rows: 0, valid columns: [0, 1, 1, 1, 1, 0]
Context: Largest cities in the Americas - 
┌────┬─────────────────────────┬─────────────────────────────────────────────────────────────────────────────────┬─────────────────────────┬───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┬─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐
│    │ city                    │ image                                                                           │ country       

In [37]:
'n/a' in blacklist

False

In fase di pulizia delle tabelle si possono togliere le cose tra doppie parentesi quadre? 

Colombia: 169633

UK ethnicity parliament: 520835

Country GDP: 558808

Largest cities in america: 572002

939555

Congo: 7376

Stadiums: 9892

Paleontology: 1003853

Army equipment: 1037572

Universities: 751207