In [2]:
import re
import pickle

import pymongo
import polars as pl
from tabulate import tabulate

from dltf.utils import tables
from dltf.utils import misc
from dltf.testers.josie.josie import JOSIETester
from dltf.utils.datalake import MongoDBDataLakeHandler

import importlib
importlib.reload(misc)

<module 'dltf.utils.misc' from '/home/nanni/datalake-table-finder/dltf/utils/misc.py'>

### Open the connection to MongoDB

In [3]:
# Client created with PyMongo's default connection settings (no host/port given)
mc = pymongo.MongoClient()
# Collection queried below to fetch table documents by `_id_numeric`
collection = mc['sloth']['latest_snapshot_tables']

### Functions to extract results from the JOSIE CSV results file

In [4]:
def get_result_ids(s):
    """Return the integers found at even positions (0, 2, 4, ...) in `s`.

    Every run of digits in `s` is extracted in order; the notebook treats
    the 1st, 3rd, 5th, ... of them as the IDs of the result tables.
    """
    numbers = re.findall(r'\d+', s)
    return [int(number) for number in numbers[::2]]

def get_result_overlaps(s):
    """Return the integers found at odd positions (1, 3, 5, ...) in `s`.

    Every run of digits in `s` is extracted in order; the notebook treats
    the 2nd, 4th, 6th, ... of them as the overlap paired with each result ID.
    """
    numbers = re.findall(r'\d+', s)
    return [int(number) for number in numbers[1::2]]

### Setup for further inspections

In [19]:
# Root folder for every file read or written by this notebook
data_path = './data'

# Tokenisation settings, shared by the JOSIE tester and the query preparation
mode = 'bag'
string_translators = ['whitespace', 'lowercase']
# No regex rewriting is active; a previously tried pattern was
# [[re.compile(r'^.*\|([^}]*)\}\}$'),  r'\1']]
string_patterns = []

# Cell values handed to JOSIE and SLOTH as the blacklist: dash/bullet glyphs,
# non-breaking-space residue, yes/no/n-a style placeholders (plain and in
# {{...}} form) and the strings '0' ... '999'
blacklist = {
    '–', '—', '-', '●',
    '&nbsp', '&nbsp;', '&nbsp; &nbsp;',
    'yes', 'no', 'n/a', 'none',
    '{{y}}', '{{n}}', '{{yes}}', '{{no}}', '{{n/a}}',
} | {str(number) for number in range(1000)}

# k is passed to josie.query; min_w / min_h are passed to largest_overlap_sloth
k = 10
min_w = 3
min_h = 10

# Set up the DataLake handler
datalake_location = 'mongodb'
datalake_name = 'demo'
datasets = ['sloth.latest_snapshot_tables']
dlh = MongoDBDataLakeHandler(datalake_location, datalake_name, datasets)

# JOSIE (global search tool) parameters
force_sampling_cost = False     # force JOSIE to do cost sampling before querying
token_table_on_memory = False   # build the token table used by JOSIE directly on disk
results_directory = data_path
tokens_bidict_file = data_path + '/josie-tokens-bidict.pickle'
results_file = data_path + '/results/result.csv'
log_file = data_path + '/.log'

# Connection info for the JOSIE inverted index
# NOTE(review): credentials are hard-coded; presumably fine for a local demo
# database only - confirm before reusing elsewhere
db_config = dict(
    drivername='postgresql',
    database='DEMODB',
    port=5442,
    host='localhost',
    username='demo',
    password='demo',
)

# Instantiate JOSIE
josie = JOSIETester(
    mode=mode,
    blacklist=blacklist,
    datalake_handler=dlh,
    string_translators=string_translators,
    string_patterns=string_patterns,
    tokens_bidict_file=tokens_bidict_file,
    josie_db_connection_info=db_config,
    dbstatfile=None,
    spark_config=None,
)

In [20]:
# Load the bidirectional dictionary between the JOSIE token IDs and the
# corresponding original strings (its `.inverse` view is used below to look up
# the token ID of a string).
# NOTE(review): pickle.load can execute arbitrary code - only load a file
# produced by a trusted run of the indexing pipeline.
with open(tokens_bidict_file, 'rb') as fr:
    tokens_bidict = pickle.load(fr)

In [21]:
def prepare_query(qdoc, mode, blacklist, string_translators, string_patterns, tokens_bidict):
    """Build the JOSIE query for a single table document.

    Parameters
    ----------
    qdoc : mapping
        Table document; its '_id_numeric', 'content' and 'valid_columns'
        entries are read.
    mode, blacklist, string_translators, string_patterns
        Forwarded unchanged to `tables.table_to_tokens`; the translators and
        patterns are also used to normalise each token before the lookup.
    tokens_bidict
        Bidirectional dictionary whose `.inverse` view maps a normalised
        token string to its JOSIE token ID.

    Returns
    -------
    dict
        `{qdoc['_id_numeric']: sorted list of JOSIE token IDs}`. Tokens that
        are not present in `tokens_bidict.inverse` are silently dropped.
    """
    query_id = qdoc['_id_numeric']

    # Extract the tokens from the document's content
    query_tokens = tables.table_to_tokens(
        table=qdoc['content'],
        valid_columns=qdoc['valid_columns'],
        mode=mode,
        blacklist=blacklist,
        string_translators=string_translators,
        string_patterns=string_patterns
    )

    # Map each token to its corresponding JOSIE token ID, normalising every
    # token exactly once (instead of once for the filter and once for the lookup)
    token_to_id = tokens_bidict.inverse
    token_ids = []
    for token in query_tokens:
        cleaned = misc.clean_string(token, string_translators, string_patterns)
        if cleaned in token_to_id:
            token_ids.append(token_to_id[cleaned])
    token_ids.sort()

    # Single-entry dictionary <ID: tokens[]>, the shape passed to josie.query
    return {query_id: token_ids}

## Inspect results

In [55]:
# `_id_numeric` of the table used as the query in the cells below
# (listed as "Universities" in the notes at the end of the notebook)
query_id = 751207

In [56]:
# Fetch the query table from MongoDB
qdoc = collection.find_one({'_id_numeric': query_id})
num_header_rows = qdoc['num_header_rows']

# The first raw row is used as the header line, when the table has one
if num_header_rows > 0:
    headers = qdoc['content'][0]
else:
    headers = []

# Normalised copy of the cells, used only for display (qdoc is left untouched)
content = [
    [misc.clean_string(cell, string_translators, string_patterns) for cell in row]
    for row in qdoc['content']
]

# Summary of the table followed by its body (header rows skipped)
print(f'Table {qdoc["_id_numeric"]}')
print(f"Num header rows: {qdoc['num_header_rows']}, valid columns: {qdoc['valid_columns']}")
print(f'Context: {" - ".join(qdoc["context"])}')
print(tabulate(content[num_header_rows:], headers=headers, tablefmt='simple_outline'))

Table 751207
Num header rows: 0, valid columns: [1, 1, 1, 1]
Context: List of research universities in the United States - Universities classified as "R1: Doctoral Universities – Very high research activity"
┌──────────────────────────────────────────────────────────────────┬──────────────────────┬─────────────────┬───────┐
│ institution                                                      │ control              │ city            │ state │
│ arizona state university                                         │ public               │ tempe           │ az    │
│ auburn university                                                │ public               │ auburn          │ al    │
│ binghamton university                                            │ public               │ vestal          │ ny    │
│ boston college                                                   │ private (non-profit) │ chestnut hill   │ ma    │
│ boston university                                                │ private (non-pr

In [57]:
# Run the JOSIE top-k search for the query table; results go to `results_file`
r = josie.query(
    queries=prepare_query(
        qdoc=qdoc,
        mode=mode,
        blacklist=blacklist,
        string_translators=string_translators,
        string_patterns=string_patterns,
        tokens_bidict=tokens_bidict,
    ),
    k=k,
    results_file=results_file,
)

# Normalise the query cells in place: the SLOTH comparison in the next cells
# reads qdoc['content'] and cleans the result tables the same way
qdoc['content'] = [
    [misc.clean_string(cell, string_translators, string_patterns) for cell in row]
    for row in qdoc['content']
]

In [58]:
# Read the raw JOSIE output and take the results string of the first row
df = pl.read_csv(results_file + '.raw')
josie_res = df.select('query_id', 'results').rows()[0][1]

# Filled with [JOSIE rank, result ID, SLOTH overlap] entries
sloth_res = []

if josie_res is None:
    print('No results')
else:
    # Turn the results string into (JOSIE rank, result ID, JOSIE overlap) triples
    josie_res = [
        (i, rid, jov)
        for i, (rid, jov) in enumerate(
            zip(get_result_ids(josie_res), get_result_overlaps(josie_res))
        )
    ]
    print(f'Number of results: {len(josie_res)}')

    for i, rid, jov in josie_res:
        # Fetch the result table and normalise its cells like the query's
        rdoc = collection.find_one({'_id_numeric': rid})
        rdoc['content'] = [
            [misc.clean_string(cell, string_translators, string_patterns) for cell in row]
            for row in rdoc['content']
        ]

        try:
            # First element of the value returned by largest_overlap_sloth,
            # computed on the two tables without their header rows
            sloth_overlap = misc.largest_overlap_sloth(
                qdoc['content'][qdoc['num_header_rows']:],
                rdoc['content'][rdoc['num_header_rows']:],
                qdoc['valid_columns'],
                rdoc['valid_columns'],
                blacklist=blacklist,
                min_w=min_w,
                min_h=min_h,
                verbose=False
            )[0]
        except KeyboardInterrupt:
            # A manual interrupt skips this computation; -1 marks the skipped entry
            sloth_res.append([i, rid, -1])
        else:
            sloth_res.append([i, rid, sloth_overlap])

        print(f"{i}\t{rid}\t{jov}:  {' - '.join(rdoc['context'])}")

Number of results: 10
0	1757533	512:  Research I university - Institutions
1	815257	256:  List of NCAA Division I basketball arenas - Current arenas
2	755346	247:  List of research universities in the United States - Universities classified as "R2: Doctoral Universities – High research activity"
3	1519949	216:  List of NCAA Division I women's soccer programs - Current Division I schools
4	1392827	216:  List of research universities in the United States - Universities classified as "D/PU: Doctoral/Professional Universities"
5	1732715	201:  List of metropolitan planning organizations in the United States - List of metropolitan planning organizations
6	1044118	190:  List of K-LOVE stations - 
7	15608	180:  List of NCAA Division I FBS football stadiums - Current stadiums
8	381466	173:  Urban planning education - Accreditation in North America | Accredited planning programs | United States of America and Puerto Rico
9	167894	167:  List of Pi Kappa Phi chapters - Chapters | Collegiate chapte

In [59]:
# Reorder the list in place by SLOTH overlap (index 2), largest first;
# entries marked -1 end up at the bottom
sloth_res[:] = sorted(sloth_res, key=lambda entry: entry[2], reverse=True)
sloth_res

[[0, 1757533, 464],
 [8, 381466, 129],
 [2, 755346, 45],
 [4, 1392827, 42],
 [1, 815257, 0],
 [3, 1519949, 0],
 [5, 1732715, 0],
 [6, 1044118, 0],
 [7, 15608, 0],
 [9, 167894, 0]]

In [49]:
# Position, within `sloth_res`, of the result to inspect. The "SLOTH rank"
# printed below is only meaningful once `sloth_res` has been sorted by overlap.
j = 0

josie_rank, rid, tabov = sloth_res[j]

# Fetch the result table and normalise its cells the same way as the query's
rdoc = collection.find_one({'_id_numeric': rid})
rdoc['content'] = [[misc.clean_string(cell, string_translators, string_patterns) for cell in row] for row in rdoc['content']]

# Honour `num_header_rows` exactly like the query-table cell does: use the
# first row as header only when the table declares header rows, and skip all
# header rows in the body. (With headers='firstrow' a header-less table would
# get its first data row rendered as the header.)
rdoc_num_header_rows = rdoc['num_header_rows']
rdoc_headers = rdoc['content'][0] if rdoc_num_header_rows > 0 else []

print(f'Table {rdoc["_id_numeric"]}, JOSIE rank: {josie_rank}, SLOTH rank: {j}, overlap {tabov}')
print(f'Num header rows: {rdoc["num_header_rows"]}, valid columns: {rdoc["valid_columns"]}')
print(f'Context: {" - ".join(rdoc["context"])}')
print(tabulate(rdoc['content'][rdoc_num_header_rows:], headers=rdoc_headers, tablefmt='simple_outline'))

# Recompute the overlap with verbose=True, using the same parameters as the
# batch computation
r = misc.largest_overlap_sloth(
    qdoc['content'][qdoc['num_header_rows']:], rdoc['content'][rdoc['num_header_rows']:], qdoc['valid_columns'], rdoc['valid_columns'],
    verbose=True,
    min_h=min_h,
    min_w=min_w,
    blacklist=blacklist,
    )

Table 1335701, JOSIE rank: 5, SLOTH rank: 0, overlap 30
Num header rows: 1, valid columns: [1, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0]
Context: 2014 Spanish Socialist Workers' Party leadership election - Opinion polls | Spanish voters
┌─────────────────────────────┬──────────────────┬───────────────┬──────────────────────────────────────────┬──────────────────────────────────────────────────────────┬────────────────────────────────────────┬────────────────────────────────────────┬────────────────────────────────────────────────────┬───────────────────────────────────────┬────────────────────────┬────────────────────────────────────────┬─────────────────────────────────────────────────┬───────────────┬─────────────┬────────┐
│ polling firm/commissioner   │ fieldwork date   │ sample size   │ file:eduardo madina 2016 (cropped).jpg   │ pedro sánchez                                            │ file:carme chacón 2010 (cropped).jpg   │ file:susana díaz 2015g (cropped).jpg   │ file:alfredo 

In [37]:
# Sanity check: is the 'n/a' placeholder part of the blacklist?
# NOTE(review): the blacklist literal in the setup cell contains 'n/a', so a
# saved `False` output here looks stale - re-run the notebook to confirm.
'n/a' in blacklist

False

**TODO:** during table cleaning, could the text enclosed in double square brackets (`[[...]]`) be removed?

Colombia: 169633

UK ethnicity parliament: 520835

Country GDP: 558808

Largest cities in america: 572002

939555

Stadiums: 9892

Paleontology: 1003853

Army equipment: 1037572

Universities: 751207