In [1]:
import re
import pickle

import pymongo
import polars as pl
from tabulate import tabulate

from dltf.utils import tables
from dltf.sloth import sloth, utils
from dltf.utils.misc import clean_string
from dltf.testers.josie.josie import JOSIETester
from dltf.utils.datalake import MongoDBDataLakeHandler
from dltf.testers.josie import josie as josiemod

import importlib
importlib.reload(tables)
importlib.reload(josiemod)

<module 'dltf.testers.josie.josie' from '/home/nanni/datalake-table-finder/dltf/testers/josie/josie.py'>

### Open the connection to MongoDB

In [2]:
# No arguments are given, so pymongo's default connection settings are used
mc = pymongo.MongoClient()
# Collection 'latest_snapshot_tables' of database 'sloth', looked up by name
collection = mc['sloth']['latest_snapshot_tables']

### Functions to extract result IDs and overlaps from the JOSIE CSV results file

In [3]:
def get_result_ids(s):
    """Extract the result table IDs from a JOSIE ``results`` string.

    The string is a run of ``s<id>o<overlap>`` pairs (e.g. ``'s12o3s45o6'``),
    so the IDs are the integers found at even positions in the string.

    Parameters
    ----------
    s : str or None
        Raw value of the ``results`` column. ``None`` (a missing value) and
        the empty string both yield an empty list.

    Returns
    -------
    list[int]
        The result IDs, in the order in which they appear in ``s``.
    """
    # A missing value means the query returned nothing; do not let re.findall
    # fail with a TypeError on None
    if not s:
        return []
    return [int(number) for number in re.findall(r'\d+', s)[::2]]

def get_result_overlaps(s):
    """Extract the overlap values from a JOSIE ``results`` string.

    The string is a run of ``s<id>o<overlap>`` pairs (e.g. ``'s12o3s45o6'``),
    so the overlaps are the integers found at odd positions in the string.

    Parameters
    ----------
    s : str or None
        Raw value of the ``results`` column. ``None`` (a missing value) and
        the empty string both yield an empty list.

    Returns
    -------
    list[int]
        The overlap values, in the order in which they appear in ``s``.
    """
    # A missing value means the query returned nothing; do not let re.findall
    # fail with a TypeError on None
    if not s:
        return []
    return [int(number) for number in re.findall(r'\d+', s)[1::2]]

### Setup for further inspections

In [4]:
# Base directory of the files used below, and the value later passed as `k` to josie.query
data_path = './data'
k = 50

# Set up the DataLake handler
datalake_name = 'demo'
datalake_location = 'mongodb'
datasets = ['sloth.latest_snapshot_tables']
dlh = MongoDBDataLakeHandler(datalake_location, datalake_name, datasets)

# JOSIE (global search tool) parameters
mode = 'bag'
blacklist = set()
token_translators = ['whitespace', 'lowercase']
force_sampling_cost = False    # force JOSIE to do cost sampling before querying
token_table_on_memory = False  # build the token table used by JOSIE directly on disk

# Files and directories, all located under data_path
results_directory = data_path
tokens_bidict_file = data_path + '/josie-tokens-bidict.pickle'
results_file = data_path + '/results/with-filter.csv'
log_file = data_path + '/.log'

# Connection info for the JOSIE inverted index
# NOTE(review): demo credentials are hard-coded here; do not reuse this pattern
# with real credentials
db_config = dict(
    drivername='postgresql',
    database='DEMODB',
    port=5442,
    host='localhost',
    username='demo',
    password='demo',
)

# Instantiate JOSIE
josie = JOSIETester(
    mode=mode,
    blacklist=blacklist,
    datalake_handler=dlh,
    token_translators=token_translators,
    dbstatfile=None,
    tokens_bidict_file=tokens_bidict_file,
    josie_db_connection_info=db_config,
    spark_config=None,
)

In [5]:
# Load the bidirectional dictionary between the JOSIE token IDs and the
# corresponding original strings.
# NOTE: unpickling can execute arbitrary code; only load a file you trust.
with open(tokens_bidict_file, mode='rb') as bidict_handle:
    tokens_bidict = pickle.load(bidict_handle)

In [7]:
def prepare_query(filter_query_id, keep_columns, blacklist, regex_replace_pattern=re.compile('')):
    """Build the JOSIE query set for a column-filtered version of a table.

    The table is fetched through the notebook-level data lake handler ``dlh``.
    Only the cells whose positional index is in ``keep_columns`` are kept,
    every match of ``regex_replace_pattern`` is removed from them, and the
    result is tokenized with the notebook-level ``mode`` and
    ``token_translators`` settings.

    Parameters
    ----------
    filter_query_id
        Numeric ID of the table, passed to ``dlh.get_table_by_numeric_id``.
    keep_columns : collection of int
        Positional indexes of the columns to keep.
    blacklist
        Forwarded to ``tables.table_to_tokens`` as its ``blacklist`` argument.
    regex_replace_pattern : str or compiled pattern, optional
        Every match is replaced with the empty string in each kept cell.
        The default (empty pattern) leaves the cells unchanged.

    Returns
    -------
    tuple
        ``(qdoc, query_sets)``: the filtered table document and a dictionary
        mapping its ``_id_numeric`` to the sorted list of JOSIE token IDs.
        Tokens that are not in ``tokens_bidict`` are dropped.
    """
    # Set membership avoids scanning keep_columns once per cell
    kept = set(keep_columns)

    qdoc = dlh.get_table_by_numeric_id(filter_query_id)
    qdoc['content'] = [
        [re.sub(regex_replace_pattern, '', cell) for i, cell in enumerate(row) if i in kept]
        for row in qdoc['content']
    ]
    qdoc['valid_columns'] = [vc for i, vc in enumerate(qdoc['valid_columns']) if i in kept]

    # Extract the tokens from the filtered content
    query_tokens = tables.table_to_tokens(
        table=qdoc['content'],
        valid_columns=qdoc['valid_columns'],
        mode=mode,
        blacklist=blacklist,
        string_translators=token_translators
    )

    # Map each token to its JOSIE token ID, normalizing it only once and
    # skipping the tokens unknown to the bidictionary
    token_to_id = tokens_bidict.inverse
    token_ids = []
    for token in query_tokens:
        normalized = clean_string(token, 'lowercase', 'whitespace')
        if normalized in token_to_id:
            token_ids.append(token_to_id[normalized])

    # JOSIE expects a dictionary <ID: tokens[]>
    query_sets = {qdoc['_id_numeric']: sorted(token_ids)}

    return qdoc, query_sets

## Analyse results

In [21]:
# Name of the result set to analyse; it selects '<data_path>/results/<topic>.csv.raw'
topic = 'patho'

# Build the path from data_path so that this cell follows the notebook
# configuration instead of a second, hard-coded copy of the data directory
df = pl.read_csv(f'{data_path}/results/{topic}.csv.raw')
df.head()

query_id,query_size,query_num_token,num_result,duration,preproc_duration,num_set_read,num_list_read,num_byte_read,max_set_size_read,max_list_size_read,max_counter_size,ignore_size,actions,results,benefit_cost
i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,i64,str,str,str
31713,28,2,4,2,1,0,2,0,0,4,0,1,,"""s2087013o1s243207o1s280752o1s1…","""None"""
43091,52,28,50,136,1,1,28,0,1235,7547,258,2,,"""s466456o6s334630o5s223168o5s20…","""None"""
74170,1,1,50,3,0,0,1,0,0,338,0,1,,"""s32492o1s13505o1s33855o1s34697…","""None"""
79178,60,23,50,15,1,1,23,0,10,29,90,2,,"""s2087013o5s1764227o4s598139o4s…","""None"""
111351,28,9,50,397,1,0,9,0,0,48729,0,1,,"""s524886o5s512109o5s998205o5s21…","""None"""


In [20]:
# One entry per query: [query_id, [(result_id, overlap), ...]]
results = [
    [query_id, list(zip(get_result_ids(raw_results), get_result_overlaps(raw_results)))]
    for query_id, raw_results in df.select('query_id', 'results').rows()
]

In [10]:
# Position, inside `results`, of the query to inspect
curr_table_idx = 17
q, res = results[curr_table_idx]

# Fetch the query table and normalize every cell for display
qdoc = collection.find_one({'_id_numeric': q})
qdoc['content'] = [
    [clean_string(cell, 'lowercase', 'whitespace') for cell in row]
    for row in qdoc['content']
]

# Only the first header row (if any) is used as column headers
if qdoc['num_header_rows'] > 0:
    headers = qdoc['content'][0]
else:
    headers = []

print(f'Table {qdoc["_id_numeric"]}')
print(f'Rows: {len(qdoc["content"])}, Columns: {len(qdoc["content"][0])}')
print(f'Context: {" - ".join(qdoc["context"])}')
print(tabulate(qdoc['content'][qdoc['num_header_rows']:], headers=headers, tablefmt='simple_outline'))

Table 1157584
Rows: 40, Columns: 4
Context: Ovarian cancer - Pathophysiology
┌──────────────────┬────────────────────────────────────────┬───────────────────────────────┬──────────────┐
│ gene mutated     │ mutation type                          │ subtype                       │ prevalence   │
├──────────────────┼────────────────────────────────────────┼───────────────────────────────┼──────────────┤
│ akt1             │ amplification                          │                               │ 3%           │
│ akt2             │ amplification/mutation                 │                               │ 6%, 20%      │
│ arid1a           │ point mutation                         │ endometrioid and clear cell   │              │
│ becn1            │ deletion                               │                               │              │
│ braf             │ point mutation                         │ low-grade serous              │ 0.5%         │
│ brca1            │ nonsense mutation             

In [11]:
print(f'Total results: {len(res)}')
# One line for each of the first 10 results: position, overlap, table ID and context
for j, (result_id, overlap) in enumerate(res[:10]):
    rdoc = collection.find_one({'_id_numeric': result_id})
    print(f"{j}\t{overlap}\t{rdoc['_id_numeric']}:  {' - '.join(rdoc['context'])}")

Total results: 50
0	27	1753269:  Opinion polling for the 2017 Chilean general election - President | Open-ended question
1	27	1844480:  Opinion polling on the Emmanuel Macron presidency - Political barometers
2	27	1486294:  Metrolinx mobility hubs - Mobility hubs in the GTHA | Demographics
3	27	1238806:  Nationwide opinion polling for the 2016 Democratic Party presidential primaries - Individual polls | Polls conducted in 2015
4	27	905348:  Results of the 2013 Malaysian state elections by constituency - Sabah
5	27	1201815:  2015 Polish presidential election - Opinion polls | First round
6	27	1072062:  2014 Hungarian parliamentary election - Opinion polls
7	27	1473872:  2018 Florida gubernatorial election - Democratic primary | Polling
8	26	1031199:  Housing in Europe - 
9	26	112980:  1996 Republican Party presidential primaries - Results | Statewide


In [12]:
# Position, inside `res`, of the result table to inspect
j = 0
rdoc = collection.find_one({'_id_numeric': res[j][0]})
rdoc['content'] = [
    list(map(lambda cell: clean_string(cell, 'lowercase', 'whitespace'), row))
    for row in rdoc['content']
]

print(f'Table {rdoc["_id_numeric"]}, overlap {res[j][1]}')
print(f'Context: {" - ".join(rdoc["context"])}')
print(tabulate(rdoc['content'], headers='firstrow', tablefmt='simple_outline'))

# Run SLOTH between the query table and the selected result table
r = sloth.sloth(
    utils.parse_table(qdoc['content'], len(qdoc['content'][0]), qdoc['num_header_rows']),
    utils.parse_table(rdoc['content'], len(rdoc['content'][0]), rdoc['num_header_rows']),
    complete=True,
)

Table 1753269, overlap 27
Context: Opinion polling for the 2017 Chilean general election - President | Open-ended question
┌───────────────────┬─────────────────────────────────────┬──────────────────────────────────────────────────┬────────────────────────────────────────────────┬─────────────────────────────────────────────────────────────────┬───────────────────────────────────────────────┬─────────────────────────────────────────────────────────┬─────────────────────────────────────┬───────────────┬──────────────────────────────────────────────────────┬─────────────────────────────────┬───────────────────────────────────────────────────────┬───────────────────────────────────────────────────────┬──────────┬─────────┬─────────┐
│ polling firm      │ field dates                         │ file:marco enríquez-ominami 2018 (4x3).jpg meo   │ file:carolina goic 2011 4x3 cropped.jpg goic   │ file:alejandro guillier (2017) 4x3 cropped image.jpg guillier   │ file:felipe josé kast sommerhoff.

## Inspect results

In [13]:
# Show the positional index of each column (the indexes used by keep_columns)
# above the first header row of the query table
print(qdoc['num_header_rows'])
column_overview = [list(range(len(qdoc['content'][0])))]
# A table without header rows has no header line to show; guard here instead
# of indexing an empty slice, which would raise an IndexError
if qdoc['num_header_rows'] > 0:
    column_overview.append(qdoc['content'][0])
print(tabulate(column_overview, tablefmt='simple_outline'))

1
┌──────────────┬───────────────┬─────────┬────────────┐
│ 0            │ 1             │ 2       │ 3          │
│ gene mutated │ mutation type │ subtype │ prevalence │
└──────────────┴───────────────┴─────────┴────────────┘


In [25]:
# Columns of the query table to keep, and the blacklist passed to prepare_query
keep_columns = [0, 1, 2]
blacklist = {''}

# Pattern removed from every kept cell; the empty pattern removes nothing.
# Alternatives tried:
#   re.compile(r' \(\d+%\)')
#   re.compile(r' virus')
regex_replace_pattern = re.compile('')

# we can define some constraints on the overlap
min_num_columns_on_overlap = 0
min_num_rows_on_overlap = 0
search_all_overlaps = True

fqdoc, query_sets = prepare_query(
    filter_query_id=qdoc['_id_numeric'],
    keep_columns=keep_columns,
    blacklist=blacklist,
    regex_replace_pattern=regex_replace_pattern,
)

# Search results for the query sets
r = josie.query(
    results_file=results_file,
    k=k,
    queries=query_sets,
    force_sampling_cost=force_sampling_cost,
    results_directory=results_directory,
    token_table_on_memory=token_table_on_memory,
    verbose=True,
)

100%|██████████| 1/1 [00:00<00:00, 42.72it/s]


In [26]:
# Fetch the query table again and rebuild its filtered view for display
fqdoc = collection.find_one({'_id_numeric': qdoc['_id_numeric']})

# Headers come from the first header row (if any), restricted to the kept columns
if fqdoc['num_header_rows'] > 0:
    headers = [h for ii, h in enumerate(fqdoc['content'][0]) if ii in keep_columns]
else:
    headers = []

# Drop the header rows, keep only the selected columns, strip the regex
# matches, normalize each cell and mask the blacklisted values with '#####'
fqdoc['content'] = [
    [
        '#####' if cleaned in blacklist else cleaned
        for cleaned in (
            clean_string(re.sub(regex_replace_pattern, '', cell), 'lowercase', 'whitespace')
            for ii, cell in enumerate(row)
            if ii in keep_columns
        )
    ]
    for row in fqdoc['content'][fqdoc['num_header_rows']:]
]

print(f'Table {fqdoc["_id_numeric"]}')
print(f'Keeping columns: {keep_columns}, filtered tokens: {blacklist}')
print(f'Context: {" - ".join(fqdoc["context"])}')
print(tabulate(fqdoc['content'], headers=headers, tablefmt='simple_outline'))

Table 1157584
Keeping columns: [0, 1, 2], filtered tokens: {''}
Context: Ovarian cancer - Pathophysiology
┌──────────────────┬────────────────────────────────────────┬───────────────────────────────┐
│ Gene mutated     │ Mutation type                          │ Subtype                       │
├──────────────────┼────────────────────────────────────────┼───────────────────────────────┤
│ akt1             │ amplification                          │ #####                         │
│ akt2             │ amplification/mutation                 │ #####                         │
│ arid1a           │ point mutation                         │ endometrioid and clear cell   │
│ becn1            │ deletion                               │ #####                         │
│ braf             │ point mutation                         │ low-grade serous              │
│ brca1            │ nonsense mutation                      │ high-grade serous             │
│ brca2            │ frameshift mutation        

In [27]:
# Read the raw results of the filtered query. The path is derived from
# results_file (the file name given to josie.query) instead of a hard-coded
# copy; with the configuration of this notebook it is the same file.
filtdf = pl.read_csv(f'{results_file}.raw')
filt_rows = filtdf.select('query_id', 'results').rows()

# Only the first row is inspected (a single query was submitted); a missing
# value in its 'results' column means that there are no results
if filt_rows and filt_rows[0][1] is not None:
    raw_results = filt_rows[0][1]
    fres = list(zip(get_result_ids(raw_results), get_result_overlaps(raw_results)))
    print(f'Number of results: {len(fres)}')

    for j, (result_id, overlap) in enumerate(fres[:10]):
        frdoc = collection.find_one({'_id_numeric': result_id})
        print(f"{j}\t{overlap}\t{frdoc['_id_numeric']}:  {' - '.join(frdoc['context'])}")
else:
    # Keep fres a list of (result_id, overlap) pairs in every case, so that
    # later cells never mistake the raw CSV rows for results
    fres = []
    print('No results')

Number of results: 50
0	13	1070370:  Epigenetics and melanoma - Roles microRNAs play in melanoma development
1	7	797192:  List of genes mutated in cutaneous conditions - 
2	3	653831:  Gene duplication - As amplification | Role in cancer
3	3	4058:  DOM events - Events | HTML events | Common events
4	3	1375208:  Non-small-cell lung carcinoma - Cause | DNA repair deficiency in NSCLC
5	3	841505:  Cancer biomarker - Types of cancer biomarkers | Molecular cancer biomarkers
6	3	1745952:  Glucose-6-phosphate dehydrogenase deficiency - Cause | Genetics
7	2	1024649:  Protein kinase inhibitor - Comparison of available agents
8	2	1391493:  Antineoplastic resistance - Genetic markers for drug sensitivity and resistance
9	2	1734953:  RNA interference - Applications | Medicine | Prospects as a Therapeutic Technique


In [29]:
# Position, inside `fres`, of the filtered-query result to inspect
jf = 0

frdoc = collection.find_one({'_id_numeric': fres[jf][0]})
frdoc['content'] = [
    list(map(lambda cell: clean_string(cell, 'lowercase', 'whitespace'), row))
    for row in frdoc['content']
]

print(f'Table {frdoc["_id_numeric"]}, overlap {fres[jf][1]}')
print(f'Header rows: {frdoc["num_header_rows"]}')
print(f'Context: {" - ".join(frdoc["context"])}')
print(tabulate(frdoc['content'], headers='firstrow', tablefmt='simple_outline'))

# The filtered query content is passed with 0 header rows; the result table
# is passed with its own header row count
r = sloth.sloth(
    utils.parse_table(fqdoc['content'], len(fqdoc['content'][0]), 0),
    utils.parse_table(frdoc['content'], len(frdoc['content'][0]), frdoc['num_header_rows']),
    complete=True,
    verbose=True,
)

Table 1070370, overlap 13
Header rows: 1
Context: Epigenetics and melanoma - Roles microRNAs play in melanoma development
┌────────────────────────┬─────────────┬─────────────────────────┬───────────────────────────────────────────────┐
│ gene type              │ gene name   │ alteration type         │ frequency of alteration within melanoma (%)   │
├────────────────────────┼─────────────┼─────────────────────────┼───────────────────────────────────────────────┤
│ proto- oncogenes       │ nras        │ mutation                │ 15- 25                                        │
│                        │ braf        │ mutation                │ 50- 70                                        │
│                        │ kit         │ mutation                │ 2- 10                                         │
│                        │ cdk4        │ mutation, amplification │ 0- 9                                          │
│                        │ ctnnb1      │ mutation                │ 2- 23 

### Try to clean and modify the query table

In [None]:
# Expand every row of rdoc into one row per comma-separated marker.
# NOTE(review): the unpacking requires each row of rdoc['content'] to have
# exactly two cells - confirm that rdoc holds such a table before running.
table2 = [
    [cancer, marker]
    for cancer, marker_list in rdoc['content']
    for marker in marker_list.split(',')
]

print(tabulate(table2, headers='firstrow', tablefmt='simple_outline'))

# Run SLOTH between the query table and the expanded table
r = sloth.sloth(
    utils.parse_table(qdoc['content'], len(qdoc['content'][0]), qdoc['num_header_rows']),
    utils.parse_table(table2, len(table2[0]), rdoc['num_header_rows']),
    complete=True,
    min_w=0,
    min_h=0,
)

## Possible example tables

- PATHO
    - 1, 43091
    - 6, 149698
    - 9, 267433
    - 14, 558801
    - 17, 1157584 --> better

- CANCER
    - 653831

- CONSUME
    - 1356495