# Post-processing (1/n)

Compute the train and test log-likelihoods for every batch job on the bacteria cross-validation (CV) datasets, then save the resulting table.

In [1]:
import gzip, json
from functools import cache
from multiprocessing import Pool
from operator import itemgetter

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.auto import tqdm

from colaml.__main__ import model_from_json, phytbl_from_json
    
from myconfig import ROOT_DIR, DATA_DIR, DATASET_DIR

In [2]:
from threadpoolctl import threadpool_limits

# Cap native thread pools at one thread for this process.
# NOTE(review): presumably to avoid oversubscription with the worker Pool below — confirm.
threadpool_limits(limits=1)

<threadpoolctl.threadpool_limits at 0x7f60d46c2860>

In [3]:
# Record the installed colaml version in the notebook output for provenance.
import colaml
colaml.__version__

'0.1.dev14+g6c01617'

In [4]:
# Tab-separated table of conditions.
conditions = pd.read_csv('conditions-main.tsv', sep='\t')

# Tab-separated batch-job table; the `testfile` column is derived from `infile`
# by substituting 'test' for every occurrence of 'train' in the path string.
jobs = pd.read_csv('240701-batch-job-array-bacteria.txt', sep='\t')
jobs = jobs.assign(testfile=jobs['infile'].str.replace('train', 'test'))

## Log-likelihood computation

In [5]:
def batch_loglik(job):
    """Compute the train and test log-likelihoods for one batch job.

    Parameters
    ----------
    job : pandas.Series
        One row of the jobs table. The fields read here are ``infile``,
        ``testfile``, ``outfile``, ``lmax``, ``conditionID``, ``fold`` and
        ``fit_rep``.

    Returns
    -------
    dict
        Keys ``conditionID``, ``fold``, ``fit_rep`` (copied from ``job``),
        ``ncols_train``, ``ncols_test``, ``loglik_train`` and ``loglik_test``.
        A log-likelihood that could not be computed is ``numpy.nan``.

    Notes
    -----
    Failures while loading the model or evaluating it are deliberately
    non-fatal so that one bad job does not abort the whole batch; they are
    reported as a ``RuntimeWarning`` instead of being dropped silently.
    If the train evaluation succeeds and the test evaluation fails, the
    train value is kept and only the test value stays NaN.
    """
    import warnings  # local import: keeps this cell self-contained

    job_key = job.loc[['conditionID', 'fold', 'fit_rep']].to_dict()

    phytbl_train, _ = phytbl_from_json(ROOT_DIR/job.infile  , job.lmax)
    phytbl_test , _ = phytbl_from_json(ROOT_DIR/job.testfile, job.lmax)

    loglik_train = loglik_test = np.nan

    try:
        model = model_from_json(ROOT_DIR/job.outfile)

        # Total log-likelihood = sum of the `col_loglik` entries
        # (presumably one entry per table column — TODO confirm).
        stats_train = model.sufficient_stats(phytbl_train)
        loglik_train = stats_train.col_loglik.sum()

        stats_test = model.sufficient_stats(phytbl_test)
        loglik_test = stats_test.col_loglik.sum()

    except Exception as exc:
        # Best-effort: keep the NaN placeholder(s) but make the cause visible.
        warnings.warn(
            f'log-likelihood unavailable for {job_key} '
            f'(outfile={job.outfile}): {exc!r}',
            RuntimeWarning,
        )

    return dict(
        **job_key,
        ncols_train = phytbl_train.ncols,
        ncols_test = phytbl_test.ncols,
        loglik_train = loglik_train,
        loglik_test = loglik_test
    )

In [6]:
# Evaluate every job in a pool of 16 worker processes.
# `imap` yields results in job order, so rows of `loglik` line up with `jobs`.
# `total` is passed explicitly because tqdm cannot infer a length from the
# lazy iterator; without it the bar shows no progress fraction or ETA.
with Pool(16) as pool:
    loglik = pd.DataFrame(list(tqdm(
        pool.imap(batch_loglik, map(itemgetter(1), jobs.iterrows())),
        total=len(jobs),
    )))

0it [00:00, ?it/s]

## Inspect

In [7]:
# Show the jobs for which at least one of the two log-likelihoods is missing.
loglik.loc[loglik['loglik_train'].isna() | loglik['loglik_test'].isna()]

Unnamed: 0,conditionID,fold,fit_rep,ncols_train,ncols_test,loglik_train,loglik_test
576,cond12,3,7,2301,575,,
598,cond12,5,9,2301,575,,
1100,cond23,1,1,2300,576,,
1101,cond23,1,2,2300,576,,
1102,cond23,1,3,2300,576,,
1103,cond23,1,4,2300,576,,
1104,cond23,1,5,2300,576,,
1105,cond23,1,6,2300,576,,
1106,cond23,1,7,2300,576,,
1107,cond23,1,8,2300,576,,


## Save

In [8]:
# Persist the log-likelihood table as a pickle; with compression='infer'
# (the pandas default) the codec is chosen from the file extension.
loglik.to_pickle(
    DATA_DIR / 'post-batch' / '02-bacteria' / 'loglik-main.pkl.bz2',
    compression='infer',
)