In [13]:
import glob
import pandas as pd
import numpy as np
import os
from IPython.display import HTML
import scipy

In [14]:
!ls ../../ppc/scripts

autotune_eval_zifa_G.sh  real_datasets_autotune_eval_last2.sh
model_eval_zi_12000.sh	 sos4


In [15]:


data_path = '../../ppc/scripts'
dataset_names = {'log_poisson_zifa_dataset_12000_0.08_{}'.format(zifa_lambda):
                 '$\lambda = {}$'.format(str(zifa_lambda).rstrip('0').rstrip('.'))
                 for zifa_lambda in [0., 1e-3, 1e-2, 1e-1, 1.0, 10.0]}
data_files = {}
data_names = {}
for dataset_name in dataset_names:
    data_files[dataset_name] = sorted(glob.glob(os.path.join(data_path, '{}/*.p'.format(dataset_name))))
    data_names[dataset_name] = []
    for data_file in data_files[dataset_name]:
        data_name = data_file.split('/')[-1].split('_')[0]
        data_names[dataset_name].append(data_name)

for dataset_name in list(dataset_names.keys()):
    if len(data_names[dataset_name]) != 2:
        del dataset_names[dataset_name]
        del data_files[dataset_name]
        del data_names[dataset_name]

In [16]:
dataset_names

{}

In [17]:
data_names

{}

In [18]:
df = {}
for dataset_name in dataset_names:
    dfs = []
    for data_name, f in zip(data_names[dataset_name], data_files[dataset_name]):
        if data_name != 'zifa_full':

            #my_df = pd.read_csv(f, sep='\t')
            my_df = pd.read_pickle(f)
            my_df.loc[:, 'model'] = data_name
            dfs.append(my_df)
            print(data_name, f)
    #         display(my_df)
    df[dataset_name] = pd.concat(dfs, axis=0)

In [19]:
from statsmodels.stats.weightstats import ttest_ind

metrics = ['ll_ll', 't_dropout_ks_stat', 't_ratio_ks_stat', 't_cv_ks_stat']

h1_hypothesis = ['greater', 'greater', 'greater', 'greater']


df_summary = {}

n_to_keep = 100

for dataset_name in dataset_names:

    df_nb = df[dataset_name].loc[df[dataset_name].model=='nb', metrics].sort_values(by='ll_ll').iloc[:n_to_keep,:]
    df_zinb = df[dataset_name].loc[df[dataset_name].model=='zinb', metrics].sort_values(by='ll_ll').iloc[:n_to_keep,:]
    
    df_nb_2 = df_nb.copy()
    df_nb_2['model'] = 'nb'
    df_zinb_2 = df_zinb.copy()
    df_zinb_2['model'] = 'zinb'
    df_new = pd.concat([df_nb_2, df_zinb_2])
    
    print("Shapes")
    print(df_nb.shape)
    print(df_zinb.shape)
    print(df_new.shape)
    print(df_new)

    def get_pvals(gby, other_df):
        my_df = gby[metrics]
        print(gby)
        assert my_df.shape[1] == len(metrics)
        assert other_df.shape[1] == len(metrics)
        pvals = []
        for idx, alternative in enumerate(h1_hypothesis):
            assert len(other_df.values[:, idx]) != len(h1_hypothesis), (len(other_df.values[:, idx]), len(h1_hypothesis))
            #_, pval, _ = ttest_ind(other_df.values[:, idx], my_df.values[:, idx], alternative=alternative)
            try:
                _, pval = scipy.stats.mannwhitneyu(np.sort(other_df.values[:, idx])[:n_to_keep],
                                       np.sort(my_df.values[:, idx])[:n_to_keep][::-1],
                                       use_continuity=True,
                                       alternative=alternative)
            except ValueError:
                pval = 0.5
            
            pvals.append(pval)
        return np.array(pvals)

    pvals_against_zinb = df_new.groupby('model').apply(get_pvals, other_df=df_zinb)
    pvals_against_zinb = (pvals_against_zinb
             .apply(lambda x: pd.Series(x))
             .T)
    pvals_against_zinb = pvals_against_zinb.rename(index={idx: met for (idx,met) in enumerate(metrics)})

    pvals_against_nb = df_new.groupby('model').apply(get_pvals, other_df=df_nb)
    pvals_against_nb = (pvals_against_nb
             .apply(lambda x: pd.Series(x))
             .T)
    pvals_against_nb = pvals_against_nb.rename(index={idx: met for (idx,met) in enumerate(metrics)})

    def get_summary(gby):
        res = {}
        res['median'] = gby.median()
        res['pvals_against_nb'] = pvals_against_nb[gby.name]
        res['pvals_against_zi'] = pvals_against_zinb[gby.name]
        return pd.DataFrame(res).T

    # df_summary = df.groupby('model')['ll_ll', 'imputation_median_imputation_score', 't_dropout_ks_stat',
    #                                  't_ratio_ks_stat', 't_cv_ks_stat'].agg(['mean', 'std']).T
    df_summary[dataset_name] = df[dataset_name].groupby('model')[metrics].apply(get_summary)


    df_summary[dataset_name] = df_summary[dataset_name].stack().unstack(1).sort_index(level=1).swaplevel()

# Export to Latex

In [20]:
def boldizer(row):
    to_bold = (row.pvals_against_nb <= 5e-2) | (row.pvals_against_zi <= 5e-2)
#     print(to_bold)
    if to_bold:
        row.latex = '\mathbf{' + row.latex + '}'
    return row

In [9]:
n_dec = 3

work_df = {}

for dataset_name in dataset_names:

    work_df[dataset_name] = (df_summary[dataset_name]
               .reset_index(level=-1)
               .reset_index()
               .assign(latex=lambda x: x['median'].round(n_dec).astype(str))
               .apply(boldizer, axis=1)
               .assign(Metric=lambda x: 
                       x.loc[:, 'index'].replace({
                           'll_ll': 'Negative LL',
                           't_cv_ks_stat': 'Coefficient of variation',
                           't_dropout_ks_stat': 'Dropout ratio',
                           't_ratio_ks_stat': 'Zeros-to-expression ratio',
                           'model': 'Model'
               }),
                       latex=lambda x: '$' + x.latex + '$'
                      )
              )

    work_df[dataset_name] = work_df[dataset_name].rename(columns={'model': 'Model', 'index': 'metric',
                                                                 'Metric': 'Metric (medians)'})
    work_df[dataset_name]['Model'] = work_df[dataset_name]['Model'].str.upper()
    #work_df[dataset_name]['Dataset'] = dataset_names[dataset_name]
    

In [10]:
pivot = {}
for dataset_name in dataset_names:
    pivot[dataset_name] = pd.pivot(work_df[dataset_name], index='Metric (medians)', values='latex', columns='Model')
    pivot[dataset_name]['Dataset'] = dataset_names[dataset_name]
    pivot[dataset_name].reset_index(inplace=True)
    pivot[dataset_name].set_index(['Dataset', 'Metric (medians)'], inplace=True)
pivot = pd.concat(list(pivot.values()), axis=0)

ValueError: No objects to concatenate

In [11]:
pivot.info()

AttributeError: 'dict' object has no attribute 'info'

In [12]:
res = pivot.to_latex(escape=False)
print(res)

AttributeError: 'dict' object has no attribute 'to_latex'