# Analyze results from Leon French's java app

A split-half training set and a split-quarter training set were each run through Leon's java app on jjm15. Later we fed Leon's app full matrices under a couple of different contexts. It re-ranked probes on every iteration. 


## Data loading and organization

In [1]:
""" Set up the context. """

import os


# Results from Leon's java optimizer.
# Most of the data recorded here was entered manually from copy-pasted log files and checking file inode timestamps.
java_base_dir = "/home/mike/projects/bams_and_allen"
java_results = [
    {   # 2020-03-18 - Running on jjm14 - should finish around Friday 3/20 mid-day
        'name': 'java_whole_23c_60g', 'plot': False,  # identical to 12c, n/a yet
        'base_dir': java_base_dir, 'subdir': "whole_23c_60g", 'filename': "LOOResults.1583164067177.txt", 
        'cores': 23, 'ram': 60, 'columns': 1139, 'genes': 15745, 'first_estimates': [397, 432, 450, 437, ],
        'command': "java -Xmx60g -cp ./BAMSandAllen_fat.jar ubic.BAMSandAllen.MatrixPairs.FromFileMatrixPair 23 /home/mikes/projects/bams_allen_java/whole_23c_60g/hcp_conn_sim_whole.tsv /home/mikes/projects/bams_allen_java/whole_23c_60g/ahba_expr_whole.tsv",
        'start': "2020-03-02 10:42:11", 'node': "jjm14",
    },
    {
        'name': 'java_whole_12c_10g', 'plot': True,
        'base_dir': java_base_dir, 'subdir': "whole", 'filename': "LOOResults.1582232409921.txt", 
        'cores': 12, 'ram': 10, 'columns': 1139, 'genes': 15745, 'first_estimates': [537, 511, 467, 533, ],
        'command': "java -Xmx10g -cp ./BAMSandAllen_fat.jar ubic.BAMSandAllen.MatrixPairs.FromFileMatrixPair 12 /home/mikes/projects/bams_allen_java/whole/hcp_conn_sim_whole.tsv /home/mikes/projects/bams_allen_java/whole/ahba_expr_whole.tsv",
        'start': "2020-02-20 16:04:33", 'end': "2020-03-13 05:35:49", 'node': "jjm15",
    },
    {
        'name': 'java_half_12c_10g', 'plot': True,
        'base_dir': java_base_dir, 'subdir': "202", 'filename': "LOOResults.1580525004509.txt",
        'columns': 568, 'genes': 15745,
        'start': "2020-01-31 21:44:32", 'end': "2020-02-08 12:40:30",
    },
    {
        'name': 'java_quarter_23c_60g_1', 'plot': False,  # identical to 12c and to _2
        'base_dir': java_base_dir, 'subdir': "quarter_23_60", 'filename': "LOOResults.1583035707345.txt", 
        'cores': 23, 'ram': 60, 'columns': 283, 'genes': 15745, 'first_estimates': [26, 21, 26, 26, 21, ],
        'command': "java -Xmx60g -cp ./BAMSandAllen_fat.jar ubic.BAMSandAllen.MatrixPairs.FromFileMatrixPair 23 /home/mikes/projects/bams_allen_java/quarter_23_60/hcp_conn_sim_train402.tsv /home/mikes/projects/bams_allen_java/quarter_23_60/ahba_expr_train402.tsv",
        'start': "2020-02-29 23:08:45", 'end': "2020-03-02 03:22:16", 'node': "jjm14",
    },
    {
        'name': 'java_quarter_23c_60g_2', 'plot': False,  # identical to 12c and to _1
        'base_dir': java_base_dir, 'subdir': "402", 'filename': "LOOResults.1582340405040.txt",
        'columns': 283, 'genes': 15745,
        'start': "2020-02-21 22:01:02", 'end': "2020-02-23 07:15:15",
    },
    {
        'name': 'java_quarter_12c_10g', 'plot': True,
        'base_dir': java_base_dir, 'subdir': "402", 'filename': "LOOResults.1580573710986.txt",
        'columns': 283, 'genes': 15745,
        'start': "2020-02-01 11:15:39", 'end': "2020-02-04 04:00:30",
    },
    {
        'name': 'java_parcels_23c_60g', 'plot': True,
        'base_dir': java_base_dir, 'subdir': "whole_glasser_23c_60g", 'filename': "LOOResults.1584153479004.txt",
        'columns': 177, 'genes': 15745,
        'start': "2020-03-13 22:37:49", 'end': "2020-03-14 07:50:56",
    },
]

# These runs were killed, some because they were run with too few CPUs or insufficient RAM and we gave up.
# We do nothing with them here, but this can help with cross-referencing files in the filesystem.
java_cancels = [
    {'subdir': "whole_23", 'filename': "LOOResults.1582223481773.txt", },
    {'subdir': "whole_23", 'filename': "LOOResults.1582232409921.txt", },
    {'subdir': "whole_23", 'filename': "LOOResults.1582428218753.txt", },
    {'subdir': "whole", 'filename': "LOOResults.1582223481773.txt", },
]

# Mike's python-based results (not identical, but should have reasonable ktau vs java)
pygest_base_dir = "/data/derivatives/sub-all_hem-A_samp-glasser_prob-fornito/"
pygest_results = [
    {   # 2020-03-18 - running on jjm15
        'name': 'pygest_whole_evry_12c', 'plot': False,  # n/a yet
        'base_dir': java_base_dir, 'subdir': 'pygest_whole_evry_12c',
        'filename': "",
        'columns': 1139, 'genes': 15745, 'cores': 12,
        'node': "jjm15",
    },
    {
        'name': 'pygest_whole_smrt_12c', 'plot': False,  # identical to 4c
        'base_dir': java_base_dir, 'subdir': 'pygest_whole_smrt_12c',
        'filename': "ahbaexprwholedf_comp-hcpniftismoothgrandmeansim_mask-none_norm-srs_adj-none.tsv",
        'columns': 1139, 'genes': 15745, 'cores': 12,
        'start': "2020-03-16 23:59:05", 'end': "2020-03-17 06:27:26", 'node': "jjm15",
    },
    {
        'name': 'pygest_whole_smrt_4c', 'plot': True,
        'base_dir': java_base_dir, 'subdir': 'pygest_whole_smrt_4c',
        'filename': "ahbaexprwholedf_comp-hcpniftismoothgrandmeansim_mask-none_norm-srs_adj-none.tsv",
        'columns': 1139, 'genes': 15745, 'cores': 4,
        'start': "2020-03-17 11:34:12", 'end': "2020-03-18 01:33:25", 'node': "jjm15",
    },
    {
        'name': 'pygest_whole_once_8c', 'plot': True,
        'base_dir': java_base_dir, 'subdir': 'pygest_whole_once_8c',
        'filename': "ahbaexprwholedf_comp-hcpniftismoothgrandmeansim_mask-none_norm-srs_adj-none.tsv",
        'columns': 1139, 'genes': 15745, 'cores': 8,
        'start': "2020-03-17 11:36:18", 'end': "2020-03-17 22:44:55", 'node': "jjm15",
    },
    {   # 2020-03-18 - running on jjm6
        'name': 'pygest_half_evry_6c', 'plot': False,  # n/a yet
        'base_dir': java_base_dir, 'subdir': 'pygest_half_evry_6c',
        'filename': "",
        'columns': 568, 'genes': 15745, 'cores': 6,
        'node': "jjm6",
    },
    {
        'name': 'pygest_half_smrt_4c', 'plot': False,  # old version, upgraded to 6c
        'base_dir': pygest_base_dir, 'subdir': 'parby-wellid_splby-wellid_batch-train00202/tgt-max_algo-smrt',
        'filename': "sub-all_comp-hcpniftismoothgrandmeansim_mask-none_norm-srs_adj-none.tsv",
        'columns': 568, 'genes': 15745, 'cores': 4,
        'start': "2019-12-28 13:43:11", 'end': "2019-12-28 16:21:22", 'node': "jjm15",
    },
    {
        'name': 'pygest_half_smrt_6c', 'plot': True,
        'base_dir': java_base_dir, 'subdir': 'pygest_half_smrt_6c',
        'filename': "sub-all_comp-hcpniftismoothgrandmeansim_mask-none_norm-srs_adj-none.tsv",
        'columns': 568, 'genes': 15745, 'cores': 6,
        'start': "2020-03-17 18:48:25", 'end': "2020-03-17 21:30:57", 'node': "jjm6",
    },
    {   # 2020-03-18 - running on jjm12
        'name': 'pygest_quarter_evry_12c', 'plot': False,  # n/a yet
        'base_dir': java_base_dir, 'subdir': 'pygest_quarter_evry_12c',
        'filename': "",
        'columns': 283, 'genes': 15745, 'cores': 12,
        'node': "jjm12",
    },
    {
        'name': 'pygest_quarter_smrt_4c', 'plot': True,
        'base_dir': pygest_base_dir, 'subdir': 'parby-wellid_splby-wellid_batch-train00402/tgt-max_algo-smrt',
        'filename': "sub-all_comp-hcpniftismoothgrandmeansim_mask-none_norm-srs_adj-none.tsv",
        'columns': 283, 'genes': 15745, 'cores': 4,
        'start': "2020-01-31 14:16:40", 'end': "2020-01-31 15:24:19", 'node': "jjm8",
    },
    {   # 2020-03-18 - running on jjm5, killed and resumed with 14 cores
        'name': 'pygest_glasser_evry_8c', 'plot': False,  # n/a yet
        'base_dir': java_base_dir, 'subdir': 'pygest_glasser_evry_8c',
        'filename': "",
        'columns': 177, 'genes': 15745, 'cores': 8,
        'node': "jjm5",
    },
    {
        'name': 'pygest_glasser_smrt_8c', 'plot': True,
        'base_dir': java_base_dir, 'subdir': 'pygest_glasser_smrt_8c',
        'filename': "ahbaexprwholeparbyglasserdf_comp-hcpconnsimwholeparbyglasser_mask-none_norm-none_adj-none.tsv",
        'columns': 177, 'genes': 15745, 'cores': 8,
        'start': "2020-03-17 21:09:35", 'end': "2020-03-17 21:40:18", 'node': "jjm5",
    },
]


In [2]:
""" Read the results files """

import pandas as pd
from datetime import datetime


time_format = "%Y-%m-%d %H:%M:%S"

def peak_index(df):
    """ Find the index at the peak (not quite the same as the index with the highest Mantel) """
    
    last_r = -1.0
    last_idx = 0
    for idx, row in df.sort_index(ascending=True).iterrows():
        if row['r'] < last_r:
            print("    current ({}:{:0.6f}) < prior ({}:{:0.6f})".format(idx, row['r'], last_idx, last_r))
            return last_idx
        last_r = row['r']
        last_idx = idx
    # The for loop should never complete, but just in case...
    return 0


# Read each result file, adjusting both result types to match
for rslt, src in [(rslt, "java") for rslt in java_results] + [(rslt, "pygest") for rslt in pygest_results]:
    if os.path.isfile(os.path.join(rslt['base_dir'], rslt['subdir'], rslt['filename'])):
        print("{name:<24.24} {subdir:<42.42} {filename:<42.42} - {check}".format(**rslt, check=u'\u2713'))
        rslt['df'] = pd.read_csv(
            os.path.join(rslt['base_dir'], rslt['subdir'], rslt['filename']),
            sep="," if src=="java" else "\t",
            index_col=0,
            header=None if src=="java" else 0,
        )
        if src == "java":
            rslt['df'].columns = ['r', 'probe_id', ]
        if src == "pygest":
            rslt['df'] = rslt['df'][['r', 'probe_id', ]]

        # Both have an index representing order of importance; but pygest is 1-based and java is 0-based.
        # Make them comparable, then name the index
        rslt['df'] = rslt['df'].sort_index(ascending=True).reset_index(drop=True)[['probe_id', 'r', ]]
        rslt['df'].index.name = 'seq'

        # Calculate elapsed time if start and end times are available
        if 'end' in rslt.keys() and 'start' in rslt.keys():
            elapsed = datetime.strptime(rslt['end'], time_format) - datetime.strptime(rslt['start'], time_format)
            rslt['elapsed_hours'] = elapsed.total_seconds() / 3600
            # print("  {} to {} : {} == {:0.1f} hrs".format(rslt['start'], rslt['end'], elapsed, elapsed.total_seconds() / 3600))
            # print(rslt['df'].sample(2))
        else:
            rslt['elapsed_hours'] = 0.0

        # Calculate the index position at which the highest correlation was achieved.
        rslt['peak_idx'] = peak_index(rslt['df'])

        print("  : {shape} : took {elapsed_hours:0.1f} hrs, peak @{peak_idx}".format(
            **rslt, shape=rslt['df'].shape
        ))
    else:
        print("{name:<24.24} {subdir:<42.42} {filename:<42.42} - missing".format(**rslt))



java_whole_23c_60g       whole_23c_60g                              LOOResults.1583164067177.txt               - ✓
  : (9906, 2) : took 0.0 hrs, peak @0
java_whole_12c_10g       whole                                      LOOResults.1582232409921.txt               - ✓
    current (15653:0.166041) < prior (15652:0.166041)
  : (15745, 2) : took 517.5 hrs, peak @15652
java_half_12c_10g        202                                        LOOResults.1580525004509.txt               - ✓
    current (15650:0.181832) < prior (15649:0.181836)
  : (15745, 2) : took 182.9 hrs, peak @15649
java_quarter_23c_60g_1   quarter_23_60                              LOOResults.1583035707345.txt               - ✓
    current (15664:0.223576) < prior (15663:0.223577)
  : (15745, 2) : took 28.2 hrs, peak @15663
java_quarter_23c_60g_2   402                                        LOOResults.1582340405040.txt               - ✓
    current (15664:0.223576) < prior (15663:0.223577)
  : (15745, 2) : took 33.2 hrs, peak 

In [3]:
""" Combine all results into a single dataframe. """

all_results = pd.DataFrame(java_results + pygest_results).set_index('name')


In [4]:
""" Combine all ranks into a single dataframe with probe_ids as the index. """

def swap_index_for_probe(df, column_name):
    return df.reset_index()[['probe_id', 'seq']].set_index('probe_id').sort_values('seq', ascending=True).rename(columns={'seq': column_name})

orders = pd.concat([
    swap_index_for_probe(result['df'], result['name']) for result in java_results + pygest_results if 'df' in result.keys()
], axis=1)
orders['mean_seq'] = orders.dropna(axis=1).mean(axis=1)
orders = orders.sort_values('mean_seq')

# The index of orders is now all probe_ids in average order of removal.


## Data visualization

In [5]:
""" Plot each result against each other result, by seq in order of the averaged probe list. """

import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import kendalltau


def plot_ranks(df, x, x_peak, y, y_peak):
    """ multi-scatter-plot to compare gene rankings from two sources """
    
    # Make x the ruler, in its own order, and plot y based on x's probe-order
    plottable_df = df[[y, x]].dropna(axis=0)
    plottable_df = plottable_df.sort_values(y)
    # import pdb; pdb.set_trace()
    plottable_df['y_hi'] = plottable_df[y] >= y_peak
    plottable_df = plottable_df.sort_values(x)
    plottable_df['x_hi'] = plottable_df[x] >= x_peak
    
    fig, axes = plt.subplots(figsize=(6,6))
    
    r, p = kendalltau(df[y], df[x])
    r2, p2 = kendalltau(plottable_df[y], plottable_df[x])
    print("{:<24} vs {:<24} : Kendall tau = {:0.3f}, p={:0.4f} (dropna {:0.3f} p={:0.4f})".format(
        y, x, r, p, r2, p2
    ))
    sns.regplot(data=plottable_df,
                y=y, x=x, color='lightskyblue', scatter_kws={'s': 2})
    sns.scatterplot(data=plottable_df[plottable_df['x_hi'] | plottable_df['y_hi']],
                    y=y, x=x, color='dodgerblue', size=2)
    sns.scatterplot(data=plottable_df[plottable_df['x_hi'] & plottable_df['y_hi']],
                    y=y, x=x, color='midnightblue', size=5)
    axes.axhline(y_peak, color='red', linestyle=":")
    axes.axvline(x_peak, color='red', linestyle=":")
    axes.get_legend().remove()
    axes.set_xlim(-100, 16000)
    axes.set_ylim(-100, 16000)
    axes.set_xticks([0, 5000, 10000, x_peak])
    axes.set_yticks([0, 5000, 10000, y_peak])
    fig.suptitle("Kendall tau = {:0.3f}, p={:0.5f}\n{} past x, {} past y, {} agree".format(
        r, p, sum(plottable_df['x_hi']), sum(plottable_df['y_hi']),
        sum(plottable_df['y_hi'] & plottable_df['x_hi'])
    ))
    
    return fig, axes


In [6]:
""" Plot multiple runs juxtaposed with each other. """

import matplotlib.pyplot as plt
import seaborn as sns


def plot_runs(dfs):
    """ plot the rising Mantel r values for optimization runs. """
    
    fig, axes = plt.subplots(nrows=2, figsize=(6, 6))
    
    plottable_df = pd.concat([
        d['df'][['r']].rename(columns={'r': d['name']}) for d in dfs
    ], axis=1)

    sns.lineplot(data=plottable_df[-1200:].sort_index(), ax=axes[0])
    axes[0].ylabel = "Mantel r"
    sns.lineplot(data=plottable_df.sort_index(), ax=axes[1])
    axes[1].ylabel = "Mantel r"
    
    return fig, axes


In [7]:
""" Execute both plots for each comparison pair """

valid_columns = [c for c in orders.columns if ("java_" in c or "pygest_" in c) and (all_results.loc[c, 'plot'])]
for j, y in enumerate(valid_columns):
    for i, x in enumerate(valid_columns):
        if y != x:
            x_peak = all_results.loc[x, 'peak_idx']
            y_peak = all_results.loc[y, 'peak_idx']
            
            f, a = plot_ranks(orders, x, x_peak, y, y_peak)
            f.savefig("order_comparisons/{}-vs-{}.png".format(y, x))
            plt.close(f)
            
            f, a = plot_runs([{'name': name, 'df': all_results.loc[name, 'df']} for name in [y, x]])
            f.savefig("order_comparisons/{}-and-{}.png".format(y, x))
            plt.close(f)



java_whole_12c_10g       vs java_half_12c_10g        : Kendall tau = 0.651, p=0.0000 (dropna 0.651 p=0.0000)
java_whole_12c_10g       vs java_quarter_12c_10g     : Kendall tau = 0.563, p=0.0000 (dropna 0.563 p=0.0000)
java_whole_12c_10g       vs java_parcels_23c_60g     : Kendall tau = 0.502, p=0.0000 (dropna 0.502 p=0.0000)
java_whole_12c_10g       vs pygest_whole_smrt_4c     : Kendall tau = 0.414, p=0.0000 (dropna 0.414 p=0.0000)
java_whole_12c_10g       vs pygest_whole_once_8c     : Kendall tau = 0.329, p=0.0000 (dropna 0.329 p=0.0000)
java_whole_12c_10g       vs pygest_half_smrt_6c      : Kendall tau = 0.345, p=0.0000 (dropna 0.345 p=0.0000)
java_whole_12c_10g       vs pygest_quarter_smrt_4c   : Kendall tau = 0.354, p=0.0000 (dropna 0.354 p=0.0000)
java_whole_12c_10g       vs pygest_glasser_smrt_8c   : Kendall tau = 0.240, p=0.0000 (dropna 0.240 p=0.0000)
java_half_12c_10g        vs java_whole_12c_10g       : Kendall tau = 0.651, p=0.0000 (dropna 0.651 p=0.0000)
java_half_12c_10g  

### Create a few custom plots to support the narrative and complement auto-build plots from above.

In [8]:
# Include three java-optimized curves
f, a = plot_runs([
    {'name': name, 'df': all_results.loc[name, 'df']} for name in [
        'java_whole_12c_10g', 'java_half_12c_10g', 'java_quarter_12c_10g'
    ]
])
f.savefig("order_comparisons/java_trio.png".format(y, x))
plt.close(f)


In [9]:
# Include three pygest-optimized curves
f, a = plot_runs([
    {'name': name, 'df': all_results.loc[name, 'df']} for name in [
        'pygest_whole_smrt_4c', 'pygest_half_smrt_6c', 'pygest_quarter_smrt_4c'
    ]
])
f.savefig("order_comparisons/pygest_trio.png".format(y, x))
plt.close(f)


In [10]:
# Include four curves
f, a = plot_runs([
    {'name': name, 'df': all_results.loc[name, 'df']} for name in [
        'pygest_glasser_smrt_8c', 'pygest_whole_smrt_4c', 'java_parcels_23c_60g', 'java_whole_12c_10g',
    ]
])
f.savefig("order_comparisons/glassers_quartet.png".format(y, x))
plt.close(f)


In [11]:
# Include once vs comparable curves
f, a = plot_runs([
    {'name': name, 'df': all_results.loc[name, 'df']} for name in [
        'pygest_whole_once_8c', 'pygest_whole_smrt_4c', 'java_whole_12c_10g',
    ]
])
f.savefig("order_comparisons/once_trio.png".format(y, x))
plt.close(f)


----

## Old, experimental, and backup code

----

In [12]:
""" If necessary, build a toy dataframe for experimentation. """

littledf = pd.DataFrame(
    data={
        'probe_id': [3456, 6543, 4567, 5678, 1111, 9876, ],
        'r': [0.022, 0.031, 0.050, 0.100, 0.111, 0.055, ]
    },
    index=[0, 1, 2, 3, 4, 5, ],
)
littledf.index.name = 'seq'
littledf


Unnamed: 0_level_0,probe_id,r
seq,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3456,0.022
1,6543,0.031
2,4567,0.05
3,5678,0.1
4,1111,0.111
5,9876,0.055


In [13]:
""" Tweak Leon's results to match our format.
    - He ranks 0-based; we rank 1-based.
    - He gives the final probes the same ranking and r-value; we rank them sequentially with r=0.00.
    - He orders from 0-end ascending; we order descending from 15745 down to 1.
"""

def format_leons_as_mikes(df):
    df.columns = ['seq', 'r', 'probe_id']
    df = df.sort_values('seq')
    df['seq'] = range(1, len(df) + 1)
    return df
    
# results_df_from_split_half_java = format_leons_as_mikes(results_df_from_split_half_java)
# results_df_from_split_quarter_java = format_leons_as_mikes(results_df_from_split_quarter_java)
