# Convert code to run in Leon French's java app

Leon French shared a Java optimizer with us on March 5, 2019, that is probably faster than our Python code, but it probably does not re-rank probes the same way we do. This notebook loads selected dataframes, aligns their wellids and probes, and saves them as TSVs so Leon's optimizer can read them.

Spiro got instructions from Leon via email January 30, 2020 to run the optimizer and see if we can save some time running our optimizations. Those instructions informed the format of tsv files generated here.

In [31]:
""" Use pygest to generate SRS-adjusted expression matrix with only glasser-parcellable wellids.
    Generate parcellated version of same data.
    All other data sets have pre-built (for another project) dataframes on disk. Load them later.

    Defines data_dir, ge_data, df_expr_whole, and df_expr_whole_parcellated for use by later cells.
"""

import os
import pygest as ge
from pygest.convenience import average_expr_per_parcel
from pygest.rawdata.glasser import glasser_parcel_map


# Root of the pygest data tree; later cells build their file paths relative to this directory.
data_dir = "/home/mike/ge_data"
ge_data = ge.Data(data_dir)

# This is an expression matrix SRS-adjusted and unsplit.
# It has 15,745 probes (The fornito selection) and 1280 wellids (all of the 1280 mappable to glasser parcels).
df_expr_whole = ge_data.expression(probes="fornito", samples="glasser", normalize="srs")

# Report the shape as (rows, columns); the message labels rows as genes and columns as sample locations.
print("Whole expression matrix is [({:,} rows of genes) x ({:,} columns of sample locations)].".format(
    df_expr_whole.shape[0], df_expr_whole.shape[1]
))

# Collapse the per-sample matrix to one column per parcel, using the glasser parcel map.
df_expr_whole_parcellated = average_expr_per_parcel(df_expr_whole, glasser_parcel_map)

# NOTE: the message reuses the "Whole expression matrix" wording, but this line reports the
# parcellated matrix (its columns are parcels, not sample locations).
print("Whole expression matrix is [({:,} rows of genes) x ({:,} columns of parcels)].".format(
    df_expr_whole_parcellated.shape[0], df_expr_whole_parcellated.shape[1]
))


2020-03-25 14:05:09 [INFO] | PyGEST has initialized logging, and is running on host 'cardano'
2020-03-25 14:05:09 [INFO] | Found 9 donors in /home/mike/ge_data/sourcedata/participants.tsv
Whole expression matrix is [(15,745 rows of genes) x (1,280 columns of sample locations)].
Whole expression matrix is [(15,745 rows of genes) x (177 columns of parcels)].


In [5]:
""" Define selected files, already prepared and saved. """

import os


# This is an expression matrix split into training set #202 after being SRS-adjusted.
# It has 15,745 probes (The fornito selection) and 640 wellids (half of the 1280 mappable to glasser parcels).
# Stored as a python pickle, it's 78MB.
split_dir = "splits/sub-all_hem-A_samp-glasser_prob-fornito/batch-train00202"
expr_file_half = os.path.join(data_dir, split_dir, "parcelby-wellid_splitby-wellid.srs.df")
expr_file_half_parcellated = os.path.join(data_dir, split_dir, "parcelby-glasser_splitby-wellid.srs.df")

# This is an expression matrix split into training set #402 after being SRS-adjusted.
# It has 15,745 probes (The fornito selection) and 320 wellids (a quarter of the 1280 mappable to glasser parcels).
# Stored as a python pickle, it's 39MB.
# NOTE: split_dir is deliberately re-assigned here; the *_half paths above were already built from the old value.
split_dir = "splits/sub-all_hem-A_samp-glasser_prob-fornito/batch-train00402"
expr_file_quarter = os.path.join(data_dir, split_dir, "parcelby-wellid_splitby-wellid.srs.df")
expr_file_quarter_parcellated = os.path.join(data_dir, split_dir, "parcelby-glasser_splitby-wellid.srs.df")

# This is a connectivity matrix, generated from smoothed HCP data in matlab by Spiro
# then cleaned, verified, and converted by Mike in build_conn_and_connsim_from_text/build_connectivity_matrices.ipynb
conn_file = os.path.join(data_dir, "conn/hcp_niftismooth_conn.df")
conn_file_parcellated = os.path.join(data_dir, "conn/hcp_niftismooth_conn_parby-glasser.df")

# And this is a connectivity-similarity matrix, generated from the matrix just described.
conn_sim_file = os.path.join(data_dir, "conn/hcp_niftismooth_conn_sim.df")
conn_sim_file_parcellated = os.path.join(data_dir, "conn/hcp_niftismooth_conn_parby-glasser_sim.df")


In [6]:
""" Load expression and connectivity-similarity data. """

import pickle


def _load_pickle(path):
    """ Return the object unpickled from path, closing the file handle when done. """
    # NOTE: pickle can execute arbitrary code while loading; only use this on files we generated.
    with open(path, "rb") as f:
        return pickle.load(f)


# Already calculated: df_expr_whole, df_expr_whole_parcellated

# Connectivity and connectivity-similarity, both per-wellid and per-parcel.
df_conn = _load_pickle(conn_file)
df_conn_parcellated = _load_pickle(conn_file_parcellated)
df_conn_sim = _load_pickle(conn_sim_file)
df_conn_sim_parcellated = _load_pickle(conn_sim_file_parcellated)

# Split-half (train #202) and split-quarter (train #402) per-wellid expression.
df_expr_half = _load_pickle(expr_file_half)
df_expr_quarter = _load_pickle(expr_file_quarter)


In [14]:
""" Determine locations without complete data across expression and connectivity. """

def overlapping_columns(dfc, dfe):
    """ Return the column labels of dfc that also appear in dfe, in dfc's column order.

        As a side effect, print how many labels are shared and how many columns each dataframe has.
    """

    # Walk dfc's columns so the result follows dfc's ordering, keeping only labels dfe also has.
    shared = []
    for label in dfc.columns:
        if label in dfe.columns:
            shared.append(label)

    n_shared = len(shared)
    n_expr = len(dfe.columns)
    n_conn = len(dfc.columns)
    print("Found {:,} columns common to both {:,} in expression and {:,} in connectivity similarity.".format(
        n_shared, n_expr, n_conn,
    ))

    return shared

    
# Shared wellids for each per-wellid expression set, ordered as in the connectivity-similarity columns.
wellids_whole = overlapping_columns(df_conn_sim, df_expr_whole)
wellids_half = overlapping_columns(df_conn_sim, df_expr_half)
wellids_quarter = overlapping_columns(df_conn_sim, df_expr_quarter)

# Shared parcels between the parcellated connectivity-similarity and parcellated whole expression.
parcels_whole = overlapping_columns(df_conn_sim_parcellated, df_expr_whole_parcellated)


Found 1,139 columns common to both 1,280 in expression and 2,731 in connectivity similarity.
Found 568 columns common to both 640 in expression and 2,731 in connectivity similarity.
Found 283 columns common to both 320 in expression and 2,731 in connectivity similarity.
Found 176 columns common to both 177 in expression and 176 in connectivity similarity.


In [16]:
""" Exclude locations without complete data across expression and connectivity. """

# Keep only the relevant (common to both) data.
# Expression keeps every row and selects the shared columns; connectivity and
# connectivity-similarity select the shared labels on both rows and columns.
df_expr_whole_pruned = df_expr_whole.loc[:, wellids_whole]
df_conn_whole_pruned = df_conn.loc[wellids_whole, wellids_whole]
df_conn_sim_whole_pruned = df_conn_sim.loc[wellids_whole, wellids_whole]
print("Whole expression went from {} to {} and connectivity-similarity from {} to {}.".format(
    df_expr_whole.shape, df_expr_whole_pruned.shape, df_conn_sim.shape, df_conn_sim_whole_pruned.shape
))

# Same selection for the split-half expression set.
df_expr_half_pruned = df_expr_half.loc[:, wellids_half]
df_conn_half_pruned = df_conn.loc[wellids_half, wellids_half]
df_conn_sim_half_pruned = df_conn_sim.loc[wellids_half, wellids_half]
print("Split-half expression went from {} to {} and connectivity-similarity from {} to {}.".format(
    df_expr_half.shape, df_expr_half_pruned.shape, df_conn_sim.shape, df_conn_sim_half_pruned.shape
))

# Same selection for the split-quarter expression set.
df_expr_quarter_pruned = df_expr_quarter.loc[:, wellids_quarter]
df_conn_quarter_pruned = df_conn.loc[wellids_quarter, wellids_quarter]
df_conn_sim_quarter_pruned = df_conn_sim.loc[wellids_quarter, wellids_quarter]
print("Split quarter expression went from {} to {} and connectivity-similarity from {} to {}.".format(
    df_expr_quarter.shape, df_expr_quarter_pruned.shape, df_conn_sim.shape, df_conn_sim_quarter_pruned.shape
))

# Do the same for parcellated data.
df_expr_whole_parcellated_pruned = df_expr_whole_parcellated.loc[:, parcels_whole]
df_conn_whole_parcellated_pruned = df_conn_parcellated.loc[parcels_whole, parcels_whole]
df_conn_sim_whole_parcellated_pruned = df_conn_sim_parcellated.loc[parcels_whole, parcels_whole]
# NOTE: this message also begins "Whole expression", but the shapes reported are the parcellated ones.
print("Whole expression went from {} to {} and connectivity-similarity from {} to {}.".format(
    df_expr_whole_parcellated.shape, df_expr_whole_parcellated_pruned.shape,
    df_conn_sim_parcellated.shape, df_conn_sim_whole_parcellated_pruned.shape
))


Whole expression went from (15745, 1280) to (15745, 1139) and connectivity-similarity from (2731, 2731) to (1139, 1139).
Split-half expression went from (15745, 640) to (15745, 568) and connectivity-similarity from (2731, 2731) to (568, 568).
Split quarter expression went from (15745, 320) to (15745, 283) and connectivity-similarity from (2731, 2731) to (283, 283).
Whole expression went from (15745, 177) to (15745, 176) and connectivity-similarity from (176, 176) to (176, 176).


In [18]:
""" Write the matched set to java-friendly tsvs. """

data_save_path = "./expr_and_conn_data"

def save_df_and_csv(df, name):
    """ Save df as both tsv and df (pickle) formats under data_save_path.

        Writes <data_save_path>/<name>.tsv (tab-separated) and <data_save_path>/<name>.df (pickle).
        Both files carry an empty-string index name. The caller's dataframe is left untouched.

        :param df: the pandas DataFrame to write
        :param name: file name stem, without directory or extension
    """
    # Create the output directory on first use rather than failing inside to_csv.
    os.makedirs(data_save_path, exist_ok=True)

    # Blank the index name on a shallow copy. Assigning df.index.name directly would rename
    # the caller's Index object, which may be shared with the dataframe it was sliced from.
    out = df.copy(deep=False)
    out.index = out.index.rename("")

    out.to_csv(os.path.join(data_save_path, name + ".tsv"), sep="\t")
    out.to_pickle(os.path.join(data_save_path, name + ".df"))
    

# Whole (unsplit) per-wellid data: expression, connectivity, connectivity-similarity.
save_df_and_csv(df_expr_whole_pruned, "ahba_expr_whole")
save_df_and_csv(df_conn_whole_pruned, "hcp_conn_whole")
save_df_and_csv(df_conn_sim_whole_pruned, "hcp_conn_sim_whole")
# Split-half (train #202) per-wellid data.
save_df_and_csv(df_expr_half_pruned, "ahba_expr_train202")
save_df_and_csv(df_conn_half_pruned, "hcp_conn_train202")
save_df_and_csv(df_conn_sim_half_pruned, "hcp_conn_sim_train202")
# Split-quarter (train #402) per-wellid data.
save_df_and_csv(df_expr_quarter_pruned, "ahba_expr_train402")
save_df_and_csv(df_conn_quarter_pruned, "hcp_conn_train402")
save_df_and_csv(df_conn_sim_quarter_pruned, "hcp_conn_sim_train402")
# Whole glasser-parcellated data.
save_df_and_csv(df_expr_whole_parcellated_pruned, "ahba_expr_whole_parby-glasser")
save_df_and_csv(df_conn_whole_parcellated_pruned, "hcp_conn_whole_parby-glasser")
# NOTE(review): this name ends in "_sim", unlike the "hcp_conn_sim_*" names above - confirm
# which spelling the downstream reader expects before renaming anything.
save_df_and_csv(df_conn_sim_whole_parcellated_pruned, "hcp_conn_whole_parby-glasser_sim")


----

## Optionally, explore and clean up the data (This was already taken care of)

----

In [19]:
""" Check for bad columns and rows with np.nans """

def clean_conn_dataframe(conn, max_nans=1):
    """ Drop labels whose column or row holds too many NaNs, and return the remaining square.

        A label is "bad" when its column, or its row, contains more than max_nans NaN values.
        The default of 1 keeps the original threshold, which tolerates a single NaN per row/column.
        Bad labels are printed, then removed from both axes.

        :param conn: pandas DataFrame; the result is indexed by conn's column labels on both axes
        :param max_nans: largest NaN count a row or column may hold and still be kept
        :returns: conn.loc[good, good], where good is conn's columns minus the bad labels
    """
    nan_mask = conn.isna()
    nans_per_column = nan_mask.sum(axis=0)
    nans_per_row = nan_mask.sum(axis=1)

    # Columns first, then rows; dict.fromkeys de-duplicates while keeping a deterministic order.
    flagged = list(nans_per_column.index[nans_per_column > max_nans])
    flagged.extend(nans_per_row.index[nans_per_row > max_nans])
    bad_parcels = list(dict.fromkeys(flagged))

    # Labels may be ints (e.g. wellids), so stringify them before joining.
    print("Bad parcels: [{}]".format(", ".join(str(parcel) for parcel in bad_parcels)))

    # Filter rather than list.remove(), so a bad row label absent from the columns cannot raise.
    bad_lookup = set(bad_parcels)
    good_columns = [col for col in conn.columns if col not in bad_lookup]

    return conn.loc[good_columns, good_columns]


In [23]:
""" Clean some (already clean) dataframes.
    If we had loaded bad data, or if this is used to handle future problematic data, it can be useful.
"""

import numpy as np


# Clean the parcellated connectivity matrix and report shapes and NaN counts before and after.
df_conn_parcellated_cleaned = clean_conn_dataframe(df_conn_parcellated)
# NOTE: the "dropped {} nans" slot is filled with the NaN count of the ORIGINAL dataframe,
# not the difference between the before and after counts.
print("[{} x {}] parcellated dataframe dropped {} nans to become [{} x {}] with {} nans.".format(
    df_conn_parcellated.shape[0], df_conn_parcellated.shape[1],
    sum(sum(np.isnan(df_conn_parcellated.values))),
    df_conn_parcellated_cleaned.shape[0], df_conn_parcellated_cleaned.shape[1],
    sum(sum(np.isnan(df_conn_parcellated_cleaned.values))),
))

# Same cleaning and report for the parcellated connectivity-similarity matrix.
df_conn_sim_parcellated_cleaned = clean_conn_dataframe(df_conn_sim_parcellated)
print("[{} x {}] parcellated dataframe dropped {} nans to become [{} x {}] with {} nans.".format(
    df_conn_sim_parcellated.shape[0], df_conn_sim_parcellated.shape[1],
    sum(sum(np.isnan(df_conn_sim_parcellated.values))),
    df_conn_sim_parcellated_cleaned.shape[0], df_conn_sim_parcellated_cleaned.shape[1],
    sum(sum(np.isnan(df_conn_sim_parcellated_cleaned.values))),
))


Bad parcels: []
[176 x 176] parcellated dataframe dropped 0 nans to become [176 x 176] with 0 nans.
Bad parcels: []
[176 x 176] parcellated dataframe dropped 0 nans to become [176 x 176] with 0 nans.


In [29]:
""" Find agreement between parcellated expression and connectivity. """

# Parcels present in both the cleaned connectivity-similarity matrix and the pruned
# parcellated expression, in connectivity-similarity column order.
# The cleaned matrix is named df_conn_sim_parcellated_cleaned (with an underscore in
# "conn_sim"); the un-underscored spelling is not defined anywhere and raised NameError.
matching_parcels = [
    parcel for parcel in df_conn_sim_parcellated_cleaned.columns
    if parcel in df_expr_whole_parcellated_pruned.columns
]
print("{:,} parcels are in both expression and connectivity-similarity.".format(len(matching_parcels)))

# Restrict all three dataframes to the matched parcels, in the same order.
df_expr_whole_parcellated_cleaned = df_expr_whole_parcellated_pruned[matching_parcels]
df_conn_parcellated_pruned = df_conn_parcellated_cleaned.loc[matching_parcels, matching_parcels]
df_conn_sim_parcellated_pruned = df_conn_sim_parcellated_cleaned.loc[matching_parcels, matching_parcels]
print("Parcellated expression went from {} to {} and parcellated connectivity-similarity from {} to {}.".format(
    df_expr_whole_parcellated_pruned.shape, df_expr_whole_parcellated_cleaned.shape,
    df_conn_sim_parcellated_cleaned.shape, df_conn_sim_parcellated_pruned.shape
))


176 parcels are in both expression and connectivity-similarity.
Parcellated expression went from (15745, 176) to (15745, 176) and parcellated connectivity-similarity from (176, 176) to (176, 176).


In [27]:
""" Save our work
    uncomment this, or build a block like it if saving is necessary.
"""

"""
df_expr_whole_parcellated_pruned.to_csv("./ahba_expr_whole_parby-glasser.tsv", sep="\t")
df_expr_whole_parcellated_pruned.to_pickle("./ahba_expr_whole_parby-glasser.df")
df_conn_parcellated_pruned.to_csv("./hcp_conn_whole_parby-glasser.tsv", sep="\t")
df_conn_parcellated_pruned.to_pickle("./hcp_conn_whole_parby-glasser.df")
df_conn_sim_parcellated_pruned.to_csv("./hcp_conn_sim_whole_parby-glasser.tsv", sep="\t")
df_conn_sim_parcellated_pruned.to_pickle("./hcp_conn_sim_whole_parby-glasser.df")

df_expr_whole_parcellated_pruned.to_pickle("/data/cache/glasserparcel-expression.df")
df_conn_sim_parcellated_pruned.to_pickle("/data/conn/hcp_niftismooth_grandmean_glasser_sim.df")
"""

pass


In [28]:
# Final sanity check: count infinite and NaN entries in each matched connectivity dataframe.
for df in (df_conn_parcellated_pruned, df_conn_sim_parcellated_pruned):
    values = df.values
    n_inf = sum(sum(np.isinf(values)))
    n_nan = sum(sum(np.isnan(values)))
    print("{} bad infs, {} bad nans".format(n_inf, n_nan))

0 bad infs, 0 bad nans
0 bad infs, 0 bad nans
