## Voxel PCA 

Updated: Laurel Kinman, 9/29/2022

#### Import necessary modules

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
from sklearn.decomposition import PCA
from cryodrgn import utils
from cryodrgn import mrc

#### Read in data

Data consists of a mask corresponding to the region of interest (a .mrc file) and the volumes to be compared. If the volumes come from different cryoDRGN runs, they should be aligned, amplitude-scaled, and stored in separate directories (e.g. voldirs[0] contains +antibiotic volumes and voldirs[1] contains -antibiotic volumes). Below, provide the paths to the volume directories and the mask.

In [None]:
# User settings -- edit every value marked CHANGE ME before running the notebook.

# Path to the .mrc mask that defines the region of interest
mask = '/path/to/mask'  # CHANGE ME

# One directory of volumes per condition
voldirs = [
    '/path/to/volume/directory1',
    '/path/to/volume/directory2',
]  # CHANGE ME

# Label for each entry of voldirs, in the same order
keys = [
    'plus',
    'minus',
]  # CHANGE ME

# Directory that receives the pickled voxel table and the ChimeraX scripts
outdir = '/path/to/desired/output/directory/'  # CHANGE ME

In [None]:
# Read the mask and flatten it so each voxel becomes one row of mask_data
mask_voxels = mrc.parse_mrc(mask)[0].flatten()
mask_data = pd.DataFrame(mask_voxels)

# coi holds the flat voxel indices whose mask value equals exactly 1;
# these indices become the columns of the voxel tables built below
inside_mask = mask_data[0] == 1
coi = mask_data.loc[inside_mask].index

In [None]:
# Build one voxel table per volume set: one row per volume, one column per
# masked voxel (coi). Row labels are '<file stem>_<key>'.
collected = {keys[0]: ([], []), keys[1]: ([], [])}

for i, voldir in enumerate(voldirs):
    vol_names, vol_rows = collected[keys[i]]
    # Sort so the row order is reproducible; os.listdir returns entries in arbitrary order
    for file in sorted(os.listdir(voldir)):
        # Only .mrc volumes are parsed; other files that merely contain 'vol' are skipped
        if 'vol' not in file or not file.endswith('.mrc'):
            continue
        # os.path.join works whether or not the directory path ends with a separator
        data = mrc.parse_mrc(os.path.join(voldir, file))[0].flatten()
        vol_names.append(file[:-len('.mrc')] + f'_{keys[i]}')
        vol_rows.append(data[coi])

# Assemble each table in one step instead of growing a DataFrame row by row
dfs_dict = {}
for key, (vol_names, vol_rows) in collected.items():
    table = np.array(vol_rows).reshape(len(vol_rows), len(coi))
    dfs_dict[key] = pd.DataFrame(table, index = vol_names, columns = coi)

voldf1 = dfs_dict[keys[0]]
voldf2 = dfs_dict[keys[1]]

# Later cells rely on this row order: all keys[0] volumes first, then all keys[1] volumes
allvols = pd.concat([dfs_dict[keys[0]], dfs_dict[keys[1]]])
utils.save_pkl(allvols, os.path.join(outdir, 'allvols.pkl'))

In [None]:
# Preview the combined table: one row per volume, one column per masked voxel
allvols

#### Run PCA and visualize marginal distributions

In [None]:
# Fit a 10-component PCA on the voxel table (rows = volumes); the fixed
# random_state keeps the result repeatable between runs
pca = PCA(n_components = 10, random_state = 0).fit(allvols)

# Project every volume onto the fitted components; rows of pc follow the rows of allvols
pc = pca.transform(allvols)

In [None]:
# Bar chart of the fraction of variance explained by each of the first 10
# components (x positions are the 0-based component indices)
variance_ratios = pca.explained_variance_ratio_[:10]
ax = plt.gca()
ax.bar(np.arange(10), variance_ratios)
ax.set_xlabel('Principal component')
ax.set_ylabel('Explained variance')

In [None]:
comp = 0 #CHANGE ME
# Rows of pc follow allvols: the keys[0] volumes come first, then the keys[1] volumes
n_first = len(dfs_dict[keys[0]])
# fill = True replaces the shade = True keyword, which seaborn has deprecated
sns.kdeplot(pc[:n_first, comp], color = '#fc8d62', label = keys[0], alpha = 0.2, fill = True)
sns.kdeplot(pc[n_first:, comp], color = '#8da0cb', label = keys[1], alpha = 0.2, fill = True)
plt.legend(loc = 'upper right')
plt.ylabel('Frequency')
# comp is a 0-based index, so the axis label shows the 1-based component number
plt.xlabel(f'PC{comp+1}')

#### Write ChimeraX scripts

The following cells write out .py files that can be opened in ChimeraX. Each script automatically opens volumes sampled evenly across the selected principal component from the selected subset of the data (either the first volume set or the second volume set).

In [None]:
def find_nearest(array, value):
    """Return the position of the entry in `array` that lies closest to `value`.

    `array` is converted with np.asarray before the comparison. When several
    entries are equally close, the position of the first one is returned.
    """
    distances = np.abs(np.asarray(array) - value)
    return np.argmin(distances)

def find_pc_traj(pcs, dim, start = 5, stop = 95, num_vols = 20):
    """Pick rows of `pcs` spread evenly along one principal component.

    `num_vols` percentiles are spaced linearly from `start` to `stop`
    (inclusive). For each percentile, its value along column `dim` of `pcs`
    is computed, and the row whose score is closest to that value is chosen.

    Returns a list of row positions into `pcs`, one per percentile. The same
    row can appear more than once when it is the closest to several
    percentile values.
    """
    scores = pcs[:, dim]
    traj = []
    for percentile in np.linspace(start, stop, num_vols):
        target = np.percentile(scores, percentile)
        traj.append(find_nearest(scores, target))

    return traj


def write_traj_script(traj_list, output, subset):
    """Write a ChimeraX Python script that opens the volumes along a trajectory.

    Relies on the notebook-level names `allvols`, `dfs_dict`, `keys` and
    `voldirs`.

    Parameters
    ----------
    traj_list : list of int
        Row positions of the chosen volumes, counted within the chosen subset.
    output : str
        Path of the .py script to write.
    subset : int
        When 1, positions are shifted by the number of rows in the first
        volume set so that they index into `allvols`; any other value leaves
        them unchanged.

    Returns
    -------
    pandas.Index
        Row labels of `allvols` for the selected volumes. When it is empty,
        a message is printed and no file is written.
    """
    if subset == 1:
        offset = len(dfs_dict[keys[0]])
        traj_list = [i + offset for i in traj_list]
    vols = allvols.index[traj_list]

    if len(vols) == 0:
        print('No volumes match the specified PC criteria')
        return vols

    lines = ['from chimerax.core.commands import run\n']
    for name in vols:
        # Row labels are '<file stem>_<key>'. Match the key as a suffix rather
        # than as a substring anywhere in the label, and prefer the longest
        # matching key when one key is itself a suffix of another.
        matches = [k for k, key in enumerate(dfs_dict) if name.endswith('_' + key)]
        if not matches:
            continue
        k = max(matches, key = lambda m: len(keys[m]))
        # Strip the '_<key>' suffix to recover the original file stem
        stem = name[:-(len(keys[k]) + 1)]
        # os.path.join works whether or not the directory ends with a separator
        path = os.path.join(voldirs[k], stem + '.mrc')
        lines.append(f'run(session, "open {path}")\n')

    with open(output, 'w') as f:
        f.writelines(lines)

    return vols

In [None]:
comp = 0 #CHANGE ME
chosen_subset = 1 #CHANGE ME
# Rows of pc follow allvols: the first n_first rows belong to keys[0] and the
# rest to keys[1]. Both slices must split at n_first, otherwise the second
# subset is wrong whenever the two sets contain different numbers of volumes.
n_first = len(dfs_dict[keys[0]])
subsets = [pc[:n_first], pc[n_first:]]
pc_traj = find_pc_traj(subsets[chosen_subset], comp)
# os.path.join keeps the output path valid whether or not outdir ends with a separator
script_path = os.path.join(outdir, f'pc{comp}_traversal_subset{chosen_subset}.py')
write_traj_script(pc_traj, script_path, subset = chosen_subset)