# Paired guide correlation analysis

In [1]:
import scanpy as sc
import pandas as pd
import numpy as np
import scipy.sparse as sparse
import scipy.stats as stats
import sklearn.linear_model as lm
import sklearn.metrics as metrics
import itertools
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# EDIT THIS TO YOUR PATH WITH DATA
# File names are appended to this string by plain concatenation, so it must end
# with a path separator. Outputs are written under its 'analysis/' subdirectory.
data_path = '/mnt/c/Users/minch/Data/bmdc/'

### Read AnnData

In [3]:
# Load the AnnData object stored alongside the other data files.
adata = sc.read(data_path + 'processed_bmdc_counts.h5ad')
# Keep an independent copy of the matrix as loaded. Without .copy() the layer
# aliases adata.X, and the in-place normalisation / log transform below can
# silently overwrite the stored "counts" as well.
adata.layers['counts'] = adata.X.copy()
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)
# Number of variables (columns) in the AnnData matrix.
num_genes = adata.shape[1]

### Create the guide design matrix

In [4]:
# Collect every distinct guide name. Each cell's guide_string is a
# '-'-separated list of the guides assigned to that cell.
guide_set = set()
for g in adata.obs['guide_string']:
    guide_set.update(g.split('-'))
# Sort so the guide order (and hence the design-matrix column order) is the
# same on every run; iterating a set of strings gives an arbitrary order that
# can change between interpreter sessions.
guide_list = sorted(guide_set)
num_guides = len(guide_list)

In [5]:
# Cell-by-guide indicator matrix: one row per cell barcode, one column per
# guide, initialised to all zeros (float).
design_matrix = pd.DataFrame(
    np.zeros(shape=(adata.n_obs, len(guide_list))),
    columns=guide_list,
    index=adata.obs.index,
)

In [6]:
# Flag each guide listed for a cell with a 1 in that cell's row.
# Only the guide_string column is read, so walk that column directly.
for cell_barcode, guide_string in adata.obs['guide_string'].items():
    guides = guide_string.split('-')
    for g in guides:
        design_matrix.loc[cell_barcode, g] = 1

In [7]:
# Write the cell-by-guide design matrix (with its index and header) to CSV.
design_matrix.to_csv(path_or_buf=data_path + 'analysis/design_matrix.csv')

### Save the baseline, PCA matrix

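The `latent_0_` prefix in the output file names is just a placeholder label for PCA; the trailing number is the number of PCA dimensions kept.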

In [9]:
# Export the leading columns of the stored X_pca embedding as baseline
# cell-state matrices, one CSV per dimensionality.
pca_embedding = adata.obsm['X_pca']
for ndim in (3, 5, 10, 20):
    cell_state_matrix = pd.DataFrame(pca_embedding[:, :ndim])
    output_path = data_path + 'analysis/latent_0_{}.csv'.format(ndim)
    cell_state_matrix.to_csv(output_path)