# Prepare Allen data


Data stored in ./data/loom_allen_kb/

In [2]:
%%capture
# Install the single-cell analysis dependencies into the kernel environment;
# the %%capture cell magic above suppresses pip's output.
# NOTE(review): versions are unpinned, so a rerun may resolve different releases.
%pip install scanpy
%pip install loompy
%pip install leidenalg

In [1]:
# System
import os, pathlib, time, gc

# Math
import numpy as np
import pandas as pd

# Plots
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import seaborn as sns

import anndata

In [2]:
# Mount Google Drive (Colab-only import) and change into the project directory;
# later cells read data through relative ./data/... paths.
from google.colab import drive
drive.mount('/content/drive')
# NOTE(review): absolute, user-specific Drive path -- adjust when running elsewhere.
%cd /content/drive/MyDrive/scBIVI_mc/scBIVI/scBIVI/
# List the working directory contents.
!ls

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/scBIVI_mc/scBIVI/scBIVI
 analysis.py		    run_scBIVI.py
 Analysis_scBIVI.ipynb	    run.sh
 bivae.py		    scBIVI-10-combined-4-train-history.pdf
 data			   'scBIVI (1).ipynb'
 data_expanded		    scBIVI-3-combined-4-train-history.pdf
 distribution_2.py	    scBIVI-6-combined-4-train-history.pdf
 distribution.py	    scBIVIcustom-10-combined-4-train-history.pdf
 distribution_uni.py	    scBIVIcustom-3-combined-4-train-history.pdf
 download_data.sh	    scBIVIcustom-6-combined-4-train-history.pdf
 experiment_colab.py	    scBIVI.ipynb
 gena_snippets		    scBIVI.py
 out			    scBIVIuncorr-10-combined-4-train-history.pdf
 Prepare_Allen_data.ipynb   scBIVIuncorr-3-combined-4-train-history.pdf
 preprocess.py		    scBIVIuncorr-6-combined-4-train-history.pdf
 __pycache__		    scUNIVI.ipynb
 README.txt		    scUNIVI.py
 reformat_model.ipynb	    test_clu

# Load and Preprocess data 


1. Access metadata to find cluster assignment, donor, gender etc.

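The metadata (sample metadata and cluster membership tables) is hosted in the `pachterlab/BYVSTZP_2020` GitHub repository under `reference/10xv3/`.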

In [None]:
# Sample to process; it selects both the directory and the loom file name.
name = 'B08'

raw_loom_path = f'./data/loom_allen_kb/allen_{name}/allen_{name}_raw.loom'
adata = anndata.read_loom(raw_loom_path)

# Prefer gene names over the default variable names when the loom provides them.
if 'gene_name' in adata.var:
    adata.var_names = adata.var['gene_name'].tolist()

# Gene names may repeat; make the variable names unique.
adata.var_names_make_unique()



In [None]:
# first, access metadata
url = 'https://raw.githubusercontent.com/pachterlab/BYVSTZP_2020/master/reference/10xv3/sample_metadata.csv'
md = pd.read_csv(url)
url = 'https://raw.githubusercontent.com/pachterlab/BYVSTZP_2020/master/reference/10xv3/cluster.membership.csv'
df_cluster_membership = pd.read_csv(url,skiprows = 0)

In [None]:
# Split each cell label (first CSV column, read as 'Unnamed: 0') into its parts.
# Vectorized .str operations replace per-row .loc[i, ...] lookups, which were slow
# and only correct while the frame kept its default 0..n-1 index.
cell_labels = df_cluster_membership['Unnamed: 0']

# Barcode: everything before the first '-'.
df_cluster_membership['barcode'] = cell_labels.str.split('-').str[0]
# Library id: the trailing 18 characters (e.g. 'L8TX_181211_01_G12' is 18 long).
df_cluster_membership['library_id'] = cell_labels.str[-18:]
# Key used to match cells against adata.obs: '<barcode>-<library_id>'.
df_cluster_membership['ID_barcode'] = (
    df_cluster_membership['barcode'] + '-' + df_cluster_membership['library_id']
)

In [None]:
unique_library_ids = md.library_id.unique()

# Map each donor to the array of library ids sequenced from that donor.
# A boolean mask is used instead of a string-built query so the lookup does not
# depend on the donor id being a valid unquoted literal inside a query expression.
donor_dict = {
    don: md.loc[md.Donor == don, 'library_id'].unique()
    for don in md.Donor.unique()
}

In [None]:
# Display the donor -> library_id mapping built in the previous cell.
donor_dict

{426003: array(['L8TX_181211_01_G12', 'L8TX_181211_01_H12', 'L8TX_181211_01_A01',
        'L8TX_181211_01_F01'], dtype=object),
 427378: array(['L8TX_181211_01_B01', 'L8TX_181211_01_C01', 'L8TX_181211_01_D01',
        'L8TX_181211_01_E01'], dtype=object),
 457909: array(['L8TX_190430_01_F08', 'L8TX_190430_01_G08'], dtype=object),
 457911: array(['L8TX_190430_01_A08', 'L8TX_190430_01_B08'], dtype=object)}

In [None]:
print(name)

# Find every (donor, library_id) pair whose library id contains the sample name.
matches = [
    (don, lib_id)
    for don, lib_ids in donor_dict.items()
    for lib_id in lib_ids
    if name in lib_id
]

# Fail loudly instead of hitting a NameError -- or, worse, silently reusing
# this_don/this_id left over from a previous run with a different sample.
if not matches:
    raise ValueError(f'No library_id containing {name!r} found in the sample metadata')

# As before, every match is printed and the last one is the one that is kept.
for this_don, this_id in matches:
    print(this_don, this_id)

sex = md.loc[md['Donor'] == this_don, 'Gender'].unique()[0]

# Annotate every cell with the sample-level metadata.
adata.obs['library_id'] = this_id
adata.obs['donor'] = this_don
adata.obs['Sex'] = sex
# Same '<barcode>-<library_id>' key format as df_cluster_membership.ID_barcode.
adata.obs['ID_barcode'] = adata.obs.barcode + '-' + adata.obs.library_id
# Placeholder; filled in once cells are matched against the cluster membership table.
adata.obs['Cluster'] = float("nan")

B08
457911 L8TX_190430_01_B08


In [None]:
# Build a one-off ID_barcode -> value lookup instead of scanning the whole
# membership table (membership test + boolean filter) once per cell.
# The value is taken from column position 1 of the first row matching each
# ID_barcode, exactly as the original `.values[0, 1]` did
# (presumably the cluster id -- confirm against the CSV header).
first_hits = df_cluster_membership.drop_duplicates(subset='ID_barcode', keep='first')
cluster_by_barcode = dict(
    zip(first_hits['ID_barcode'].tolist(), first_hits.iloc[:, 1].tolist())
)

# One entry per cell, in adata.obs order; NaN marks cells without an assignment.
barcode_list = [
    cluster_by_barcode.get(b, float("nan"))
    for b in adata.obs.ID_barcode.values
]

In [None]:
# Attach the looked-up cluster value to every cell.
adata.obs['Cluster'] = barcode_list

# Keep only cells whose cluster value is one of 0..199; cells left as NaN
# (no entry in the membership table) are dropped.
has_known_cluster = adata.obs['Cluster'].isin(range(200))
adata_subset = adata[has_known_cluster]


  df[key] = c
  df[key] = c


# Reformat unspliced and spliced

In [3]:
# matrices saved now: A01, B01, B08, C01
name = 'B01'
# `anndata.read` is a deprecated alias of `anndata.read_h5ad`; call the explicit
# reader so this cell keeps working on newer anndata releases.
# NOTE(review): the file has a .loom extension but is read with the h5ad reader --
# confirm how counts_subset.loom was written.
adata = anndata.read_h5ad(f'./data/loom_allen_kb/allen_{name}/counts_subset.loom')

In [4]:
# Display the summary of the AnnData object loaded in the previous cell.
adata

AnnData object with n_obs × n_vars = 9772 × 32285
    obs: 'barcode', 'library_id', 'donor', 'Sex', 'ID_barcode', 'Cluster'
    var: 'gene_id', 'gene_name'
    layers: 'matrix', 'spliced', 'unspliced'

In [5]:
# Reformat the data so the spliced and unspliced matrices sit side by side in a
# single matrix instead of being stored as separate layers.

# Keep a handle on the layered object; `adata` is rebound further down.
adata_old = adata

# Wrap each layer in its own AnnData object.
adata_spliced, adata_unspliced = (
    anndata.AnnData(adata_old.layers[layer_key])
    for layer_key in ('spliced', 'unspliced')
)

# Carry over the gene annotations (gene_id, gene_name) and flag which matrix
# each variable comes from.
for layer_adata, is_spliced in ((adata_spliced, True), (adata_unspliced, False)):
    layer_adata.var = adata_old.var.copy()
    layer_adata.var['Spliced'] = is_spliced

# Suffix the unspliced variable names with '-u' so names stay unique after joining.
adata_unspliced.var_names = adata_unspliced.var_names + '-u'

# axis=1 joins along the variable (gene) axis: the cells stay the same and the
# number of variables doubles.
adata = anndata.concat([adata_spliced, adata_unspliced], axis=1)

# Keep a copy of the raw counts in a layer (for the negative binomial model).
adata.layers["counts"] = adata.X.copy()  # preserve counts

# Restore the per-cell annotations from the original object.
adata.obs = adata_old.obs.copy()

In [6]:
# Display the summary of the concatenated spliced/unspliced AnnData object.
adata

AnnData object with n_obs × n_vars = 9772 × 64570
    obs: 'barcode', 'library_id', 'donor', 'Sex', 'ID_barcode', 'Cluster'
    var: 'gene_id', 'gene_name', 'Spliced'
    layers: 'counts'

In [7]:
# Write the concatenated spliced/unspliced AnnData object to a loom file in the
# same allen_{name} directory the subset was read from.
adata.write_loom(f'./data/loom_allen_kb/allen_{name}/counts_subset_concat.loom')