<a href="https://colab.research.google.com/github/pachterlab/LSCHWCP_2023/blob/main/Notebooks/Supp_Fig_9/Supp_Fig_9ab/get_gene_weights.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Get predictive genes and their weights in the logistic regression model

In [1]:
!pip install -q anndata
import anndata
import pickle
import pandas as pd
import numpy as np

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/119.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━[0m [32m112.6/119.2 kB[0m [31m3.4 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.2/119.2 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25h

Download the count matrices (generated [here](https://github.com/pachterlab/LSCHWCP_2023/tree/main/Notebooks/align_macaque_PBMC_data/1_virus_no_mask) (virus) and [here](https://github.com/pachterlab/LSCHWCP_2023/tree/main/Notebooks/Supp_Fig_3/Supp_Fig_3abc) (macaque)) and the pickled logistic regression results from Caltech Data:

In [2]:
!wget https://data.caltech.edu/records/sh33z-hrx98/files/virus_no_mask.h5ad?download=1
!mv virus_no_mask.h5ad?download=1 virus_no_mask.h5ad

!wget https://data.caltech.edu/records/sh33z-hrx98/files/macaque_QC_norm_leiden_celltypes.h5ad?download=1
!mv macaque_QC_norm_leiden_celltypes.h5ad?download=1 macaque_QC_norm_leiden_celltypes.h5ad

!wget https://data.caltech.edu/records/sh33z-hrx98/files/supp_viruses_hv_genes_fullM_cov_donor_time_l2.pickle?download=1
!mv supp_viruses_hv_genes_fullM_cov_donor_time_l2.pickle?download=1 supp_viruses_hv_genes_fullM_cov_donor_time_l2.pickle

--2023-12-12 18:55:39--  https://data.caltech.edu/records/sh33z-hrx98/files/virus_no_mask.h5ad?download=1
Resolving data.caltech.edu (data.caltech.edu)... 35.155.11.48
Connecting to data.caltech.edu (data.caltech.edu)|35.155.11.48|:443... connected.
HTTP request sent, awaiting response... 302 FOUND
Location: https://s3.us-west-2.amazonaws.com/caltechdata/32/a5/1c1a-bb66-4f66-a133-60763da8d716/data?response-content-type=application%2Foctet-stream&response-content-disposition=attachment%3B%20filename%3Dvirus_no_mask.h5ad&X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=AKIARCVIVNNAP7NNDVEA%2F20231212%2Fus-west-2%2Fs3%2Faws4_request&X-Amz-Date=20231212T185539Z&X-Amz-Expires=60&X-Amz-SignedHeaders=host&X-Amz-Signature=bee1d8eab6a5d90fd920e51d9a4272e29c3506b96800dadcf68a6e0aa63a15d4 [following]
--2023-12-12 18:55:39--  https://s3.us-west-2.amazonaws.com/caltechdata/32/a5/1c1a-bb66-4f66-a133-60763da8d716/data?response-content-type=application%2Foctet-stream&response-content-disposition=atta

In [3]:
# Read the downloaded virus and host (macaque) count matrices into AnnData objects
virus_adata, host_adata = (
    anndata.read_h5ad(h5ad_file)
    for h5ad_file in ("virus_no_mask.h5ad", "macaque_QC_norm_leiden_celltypes.h5ad")
)

  utils.warn_names_duplicates("obs")
  utils.warn_names_duplicates("obs")


Set the path to the model results downloaded from Caltech Data above (the models were built and tested in [this notebook](https://github.com/pachterlab/LSCHWCP_2023/blob/main/Notebooks/Figure_8/Figure_8bc/run_regressions.ipynb)):

In [4]:
# Path to the pickled model results; the file is fetched by the download cell above
result_dict_path = "supp_viruses_hv_genes_fullM_cov_donor_time_l2.pickle"

Filter matrices:

In [7]:
# Which host genes the filter cell below keeps: 'all' keeps every gene, 'hv' keeps only the highly variable genes
genes_kind = "hv"

In [8]:
# Build a per-cell key (sample barcode + cell barcode) that is comparable across both matrices.
# The virus key is derived from the obs index, so it has to be computed exactly once on the
# freshly loaded matrix: recomputing it after the index has been replaced by the key would
# prepend the sample barcode a second time, and the virus keys would stop matching the host keys.
host_adata.obs['unique_bc'] = host_adata.obs['sample_barcode'].astype('str') + host_adata.obs['barcode'].astype('str')
virus_adata.obs['unique_bc'] = virus_adata.obs['sample_barcode'].astype('str') + virus_adata.obs.index.astype('str')
host_adata.obs.index = host_adata.obs['unique_bc']
virus_adata.obs.index = virus_adata.obs['unique_bc']

# Remove non macaque cells and genes
host_adata = host_adata[host_adata.obs["species"] == "macaca_mulatta", host_adata.var["species"] == "macaca_mulatta"]
# Remove null cell types
virus_adata = virus_adata[virus_adata.obs["celltype"].notnull(), :]

# Only keep host cells that are also present in the filtered virus matrix
host_adata = host_adata[host_adata.obs.unique_bc.isin(virus_adata.obs.unique_bc), :]

# Restrict the host genes according to genes_kind
if genes_kind == 'all':
    # Keep every macaque gene
    pass
elif genes_kind == 'hv':
    # Keep only the genes flagged as highly variable
    host_adata = host_adata[:, host_adata.var.highly_variable == True]
else:
    # Fail loudly instead of silently keeping all genes, which would no longer
    # line up with the gene set selected by genes_kind
    raise ValueError(
        f"Unsupported genes_kind {genes_kind!r}; this notebook implements 'all' and 'hv'"
    )

  host_adata.obs['unique_bc'] = host_adata.obs['sample_barcode'].astype('str') + host_adata.obs['barcode'].astype('str')
  virus_adata.obs['unique_bc'] = virus_adata.obs['sample_barcode'].astype('str') + virus_adata.obs.index.astype('str')


Extract predictive genes and their weights:

In [13]:
# IDs of the viruses to export gene weights for; each one is looked up in results_dict['viruses'] below
top_viruses = ['u10', 'u102540', 'u11150', 'u202260', 'u39566', 'u134800', 'u102324']

In [14]:
# SECURITY NOTE: pickle.load can execute arbitrary code while unpickling.
# Only load this file when it comes from the trusted Caltech Data record.
with open(result_dict_path, 'rb') as handle:
    results_dict = pickle.load(handle)

virus_genes = host_adata.var.gene_id
model_viruses = np.array(results_dict['viruses'])

weighted_gene_columns = {}
for v in top_viruses:
    # Row of the weight matrix that belongs to this virus
    matches = np.where(model_viruses == v)[0]
    if len(matches) == 0:
        raise IndexError(f"Virus {v!r} was not found in results_dict['viruses']")
    index = matches[0]

    # The first len(virus_genes) entries of the row are taken as the per-gene weights
    # (presumably the remaining entries belong to the covariates - TODO confirm against the model notebook)
    virus_weights = results_dict['weights'][index, :len(virus_genes)].flatten()
    if len(virus_weights) != len(virus_genes):
        # zip() would silently truncate and drop genes otherwise
        raise ValueError(
            f"Got {len(virus_weights)} weights for {len(virus_genes)} genes for virus {v!r}"
        )

    # Sort once, from highest to lowest weight (ties are ordered by gene ID, descending)
    ranked = sorted(zip(virus_weights, virus_genes), reverse=True)
    weighted_gene_columns[f'{v} Ensembl ID'] = [gene for _, gene in ranked]
    weighted_gene_columns[f'{v} weight'] = [weight for weight, _ in ranked]

# One pair of columns (Ensembl ID, weight) per virus, in the order of top_viruses
weighted_gene_df = pd.DataFrame(weighted_gene_columns)

weighted_gene_df.to_csv('gene_weights.csv')