<a href="https://colab.research.google.com/github/pachterlab/LSCHWCP_2023/blob/main/Notebooks/Supp_Fig_3/Supp_Fig_3abc/get_gene_weights.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Get predictive genes and their weights in the logistic regression model

In [None]:
!pip install -q anndata
import anndata
import pickle
import pandas as pd
import numpy as np

Load count matrices from Caltech Data (generated [here](https://github.com/pachterlab/LSCHWCP_2023/tree/main/Notebooks/align_macaque_PBMC_data/1_virus_no_mask) (virus) and [here](https://github.com/pachterlab/LSCHWCP_2023/tree/main/Notebooks/Supp_Fig_3/Supp_Fig_3abc) (macaque)):

In [None]:
# !wget virus_no_mask.h5ad
# !wget macaque_QC_norm_leiden_celltypes.h5ad

In [None]:
# Read both count matrices (virus first, then macaque host) from the
# working directory; the files are expected to have been downloaded already.
virus_adata, host_adata = (
    anndata.read_h5ad(h5ad_file)
    for h5ad_file in (
        "virus_no_mask.h5ad",
        "macaque_QC_norm_leiden_celltypes.h5ad",
    )
)

Load model from Caltech Data (the models were built and tested in [this notebook](https://github.com/pachterlab/LSCHWCP_2023/blob/main/Notebooks/Figure_8/Figure_8bc/run_regressions.ipynb)):

In [None]:
# TODO: the download command below was left without a URL in the notebook.
# !wget
# Local path of the pickled regression results that are opened further down.
result_dict_path = (
    "supp_viruses_hv_genes_fullM_cov_donor_time_l2.pickle"
)

Filter matrices:

In [None]:
# Which host genes to keep; this has to match the gene set the loaded model
# was trained on, otherwise the weight columns will not line up with the genes.
# NOTE(review): 'hv' is inferred from the model filename
# ("supp_viruses_hv_genes_...") -- confirm against the regression notebook.
genes_kind = 'hv'  # supported here: 'all' (every gene), 'hv' (highly variable genes)

# Give both matrices the same per-cell key: sample barcode + cell barcode.
# host_adata is expected to already carry a 'unique_bc' column in .obs.
virus_adata.obs['unique_bc'] = virus_adata.obs['sample_barcode'].astype('str') + virus_adata.obs.index.astype('str')
host_adata.obs.index = host_adata.obs['unique_bc']
virus_adata.obs.index = virus_adata.obs['unique_bc']

# Keep only cells and genes annotated as macaque in the host matrix
host_adata = host_adata[host_adata.obs["species"] == "macaca_mulatta", host_adata.var["species"] == "macaca_mulatta"]
# Drop virus-matrix cells without a cell type annotation
virus_adata = virus_adata[virus_adata.obs["celltype"].notnull(), :]

# Restrict the host matrix to cells that survived the virus-matrix filtering
shared_cells = host_adata.obs.unique_bc.isin(virus_adata.obs.unique_bc)
host_adata = host_adata[shared_cells, :]

# Restrict the host genes according to genes_kind. The cell filter was already
# applied above, so only the gene axis needs to be subset here.
if genes_kind == 'all':
    pass  # keep every macaque gene
elif genes_kind == 'hv':
    # '== True' is kept on purpose so that non-boolean / missing flags are
    # treated as "not highly variable" exactly as before.
    host_adata = host_adata[:, host_adata.var.highly_variable == True]
else:
    # Fail loudly instead of silently keeping a gene set the model was not
    # trained on (e.g. the 'threshN' option is not implemented in this notebook).
    raise ValueError(
        f"Unsupported genes_kind {genes_kind!r}; expected 'all' or 'hv'."
    )



Extract predictive genes and their weights:

In [None]:
# IDs of the seven viruses whose gene weights are extracted below.
top_viruses_top7 = [
    'u10',
    'u102540',
    'u11150',
    'u202260',
    'u39566',
    'u134800',
    'u102324',
]

In [None]:
# Load the pickled regression results.
# NOTE: unpickling can execute arbitrary code -- only load model files that
# come from a trusted source.
with open(result_dict_path, 'rb') as handle:
    results_dict = pickle.load(handle)

# Gene IDs of the filtered host matrix, in column order.
virus_genes = host_adata.var.gene_id
# Virus IDs in the order used by the rows of results_dict['weights'];
# converted once so the lookup inside the loop does not rebuild the array.
model_viruses = np.array(results_dict['viruses'])
weighted_gene_df = pd.DataFrame()

# For every virus of interest, rank all genes by their model weight
# (largest weight first) and store gene IDs and weights side by side.
for v in top_viruses_top7:
    # Row of the weight matrix that belongs to this virus
    # (raises IndexError if the virus is not part of the model).
    index = np.where(model_viruses == v)[0][0]
    # Only the first len(virus_genes) weights are used; presumably the
    # remaining columns are the donor/time covariates -- TODO confirm.
    virus_weights = results_dict['weights'][index, :len(virus_genes)].flatten()
    # Sort (weight, gene) pairs once, then reverse for descending order.
    ranked = sorted(zip(virus_weights, virus_genes))[::-1]
    weighted_gene_df[f'{v} Ensembl ID'] = [gene for _, gene in ranked]
    weighted_gene_df[f'{v} weight'] = [weight for weight, _ in ranked]


weighted_gene_df.to_csv('gene_weights.csv')