In [5]:
import pickle
import torch
import scanpy as sc

In [7]:
import pickle
import json
import os
import pandas as pd
import numpy as np

# Define data directory
DATA_DIR = '/orcd/data/omarabu/001/Omnicell_datasets/GenePT_emebdding_v2'

# Load embeddings
with open(os.path.join(DATA_DIR, 'GenePT_gene_embedding_ada_text.pickle'), 'rb') as f:
    gene_emb_ada = pickle.load(f)

with open(os.path.join(DATA_DIR, 'GenePT_gene_protein_embedding_model_3_text.pickle.'), 'rb') as f:
    gene_prot_emb = pickle.load(f)

# Load summaries
with open(os.path.join(DATA_DIR, 'NCBI_summary_of_genes.json'), 'r') as f:
    ncbi_summaries = json.load(f)

with open(os.path.join(DATA_DIR, 'NCBI_UniProt_summary_of_genes.json'), 'r') as f:
    uniprot_summaries = json.load(f)

# Print info about loaded data
print("Data shapes:")
print(f"Gene ADA embeddings shape: {np.array(gene_emb_ada).shape}")
print(f"Gene protein embeddings shape: {np.array(gene_prot_emb).shape}")
print(f"Number of NCBI summaries: {len(ncbi_summaries)}")
print(f"Number of UniProt summaries: {len(uniprot_summaries)}")

Data shapes:
Gene ADA embeddings shape: ()
Gene protein embeddings shape: ()
Number of NCBI summaries: 33703
Number of UniProt summaries: 33703


In [12]:
gene_emb_ada["IFNAR2"]

[-0.040125515311956406,
 -0.0074463700875639915,
 -0.03513554856181145,
 -0.0018037190893664956,
 -0.009388342499732971,
 0.024422550573945045,
 -0.04377796873450279,
 0.029399657621979713,
 -0.014326867647469044,
 -0.016063068062067032,
 0.017709242179989815,
 -0.0031540971249341965,
 0.008063685148954391,
 0.016178814694285393,
 0.01316940039396286,
 -0.012790008448064327,
 0.00855239387601614,
 -0.0010827137157320976,
 0.013516640290617943,
 0.010539378970861435,
 -0.019316835328936577,
 0.028036419302225113,
 -0.009613404981791973,
 -0.020667213946580887,
 -0.020435720682144165,
 0.021824680268764496,
 0.0072341677732765675,
 -0.04979679360985756,
 -0.01382529828697443,
 -0.012191983871161938,
 -0.0005662745679728687,
 -0.013285147026181221,
 -0.006751889828592539,
 0.00846879929304123,
 -0.003761767176911235,
 -0.014314006082713604,
 0.006227814592421055,
 -0.0012185553787276149,
 0.024036727845668793,
 -0.009073253720998764,
 0.037424761801958084,
 0.009696999564766884,
 -0.00136

In [13]:
from omnicell.data.catalogue import Catalogue

In [19]:
dd = Catalogue.get_dataset_details("repogle_k562_essential_raw")

In [20]:
adata = sc.read_h5ad(dd.path, backed="r+")

In [21]:
adata.obs[dd.pert_key].unique()

['NAF1', 'BUB1', 'UBL5', 'C9orf16', 'TIMM9', ..., 'RPL7A', 'NUP155', 'FDPS', 'RBM22', 'POLR3A']
Length: 2058
Categories (2058, object): ['AAAS', 'AAMP', 'AARS', 'AARS2', ..., 'ZRSR2', 'ZW10', 'ZWINT', 'non-targeting']

In [22]:
gene_emb_keys = gene_emb_ada.keys()

In [23]:
#Check which perts are in the gene embeddings with a set intersection
pert_in_gene_emb = set(adata.obs[dd.pert_key].unique()).intersection(gene_emb_keys)

In [25]:
len(pert_in_gene_emb)

2037

In [27]:
#Perts not in gene embeddings
pert_not_in_gene_emb = set(adata.obs[dd.pert_key].unique()).difference(gene_emb_keys)

In [28]:
pert_not_in_gene_emb

{'ALG1L',
 'C12orf45',
 'C1orf109',
 'C7orf26',
 'C9orf16',
 'CCDC130',
 'CCDC144NL',
 'DDN',
 'FAM207A',
 'FRG2',
 'INTS2',
 'MBTPS1',
 'PHB',
 'POLR2B',
 'SPATA5',
 'SPATA5L1',
 'TDGF1',
 'WDR61',
 'WDR92',
 'ZNF720',
 'non-targeting'}