In [1]:
import pickle
import torch
import scanpy as sc

In [2]:
import pickle
import json
import os
import pandas as pd
import numpy as np

# Define data directory
DATA_DIR = '/orcd/data/omarabu/001/Omnicell_datasets/GenePT_emebdding_v2'

# Load embeddings
with open(os.path.join(DATA_DIR, 'GenePT_gene_embedding_ada_text.pickle'), 'rb') as f:
    gene_emb_ada = pickle.load(f)

with open(os.path.join(DATA_DIR, 'GenePT_gene_protein_embedding_model_3_text.pickle.'), 'rb') as f:
    gene_prot_emb = pickle.load(f)

# Load summaries
with open(os.path.join(DATA_DIR, 'NCBI_summary_of_genes.json'), 'r') as f:
    ncbi_summaries = json.load(f)

with open(os.path.join(DATA_DIR, 'NCBI_UniProt_summary_of_genes.json'), 'r') as f:
    uniprot_summaries = json.load(f)

# Print info about loaded data
print("Data shapes:")
print(f"Gene ADA embeddings shape: {np.array(gene_emb_ada).shape}")
print(f"Gene protein embeddings shape: {np.array(gene_prot_emb).shape}")
print(f"Number of NCBI summaries: {len(ncbi_summaries)}")
print(f"Number of UniProt summaries: {len(uniprot_summaries)}")

Data shapes:
Gene ADA embeddings shape: ()
Gene protein embeddings shape: ()
Number of NCBI summaries: 33703
Number of UniProt summaries: 33703


In [3]:
print(len(gene_emb_ada.keys()))
print(len(gene_prot_emb.keys()))

gene_prot_emb.keys()


#Saving all the keys in a list and writing it to a file
keys = gene_prot_emb.keys()
with open('keys.txt', 'w') as f:
    for item in keys:
        f.write("%s\n" % item)

93800
133736


In [4]:
from omnicell.data.catalogue import Catalogue

In [5]:
dd = Catalogue.get_dataset_details("repogle_k562_essential_raw")

In [6]:
adata = sc.read_h5ad(dd.path, backed="r+")

In [12]:
adata.obs[dd.pert_key].unique()

['NAF1', 'BUB1', 'UBL5', 'C9orf16', 'TIMM9', ..., 'RPL7A', 'NUP155', 'FDPS', 'RBM22', 'POLR3A']
Length: 2058
Categories (2058, object): ['AAAS', 'AAMP', 'AARS', 'AARS2', ..., 'ZRSR2', 'ZW10', 'ZWINT', 'non-targeting']

In [13]:
keys = gene_prot_emb.keys()


In [7]:
gene_prot_emb["IFNAR1"]

[-0.038623977452516556,
 0.011514193378388882,
 0.004291553050279617,
 0.010175848379731178,
 -0.013029509223997593,
 -0.057471420615911484,
 0.0020365521777421236,
 -0.023470815271139145,
 -0.003036163281649351,
 0.016624238342046738,
 -0.034022726118564606,
 -0.015330135822296143,
 0.04320311173796654,
 -0.02543962001800537,
 -0.029554201290011406,
 0.0445525161921978,
 -0.027275696396827698,
 0.012576020322740078,
 0.009772132150828838,
 0.0008924605790525675,
 0.015650896355509758,
 -0.005884294863790274,
 0.011646921746432781,
 -0.012454353272914886,
 -0.003047224134206772,
 0.0065866494551301,
 -0.01589423231780529,
 -0.01023115124553442,
 -0.05406472086906433,
 -0.011315100826323032,
 -0.0536222942173481,
 -0.001791834132745862,
 0.007377489935606718,
 -0.029930265620350838,
 -0.01622605323791504,
 -0.02530689164996147,
 -0.018438193947076797,
 0.0035449557472020388,
 -0.00019684596918523312,
 -0.0440879687666893,
 0.02654569037258625,
 0.06008174642920494,
 -0.00525936484336853

In [14]:
#Check which perts are in the gene embeddings with a set intersection
pert_in_gene_emb = set(adata.obs[dd.pert_key].unique()).intersection(keys)

In [15]:
len(pert_in_gene_emb)

2038

In [17]:
#Perts not in gene embeddings
pert_not_in_gene_emb = set(adata.obs[dd.pert_key].unique()).difference(keys)

In [18]:
pert_not_in_gene_emb

{'ALG1L',
 'C12orf45',
 'C1orf109',
 'C7orf26',
 'C9orf16',
 'CCDC130',
 'CCDC144NL',
 'DDN',
 'FRG2',
 'INTS2',
 'MBTPS1',
 'PHB',
 'POLR2B',
 'SPATA5',
 'SPATA5L1',
 'TDGF1',
 'WDR61',
 'WDR92',
 'ZNF720',
 'non-targeting'}