# Build positive simulation dataset

In [None]:
import scanpy as sc
import os
import pickle
import numpy as np
from scipy.sparse import csr_matrix
def manipulate(adata,genes,lg2fc):
    """Simulate a stage-dependent expression shift in fibroblast cells.

    The rows of the expression matrix are shuffled in place with
    ``np.random.shuffle``.  Then, for every distinct value of
    ``adata.obs['stage']``, an offset is added to each target gene in the
    cells whose ``adata.obs['name.simple']`` starts with ``'Fibroblast'``.
    Values below zero are clipped to 0.  The result is stored dense in
    ``adata.layers['simu']`` and as a CSR matrix in ``adata.X``.

    ``adata.obs`` is only read, never modified, so the observation index
    (including its name) is left exactly as it was passed in.

    Parameters
    ----------
    adata
        AnnData-like object exposing ``X``, ``layers``, ``obs`` and ``var``.
        Modified in place.  On the first call a dense copy of ``X`` is saved
        in ``layers['raw']``; on later calls ``X`` is reset from that layer
        before simulating.
    genes
        Iterable of ``'<gene>:<direction>'`` strings.  Direction ``'+'`` adds
        ``(4 - int(stage)) * lg2fc``; any other direction adds
        ``(int(stage) + 1) * lg2fc``.
    lg2fc
        Step size multiplied by the stage-dependent factor.

    Returns
    -------
    The same ``adata`` object that was passed in.

    Raises
    ------
    ValueError
        If a gene name is not present in ``adata.var.index``.
    IndexError
        If an entry of ``genes`` contains no ``':'`` separator.
    """
    def _dense_copy(matrix):
        # Sparse matrices expose toarray(); anything else is treated as
        # already dense.  Either way a fresh array is returned.
        if hasattr(matrix, 'toarray'):
            return matrix.toarray()
        return np.array(matrix)

    # Resolve every target once, before touching adata, instead of rebuilding
    # the full gene-name list for every gene in every stage.
    var_names = adata.var.index.tolist()
    targets = []
    for entry in genes:
        parts = entry.split(':')
        name, direction = parts[0], parts[1]
        targets.append((var_names.index(name), direction))

    if 'raw' not in adata.layers.keys():
        adata.layers['raw'] = _dense_copy(adata.X)
    else:
        adata.X = _dense_copy(adata.layers['raw'])

    simu = _dense_copy(adata.X)
    if not np.issubdtype(simu.dtype, np.floating):
        # In-place addition of a float offset is rejected by integer arrays.
        simu = simu.astype(np.float64)
    # Shuffles whole rows, i.e. the expression profiles are permuted across
    # cells while adata.obs keeps its order.
    np.random.shuffle(simu)

    stage_labels = adata.obs['stage']
    # na=False: cells without a name are simply not fibroblasts.
    is_fibroblast = (
        adata.obs['name.simple']
        .str.startswith('Fibroblast', na=False)
        .to_numpy(dtype=bool)
    )

    for stage in stage_labels.unique():
        in_stage = (stage_labels == stage).to_numpy(dtype=bool)
        # Positional row numbers, independent of how adata.obs is indexed.
        rows = np.flatnonzero(in_stage & is_fibroblast)
        for column, direction in targets:
            if direction == '+':
                simu[rows, column] += (4 - int(stage)) * lg2fc
            else:
                simu[rows, column] += (int(stage) + 1) * lg2fc

    simu[simu < 0] = 0
    adata.layers['simu'] = simu
    adata.X = csr_matrix(simu)
    return adata

# Generate one positive simulation dataset per (drug, change) pair and write
# it, together with the drug's target genes, into its own directory.
adata = sc.read_h5ad('../to_published/mes_4/dataset.h5ad')
# NOTE: allow_pickle=True unpickles object arrays on load; only use it with
# trusted .npy files.
candidate = np.load('../data/lowest25_drug_simulation_candidates.npy',allow_pickle=True)
# NOTE(review): 'simualtion' is presumably the spelling of the file on disk;
# verify before "fixing" it.
targets = np.load('../data/lowest25_drug_simualtion_targets.npy',allow_pickle=True)

changes = [0.2,0.3,0.4]
random_s = 0
# candidate is only used for its length; targets[i] holds the genes of drug i.
for i in range(0,len(candidate)):

    for change in changes:
        # A distinct seed per (drug, change) pair keeps every dataset
        # reproducible on its own.
        random_s +=1
        np.random.seed(random_s)
        adata_copy_copy = adata.copy()
        out_dir = '../data/drug_simulation_positive/drug_%d_change_%.2f'%(i,change)
        # makedirs + exist_ok: survives re-runs and a missing parent directory.
        os.makedirs(out_dir, exist_ok=True)
        adata_copy = manipulate(adata_copy_copy,targets[i],change)

        # Drop everything except X/obs/var before writing.
        del adata_copy.layers
        del adata_copy.uns
        del adata_copy.obsp
        adata_copy.write(os.path.join(out_dir, 'dataset.h5ad'),compression='gzip',compression_opts=9)
        np.save(os.path.join(out_dir, 'genes.npy'),np.array(targets[i],dtype='object'))
            

# Build negative simulation dataset

In [None]:
import scanpy as sc
import os
import pickle
import numpy as np
from scipy.sparse import csr_matrix
def manipulate(adata,genes,lg2fc):
    """Build a negative control: add only tiny Gaussian noise to the targets.

    The rows of the expression matrix are shuffled in place with
    ``np.random.shuffle``.  Then, for every distinct value of
    ``adata.obs['stage']``, zero-mean Gaussian noise with standard deviation
    ``lg2fc * lg2fc / 900`` is added to each target gene in the cells whose
    ``adata.obs['name.simple']`` starts with ``'Fibroblast'``.  Values below
    zero are clipped to 0.  The result is stored dense in
    ``adata.layers['simu']`` and as a CSR matrix in ``adata.X``.

    ``adata.obs`` is only read, never modified, so the observation index
    (including its name) is left exactly as it was passed in.

    Parameters
    ----------
    adata
        AnnData-like object exposing ``X``, ``layers``, ``obs`` and ``var``.
        Modified in place.  On the first call a dense copy of ``X`` is saved
        in ``layers['raw']``; on later calls ``X`` is reset from that layer
        before simulating.
    genes
        Iterable of ``'<gene>:<direction>'`` strings.  The direction part is
        ignored here: both directions receive the same noise.
    lg2fc
        Controls the noise scale (see above).

    Returns
    -------
    The same ``adata`` object that was passed in.

    Raises
    ------
    ValueError
        If a gene name is not present in ``adata.var.index``.
    """
    def _dense_copy(matrix):
        # Sparse matrices expose toarray(); anything else is treated as
        # already dense.  Either way a fresh array is returned.
        if hasattr(matrix, 'toarray'):
            return matrix.toarray()
        return np.array(matrix)

    # Resolve every target column once, before touching adata.
    var_names = adata.var.index.tolist()
    target_columns = [var_names.index(entry.split(':')[0]) for entry in genes]

    if 'raw' not in adata.layers.keys():
        adata.layers['raw'] = _dense_copy(adata.X)
    else:
        adata.X = _dense_copy(adata.layers['raw'])

    simu = _dense_copy(adata.X)
    if not np.issubdtype(simu.dtype, np.floating):
        # In-place addition of float noise is rejected by integer arrays.
        simu = simu.astype(np.float64)
    # Shuffles whole rows, i.e. the expression profiles are permuted across
    # cells while adata.obs keeps its order.
    np.random.shuffle(simu)

    stage_labels = adata.obs['stage']
    # na=False: cells without a name are simply not fibroblasts.
    is_fibroblast = (
        adata.obs['name.simple']
        .str.startswith('Fibroblast', na=False)
        .to_numpy(dtype=bool)
    )
    noise_scale = lg2fc*lg2fc/900

    for stage in stage_labels.unique():
        # The drawn value is never used; the draw itself is kept so that the
        # seeded random stream, and therefore every generated dataset, stays
        # identical to what this code produced before.
        np.random.normal(0, 0.1*lg2fc)
        in_stage = (stage_labels == stage).to_numpy(dtype=bool)
        # Positional row numbers, independent of how adata.obs is indexed.
        rows = np.flatnonzero(in_stage & is_fibroblast)
        for column in target_columns:
            simu[rows, column] += np.random.normal(0, noise_scale, rows.shape)

    simu[simu < 0] = 0
    adata.layers['simu'] = simu
    adata.X = csr_matrix(simu)
    return adata

# Generate one negative (noise-only) simulation dataset per (drug, change)
# pair and write it, together with the drug's target genes, into its own
# directory.
adata = sc.read_h5ad('../to_published/mes_4/dataset.h5ad')
# NOTE(review): unlike the positive cell, these two files are loaded from the
# working directory rather than '../data/' -- confirm which location is
# intended.  'simualtion' is presumably the spelling of the file on disk.
# NOTE: allow_pickle=True unpickles object arrays on load; only use it with
# trusted .npy files.
candidate = np.load('lowest25_drug_simulation_candidates.npy',allow_pickle=True)
targets = np.load('lowest25_drug_simualtion_targets.npy',allow_pickle=True)
changes = [0.2,0.3,0.4]
random_s = 0
# candidate is only used for its length; targets[i] holds the genes of drug i.
for i in range(len(candidate)):

    for change in changes:
        # A distinct seed per (drug, change) pair keeps every dataset
        # reproducible on its own.
        random_s +=1
        np.random.seed(random_s)
        adata_copy_copy = adata.copy()

        out_dir = '../data/drug_simulation_negative/drug_%d_change_%.2f'%(i,change)
        # makedirs + exist_ok: survives re-runs and a missing parent directory.
        os.makedirs(out_dir, exist_ok=True)

        adata_copy = manipulate(adata_copy_copy,targets[i],change)

        # Drop everything except X/obs/var before writing.
        del adata_copy.layers
        del adata_copy.uns
        del adata_copy.obsp
        adata_copy.write(os.path.join(out_dir, 'dataset.h5ad'),compression='gzip',compression_opts=9)
        np.save(os.path.join(out_dir, 'genes.npy'),np.array(targets[i],dtype='object'))
        print('drug_simulation_negative')
        