# TF info preprocessing
Preprocess the SCENIC TF-motif databases so that they can be used as input to CellOracle.

In [1]:
import json

import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns

## 1. Load Expression Matrix

In [2]:
# Load the expression data: the pancreas dataset from scvelo (already preprocessed).
adata = sc.read_h5ad("./inputs/pancreas_data.h5ad")

In [3]:
# List of all the genes in our dataset.
# The names are read straight from `adata.var_names`; going through
# `adata.to_df()` would build a full DataFrame of the expression matrix
# only to read its column labels.
genes = np.asarray(adata.var_names)
genes

array(['Sntg1', 'Snhg6', 'Ncoa2', ..., 'Ddx3y', 'Eif2s3y', 'Erdr1'],
      dtype=object)

In [4]:
#Create .txt with the list of Pancreas genes
#with open('pancreas_genes.txt', 'w') as f:
#    for item in list(adata.to_df().columns):
#        f.write("%s\n" % item)

## 2. Load List of Transcription Factors

In [5]:
# List of TFs for mus musculus from SCENIC (the file is read without a header row).
TFs_all = pd.read_csv("./inputs/mm_mgi_tfs.txt", header=None)

# TFs that are also genes of the pancreas dataset.
# np.intersect1d returns the sorted, unique common values; the TF table is
# flattened explicitly before the comparison.
all_tf_names = TFs_all.to_numpy().ravel()
TFs_pancreas = np.intersect1d(genes, all_tf_names)

## 3. Load Motif - TF table

In [6]:
# Motif -> TF annotation table. Only the motif id and gene name columns are
# read, and only rows whose gene is one of the pancreas TFs are kept.
TF_info = (
    pd.read_table(
        "../scenic/inputs/motifs-v9-nr.mgi-m0.001-o0.0.tbl",
        usecols=["#motif_id", "gene_name"],
    )
    .loc[lambda annotations: annotations["gene_name"].isin(TFs_pancreas)]
    .reset_index(drop=True)
)
TF_info.head()

Unnamed: 0,#motif_id,gene_name
0,bergman__EcR_usp,Nr1h4
1,bergman__EcR_usp,Nr2f6
2,bergman__EcR_usp,Vdr
3,bergman__Eip74EF,Egr1
4,bergman__Eip74EF,Ehf


## 4. Load Ranking databases (Motif - genes)

In [7]:
# Motif ranking database: one row per motif (id stored in the "features"
# column) and one column per gene.
feather1 = pd.read_feather("../scenic/inputs/mm9-500bp-upstream-10species.mc9nr.feather")

# Keep only the motifs that are annotated to one of our TFs.
feather1 = feather1[feather1.features.isin(TF_info['#motif_id'].unique())].reset_index(drop=True)
names1 = feather1.features

# Binarize: 1 when the value for a (motif, gene) pair is <= 1500, else 0.
# NOTE(review): 1500 is presumably a top-rank cutoff -- confirm the intended value.
feather1 = (feather1.drop(["features"], axis=1) <= 1500).astype(int)

# Drop the genes that have no hit for any motif. The mask has to be computed
# on the binarized matrix: the previous version evaluated `sum() == 0` on the
# raw values (before thresholding), so it did not remove the all-zero columns.
# As a consequence, genes without any hit no longer appear downstream.
feather1 = feather1.loc[:, feather1.sum() > 0]

# Re-attach the motif ids as the last column.
feather1['features'] = np.asarray(names1)
feather1.head()

Unnamed: 0,0610007C21Rik,0610007L01Rik,0610007P08Rik,0610007P14Rik,0610007P22Rik,0610008F07Rik,0610009B14Rik,0610009B22Rik,0610009D07Rik,0610009O20Rik,...,Zxdc,Zyg11a,Zyg11b,Zyx,Zzef1,Zzz3,a,l7Rn6,rp9,features
0,0,0,0,0,0,1,0,0,0,0,...,0,1,0,1,0,0,1,0,0,hocomoco__CEBPG_MOUSE.H11MO.0.B
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,flyfactorsurvey__tai_Clk_SANGER_5_FBgn0023076
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,jaspar__MA0509.1
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,transfac_pro__M07695
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,1,0,0,0,0,0,cisbp__M4475


## 5. Merge everything to obtain a dictionary mapping each target gene to its transcription factors

In [8]:
# Attach the TF (gene_name) annotated to each motif to that motif's row of hits.
# All rows of the hit matrix are kept (right join on the motif id).
# NOTE: `feather1` is overwritten here, so re-running this cell without first
# re-running the previous one merges an already merged table.
feather1 = pd.merge(
    TF_info,
    feather1,
    how="right",
    left_on="#motif_id",
    right_on="features",
)

In [9]:
# Collapse motifs into transcription factors: sum the hits of all the motifs
# annotated to the same TF. The motifs were only needed for the mapping.
# `numeric_only=True` leaves out the string columns ("#motif_id", "features");
# with pandas >= 2.0 the default would otherwise concatenate them per group.
TF_genes = feather1.groupby("gene_name").sum(numeric_only=True)

In [21]:
# Only keep the target-gene columns that are in our dataset.
# A boolean column mask selects them directly; transposing the matrix twice
# copies it each time and upcasts every value to object when the column
# dtypes are mixed.
TF_genes = TF_genes.loc[:, TF_genes.columns.isin(genes)]
TF_genes

Unnamed: 0_level_0,0610010F05Rik,0610011F06Rik,0610030E20Rik,1110002L01Rik,1110034G24Rik,1500009L16Rik,1700011H14Rik,1700026L06Rik,1700086L19Rik,1810041L15Rik,...,Zfp804a,Zfp90,Zfp949,Zfpm1,Zfr2,Zim1,Zmym5,Znrf2,Zswim5,Zwint
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A1cf,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Arg1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
Arid3a,2,31,0,0,21,7,16,89,7,0,...,10,0,0,0,6,0,0,201,26,0
Arid5b,0,7,0,0,0,1,0,1,0,0,...,7,0,0,0,0,0,0,3,1,0
Arx,11,12,0,0,4,9,210,6,0,1,...,58,0,0,0,0,0,1,191,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Zfp710,0,0,0,0,0,0,1,0,0,1,...,0,0,0,1,0,0,0,2,1,0
Zfp90,0,0,0,0,0,3,0,2,0,0,...,0,0,0,0,0,0,0,1,3,0
Zfp949,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
Zfpm1,0,0,0,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
# Build the dictionary: each column of TF_genes (a target gene) is mapped to
# the list of row labels (TFs) whose count in that column is greater than zero.
TF_targets = {
    target_gene: list(TF_genes[target_gene][TF_genes[target_gene] > 0].index)
    for target_gene in TF_genes.columns
}

In [23]:
# Write the target gene -> TFs dictionary to disk as JSON.
# (`json` is imported with the other modules at the top of the notebook.)
with open('TF_targets_pancreas.json', 'w') as fp:
    json.dump(TF_targets, fp)

In [2]:
#path = '../scenic/inputs/'
#datasets = {'500-10':'mm9-500bp-upstream-10species.mc9nr.feather', 
#            '500-7':'mm9-500bp-upstream-7species.mc9nr.feather',
#            '10-10':'mm9-tss-centered-10kb-10species.mc9nr.feather',
#            '10-7':'mm9-tss-centered-10kb-7species.mc9nr.feather',
#            '5-10':'mm9-tss-centered-5kb-10species.mc9nr.feather',
#            '5-7':'mm9-tss-centered-5kb-7species.mc9nr.feather'}