# 0. Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


import seaborn as sns

import os, sys, shutil, importlib, glob
from tqdm.notebook import tqdm

In [2]:
from celloracle import motif_analysis as ma
from celloracle.utility import save_as_pickled_object


In [3]:
# Report the installed versions of the packages this notebook relies on.
from celloracle import __version__ as co_version
from gimmemotifs import __version__ as gm_version
from genomepy import __version__ as gp_version

for package_label, package_version in [
    ("celloracle version: ", co_version),
    ("gimmemotifs version: ", gm_version),
    ("genomepy version: ", gp_version),
]:
    print(package_label, package_version)

celloracle version:  0.5.0
gimmemotifs version:  0.14.4
genomepy version:  0.8.4


# 1. Load data


In [4]:
# Location of the annotated peak table written by the ATAC-seq processing step.
peak_file = "../01_ATAC-seq_data_processing/option1_scATAC-seq_data_analysis_with_cicero/peak_file.parquet"
peaks = pd.read_parquet(peak_file)

# Preview the first rows.
peaks.head()

Unnamed: 0,peak_id,gene_short_name
0,chr10_100015291_100017830,Kitl
1,chr10_100486534_100488209,Tmtc3
2,chr10_100588641_100589556,4930430F08Rik
3,chr10_100741247_100742505,Gm35722
4,chr10_101681379_101682124,Mgat4c


# 2. Check data

In [5]:
# Report how many distinct peak IDs the table contains.
unique_peak_ids = peaks.peak_id.unique()
print(f"Number of peak: {len(unique_peak_ids)}")

def getLength(x):
    """Return the length in bases of the peak described by ``x["peak_id"]``.

    Args:
        x: A row-like object (e.g. a pandas Series or dict) whose "peak_id"
            entry is a string formatted as "<chromosome>_<start>_<end>".

    Returns:
        int: Absolute difference between the end and start coordinates.

    Raises:
        ValueError: If the ID has fewer than three "_"-separated fields or
            the coordinates are not integers.
    """
    # Split from the right: chromosome/contig names may themselves contain
    # underscores (e.g. "chr1_GL456210_random"), and only the last two
    # fields are the coordinates.
    _, start, end = x["peak_id"].rsplit("_", 2)
    return abs(int(end) - int(start))

# Length of the peak on every row of the table, followed by the mean length.
df = peaks.apply(getLength, axis=1)
mean_peak_length = df.values.mean()
print(f"Mean peak length: {mean_peak_length}")

Number of peak: 13919
Mean peak length: 1756.1744260204082


## 2.1. Remove short peaks
Short DNA fragments of fewer than 5 bases cannot be used for motif scanning, so we remove them here.

In [6]:
# Keep only the rows whose peak is at least 5 bases long.
long_enough = df >= 5
peaks = peaks[long_enough]

# 3. Instantiate TFinfo object and search for TF binding motifs
The motif analysis module has a custom class, TFinfo. The TFinfo object converts peak data into DNA sequences and scans those sequences for TF binding motifs. The results of the motif scan are then filtered and converted into either a Python dictionary or a dataframe, depending on your preference. This TF information is necessary for GRN inference.

## 3.1 check reference genome installation

In [7]:
# PLEASE make sure that you are setting correct ref genome.
# This name is reused below when installing the genome and when building the TFinfo object.
ref_genome = "mm10"

# Ask celloracle whether the reference genome is installed; the result decides
# whether the installation step in the next section has to download it.
genome_installation = ma.is_genome_installed(ref_genome=ref_genome)
print(ref_genome, "installation: ", genome_installation)

mm10 installation:  True


## 3.2. Install reference genome (if refgenome is not installed)

In [8]:
# Nothing to do when the genome is already present; otherwise fetch it from UCSC.
if genome_installation:
    print(ref_genome, "is installed.")
else:
    import genomepy
    genomepy.install_genome(ref_genome, "UCSC")

mm10 is installed.


## 3.3. Instantiate TFinfo object

In [9]:
# Build the TFinfo object from the filtered peaks and the chosen reference genome.
tfi = ma.TFinfo(
    peak_data_frame=peaks,  # peak info calculated from ATAC-seq data
    ref_genome=ref_genome,
)

# 4. Scan motifs and save object

You can set TF binding motif information as an argument: tfi.scan(motifs=motifs)

If you don't set motifs or set None, default motifs will be loaded automatically.

- For mouse and human, "gimme.vertebrate.v5.0" will be used as the default motif set.
- For other species, species-specific TF binding motif data extracted from CisDB ver2.0 will be used.



## 4.1. [Optional step] Load motifs
If you want to use motifs other than the default, you need to load them and pass them in when you scan: tfi.scan(motifs=motifs)
    
### 4.1.1. [Optional step] Load motif data from gimmemotifs dataset

Many other motif databases are included with GimmeMotifs. https://gimmemotifs.readthedocs.io/en/master/overview.html
You can load them as follows.

In [10]:
# Locate the directory in which GimmeMotifs keeps its motif databases.
import os, glob
from gimmemotifs.motif import MotifConfig
config = MotifConfig()
motif_dir = config.get_motif_dir()

# Collect the names of the available motif files (*.pfm) in sorted order.
motifs_data_name = sorted(
    file_name for file_name in os.listdir(motif_dir) if file_name.endswith(".pfm")
)
motifs_data_name

['CIS-BP.pfm',
 'ENCODE.pfm',
 'HOCOMOCOv10_HUMAN.pfm',
 'HOCOMOCOv10_MOUSE.pfm',
 'HOCOMOCOv11_HUMAN.pfm',
 'HOCOMOCOv11_MOUSE.pfm',
 'HOMER.pfm',
 'IMAGE.pfm',
 'JASPAR2018.pfm',
 'JASPAR2018_fungi.pfm',
 'JASPAR2018_insects.pfm',
 'JASPAR2018_nematodes.pfm',
 'JASPAR2018_plants.pfm',
 'JASPAR2018_urochordates.pfm',
 'JASPAR2018_vertebrates.pfm',
 'JASPAR2020.pfm',
 'JASPAR2020_fungi.pfm',
 'JASPAR2020_insects.pfm',
 'JASPAR2020_nematodes.pfm',
 'JASPAR2020_plants.pfm',
 'JASPAR2020_urochordates.pfm',
 'JASPAR2020_vertebrates.pfm',
 'RSAT_insects.pfm',
 'RSAT_plants.pfm',
 'RSAT_vertebrates.pfm',
 'SwissRegulon.pfm',
 'factorbook.pfm',
 'gimme.vertebrate.v5.0.pfm']

In [11]:
# Motif files are parsed with gimmemotifs' "read_motifs".
from gimmemotifs.motif import read_motifs

# Example: load one of the databases listed above.
motif_file_name = "JASPAR2018_plants.pfm"
path = os.path.join(motif_dir, motif_file_name)
motifs = read_motifs(path)

# Show the first 10 motifs.
motifs[:10]

[MA0020.1_Dof2_AAAGCn,
 MA0021.1_Dof3_AAAGyn,
 MA0034.1_Gam1_nnyAACCGmC,
 MA0044.1_HMG-1_sTTGTnyTy,
 MA0045.1_HMG-I/Y_nwAnAAAnrnmrAmAy,
 MA0053.1_MNB1A_AAAGC,
 MA0054.1_myb.Ph3_TAACnGTTw,
 MA0064.1_PBF_AAAGy,
 MA0082.1_squamosa_mCAwAwATrGwAAn,
 MA0096.1_bZIP910_mTGACGT]

### 4.1.2 [Optional step] Load motif data from celloracle motif dataset

Celloracle provides several motif datasets that were generated from the CIS-BP database (called "CisDB" in the file names). http://cisbp.ccbr.utoronto.ca/

These motifs are organized by species. Please select the motifs for your species.

In [12]:
# Check available motifs: display the motif file names exposed by celloracle's
# motif_analysis module. One of these names is passed to ma.load_motifs below.
ma.MOTIFS_LIST

['CisDB_ver2_Mus_musculus.pfm',
 'CisDB_ver2_Saccharomyces_cerevisiae.pfm',
 'CisDB_ver2_Danio_rerio.pfm',
 'CisDB_ver2_Homo_sapiens.pfm']

In [13]:
# Pick the mouse motif file from the celloracle dataset and load it.
mouse_motif_file = "CisDB_ver2_Mus_musculus.pfm"
motifs = ma.load_motifs(mouse_motif_file)

# Show the first 10 motifs.
motifs[:10]

[M00008_2.00_nnnAAww,
 M00044_2.00_nrTAAACAn,
 M00056_2.00_TAATAAAT,
 M00060_2.00_nnnTTCnnn,
 M00111_2.00_nGCCynnGGs,
 M00112_2.00_CCTsrGGCnA,
 M00113_2.00_nsCCnnAGGs,
 M00114_2.00_nnGCCynnGG,
 M00115_2.00_nnATnAAAn,
 M00116_2.00_nnAATATTAnn]

### 4.1.3. [Optional step] Use custom motif data
We provide instructions for loading / making motifs in a separate notebook.
If you want to use other motifs, please refer to that notebook.

## 4.2. Motif scan

In [14]:
%%time
# Scan motifs. !!CAUTION!! This step may take several hours if you have many peaks!
scan_settings = {
    "fpr": 0.02,
    "motifs": None,  # If you enter None, default motifs will be loaded.
    "verbose": True,
}
tfi.scan(**scan_settings)

# Persist the scanned TFinfo object to disk.
tfinfo_path = "test1.celloracle.tfinfo"
tfi.to_hdf5(file_path=tfinfo_path)

No motif data entered. Loading default motifs for your species ...
 Default motif for vertebrate: gimme.vertebrate.v5.0. 
 For more information, please go https://gimmemotifs.readthedocs.io/en/master/overview.html
Initiating scanner...
Calculating FPR-based threshold. This step may take substantial time when you load new motifs or new ref-genome. It will be done quicker on the second time.
Convert peak info into DNA sequences ...
Scanning motifs ... It may take several hours if you proccess many peaks.


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))




KeyboardInterrupt: 

In [15]:
# Check motif scan results by previewing the first rows of the scan table.
# NOTE(review): in the recorded run the scan was interrupted, scanned_df was None
# and this cell raised AttributeError. Let the scan finish before running it.
tfi.scanned_df.head()

AttributeError: 'NoneType' object has no attribute 'head'

We now have a score for each sequence and motif_id pair.
In the next step we will filter out the motifs with low scores.

# 5. Filtering motifs

In [None]:
# Discard any previous filtering so the cell can be re-run from a clean state.
tfi.reset_filtering()

# Apply the score-based filter.
score_threshold = 10.5
tfi.filter_motifs_by_score(threshold=score_threshold)

# Post-filtering step: convert the results into the dataframe / dictionary formats.
tfi.make_TFinfo_dataframe_and_dictionary(verbose=True)



# 6. Get Final results

## 6.1. Get results as a dictionary

In [None]:
# Export the filtered results as a dictionary of type "targetgene2TFs"
# (presumably target gene -> list of TFs; confirm against the celloracle docs).
td = tfi.to_dictionary(dictionary_type="targetgene2TFs")


## 6.2. Get results as a dataframe

In [None]:
# Export the filtered results as a dataframe and preview the first rows.
# NOTE: this rebinds `df`, which earlier held the per-peak lengths used by the
# short-peak filter; re-run the "Check data" cell before re-running that filter.
df = tfi.to_dataframe()
df.head()

# 7. Save TFinfo as dictionary or dataframe
We'll use this information when making the GRNs. Save the results.

In [None]:
# Output directory and the two files written into it.
folder = "TFinfo_outputs"
os.makedirs(folder, exist_ok=True)
dictionary_path = os.path.join(folder, "TFinfo_targetgene2TFs.pickled")
dataframe_path = os.path.join(folder, "TFinfo_dataframe.parquet")

# Save TFinfo as a pickled dictionary.
td = tfi.to_dictionary(dictionary_type="targetgene2TFs")
save_as_pickled_object(td, dictionary_path)

# Save TFinfo as a parquet dataframe.
df = tfi.to_dataframe()
df.to_parquet(dataframe_path)