# 0. Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns

import os, sys, shutil, importlib, glob
from tqdm.notebook import tqdm

%config InlineBackend.figure_format = 'retina'

# Notebook-wide plotting defaults: figure size and the dpi used by savefig.
plt.rcParams.update({
    "figure.figsize": (15, 7),
    "savefig.dpi": 600,
})


In [2]:
from celloracle import motif_analysis as ma
from celloracle.utility import save_as_pickled_object

  mpl.use("Agg", warn=False)


# 1. Load data


In [4]:
# Read the annotated peak table (one row per peak_id / gene_short_name pair)
# written by the preceding ATAC-seq processing step.
peak_file = "../01_ATAC-seq_data_processing/option1_scATAC-seq_data_analysis_with_cicero/peak_file.parquet"
peaks = pd.read_parquet(peak_file)
peaks.head()

Unnamed: 0,peak_id,gene_short_name
0,chr10_100015291_100017830,Kitl
1,chr10_100486534_100488209,Tmtc3
2,chr10_100588641_100589556,4930430F08Rik
3,chr10_100741247_100742505,Gm35722
4,chr10_101681379_101682124,Mgat4c


# 2. Check data

In [5]:
# Sanity check: how many distinct peak ids does the table contain?
n_unique_peaks = len(peaks["peak_id"].unique())
print(f"number of peak: {n_unique_peaks}")

def getLength(x):
    """Return the length in bases of the peak named by ``x["peak_id"]``.

    Args:
        x: Mapping-like row (e.g. a pandas Series or dict) whose "peak_id"
            entry is a string of the form ``<chrom>_<start>_<end>``.

    Returns:
        int: ``end - start``.

    Raises:
        ValueError: If the id has fewer than two underscores, or if the
            start/end fields are not integers.
    """
    # Split from the right so that chromosome/contig names that themselves
    # contain underscores (e.g. "chr1_GL456210_random") do not break the
    # three-way unpacking; only the last two fields are coordinates.
    _, start, end = x["peak_id"].rsplit("_", 2)
    return int(end) - int(start)

# Row-wise peak length; `df` is reused below as the mask source when
# dropping short peaks.
df = peaks.apply(getLength, axis=1)
print(f"mean peak length: {df.values.mean()}")

number of peak: 13919
mean peak length: 1756.1744260204082


## 2.1. Remove short peaks
Short DNA fragments of fewer than 5 bases cannot be used for motif scanning. Therefore, we will remove these short DNA fragments.

In [6]:
# Keep only peaks that are at least 5 bases long (`df` holds the per-peak lengths).
peaks = peaks[df.ge(5)]

# 3. Instantiate TFinfo object and search for TF binding motifs
The motif analysis module has a custom class, TFinfo. The TFinfo object converts peak data into DNA sequences and scans those sequences for TF binding motifs. The motif scan results are then filtered and converted into either a Python dictionary or a dataframe, depending on your preference. This TF information is necessary for GRN inference.

## 3.1. Check reference genome installation

In [7]:
# PLEASE make sure that you are setting correct ref genome.
# This name is reused below when the TFinfo object is instantiated.
ref_genome = "mm10"

# Returns a bool (shown as the cell output); when the genome is missing it
# also prints a hint on how to install it with genomepy.
ma.is_genome_installed(ref_genome=ref_genome)

genome mm10 is not installed in this environment.
Please install genome using genomepy.
e.g.
    >>> import genomepy
    >>> genomepy.install_genome("mm9", "UCSC")


False

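## 3.2. Install reference genome (if refgenome is not installed)

If the check above returned `False`, install the reference genome with genomepy (for example, `genomepy.install_genome("mm10", "UCSC")`), then run the check again.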

In [9]:
# Check again: this should now display True before continuing to the motif scan.
ma.is_genome_installed(ref_genome=ref_genome)

True

In [14]:
# Instantiate TFinfo object from the length-filtered peak table and the
# reference genome name set earlier in this notebook.
tfi = ma.TFinfo(peak_data_frame=peaks, # peak info calculated from ATAC-seq data
                ref_genome=ref_genome)

# 4. Scan motifs and save object

This step may take a long time (about 53 minutes of wall time in the run shown below).

In [15]:
%%time
# Scan motifs
# NOTE(review): `fpr` presumably is the false-positive rate used to derive the
# score threshold (the log prints "Determining FPR-based threshold") - confirm
# against the celloracle documentation.
tfi.scan(fpr=0.02, verbose=True)

# Save tfinfo object
# Written to the current working directory, so the scan above does not have
# to be repeated.
tfi.to_hdf5(file_path="test.celloracle.tfinfo")

initiating scanner ...


2019-09-22 23:00:18,604 - INFO - Using background: genome mm10 with length 200
2019-09-22 23:00:18,986 - INFO - Determining FPR-based threshold


getting DNA sequences ...
scanning motifs ...


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


CPU times: user 52min 23s, sys: 36.8 s, total: 53min
Wall time: 52min 58s


In [16]:
# Check motif scan results
# Displays the first rows of the raw scan table: one row per motif hit, with
# its sequence name, motif id, associated factors, score, position and strand.
tfi.scanned_df.head()

Unnamed: 0,seqname,motif_id,factors_direct,factors_indirect,score,pos,strand
0,chr10_100015291_100017830,GM.5.0.Homeodomain.0001,TGIF1,"ENSG00000234254, TGIF1",10.311002,1003,1
1,chr10_100015291_100017830,GM.5.0.Mixed.0001,,"SRF, EGR1",7.925873,481,1
2,chr10_100015291_100017830,GM.5.0.Mixed.0001,,"SRF, EGR1",7.321375,911,-1
3,chr10_100015291_100017830,GM.5.0.Mixed.0001,,"SRF, EGR1",7.276585,811,-1
4,chr10_100015291_100017830,GM.5.0.Nuclear_receptor.0002,NR2C2,"NR2C2, Nr2c2",9.067331,449,-1


We now have a score for each sequence and motif_id pair.
In the next step, we will filter out the motifs with low scores.

# 5. Filtering motifs

In [17]:
# Reset filtering 
# Called first so the threshold below can be changed and this cell re-run.
# NOTE(review): presumably discards any previously applied filter - confirm.
tfi.reset_filtering()

# Do filtering
# Score cut-off applied to the scan results; the printed
# "peaks were filtered: N -> M" line reports how many entries remain.
tfi.filter_motifs_by_score(threshold=10.5)

# Do post filtering process. Convert results into several file format.
# Per the progress log, this builds a one-hot encoded dataframe and the
# dictionaries that are retrieved in the following section.
tfi.make_TFinfo_dataframe_and_dictionary(verbose=True)

peaks were filtered: 12934005 -> 2285279
1. converting scanned results into one-hot encoded dataframe.


HBox(children=(IntProgress(value=0, max=13919), HTML(value='')))


2. converting results into dictionaries.
converting scan results into dictionaries...


HBox(children=(IntProgress(value=0, max=14804), HTML(value='')))




HBox(children=(IntProgress(value=0, max=1090), HTML(value='')))




# 6. Get Final results

## 6.1. Get results as a dictionary

In [18]:
# Retrieve the filtered results as a dictionary.
# NOTE(review): "targetgene2TFs" presumably maps each target gene to its
# candidate TFs - confirm against the celloracle documentation.
td = tfi.to_dictionary(dictionary_type="targetgene2TFs")


## 6.2. Get results as a dataframe

In [20]:
# Retrieve the filtered results as a dataframe: peak_id and gene_short_name
# followed by one 0/1 column per TF.
# NOTE: this rebinds `df`, which earlier in the notebook held the per-peak lengths.
df = tfi.to_dataframe()
df.head()

Unnamed: 0,peak_id,gene_short_name,9430076c15rik,Ac002126.6,Ac012531.1,Ac226150.2,Afp,Ahr,Ahrr,Aire,...,Znf784,Znf8,Znf816,Znf85,Zscan10,Zscan16,Zscan22,Zscan26,Zscan31,Zscan4
0,chr10_100015291_100017830,Kitl,0,0,0,0,0,1,1,0,...,0,0,0,0,0,0,0,0,1,0
1,chr10_100486534_100488209,Tmtc3,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,0
2,chr10_100588641_100589556,4930430F08Rik,0,0,1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
3,chr10_100741247_100742505,Gm35722,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,chr10_101681379_101682124,Mgat4c,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


# 7. Save TFinfo as dictionary or dataframe
We'll use this information when making the GRNs. Save the results.

In [21]:
# All exported TF-info files are collected in one output directory.
folder = "TFinfo_outputs"
os.makedirs(folder, exist_ok=True)

# Dictionary form of the TF info, stored as a pickled object.
td = tfi.to_dictionary(dictionary_type="targetgene2TFs")
dictionary_path = os.path.join(folder, "TFinfo_targetgene2TFs.pickled")
save_as_pickled_object(td, dictionary_path)

# Dataframe form of the TF info, stored as a parquet file.
df = tfi.to_dataframe()
dataframe_path = os.path.join(folder, "TFinfo_dataframe.parquet")
df.to_parquet(dataframe_path)