In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import scipy, os, sys, shutil, importlib, glob
from tqdm.notebook import tqdm
from celloracle import motif_analysis as ma
import genomepy
#os.chdir("/Users/erickernfeld/Dropbox/pharynx_celloracle")

In [2]:
# Name of the reference genome used by every later step of this notebook.
ref_genome = "mm10"

# Ask celloracle whether that genome is already installed locally.
# This cell only reports the answer: nothing here stops the notebook when the
# result is False, so check the printed value before continuing.
genome_installation = ma.is_genome_installed(ref_genome=ref_genome)
print(ref_genome, "installation: ", genome_installation)

mm10 installation:  True


In [3]:
# Directory (relative to the notebook's working directory) that holds the
# intermediate results; the peak table is read from here in the next cell.
save_dir = "../intermediate_results"

In [4]:
# NOTE(review): machine-specific absolute path; it is not referenced by any
# other cell visible in this notebook — confirm whether it is still needed.
base_dir = "/Users/LoboM/Dropbox/sharedUMass_Macrina_Rene/celloracle_pharynx_2021_v2_hyphen_cisbp2/results"
# Disabled one-off setup that created save_dir (kept for reference):
#import os
#os.mkdir(save_dir)
# Load the peak table; the first CSV column becomes the DataFrame index.
peaks = pd.read_csv(save_dir + "/peaks_plus_loops_corr_45.csv",index_col=0)

In [6]:
# Sanity check of the loaded table: (number of rows, number of columns).
peaks.shape

(35418, 2)

In [7]:
def decompose_chrstr(peak_str):
    """
    Split a peak identifier into its chromosome and coordinate fields.

    The last two underscore-separated fields are taken as start and end;
    whatever precedes them (which may itself contain underscores) is the
    chromosome name.

    Args:
        peak_str (str): peak identifier, e.g. 'chr1_3094484_3095479'

    Returns:
        tuple: (chromosome name, start, end), all three as strings

    Raises:
        ValueError: if peak_str contains no underscore at all.
    """
    fields = peak_str.split("_")

    # The two trailing fields are the coordinates; unpacking fails with
    # ValueError when fewer than two fields are present.
    start, end = fields[-2:]

    # Everything in front of the coordinates is glued back together.
    chromosome = "_".join(fields[:-2])

    return chromosome, start, end

from genomepy import Genome

def check_peak_foamat(peaks_df, ref_genome):
    """
    Check peak format and drop peaks that fail the checks.

     (1) Check chromosome name: it must be one of the sequence names of
         the reference genome.
     (2) Check peak size (length) and remove short DNAs (<5bp).

    A short summary of how many peaks failed each check is printed.

    Args:
        peaks_df (pandas.DataFrame): table with a "peak_id" column holding
            identifiers such as 'chr1_3094484_3095479'. Any index is
            accepted and the input frame is not modified.
        ref_genome (str): genome name passed to genomepy's ``Genome``.

    Returns:
        pandas.DataFrame: copy of ``peaks_df`` restricted to the rows that
        pass both checks; columns and index labels are preserved.
    """

    df = peaks_df.copy()

    n_peaks_before = df.shape[0]

    # Decompose peaks and make df. The helper frame shares df's index so the
    # two tables describe the same rows label by label, and passing the column
    # names explicitly keeps this working for an empty input.
    decomposed = [decompose_chrstr(peak_str) for peak_str in df["peak_id"]]
    df_decomposed = pd.DataFrame(decomposed,
                                 columns=["chr", "start", "end"],
                                 index=df.index)
    # Builtin int instead of the np.int alias, which NumPy deprecated in 1.20
    # and removed in 1.24.
    df_decomposed["start"] = df_decomposed["start"].astype(int)
    df_decomposed["end"] = df_decomposed["end"].astype(int)

    # Load genome data
    genome_data = Genome(ref_genome)
    all_chr_list = list(genome_data.keys())

    # DNA length check (computed once and reused for filtering and counting)
    n_threshold = 5
    lengths = (df_decomposed["end"] - df_decomposed["start"]).abs()
    is_long_enough = lengths >= n_threshold

    # Chromosome name check
    has_valid_chr = df_decomposed["chr"].isin(all_chr_list)

    # Filter positionally: converting the mask to a plain array guarantees the
    # selection is row-for-row, whatever index the caller's frame carries.
    df = df[(is_long_enough & has_valid_chr).to_numpy()]

    # Data counting
    n_invalid_length = int((~is_long_enough).sum())
    n_peaks_invalid_chr = n_peaks_before - has_valid_chr.sum()
    n_peaks_after = df.shape[0]

    # Report
    print("Peaks before filtering: ", n_peaks_before)
    print("Peaks with invalid chr_name: ", n_peaks_invalid_chr)
    print("Peaks with invalid length: ", n_invalid_length)
    print("Peaks after filtering: ", n_peaks_after)

    return df

In [8]:
# Validate the peak table and rebind `peaks` to the filtered copy
# (the function prints how many peaks were dropped and why).
peaks = check_peak_foamat(peaks, ref_genome)

Peaks before filtering:  35418
Peaks with invalid chr_name:  0
Peaks with invalid length:  0
Peaks after filtering:  35418


In [9]:
# Build the celloracle TFinfo object that the motif scan below runs on.
tfi = ma.TFinfo(peak_data_frame=peaks, # peak info calculated from ATAC-seq data
                       ref_genome=ref_genome)

In [10]:
%%time
# Scan motifs. !!CAUTION!! This step may take several hours if you have many peaks!
# (The recorded run of this cell took about 1h 11min wall time.)
# fpr is presumably the false-positive rate behind the "FPR-based threshold"
# mentioned in the scan log — confirm against the celloracle documentation.
tfi.scan(fpr=0.02,
            motifs=None,  # If you enter None, default motifs will be loaded.
            verbose=True)



No motif data entered. Loading default motifs for your species ...
 Default motif for vertebrate: gimme.vertebrate.v5.0. 
 For more information, please go https://gimmemotifs.readthedocs.io/en/master/overview.html 

Initiating scanner... 

Calculating FPR-based threshold. This step may take substantial time when you load a new ref-genome. It will be done quicker on the second time. 

Convert peak info into DNA sequences ... 

Scanning motifs ... It may take several hours if you proccess many peaks. 



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))


CPU times: user 35min 16s, sys: 49.9 s, total: 36min 6s
Wall time: 1h 10min 59s


In [11]:
# Save tfinfo object so the long-running scan does not have to be repeated.
# NOTE(review): this writes to a machine-specific absolute path rather than
# to save_dir — confirm that is intended before running elsewhere.
tfi.to_hdf5(file_path="/Users/LoboM/Dropbox/gimmemotifs_db.celloracle.tfinfo")

In [11]:
# Preview the raw scan results: one row per motif hit, with the peak
# (seqname), motif id, associated factors, score, position and strand.
tfi.scanned_df.head()

Unnamed: 0,seqname,motif_id,factors_direct,factors_indirect,score,pos,strand
0,chr10_100015544_100016044,GM.5.0.Mixed.0001,,"EGR1, SRF",7.925873,228,1
1,chr10_100015544_100016044,GM.5.0.Mixed.0001,,"EGR1, SRF",6.92596,2,1
2,chr10_100015544_100016044,GM.5.0.Mixed.0001,,"EGR1, SRF",6.900454,231,1
3,chr10_100015544_100016044,GM.5.0.Nuclear_receptor.0002,NR2C2,"Nr2c2, NR2C2",9.067331,196,-1
4,chr10_100015544_100016044,GM.5.0.Myb_SANT.0001,"MYBL1, MYBL2","MYBL1, MYBL2",6.596899,416,-1


In [12]:
# Presumably discards any previously applied motif filter so the next
# filtering step starts from the full scan results — confirm against the
# celloracle documentation.
tfi.reset_filtering()

In [13]:
# Filter motif hits with a score threshold of 10.0; the call reports the
# number of hits before and after filtering.
tfi.filter_motifs_by_score(threshold=10.0) 

Filtering finished: 8670059 -> 1836340


In [14]:
# Convert the filtered scan results into the one-hot encoded dataframe and
# the dictionaries that the export cells below read (progress is printed).
tfi.make_TFinfo_dataframe_and_dictionary(verbose=True)

1. Converting scanned results into one-hot encoded dataframe.


HBox(children=(FloatProgress(value=0.0, max=31545.0), HTML(value='')))


2. Converting results into dictionaries.


HBox(children=(FloatProgress(value=0.0, max=16649.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=1093.0), HTML(value='')))




In [15]:
# Export the TF information as a dictionary. Going by the dictionary_type
# name, this presumably maps each target gene to its TFs — confirm.
td = tfi.to_dictionary(dictionary_type="targetgene2TFs")

In [16]:
# Export the TF information as a dataframe and preview it: peak_id and
# gene_short_name columns followed by one 0/1 column per transcription factor.
df = tfi.to_dataframe()
df.head()

Unnamed: 0,peak_id,gene_short_name,9430076c15rik,Ac002126.6,Ac012531.1,Ac226150.2,Afp,Ahr,Ahrr,Aire,...,Znf784,Znf8,Znf816,Znf85,Zscan10,Zscan16,Zscan22,Zscan26,Zscan31,Zscan4
0,chr10_100015544_100016044,Kitl,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,chr10_100487191_100487691,Tmtc3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,chr10_100487769_100488269,Tmtc3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,chr10_100589009_100589509,4930430F08Rik,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,chr10_100741990_100742490,Gm35722,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [24]:
# Output location for the exported TF information.
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
folder = "/Users/LoboM/Dropbox/"
os.makedirs(folder, exist_ok=True)
from celloracle.utility import save_as_pickled_object
# save TFinfo as a dictionary
# (recomputed here, so this cell does not depend on the earlier `td` cell)
td = tfi.to_dictionary(dictionary_type="targetgene2TFs")
save_as_pickled_object(td, os.path.join(folder, "TFinfo_targetgene2TFs.pickled"))

# save TFinfo as a dataframe
# (recomputed here, so this cell does not depend on the earlier `df` cell)
df = tfi.to_dataframe()
df.to_parquet(os.path.join(folder, "TFinfo_dataframe.parquet"))

In [25]:
# Rebuild the peak-by-TF dataframe from the TFinfo object for inspection and
# CSV export below.
df = tfi.to_dataframe()

In [26]:
# Display the dataframe (the rich display truncates rows and columns).
df

Unnamed: 0,peak_id,gene_short_name,9430076c15rik,Ac002126.6,Ac012531.1,Ac226150.2,Afp,Ahr,Ahrr,Aire,...,Znf784,Znf8,Znf816,Znf85,Zscan10,Zscan16,Zscan22,Zscan26,Zscan31,Zscan4
0,chr10_100015544_100016044,Kitl,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,chr10_100487191_100487691,Tmtc3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,chr10_100487769_100488269,Tmtc3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,chr10_100589009_100589509,4930430F08Rik,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,chr10_100741990_100742490,Gm35722,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35413,chrX_99471074_99471574,Pja1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35414,chrX_99820093_99820593,Tmem28,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35415,chrX_99820602_99821102,Tmem28,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
35416,chrX_99821117_99821617,Tmem28,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
# Write the peak-by-TF matrix to CSV (the row index is written as well).
# NOTE(review): machine-specific absolute path — adjust before running elsewhere.
df.to_csv("/Users/LoboM/Dropbox/peak_motif_matrix.csv")