# 0. Overview: How to make custom motifs for the CellOracle motif scan

In this notebook, we show how to make a motif dataset for CellOracle motif analysis.
CellOracle uses a list of motif objects from the gimmemotifs package. See the gimmemotifs documentation for more details. (https://gimmemotifs.readthedocs.io/en/master/api.html#)

Here, we get the motif data from CisBP (version 2): http://cisbp.ccbr.utoronto.ca

We will extract the motif information for a specific species and save it as two files: XXX.pfm and XXX.motif2factors.txt.
These files can be read with the read_motifs function in gimmemotifs.


In [1]:

import numpy as np
import pandas as pd
import seaborn as sns
import sys, os

import matplotlib.pyplot as plt
%matplotlib inline
plt.rcParams["figure.figsize"] = [20,10]

from gimmemotifs.motif import Motif,read_motifs

# 1. Download the full TF dataset from the CisBP database

Go to the URL below to find the download link for the latest data: http://cisbp.ccbr.utoronto.ca/entireDownload.php

In [None]:
! wget http://cisbp.ccbr.utoronto.ca/data/2.00/DataFiles/Bulk_downloads/EntireDataset/PWMs.zip
! unzip PWMs.zip

In [3]:
!wget http://cisbp.ccbr.utoronto.ca/data/2.00/DataFiles/Bulk_downloads/EntireDataset/TF_Information_all_motifs.txt.zip
! unzip TF_Information_all_motifs.txt.zip

--2020-08-03 16:34:06--  http://cisbp.ccbr.utoronto.ca/data/2.00/DataFiles/Bulk_downloads/EntireDataset/TF_Information_all_motifs.txt.zip
Resolving cisbp.ccbr.utoronto.ca (cisbp.ccbr.utoronto.ca)... 142.150.52.218
Connecting to cisbp.ccbr.utoronto.ca (cisbp.ccbr.utoronto.ca)|142.150.52.218|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 115455298 (110M) [application/zip]
Saving to: ‘TF_Information_all_motifs.txt.zip’


2020-08-03 16:34:25 (5.73 MB/s) - ‘TF_Information_all_motifs.txt.zip’ saved [115455298/115455298]

Archive:  TF_Information_all_motifs.txt.zip
  inflating: TF_Information_all_motifs.txt  


In [4]:
# Load TF information as a dataframe.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which prevents mixed-type columns (and the
# DtypeWarning pandas emits for them).
df = pd.read_table("TF_Information_all_motifs.txt", low_memory=False)
df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,TF_ID,Family_ID,TSource_ID,Motif_ID,MSource_ID,DBID,TF_Name,TF_Species,TF_Status,Family_Name,...,MSource_Year,PMID,MSource_Version,SR_Model,SR_NoThreshold,TfSource_Name,TfSource_URL,TfSource_Year,TfSource_Month,TfSource_Day
0,T000001_2.00,F001_2.00,TS12_2.00,.,.,BRADI2G60554,BRADI2G60554,Brachypodium_distachyon,N,"ABF1,B3",...,.,.,.,SequenceIdentity,True,Ensembl,http://www.ensembl.org/,2018,Dec,8
1,T000002_2.00,F001_2.00,TS12_2.00,.,.,LPERR05G06870,LPERR05G06870,Leersia_perrieri,N,"ABF1,B3",...,.,.,.,SequenceIdentity,True,Ensembl,http://www.ensembl.org/,2018,Dec,8
2,T000003_2.00,F002_2.00,TS04_2.00,.,.,CPAG_02544,CPAG_02544,Candida_parapsilosis,N,ABF1,...,.,.,.,SequenceIdentity,True,Broad,http://www.broadinstitute.org/,2016,May,1
3,T000004_2.00,F002_2.00,TS04_2.00,.,.,PTSG_00627,PTSG_00627,Salpingoeca_rosetta,N,ABF1,...,.,.,.,SequenceIdentity,True,Broad,http://www.broadinstitute.org/,2016,May,1
4,T000005_2.00,F002_2.00,TS04_2.00,.,.,WUBG_06707,WUBG_06707,Wuchereria_bancrofti,N,ABF1,...,.,.,.,SequenceIdentity,True,Broad,http://www.broadinstitute.org/,2016,May,1


In [14]:
# (number of rows, number of columns) of the TF information table.
df.shape

(10879322, 28)

# 2. Define custom functions

In [15]:
# All processing is done inside these functions.

from datetime import datetime
import glob

def read_pwn_and_convert_into_list(path):
    """Read a tab-separated PWM file and return its rows as text lines.

    Args:
        path (str): Path of the PWM file. The first line is used as the header.

    Returns:
        list of str: One entry per data row. The first column of the row is
            dropped, the remaining values are converted to str, joined with
            tabs and terminated with a newline. Empty when the file has no
            data rows.
    """
    pwm_table = pd.read_csv(path, delimiter="\t")

    # iterrows() yields (index, row); only the row values after the first
    # column are kept.
    return [
        "\t".join(row.values[1:].astype("str")) + "\n"
        for _, row in pwm_table.iterrows()
    ]

def make_motif_file_from_cisbp_data(pwm_folder_path, tfinfo_df, species):
    """Export the CisBP motifs of one species as gimmemotifs-style files.

    Two files are written to the current working directory:

    - ``CisBP_ver2_<species>.pfm``: a ``#`` comment header followed by one
      ``>Motif_ID`` record for every motif that is listed for the species,
      has a ``<Motif_ID>.txt`` file in ``pwm_folder_path`` and whose file
      contains at least one data row.
    - ``CisBP_ver2_<species>.motif2factors.txt``: tab-separated table with the
      columns Motif, Factor, Evidence and Curated, restricted to the motifs
      written to the pfm file.

    The number of candidate motifs and the final table shape are printed.

    Args:
        pwm_folder_path (str): Folder holding the per-motif PWM ``.txt`` files.
        tfinfo_df (pandas.DataFrame): TF information table. The columns
            TF_Species, Motif_ID, TF_Name, MSource_Type and TF_Status are used.
        species (str): TF_Species value to export, e.g. "Danio_rerio".

    Raises:
        KeyError: If a selected row has a TF_Status other than "D", "I" or "N".
    """
    data_ = tfinfo_df[tfinfo_df.TF_Species == species]
    data_name = "CisBP_ver2_" + species

    ## 1. Make file: motif2factors.txt

    # Select information. Rows with TF_Status "N" are dropped.
    # .copy() yields an independent frame, so the column edits below do not
    # act on a slice of the input (no SettingWithCopyWarning).
    df_factors = data_.loc[
        data_.TF_Status != "N",
        ["Motif_ID", "TF_Name", "MSource_Type", "TF_Status"],
    ].copy()

    # Formatting
    df_factors.columns = ["Motif", "Factor", "Evidence", "Curated"]
    status_to_curated = {"D": "Y", "I": "N"}
    df_factors["Curated"] = [status_to_curated[status] for status in df_factors["Curated"]]
    df_factors = df_factors.sort_values(by="Motif")

    ## 2. Make file: pfm file
    # Provenance header; written as "#" comment lines at the top of the pfm file.
    comments = "# CIS-BP motif database (v2.0), retrieved by Celloracle\n"
    comments += "# Retrieved from: http://cisbp.ccbr.utoronto.ca/data/2.00/DataFiles/Bulk_downloads/EntireDataset/PWMs.zip\n"
    comments += f"#Date: {datetime.now().ctime()}\n"

    # Get list of motif names: file name without folder and without extension.
    # os.path is used so that this works with any path separator and only the
    # trailing extension is removed.
    paths_pwm = sorted(glob.glob(os.path.join(pwm_folder_path, "*.txt")))
    motif_names = [os.path.splitext(os.path.basename(path))[0] for path in paths_pwm]
    motifs = np.intersect1d(motif_names, df_factors.Motif.unique())

    print(motifs.shape)

    # Load, convert, and save pwm info
    output = data_name + ".pfm"

    motifs_non_zero = []
    with open(output, "w") as f:
        f.write(comments)

        for motif_name in motifs:
            path = os.path.join(pwm_folder_path, motif_name + ".txt")
            pwm = read_pwn_and_convert_into_list(path=path)  # Load and convert
            if pwm:  # PWM files without any data row are skipped
                motifs_non_zero.append(motif_name)
                f.write(f">{motif_name}\n")
                f.writelines(pwm)

    # Keep only the factors whose motif was actually written to the pfm file.
    # motifs_non_zero is a subset of motifs, so one filter is sufficient.
    df_factors = df_factors[df_factors.Motif.isin(motifs_non_zero)]

    # Save factor info
    df_factors.to_csv(f'{data_name}.motif2factors.txt', sep='\t', index=False)

    print(df_factors.shape, len(motifs_non_zero))
    

# 3. Pick up the motif information for one species and save it in the gimmemotifs pfm file format



In [17]:
# Print every species found in the TF information table, in sorted order.
species_list = np.sort(df.TF_Species.unique())

for species_name in species_list:
    print(species_name)


Acanthamoeba_castellanii
Acanthamoeba_polyphaga_mimivirus
Acanthocheilonema_viteae
Acipenser_baerii
Acremonium_chrysogenum
Acropora_formosa
Acropora_millepora
Acyrthosiphon_pisum
Aedes_aegypti
Aegilops_tauschii
Agaricus_bisporus
Ailuropoda_melanoleuca
Albugo_laibachii
Alligator_sinensis
Allomyces_macrogynus
Alternaria_brassicicola
Amanita_muscaria
Amborella_trichopoda
Amphimedon_queenslandica
Anas_platyrhynchos
Ancylostoma_caninum
Ancylostoma_ceylanicum
Ancylostoma_duodenale
Angiostrongylus_cantonensis
Angiostrongylus_costaricensis
Anisakis_simplex
Anncaliia_algerae
Anolis_carolinensis
Anopheles_albimanus
Anopheles_arabiensis
Anopheles_atroparvus
Anopheles_christyi
Anopheles_coluzzii
Anopheles_culicifacies
Anopheles_darlingi
Anopheles_dirus
Anopheles_epiroticus
Anopheles_farauti
Anopheles_funestus
Anopheles_gambiae
Anopheles_maculatus
Anopheles_melas
Anopheles_merus
Anopheles_minimus
Anopheles_quadriannulatus
Anopheles_sinensis
Anopheles_stephensi
Antirrhinum_majus
Apis_mellifera
Aplys

In [18]:
# Export the motif information of a single species in the gimmemotifs pfm file format.

species = "Danio_rerio"
make_motif_file_from_cisbp_data(
    pwm_folder_path="pwms",
    tfinfo_df=df,
    species=species,
)

(6133,)
(109560, 4) 5298


# 4. Check results



In [19]:
# Read the exported pfm file back with gimmemotifs and show the first ten motifs.

from gimmemotifs.motif import read_motifs

path = "CisBP_ver2_" + species + ".pfm"
custom_motifs = read_motifs(path)
custom_motifs[:10]

[M00008_2.00_nnnAAww,
 M00045_2.00_GTAAACAA,
 M00056_2.00_TAATAAAT,
 M00066_2.00_nsGTTGCyAn,
 M00070_2.00_nrAACAATAnn,
 M00111_2.00_nGCCynnGGs,
 M00112_2.00_CCTsrGGCnA,
 M00113_2.00_nsCCnnAGGs,
 M00114_2.00_nnGCCynnGG,
 M00115_2.00_nnATnAAAn]

In [None]:
# Delete downloaded data

In [1]:
# Remove everything matching TF_Information_all_motifs* (the downloaded
# archive and the file extracted from it).
! rm -r TF_Information_all_motifs*
# Remove everything matching PWMs.* (the downloaded PWMs archive).
! rm PWMs.*
# Remove the pwms folder that was used as pwm_folder_path above.
# The generated CisBP_ver2_* files are not matched by any of these commands.
! rm -r pwms
