In [39]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
#import time

import os, sys, shutil, importlib, glob, subprocess
from tqdm.notebook import  tqdm

%config InlineBackend.figure_format = 'retina'

plt.rcParams['figure.figsize'] = (15,7)
plt.rcParams["savefig.dpi"] = 200


In [40]:
tmp = "./tmp"
output = "./output"

os.makedirs(tmp, exist_ok=True)
os.makedirs(output, exist_ok=True)

In [41]:
# Please set reference genome name
ref_genome = "danRer7"

# 1. Download and unzip annotation data 

In [42]:
# URLs for genome annotation data
url_dictionary = {"mm10": "http://homer.ucsd.edu/homer/data/genomes/mm10.v6.0.zip",
                  "mm9": "http://homer.ucsd.edu/homer/data/genomes/mm9.v6.0.zip",
                  "hg19": "http://homer.ucsd.edu/homer/data/genomes/hg19.v6.0.zip",
                  "hg38": "http://homer.ucsd.edu/homer/data/genomes/hg38.v6.0.zip",
                  "sacCer2": "http://homer.ucsd.edu/homer/data/genomes/sacCer2.v6.4.zip",
                  "sacCer3": "http://homer.ucsd.edu/homer/data/genomes/sacCer3.v6.4.zip", #S.cerevisiae
                  "danRer10": "http://homer.ucsd.edu/homer/data/genomes/danRer10.v6.4.zip", # Zebrafish
                  "danRer11": "http://homer.ucsd.edu/homer/data/genomes/danRer11.v6.4.zip", # Zebrafish
                  "danRer7": "http://homer.ucsd.edu/homer/data/genomes/danRer7.v6.4.zip" # Zebrafish
                 }


In [43]:
ls

 0.592361685195105.clean.pos
 0.592361685195105.pos
 make_tss_referenece_from_homer_data.ipynb
'make_tss_referenece_from_homer_data-S cerevisiae.ipynb'
 make_tss_referenece_from_homer_data_zebrafish_danRer7.ipynb
 [0m[01;34moutput[0m/
 [01;34mtmp[0m/


In [44]:
url_dictionary[ref_genome]

'http://homer.ucsd.edu/homer/data/genomes/danRer7.v6.4.zip'

In [45]:
# download data
cmd = f"wget {url_dictionary[ref_genome]}"
os.system(cmd)

0

In [46]:
ls

 0.592361685195105.clean.pos
 0.592361685195105.pos
 [0m[01;31mdanRer7.v6.4.zip[0m
 make_tss_referenece_from_homer_data.ipynb
'make_tss_referenece_from_homer_data-S cerevisiae.ipynb'
 make_tss_referenece_from_homer_data_zebrafish_danRer7.ipynb
 [01;34moutput[0m/
 [01;34mtmp[0m/


In [47]:
# Unzip data
cmd = "unzip " + url_dictionary[ref_genome].split("/")[-1]
os.system(cmd)
#! unzip sacCer3.v6.4.zip

0

In [48]:
ls

 0.592361685195105.clean.pos
 0.592361685195105.pos
 [0m[01;31mdanRer7.v6.4.zip[0m
 [01;34mdata[0m/
 make_tss_referenece_from_homer_data.ipynb
'make_tss_referenece_from_homer_data-S cerevisiae.ipynb'
 make_tss_referenece_from_homer_data_zebrafish_danRer7.ipynb
 [01;34moutput[0m/
 [01;34mtmp[0m/


# 2. Make tss bed file

In [49]:
def make_tss_bed_file(ref_genome):

    tss = pd.read_csv(f"data/genomes/{ref_genome}/{ref_genome}.basic.annotation",
                      header=None, delimiter="\t")
    tss = tss[tss[5] == "P"]
    print("1. raw_tss_data")
    print(tss.head())


    print("2. save tss info as a bed file")
    tss = tss.reset_index(drop=False)
    tss[[1, 2, 3, "index", 5, 4]].to_csv(os.path.join(tmp, f"{ref_genome}_tss.bed"), 
                                         sep='\t', header=False, index=False)
    
    print(" tss bed file was saved as " + os.path.join(tmp, f"{ref_genome}_tss.bed"))


make_tss_bed_file(ref_genome=ref_genome)

1. raw_tss_data
                              0     1      2      3  4  5      6
1   promoter-TSS (NM_001089558)  chr1   2800   3900  +  P   3534
18     promoter-TSS (NM_131819)  chr1  14962  16062  +  P  14002
36     promoter-TSS (NM_173228)  chr1  22260  23360  +  P  15479
53     promoter-TSS (NM_201462)  chr1  31022  32122  +  P  12147
72  promoter-TSS (NM_001326520)  chr1  39293  40393  +  P   6695
2. save tss info as a bed file
 tss bed file was saved as ./tmp/danRer7_tss.bed


# 2. Process peaks with homer

In [50]:
# command
input_bed = os.path.join(tmp, f"{ref_genome}_tss.bed")
out_bed = os.path.join(tmp, f"{ref_genome}_tss_with_annot.bed")
command = f'annotatePeaks.pl {input_bed} {ref_genome} >{out_bed}'

print(command)

annotatePeaks.pl ./tmp/danRer7_tss.bed danRer7 >./tmp/danRer7_tss_with_annot.bed


In [51]:
# Install genome data
ref_genome

'danRer7'

In [52]:
! perl /home/k/anaconda3/envs/pandas1/share/homer-4.11-1/.//configureHomer.pl -install danRer7


	Current base directory for HOMER is /home/k/anaconda3/envs/pandas1/share/homer-4.11-1/.//

	Will install danRer7
--2020-07-14 17:47:00--  http://homer.ucsd.edu/homer/update.txt
Resolving homer.ucsd.edu (homer.ucsd.edu)... 169.228.63.226
Connecting to homer.ucsd.edu (homer.ucsd.edu)|169.228.63.226|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 17859 (17K) [text/plain]
Saving to: ‘/home/k/anaconda3/envs/pandas1/share/homer-4.11-1/.///update.txt’


2020-07-14 17:47:00 (339 KB/s) - ‘/home/k/anaconda3/envs/pandas1/share/homer-4.11-1/.///update.txt’ saved [17859/17859]

	Updating Settings...
`wget -O 0.0543408597159249.tmp http://homer.ucsd.edu/homer/configureHomer.pl`;
--2020-07-14 17:47:00--  http://homer.ucsd.edu/homer/configureHomer.pl
Resolving homer.ucsd.edu (homer.ucsd.edu)... 169.228.63.226
Connecting to homer.ucsd.edu (homer.ucsd.edu)|169.228.63.226|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 27477 (27K) [application/x-perl]
S

In [53]:
# process tss file with homer
!annotatePeaks.pl ./tmp/danRer7_tss.bed danRer7 >./tmp/danRer7_tss_with_annot.bed


	Peak file = ./tmp/danRer7_tss.bed
	Genome = danRer7
	Organism = zebrafish
	Peak/BED file conversion summary:
		BED/Header formatted lines: 16180
		peakfile formatted lines: 0
		Duplicated Peak IDs: 0

	Peak File Statistics:
		Total Peaks: 16180
		Redundant Peak IDs: 0
		Peaks lacking information: 0 (need at least 5 columns per peak)
		Peaks with misformatted coordinates: 0 (should be integer)
		Peaks with misformatted strand: 0 (should be either +/- or 0/1)

	Peak file looks good!

	Reading Positions...
	-----------------------
	Finding Closest TSS...
	Annotating:..............................................................................................................................................................................................
		Annotation	Number of peaks	Total size (bp)	Log2 Ratio (obs/exp)	LogP enrichment (+values depleted)
		3UTR	0.0	7084442	-6.414	84.066
		ncRNA	0.0	61348	-0.788	0.726
		TTS	0.0	16480804	-7.633	196.245
		pseudo	0.0	20182	-0.309	0.239
		Exon

# 3.load and process

In [54]:
out_bed

'./tmp/danRer7_tss_with_annot.bed'

In [55]:
def process_tss_info():

    # load file
    tss_with_annot = pd.read_csv(out_bed, delimiter="\t", index_col=0)

    # process
    tss_with_annot.Start = tss_with_annot.Start - 1
    tss_with_annot.index.name = None
    tss_with_annot = tss_with_annot.reset_index(drop=False)

    # select info
    tss_with_annot = tss_with_annot[["Chr", "Start", "End", "Gene Name", 'Distance to TSS', "Strand"]]

    return tss_with_annot


In [56]:
tss_ref = process_tss_info()
tss_ref.head()

Unnamed: 0,Chr,Start,End,Gene Name,Distance to TSS,Strand
0,chr24,37835285,37836385,grnb,-450,+
1,chr9,10228462,10229562,ugt1a7,-450,-
2,chr1,51145499,51146599,herc3,-450,+
3,chr3,42958733,42959833,axin1,-450,+
4,chr20,36844613,36845713,heca,-450,+


In [57]:
path = os.path.join(output, f"{ref_genome}_tss_info.bed")
tss_ref.to_csv(path,  sep='\t', header=False, index=False)




In [65]:
ls

 make_tss_referenece_from_homer_data.ipynb
'make_tss_referenece_from_homer_data-S cerevisiae.ipynb'
 make_tss_referenece_from_homer_data_zebrafish_danRer7.ipynb
 [0m[01;34moutput[0m/


In [59]:
for i in [ref_genome + "*", "data", "tmp"]:
    cmd = "rm -r " + i
    print(os.system(cmd))

0
0
0


In [60]:
ls output/

danRer7_tss_info.bed


In [61]:
# move data
cmd = f"mv {path} ../../../../../celloracle/motif_analysis/tss_ref_data/"
os.system(cmd)

0

In [62]:
ls ../../../../../celloracle/motif_analysis/tss_ref_data/

danRer10_tss_info.bed  hg19_tss_info.bed  mm9_tss_info.bed
danRer11_tss_info.bed  hg38_tss_info.bed  sacCer2_tss_info.bed
danRer7_tss_info.bed   mm10_tss_info.bed  sacCer3_tss_info.bed
