In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
#import time

import os, sys, shutil, importlib, glob, subprocess
from tqdm.notebook import  tqdm

# Render inline figures in the 'retina' (high-resolution) format.
%config InlineBackend.figure_format = 'retina'

# Notebook-wide matplotlib defaults: figure size and the DPI used when saving figures.
plt.rcParams['figure.figsize'] = (15,7)
plt.rcParams["savefig.dpi"] = 200


In [4]:
# Working directories: `tmp` receives intermediate bed files, `output` the final table.
tmp, output = "./tmp", "./output"

# Create both directories up front; existing directories are left untouched.
for _workdir in (tmp, output):
    os.makedirs(_workdir, exist_ok=True)

In [5]:
# Reference genome to process. It is used to look up the download URL in
# `url_dictionary` and to build every file name in this notebook, so it must be
# one of that dictionary's keys (e.g. "mm10", "hg38", "sacCer3").
ref_genome = "sacCer3"

# 1. Download and unzip annotation data 

In [6]:
# Data release of each genome package, keyed by reference genome name.
_homer_genome_versions = {
    "mm10": "v6.0",
    "mm9": "v6.0",
    "hg19": "v6.0",
    "hg38": "v6.0",
    "sacCer2": "v6.4",
    "sacCer3": "v6.4",  # S.cerevisiae
}

# URLs for genome annotation data, keyed by reference genome name.
url_dictionary = {
    genome: f"http://homer.ucsd.edu/homer/data/genomes/{genome}.{version}.zip"
    for genome, version in _homer_genome_versions.items()
}


In [7]:
# Show the download URL registered for the selected reference genome.
url_dictionary[ref_genome]

'http://homer.ucsd.edu/homer/data/genomes/sacCer3.v6.4.zip'

In [8]:
# download data
! wget http://homer.ucsd.edu/homer/data/genomes/sacCer3.v6.4.zip

--2020-07-06 16:10:30--  http://homer.ucsd.edu/homer/data/genomes/sacCer3.v6.4.zip
Resolving homer.ucsd.edu (homer.ucsd.edu)... 169.228.63.226
Connecting to homer.ucsd.edu (homer.ucsd.edu)|169.228.63.226|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5213978 (5.0M) [application/zip]
Saving to: ‘sacCer3.v6.4.zip.1’


2020-07-06 16:10:32 (3.55 MB/s) - ‘sacCer3.v6.4.zip.1’ saved [5213978/5213978]



In [9]:
# Unzip data
! unzip sacCer3.v6.4.zip

Archive:  sacCer3.v6.4.zip
   creating: data/genomes/sacCer3/
  inflating: data/genomes/sacCer3/chrV.fa  
  inflating: data/genomes/sacCer3/chrII.fa  
  inflating: data/genomes/sacCer3/sacCer3.basic.annotation  
  inflating: data/genomes/sacCer3/chrXII.fa  
  inflating: data/genomes/sacCer3/sacCer3.splice5p  
  inflating: data/genomes/sacCer3/sacCer3.miRNA  
  inflating: data/genomes/sacCer3/chrXIV.fa  
  inflating: data/genomes/sacCer3/sacCer3.aug  
  inflating: data/genomes/sacCer3/sacCer3.stop  
  inflating: data/genomes/sacCer3/chrI.fa  
  inflating: data/genomes/sacCer3/chrIII.fa  
  inflating: data/genomes/sacCer3/chrVIII.fa  
   creating: data/genomes/sacCer3/annotations/
   creating: data/genomes/sacCer3/annotations/basic/
 extracting: data/genomes/sacCer3/annotations/basic/centromeres.ann.txt  
  inflating: data/genomes/sacCer3/annotations/basic/introns.ann.txt  
  inflating: data/genomes/sacCer3/annotations/basic/coding.ann.txt  
  inflating: data/genomes/sacCer3/annotations/

# 2. Make tss bed file

In [9]:
def make_tss_bed_file(ref_genome, out_dir=None):
    """Extract the "P" rows of a genome's basic annotation and save them as a bed file.

    Reads ``data/genomes/{ref_genome}/{ref_genome}.basic.annotation`` (tab-separated,
    no header row), keeps the rows whose column 5 equals ``"P"`` and writes them to
    ``{out_dir}/{ref_genome}_tss.bed`` as a headerless, tab-separated file.

    The written columns are, in order: annotation columns 1, 2, 3, the row's
    original position in the annotation table (serves as a unique ID), column 5
    and column 4. In the sacCer3 data these correspond to chromosome, start, end,
    ID, the "P" type code and strand.

    Parameters
    ----------
    ref_genome : str
        Reference genome name; selects the annotation file and names the bed file.
    out_dir : str, optional
        Directory that receives the bed file. When omitted, the notebook-level
        ``tmp`` directory is used, which matches the previous behavior.

    Returns
    -------
    None
        The result is written to disk; progress is reported with ``print``.
    """
    # Fall back to the notebook-wide temporary directory only when no explicit
    # destination is given, so the function is usable outside this notebook too.
    if out_dir is None:
        out_dir = tmp
    os.makedirs(out_dir, exist_ok=True)

    # Build the destination path once; it is used for both writing and reporting.
    bed_path = os.path.join(out_dir, f"{ref_genome}_tss.bed")

    annotation = pd.read_csv(f"data/genomes/{ref_genome}/{ref_genome}.basic.annotation",
                             header=None, delimiter="\t")
    tss = annotation[annotation[5] == "P"]
    print("1. raw_tss_data")
    print(tss.head())

    print("2. save tss info as a bed file")
    # reset_index(drop=False) exposes the original row number as an "index" column,
    # which becomes the ID column of the bed file.
    tss = tss.reset_index(drop=False)
    tss[[1, 2, 3, "index", 5, 4]].to_csv(bed_path, sep='\t', header=False, index=False)

    print(" tss bed file was saved as " + bed_path)


# Build the TSS bed file for the selected reference genome.
make_tss_bed_file(ref_genome=ref_genome)

1. raw_tss_data
                          0     1     2      3  4  5  6
0    promoter-TSS (YAL069W)  chrI  -665    435  +  P  1
1  promoter-TSS (YAL068W-A)  chrI   436    638  +  P  2
3  promoter-TSS (YAL067W-A)  chrI  1480   2068  +  P  4
4    promoter-TSS (YAL068C)  chrI  2069   3169  -  P  3
9    promoter-TSS (YAL067C)  chrI  8916  10016  -  P  5
2. save tss info as a bed file
 tss bed file was saved as ./tmp/sacCer3_tss.bed


# 3. Process peaks with HOMER

In [10]:
# Input and output bed paths for the annotation step; both live in the tmp directory.
input_bed, out_bed = (
    os.path.join(tmp, f"{ref_genome}_tss.bed"),
    os.path.join(tmp, f"{ref_genome}_tss_with_annot.bed"),
)

# Shell command for the annotation step; its stdout is redirected into out_bed.
command = f'annotatePeaks.pl {input_bed} {ref_genome} >{out_bed}'
print(command)

annotatePeaks.pl ./tmp/sacCer3_tss.bed sacCer3 >./tmp/sacCer3_tss_with_annot.bed


In [None]:
# Install genome data

In [13]:
! perl /home/k/anaconda3/envs/pandas1/share/homer-4.11-1/.//configureHomer.pl -install sacCer3


	Current base directory for HOMER is /home/k/anaconda3/envs/pandas1/share/homer-4.11-1/.//

	Will install sacCer3
--2020-07-02 14:26:12--  http://homer.ucsd.edu/homer/update.txt
Resolving homer.ucsd.edu (homer.ucsd.edu)... 169.228.63.226
Connecting to homer.ucsd.edu (homer.ucsd.edu)|169.228.63.226|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 17859 (17K) [text/plain]
Saving to: ‘/home/k/anaconda3/envs/pandas1/share/homer-4.11-1/.///update.txt’


2020-07-02 14:26:12 (340 KB/s) - ‘/home/k/anaconda3/envs/pandas1/share/homer-4.11-1/.///update.txt’ saved [17859/17859]

	Updating Settings...
`wget -O 0.377529841455729.tmp http://homer.ucsd.edu/homer/configureHomer.pl`;
--2020-07-02 14:26:12--  http://homer.ucsd.edu/homer/configureHomer.pl
Resolving homer.ucsd.edu (homer.ucsd.edu)... 169.228.63.226
Connecting to homer.ucsd.edu (homer.ucsd.edu)|169.228.63.226|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 27477 (27K) [application/x-perl]
Sa

In [14]:
# process tss file with homer
!annotatePeaks.pl ./tmp/sacCer3_tss.bed sacCer3 >./tmp/sacCer3_tss_with_annot.bed


	Peak file = ./tmp/sacCer3_tss.bed
	Genome = sacCer3
	Organism = yeast
	Peak/BED file conversion summary:
		BED/Header formatted lines: 6371
		peakfile formatted lines: 0
		Duplicated Peak IDs: 0

	Peak File Statistics:
		Total Peaks: 6371
		Redundant Peak IDs: 0
		Peaks lacking information: 0 (need at least 5 columns per peak)
		Peaks with misformatted coordinates: 0 (should be integer)
		Peaks with misformatted strand: 0 (should be either +/- or 0/1)

	Peak file looks good!

	Reading Positions...
	-----------------------
	Finding Closest TSS...
	Annotating:.................
		Annotation	Number of peaks	Total size (bp)	Log2 Ratio (obs/exp)	LogP enrichment (+values depleted)
		TTS	0.0	2952159	-10.998	1773.286
		Exon	0.0	3377468	-11.261	2074.881
		Intron	0.0	12503	-2.919	6.559
		Intergenic	0.0	227969	-6.940	120.665
		Promoter	6371.0	5580781	1.123	-4957.102
	NOTE: If this part takes more than 2 minutes, there is a good chance
		your machine ran out of memory: consider hitting ctrl+C and

# 4. Load and process

In [12]:
# Show the path of the annotated file that is loaded in the next step.
out_bed

'./tmp/sacCer3_tss_with_annot.bed'

In [13]:
def process_tss_info(annot_path=None):
    """Load the annotated TSS table and reduce it to six bed-style columns.

    Parameters
    ----------
    annot_path : str, optional
        Path of the tab-separated annotation table (with a header row) to load.
        When omitted, the notebook-level ``out_bed`` path is used, which matches
        the previous behavior.

    Returns
    -------
    pandas.DataFrame
        Columns ``"Chr"``, ``"Start"``, ``"End"``, ``"Gene Name"``,
        ``"Distance to TSS"`` and ``"Strand"`` with a fresh 0..n-1 index.
        ``"Start"`` is the file's value minus 1.

    Raises
    ------
    KeyError
        If one of the six expected columns is missing from the file.
    """
    # Fall back to the notebook-wide path only when no explicit file is given.
    if annot_path is None:
        annot_path = out_bed

    # The first column (the peak ID) is consumed as the index: it is not part of
    # the result, and this avoids depending on its header text.
    annotated = pd.read_csv(annot_path, delimiter="\t", index_col=0)

    # Select the needed columns and discard the peak-ID index in one step; the
    # original added the index as a column only to drop it again.
    selected_columns = ["Chr", "Start", "End", "Gene Name", 'Distance to TSS', "Strand"]
    tss_info = annotated[selected_columns].reset_index(drop=True)

    # Shift the start coordinate down by one.
    # NOTE(review): presumably converts a 1-based start reported by annotatePeaks.pl
    # back to the 0-based bed convention; confirm against the HOMER documentation.
    tss_info["Start"] = tss_info["Start"] - 1

    return tss_info


In [14]:
# Build the processed TSS table and preview its first rows.
tss_ref = process_tss_info()
tss_ref.head()

Unnamed: 0,Chr,Start,End,Gene Name,Distance to TSS,Strand
0,chrXII,1012246,1013023,,51,-
1,chrXII,789676,790776,MID2,-450,+
2,chrVII,196303,197403,RAD54,-450,-
3,chrI,189193,190293,YAT1,-450,+
4,chrX,531062,532162,RAD7,-450,+


In [18]:
# Display the processed TSS table.
tss_ref

Unnamed: 0,Chr,Start,End,Gene Name,Distance to TSS,Strand
0,chrXIV,224369,225469,CNM67,-450,-
1,chrIV,500100,500778,REG1,440,+
2,chrIX,307829,308929,,196,-
3,chrII,28199,29299,YBL100W-C,322,-
4,chrXIV,470377,471001,END3,413,+
...,...,...,...,...,...,...
6366,chrIX,201120,201461,KTR7,752,-
6367,chrXII,159405,160505,SSL1,-94,-
6368,chrXI,345520,346620,YKL050C,-450,-
6369,chrIV,2762,3862,,-450,+


In [19]:
# Save the final TSS table to the output directory as a headerless, tab-separated file.
tss_info_path = os.path.join(output, f"{ref_genome}_tss_info.bed")
tss_ref.to_csv(tss_info_path, sep='\t', header=False, index=False)


