# TSS_bed_generate.ipynb
## This script takes in refseq annotation files of genes <br> and genes filtered for start codons and calculates <br> a window around either the 5' end of the gene <br> annotation or the start codon

### Import libraries

In [7]:
import math

### Define refseq input, output basename, and desired TSS window

In [12]:
# Gene annotation table (tab-separated). Kept as a one-element list because
# the loading cell opens element [0].
file1 = [
    "/home/lsanford/Documents/data/genomes/hg38/hg38_refseq_genenames_included.bed",
]

# Start codon table (tab-separated), wrapped in a list the same way.
file2 = [
    "/home/lsanford/Documents/data/genomes/hg38/hg38_refseq_start_codons.txt",
]

# Prefix shared by both output file names.
basename = "hg38_refseq"

# Total window width in bases; half of it is placed on each side of the
# chosen coordinate.
window = 1000

### Dump input files into variables

In [13]:
# Gene annotations: one tuple of tab-separated fields per input line.
with open(file1[0]) as gene_table:
    regions = [tuple(row.strip().split("\t")) for row in gene_table]

# Start codon records, parsed the same way.
with open(file2[0]) as codon_table:
    starts = [tuple(row.strip().split("\t")) for row in codon_table]

# Lookup key for each start codon record, parallel to `starts`: the text
# between the first pair of double quotes in the sixth column.
starts_idx = [record[5].split('"')[1] for record in starts]

### Depending on accession (NM or NR), take most 5' coordinate <br> based on strand and (for NM) start codon, calculate windows <br> around them, and store windows in a printable bed format

In [58]:
TSS_start = []
TSS_5prime = []


def _tss_window_line(region, center, width):
    """Return one tab-separated, newline-terminated BED line for a window.

    The window spans ``width`` bases centred on ``center``. Chromosome, name,
    score and strand are copied from columns 0, 3, 4 and 5 of ``region``.
    """
    # A coordinate closer than width/2 to the start of its sequence would
    # otherwise yield a negative start, which is not a valid BED interval.
    low = max(0, int(center - width / 2))
    high = int(center + width / 2)
    fields = [
        str(region[0]),
        str(low),
        str(high),
        str(region[3]),
        str(region[4]),
        str(region[5]),
    ]
    return "\t".join(fields) + "\n"


# Row of the first occurrence of every accession in the start codon table.
# Built once so each lookup in the loop is a constant-time dict access rather
# than a linear scan of starts_idx; setdefault keeps the first row, which is
# the row list.index() would have returned.
start_row_by_accession = {}
for codon_row, codon_accession in enumerate(starts_idx):
    start_row_by_accession.setdefault(codon_accession, codon_row)

for region in regions:
    accession = region[3]
    on_plus_strand = region[5] == "+"

    # Most 5' coordinate of the whole annotation: the left edge on the "+"
    # strand, the right edge otherwise.
    five_prime = int(region[1]) if on_plus_strand else int(region[2])

    if accession.split("_")[0] == "NM":
        # NM entries have start codons: find this accession's record.
        codon_row = start_row_by_accession.get(accession)
        if codon_row is None:
            raise ValueError(
                "accession {!r} has no entry in the start codon table".format(
                    accession
                )
            )
        record = starts[codon_row]
        start_base = int(record[2]) if on_plus_strand else int(record[3])
    else:
        # NR entries have no start codon annotations, so the 5'-most
        # coordinate is used for both tables.
        start_base = five_prime

    # One printable BED line per table for this gene.
    TSS_5prime.append(_tss_window_line(region, five_prime, window))
    TSS_start.append(_tss_window_line(region, start_base, window))

### Make outfile names

In [60]:
# file1 is a one-element list, so the path string has to be taken out of it
# before splitting; calling .split() on the list itself raises AttributeError.
indir = file1[0].split("/")

# Everything before the last "/" is the directory holding the input file;
# both outputs are written next to it.
outdir = "/".join(indir[0:-1])

# <outdir>/<basename>_<table>_<window>bp.bed
outfile_5prime = "{}/{}_5prime_{}bp.bed".format(outdir, basename, window)
outfile_startcodons = "{}/{}_startcodons_{}bp.bed".format(outdir, basename, window)

### Export data

In [62]:
# Each stored entry is already a complete, newline-terminated line, so the
# lists can be written out as they are.
with open(outfile_5prime, "wt") as five_prime_out:
    five_prime_out.writelines(TSS_5prime)

with open(outfile_startcodons, "wt") as start_codon_out:
    start_codon_out.writelines(TSS_start)