<a href="https://colab.research.google.com/github/pachterlab/LSCHWCP_2023/blob/main/Notebooks/ebola_ref/generate_ebov_ref.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# BioProject accession and the folders derived from it.
# NOTE(review): this is an absolute, machine-specific path and none of the
# cells visible in this notebook read these variables — confirm whether they
# are still needed before running elsewhere (e.g. on Colab).
bp = "PRJNA665227"
fastq_folder = "/home/laura/projects/virus-watch-data/" + bp + "/raw"
out_folder = fastq_folder + "/../ebola"

### Build Ebola genome fasta and gtf
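We split the complete genome into consecutive 500 bp pieces (the last piece may be shorter) and give each piece a number. That number is used as the piece's sequence name in the fasta and as its gene ID in the gtf.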

Ebola genome ViralProj14703 (linked to NC_002549.1) downloaded from https://www.ncbi.nlm.nih.gov/data-hub/genome/?taxon=186538

In [1]:
!wget https://raw.githubusercontent.com/pachterlab/LSCHWCP_2023/main/Notebooks/ebola_ref/GCA_000848505.1_ViralProj14703_genomic.fna
ebov_fna = "GCA_000848505.1_ViralProj14703_genomic.fna"

--2023-12-08 01:41:25--  https://raw.githubusercontent.com/pachterlab/LSCHWCP_2023/main/Notebooks/ebola_ref/GCA_000848505.1_ViralProj14703_genomic.fna
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 19260 (19K) [text/plain]
Saving to: ‘GCA_000848505.1_ViralProj14703_genomic.fna’


2023-12-08 01:41:25 (5.03 MB/s) - ‘GCA_000848505.1_ViralProj14703_genomic.fna’ saved [19260/19260]



In [3]:
!pip install -q biopython
from Bio import SeqIO
import textwrap

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
# Read every record of the downloaded genome FASTA into a list
records = [entry for entry in SeqIO.parse(ebov_fna, "fasta")]

# Report the ID and sequence length of the first record
print(records[0].id)
print("genome length: ", len(records[0].seq))

AF086833.2
genome length:  18959


In [5]:
# Output paths for the chunked genome: FASTA with one record per piece, and the
# matching GTF annotation
ebola_fasta, ebola_gtf = "ebov_GCA_000848505.fasta", "ebov_GCA_000848505.gtf"

In [6]:
# Split genome into consecutive 500 bp pieces (the final piece holds the remainder).
# Plain slicing is used instead of textwrap.wrap: that function is a prose
# formatter which prefers to break at whitespace and hyphens, so it would yield
# uneven pieces if the sequence ever contained gap ("-") characters.
CHUNK_SIZE = 500
genome_seq = str(records[0].seq)
sequences = [
    genome_seq[pos:pos + CHUNK_SIZE]
    for pos in range(0, len(genome_seq), CHUNK_SIZE)
]

Create new fasta and gtf files:

In [7]:
# Write one FASTA record and four GTF rows (gene, transcript, exon, CDS) per
# genome piece. Every piece becomes its own reference sequence named by its
# index, so all of its features span that whole record.
genome_name = "GCA_000848505"
genome_date = "Oct_2000"

# Feature types emitted for every piece and the frame column that goes with each
features = ["gene", "transcript", "exon", "CDS"]
frames = [".", ".", ".", "0"]

with open(ebola_gtf, "w") as gtf, open(ebola_fasta, "w") as dna:
    # Add header lines to GTF
    gtf.write(
        f"#!genome-build {genome_name}.1\n"
        f"#!genome-version {genome_name}\n"
        f"#!genome-date {genome_date}\n"
        f"#!genome-build-accession {genome_name}\n"
        f"#!genebuild-last-updated {genome_date}\n"
    )

    for i, seq in enumerate(sequences):
        source = genome_name
        gene_id = i

        # GTF coordinates are 1-based and inclusive, so a piece of length n
        # covers positions 1..n of its own record (not 1..n+1).
        start = 1
        end = start + len(seq) - 1

        # Attribute fragments shared between the four feature rows.
        # NOTE(review): gene_source/transcript_source say "palmdb" although this
        # reference is built from the Ebola genome — left unchanged; confirm
        # whether that label is intended.
        gene_ids = f'gene_id "{gene_id}"; gene_version "1";'
        transcript_ids = f'transcript_id "{gene_id}T"; transcript_version "1";'
        gene_info = f'gene_name "{gene_id}"; gene_source "palmdb"; gene_biotype "protein_coding";'
        transcript_info = (
            f'transcript_name "{gene_id}"; transcript_source "palmdb"; '
            f'transcript_biotype "protein_coding"; tag "CCDS"; ccds_id "CCDS{gene_id}";'
        )
        attributes = {
            "gene": [gene_ids, gene_info],
            "transcript": [
                gene_ids, transcript_ids, gene_info, transcript_info,
                'tag "basic";',
            ],
            "exon": [
                gene_ids, transcript_ids, 'exon_number "1";', gene_info, transcript_info,
                f'exon_id "{gene_id}E"; exon_version "1";', 'tag "basic";',
            ],
            "CDS": [
                gene_ids, transcript_ids, 'exon_number "1";', gene_info, transcript_info,
                f'protein_id "{gene_id}P"; protein_version "1";', 'tag "basic";',
            ],
        }

        for feature, frame in zip(features, frames):
            # GTF rows have nine TAB-separated columns: seqname, source, feature,
            # start, end, score, strand, frame, attributes.
            columns = [
                gene_id, source, feature, start, end, ".", "+", frame,
                " ".join(attributes[feature]),
            ]
            gtf.write("\t".join(str(column) for column in columns) + "\n")

        # Build dna/genome file
        dna.write(f">{gene_id} dna:chromosome chromosome:{genome_name}:{gene_id}:{start}:{end}:1 REF\n")
        dna.write(f"{seq}\n")

### Generate Ebola reference index

In [8]:
# Output paths handed to `kb ref`: the index (-i), the transcript-to-gene
# mapping (-g) and the cDNA FASTA (-f1)
ebola_index, ebola_t2g, ebola_f1 = (
    "ebov_GCA_000848505.idx",
    "ebov_GCA_000848505_t2g.txt",
    "ebov_GCA_000848505_f1.fa",
)

In [None]:
# kb-python provides the `kb` command-line tool that the next cell runs
!pip install -q kb-python

In [10]:
# Build the reference from the chunked FASTA and GTF written earlier.
# Outputs: -i index file, -g transcript-to-gene mapping, -f1 cDNA FASTA.
# $name is expanded by IPython to the value of the Python variable `name`.
# NOTE(review): -t is presumably the thread count — confirm with `kb ref --help`.
!kb ref \
    -i $ebola_index \
    -g $ebola_t2g \
    -f1 $ebola_f1 \
    -t 2 \
    $ebola_fasta $ebola_gtf

[2023-12-08 01:43:08,022]    INFO [ref] Preparing ebov_GCA_000848505.fasta, ebov_GCA_000848505.gtf
[2023-12-08 01:43:08,031]    INFO [ref] Splitting genome ebov_GCA_000848505.fasta into cDNA at /content/tmp/tmpxncz8n5q
[2023-12-08 01:43:08,037]    INFO [ref] Concatenating 1 cDNAs to ebov_GCA_000848505_f1.fa
[2023-12-08 01:43:08,039]    INFO [ref] Creating transcript-to-gene mapping at ebov_GCA_000848505_t2g.txt
[2023-12-08 01:43:08,040]    INFO [ref] Indexing ebov_GCA_000848505_f1.fa to ebov_GCA_000848505.idx
