In [None]:
# Notebook parameter: location of the resources directory, as a string.
# It is wrapped in a `Path` and checked for existence in the initialization cell.
params_resource_dir = "../resources/"

# Datasets - General datasets

## Overview

Manipulating Genome annotations and FASTA files.


- [Genome data](#genome-data)
- [Full example: from a transcript of interest to extracted CDS sequences](#integrating-everything-intervals-and-sequences)

## Imports

In [None]:
import dataclasses
import os
import sys
from io import StringIO
from numbers import Number
from pathlib import Path
from typing import Literal

import numpy as np
import pandas as pd
import pybedtools as pbt
import pyBigWig as pbw
import pydantic.dataclasses
import pyfaidx
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from pydantic.dataclasses import dataclass

## Initialization

In [None]:
# Turn the string parameter into a Path and stop immediately when it does not exist,
# so that later cells do not fail halfway through on a missing file.
resource_dir = Path(params_resource_dir)

if resource_dir.exists():
    print("Using resources from:", resource_dir)
else:
    raise FileNotFoundError("Resource directory does not exist: " + str(resource_dir))

Using resources from: ../resources


## Genome data

### Genome chromosome sizes

In [4]:
# Load the "genome file": a header-less, tab-separated table with one row per
# sequence, holding its name and its length (named `chrom` and `size` here).
filepath = resource_dir / "general" / "genome" / "hg38.chrom.sizes"
print('Loading the "genome file" from:', filepath)

genome_file = pd.read_csv(filepath, header=None, sep="\t")
genome_file.columns = ["chrom", "size"]

main_autosomes = [f"chr{i}" for i in range(1, 23)]
sex_chromosomes = ["chrX", "chrY"]
mitochondrial_chromosome = ["chrM"]
# The three groups are disjoint, so plain concatenation yields a duplicate-free list
# in a stable, readable order (a round-trip through `set` would give an arbitrary
# order that can change between kernel sessions).
main_chromosomes = main_autosomes + sex_chromosomes + mitochondrial_chromosome

print(f"Main chromosomes: {', '.join(main_chromosomes)}")
print("\n")

# Display only: sizes are formatted with "_" as thousands separator; `genome_file` itself is not modified.
display(genome_file.head(5).assign(size=lambda x: x["size"].map("{:_}".format)))

is_main_chromosome = genome_file["chrom"].isin(main_chromosomes)
main_chromosomes_total_size = genome_file.loc[is_main_chromosome, "size"].sum()
print(f"Total number of positions from the main chromosomes: {main_chromosomes_total_size:_}")

print("\n")

# Everything that is not a main chromosome, sorted by name.
other_chromosomes = sorted(set(genome_file["chrom"]) - set(main_chromosomes))

print(f"Number of additional chromosome regions: {len(other_chromosomes):_}")
print("Examples: " + ", ".join(other_chromosomes[:3]))

Loading the "genome file" from: ../resources/general/genome/hg38.chrom.sizes
Main chromosomes: chr1, chr2, chr3, chr4, chr5, chr6, chr7, chr8, chr9, chr10, chr11, chr12, chr13, chr14, chr15, chr16, chr17, chr18, chr19, chr20, chr21, chr22, chrX, chrY, chrM




Unnamed: 0,chrom,size
0,chr1,248_956_422
1,chr2,242_193_529
2,chr3,198_295_559
3,chr4,190_214_555
4,chr5,181_538_259


Total number of positions from the main chromosomes: 3_088_286_401


Number of additional chromosome regions: 430
Examples: chr10_GL383545v1_alt, chr10_GL383546v1_alt, chr10_KI270824v1_alt


### Genome annotations

The genome has been annotated with multiple features and properties, which can be
represented as genomic intervals, i.e. coordinates defined by:

- chromosome
- start position
- end position

Additionally, the **strand** ('plus' or 'minus', represented as '+' and '-' or as integers `1` and `-1`) can be specified.

Importantly: the genomic intervals are usually represented as 0-based, half-open intervals,
but this depends on the file format and the convention adopted by the 
database.

Gene annotations are the most obvious example of annotations: defining the 
genomic intervals associated with an experimentally characterized entity.


Multiple annotations can overlap:

- either on the same strand: 
    - this may be the case for sub-annotations related to a "meta" annotation ;
    - e.g. a gene can actually be *transcribed* into multiple *transcripts*.
    - e.g. a transcript can be "segmented" into *exons* (implicitly separated by *introns*), further segmented into *CDS* intervals and *UTR* (UnTranslated Region) intervals.

- or on different strands: biologically, the two strands can bear genes. Furthermore, specific annotations (e.g. bias in sequence content) can be strand-specific.


Multiple online reference resources can provide different annotations for the same genome.

E.g. different resources apply different criteria to define the same gene.

The provided resources for gene annotations are here:

- GENCODE: a comprehensive annotation of the (human) genome.
- MANE: a high-quality curated set of human transcripts, that attempts to provide a single representative transcript for each gene, that is consistent with different resources (including GENCODE).

Different data formats exist to represent genomic annotations within a genome:

- **GFF** (General Feature Format) and **GTF** (Gene Transfer Format): tabular formats, where genomic intervals are 1-based, closed intervals (i.e. the first position is 1, and the end position is included in the interval).
    - Different versions of the GFF format exist, the latest being GFF3.
    - GTF is a specific version of GFF.

- **BED** (Browser Extensible Data): tabular format, where genomic intervals are 0-based, half-open intervals (i.e. the first position is 0, and the end position is not included in the interval).
    - Commonly represented as a 3-column (chrom, start, end), 4 column (chrom, start, end, name) or 6-column (chrom, start, end, name, score, strand) format.
    - Additional columns can be added (up to 12) to represent additional information, notably for visualization purposes.

Such file formats can be compressed with `bgzip` (a block compression format) and indexed with `tabix` (a tool to index tabular data), and can be queried with specific libraries to enable fast access to specific genomic intervals.

In [6]:
# Column names for 6-column BED tables; passed as `names=` to `pd.read_csv` when
# reading the header-less BED files in this notebook.
bed6_cols = ["chrom", "start", "end", "name", "score", "strand"]
# Column names applied to the nine tab-separated GFF columns when a GFF file is read
# with `pd.read_csv`.
# NOTE(review): the third column is called "biotype" here, whereas the table produced by
# pybedtools in the full example at the end of the notebook is filtered on a column named "feature".
gff_columns = [
    "seqid",
    "source",
    "biotype",
    "start",
    "end",
    "score",
    "strand",
    "phase",
    "attributes",
]

#### Example - MANE gene annotations

In [8]:
# Peek at the first records of the MANE GFF file. Lines starting with "#" are
# skipped, and the columns are labelled with `gff_columns`.
filepath = (
    resource_dir
    / "general"
    / "gene_annotations.hg38.MANE_v1.4"
    / "MANE.GRCh38.v1.4.ensembl_genomic.gff.gz"
)

tmp_gff = pd.read_csv(
    filepath,
    names=gff_columns,
    header=None,
    sep="\t",
    comment="#",
    compression="gzip",
    nrows=10,
)
display(tmp_gff.head(5))

Unnamed: 0,seqid,source,biotype,start,end,score,strand,phase,attributes
0,chr1,ensembl_havana,gene,3069168,3438621,.,+,.,ID=ENSG00000142611.17;gene_id=ENSG00000142611....
1,chr1,ensembl_havana,transcript,3069203,3438621,.,+,.,ID=ENST00000270722.10;Parent=ENSG00000142611.1...
2,chr1,ensembl_havana,exon,3069203,3069296,.,+,.,ID=exon:ENST00000270722.10:1;Parent=ENST000002...
3,chr1,ensembl_havana,CDS,3069260,3069296,.,+,0,ID=CDS:ENST00000270722.10;Parent=ENST000002707...
4,chr1,ensembl_havana,start_codon,3069260,3069262,.,+,0,ID=start_codon:ENST00000270722.10;Parent=ENST0...


Two BED files are provided, extracted from the GFF file.

In [10]:
def _read_bed6_preview(path, nrows=100):
    """Read the first `nrows` records of a gzipped, header-less BED6 file into a DataFrame."""
    return pd.read_csv(
        path,
        names=bed6_cols,
        header=None,
        sep="\t",
        comment="#",
        compression="gzip",
        nrows=nrows,
    )


mane_dir = resource_dir / "general" / "gene_annotations.hg38.MANE_v1.4"

# Gene-level intervals.
filepath = mane_dir / "MANE.GRCh38.v1.4.ensembl_genomic.genes.bed.gz"
tmp1 = _read_bed6_preview(filepath)
display(tmp1.head(5))

# Transcript-level intervals.
filepath = mane_dir / "MANE.GRCh38.v1.4.ensembl_genomic.transcripts.bed.gz"
tmp2 = _read_bed6_preview(filepath)
display(tmp2.head(5))


Unnamed: 0,chrom,start,end,name,score,strand
0,chr1,65418,71585,ENSG00000186092.7,.,+
1,chr1,450739,451678,ENSG00000284733.2,.,-
2,chr1,685715,686654,ENSG00000284662.2,.,-
3,chr1,923922,944575,ENSG00000187634.13,.,+
4,chr1,944202,959309,ENSG00000188976.11,.,-


Unnamed: 0,chrom,start,end,name,score,strand
0,chr1,65418,71585,ENST00000641515.2,.,+
1,chr1,450739,451678,ENST00000426406.4,.,-
2,chr1,685715,686654,ENST00000332831.5,.,-
3,chr1,923922,944574,ENST00000616016.5,.,+
4,chr1,944202,959256,ENST00000327044.7,.,-


Intervals can be intersected with the `bedtools` toolkit (through its `pybedtools` Python wrapper).

In [11]:
# Wrap both previews as BedTool objects: `tmp1` holds the gene intervals (side "a"),
# `tmp2` the transcript intervals (side "b").
bt_tmp1 = pbt.BedTool.from_dataframe(tmp1)
bt_tmp2 = pbt.BedTool.from_dataframe(tmp2)

# The result carries the six BED columns of each side, followed by the overlap size.
intersect_columns = [f"{side}.{col}" for side in ("a", "b") for col in bed6_cols] + ["overlap_size"]

# `s=True` restricts the overlaps to intervals lying on the same strand.
intersect = bt_tmp1.intersect(bt_tmp2, s=True, wao=True).to_dataframe(names=intersect_columns)

display(intersect.head(5))

# Every gene ID and every transcript ID is expected to show up exactly once.
for id_column, error_message in (
    ("a.name", "The gene IDs column has non-unique values. This is unexpected."),
    ("b.name", "The transcript IDs column has non-unique values. This is unexpected."),
):
    id_counts = intersect[id_column].value_counts()
    if (id_counts != 1).any():
        raise ValueError(error_message)

    display(id_counts.sort_values().head(3))


Unnamed: 0,a.chrom,a.start,a.end,a.name,a.score,a.strand,b.chrom,b.start,b.end,b.name,b.score,b.strand,overlap_size
0,chr1,65418,71585,ENSG00000186092.7,.,+,chr1,65418,71585,ENST00000641515.2,.,+,6167
1,chr1,450739,451678,ENSG00000284733.2,.,-,chr1,450739,451678,ENST00000426406.4,.,-,939
2,chr1,685715,686654,ENSG00000284662.2,.,-,chr1,685715,686654,ENST00000332831.5,.,-,939
3,chr1,923922,944575,ENSG00000187634.13,.,+,chr1,923922,944574,ENST00000616016.5,.,+,20652
4,chr1,944202,959309,ENSG00000188976.11,.,-,chr1,944202,959256,ENST00000327044.7,.,-,15054


a.name
ENSG00000041988.16    1
ENSG00000007923.17    1
ENSG00000171735.21    1
Name: count, dtype: int64

b.name
ENST00000054650.9     1
ENST00000377577.10    1
ENST00000303635.12    1
Name: count, dtype: int64

Here we have intersected the pre-extracted `biotype="gene"` annotations labeled with their gene IDs with the pre-extracted `biotype="transcript"` annotations labeled with their transcript IDs.

The `value_counts()` operation highlights that the MANE dataset is curated, providing a single transcript for each gene.

#### Example - GENCODE gene annotations

In [12]:
# Peek at the first records of the GENCODE GFF3 file, labelled with `gff_columns`.
filepath = (
    resource_dir
    / "general"
    / "gene_annotations.hg38.gencode_v40"
    / "gencode.v40.annotation.gff3.gz"
)

tmp1 = pd.read_csv(
    filepath,
    names=gff_columns,
    header=None,
    sep="\t",
    comment="#",
    compression="gzip",
    nrows=100,
)
display(tmp1.head(5))


Unnamed: 0,seqid,source,biotype,start,end,score,strand,phase,attributes
0,chr1,HAVANA,gene,11869,14409,.,+,.,ID=ENSG00000223972.5;gene_id=ENSG00000223972.5...
1,chr1,HAVANA,transcript,11869,14409,.,+,.,ID=ENST00000456328.2;Parent=ENSG00000223972.5;...
2,chr1,HAVANA,exon,11869,12227,.,+,.,ID=exon:ENST00000456328.2:1;Parent=ENST0000045...
3,chr1,HAVANA,exon,12613,12721,.,+,.,ID=exon:ENST00000456328.2:2;Parent=ENST0000045...
4,chr1,HAVANA,exon,13221,14409,.,+,.,ID=exon:ENST00000456328.2:3;Parent=ENST0000045...


Additionally, metadata files were generated to help processing these files.

In [13]:
# The three metadata tables are read from the same GENCODE directory.
gencode_dir = resource_dir / "general" / "gene_annotations.hg38.gencode_v40"

# Fixed seed for the `.sample()` previews below, so that re-running the notebook
# displays the same rows every time.
preview_seed = 0

filepath = gencode_dir / "gene_metadata.tsv.gz"

tmp1 = pd.read_csv(filepath, sep="\t", compression="gzip", header=0)

print("Mapping gene IDs to transcript ID(s), gene name, etc.")
display(tmp1.head(5))

###
print("\n")

filepath = gencode_dir / "genetype_metadata.tsv"

print("Mapping gene types to full descriptions.")
tmp1 = pd.read_csv(filepath, sep="\t", header=0)
display(tmp1.sample(5, random_state=preview_seed).sort_values(by="genetype"))


###
print("\n")

filepath = gencode_dir / "simplified_grouped_genetype.tsv"

print("Table mapping gene types to simplified groups.")
tmp1 = pd.read_csv(filepath, sep="\t", header=0)
display(tmp1.sample(5, random_state=preview_seed).sort_values(by="group_genetype"))


Mapping gene IDs to transcript ID(s), gene name, etc.


Unnamed: 0,transcript_id,gene_id,gene_name,gene_type,protein_id,official_name,official_id
0,ENST00000456328.2,ENSG00000223972.5,DDX11L1,transcribed_unprocessed_pseudogene,,DDX11L1,HGNC:37102
1,ENST00000450305.2,ENSG00000223972.5,DDX11L1,transcribed_unprocessed_pseudogene,,DDX11L1,HGNC:37102
2,ENST00000488147.1,ENSG00000227232.5,WASH7P,unprocessed_pseudogene,,WASH7P,HGNC:38034
3,ENST00000619216.1,ENSG00000278267.1,MIR6859-1,miRNA,,MIR6859-1,HGNC:50039
4,ENST00000473358.1,ENSG00000243485.5,MIR1302-2HG,lncRNA,,MIR1302-2HG,HGNC:52482




Mapping gene types to full descriptions.


Unnamed: 0,genetype,Definition
11,IG_J_pseudogene,Inactivated immunoglobulin gene.
34,miRNA_pseudogene,Non-coding RNA predicted to be pseudogene by t...
42,non_coding,Transcript which is known from the literature ...
39,protein_coding,Contains an open reading frame (ORF).
38,retained_intron,Alternatively spliced transcript believed to c...




Table mapping gene types to simplified groups.


Unnamed: 0,genetype,simplified_genetype,group_genetype
23,ribozyme,ncRNA,ncRNA
24,sRNA,ncRNA,ncRNA
2,IG_J_gene,IG_TR_gene,protein_coding
56,translated_unprocessed_pseudogene,pseudogene,pseudogene
55,translated_processed_pseudogene,pseudogene,pseudogene


Finally, a set of files with non-overlapping intervals was generated (one for each strand).

These intervals provide for each segment its most important "biotype", i.e. the type of gene-related annotation representing the most important biological function at this genomic interval.

e.g. if two genes overlap at a given interval, where one gene has a CDS (coding sequence) annotated while the other gene has a UTR (untranslated region) annotated, the CDS will be considered as the most important biotype at this interval.


In [14]:
# NOTE: two annotation sets can be selected here ("complete" or "proteincoding_only");
# the selected name is the prefix of the files that get loaded.
selected_annotation_set: Literal["complete", "proteincoding_only"] = "proteincoding_only"

# One file per strand, both stored in the same directory.
nonoverlap_dir = resource_dir / "general" / "gene_annotations.hg38.gencode_v40"
filepath_minus = nonoverlap_dir / f"{selected_annotation_set}.non-overlap.annotated.minus.bed.gz"
filepath_plus = nonoverlap_dir / f"{selected_annotation_set}.non-overlap.annotated.plus.bed.gz"

# Both files are read with the same options; only the first 100 records are loaded.
bed_read_options = dict(sep="\t", compression="gzip", header=None, comment="#", names=bed6_cols, nrows=100)
tmp1 = pd.read_csv(filepath_minus, **bed_read_options)
tmp2 = pd.read_csv(filepath_plus, **bed_read_options)

display(tmp1.head(10))
display(tmp2.head(10))


Unnamed: 0,chrom,start,end,name,score,strand
0,chr1,450739,451675,stop_codon;OR4F29;protein_coding;ENSG000002847...,.,-
1,chr1,451675,451678,start_codon;OR4F29;protein_coding;ENSG00000284...,.,-
2,chr1,685715,686651,stop_codon;OR4F16;protein_coding;ENSG000002846...,.,-
3,chr1,686651,686654,start_codon;OR4F16;protein_coding;ENSG00000284...,.,-
4,chr1,944202,944203,three_prime_UTR;NOC2L;protein_coding;ENSG00000...,.,-
5,chr1,944203,944204,three_prime_UTR;NOC2L;protein_coding;ENSG00000...,.,-
6,chr1,944204,944693,three_prime_UTR;NOC2L;protein_coding;ENSG00000...,.,-
7,chr1,944693,944696,stop_codon;NOC2L;protein_coding;ENSG0000018897...,.,-
8,chr1,944696,944800,CDS;NOC2L;protein_coding;ENSG00000188976.11,.,-
9,chr1,944800,945041,intron;NOC2L;protein_coding;ENSG00000188976.11,.,-


Unnamed: 0,chrom,start,end,name,score,strand
0,chr1,65418,65433,five_prime_UTR;OR4F5;protein_coding;ENSG000001...,.,+
1,chr1,65433,65519,intron;OR4F5;protein_coding;ENSG00000186092.7,.,+
2,chr1,65519,65564,five_prime_UTR;OR4F5;protein_coding;ENSG000001...,.,+
3,chr1,65564,65567,start_codon;OR4F5;protein_coding;ENSG000001860...,.,+
4,chr1,65567,65573,CDS;OR4F5;protein_coding;ENSG00000186092.7,.,+
5,chr1,65573,69036,intron;OR4F5;protein_coding;ENSG00000186092.7,.,+
6,chr1,69036,70005,CDS;OR4F5;protein_coding;ENSG00000186092.7,.,+
7,chr1,70005,70008,stop_codon;OR4F5;protein_coding;ENSG00000186092.7,.,+
8,chr1,70008,71585,three_prime_UTR;OR4F5;protein_coding;ENSG00000...,.,+
9,chr1,923922,924431,five_prime_UTR;SAMD11;protein_coding;ENSG00000...,.,+


### Genome sequence

Nucleotide (and protein) sequences can be stored in text files following the FASTA format.

Such files can be compressed with `bgzip` and indexed with `samtools faidx` to enable fast access to specific sequences.

We can then use dedicated libraries to query the file, e.g. to retrieve the sequence of a specific genomic interval.

Here with the `pyfaidx` library, we can retrieve the sequence of a specific genomic interval.

In [16]:
# Open the compressed genome FASTA with pyfaidx; `fa` is queried by chromosome name
# and coordinates in the cells below (`fa.get_seq(...)` and `fa[chrom][start:end]`).
fa = pyfaidx.Fasta(resource_dir / "general" / "genome" / "hg38.fa.gz")

In [None]:
from typing import Protocol



@dataclass
class GenomicInterval:
    chrom: str
    start: int
    end: int
    name: str
    score: int | float | str
    strand: Literal["+", "-"]

    def __post_init__(self):
        # Enforce [0, end) interval.
        if self.start < 0:
            raise ValueError(f"Start position {self.start} must be non-negative.")
        if self.start >= self.end:
            raise ValueError(f"Start position {self.start} must be less than end position {self.end}.")

#### Example - Plus (or "forward") strand sequence

In [18]:
# First example is on the FORWARD, PLUS strand.
example_interval = GenomicInterval(
    chrom="chr1", start=65418, end=65433, name="five_prime_UTR;OR4F5", score=".", strand="+"
)

# CAUTION: `get_seq` is 1-based and inclusive at the end, so the 0-based, half-open
# interval [start, end) is queried as (start + 1, end).
query_result = str(
    fa.get_seq(
        name=example_interval.chrom,
        start=example_interval.start + 1,
        end=example_interval.end,
        rc=example_interval.strand == "-",
    )
)

# To simplify: one can use a more classic slicing syntax, with 0-based indexing.
# The chromosome is taken from the interval itself (not hard-coded), so that both
# queries keep targeting the same region when the example interval is changed.
query_result_alt = fa[example_interval.chrom][example_interval.start : example_interval.end]
print(query_result_alt)

print("Verify that length of query result is correct:")
print(
    f"{len(query_result):_} == {example_interval.end - example_interval.start:_}? "
    f"{len(query_result) == example_interval.end - example_interval.start}"
)

print("\n")
print("Verify that the query result is the same for both syntaxes:")
print(f"{(query_result == query_result_alt)=}")
print(query_result)
print(query_result_alt)

CCCAGATCTCTTCAG
Verify that length of query result is correct:
15 == 15? True


Verify that the query result is the same for both syntaxes:
(query_result == query_result_alt)=True
CCCAGATCTCTTCAG
CCCAGATCTCTTCAG


#### Example - Minus (or "reverse") strand sequence

In [20]:
# Second example is on the REVERSE, MINUS strand.
example_interval = GenomicInterval(
    chrom="chr1", start=944204, end=944224, name="(last 20 nt) three_prime_UTR;NOC2L", score=".", strand="-"
)

print(example_interval)

# CAUTION: the query is 1-based, inclusive at end.
# `rc=True` requests the reverse-complement, i.e. the sequence as read on the minus strand.
query_result_minus = fa.get_seq(
    name=example_interval.chrom,
    start=example_interval.start + 1,
    end=example_interval.end,
    rc=example_interval.strand == "-",
)

print("\n")
print("Query result:")
print(repr(query_result_minus))

# We can test the alternative syntax. Here we apply the two methods to reverse and complement.
query_result_minus_alt = (fa[example_interval.chrom][example_interval.start : example_interval.end]).reverse.complement

###

print("\n")
print("Checking the alternative syntax:")
print(f"{(query_result_minus_alt == query_result_minus)=}")

print(query_result_minus_alt)
print(f"{(len(query_result_minus)== (example_interval.end - example_interval.start))=}")

###

# We can make clear that the sequence is indeed a reverse-complement of the forward strand.

map_complement = {"A": "T", "T": "A", "C": "G", "G": "C", "N": "N"}
query_result_plus = fa[example_interval.chrom][example_interval.start : example_interval.end]

# Read the minus-strand sequence backwards so that it lines up, position by position,
# with the plus-strand sequence.
plus_bases = str(query_result_plus)
minus_bases_reversed = str(query_result_minus)[::-1]

# One "| " per complementary pair. Bases are upper-cased for the lookup so that lowercase
# letters returned by the FASTA file (e.g. soft-masked regions) do not raise a KeyError.
pairing_marks = [
    "| " for a, b in zip(plus_bases, minus_bases_reversed) if map_complement.get(a.upper()) == b.upper()
][:10]

print("\n")
print("Displaying the fact that the sequence is a reverse-complement of the forward strand:")
print("5'  " + "-".join(plus_bases[:10]) + "..." + "  3'")
print("    " + "".join(pairing_marks))
print("3'  " + "-".join(minus_bases_reversed[:10]) + "..." + "  5'")

GenomicInterval(chrom='chr1', start=944204, end=944224, name='(last 20 nt) three_prime_UTR;NOC2L', score='.', strand='-')


Query result:
>chr1:944224-944205 (complement)
GGAGGCAGGGCCATTGTGTT


Checking the alternative syntax:
(query_result_minus_alt == query_result_minus)=True
GGAGGCAGGGCCATTGTGTT
(len(query_result_minus)== (example_interval.end - example_interval.start))=True


Displaying the fact that the sequence is a reverse-complement of the forward strand:
5'  A-A-C-A-C-A-A-T-G-G...  3'
    | | | | | | | | | | 
3'  T-T-G-T-G-T-T-A-C-C...  5'


## Integrating everything: intervals and sequences

Another library that can be used to deal with biology-related data is BioPython.

Notably here, we can use it to deal with FASTA sequences.

Here, we will :

1. extract all the regions for a given transcript from the MANE annotations.
2. focus on the CDS (coding sequence) intervals.
3. query the FASTA file to retrieve the sequence of these intervals.
4. concatenate them and translate the resulting sequence into a protein sequence.
5. export to a mock output file the CDS sequence intervals.



In [None]:
### STEP 1 - pick a transcript and collect the GFF records overlapping it

gff_filepath = resource_dir / "general" / "gene_annotations.hg38.MANE_v1.4" / "MANE.GRCh38.v1.4.ensembl_genomic.gff.gz"
bed_filepath = (
    resource_dir / "general" / "gene_annotations.hg38.MANE_v1.4" / "MANE.GRCh38.v1.4.ensembl_genomic.transcripts.bed.gz"
)

# Only the first transcript of the BED file is loaded: it is the transcript of interest.
bed_mane = pd.read_csv(
    bed_filepath,
    sep="\t",
    compression="gzip",
    header=None,
    comment="#",
    names=bed6_cols,
    nrows=1,
)
transcript_name = bed_mane["name"].iloc[0]
transcript_strand = bed_mane["strand"].iloc[0]


# NOTE: here we build a BedTool object directly from the filepath.
# BedTool is able to detect it is a GFF format and will parse it accordingly.
bt_gff_mane = pbt.BedTool(gff_filepath)
bt_bed_mane = pbt.BedTool.from_dataframe(bed_mane.iloc[[0], :])

# `s=True`: keep only the GFF records lying on the same strand as the transcript.
gff_extract = bt_gff_mane.intersect(bt_bed_mane, s=True).to_dataframe().sort_values(by=["start", "end"])

### STEP 2 - keep the CDS intervals, as BED-like (0-based, half-open) coordinates

# We now extract the intervals with CDS biotypes.
cds_gff = gff_extract[gff_extract["feature"] == "CDS"].copy()

# Reminder: GFF are 1-based, inclusive intervals. So to convert into 0-based,
# you simply subtract 1 from the start position.

cds_bed = cds_gff.loc[:, ["seqname", "start", "end", "score", "strand"]].copy()
cds_bed["start"] = cds_bed["start"] - 1  # Convert to 0-based start position.
cds_bed["name"] = cds_gff["feature"] + ";" + transcript_name  # Add transcript name to CDS name.

# Order the segments 5' -> 3' along the transcript: ascending coordinates on the plus
# strand, descending on the minus strand. Each segment is reverse-complemented on the
# minus strand, so concatenating them in ascending order would scramble the CDS.
cds_bed = cds_bed.sort_values(by=["start", "end"], ascending=transcript_strand != "-")

display(cds_bed.head(3))

### STEP 3 - get sequences

seqrecords: list[SeqRecord] = []

for _, row in cds_bed.iterrows():
    expected_length = row["end"] - row["start"]

    # pyfaidx query: 1-based start, inclusive end; reverse-complemented on the minus strand.
    genomic_seq = fa.get_seq(name=row["seqname"], start=row["start"] + 1, end=row["end"], rc=row["strand"] == "-")

    # Make a Bio.Seq object and convert it to RNA.
    seq = Seq(str(genomic_seq)).transcribe()

    if len(seq) != expected_length:
        raise ValueError(
            f"Length of sequence {len(seq)} does not match expected length {expected_length} for "
            f"chrom={row['seqname']}, start={row['start'] + 1}, end={row['end']}, strand={row['strand']}."
        )

    # The description reports the 1-based start, and the score read from the current row
    # (the previous `{score}` referred to a name that is never defined in this notebook).
    seqrec = SeqRecord(
        seq,
        id=row["name"],
        description=(
            f"chrom={row['seqname']}, start={row['start'] + 1}, end={row['end']}, "
            f"strand={row['strand']}, score={row['score']}"
        ),
    )

    seqrecords.append(seqrec)


### STEP 4 - Example of translation of the sequence with BioPython.

# We concatenate the seq records into a single sequence.
concatenated_seq = Seq("".join([str(seqrec.seq) for seqrec in seqrecords]))
print("Concatenated sequence length:", len(concatenated_seq))
print(f"{len(concatenated_seq.translate())=}")


### STEP 5 - Export to fake FASTA file, and read it back.

# An in-memory text buffer stands in for an output file.
fasta_output = StringIO()
SeqIO.write(seqrecords, fasta_output, "fasta")
fasta_output.seek(0)  # Reset the StringIO object to the beginning.

print("\n")
print("Demonstration: reading back the exported SeqRecords.")
reader = SeqIO.parse(fasta_output, "fasta")  # Iterator over all the records stored in the StringIO object.
for seqrec in reader:
    print(seqrec.id, seqrec.description, len(seqrec.seq))

Unnamed: 0,seqname,start,end,score,strand,name
4,chr1,65564,65573,.,+,CDS;ENST00000641515.2
7,chr1,69036,70008,.,+,CDS;ENST00000641515.2


Concatenated sequence length: 981
len(concatenated_seq.translate())=327


Demonstration: reading back the exported SeqRecords.
CDS;ENST00000641515.2 CDS;ENST00000641515.2 chrom=chr1, start=65565, end=65573, strand=+, score=. 9
CDS;ENST00000641515.2 CDS;ENST00000641515.2 chrom=chr1, start=69037, end=70008, strand=+, score=. 972
