Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 1 addition & 3 deletions .github/workflows/tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
python -m pip install flake8 pytest pytest-cov coveralls pylint
pip install -r requirements.txt
pip install .
pip install ".[dev]"
- name: Lint with flake8
run: |
# stop the build if there are Python syntax errors or undefined names
Expand Down
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
# Cursor
.cursor/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

gtfparse
========

Parsing tools for GTF (gene transfer format) files.

# Example usage
Expand Down
10 changes: 6 additions & 4 deletions gtfparse/attribute_parsing.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,17 +13,19 @@
import logging
from collections import OrderedDict
from sys import intern
from typing import List, Optional, Union, Any

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)



def expand_attribute_strings(
attribute_strings,
quote_char="'",
missing_value="",
usecols=None):
attribute_strings: List[str],
quote_char: str = "'",
missing_value: Any = "",
usecols: Optional[List[str]] = None
) -> OrderedDict[str, List[Any]]:
"""
The last column of GTF has a variable number of key value pairs
of the format: "key1 value1; key2 value2;"
Expand Down
10 changes: 6 additions & 4 deletions gtfparse/create_missing_features.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

import logging
from collections import OrderedDict
from typing import Dict, List, Any, Optional, Union

import pandas as pd

Expand All @@ -20,10 +21,11 @@


def create_missing_features(
dataframe,
unique_keys={},
extra_columns={},
missing_value=None):
dataframe: pd.DataFrame,
unique_keys: Dict[str, str] = {},
extra_columns: Dict[str, List[str]] = {},
missing_value: Any = None
) -> pd.DataFrame:
"""
Helper function used to construct a missing feature such as 'transcript'
or 'gene'. Some GTF files only have 'exon' and 'CDS' entries, but have
Expand Down
91 changes: 75 additions & 16 deletions gtfparse/read_gtf.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@

import logging
from os.path import exists
from typing import Optional, List, Union, Set, Dict, Any, Callable
from io import StringIO, TextIOWrapper

import polars

Expand Down Expand Up @@ -89,10 +91,56 @@
}

def parse_with_polars_lazy(
filepath_or_buffer,
split_attributes=True,
features=None,
fix_quotes_columns=["attribute"]):
filepath_or_buffer: Union[str, StringIO, TextIOWrapper],
split_attributes: bool = True,
features: Optional[List[str]] = None,
fix_quotes_columns: List[str] = ["attribute"]
) -> polars.LazyFrame:
"""
Parse a GTF file using Polars lazy evaluation for memory efficiency.

This function reads a GTF (Gene Transfer Format) file and returns a Polars
LazyFrame with the parsed data. The lazy evaluation allows for efficient
processing of large files by deferring computation until explicitly requested.

Parameters
----------
filepath_or_buffer : str, StringIO, or TextIOWrapper
Path to the GTF file or a file-like buffer object containing GTF data.

split_attributes : bool, default True
If True, splits the attribute column on semicolons and creates an
'attribute_split' column containing a list of attribute strings.

features : list of str, optional
If provided, only rows with feature types in this list will be included
in the output. If None, all features are included.

fix_quotes_columns : list of str, default ["attribute"]
Column names where quote-related formatting issues should be fixed.
This addresses common formatting problems in GTF files like trailing
semicolons in quoted values.

Returns
-------
polars.LazyFrame
A Polars LazyFrame containing the parsed GTF data with the standard
GTF columns: seqname, source, feature, start, end, score, strand,
frame, and attribute (plus attribute_split if split_attributes=True).

Raises
------
ParsingError
If the GTF file doesn't have the expected number of columns.

Examples
--------
>>> df_lazy = parse_with_polars_lazy("example.gtf")
>>> df = df_lazy.collect() # Execute the lazy computation

>>> # Filter for specific features
>>> df_lazy = parse_with_polars_lazy("example.gtf", features=["gene", "exon"])
"""
# use a global string cache so that all strings get intern'd into
# a single numbering system
polars.enable_string_cache()
Expand Down Expand Up @@ -139,7 +187,16 @@ def parse_gtf(
filepath_or_buffer,
split_attributes=True,
features=None,
fix_quotes_columns=["attribute"]):
fix_quotes_columns=["attribute"]
) -> polars.DataFrame:
"""
Parse a GTF file and return its contents as a Polars DataFrame.

This function reads a GTF (Gene Transfer Format) file by delegating to
`parse_with_polars_lazy`, forwarding its arguments, and returns a Polars
DataFrame rather than a LazyFrame.
"""

df_lazy = parse_with_polars_lazy(
filepath_or_buffer=filepath_or_buffer,
split_attributes=split_attributes,
Expand All @@ -152,9 +209,10 @@ def parse_gtf_pandas(*args, **kwargs):


def parse_gtf_and_expand_attributes(
filepath_or_buffer,
restrict_attribute_columns=None,
features=None):
filepath_or_buffer: Union[str, StringIO, TextIOWrapper],
restrict_attribute_columns: Optional[Union[List[str], Set[str]]] = None,
features: Optional[List[str]] = None
) -> polars.DataFrame:
"""
Parse lines into column->values dictionary and then expand
the 'attribute' column into multiple columns. This expansion happens
Expand Down Expand Up @@ -193,14 +251,15 @@ def parse_gtf_and_expand_attributes(


def read_gtf(
filepath_or_buffer,
expand_attribute_column=True,
infer_biotype_column=False,
column_converters={},
column_cast_types={},
usecols=None,
features=None,
result_type='polars'):
filepath_or_buffer: Union[str, StringIO, TextIOWrapper],
expand_attribute_column: bool = True,
infer_biotype_column: bool = False,
column_converters: Dict[str, Callable] = {},
column_cast_types: Dict[str, Any] = {},
usecols: Optional[List[str]] = None,
features: Optional[List[str]] = None,
result_type: str = 'polars'
) -> Union[polars.DataFrame, Dict[str, Any]]:
"""
Parse a GTF into a dictionary mapping column names to sequences of values.

Expand Down
48 changes: 32 additions & 16 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,27 +1,43 @@
[build-system]
requires = ["setuptools>=64", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "gtfparse"
version = "2.5.1"
requires-python = ">=3.7"
authors = [ {name="Alex Rubinsteyn", email="alex.rubinsteyn@unc.edu" } ]
authors = [ {name="Alex Rubinsteyn", email="alex.rubinsteyn@unc.edu"} ]
description = "Parsing library for extracting data frames of genomic features from GTF files"
readme = "README.md"
license = {text = "Apache Software License"}
classifiers = [
'Development Status :: 4 - Beta',
'Environment :: Console',
'Operating System :: OS Independent',
'Intended Audience :: Science/Research',
'License :: OSI Approved :: Apache Software License',
'Programming Language :: Python',
'Topic :: Scientific/Engineering :: Bio-Informatics',
"Development Status :: 4 - Beta",
"Environment :: Console",
"Operating System :: OS Independent",
"Intended Audience :: Science/Research",
"License :: OSI Approved :: Apache Software License",
"Programming Language :: Python",
"Topic :: Scientific/Engineering :: Bio-Informatics",
]
dependencies = [
"polars>=0.20.2",
"pyarrow>=18.0.0",
"pandas>=2.1.0",
]
readme = "README.md"
dynamic = ["version", "dependencies"]

[tool.setuptools.dynamic]
version = {attr = "gtfparse.__version__"}
dependencies = {file = ["requirements.txt"]}

[tool.setuptools]
packages = ["gtfparse"]
[project.optional-dependencies]
dev = [
"pytest",
"pytest-cov",
"flake8",
"pylint",
"coveralls",
]

[project.urls]
"Homepage" = "https://github.com/openvax/gtfparse"
"Bug Tracker" = "https://github.com/openvax/gtfparse"

[tool.setuptools.packages.find]
where = ["."]
include = ["gtfparse*"]
3 changes: 0 additions & 3 deletions requirements.txt

This file was deleted.