openvax · nick-youngblut · May 27, 2025 · May 27, 2025 · May 27, 2025 · May 27, 2025
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -28,9 +28,7 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          python -m pip install flake8 pytest pytest-cov coveralls pylint
-          pip install -r requirements.txt
-          pip install .
+          pip install ".[dev]"
       - name: Lint with flake8
         run: |
           # stop the build if there are Python syntax errors or undefined names

diff --git a/.gitignore b/.gitignore
@@ -1,3 +1,6 @@
+# Cursor
+.cursor/
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]

diff --git a/README.md b/README.md
@@ -6,6 +6,7 @@
 
 gtfparse
 ========
+
 Parsing tools for GTF (gene transfer format) files.
 
 # Example usage

diff --git a/gtfparse/attribute_parsing.py b/gtfparse/attribute_parsing.py
@@ -13,17 +13,19 @@
 import logging
 from collections import OrderedDict
 from sys import intern
+from typing import List, Optional, Union, Any
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 
 
 def expand_attribute_strings(
-        attribute_strings,
-        quote_char="'",
-        missing_value="",
-        usecols=None):
+        attribute_strings: List[str],
+        quote_char: str = "'",
+        missing_value: Any = "",
+        usecols: Optional[List[str]] = None
+    ) -> OrderedDict[str, List[Any]]:
     """
     The last column of GTF has a variable number of key value pairs
     of the format: "key1 value1; key2 value2;"

diff --git a/gtfparse/create_missing_features.py b/gtfparse/create_missing_features.py
@@ -12,6 +12,7 @@
 
 import logging
 from collections import OrderedDict
+from typing import Dict, List, Any, Optional, Union
 
 import pandas as pd
 
@@ -20,10 +21,11 @@
 
 
 def create_missing_features(
-        dataframe,
-        unique_keys={},
-        extra_columns={},
-        missing_value=None):
+        dataframe: pd.DataFrame,
+        unique_keys: Dict[str, str] = {},
+        extra_columns: Dict[str, List[str]] = {},
+        missing_value: Any = None
+    ) -> pd.DataFrame:
     """
     Helper function used to construct a missing feature such as 'transcript'
     or 'gene'. Some GTF files only have 'exon' and 'CDS' entries, but have

diff --git a/gtfparse/read_gtf.py b/gtfparse/read_gtf.py
@@ -12,6 +12,8 @@
 
 import logging
 from os.path import exists
+from typing import Optional, List, Union, Set, Dict, Any, Callable
+from io import StringIO, TextIOWrapper
 
 import polars 
 
@@ -89,10 +91,56 @@
 }
 
 def parse_with_polars_lazy(
-        filepath_or_buffer,
-        split_attributes=True,
-        features=None,
-        fix_quotes_columns=["attribute"]):
+        filepath_or_buffer: Union[str, StringIO, TextIOWrapper],
+        split_attributes: bool = True,
+        features: Optional[List[str]] = None,
+        fix_quotes_columns: List[str] = ["attribute"]
+    ) -> polars.LazyFrame:
+    """
+    Parse a GTF file using Polars lazy evaluation for memory efficiency.
+
+    This function reads a GTF (Gene Transfer Format) file and returns a Polars
+    LazyFrame with the parsed data. The lazy evaluation allows for efficient
+    processing of large files by deferring computation until explicitly requested.
+
+    Parameters
+    ----------
+    filepath_or_buffer : str, StringIO, or TextIOWrapper
+        Path to the GTF file or a file-like buffer object containing GTF data.
+
+    split_attributes : bool, default True
+        If True, splits the attribute column on semicolons and creates an
+        'attribute_split' column containing a list of attribute strings.
+
+    features : list of str, optional
+        If provided, only rows with feature types in this list will be included
+        in the output. If None, all features are included.
+
+    fix_quotes_columns : list of str, default ["attribute"]
+        Column names where quote-related formatting issues should be fixed.
+        This addresses common formatting problems in GTF files like trailing
+        semicolons in quoted values.
+
+    Returns
+    -------
+    polars.LazyFrame
+        A Polars LazyFrame containing the parsed GTF data with the standard
+        GTF columns: seqname, source, feature, start, end, score, strand,
+        frame, and attribute (plus attribute_split if split_attributes=True).
+
+    Raises
+    ------
+    ParsingError
+        If the GTF file doesn't have the expected number of columns.
+
+    Examples
+    --------
+    >>> df_lazy = parse_with_polars_lazy("example.gtf")
+    >>> df = df_lazy.collect()  # Execute the lazy computation
+
+    >>> # Filter for specific features
+    >>> df_lazy = parse_with_polars_lazy("example.gtf", features=["gene", "exon"])
+    """
     # use a global string cache so that all strings get intern'd into
     # a single numbering system
     polars.enable_string_cache()
@@ -139,7 +187,16 @@ def parse_gtf(
         filepath_or_buffer, 
         split_attributes=True, 
         features=None,
-        fix_quotes_columns=["attribute"]):
+        fix_quotes_columns=["attribute"]
+    ) -> polars.DataFrame:
+    """
+    Parse a GTF file and return the result as a Polars DataFrame.
+
+    This function reads a GTF (Gene Transfer Format) file by building on
+    parse_with_polars_lazy, and returns the parsed data as an eager Polars
+    DataFrame rather than a LazyFrame, so no later collection step is needed.
+    """
+
     df_lazy = parse_with_polars_lazy(
         filepath_or_buffer=filepath_or_buffer,
         split_attributes=split_attributes,
@@ -152,9 +209,10 @@ def parse_gtf_pandas(*args, **kwargs):
 
 
 def parse_gtf_and_expand_attributes(
-        filepath_or_buffer,
-        restrict_attribute_columns=None,
-        features=None):
+        filepath_or_buffer: Union[str, StringIO, TextIOWrapper],
+        restrict_attribute_columns: Optional[Union[List[str], Set[str]]] = None,
+        features: Optional[List[str]] = None
+    ) -> polars.DataFrame:
     """
     Parse lines into column->values dictionary and then expand
     the 'attribute' column into multiple columns. This expansion happens
@@ -193,14 +251,15 @@ def parse_gtf_and_expand_attributes(
 
 
 def read_gtf(
-        filepath_or_buffer,
-        expand_attribute_column=True,
-        infer_biotype_column=False,
-        column_converters={},
-        column_cast_types={},
-        usecols=None,
-        features=None,
-        result_type='polars'):
+        filepath_or_buffer: Union[str, StringIO, TextIOWrapper],
+        expand_attribute_column: bool = True,
+        infer_biotype_column: bool = False,
+        column_converters: Dict[str, Callable] = {},
+        column_cast_types: Dict[str, Any] = {},
+        usecols: Optional[List[str]] = None,
+        features: Optional[List[str]] = None,
+        result_type: str = 'polars'
+    ) -> Union[polars.DataFrame, Dict[str, Any]]:
     """
     Parse a GTF into a dictionary mapping column names to sequences of values.
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,27 +1,43 @@
+[build-system]
+requires = ["setuptools>=64", "wheel"]
+build-backend = "setuptools.build_meta"
+
 [project]
 name = "gtfparse"
+version = "2.5.1"
 requires-python = ">=3.7"
-authors = [ {name="Alex Rubinsteyn", email="alex.rubinsteyn@unc.edu" } ]
+authors = [ {name="Alex Rubinsteyn", email="alex.rubinsteyn@unc.edu"} ]
 description = "Parsing library for extracting data frames of genomic features from GTF files"
+readme = "README.md"
+license = {text = "Apache Software License"}
 classifiers = [
-            'Development Status :: 4 - Beta',
-            'Environment :: Console',
-            'Operating System :: OS Independent',
-            'Intended Audience :: Science/Research',
-            'License :: OSI Approved :: Apache Software License',
-            'Programming Language :: Python',
-            'Topic :: Scientific/Engineering :: Bio-Informatics',
+    "Development Status :: 4 - Beta",
+    "Environment :: Console",
+    "Operating System :: OS Independent",
+    "Intended Audience :: Science/Research",
+    "License :: OSI Approved :: Apache Software License",
+    "Programming Language :: Python",
+    "Topic :: Scientific/Engineering :: Bio-Informatics",
+]
+dependencies = [
+    "polars>=0.20.2",
+    "pyarrow>=18.0.0",
+    "pandas>=2.1.0",
 ]
-readme = "README.md"
-dynamic = ["version", "dependencies"]
-
-[tool.setuptools.dynamic]
-version = {attr = "gtfparse.__version__"}
-dependencies = {file = ["requirements.txt"]}
 
-[tool.setuptools]
-packages = ["gtfparse"]
+[project.optional-dependencies]
+dev = [
+    "pytest",
+    "pytest-cov",
+    "flake8",
+    "pylint",
+    "coveralls",
+]
 
 [project.urls]
 "Homepage" = "https://github.com/openvax/gtfparse"
 "Bug Tracker" = "https://github.com/openvax/gtfparse"
+
+[tool.setuptools.packages.find]
+where = ["."]
+include = ["gtfparse*"]
diff --git a/requirements.txt b/requirements.txt