From 83b7d06a499df596e88f3b8d0eea4854bbfadcc2 Mon Sep 17 00:00:00 2001
From: Nick Youngblut <nyoungb2@gmail.com>
Date: Tue, 27 May 2025 13:03:28 -0700
Subject: [PATCH 1/5] Lower minimum pyarrow version to 16.0.0 and add build system

---
 pyproject.toml   | 4 ++++
 requirements.txt | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index b9d8e59..9d0ba80 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -22,6 +22,10 @@ dependencies = {file = ["requirements.txt"]}
 [tool.setuptools]
 packages = ["gtfparse"]
 
+[build-system]
+requires = ["setuptools>=61", "wheel"]
+build-backend = "setuptools.build_meta"
+
 [project.urls]
 "Homepage" = "https://github.com/openvax/gtfparse"
 "Bug Tracker" = "https://github.com/openvax/gtfparse"
diff --git a/requirements.txt b/requirements.txt
index 918ae96..87f2aa2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,3 @@
 polars>=0.20.2
-pyarrow>=18.0.0
+pyarrow>=16.0.0
 pandas>=2.1.0

From e5f72915646c297f438937f9e3edfd3e52eb48bc Mon Sep 17 00:00:00 2001
From: nick-youngblut <nyoungb2@gmail.com>
Date: Tue, 27 May 2025 13:26:09 -0700
Subject: [PATCH 2/5] Update project configuration and dependencies

- Added .cursor/ to .gitignore.
- Updated pyproject.toml to include build-system, version, optional dependencies, and refined classifiers.
- Removed requirements.txt and adjusted GitHub Actions workflow to install development dependencies directly from pyproject.toml.
---
 .github/workflows/tests.yml |  4 +---
 .gitignore                  |  3 +++
 pyproject.toml              | 48 ++++++++++++++++++++++++-------------
 requirements.txt            |  3 ---
 4 files changed, 36 insertions(+), 22 deletions(-)
 delete mode 100644 requirements.txt

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index bccf635..9b17085 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -28,9 +28,7 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          python -m pip install flake8 pytest pytest-cov coveralls pylint
-          pip install -r requirements.txt
-          pip install .
+          pip install ".[dev]"
       - name: Lint with flake8
         run: |
           # stop the build if there are Python syntax errors or undefined names
diff --git a/.gitignore b/.gitignore
index ba74660..af882ae 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,6 @@
+# Cursor
+.cursor/
+
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
diff --git a/pyproject.toml b/pyproject.toml
index b9d8e59..29f0bef 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,27 +1,43 @@
+[build-system]
+requires = ["setuptools>=64", "wheel"]
+build-backend = "setuptools.build_meta"
+
 [project]
 name = "gtfparse"
+version = "2.5.0"
 requires-python = ">=3.7"
-authors = [ {name="Alex Rubinsteyn", email="alex.rubinsteyn@unc.edu" } ]
+authors = [ {name="Alex Rubinsteyn", email="alex.rubinsteyn@unc.edu"} ]
 description = "Parsing library for extracting data frames of genomic features from GTF files"
+readme = "README.md"
+license = {text = "Apache Software License"}
 classifiers = [
-            'Development Status :: 4 - Beta',
-            'Environment :: Console',
-            'Operating System :: OS Independent',
-            'Intended Audience :: Science/Research',
-            'License :: OSI Approved :: Apache Software License',
-            'Programming Language :: Python',
-            'Topic :: Scientific/Engineering :: Bio-Informatics',
+    "Development Status :: 4 - Beta",
+    "Environment :: Console",
+    "Operating System :: OS Independent",
+    "Intended Audience :: Science/Research",
+    "License :: OSI Approved :: Apache Software License",
+    "Programming Language :: Python",
+    "Topic :: Scientific/Engineering :: Bio-Informatics",
+]
+dependencies = [
+    "polars>=0.20.2",
+    "pyarrow>=18.0.0",
+    "pandas>=2.1.0",
 ]
-readme = "README.md"
-dynamic = ["version", "dependencies"]
-
-[tool.setuptools.dynamic]
-version = {attr = "gtfparse.__version__"}
-dependencies = {file = ["requirements.txt"]}
 
-[tool.setuptools]
-packages = ["gtfparse"]
+[project.optional-dependencies]
+dev = [
+    "pytest",
+    "pytest-cov",
+    "flake8",
+    "pylint",
+    "coveralls",
+]
 
 [project.urls]
 "Homepage" = "https://github.com/openvax/gtfparse"
 "Bug Tracker" = "https://github.com/openvax/gtfparse"
+
+[tool.setuptools.packages.find]
+where = ["."]
+include = ["gtfparse*"]
diff --git a/requirements.txt b/requirements.txt
deleted file mode 100644
index 918ae96..0000000
--- a/requirements.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-polars>=0.20.2
-pyarrow>=18.0.0
-pandas>=2.1.0

From 66a6d2c7b917fcc47cf504b7a8482a113ae0c0d8 Mon Sep 17 00:00:00 2001
From: nick-youngblut <nyoungb2@gmail.com>
Date: Tue, 27 May 2025 13:26:45 -0700
Subject: [PATCH 3/5] bumped version

---
 README.md      | 1 +
 pyproject.toml | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 8f711ba..222c798 100644
--- a/README.md
+++ b/README.md
@@ -6,6 +6,7 @@
 
 gtfparse
 ========
+
 Parsing tools for GTF (gene transfer format) files.
 
 # Example usage
diff --git a/pyproject.toml b/pyproject.toml
index 29f0bef..ed638c5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "gtfparse"
-version = "2.5.0"
+version = "2.5.1"
 requires-python = ">=3.7"
 authors = [ {name="Alex Rubinsteyn", email="alex.rubinsteyn@unc.edu"} ]
 description = "Parsing library for extracting data frames of genomic features from GTF files"

From 27121bfa9998ceac9dfc263e859d75dc9c03c4fd Mon Sep 17 00:00:00 2001
From: nick-youngblut <nyoungb2@gmail.com>
Date: Tue, 27 May 2025 15:46:23 -0700
Subject: [PATCH 4/5] Added type hints

---
 gtfparse/attribute_parsing.py       | 10 ++--
 gtfparse/create_missing_features.py | 10 ++--
 gtfparse/read_gtf.py                | 91 ++++++++++++++++++++++++-----
 3 files changed, 87 insertions(+), 24 deletions(-)

diff --git a/gtfparse/attribute_parsing.py b/gtfparse/attribute_parsing.py
index 0a2af17..96f6a58 100644
--- a/gtfparse/attribute_parsing.py
+++ b/gtfparse/attribute_parsing.py
@@ -13,6 +13,7 @@
 import logging
 from collections import OrderedDict
 from sys import intern
+from typing import List, Optional, Union, Any
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -20,10 +21,11 @@
 
 
 def expand_attribute_strings(
-        attribute_strings,
-        quote_char="'",
-        missing_value="",
-        usecols=None):
+        attribute_strings: List[str],
+        quote_char: str = "'",
+        missing_value: Any = "",
+        usecols: Optional[List[str]] = None
+    ) -> OrderedDict[str, List[Any]]:
     """
     The last column of GTF has a variable number of key value pairs
     of the format: "key1 value1; key2 value2;"
diff --git a/gtfparse/create_missing_features.py b/gtfparse/create_missing_features.py
index 8758a6b..b99d6c5 100644
--- a/gtfparse/create_missing_features.py
+++ b/gtfparse/create_missing_features.py
@@ -12,6 +12,7 @@
 
 import logging
 from collections import OrderedDict
+from typing import Dict, List, Any, Optional, Union
 
 import pandas as pd
 
@@ -20,10 +21,11 @@
 
 
 def create_missing_features(
-        dataframe,
-        unique_keys={},
-        extra_columns={},
-        missing_value=None):
+        dataframe: pd.DataFrame,
+        unique_keys: Dict[str, str] = {},
+        extra_columns: Dict[str, List[str]] = {},
+        missing_value: Any = None
+    ) -> pd.DataFrame:
     """
     Helper function used to construct a missing feature such as 'transcript'
     or 'gene'. Some GTF files only have 'exon' and 'CDS' entries, but have
diff --git a/gtfparse/read_gtf.py b/gtfparse/read_gtf.py
index 23245cb..3b65cc4 100644
--- a/gtfparse/read_gtf.py
+++ b/gtfparse/read_gtf.py
@@ -12,6 +12,8 @@
 
 import logging
 from os.path import exists
+from typing import Optional, List, Union, Set, Dict, Any, Callable
+from io import StringIO, TextIOWrapper
 
 import polars 
 
@@ -89,10 +91,56 @@
 }
 
 def parse_with_polars_lazy(
-        filepath_or_buffer,
-        split_attributes=True,
-        features=None,
-        fix_quotes_columns=["attribute"]):
+        filepath_or_buffer: Union[str, StringIO, TextIOWrapper],
+        split_attributes: bool = True,
+        features: Optional[List[str]] = None,
+        fix_quotes_columns: List[str] = ["attribute"]
+    ) -> polars.LazyFrame:
+    """
+    Parse a GTF file using Polars lazy evaluation for memory efficiency.
+    
+    This function reads a GTF (Gene Transfer Format) file and returns a Polars
+    LazyFrame with the parsed data. The lazy evaluation allows for efficient
+    processing of large files by deferring computation until explicitly requested.
+    
+    Parameters
+    ----------
+    filepath_or_buffer : str, StringIO, or TextIOWrapper
+        Path to the GTF file or a file-like buffer object containing GTF data.
+        
+    split_attributes : bool, default True
+        If True, splits the attribute column on semicolons and creates an
+        'attribute_split' column containing a list of attribute strings.
+        
+    features : list of str, optional
+        If provided, only rows with feature types in this list will be included
+        in the output. If None, all features are included.
+        
+    fix_quotes_columns : list of str, default ["attribute"]
+        Column names where quote-related formatting issues should be fixed.
+        This addresses common formatting problems in GTF files like trailing
+        semicolons in quoted values.
+        
+    Returns
+    -------
+    polars.LazyFrame
+        A Polars LazyFrame containing the parsed GTF data with the standard
+        GTF columns: seqname, source, feature, start, end, score, strand,
+        frame, and attribute (plus attribute_split if split_attributes=True).
+        
+    Raises
+    ------
+    ParsingError
+        If the GTF file doesn't have the expected number of columns.
+        
+    Examples
+    --------
+    >>> df_lazy = parse_with_polars_lazy("example.gtf")
+    >>> df = df_lazy.collect()  # Execute the lazy computation
+    
+    >>> # Filter for specific features
+    >>> df_lazy = parse_with_polars_lazy("example.gtf", features=["gene", "exon"])
+    """
     # use a global string cache so that all strings get intern'd into
     # a single numbering system
     polars.enable_string_cache()
@@ -139,7 +187,16 @@ def parse_gtf(
         filepath_or_buffer, 
         split_attributes=True, 
         features=None,
-        fix_quotes_columns=["attribute"]):
+        fix_quotes_columns=["attribute"]
+    ) -> polars.DataFrame:
+    """
+    Parse a GTF file and return its contents as a Polars DataFrame.
+
+    This is the eager counterpart of `parse_with_polars_lazy`: it builds the
+    same lazy query from the same arguments and returns the evaluated result
+    as a DataFrame rather than handing back a LazyFrame.
+    """
+    
     df_lazy = parse_with_polars_lazy(
         filepath_or_buffer=filepath_or_buffer,
         split_attributes=split_attributes,
@@ -152,9 +209,10 @@ def parse_gtf_pandas(*args, **kwargs):
 
     
 def parse_gtf_and_expand_attributes(
-        filepath_or_buffer,
-        restrict_attribute_columns=None,
-        features=None):
+        filepath_or_buffer: Union[str, StringIO, TextIOWrapper],
+        restrict_attribute_columns: Optional[Union[List[str], Set[str]]] = None,
+        features: Optional[List[str]] = None
+    ) -> polars.DataFrame:
     """
     Parse lines into column->values dictionary and then expand
     the 'attribute' column into multiple columns. This expansion happens
@@ -193,14 +251,15 @@ def parse_gtf_and_expand_attributes(
     
 
 def read_gtf(
-        filepath_or_buffer,
-        expand_attribute_column=True,
-        infer_biotype_column=False,
-        column_converters={},
-        column_cast_types={},
-        usecols=None,
-        features=None,
-        result_type='polars'):
+        filepath_or_buffer: Union[str, StringIO, TextIOWrapper],
+        expand_attribute_column: bool = True,
+        infer_biotype_column: bool = False,
+        column_converters: Dict[str, Callable] = {},
+        column_cast_types: Dict[str, Any] = {},
+        usecols: Optional[List[str]] = None,
+        features: Optional[List[str]] = None,
+        result_type: str = 'polars'
+    ) -> Union[polars.DataFrame, Dict[str, Any]]:
     """
     Parse a GTF into a dictionary mapping column names to sequences of values.
 

From d26818992c45a8ecbcf6caa7cc493e0e42b5804c Mon Sep 17 00:00:00 2001
From: nick-youngblut <nyoungb2@gmail.com>
Date: Tue, 27 May 2025 15:50:08 -0700
Subject: [PATCH 5/5] fixed build system duplication

---
 pyproject.toml | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 522dd24..ed638c5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -34,10 +34,6 @@ dev = [
     "coveralls",
 ]
 
-[build-system]
-requires = ["setuptools>=61", "wheel"]
-build-backend = "setuptools.build_meta"
-
 [project.urls]
 "Homepage" = "https://github.com/openvax/gtfparse"
 "Bug Tracker" = "https://github.com/openvax/gtfparse"