From 83b7d06a499df596e88f3b8d0eea4854bbfadcc2 Mon Sep 17 00:00:00 2001 From: Nick Youngblut Date: Tue, 27 May 2025 13:03:28 -0700 Subject: [PATCH 1/5] Update pyarrow version and add build system --- pyproject.toml | 4 ++++ requirements.txt | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index b9d8e59..9d0ba80 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -22,6 +22,10 @@ dependencies = {file = ["requirements.txt"]} [tool.setuptools] packages = ["gtfparse"] +[build-system] +requires = ["setuptools>=61", "wheel"] +build-backend = "setuptools.build_meta" + [project.urls] "Homepage" = "https://github.com/openvax/gtfparse" "Bug Tracker" = "https://github.com/openvax/gtfparse" diff --git a/requirements.txt b/requirements.txt index 918ae96..87f2aa2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ polars>=0.20.2 -pyarrow>=18.0.0 +pyarrow>=16.0.0 pandas>=2.1.0 From e5f72915646c297f438937f9e3edfd3e52eb48bc Mon Sep 17 00:00:00 2001 From: nick-youngblut Date: Tue, 27 May 2025 13:26:09 -0700 Subject: [PATCH 2/5] Update project configuration and dependencies - Added .cursor/ to .gitignore. - Updated pyproject.toml to include build-system, version, optional dependencies, and refined classifiers. - Removed requirements.txt and adjusted GitHub Actions workflow to install development dependencies directly from pyproject.toml. 
--- .github/workflows/tests.yml | 4 +--- .gitignore | 3 +++ pyproject.toml | 48 ++++++++++++++++++++++++------------- requirements.txt | 3 --- 4 files changed, 36 insertions(+), 22 deletions(-) delete mode 100644 requirements.txt diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index bccf635..9b17085 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -28,9 +28,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - python -m pip install flake8 pytest pytest-cov coveralls pylint - pip install -r requirements.txt - pip install . + pip install ".[dev]" - name: Lint with flake8 run: | # stop the build if there are Python syntax errors or undefined names diff --git a/.gitignore b/.gitignore index ba74660..af882ae 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ +# Cursor +.cursor/ + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/pyproject.toml b/pyproject.toml index b9d8e59..29f0bef 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,27 +1,43 @@ +[build-system] +requires = ["setuptools>=64", "wheel"] +build-backend = "setuptools.build_meta" + [project] name = "gtfparse" +version = "2.5.0" requires-python = ">=3.7" -authors = [ {name="Alex Rubinsteyn", email="alex.rubinsteyn@unc.edu" } ] +authors = [ {name="Alex Rubinsteyn", email="alex.rubinsteyn@unc.edu"} ] description = "Parsing library for extracting data frames of genomic features from GTF files" +readme = "README.md" +license = {text = "Apache Software License"} classifiers = [ - 'Development Status :: 4 - Beta', - 'Environment :: Console', - 'Operating System :: OS Independent', - 'Intended Audience :: Science/Research', - 'License :: OSI Approved :: Apache Software License', - 'Programming Language :: Python', - 'Topic :: Scientific/Engineering :: Bio-Informatics', + "Development Status :: 4 - Beta", + "Environment :: Console", + "Operating System :: OS Independent", + "Intended Audience :: 
Science/Research", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python", + "Topic :: Scientific/Engineering :: Bio-Informatics", +] +dependencies = [ + "polars>=0.20.2", + "pyarrow>=18.0.0", + "pandas>=2.1.0", ] -readme = "README.md" -dynamic = ["version", "dependencies"] - -[tool.setuptools.dynamic] -version = {attr = "gtfparse.__version__"} -dependencies = {file = ["requirements.txt"]} -[tool.setuptools] -packages = ["gtfparse"] +[project.optional-dependencies] +dev = [ + "pytest", + "pytest-cov", + "flake8", + "pylint", + "coveralls", +] [project.urls] "Homepage" = "https://github.com/openvax/gtfparse" "Bug Tracker" = "https://github.com/openvax/gtfparse" + +[tool.setuptools.packages.find] +where = ["."] +include = ["gtfparse*"] diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 918ae96..0000000 --- a/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -polars>=0.20.2 -pyarrow>=18.0.0 -pandas>=2.1.0 From 66a6d2c7b917fcc47cf504b7a8482a113ae0c0d8 Mon Sep 17 00:00:00 2001 From: nick-youngblut Date: Tue, 27 May 2025 13:26:45 -0700 Subject: [PATCH 3/5] bumped version --- README.md | 1 + pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 8f711ba..222c798 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,7 @@ gtfparse ======== + Parsing tools for GTF (gene transfer format) files. 
# Example usage diff --git a/pyproject.toml b/pyproject.toml index 29f0bef..ed638c5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "gtfparse" -version = "2.5.0" +version = "2.5.1" requires-python = ">=3.7" authors = [ {name="Alex Rubinsteyn", email="alex.rubinsteyn@unc.edu"} ] description = "Parsing library for extracting data frames of genomic features from GTF files" From 27121bfa9998ceac9dfc263e859d75dc9c03c4fd Mon Sep 17 00:00:00 2001 From: nick-youngblut Date: Tue, 27 May 2025 15:46:23 -0700 Subject: [PATCH 4/5] Added type hints --- gtfparse/attribute_parsing.py | 10 ++-- gtfparse/create_missing_features.py | 10 ++-- gtfparse/read_gtf.py | 91 ++++++++++++++++++++++++----- 3 files changed, 87 insertions(+), 24 deletions(-) diff --git a/gtfparse/attribute_parsing.py b/gtfparse/attribute_parsing.py index 0a2af17..96f6a58 100644 --- a/gtfparse/attribute_parsing.py +++ b/gtfparse/attribute_parsing.py @@ -13,6 +13,7 @@ import logging from collections import OrderedDict from sys import intern +from typing import List, Optional, Union, Any logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -20,10 +21,11 @@ def expand_attribute_strings( - attribute_strings, - quote_char="'", - missing_value="", - usecols=None): + attribute_strings: List[str], + quote_char: str = "'", + missing_value: Any = "", + usecols: Optional[List[str]] = None + ) -> OrderedDict[str, List[Any]]: """ The last column of GTF has a variable number of key value pairs of the format: "key1 value1; key2 value2;" diff --git a/gtfparse/create_missing_features.py b/gtfparse/create_missing_features.py index 8758a6b..b99d6c5 100644 --- a/gtfparse/create_missing_features.py +++ b/gtfparse/create_missing_features.py @@ -12,6 +12,7 @@ import logging from collections import OrderedDict +from typing import Dict, List, Any, Optional, Union import pandas as pd @@ -20,10 +21,11 @@ def create_missing_features( - 
dataframe, - unique_keys={}, - extra_columns={}, - missing_value=None): + dataframe: pd.DataFrame, + unique_keys: Dict[str, str] = {}, + extra_columns: Dict[str, List[str]] = {}, + missing_value: Any = None + ) -> pd.DataFrame: """ Helper function used to construct a missing feature such as 'transcript' or 'gene'. Some GTF files only have 'exon' and 'CDS' entries, but have diff --git a/gtfparse/read_gtf.py b/gtfparse/read_gtf.py index 23245cb..3b65cc4 100644 --- a/gtfparse/read_gtf.py +++ b/gtfparse/read_gtf.py @@ -12,6 +12,8 @@ import logging from os.path import exists +from typing import Optional, List, Union, Set, Dict, Any, Callable +from io import StringIO, TextIOWrapper import polars @@ -89,10 +91,56 @@ } def parse_with_polars_lazy( - filepath_or_buffer, - split_attributes=True, - features=None, - fix_quotes_columns=["attribute"]): + filepath_or_buffer: Union[str, StringIO, TextIOWrapper], + split_attributes: bool = True, + features: Optional[List[str]] = None, + fix_quotes_columns: List[str] = ["attribute"] + ) -> polars.LazyFrame: + """ + Parse a GTF file using Polars lazy evaluation for memory efficiency. + + This function reads a GTF (Gene Transfer Format) file and returns a Polars + LazyFrame with the parsed data. The lazy evaluation allows for efficient + processing of large files by deferring computation until explicitly requested. + + Parameters + ---------- + filepath_or_buffer : str, StringIO, or TextIOWrapper + Path to the GTF file or a file-like buffer object containing GTF data. + + split_attributes : bool, default True + If True, splits the attribute column on semicolons and creates an + 'attribute_split' column containing a list of attribute strings. + + features : list of str, optional + If provided, only rows with feature types in this list will be included + in the output. If None, all features are included. + + fix_quotes_columns : list of str, default ["attribute"] + Column names where quote-related formatting issues should be fixed. 
+ This addresses common formatting problems in GTF files like trailing + semicolons in quoted values. + + Returns + ------- + polars.LazyFrame + A Polars LazyFrame containing the parsed GTF data with the standard + GTF columns: seqname, source, feature, start, end, score, strand, + frame, and attribute (plus attribute_split if split_attributes=True). + + Raises + ------ + ParsingError + If the GTF file doesn't have the expected number of columns. + + Examples + -------- + >>> df_lazy = parse_with_polars_lazy("example.gtf") + >>> df = df_lazy.collect() # Execute the lazy computation + + >>> # Filter for specific features + >>> df_lazy = parse_with_polars_lazy("example.gtf", features=["gene", "exon"]) + """ # use a global string cache so that all strings get intern'd into # a single numbering system polars.enable_string_cache() @@ -139,7 +187,16 @@ def parse_gtf( filepath_or_buffer, split_attributes=True, features=None, - fix_quotes_columns=["attribute"]): + fix_quotes_columns=["attribute"] + ) -> polars.DataFrame: + """ + Parse a GTF file and return the result as a Polars DataFrame. + + This function reads a GTF (Gene Transfer Format) file by building the + query with parse_with_polars_lazy and returns a Polars DataFrame (not a + LazyFrame) containing the parsed data; see parse_with_polars_lazy for the arguments. + """ + df_lazy = parse_with_polars_lazy( filepath_or_buffer=filepath_or_buffer, split_attributes=split_attributes, @@ -152,9 +209,10 @@ def parse_gtf_pandas(*args, **kwargs): def parse_gtf_and_expand_attributes( - filepath_or_buffer, - restrict_attribute_columns=None, - features=None): + filepath_or_buffer: Union[str, StringIO, TextIOWrapper], + restrict_attribute_columns: Optional[Union[List[str], Set[str]]] = None, + features: Optional[List[str]] = None + ) -> polars.DataFrame: """ Parse lines into column->values dictionary and then expand the 'attribute' column into multiple columns. 
This expansion happens @@ -193,14 +251,15 @@ def parse_gtf_and_expand_attributes( def read_gtf( - filepath_or_buffer, - expand_attribute_column=True, - infer_biotype_column=False, - column_converters={}, - column_cast_types={}, - usecols=None, - features=None, - result_type='polars'): + filepath_or_buffer: Union[str, StringIO, TextIOWrapper], + expand_attribute_column: bool = True, + infer_biotype_column: bool = False, + column_converters: Dict[str, Callable] = {}, + column_cast_types: Dict[str, Any] = {}, + usecols: Optional[List[str]] = None, + features: Optional[List[str]] = None, + result_type: str = 'polars' + ) -> Union[polars.DataFrame, Dict[str, Any]]: """ Parse a GTF into a dictionary mapping column names to sequences of values. From d26818992c45a8ecbcf6caa7cc493e0e42b5804c Mon Sep 17 00:00:00 2001 From: nick-youngblut Date: Tue, 27 May 2025 15:50:08 -0700 Subject: [PATCH 5/5] fixed build system duplication --- pyproject.toml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 522dd24..ed638c5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,10 +34,6 @@ dev = [ "coveralls", ] -[build-system] -requires = ["setuptools>=61", "wheel"] -build-backend = "setuptools.build_meta" - [project.urls] "Homepage" = "https://github.com/openvax/gtfparse" "Bug Tracker" = "https://github.com/openvax/gtfparse"