diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index bccf635..9b17085 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -28,9 +28,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - python -m pip install flake8 pytest pytest-cov coveralls pylint - pip install -r requirements.txt - pip install . + pip install ".[dev]" - name: Lint with flake8 run: | # stop the build if there are Python syntax errors or undefined names diff --git a/.gitignore b/.gitignore index ba74660..af882ae 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ +# Cursor +.cursor/ + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/README.md b/README.md index 8f711ba..222c798 100644 --- a/README.md +++ b/README.md @@ -6,6 +6,7 @@ gtfparse ======== + Parsing tools for GTF (gene transfer format) files. # Example usage diff --git a/gtfparse/attribute_parsing.py b/gtfparse/attribute_parsing.py index 0a2af17..96f6a58 100644 --- a/gtfparse/attribute_parsing.py +++ b/gtfparse/attribute_parsing.py @@ -13,6 +13,7 @@ import logging from collections import OrderedDict from sys import intern +from typing import List, Optional, Union, Any logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) @@ -20,10 +21,11 @@ def expand_attribute_strings( - attribute_strings, - quote_char="'", - missing_value="", - usecols=None): + attribute_strings: List[str], + quote_char: str = "'", + missing_value: Any = "", + usecols: Optional[List[str]] = None + ) -> OrderedDict[str, List[Any]]: """ The last column of GTF has a variable number of key value pairs of the format: "key1 value1; key2 value2;" diff --git a/gtfparse/create_missing_features.py b/gtfparse/create_missing_features.py index 8758a6b..b99d6c5 100644 --- a/gtfparse/create_missing_features.py +++ b/gtfparse/create_missing_features.py @@ -12,6 +12,7 @@ import logging from collections import OrderedDict +from typing import Dict, List, 
Any, Optional, Union import pandas as pd @@ -20,10 +21,11 @@ def create_missing_features( - dataframe, - unique_keys={}, - extra_columns={}, - missing_value=None): + dataframe: pd.DataFrame, + unique_keys: Dict[str, str] = {}, + extra_columns: Dict[str, List[str]] = {}, + missing_value: Any = None + ) -> pd.DataFrame: """ Helper function used to construct a missing feature such as 'transcript' or 'gene'. Some GTF files only have 'exon' and 'CDS' entries, but have diff --git a/gtfparse/read_gtf.py b/gtfparse/read_gtf.py index 23245cb..3b65cc4 100644 --- a/gtfparse/read_gtf.py +++ b/gtfparse/read_gtf.py @@ -12,6 +12,8 @@ import logging from os.path import exists +from typing import Optional, List, Union, Set, Dict, Any, Callable +from io import StringIO, TextIOWrapper import polars @@ -89,10 +91,56 @@ } def parse_with_polars_lazy( - filepath_or_buffer, - split_attributes=True, - features=None, - fix_quotes_columns=["attribute"]): + filepath_or_buffer: Union[str, StringIO, TextIOWrapper], + split_attributes: bool = True, + features: Optional[List[str]] = None, + fix_quotes_columns: List[str] = ["attribute"] + ) -> polars.LazyFrame: + """ + Parse a GTF file using Polars lazy evaluation for memory efficiency. + + This function reads a GTF (Gene Transfer Format) file and returns a Polars + LazyFrame with the parsed data. The lazy evaluation allows for efficient + processing of large files by deferring computation until explicitly requested. + + Parameters + ---------- + filepath_or_buffer : str, StringIO, or TextIOWrapper + Path to the GTF file or a file-like buffer object containing GTF data. + + split_attributes : bool, default True + If True, splits the attribute column on semicolons and creates an + 'attribute_split' column containing a list of attribute strings. + + features : list of str, optional + If provided, only rows with feature types in this list will be included + in the output. If None, all features are included. 
+ + fix_quotes_columns : list of str, default ["attribute"] + Column names where quote-related formatting issues should be fixed. + This addresses common formatting problems in GTF files like trailing + semicolons in quoted values. + + Returns + ------- + polars.LazyFrame + A Polars LazyFrame containing the parsed GTF data with the standard + GTF columns: seqname, source, feature, start, end, score, strand, + frame, and attribute (plus attribute_split if split_attributes=True). + + Raises + ------ + ParsingError + If the GTF file doesn't have the expected number of columns. + + Examples + -------- + >>> df_lazy = parse_with_polars_lazy("example.gtf") + >>> df = df_lazy.collect() # Execute the lazy computation + + >>> # Filter for specific features + >>> df_lazy = parse_with_polars_lazy("example.gtf", features=["gene", "exon"]) + """ # use a global string cache so that all strings get intern'd into # a single numbering system polars.enable_string_cache() @@ -139,7 +187,16 @@ def parse_gtf( filepath_or_buffer, split_attributes=True, features=None, - fix_quotes_columns=["attribute"]): + fix_quotes_columns=["attribute"] + ) -> polars.DataFrame: + """ + Parse a GTF file into a Polars DataFrame. + + This function reads a GTF (Gene Transfer Format) file and returns a Polars + DataFrame with the parsed data. It builds the query with + parse_with_polars_lazy and hands back a DataFrame rather than a LazyFrame. 
+ """ + df_lazy = parse_with_polars_lazy( filepath_or_buffer=filepath_or_buffer, split_attributes=split_attributes, @@ -152,9 +209,10 @@ def parse_gtf_pandas(*args, **kwargs): def parse_gtf_and_expand_attributes( - filepath_or_buffer, - restrict_attribute_columns=None, - features=None): + filepath_or_buffer: Union[str, StringIO, TextIOWrapper], + restrict_attribute_columns: Optional[Union[List[str], Set[str]]] = None, + features: Optional[List[str]] = None + ) -> polars.DataFrame: """ Parse lines into column->values dictionary and then expand the 'attribute' column into multiple columns. This expansion happens @@ -193,14 +251,15 @@ def parse_gtf_and_expand_attributes( def read_gtf( - filepath_or_buffer, - expand_attribute_column=True, - infer_biotype_column=False, - column_converters={}, - column_cast_types={}, - usecols=None, - features=None, - result_type='polars'): + filepath_or_buffer: Union[str, StringIO, TextIOWrapper], + expand_attribute_column: bool = True, + infer_biotype_column: bool = False, + column_converters: Dict[str, Callable] = {}, + column_cast_types: Dict[str, Any] = {}, + usecols: Optional[List[str]] = None, + features: Optional[List[str]] = None, + result_type: str = 'polars' + ) -> Union[polars.DataFrame, Dict[str, Any]]: """ Parse a GTF into a dictionary mapping column names to sequences of values. 
diff --git a/pyproject.toml b/pyproject.toml index b9d8e59..ed638c5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,27 +1,43 @@ +[build-system] +requires = ["setuptools>=64", "wheel"] +build-backend = "setuptools.build_meta" + [project] name = "gtfparse" +version = "2.5.1" requires-python = ">=3.7" -authors = [ {name="Alex Rubinsteyn", email="alex.rubinsteyn@unc.edu" } ] +authors = [ {name="Alex Rubinsteyn", email="alex.rubinsteyn@unc.edu"} ] description = "Parsing library for extracting data frames of genomic features from GTF files" +readme = "README.md" +license = {text = "Apache Software License"} classifiers = [ - 'Development Status :: 4 - Beta', - 'Environment :: Console', - 'Operating System :: OS Independent', - 'Intended Audience :: Science/Research', - 'License :: OSI Approved :: Apache Software License', - 'Programming Language :: Python', - 'Topic :: Scientific/Engineering :: Bio-Informatics', + "Development Status :: 4 - Beta", + "Environment :: Console", + "Operating System :: OS Independent", + "Intended Audience :: Science/Research", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python", + "Topic :: Scientific/Engineering :: Bio-Informatics", +] +dependencies = [ + "polars>=0.20.2", + "pyarrow>=18.0.0", + "pandas>=2.1.0", ] -readme = "README.md" -dynamic = ["version", "dependencies"] - -[tool.setuptools.dynamic] -version = {attr = "gtfparse.__version__"} -dependencies = {file = ["requirements.txt"]} -[tool.setuptools] -packages = ["gtfparse"] +[project.optional-dependencies] +dev = [ + "pytest", + "pytest-cov", + "flake8", + "pylint", + "coveralls", +] [project.urls] "Homepage" = "https://github.com/openvax/gtfparse" "Bug Tracker" = "https://github.com/openvax/gtfparse" + +[tool.setuptools.packages.find] +where = ["."] +include = ["gtfparse*"] diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index 918ae96..0000000 --- a/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ 
-polars>=0.20.2 -pyarrow>=18.0.0 -pandas>=2.1.0