In [1]:
import bioframe as bf
import pandas as pd
import pathlib
from typing import Iterable, List
import numpy as np
import itertools
import re
from natsort import natsort_keygen

In [2]:
def import_gtf(path_to_gtf: pathlib.Path) -> pd.DataFrame:
    """Load the gene records of a GTF file into a table indexed by gene ID.

    Parameters
    ----------
    path_to_gtf
        Location of the GTF file; forwarded unchanged to ``bioframe.read_table``
        (read with ``schema="gtf"`` and ``#`` treated as the comment character).

    Returns
    -------
    pd.DataFrame
        Rows whose ``feature`` column equals ``"gene"``, with ``gene_name`` and
        ``gene_type`` columns parsed out of the ``attributes`` column, the
        ``attributes`` column dropped, and the parsed ``gene_id`` used as the
        (sorted) index.
    """
    df = bf.read_table(path_to_gtf, schema="gtf", comment="#")
    # Take an explicit copy: the columns added below would otherwise be written
    # onto a boolean-mask slice of the full table, which pandas flags with
    # SettingWithCopyWarning.
    df = df[df["feature"] == "gene"].copy()

    for key in ["gene_id", "gene_name", "gene_type"]:
        df[key] = extract_attribute_gtf(df["attributes"], key)

    return df.drop(columns="attributes").set_index("gene_id").sort_index()


def extract_gene_types(dfs: Iterable[pd.DataFrame], gene_type_key="gene_type"):
    """Collect the distinct gene types found across several tables.

    Parameters
    ----------
    dfs
        Tables that each contain a ``gene_type_key`` column.
    gene_type_key
        Name of the column holding the gene type (``"gene_type"`` by default).

    Returns
    -------
    np.ndarray
        Sorted array of the unique values seen in that column over all tables.
    """
    observed = []
    for table in dfs:
        # De-duplicate per table first so the pooled list stays small.
        observed.extend(table[gene_type_key].unique())

    return np.sort(np.unique(observed))


def extract_attribute_gtf(data: pd.Series, key: str) -> pd.DataFrame:
    """Pull the quoted value of one attribute out of GTF attribute strings.

    Each string is searched for ``<key> "<value>";`` and the text between the
    quotes is captured.

    Parameters
    ----------
    data
        Series of attribute strings (e.g. ``gene_id "X"; gene_name "Y";``).
    key
        Attribute name to look up. It is matched literally and only as a whole
        name, so ``gene_id`` does not match inside ``ref_gene_id``.

    Returns
    -------
    pd.DataFrame
        Single-column frame (column label ``0``) aligned with ``data``, as
        produced by ``Series.str.extract``; rows without the attribute hold NaN.
    """
    # re.escape: treat regex metacharacters in the key as plain text.
    # (?<!\w): refuse a match that starts in the middle of a longer attribute name.
    pattern = re.compile(rf'(?<!\w){re.escape(key)} "(.*?)";')

    return data.str.extract(pattern)

In [3]:
# Region strings ("chrom:start-end") used further down to subset the tables
# with bf.select.
coords = [
    "chr8:126338000-128232000",
    "chr10:71200000-73260000",
]

# Differential-expression tables (WT vs T1, then WT vs C1), read in that order.
deg_t1, deg_c1 = (
    pd.read_table(table_path)
    for table_path in (
        "../data/output/diff_expression_analysis/star_salmon_gene/lfc_0.1/MCF10A_WT_vs_MCF10A_T1.de.tsv.gz",
        "../data/output/diff_expression_analysis/star_salmon_gene/lfc_0.1/MCF10A_WT_vs_MCF10A_C1.de.tsv.gz",
    )
)

# Gene records from the annotation GTF, indexed by gene ID.
gtf = import_gtf("../data/input/hg38/hg38_gencode_v43.gtf")

In [4]:
# Columns kept in the exported tables, in output order.
DEG_OUTPUT_COLUMNS = [
    "chrom",
    "start",
    "end",
    "id",
    "gene_name",
    "baseMean",
    "log2FoldChange",
    "lfcSE",
    "svalue",
]


def subset_deg_table(
    deg: pd.DataFrame,
    genes: pd.DataFrame,
    regions: Iterable[str],
    columns: List[str],
) -> pd.DataFrame:
    """Attach gene coordinates to a DE table and restrict it to some regions.

    Parameters
    ----------
    deg
        Differential-expression table with an ``id`` column plus every column
        named in ``columns`` other than the coordinates.
    genes
        Table indexed by gene ID that provides ``chrom``, ``start`` and ``end``.
    regions
        Region strings handed one by one to ``bf.select``.
    columns
        Columns to keep, in the order they should appear.

    Returns
    -------
    pd.DataFrame
        The rows returned by ``bf.select`` for each region, concatenated,
        sorted by ``chrom`` then ``start`` using a natural-sort key, and
        reduced to ``columns``.
    """
    # Default (inner) merge: rows whose id is missing from `genes` are dropped.
    annotated = deg.merge(
        genes[["chrom", "start", "end"]], left_on="id", right_index=True
    )
    selected = pd.concat([bf.select(annotated, region) for region in regions])
    ordered = selected.sort_values(["chrom", "start"], key=natsort_keygen())
    return ordered[list(columns)]


deg_t1 = subset_deg_table(deg_t1, gtf, coords, DEG_OUTPUT_COLUMNS)
deg_c1 = subset_deg_table(deg_c1, gtf, coords, DEG_OUTPUT_COLUMNS)

In [5]:
# Write deg_t1 as tab-separated text, leaving out the row index.
deg_t1.to_csv(
    "/tmp/deg_t1.tsv",
    sep="\t",
    index=False,
)

In [6]:
# Write deg_c1 as tab-separated text, leaving out the row index.
deg_c1.to_csv(
    "/tmp/deg_c1.tsv",
    sep="\t",
    index=False,
)