In [1]:
from pathlib import Path
from typing import Optional
from functools import reduce

import typer
import pyranges as pr
import pandas as pd

In [47]:
def add_source(annotation_file: Path) -> pr.PyRanges:

    prf = pr.read_gff(annotation_file)
    samplename = pd.Series(
        data=[annotation_file.parts[-2] for _ in range(len(prf))],
        name="Source"
    )

    return prf.insert(samplename)

In [3]:
def merge_ranges(pr1: pr.PyRanges, pr2: pr.PyRanges) -> pr.PyRanges:

    df1 = pr1.as_df()
    df2 = pr2.as_df()

    merged_df = pd.concat([df1, df2], ignore_index=True, axis=0)

    return pr.PyRanges(df=merged_df)

In [4]:
from glob import glob

In [5]:
help(glob)

Help on function glob in module glob:

glob(pathname, *, recursive=False)
    Return a list of paths matching a pathname pattern.
    
    The pattern may contain simple shell-style wildcards a la
    fnmatch. However, unlike fnmatch, filenames starting with a
    dot are special cases that are not matched by '*' and '?'
    patterns.
    
    If recursive is true, the pattern '**' will match any files and
    zero or more directories and subdirectories.



In [10]:
from os import getcwd

In [11]:
getcwd()

'/home/milo/workspace/cdna_brownie/notebooks'

In [22]:
directories = glob(pathname = "./test_data/bc10[0-9]?")

In [24]:
group_filename = "repaired.sample.group.txt"
gff_filename = "repaired.sample.gff"
count_filename = "repaired.sample.abundance.txt"
fastq_filename = None

In [25]:
directories = [Path(_) for _ in directories]
datafiles = {
        "group": group_filename,
        "annotation": gff_filename,
        "counts": count_filename
    }

if fastq_filename:
    datafiles.update({"sequence": fastq_filename})

In [26]:
for x in datafiles:
        item = [_ for _ in datafiles[x].split(",")]
        if len(item) == 1:
            datafiles[x] = [a.joinpath(item[0]) for a in directories]
        elif len(item) == len(directories):
            datafiles[x] = [a.joinpath(b) for a, b in zip(directories, item)]
        else:
            raise RuntimeError(f"The number of filenames given for {x} does not match the number of directories.")

        # Raise an issue if the given files do not exist
        for i in datafiles[x]:
            if not i.exists():
                raise FileNotFoundError(f"The {x} file {i} cannot be found")

In [51]:
datafiles["annotation"]

[PosixPath('test_data/bc1002/repaired.sample.gff'),
 PosixPath('test_data/bc1018/repaired.sample.gff'),
 PosixPath('test_data/bc1012/repaired.sample.gff'),
 PosixPath('test_data/bc1019/repaired.sample.gff'),
 PosixPath('test_data/bc1004/repaired.sample.gff'),
 PosixPath('test_data/bc1006/repaired.sample.gff'),
 PosixPath('test_data/bc1023/repaired.sample.gff'),
 PosixPath('test_data/bc1020/repaired.sample.gff'),
 PosixPath('test_data/bc1003/repaired.sample.gff'),
 PosixPath('test_data/bc1001/repaired.sample.gff'),
 PosixPath('test_data/bc1005/repaired.sample.gff'),
 PosixPath('test_data/bc1008/repaired.sample.gff')]

In [59]:
annotations = map(add_source, datafiles["annotation"])

In [53]:
datafiles["annotation"][0]

PosixPath('test_data/bc1002/repaired.sample.gff')

In [46]:
Path(datafiles["annotation"][0]).parts[-2]

'bc1002'

In [48]:
bc1002_annotation = add_source(datafiles["annotation"][0])

In [49]:
bc1002_annotation

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,transcript_id
0,chr1,bc1002,transcript,854801,859446,.,+,.,PB.2,PB.2.1
1,chr1,bc1002,exon,854801,859446,.,+,.,PB.2,PB.2.1
2,chr1,bc1002,transcript,2133698,2139623,.,+,.,PB.7,PB.7.1
3,chr1,bc1002,exon,2133698,2138792,.,+,.,PB.7,PB.7.1
4,chr1,bc1002,exon,2139448,2139623,.,+,.,PB.7,PB.7.1
...,...,...,...,...,...,...,...,...,...,...
172888,chrX,bc1002,exon,154364525,154364631,.,-,.,PB.3112,PB.3112.38
172889,chrX,bc1002,transcript,154529886,154533308,.,-,.,PB.3113,PB.3113.1
172890,chrX,bc1002,exon,154529886,154532280,.,-,.,PB.3113,PB.3113.1
172891,chrX,bc1002,exon,154532385,154532462,.,-,.,PB.3113,PB.3113.1


In [60]:
merged_annotation = reduce(lambda i, j: merge_ranges(i, j), annotations)

In [61]:
merged_annotation

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,transcript_id
0,chr1,bc1002,transcript,854801,859446,.,+,.,PB.2,PB.2.1
1,chr1,bc1002,exon,854801,859446,.,+,.,PB.2,PB.2.1
2,chr1,bc1002,transcript,2133698,2139623,.,+,.,PB.7,PB.7.1
3,chr1,bc1002,exon,2133698,2138792,.,+,.,PB.7,PB.7.1
4,chr1,bc1002,exon,2139448,2139623,.,+,.,PB.7,PB.7.1
...,...,...,...,...,...,...,...,...,...,...
2379118,chrY,bc1012,exon,19741317,19741488,.,-,.,PB.3466,PB.3466.5
2379119,chrY,bc1012,exon,19741734,19741857,.,-,.,PB.3466,PB.3466.5
2379120,chrY,bc1012,exon,19743161,19743239,.,-,.,PB.3466,PB.3466.5
2379121,chrY,bc1012,exon,19744384,19744553,.,-,.,PB.3466,PB.3466.5


In [62]:
merged_annotation = merged_annotation.cluster(nb_cpu=6)

In [63]:
merged_annotation

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,transcript_id,Cluster
0,chr1,bc1003,transcript,827658,857173,.,+,.,PB.2,PB.2.1,1
1,chr1,bc1003,exon,827658,827775,.,+,.,PB.2,PB.2.1,1
2,chr1,bc1003,exon,829002,829104,.,+,.,PB.2,PB.2.1,1
3,chr1,bc1004,transcript,833846,838936,.,+,.,PB.3,PB.3.1,1
4,chr1,bc1004,exon,833846,838936,.,+,.,PB.3,PB.3.1,1
...,...,...,...,...,...,...,...,...,...,...,...
2379118,chrY,bc1012,exon,19744670,19744723,.,-,.,PB.3466,PB.3466.4,3836
2379119,chrY,bc1012,exon,19744670,19744723,.,-,.,PB.3466,PB.3466.2,3836
2379120,chrY,bc1012,exon,19744670,19744726,.,-,.,PB.3466,PB.3466.3,3836
2379121,chrY,bc1012,exon,19744670,19744726,.,-,.,PB.3466,PB.3466.1,3836


In [64]:
merged_annotation.subset(lambda x: x.Cluster == 1)

Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,transcript_id,Cluster
0,chr1,bc1003,transcript,827658,857173,.,+,.,PB.2,PB.2.1,1
1,chr1,bc1003,exon,827658,827775,.,+,.,PB.2,PB.2.1,1
2,chr1,bc1003,exon,829002,829104,.,+,.,PB.2,PB.2.1,1
3,chr1,bc1004,transcript,833846,838936,.,+,.,PB.3,PB.3.1,1
4,chr1,bc1004,exon,833846,838936,.,+,.,PB.3,PB.3.1,1
5,chr1,bc1005,transcript,834239,838441,.,+,.,PB.1,PB.1.1,1
6,chr1,bc1005,exon,834239,838441,.,+,.,PB.1,PB.1.1,1
7,chr1,bc1003,exon,851926,852110,.,+,.,PB.2,PB.2.1,1
8,chr1,bc1003,exon,853390,857173,.,+,.,PB.2,PB.2.1,1
9,chr1,bc1002,transcript,854801,859446,.,+,.,PB.2,PB.2.1,1


In [65]:
from multiprocessing import cpu_count

In [66]:
cpu_count()

12