In [19]:
import ast
import re
from dataclasses import dataclass
from enum import Enum
from typing import Union

In [23]:
class AccessionType(Enum):
    """Accession categories; each member's value is the compiled regex that recognises it."""

    BIOPROJECT = re.compile(r"PRJ([EDN])[A-Z]\d+")
    STUDY = re.compile(r"([EDS])RP\d{6,}")
    BIOSAMPLE = re.compile(r"SAM([EDN])[A-Z]?\d+")
    SAMPLE = re.compile(r"([EDS])RS\d{6,}")
    EXPERIMENT = re.compile(r"([EDS])RX\d{6,}")
    RUN = re.compile(r"([EDS])RR\d{6,}")

    @staticmethod
    def from_str(s: str) -> Union["AccessionType", None]:
        """Classify a string by the first member whose pattern occurs in it.

        Members are tried in definition order. Matching uses ``search``, so the
        pattern may occur anywhere inside ``s``; it does not have to span the
        whole string.

        Args:
            s: Text to classify.

        Returns:
            The first member whose pattern is found in ``s``, or ``None`` when
            no pattern is found.
        """
        return next(
            (member for member in AccessionType if member.value.search(s)),
            None,
        )


@dataclass
class Accession:
    """Holds one accession string per category; an empty string means "not set"."""

    # Declaration order is also the positional order of the generated __init__.
    run: str = ""
    sample: str = ""
    biosample: str = ""
    experiment: str = ""
    bioproject: str = ""
    study: str = ""

    @staticmethod
    def from_line(s: str, delim: str = ",") -> "Accession":
        """Build an Accession from a delimited line of text.

        Trailing whitespace is stripped, the line is split on ``delim``, and
        every non-empty field that ``AccessionType.from_str`` recognises is
        stored under the matching attribute. Unrecognised fields are ignored;
        when several fields share a category, the last one wins.

        Args:
            s: The delimited line.
            delim: Field separator.

        Returns:
            A new Accession populated from the recognised fields.
        """
        attr_for_type = {
            AccessionType.BIOSAMPLE: "biosample",
            AccessionType.RUN: "run",
            AccessionType.EXPERIMENT: "experiment",
            AccessionType.SAMPLE: "sample",
            AccessionType.STUDY: "study",
            AccessionType.BIOPROJECT: "bioproject",
        }
        parsed = Accession()
        for token in s.rstrip().split(delim):
            if not token:
                continue
            # An unrecognised token classifies as None, which has no entry here.
            attr = attr_for_type.get(AccessionType.from_str(token))
            if attr is not None:
                setattr(parsed, attr, token)
        return parsed

    def most_specific(self) -> Union[tuple[str, AccessionType], tuple[str, None]]:
        """Return the first non-empty accession in priority order, with its type.

        Priority is run, experiment, biosample, sample, bioproject, study.

        Returns:
            ``(accession, type)`` for the first non-empty attribute, or
            ``("", None)`` when every attribute is empty.
        """
        ranked = (
            (self.run, AccessionType.RUN),
            (self.experiment, AccessionType.EXPERIMENT),
            (self.biosample, AccessionType.BIOSAMPLE),
            (self.sample, AccessionType.SAMPLE),
            (self.bioproject, AccessionType.BIOPROJECT),
            (self.study, AccessionType.STUDY),
        )
        for value, kind in ranked:
            if value:
                return value, kind
        return "", None

    def to_row(self, delim: str = ",") -> str:
        """Join the six attributes with ``delim``.

        Column order is bioproject, study, biosample, sample, experiment, run.
        """
        columns = (
            self.bioproject,
            self.study,
            self.biosample,
            self.sample,
            self.experiment,
            self.run,
        )
        return delim.join(columns)

In [24]:
# Collect the accession recorded after every "No results" marker in the log file.
no_results_accs = set()
with open("fill_who_sheet.o") as log:
    for entry in log:
        if not entry.startswith("No results"):
            continue
        # The line right after the marker is parsed as a Python literal and indexed
        # by "accession"; pulling it with next() also moves the outer loop past it.
        record = ast.literal_eval(next(log).strip())
        no_results_accs.add(record["accession"])

In [25]:
# Number of distinct accessions collected in no_results_accs.
len(no_results_accs)

179

In [36]:
# Collect every "no results" accession that appears in a samplesheet row whose
# most specific accession is not a run.
to_delete = set()
with open("who-samplesheet-filled.csv") as fp:
    # The header row fixes the column count every later row must have.
    n_fields = len(next(fp).split(","))
    for row in map(str.rstrip, fp):
        assert n_fields == len(row.split(","))

        # NOTE(review): this is a plain substring test, so an accession that is a
        # prefix of a longer one in the row also counts as present — confirm intended.
        hits = [a for a in no_results_accs if a in row]
        if not hits:
            continue

        # The row's most specific accession does not depend on which accession
        # matched, so parse the row once instead of once per hit.
        _, most_specific_type = Accession.from_line(row).most_specific()
        if most_specific_type is not AccessionType.RUN:
            to_delete.update(hits)

In [37]:
# Number of distinct accessions collected in to_delete.
len(to_delete)

141

In [39]:
# Write one accession per line. Iterating the set directly yields an arbitrary
# order, so sort the stripped values to make the file identical from run to run.
with open("to_delete.txt", "w") as fp:
    for a in sorted(map(str.strip, to_delete)):
        print(a, file=fp)