In [27]:
import csv
import math
from collections import Counter
from dataclasses import dataclass
from enum import Enum
from typing import Optional, Tuple, List

import numpy as np
import pandas as pd

In [7]:
from pathlib import Path

# Recursively gather every CSV file found beneath the results directory.
table_files = [
    *Path("../../results/mutation_concordance/").rglob("*.csv"),
]

In [32]:
class Calls(Enum):
    """Enumeration of the call tokens understood by this notebook.

    Each member's value is the string token it is constructed from, e.g.
    ``Calls("REF")`` yields ``Calls.Ref``.
    """

    Ref = "REF"  # presumably "reference allele called" -- confirm
    Alt = "ALT"  # presumably "alternate allele called" -- confirm
    Null = "NULL"  # presumably "no call could be made" -- confirm
    # NOTE(review): the member is named Minor but its token is "HET";
    # looks like a minor/heterozygous call -- confirm.
    Minor = "HET"
    Filtered = "FILT"  # Classifier.classify returns None when either call is this

class Classification(Enum):
    """Outcome of comparing one call against another (TP / FP / TN / FN)."""

    TruePositive = "TP"
    FalsePositive = "FP"
    TrueNegative = "TN"
    FalseNegative = "FN"

    def __str__(self) -> str:
        """Render the member as its two-letter abbreviation."""
        return self.value

    def __lt__(self, other):
        """Order by string form, which makes members sortable."""
        mine, theirs = str(self), str(other)
        return mine < theirs

    @staticmethod
    def from_pair(y: Calls, y_hat: Calls) -> "Classification":
        """Look up the classification for a pair of calls.

        ``y`` is the call treated as the truth and ``y_hat`` the call being
        evaluated against it (e.g. ``(Ref, Alt)`` is a false positive).

        Raises:
            KeyError: if the pair is not in the table below, e.g. any pair
                involving ``Calls.Filtered`` or with ``Calls.Minor`` as
                ``y_hat``.
        """
        ref, alt, null, minor = Calls.Ref, Calls.Alt, Calls.Null, Calls.Minor
        tp = Classification.TruePositive
        fp = Classification.FalsePositive
        tn = Classification.TrueNegative
        fn = Classification.FalseNegative

        # Keys are (y, y_hat), grouped by the outcome they map to.
        outcomes = {
            (alt, alt): tp,
            (minor, alt): tp,
            (ref, ref): tn,
            (null, null): tn,
            (ref, alt): fp,
            (null, alt): fp,
            (alt, ref): fn,
            (alt, null): fn,
            (minor, ref): fn,
            (minor, null): fn,
            (null, ref): fn,
            (ref, null): fn,
        }
        return outcomes[y, y_hat]
        
    
class Classifier:
    """Classify a pair of call strings, optionally remapping HET/NULL first."""

    def __init__(self, treat_minor_as: str = "HET", treat_null_as: str="NULL"):
        """Store the tokens that "HET" and "NULL" calls are rewritten to.

        With the defaults both tokens map to themselves, i.e. no remapping.
        """
        self.minor = treat_minor_as
        self.null = treat_null_as

    def convert(self, call: str) -> str:
        """Return the configured replacement for "HET"/"NULL", else ``call``."""
        remapped = {"HET": self.minor, "NULL": self.null}
        return remapped.get(call, call)

    def classify(self, illumina_call: str, nanopore_call: str) -> Optional[Classification]:
        """Classify the nanopore call against the illumina call.

        Both strings are remapped with ``convert`` and parsed into ``Calls``
        (an unknown token makes ``Calls(...)`` raise ``ValueError``).

        Returns:
            ``None`` when either call is ``Calls.Filtered``; otherwise the
            result of ``Classification.from_pair(illumina, nanopore)``.
        """
        truth = Calls(self.convert(illumina_call))
        query = Calls(self.convert(nanopore_call))
        if truth is Calls.Filtered or query is Calls.Filtered:
            return None
        return Classification.from_pair(truth, query)
        
@dataclass
class ConfusionMatrix:
    """Counts of a binary confusion matrix plus metrics derived from them.

    The ratio metrics divide by sums of these counts and raise
    ``ZeroDivisionError`` when the relevant sum is zero.
    """

    tp: int = 0
    tn: int = 0
    fp: int = 0
    fn: int = 0

    def ravel(self) -> Tuple[int, int, int, int]:
        """Return the matrix as a flattened tuple.
        The order of return is TN, FP, FN, TP
        """
        return self.tn, self.fp, self.fn, self.tp

    def as_matrix(self) -> np.ndarray:
        """Returns a 2x2 matrix [[TN, FP], [FN, TP]]"""
        return np.array([[self.tn, self.fp], [self.fn, self.tp]])

    def precision(self) -> float:
        """Also known as positive predictive value (PPV): TP / (TP + FP)"""
        return self.tp / (self.tp + self.fp)

    def recall(self) -> float:
        """Also known as true positive rate (TPR): TP / (TP + FN)"""
        return self.tp / (self.tp + self.fn)

    def fnr(self) -> float:
        """False negative rate: 1 - recall"""
        # recall is a method and must be called; subtracting the bound method
        # itself raises TypeError.
        return 1 - self.recall()

    def fpr(self) -> float:
        """False positive rate: FP / (FP + TN)"""
        return self.fp / (self.fp + self.tn)

    def fowlkes_mallows_index(self) -> float:
        """Geometric mean between precision and recall"""
        return math.sqrt(self.precision() * self.recall())

    def f_score(self, beta: float = 1.0) -> float:
        """Harmonic mean of precision and recall.
        When beta is set to 0, you get precision. When beta is set to 1, you get the
        unweighted F-score which is the harmonic mean of precision and recall. Setting
        beta to 2 weighs recall twice as much as precision. Setting beta to 0.5 weighs
        precision twice as much as recall.
        """
        ppv = self.precision()
        tpr = self.recall()
        beta2 = beta ** 2

        return ((beta2 + 1) * ppv * tpr) / ((beta2 * ppv) + tpr)

    def matthews_correlation_coefficient(self) -> float:
        """A correlation coefficient between the observed and predicted binary
        classifications.

        Raises ZeroDivisionError when any row or column of the matrix sums to
        zero, since the denominator is then zero.
        """
        tn, fp, fn, tp = self.ravel()
        numerator = tp * tn - fp * fn
        denominator = math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))
        return numerator / denominator

    @staticmethod
    def from_predictions(pred: List[bool], truth: List[bool]) -> "ConfusionMatrix":
        """Tally a confusion matrix from parallel prediction/truth lists.

        Entries are used directly as list indices, so they must be bools
        (or the ints 0 and 1).
        """
        assert len(pred) == len(truth)
        # mtx[truth][prediction]; False indexes as 0 and True as 1.
        mtx = [[0, 0], [0, 0]]
        for y_true, y_pred in zip(truth, pred):
            mtx[y_true][y_pred] += 1
        [tn, fp], [fn, tp] = mtx
        return ConfusionMatrix(tp=tp, tn=tn, fp=fp, fn=fn)

    @staticmethod
    def from_counter(c: Counter) -> "ConfusionMatrix":
        """Build a matrix from a Counter keyed by ``Classification`` members.

        Classifications missing from the counter count as zero.
        """
        return ConfusionMatrix(
            tp=c[Classification.TruePositive],
            tn=c[Classification.TrueNegative],
            fp=c[Classification.FalsePositive],
            fn=c[Classification.FalseNegative],
        )

In [9]:
# Rewrite both "HET" and "NULL" calls to "FILT": classify() returns None for any
# pair containing a filtered call, so those rows are skipped when counting.
classifier = Classifier(treat_minor_as="FILT", treat_null_as="FILT")

In [36]:
# Build one confusion matrix per table, keyed by the file name up to its first ".".
data = {}
for file in table_files:
    sample = file.name.split(".")[0]
    c = Counter()
    # newline="" leaves newline handling to the csv module.
    with open(file, newline="") as f:
        reader = csv.reader(f)
        next(reader, None)  # skip the header row; an empty file is tolerated
        for row in reader:
            if not any(field.strip() for field in row):
                continue  # blank line, e.g. extra newline at the end of the file
            # Each data row must have exactly three columns; the first
            # (mutation) column is not needed for the counts.
            _mut, illumina_call, nanopore_call = row
            clf = classifier.classify(illumina_call.strip(), nanopore_call.strip())
            if clf is None:
                # at least one of the two calls was filtered; nothing to count
                continue
            c[clf] += 1
    data[sample] = ConfusionMatrix.from_counter(c)

In [37]:
# Total false positives and false negatives across all samples.
fps = 0
fns = 0
for cm in data.values():
    fps += cm.fp
    # Accumulate: plain assignment here kept only the last sample's count.
    fns += cm.fn

In [38]:
fps

4

In [39]:
fns

0