# False positive analysis

Where are the false positives? How are they distributed?

TODO:
1. Analyze a poorly performing tissue in addition to CommonBrain (a good performer)

In [1]:
from pathlib import Path

# data
import pandas as pd
import numpy as np

# plotting
import matplotlib.pyplot as plt
import seaborn as sns

## Load CommonBrain data

In [2]:
import pyarrow.parquet as pq

# Minimum reads-per-million a window must have to be kept for this analysis.
MIN_RPM = 2

# read the labelled CommonBrain windows, takes a long time to run
data = pq.read_table(
    "../results/model/get_labels/CommonBrain_nonrefonly.pqt"
).to_pandas()
# .copy() detaches the filtered frame from the unfiltered one so the column
# assignments below do not trigger pandas' SettingWithCopyWarning
data = data.loc[data.rpm >= MIN_RPM, :].copy()

# convert float32 to float16 (rpm is explicitly skipped)
for c in data.columns:
    if (data[c].dtype == "float32") and (c != "rpm"):
        data[c] = data[c].astype("float16")
        # float16 has a much smaller range than float32: make sure the
        # downcast did not overflow to inf or introduce missing values
        assert not np.isinf(data[c]).any(), f"{c} column contains inf values"
        assert not data[c].isna().any(), f"{c} column contains nan values"

# check that no rows have been duplicated: a (window, cell) pair must be unique
assert (
    data.shape[0]
    == data[["Chromosome", "Start", "End", "cell_id"]].drop_duplicates().shape[0]
), "some rows have been duplicated during labeling!"

data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1804055 entries, 61 to 19956329
Columns: 117 entries, Chromosome to neg_score_mean
dtypes: bool(21), float16(82), float64(1), int32(7), int64(2), int8(1), object(3)
memory usage: 464.5+ MB


In [3]:
# lift the column display limit so wide frames are shown without truncation,
# then preview the first rows
pd.options.display.max_columns = None
data.head(n=5)

Unnamed: 0,Chromosome,Start,End,n_fwd,n_rev,n_proper_pairs,n_ref_reads,3end_gini,5end_gini,max_mapq,n_reads,rpm,orientation_bias,frac_proper_pairs,alignment_score_q0,alignment_score_q0.25,alignment_score_q0.5,alignment_score_q0.75,alignment_score_q1,alignment_score_mean,alignment_score_normed_q0,alignment_score_normed_q0.25,alignment_score_normed_q0.5,alignment_score_normed_q0.75,alignment_score_normed_q1,alignment_score_normed_mean,L1_alignment_score_q0,L1_alignment_score_q0.25,L1_alignment_score_q0.5,L1_alignment_score_q0.75,L1_alignment_score_q1,L1_alignment_score_mean,L1_alignment_score_normed_q0,L1_alignment_score_normed_q0.25,L1_alignment_score_normed_q0.5,L1_alignment_score_normed_q0.75,L1_alignment_score_normed_q1,L1_alignment_score_normed_mean,L1_reference_start_q0,L1_reference_start_q0.25,L1_reference_start_q0.5,L1_reference_start_q0.75,L1_reference_start_q1,L1_reference_start_mean,L1_reference_end_q0,L1_reference_end_q0.25,L1_reference_end_q0.5,L1_reference_end_q0.75,L1_reference_end_q1,L1_reference_end_mean,L1_Acount_q0,L1_Acount_q0.25,L1_Acount_q0.5,L1_Acount_q0.75,L1_Acount_q1,L1_Acount_mean,mate_alignment_score_q0,mate_alignment_score_q0.25,mate_alignment_score_q0.5,mate_alignment_score_q0.75,mate_alignment_score_q1,mate_alignment_score_mean,mate_alignment_score_normed_q0,mate_alignment_score_normed_q0.25,mate_alignment_score_normed_q0.5,mate_alignment_score_normed_q0.75,mate_alignment_score_normed_q1,mate_alignment_score_normed_mean,mate_read_length_q0,mate_read_length_q0.25,mate_read_length_q0.5,mate_read_length_q0.75,mate_read_length_q1,mate_read_length_mean,num_supp_alignments_q0,num_supp_alignments_q0.25,num_supp_alignments_q0.5,num_supp_alignments_q0.75,num_supp_alignments_q1,num_supp_alignments_mean,cell_id,xtea_id,xtea,xtea_1kb_3end_id,xtea_1kb_3end,xtea_20kb,L1HS,L1HS_1kb_3end,L1HS_20kb,L1PA2,L1PA2_1kb_3end,L1PA2_20kb,L1PA3,L1PA3_1kb_3end,L1PA3_20kb,L1PA4,L1PA4_1kb_3end,L1PA4_20kb,L1PA5,L1PA5_1kb_3end,L1PA5_20kb,L1PA6,L1PA6_1kb_3end,L1PA6_20k
b,donor_id,pos_score_q0,pos_score_q0.25,pos_score_q0.5,pos_score_q0.75,pos_score_q1,pos_score_mean,neg_score_q0,neg_score_q0.25,neg_score_q0.5,neg_score_q0.75,neg_score_q1,neg_score_mean
61,chr1,1568750,1569500,0,4,4,0,3.9e-05,4.5e-05,60,4,2.237463,1.0,1.0,96.0,137.25,151.0,151.0,151.0,137.25,0.635742,0.90918,1.0,1.0,1.0,0.90918,59.0,59.75,60.0,60.25,61.0,60.0,0.390625,0.395752,0.397461,0.398926,0.404053,0.397461,839.0,839.0,839.0,839.0,839.0,839.0,899.0,899.0,899.0,899.0,900.0,899.0,76.0,76.0,76.0,76.25,77.0,76.25,87.0,87.75,88.0,88.0,88.0,87.75,0.576172,0.581055,0.583008,0.583008,0.583008,0.581055,151.0,151.0,151.0,151.0,151.0,151.0,1.0,1.0,1.0,1.0,1.0,1.0,plate1_D3_S18,-1,False,-1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,CommonBrain,-28.390625,-13.484375,-9.273438,-5.898438,7.179688,-10.109375,-29.203125,-13.476562,-8.796875,-4.84375,7.929688,-9.5
63,chr1,1569000,1569750,0,5,5,0,4.7e-05,5.3e-05,60,5,2.796829,1.0,1.0,96.0,150.0,151.0,151.0,151.0,139.75,0.635742,0.993164,1.0,1.0,1.0,0.925781,59.0,60.0,60.0,61.0,62.0,60.40625,0.390625,0.397461,0.397461,0.404053,0.410645,0.399902,839.0,839.0,839.0,839.0,839.0,839.0,899.0,899.0,899.0,900.0,902.0,900.0,76.0,76.0,76.0,77.0,78.0,76.625,87.0,88.0,88.0,88.0,94.0,89.0,0.576172,0.583008,0.583008,0.583008,0.622559,0.589355,151.0,151.0,151.0,151.0,151.0,151.0,1.0,1.0,1.0,1.0,1.0,1.0,plate1_D3_S18,-1,False,-1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,CommonBrain,-28.390625,-13.609375,-9.0625,-5.261719,7.179688,-9.929688,-29.203125,-13.726562,-8.8125,-5.054688,7.929688,-9.671875
65,chr1,1569250,1570000,0,4,4,0,3.6e-05,3.6e-05,60,4,2.237463,1.0,1.0,150.0,150.75,151.0,151.0,151.0,150.75,0.993164,0.998535,1.0,1.0,1.0,0.998535,59.0,59.75,60.0,60.5,62.0,60.25,0.390625,0.395752,0.397461,0.400635,0.410645,0.398926,839.0,839.0,839.0,839.0,839.0,839.0,899.0,899.0,899.0,900.0,902.0,900.0,76.0,76.0,76.0,76.5,78.0,76.5,88.0,88.0,88.0,89.5,94.0,89.5,0.583008,0.583008,0.583008,0.592773,0.622559,0.592773,151.0,151.0,151.0,151.0,151.0,151.0,0.0,0.0,0.0,0.0,0.0,0.0,plate1_D3_S18,-1,False,-1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,CommonBrain,-28.390625,-14.296875,-8.742188,-4.417969,9.867188,-9.421875,-29.203125,-14.492188,-8.929688,-5.128906,7.914062,-10.125
83,chr1,1770000,1770750,1,3,3,0,5.1e-05,1.9e-05,60,4,2.40061,0.75,0.75,146.0,149.75,151.0,151.0,151.0,149.75,0.966797,0.991699,1.0,1.0,1.0,0.991699,40.0,49.75,54.0,56.25,60.0,52.0,0.264893,0.32959,0.357666,0.373291,0.399902,0.344971,839.0,839.0,839.0,839.0,839.0,839.0,879.0,894.0,899.5,900.5,902.0,895.0,64.0,68.5,71.0,74.0,80.0,71.5,88.0,95.5,101.0,107.0,116.0,101.5,0.583008,0.632324,0.668945,0.709961,0.773438,0.67334,150.0,150.75,151.0,151.0,151.0,150.75,0.0,0.0,0.0,0.0,0.0,0.0,plate1_H6_S45,-1,False,-1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,CommonBrain,-29.9375,-13.429688,-9.273438,-5.765625,6.453125,-10.265625,-27.78125,-13.164062,-8.382812,-3.667969,8.984375,-8.765625
344,chr1,2083500,2084250,0,6,0,0,3e-06,1e-05,60,6,3.184573,1.0,0.0,63.0,76.75,92.5,105.25,151.0,96.6875,0.552734,0.653809,0.704102,0.732422,1.0,0.723145,62.0,62.0,62.0,62.0,63.0,62.15625,0.410645,0.438232,0.486328,0.488281,0.520996,0.469238,839.0,839.0,839.0,839.0,839.0,839.0,902.0,902.0,902.0,902.0,902.0,902.0,78.0,86.0,86.5,88.5,89.0,85.8125,80.0,82.5,84.5,86.5,91.0,84.8125,0.556152,0.589355,0.65918,0.677734,0.716309,0.640137,119.0,127.0,127.5,143.75,151.0,133.5,1.0,1.0,1.0,1.0,1.0,1.0,plate1_G2_S13,-1,False,-1,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,False,CommonBrain,-29.15625,-14.84375,-9.179688,-5.277344,7.703125,-10.21875,-28.953125,-12.953125,-8.640625,-5.238281,9.867188,-9.59375


## Load metadata

In [4]:
# read metadata: tissue-level sheet, with column names lower-cased
meta = pd.read_csv(
    "/iblm/logglun02/mcuoco/workflows/sz_slavseq/config/slavseq_metadata.tsv", sep="\t"
)
meta.columns = meta.columns.str.lower()

# donor-level sheet
donors = pd.read_csv(
    "/iblm/logglun02/mcuoco/workflows/sz_slavseq/config/all_donors.tsv", sep="\t"
)

# per-cell sheet, annotated with donor columns and then with tissue columns;
# left joins keep every cell even when no donor/tissue row matches
tissue_cols = ["tissue_id", "sequencing", "region"]
cells = (
    pd.read_csv(
        "/iblm/logglun02/mcuoco/workflows/sz_slavseq/config/all_samples.tsv", sep="\t"
    )
    .merge(donors, on="donor_id", how="left")
    .merge(meta[tissue_cols], on="tissue_id", how="left")
)

# Train xgboost model on one tissue 

In [5]:
# define features
# A column is used as a feature when its name contains any of these substrings.
keys = ["_mean", "frac", "gini", "bias"]
features = []
for c in data.columns:
    # for score/length columns, only the "_normed" variants are kept
    if (("_score" in c) or ("_length" in c)) and ("_normed" not in c):
        continue
    # any() adds the column once, even if its name matches several keys
    # (appending inside a loop over keys would duplicate such a feature)
    if any(k in c for k in keys):
        features.append(c)
features.append("rpm")
print("Features:", features)

# define the classifier
# TODO: ask to optimize scale_pos_weight
from flaml import AutoML

# AutoML search settings, collected in one place so they are easy to scan
automl_settings = {
    "task": "classification",
    "estimator_list": ["xgboost"],
    "early_stop": True,
    # time budget in seconds, 120 is good for CommonBrain, have tried larger
    # but best model is usually found in <120s
    "time_budget": 120,
    "metric": "ap",
    "skip_transform": True,  # don't preprocess data
    "auto_augment": False,  # don't augment rare classes
    # use data-independent hyperparameter starting points
    "starting_points": "static",
    "verbose": 4,
}
clf = AutoML(**automl_settings)

# setup outdir
log_dir = Path("model_logs")
log_dir.mkdir(exist_ok=True)

Features: ['3end_gini', '5end_gini', 'orientation_bias', 'frac_proper_pairs', 'alignment_score_normed_mean', 'L1_alignment_score_normed_mean', 'L1_reference_start_mean', 'L1_reference_end_mean', 'L1_Acount_mean', 'mate_alignment_score_normed_mean', 'num_supp_alignments_mean', 'rpm']


In [7]:
from scripts.pyslavseq.model_selection import Model

# Wrap the AutoML classifier and the labelled windows in the project's Model
# helper, using "xtea_1kb_3end" as the label column.
# NOTE(review): rpm_filter=5 presumably applies a stricter rpm cutoff than the
# one used when loading the data; confirm against Model's implementation.
mdl = Model(
    clf=clf,
    data=data,
    features=features,
    label_col="xtea_1kb_3end",
    rpm_filter=5,
    outfile="model_logs/CommonBrain10.log",  # plain string: no placeholders to format
)
# 5-fold cross-validation, then collect the per-fold results
mdl.cv(n_splits=5)
results = mdl.get_results()

Fold 1/5

		Tuning model with 12 features on 517160 windows
		29649 positive windows (7487 loci)
		487511 negative windows
		18 Chromosomes: ['chr1' 'chr10' 'chr12' 'chr13' 'chr15' 'chr16' 'chr17' 'chr18' 'chr19'
 'chr2' 'chr20' 'chr22' 'chr3' 'chr5' 'chr6' 'chr7' 'chr8' 'chr9']
		1 Donors: ['CommonBrain']
		101 cells
		
[flaml.automl.logger: 08-23 15:08:56] {1679} INFO - task = classification
[flaml.automl.logger: 08-23 15:08:56] {1690} INFO - Evaluation method: cv
[flaml.automl.logger: 08-23 15:08:56] {1788} INFO - Minimizing error metric: 1-ap
[flaml.automl.logger: 08-23 15:08:56] {1900} INFO - List of ML learners in AutoML Run: ['xgboost']
[flaml.automl.logger: 08-23 15:08:56] {2218} INFO - iteration 0, current learner xgboost
[flaml.tune.tune: 08-23 15:08:56] {805} INFO - trial 1 config: {'n_estimators': 4, 'max_leaves': 4, 'min_child_weight': 0.9999999999999993, 'learning_rate': 0.09999999999999995, 'subsample': 1.0, 'colsample_bylevel': 1.0, 'colsample_bytree': 1.0, 'reg_alpha':

In [None]:
# precision vs. adjusted locus recall: one line per fold, one panel per stage
pr_points = results.explode(["precision", "adjusted_locus_recall"])
pr_grid = sns.relplot(
    data=pr_points,
    x="adjusted_locus_recall",
    y="precision",
    hue="fold",
    col="stage",
    kind="line",
)
# both axes are proportions, so pin them to [0, 1]
pr_grid.set(xlim=(0, 1), ylim=(0, 1))

## Analyze distribution of false positives

In [None]:
# precision as a function of the decision threshold: one line per fold,
# one panel per stage
threshold_points = results.explode(["precision", "threshold"])
threshold_grid = sns.relplot(
    data=threshold_points,
    x="threshold",
    y="precision",
    hue="fold",
    col="stage",
    kind="line",
)
threshold_grid.set(xlim=(0, 1), ylim=(0, 1))

In [None]:
from scipy.stats import binomtest, chisquare

In [None]:
# are certain loci overrepresented in FP?
# For each probability threshold, run a one-sided binomial test per annotation:
# is the annotation more frequent among false-positive windows than among all
# windows in mdl.data?
# NOTE: `loci` also picks up the label column itself ("xtea_1kb_3end"); FPs are
# unlabelled by definition, so its count is always 0 and its p-value is 1.
loci = [c for c in data.columns if ("kb" in c) and ("id" not in c)]

# proportion of windows carrying each annotation in the background; this does
# not depend on the threshold, so compute it once
n_windows = mdl.data.shape[0]
background = {anno: mdl.data[anno].sum() / n_windows for anno in loci}

is_unlabelled = mdl.data["xtea_1kb_3end"].eq(False)
# smallest positive normal float: floor for p-values that underflow to 0, so
# -log10(p) stays finite and plottable
min_p = np.finfo(float).tiny

res = []
for prob in np.linspace(0.1, 0.9, 9):
    fp_at_prob = mdl.data[(mdl.data["proba"] > prob) & is_unlabelled]
    n_fp = fp_at_prob.shape[0]
    if n_fp == 0:
        # binomtest requires n >= 1; nothing to test at this threshold
        continue
    for anno in loci:
        test = binomtest(
            int(fp_at_prob[anno].sum()),
            n=n_fp,
            p=background[anno],
            alternative="greater",
        )
        res.append(
            {
                "proba": prob,
                "anno": anno,
                "p": test.pvalue,
                "-log10(p)": -np.log10(max(test.pvalue, min_p)),
            }
        )
# explicit columns keep the frame plottable even when no threshold had any FPs
res = pd.DataFrame(res, columns=["proba", "anno", "p", "-log10(p)"])

g = sns.lineplot(data=res, x="proba", y="-log10(p)", hue="anno")
g.set(
    title="Binomial test for enrichment of annotations in CommonBrain FPs",
    xlabel="Probability threshold",
    ylabel="-log10(p)",
)
# place the legend outside the axes so it does not cover the lines
g.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.0);

In [None]:
# false positives at a 0.5 probability cutoff: windows scored above the cutoff
# that are not labelled xtea_1kb_3end
scored_positive = mdl.data["proba"] > 0.5
unlabelled = mdl.data["xtea_1kb_3end"] == False
fp_df = mdl.data[scored_positive & unlabelled]

n_fp_near_xtea = fp_df["xtea_20kb"].sum()
n_fp_total = fp_df.shape[0]
print(f"{n_fp_near_xtea}/{n_fp_total} FPs are in xtea_20kb regions")

## Look at some examples in IGV

In [None]:
# look at the FPs within 20kb of an xtea call
window_cols = ["Chromosome", "Start", "End", "cell_id"]
fp_df.loc[fp_df["xtea_20kb"], window_cols]

In [None]:
# choose one, find the insertion: labelled windows on chr1 whose start lies
# strictly between 74 Mb and 75 Mb
on_chr1 = data["Chromosome"] == "chr1"
in_range = data["Start"].gt(74e6) & data["Start"].lt(75e6)
is_labelled = data["xtea_1kb_3end"]
data.loc[
    is_labelled & on_chr1 & in_range, ["Chromosome", "Start", "End"]
].drop_duplicates()

In [None]:
# find all cells with FPs near this insertion (chr1, start in [74.725 Mb, 74.745 Mb))
near_insertion = (
    fp_df["xtea_20kb"]
    & fp_df["Chromosome"].eq("chr1")
    & fp_df["Start"].ge(74725e3)
    & fp_df["Start"].lt(74745e3)
)
cells = fp_df.loc[near_insertion, "cell_id"].unique()

In [None]:
# DISPLAY IN IGV
import os
import shutil
import tempfile

import igv_jupyter

# make temp directory, all data must be in jupyter filetree
# (dir= places it in the working directory; prefix only names it)
tmpdir = tempfile.TemporaryDirectory(prefix="igv_data_", dir=os.getcwd())
# track paths are given relative to the working directory
rel_name = Path(tmpdir.name).name

# make list to store tracks for IGV
track_list = []

# xTEA calls: copy the VCF next to the notebook and add it as a variant track
vcf = "/iblm/netapp/data4/mcuoco/wgs-te-pipeline/results/xtea/illumina_10x/CommonBrain/L1.vcf"
shutil.copy(vcf, tmpdir.name)
track = {
    "name": "xTEA calls",
    "url": f"{rel_name}/{Path(vcf).name}",
    "type": "variant",
    # must be the boolean False: the string "False" is truthy once the config
    # reaches igv.js, which would then look for an index that does not exist
    "indexed": False,
}
track_list.append(track)

# one alignment track per cell with FPs near the insertion
for c in cells:
    file = f"../results/align/CommonBrain/{c}.tagged.sorted.bam"
    index = file + ".bai"
    assert Path(file).exists(), f"{file} does not exist"
    assert Path(index).exists(), f"{index} does not exist"
    # copy to tempdir (BAM and its index must sit side by side)
    shutil.copy(file, tmpdir.name)
    shutil.copy(index, tmpdir.name)
    track = {
        "name": c,
        "path": f"{rel_name}/{Path(file).name}",
        "type": "alignment",
        "indexed": True,
        "displayMode": "SQUISHED",
    }
    track_list.append(track)

# start IGV (https://github.com/igvteam/igv-notebook)
igv_jupyter.init()
igv_browser = igv_jupyter.Browser(
    {"genome": "hg38", "locus": "chr1:74715500-74735500", "tracks": track_list}
)
igv_browser.to_svg()

In [None]:
# close tempdir: removes the temporary igv_data_* directory and the files
# copied into it for IGV
tmpdir.cleanup()

## Screenshots

<!-- add image  -->
chr1:74721000-74733000
![chr1:74721000-74733000](./chr1_fp_example.png)

In [None]:
# FPs near an xtea call on chr9 (start strictly between 102.306 Mb and
# 102.330 Mb), restricted to cells whose id contains "plate2_H"
in_chr9_region = (
    fp_df["Chromosome"].eq("chr9")
    & fp_df["Start"].gt(102306e3)
    & fp_df["Start"].lt(102330e3)
)
from_plate2_h = fp_df["cell_id"].str.contains("plate2_H")
fp_df.loc[
    fp_df["xtea_20kb"] & in_chr9_region & from_plate2_h,
    ["Chromosome", "Start", "End", "cell_id"],
]

chr9:102306000-102330000
![chr9:102306000-102330000](./chr9_fp_example.png)

In [None]:
# average max_mapq: all windows in mdl.data vs. false-positive windows only
background_mapq = mdl.data["max_mapq"].mean()
fp_mapq = fp_df["max_mapq"].mean()
print("background average max_mapq:", background_mapq)
print("fp average max_mapq:", fp_mapq)

In [None]:
# how many FP windows are adjacent vs singleton?