In [1]:
%load_ext autoreload
%autoreload 2

import os
import re
import shutil
import random
import pprint
import itertools
import functools
import collections

import pysam
import pyranges as pr
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import scipy.stats
import sklearn.cluster

import sys
sys.path.append('/home/users/pjh/scripts/python_genome_package_dev')

In [2]:
from handygenome import common
from handygenome.common import ChromDict, Interval
from handygenome.variant.vcfspec import Vcfspec

from handygenome.sv.breakends import Breakends
from handygenome.variant.variantplus import VariantPlus, VariantPlusList
from handygenome.igvhandle import IGVHandle

import handygenome.variant.filter as libfilter

# Setup

In [3]:
# Sample identifiers of the cohort (without the 'LU-' prefix used in file names).
SAMPLEIDS = [
    '14', '6', '87', 'F13', 'F2', 'F33', 'F37', 'FF1', 'FF104', 'FF115', 'FF13', 'FF18', 'FF20', 'FF21', 'FF23', 'FF24', 'FF27', 
    'FF3', 'FF31', 'FF34', 'FF37', 'FF39', 'FF4', 'FF43', 'FF53', 'FF56', 'FF57', 'FF58', 'FF6', 'FF62', 'FF67', 'FF71', 'FF76', 
    'FF77', 'FF78', 'FF79', 'FF80', 'FF85', 
    #'IO04',  # swapped sample
    'IO05', 'IO06', 'IO07', 'IO08', 'IO09', 'IO12', 'IO13', 'IO15', 'IO16', 'IO17', 
    'IO18', 'IO22', 'IO23', 'IO24', 'IO25', 'SC126', 'SC134', 'SC31', 'SC81', 'SC88', 'SC97', 'SC98', 
]
SAMPLEIDS_LU = [f'LU-{x}' for x in SAMPLEIDS]

BAM_TOPDIR = '/home/users/team_projects/Lung_Cancer_Panel_data/03_Data_from_YTKim/02_BAM/'

# sample type -> file-name infixes that identify a BAM of that type
# (a BAM is named '<sampleid>.<infix>.bam').
sampletype_filename_map = {
    'tumor': ['tumor'], 
    'normal': ['normal'], 
    'panel': ['panel'], 
    'rna': ['RNA', 'RNA_GFP'],
}
# BAM_PATHS[sampletype][sampleid] -> absolute BAM path
BAM_PATHS = {sampletype: dict() for sampletype in sampletype_filename_map.keys()}

# Build every expected file name once, so each file found on disk needs a single
# dictionary lookup instead of a scan over all samples and sample types.
bam_filename_lookup = {
    f'{sampleid}.{fname_portion}.bam': (sampletype, sampleid)
    for sampletype, fname_portions in sampletype_filename_map.items()
    for fname_portion in fname_portions
    for sampleid in SAMPLEIDS_LU
}

for top, dirs, files in os.walk(BAM_TOPDIR):
    for f in files:
        hit = bam_filename_lookup.get(f)
        if hit is None:
            continue
        sampletype, sampleid = hit
        # If several files map to the same (type, sample), the one walked last wins.
        BAM_PATHS[sampletype][sampleid] = os.path.join(top, f)

In [4]:
from handygenome import ucscdata

# hg19 cytoband table; bands stained 'acen' are kept as the centromere subset.
cytoband_gr = ucscdata.get_cytoband_gr('hg19')
is_acen_band = cytoband_gr.Stain == 'acen'
centromere_gr = cytoband_gr[is_acen_band]

# Analysis

### Setup

In [5]:
topdir = '/home/users/team_projects/Lung_Cancer_Panel_data/03_Data_from_YTKim/23_binned_depths/results'

# sample id (file-name prefix before the first '.') -> path of that sample's
# '*normal.regions.bed.gz' file; a later duplicate overrides an earlier one.
all_normal_100bin_files = {
    fname.split('.')[0]: os.path.join(dirpath, fname)
    for dirpath, _subdirs, fnames in os.walk(topdir)
    for fname in fnames
    if fname.endswith('normal.regions.bed.gz')
}

In [8]:
import multiprocessing
import itertools

nproc = 18

keys = list(all_normal_100bin_files)
# read_csv options shared by every depth table (headerless, tab-separated, 4 columns).
kwargs = {
    'sep': '\t',
    'names': ['Chromosome', 'Start', 'End', 'depth'],
    'dtype': {'Chromosome': str, 'Start': int, 'End': int, 'depth': float},
}


def worker(x, y):
    """Read one depth table: `x` is the file path, `y` the read_csv keyword arguments."""
    return pd.read_csv(x, **y)


# One (path, options) pair per sample, in the same order as `keys`.
job_args = [(all_normal_100bin_files[key], kwargs) for key in keys]
with multiprocessing.Pool(nproc) as pool:
    dflist = pool.starmap(worker, job_args)

normal_100bin_dfs = {key: table for key, table in zip(keys, dflist)}

In [9]:
normal_100bin_df_list = list(normal_100bin_dfs.items())

# The first table contributes the coordinate columns plus its own depth column;
# every other table contributes only its 4th (depth) column. Each depth column is
# labelled with its sample id here. (`pd.concat(names=...)` only names the levels
# of a hierarchical index and does not label the concatenated columns.)
# Rows are aligned on the index, so all tables are expected to list the same bins
# in the same order.
first_sid, first_df = normal_100bin_df_list[0]
merged_dfs = pd.concat(
    (
        [first_df.rename(columns={first_df.columns[3]: first_sid})]
        + [df.iloc[:, 3].rename(sid) for sid, df in normal_100bin_df_list[1:]]
    ),
    axis=1,
)

In [11]:
# Label the columns: three coordinate columns followed by one column per sample id.
merged_dfs.columns = ['Chromosome', 'Start', 'End', *(sid for sid, _table in normal_100bin_df_list)]

In [25]:
# Wrap the merged depth table in a PyRanges object for interval queries.
merged_gr = pr.PyRanges(merged_dfs)

In [17]:
# Split the merged table by position: the first three columns are the bin
# coordinates, every remaining column holds one sample's depth values.
coordcols = merged_dfs.iloc[:, :3]
datacols = merged_dfs.iloc[:, 3:]

In [60]:
def get_ranks(data):
    """Return the rank of each value in `data` divided by the number of values.

    Ties receive the highest rank of their group (rankdata `method='max'`),
    so the largest value always maps to 1.0.
    """
    ranks = scipy.stats.rankdata(data, method='max')
    n_values = len(ranks)
    return ranks / n_values


def pick_row_view(df, igv, threshold=0.9995, n_high=3):
    """Pick one random row of `df` and show its locus in IGV.

    The first three columns of `df` are read as chromosome, start and end; all
    remaining columns are read as per-sample values. Normal BAMs of every sample
    whose value is below `threshold` are loaded, followed by up to `n_high`
    distinct, randomly chosen samples whose value is above it. Samples exactly
    equal to `threshold` are loaded in neither group.

    Parameters
    ----------
    df : DataFrame laid out as described above.
    igv : object providing `new()`, `goto(loci)`, `load(paths)` and `cmd(command)`.
    threshold : cut-off separating the two sample groups.
    n_high : maximum number of above-threshold samples to load.

    Returns
    -------
    The sampled one-row DataFrame.

    Notes
    -----
    BAM paths are looked up in the notebook-level `BAM_PATHS['normal']`.
    """
    row = df.sample(n=1)
    chrom, start0, end0 = row.iloc[0, :3]
    # The displayed locus starts one past the row's start coordinate.
    locus = (chrom, start0 + 1, end0)
    print(locus)

    row_data = row.iloc[0, 3:]
    low_samples = row_data.loc[row_data < threshold].index.to_list()
    high_samples = row_data.loc[row_data > threshold].index.to_list()
    print('lt', low_samples)

    # Sample without replacement and never more than are available: with
    # replacement the same BAM could be loaded repeatedly, and drawing from an
    # empty list raises ValueError.
    n_pick = min(n_high, len(high_samples))
    if n_pick > 0:
        picked_high = np.random.choice(high_samples, size=n_pick, replace=False).tolist()
    else:
        picked_high = []

    igv.new()
    igv.goto([locus])
    igv.load([BAM_PATHS['normal'][x] for x in low_samples])
    igv.load([BAM_PATHS['normal'][x] for x in picked_high])
    igv.cmd('squish')

    return row


def pick_row_view_mean(row, igv, means):
    """Show one bin in IGV with the samples of lowest and highest depth/mean ratio.

    `row` is a Series whose first three entries are chromosome, start and end;
    entries whose label starts with 'LU-' are per-sample depths. `means` maps
    sample label to that sample's mean depth. The five samples with the smallest
    and the five with the largest depth/mean ratio are printed, and their normal
    BAMs (looked up in the notebook-level `BAM_PATHS['normal']`) are loaded.
    """
    chrom, start0, end0 = row.iloc[:3]
    locus = (chrom, start0 + 1, end0)
    print(locus)

    sample_labels = [label for label in row.index if label.startswith('LU-')]
    row_data = row.loc[sample_labels]

    # Ratio of this bin's depth to each sample's mean depth, ascending.
    sorted_mean_ratios = (row_data / means).sort_values()
    least_samples = sorted_mean_ratios.index[:5]
    greatest_samples = sorted_mean_ratios.index[-5:]

    report_groups = (
        ('least samples', least_samples),
        ('greatest samples', greatest_samples),
    )
    for heading, group in report_groups:
        print(heading)
        for sample in group:
            print(f'sample={sample}, mean={means[sample]}, current_depth={row_data[sample]}, ratio={sorted_mean_ratios[sample]}')

    igv.new()
    igv.goto([locus])
    for _heading, group in report_groups:
        igv.load([BAM_PATHS['normal'][x] for x in group])
    igv.cmd('squish')
    igv.cmd('squish')

### Create candidate blacklist region

##### Rank-based

In [187]:
# Per-sample percentile rank of every bin (ranked down each column; ties take the
# highest rank of their group).
# Earlier draft of this step, kept for reference — presumably equivalent when the
# data contain no missing values (TODO confirm):
# quantile_datacols = datacols.apply(get_ranks, axis=0)
datacols_rank = datacols.rank(axis=0, method='max', pct=True)

KeyboardInterrupt: 

In [193]:
# Coordinates side by side with the per-sample percentile ranks.
# Uses `datacols_rank` (assigned in the rank cell above); `quantile_datacols` is
# only assigned in commented-out code and is undefined on a fresh kernel.
pctrank_df = pd.concat([coordcols, datacols_rank], axis=1)

In [194]:
# Flag, per sample, the bins whose percentile rank exceeds 0.9995.
# Uses `datacols_rank` (assigned in the rank cell above); `quantile_datacols` is
# only assigned in commented-out code and is undefined on a fresh kernel.
datacols_rank_gt9995 = datacols_rank > 0.9995

In [195]:
# Per bin: fraction of samples flagged in datacols_rank_gt9995 (row-wise mean of booleans).
rank_gt9995_samplefraction = datacols_rank_gt9995.mean(axis=1)

In [196]:
# Boolean mask of bins flagged in more than 95% of the samples.
pctrank_gt9995_morethan95pct = rank_gt9995_samplefraction > 0.95

##### Mean-based

In [32]:
# Mean depth of each sample over all bins (one value per sample column).
means = datacols.mean(axis=0)

In [44]:
# Result caches keyed by the multiplier passed to add_meancompare_data().
datacols_gtmean, gtmean_samplefracs, gtmean_samplecounts = {}, {}, {}


def add_meancompare_data(fraction):
    """Record which bins exceed `fraction` times each sample's mean depth.

    Reads the notebook-level `datacols` and `means` and stores, under the key
    `fraction`: the boolean bin-by-sample table, the per-bin fraction of
    samples exceeding the cut-off, and the per-bin count of such samples.
    """
    exceeds_cutoff = datacols > (means * fraction)
    datacols_gtmean[fraction] = exceeds_cutoff
    gtmean_samplefracs[fraction] = exceeds_cutoff.mean(axis=1)
    gtmean_samplecounts[fraction] = exceeds_cutoff.sum(axis=1)

In [45]:
# Populate the comparison caches for cut-offs of 3x and 4x the per-sample mean.
for mean_multiplier in (3, 4):
    add_meancompare_data(mean_multiplier)

In [36]:
# Handle used to drive IGV; the port number is hard-coded.
# NOTE(review): presumably requires an IGV session already listening on this port — confirm.
igv = IGVHandle(port=60387)

In [148]:
# Annotate every bin with the number of samples exceeding 4x their mean depth,
# ordered by that count (ascending).
merged_dfs_annot = (
    merged_dfs
    .assign(gtmean_samplecounts=gtmean_samplecounts[4])
    .sort_values('gtmean_samplecounts', ascending=True)
)

# Candidate bins: at least 5 samples exceed the cut-off.
# Alternatives tried earlier were fraction-based filters on merged_dfs:
# gtmean_samplefracs[4].between(0.5, 0.6) and gtmean_samplefracs[3] > 0.5.
has_enough_high_samples = merged_dfs_annot['gtmean_samplecounts'] >= 5
subdf = merged_dfs_annot.loc[has_enough_high_samples, :]

In [149]:
# Compare the size of the candidate subset with the full table
# (subdf carries one extra column, gtmean_samplecounts).
print(subdf.shape)
print(merged_dfs.shape)

(76007, 64)
(31018086, 63)


In [68]:
# Preview the first candidate rows.
subdf.head()

Unnamed: 0,Chromosome,Start,End,LU-FF80,LU-IO18,LU-FF76,LU-14,LU-IO05,LU-IO23,LU-SC134,...,LU-IO24,LU-SC81,LU-FF104,LU-FF57,LU-IO12,LU-FF85,LU-FF27,LU-SC98,LU-FF53,gtmean_samplecounts
16089684,9,69808300,69808400,151.4,142.82,163.17,154.71,158.36,119.42,159.19,...,165.99,150.59,94.05,106.21,141.48,138.3,109.24,179.4,73.19,5
14052360,8,12440000,12440100,163.55,87.32,160.99,82.44,164.61,149.41,95.53,...,92.23,95.8,74.9,58.42,90.3,117.43,66.63,135.15,87.04,5
31000368,GL000195.1,112000,112100,106.39,92.12,109.07,149.02,125.35,87.57,91.73,...,152.74,127.05,63.55,89.65,155.39,101.93,82.35,25.26,70.54,5
30975211,GL000214.1,103700,103800,97.75,131.08,136.24,96.33,137.36,62.83,174.39,...,137.04,142.48,76.06,69.71,75.91,146.06,58.72,144.77,93.92,5
22193738,14,19437000,19437100,96.95,98.25,160.11,104.92,143.91,146.11,120.63,...,134.97,71.19,96.7,45.39,121.87,94.35,75.41,93.04,93.89,5


In [67]:
# Take the first candidate bin, print it, and open it in IGV together with the
# samples of lowest / highest depth relative to their mean.
row = subdf.iloc[0, :]
print(row)
pick_row_view_mean(row, igv, means)

Chromosome                    9
Start                  69808300
End                    69808400
LU-FF80                   151.4
LU-IO18                  142.82
                         ...   
LU-FF85                   138.3
LU-FF27                  109.24
LU-SC98                   179.4
LU-FF53                   73.19
gtmean_samplecounts           5
Name: 16089684, Length: 64, dtype: object
('9', 69808301, 69808400)
least samples
sample=LU-IO06, mean=41.282814418014084, current_depth=26.4, ratio=0.6394912840167251
sample=LU-FF53, mean=31.707044289579997, current_depth=73.19, ratio=2.3083198588792047
sample=LU-IO23, mean=47.7968633280596, current_depth=119.42, ratio=2.498490312645545
sample=LU-FF24, mean=36.186269118926326, current_depth=97.66, ratio=2.6988137317787584
sample=LU-IO22, mean=61.57973562134055, current_depth=172.47, ratio=2.8007590201512698
greatest samples
sample=LU-FF6, mean=31.188080685249222, current_depth=126.34, ratio=4.050906539425301
sample=LU-14, mean=36.976260121

##### Readstats_annotator --mq-limits --depth-limits test

In [365]:
# Open the normal BAM of sample LU-IO22 for the read-statistics experiments below.
# NOTE(review): the handle is not closed anywhere in the visible cells.
bam_path = '/home/users/team_projects/Lung_Cancer_Panel_data/03_Data_from_YTKim/02_BAM/IO22/LU-IO22.normal.bam'
bam = pysam.AlignmentFile(bam_path)

In [413]:
# Query position on chromosome 2; the commented-out value is an alternative position.
chrom = '2'
# pos0 = 213_056_734
pos0 = 213_055_850

# One-base window starting at pos0, passed to pileup() / fetch() below.
start0 = pos0
end0 = pos0 + 1

In [414]:
%%timeit

# Approach 1: pileup restricted to the query window (truncate=True) with every
# filter option set to its most permissive value, then read the mapping qualities
# and the aligned-read count from the first pileup column.
# NOTE(review): names assigned inside a %%timeit cell are presumably not kept in the
# kernel namespace, so the later comparison cells would need `mqs` / `depth` from an
# untimed run — confirm.
pup = bam.pileup(chrom, start0, end0, truncate=True, stepper='nofilter', ignore_overlaps=False, flag_filter=0, ignore_orphans=False, min_base_quality=0, min_mapping_quality=0)
pupcol = next(pup)
mqs = pupcol.get_mapping_qualities()
depth = pupcol.get_num_aligned()

37.7 ms ± 1.29 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [415]:
%%timeit

# Approach 2: iterate the reads returned by fetch() for the query window and
# collect their mapping qualities.
mqs2 = [read.mapping_quality for read in bam.fetch(chrom, start0, end0)]
# Depth is the number of reads seen. len() also gives 0 when fetch() yields
# nothing, where a loop index would be unbound or left over from an earlier cell.
depth2 = len(mqs2)

8.18 ms ± 82.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [409]:
# Check that both approaches report the same mapping qualities (order-insensitive).
# NOTE(review): relies on `mqs` and `mqs2` already existing in the kernel — see the
# timing cells above.
sorted(mqs2) == sorted(mqs)

True

In [410]:
# Check that both approaches report the same depth.
depth == depth2

True

In [416]:
import handygenome.annotation.readstats as libreadstats

In [436]:
# Query the library helper for chromosome 1, position 10751 and print the depth
# with the mean of the returned mapping qualities.
# NOTE(review): this rebinds `depth` and `mqs` from the pileup experiment above.
depth, mqs = libreadstats.get_position_info(bam, '1', 10751)
print(depth, np.mean(mqs))

111 19.72072072072072


In [424]:
# Test variant on chromosome 2 (hg19); the commented-out line is an alternative variant.
# vcfspec = Vcfspec('2', 213_056_736, 'T', ('G',), refver='hg19')
vcfspec = Vcfspec('2', 213_056_543, 'T', ('C',), refver='hg19')

In [425]:
# Compute read statistics for the test variant from the open BAM, using the
# default hg19 FASTA and chromosome dictionary.
readstats = libreadstats.ReadStats.from_bam(vcfspec, bam, common.DEFAULT_FASTAS['hg19'], common.DEFAULT_CHROMDICTS['hg19'])

In [426]:
# Display the computed read statistics.
readstats

<ReadStats(
    {'rppcounts': {[38;5;93mNone[0m: 157, -1: 0, 0: 25, 1: 21, 'softclip_overlap': 157},
     'mean_BQs': {[38;5;93mNone[0m: nan, -1: nan, 0: 31.76, 1: 30.09090909090909},
     'median_BQs': {[38;5;93mNone[0m: nan, -1: nan, 0: 32.0, 1: 31.0},
     'mean_MQs': {[38;5;93mNone[0m: 14.872611464968152, -1: nan, 0: 60.2, 1: 59.5},
     'median_MQs': {[38;5;93mNone[0m: 12.0, -1: nan, 0: 60.0, 1: 60.0},
     'mean_cliplens': {[38;5;93mNone[0m: 261.28662420382165,
                       -1: nan,
                       0: 7.88,
                       1: 7.0476190476190474},
     'median_cliplens': {[38;5;93mNone[0m: 261.0, -1: nan, 0: 3.0, 1: 0.0},
     'mNM': {[38;5;93mNone[0m: 0.025477707006369428, -1: 0, 0: 0.84, 1: 1.0},
     'recurrent_mNM': {[38;5;93mNone[0m: 1, -1: 0, 0: 2, 1: 3},
     'pairorient_pvalues': {[38;5;93mNone[0m: 0.1101572607611227,
                            -1: nan,
                            0: 0.30745625495910645,
                        

In [427]:
# Show the validity flag of the computed read statistics.
readstats.is_invalid

False

##### Readstats_annotator long-running regions

In [8]:
import sys
import os
import re
import pickle

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

sys.path.append('/home/users/pjh/scripts/python_genome_package_dev/')

In [64]:
LOGDIR = '/home/users/team_projects/Lung_Cancer_Panel_data/03_Data_from_YTKim/06_haplotypecaller/merged_vcfs/all_sample_merged.vcf.gz_readstats_annotator_5adyj0tk/logs'
# Matches lines of the form '[<anything>] Processing <variant description>'.
PAT = re.compile(r'\[.+\] Processing (.+)')

def get_region_from_logfile(fname):
    """Return the first and last variant coordinates logged in one job's log file.

    Parameters
    ----------
    fname : log file name, joined onto `LOGDIR` (an absolute path is used as is).

    Returns
    -------
    ((chrom, pos), (chrom, pos)) for the first and the last line matching `PAT`.
    The first two whitespace-separated tokens after 'Processing' are read as
    chromosome and integer position. Both coordinates are also printed.

    Raises
    ------
    IndexError
        If the file contains no line matching `PAT`.
    """
    path = os.path.join(LOGDIR, fname)
    first = last = None
    with open(path) as infile:
        for line in infile:
            # Blank and unrelated lines simply fail the full match.
            mat = PAT.fullmatch(line.strip())
            if mat is None:
                continue

            variantinfo = mat.group(1).split()
            last = (variantinfo[0], int(variantinfo[1]))
            if first is None:
                first = last

    if first is None:
        # Same exception type as indexing an empty list, with the file named.
        raise IndexError(f'no "Processing" lines found in {path}')

    locus = (first, last)
    print(locus[0][0], f'{locus[0][1]:,}')
    print(locus[1][0], f'{locus[1][1]:,}')
    return locus

In [28]:
# NOTE(review): repeats the cytoband loading already done in the setup section.
import handygenome.ucscdata as ucscdata
cytoband_gr = ucscdata.get_cytoband_gr('hg19')

In [43]:
# NOTE(review): repeats the IGVHandle import and the handle creation from earlier
# cells (same port, passed positionally here).
from handygenome.igvhandle import IGVHandle
igv = IGVHandle(60387)

In [46]:
# Load the normal BAMs of samples LU-14 and LU-6 into IGV.
igv.load([
    BAM_PATHS['normal']['LU-14'],
    BAM_PATHS['normal']['LU-6'],
])

OK
OK


In [54]:
# Send the 'squish' command to IGV.
igv.cmd('squish')

OK


In [2]:
# Load the pickled per-job timing data.
# NOTE(review): pickle.load can execute arbitrary code — only use on trusted files.
pickle_path = '/home/users/team_projects/Lung_Cancer_Panel_data/03_Data_from_YTKim/06_haplotypecaller/workspace/timespent.output.pickle'
with open(pickle_path, 'rb') as infile:
    data = pickle.load(infile)

In [3]:
# One timing value per job log file.
ser = pd.Series(data)
# NOTE(review): presumably converts minutes to seconds (x 60) and divides by 1565
# lines per job — both constants are unexplained here; TODO confirm.
ser_secperline = ser * 60 / 1565

In [4]:
# Deciles (0%, 10%, ..., 100%) of the per-job timings.
np.quantile(ser, np.linspace(0, 1, 11, endpoint=True))

array([  18.03333333,   42.78333333,   44.1       ,   45.26166667,
         46.42666667,   47.63333333,   48.98333333,   50.5       ,
         52.65      ,   57.96833333, 2276.16666667])

In [5]:
# Deciles (0%, 10%, ..., 100%) of the rescaled timings.
np.quantile(ser_secperline, np.linspace(0, 1, 11, endpoint=True))

array([ 0.6913738 ,  1.64025559,  1.69073482,  1.73527157,  1.7799361 ,
        1.82619808,  1.87795527,  1.93610224,  2.01853035,  2.22242812,
       87.26517572])

In [97]:
# List every job whose timing exceeds 100, slowest first, with its rank.
slow_jobs = ser[ser > 100].sort_values(ascending=False)
for idx, (key, val) in enumerate(slow_jobs.to_dict().items()):
    print(idx, key, val)

0 5726.sbatch.success 2276.1666666666665
1 5721.sbatch.success 2084.516666666667
2 4275.sbatch.success 1599.45
3 5720.sbatch.success 1270.7166666666667
4 1087.sbatch.success 1137.5166666666667
5 6225.sbatch.success 1117.6166666666666
6 2375.sbatch.success 1090.6
7 2379.sbatch.success 938.6833333333333
8 5717.sbatch.success 896.1333333333333
9 2380.sbatch.success 885.1166666666667
10 2376.sbatch.success 879.4833333333333
11 2377.sbatch.success 855.35
12 5719.sbatch.success 847.0
13 2373.sbatch.success 724.8
14 9828.sbatch.success 719.65
15 4791.sbatch.success 683.0166666666667
16 9827.sbatch.success 682.65
17 8827.sbatch.success 682.3
18 1085.sbatch.success 655.0666666666667
19 1066.sbatch.success 652.3833333333333
20 8527.sbatch.success 646.8833333333333
21 8025.sbatch.success 636.4666666666667
22 2390.sbatch.success 615.0333333333333
23 5725.sbatch.success 577.6833333333333
24 2374.sbatch.success 565.8666666666667
25 5724.sbatch.success 560.1166666666667
26 0874.sbatch.success 557.05


In [109]:
# Select one slow job by its rank in the listing above and print the first and
# last variant coordinates found in its log.
idx = 26
fname = ser[ser>100].sort_values(ascending=False).index[idx]
print(fname)
locus = get_region_from_logfile(fname)

0874.sbatch.success
2 32,877,384
2 33,185,302


In [110]:
# Centromere ('acen') bands on the chromosome of the selected region's first coordinate.
subgr = cytoband_gr[cytoband_gr.Stain == 'acen'][locus[0][0]]

# Print the first three columns of each band (chromosome, start, end). Explicit
# positional selection avoids integer keys on a label-indexed row, and the loop no
# longer overwrites the notebook-level `idx` and `row`.
for band_chrom, band_start, band_end in subgr.df.iloc[:, :3].itertuples(index=False, name=None):
    print(band_chrom, f'{band_start:,}', f'{band_end:,}', sep='\t')

2	90,500,000	93,300,000
2	93,300,000	96,800,000


In [111]:
# Point IGV at the first coordinate of the selected region (start and end are the
# same position).
igv.goto(
    [
        (locus[0][0], locus[0][1], locus[0][1])
    ]
)

OK


In [86]:
# Point IGV at a fixed position on chromosome Y.
igv.goto(
    [
        ('Y', 13_400_000, 13_400_000)
    ]
)

OK


### Check cancer gene overlap

##### Get cancer gene coordinates & intersect with candidate blacklist

In [69]:
import handygenome.analysis.feature as analysisfeat
import handygenome.annotation.oncokb as liboncokb

# The OncoKB API token is a credential and must not be stored in the notebook;
# it is read from the environment instead.
# NOTE(review): the token that was previously committed here should be revoked.
ONCOKB_TOKEN = os.environ.get('ONCOKB_TOKEN')
if not ONCOKB_TOKEN:
    raise RuntimeError(
        'Set the ONCOKB_TOKEN environment variable to your OncoKB API token '
        'before running this cell.'
    )

# OncoKB cancer gene list, then hg19 coordinates of those genes (sorted).
oncokb_cancergenes = liboncokb.get_cancerGeneList(ONCOKB_TOKEN)
oncokb_cancergenes_coords = analysisfeat.get_gene_coords_gr(oncokb_cancergenes['hugoSymbol'].to_list(), 'hg19').sort()

In [70]:
# Display the cancer-gene coordinate table.
oncokb_cancergenes_coords

Unnamed: 0,Chromosome,Start,End,gene_name
0,1,1682670,1711896,NADK
1,1,1716728,1822495,GNB1
2,1,2487077,2496821,TNFRSF14
3,1,2985731,3355185,PRDM16
4,1,6241328,6269449,RPL22
...,...,...,...,...
1025,X,152783133,152848397,ATP2B3
1026,X,152907945,152916781,DUSP9
1027,X,153618314,153637504,RPL10
1028,X,153656977,153664862,ATP6AP1


In [72]:
# Join the candidate blacklist bins with the cancer-gene intervals, so each
# retained bin carries the gene it was joined with (gene_name, Start_b, End_b).
subdf_gr = pr.PyRanges(subdf)
subdf_cgeneannot_gr = subdf_gr.join(oncokb_cancergenes_coords)

##### Load panel gene list

In [87]:
# Curated lung-cancer driver genes, keyed by a label for the source publication
# (cited in the comment next to each key). Commented-out symbols are older
# aliases, each annotated with the symbol it is a synonym of.
# NOTE(review): this dict is not referenced by any other cell visible in this notebook.
curated_drivers = {
    'tcga_adc': [ # The Cancer Genome Atlas Research Network. Comprehensive molecular profiling of lung adenocarcinoma. Nature 511, 543–550 (2014).
        'RIT1',
        'U2AF1',
        'CDKN2A',
        'RB1',
        'SMARCA4',
        'PIK3CA',
        'ARID1A',
        'MET',
        'MGA',
        'RBM10',
        'SETD2',
        'BRAF',
        'NF1',
        'EGFR',
        'STK11',
        'KEAP1',
        'KRAS',
        'TP53',
    ],
    'tcga_sqcc': [ # The Cancer Genome Atlas Research Network. Comprehensive genomic characterization of squamous cell lung cancers. Nature 489, 519–525 (2012).
        'RB1',
        'NOTCH1',
        'NFE2L2',
        'HLA-A',
        #'MLL2', - synonym of KMT2D
        'KEAP1',
        'PIK3CA',
        'PTEN',
        'CDKN2A',
        'TP53',        
    ],
    'natgen2016_adc': [ # Cancer Genome Atlas Research Network et al. Distinct patterns of somatic genome alterations in lung adenocarcinomas and squamous cell carcinomas. Nat Genet 48, 607–616 (2016).
        'TP53',
        'KRAS',
        'KEAP1',
        'EGFR',
        'STK11',
        'SMARCA4',
        'RBM10',
        'RB1',
        'NF1',
        'ARID1A',
        'BRAF',
        'ERBB2',
        'SETD2',
        'MGA',
        #'FTSJD1', - synonym of CMTR2
        'CMTR2',
        'MET',
        'ATM',
        'CDKN2A',
        'U2AF1',
        'RIT1',
        'DOT1L',
        'ARID2',
        'SMAD4',
        'PTPRU',
        'CTNNB1',
        'ARHGEF12',
        'APC',
        'KLHL5',
        'PIK3CA',
        'PPP3CA',
        'ATF7IP',
        'KARS',
        'RAF1',
        #'MLL3', - synonym of KMT2C
        'FANCM',
        'STIM1',
        'NRAS',
        'MAP2K1',
    ],
    'natgen2016_sqcc': [ # Cancer Genome Atlas Research Network et al. Distinct patterns of somatic genome alterations in lung adenocarcinomas and squamous cell carcinomas. Nat Genet 48, 607–616 (2016).
        'TP53',
        'CDKN2A',
        'NFE2L2',
        'PTEN',
        #'MLL2', - synonym of KMT2D
        'RB1',
        'FAT1',
        'NOTCH1',
        'RASA1',
        'NF1',
        'ARID1A',
        'KDM6A',
        'PIK3CA',
        'CUL3',
        'HRAS',
        'IRF6',
        'FBXW7',
        'ARHGAP35',
        'PASK',
        'NSD1',
    ],
    'natrev2019_conventional': [ # Skoulidis, F. & Heymach, J. V. Co-occurring genomic alterations in non-small-cell lung cancer biology and therapy. Nat Rev Cancer 19, 495–509 (2019).
        'KRAS',
        'EGFR',
        'BRAF',
        'NF1',
        'ERBB2',
        'MET',
        'ALK',
        'ROS1',
        'RET',
        'MET',  # NOTE(review): already listed above in this list
        'ERBB2',  # NOTE(review): already listed above in this list
        'MAP2K1',
        'NRAS',
        'HRAS',
        'RIT1',
        'FGFR2',
        'FGFR1',
    ],
    'natrev2019_KRAS_excl': [ # Skoulidis, F. & Heymach, J. V. Co-occurring genomic alterations in non-small-cell lung cancer biology and therapy. Nat Rev Cancer 19, 495–509 (2019).
        'EGFR',
        'EML4',
        'ERBB2',
        'TP53',
        'ROS1',
    ],
    'natrev2019_KRAS_co': [ # Skoulidis, F. & Heymach, J. V. Co-occurring genomic alterations in non-small-cell lung cancer biology and therapy. Nat Rev Cancer 19, 495–509 (2019).
        #'LKB1', - synonym of STK11
        'KEAP1', 
        'ATM',
        'RBM10',
        'PTPRD',
        'U2AF1',
        'POLE',
        'NTRK3',
    ],
    'natrev2019_EGFR_excl': [ # Skoulidis, F. & Heymach, J. V. Co-occurring genomic alterations in non-small-cell lung cancer biology and therapy. Nat Rev Cancer 19, 495–509 (2019).
        'KRAS',
        'KEAP1',
        #'LKB1', - synonym of STK11
        'ALK',
        'EML4',
        'MET',
        'DDR2',
        'BRCA2',
        'PTPRT',
        'PTPRD',
        'NF1',
        #'EPHA',
        'EPHA1',
        'EPHA2',
        'EPHA3',
        'EPHA4',
        'EPHA5',
        'EPHA6',
        'EPHA7',
        'EPHA8',
        'EPHA10',
        'RET',
        #'PAK5', - synonym of PAK6
        'PAK6',
        'ERBB4',
        'KMT2C',
        'TBX3',
        'FAT1',
    ],
    'natrev2019_EGFR_co': [ # Skoulidis, F. & Heymach, J. V. Co-occurring genomic alterations in non-small-cell lung cancer biology and therapy. Nat Rev Cancer 19, 495–509 (2019).
        'TP53',
        'RB1',
        'PIK3CA',
        'CTNNB1',
    ],
    'manually_added_221113': [
        # from benchmark ppt
        'PIK3CA',
        'PIK3R1',
        'PIK3C3',
        'PIK3C2G',
        'PIK3CG',
        'PIK3R2',
        'PIK3CA',  # NOTE(review): already listed above in this list
        'ARID2',
        'CCND2',
        'DNMT3A',
        'FGFR4',
        'MTOR',
        'NCOR1',
        'NKX2-1',
        'SF3B1',
        'SMARCA2',
        'SMARCB1',
        # from TCGA paper fig 3
        'RIT1',
    ],
}

# Gene symbols of the panel, in two lists.
# NOTE(review): presumably genes targeted at exon level vs. genes additionally
# targeted at intron level — confirm against the panel design.
panel_genes_exon = [
    'AKT1',
    'AKT3',
    'ALK',
    'ARAF',
    'ARID1A',
    'ATM',
    'ATR',
    'AXL',
    'BRAF',
    'BRCA1',
    'BRCA2',
    'CBL',
    'CCND1',
    'CD274',
    'CDK11B',
    'CDKN2A',
    'CHEK2',
    'CREBBP',
    'CTNNB1',
    'DDR2',
    'EGFR',
    'EP300',
    'ERBB2',
    'ERBB3',
    'ERBB4',
    'FBXW7',
    'FGFR1',
    'FGFR2',
    'FGFR3',
    'HRAS',
    'IDH1',
    'IDH2',
    'IGF1R',
    'JAK2',
    'JAK3',
    'KDR',
    'KEAP1',
    'KIT',
    'KMT2D',
    'KRAS',
    'MAP2K1',
    'MAP2K2',
    'MAP2K4',
    'MDM2',
    'MET',
    'MYC',
    'MYCN',
    'NEK2',
    'NF1',
    'NFE2L2',
    'NOTCH1',
    'NRAS',
    'NRG1',
    'NTRK1',
    'NTRK2',
    'NTRK3',
    'PDGFRA',
    'PIK3CA',
    'PTEN',
    'RAF1',
    'RB1',
    'RBM10',
    'RET',
    'RIT1',
    'ROS1',
    'SDK1',
    'SETD2',
    'SMAD4',
    'SMARCA4',
    'SMG1',
    'SOX2',
    'STK11',
    'TERT',
    'TP53',
    'TP63',
]

# Every symbol in this list also appears in panel_genes_exon.
panel_genes_intron = [
    'ALK',
    'AXL',
    'BRAF',
    'EGFR',
    'ERBB4',
    'FGFR1',
    'FGFR2',
    'FGFR3',
    'MET',
    'NRG1',
    'NTRK1',
    'NTRK2',
    'NTRK3',
    'PDGFRA',
    'RET',
    'ROS1',    
]

In [89]:
# All distinct panel gene symbols, regardless of which list they come from.
panel_genes_all = {*panel_genes_intron, *panel_genes_exon}

In [100]:
# Alphabetical list of the cancer genes joined to at least one candidate bin.
sorted(subdf_cgeneannot_gr.gene_name.unique())

['ABI1',
 'ACSL3',
 'ACTB',
 'AFF1',
 'AKT3',
 'APC',
 'ARHGAP35',
 'ARID1B',
 'ASXL1',
 'ATRX',
 'BCL2',
 'BCOR',
 'BRSK1',
 'CAMTA1',
 'CBFA2T3',
 'CCDC6',
 'CCND3',
 'CDC42',
 'CDK12',
 'CLTCL1',
 'CREBBP',
 'CRKL',
 'CTNNA1',
 'CUL4A',
 'CUX1',
 'DDX6',
 'DICER1',
 'DNM2',
 'DNMT1',
 'DNMT3A',
 'EBF1',
 'EP300',
 'EP400',
 'EPHA5',
 'EPHB1',
 'ERBB4',
 'ERC1',
 'ESR1',
 'EWSR1',
 'EXT1',
 'FLT4',
 'FOXP1',
 'GAB2',
 'GAS7',
 'GNA13',
 'GPC3',
 'GPHN',
 'GRIN2A',
 'GRM3',
 'INPP5D',
 'INSR',
 'KDM4C',
 'KDM6A',
 'KLF3',
 'KMT2C',
 'LIFR',
 'LPP',
 'LRP1B',
 'MAP4K4',
 'MAPKAP1',
 'MDC1',
 'MECOM',
 'MEF2C',
 'MGAM',
 'MIB1',
 'MKNK1',
 'MSH2',
 'MSH3',
 'MSH6',
 'MUC1',
 'MYH9',
 'NBEAP1',
 'NCOA1',
 'NCOA4',
 'NCOR1',
 'NCOR2',
 'NEGR1',
 'NF1',
 'NOTCH2',
 'NRG1',
 'NSD1',
 'NT5C2',
 'NUP214',
 'NUP98',
 'PASK',
 'PAX3',
 'PBRM1',
 'PCM1',
 'PDE4DIP',
 'PIK3CB',
 'PLCG2',
 'POLD1',
 'PPARG',
 'PRCC',
 'PRDM16',
 'PTCH1',
 'PTPRD',
 'RAD51',
 'RAD51B',
 'REST',
 'REV3L',
 'ROBO1',


In [92]:
# Panel genes that appear among the genes joined to candidate blacklist bins.
panel_genes_all.intersection(subdf_cgeneannot_gr.gene_name)

{'AKT3', 'CREBBP', 'EP300', 'ERBB4', 'NF1', 'NRG1', 'SMG1', 'TERT'}

In [126]:
# Candidate bins joined to the gene TSHR.
subdf_cgeneannot_gr[subdf_cgeneannot_gr.gene_name == 'TSHR']

Unnamed: 0,Chromosome,Start,End,LU-FF80,LU-IO18,LU-FF76,LU-14,LU-IO05,LU-IO23,LU-SC134,...,LU-FF57,LU-IO12,LU-FF85,LU-FF27,LU-SC98,LU-FF53,gtmean_samplecounts,Start_b,End_b,gene_name
0,14,81561000,81561100,267.11,442.89,30.33,35.34,299.47,365.64,185.21,...,33.16,355.57,35.97,26.45,250.38,24.99,16,81421332,81612646,TSHR
1,14,81469100,81469200,4319.4,8988.26,13.77,21.35,5390.95,6153.41,2932.21,...,27.63,6728.94,15.57,20.83,4128.51,13.29,19,81421332,81612646,TSHR


In [128]:
# Take the second TSHR bin, print it, and open it in IGV together with the
# samples of lowest / highest depth relative to their mean.
row = subdf_cgeneannot_gr[subdf_cgeneannot_gr.gene_name == 'TSHR'].df.iloc[1, :]
# Earlier selection by index from a differently named table, kept for reference:
# idx = 6
# row = cancergene_overlap_gr.df.iloc[idx:(idx + 1), :]
print(row)
pick_row_view_mean(row, igv, means)

Chromosome                   14
Start                  81469100
End                    81469200
LU-FF80                  4319.4
LU-IO18                 8988.26
                         ...   
LU-FF53                   13.29
gtmean_samplecounts          19
Start_b                81421332
End_b                  81612646
gene_name                  TSHR
Name: 1, Length: 67, dtype: object
('14', 81469101, 81469200)
least samples
sample=LU-FF56, mean=28.942476457122417, current_depth=8.72, ratio=0.30128727971562735
sample=LU-FF76, mean=45.61566637380509, current_depth=13.77, ratio=0.30186997351215844
sample=LU-FF78, mean=32.53089656208966, current_depth=12.42, ratio=0.3817908915081616
sample=LU-6, mean=33.33318137876073, current_depth=12.76, ratio=0.3828017450542669
sample=LU-FF39, mean=25.121992081007214, current_depth=9.87, ratio=0.3928828561116353
greatest samples
sample=LU-IO24, mean=49.76233501544866, current_depth=5982.02, ratio=120.21180272474933
sample=LU-IO25, mean=48.90238395173711

### Confirm the final blacklist and save it

In [130]:
# Final blacklist: the candidate bins as a sorted PyRanges object.
blacklist_gr = pr.PyRanges(subdf).sort()

In [134]:
# Rename the count column so the saved table records that the cut-off was 4x the mean.
blacklist_df = blacklist_gr.df.rename(columns={'gtmean_samplecounts': 'gt_4mean_samplecounts'})

In [136]:
# Write the blacklist as a tab-separated table with a header row and no index column.
# NOTE(review): the '.gz' suffix presumably makes pandas gzip the output via its
# extension-based compression inference — confirm. The path is absolute and user-specific.
blacklist_path = '/home/users/pjh/scripts/python_genome_package_dev/data/custom_blacklist.tsv.gz'
blacklist_df.to_csv(blacklist_path, sep='\t', header=True, index=False)

In [150]:
# PyRanges view of the full annotated table, for comparison with the blacklist.
merged_dfs_annot_gr = pr.PyRanges(merged_dfs_annot)

In [152]:
# Spot check: blacklisted bins on chromosome 2 within 33,140,000-33,142,000.
blacklist_gr['2', 33_140_000:33_142_000]

Unnamed: 0,Chromosome,Start,End,LU-FF80,LU-IO18,LU-FF76,LU-14,LU-IO05,LU-IO23,LU-SC134,...,LU-IO24,LU-SC81,LU-FF104,LU-FF57,LU-IO12,LU-FF85,LU-FF27,LU-SC98,LU-FF53,gtmean_samplecounts
0,2,33141200,33141300,10061.92,18901.03,662.9,476.56,12624.36,12815.05,8621.38,...,13262.95,617.76,454.16,493.94,15653.0,440.86,395.86,9882.05,408.98,60
1,2,33141300,33141400,2922557.42,5869068.87,66513.92,47291.85,3747546.08,3914055.12,2341136.9,...,4048134.33,59262.13,45194.47,49687.68,4742511.44,46518.35,38207.37,2901473.37,40231.16,60
2,2,33141400,33141500,2308283.28,4532227.15,71166.99,55250.19,3082463.35,2926391.11,2285220.69,...,3215387.23,69190.25,46811.78,52499.79,3897396.19,57191.96,39426.46,2365816.21,41335.07,60
3,2,33141500,33141600,5895586.12,11218106.59,148116.46,113737.55,7746268.78,7473889.92,5767356.35,...,7946781.27,144794.17,95392.92,109254.27,9374017.79,113462.85,78539.73,6094810.55,82335.65,60
4,2,33141600,33141700,3188920.02,6199340.58,53445.2,41923.58,4140398.21,4108340.56,2875539.71,...,4332996.57,51917.16,34638.3,39518.79,5090919.61,39525.65,28274.88,3227658.74,29720.33,60


In [153]:
# Same window in the full annotated table, for comparison with the blacklist rows.
merged_dfs_annot_gr['2', 33_140_000:33_142_000]

Unnamed: 0,Chromosome,Start,End,LU-FF80,LU-IO18,LU-FF76,LU-14,LU-IO05,LU-IO23,LU-SC134,...,LU-IO24,LU-SC81,LU-FF104,LU-FF57,LU-IO12,LU-FF85,LU-FF27,LU-SC98,LU-FF53,gtmean_samplecounts
0,2,33140000,33140100,42.0,39.76,37.49,30.77,40.25,43.94,51.3,...,42.4,33.62,38.72,32.53,37.13,52.61,27.8,49.78,25.57,0
1,2,33140100,33140200,44.61,30.61,31.3,25.01,38.54,44.71,49.52,...,41.39,35.16,28.9,30.34,44.15,32.02,24.43,41.99,20.96,0
2,2,33140200,33140300,47.35,38.67,30.33,22.1,41.72,56.64,36.09,...,45.31,32.66,22.29,32.48,40.57,25.65,25.63,39.84,32.03,0
3,2,33140300,33140400,44.05,38.27,33.23,25.9,43.43,55.24,40.42,...,49.04,37.68,27.92,29.32,47.93,28.16,26.31,37.39,32.64,0
4,2,33140400,33140500,52.68,33.66,45.54,39.52,49.08,51.61,64.3,...,50.31,53.46,40.37,35.44,57.76,38.94,29.78,45.88,34.9,0
5,2,33140500,33140600,53.65,42.15,40.95,26.03,54.46,57.7,64.74,...,39.1,46.77,27.33,38.17,47.43,33.55,33.06,55.66,29.88,0
6,2,33140600,33140700,55.44,42.58,37.72,30.41,60.69,50.36,47.17,...,39.9,45.72,28.13,35.36,55.39,35.57,37.54,56.51,31.05,0
7,2,33140700,33140800,59.57,44.32,40.14,37.35,58.31,37.27,44.98,...,41.09,48.37,34.42,32.69,50.99,37.95,26.93,49.25,29.51,0
8,2,33140800,33140900,51.48,39.11,42.35,33.69,51.15,30.47,42.1,...,33.9,49.19,24.23,43.1,54.08,34.08,24.46,33.38,26.59,0
9,2,33140900,33141000,33.87,29.0,33.91,21.52,48.52,29.47,30.4,...,27.49,36.8,21.4,31.88,36.69,33.67,27.17,33.31,28.83,0
