In [13]:
%load_ext autoreload
%autoreload 2

import os

import pysam
import pyranges as pr
import numpy as np

import sys
sys.path.append('/home/users/pjh/scripts/python_genome_package_dev')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
from handygenome import common

In [2]:
# Bare sample identifiers of the cohort, in a fixed order.
SAMPLEIDS = [
    '14', '6', '87',
    'F13', 'F2', 'F33', 'F37',
    'FF1', 'FF104', 'FF115', 'FF13', 'FF18', 'FF20', 'FF21', 'FF23', 'FF24',
    'FF27', 'FF3', 'FF31', 'FF34', 'FF37', 'FF39', 'FF4', 'FF43', 'FF53',
    'FF56', 'FF57', 'FF58', 'FF6', 'FF62', 'FF67', 'FF71', 'FF76', 'FF77',
    'FF78', 'FF79', 'FF80', 'FF85',
    #'IO04',  # swapped sample
    'IO05', 'IO06', 'IO07', 'IO08', 'IO09', 'IO12', 'IO13', 'IO15', 'IO16',
    'IO17', 'IO18', 'IO22', 'IO23', 'IO24', 'IO25',
    'SC126', 'SC134', 'SC31', 'SC81', 'SC88', 'SC97', 'SC98',
]

# Same identifiers carrying the 'LU-' prefix, plus the '_panel' / '_tumor'
# suffixed variants derived from the prefixed form.
SAMPLEIDS_LU = ['LU-' + bare_id for bare_id in SAMPLEIDS]
SAMPLEIDS_LU_PANEL = [lu_id + '_panel' for lu_id in SAMPLEIDS_LU]
SAMPLEIDS_LU_TUMOR = [lu_id + '_tumor' for lu_id in SAMPLEIDS_LU]

# Root directory that is walked with os.walk() to locate the per-sample BAM files.
BAM_TOPDIR = '/home/users/team_projects/Lung_Cancer_Panel_data/03_Data_from_YTKim/02_BAM/'
# BED files under panel_bait_design/custom. Judging by the file names these are
# the merged exon+intron target regions, the exon-only regions and the
# intron-only regions of the panel -- TODO confirm against the files themselves.
PANEL_REGION_PATH = '/home/users/team_projects/Lung_Cancer_Panel_data/03_Data_from_YTKim/metadata/panel_bait_design/custom/exon_intron_targetregions_merged.bed'
PANEL_REGION_EXONS_PATH = '/home/users/team_projects/Lung_Cancer_Panel_data/03_Data_from_YTKim/metadata/panel_bait_design/custom/SNUH_FIRST_Lung_Cancer_V5_exon_Regions_tracknamechanged.bed'
PANEL_REGION_INTRONS_PATH = '/home/users/team_projects/Lung_Cancer_Panel_data/03_Data_from_YTKim/metadata/panel_bait_design/custom/SNUH_FIRST_Lung_Cancer_V5_intron_Regions_tracknamechanged.bed'
# Text file; presumably holds previously computed WGS TMB values -- TODO confirm.
WGS_TMB_PATH = '/home/users/team_projects/Lung_Cancer_Panel_data/03_Data_from_YTKim/metadata/wgs_TMB_previously_found_210312.txt'

# API token, taken from the ONCOKB_TOKEN environment variable when it is set.
# NOTE(review): the literal fallback is a credential stored in the notebook
# itself. It is kept only so existing runs keep working; rotate the token and
# drop the default once the environment variable is provisioned.
ONCOKB_TOKEN = os.environ.get('ONCOKB_TOKEN', '864a64ee-cfa0-4b99-aa25-305fb4bcda39')

In [3]:
# For each sample type, the filename infixes that identify it: a BAM is
# expected to be named '<sampleid>.<infix>.bam'.
sampletype_filename_map = {
    'tumor': ['tumor'],
    'normal': ['normal'],
    'panel': ['panel'],
    'rna': ['RNA', 'RNA_GFP'],
}
# BAM_PATHS[sampletype][sampleid] -> path of the matching BAM file.
BAM_PATHS = {sampletype: dict() for sampletype in sampletype_filename_map.keys()}

# Build every expected BAM filename once, so each walked file needs a single
# dict lookup instead of a rescan of all sample/type/infix combinations.
# Each filename maps to exactly one (sampletype, sampleid) pair.
expected_bam_names = {
    f'{sampleid}.{fname_portion}.bam': (sampletype, sampleid)
    for sampleid in SAMPLEIDS_LU
    for sampletype, fname_portions in sampletype_filename_map.items()
    for fname_portion in fname_portions
}

for top, dirs, files in os.walk(BAM_TOPDIR):
    for f in files:
        if f in expected_bam_names:
            sampletype, sampleid = expected_bam_names[f]
            # As before, a later match for the same sample/type overwrites an
            # earlier one (e.g. 'RNA' vs 'RNA_GFP', or duplicates in subdirs).
            BAM_PATHS[sampletype][sampleid] = os.path.join(top, f)

In [6]:
# Table of consecutive 1000-bp windows over the 'hg19' chromosome set; one row
# per window with Chromosome / Start / End columns.
# NOTE(review): to_gr() presumably returns a PyRanges object -- confirm in handygenome.
hg19_chromdict = common.DEFAULT_CHROMDICTS['hg19']
MQ_df = hg19_chromdict.to_gr().window(1000).df

In [11]:
# Show the first few rows of MQ_df to sanity-check the window coordinates.
MQ_df.head()

Unnamed: 0,Chromosome,Start,End
0,1,0,1000
1,1,1000,2000
2,1,2000,3000
3,1,3000,4000
4,1,4000,5000


In [8]:
# Open the 'normal' BAM of sample LU-FF18; this module-level handle is the one
# mqcalc() reads from.
normal_bam_path = BAM_PATHS['normal']['LU-FF18']
bam = pysam.AlignmentFile(normal_bam_path)

In [9]:
def mqcalc(chrom, start0, end0, bamfile=None):
    """Return the mean mapping quality of the reads fetched from a region.

    Args:
        chrom: Contig name passed to ``fetch``.
        start0: Region start passed to ``fetch``.
        end0: Region end passed to ``fetch``.
        bamfile: Object exposing ``fetch(chrom, start, end)`` that yields reads
            with a ``mapping_quality`` attribute. When omitted, the
            module-level ``bam`` handle is used, as before.

    Returns:
        The arithmetic mean of ``mapping_quality`` over all fetched reads, or
        ``np.nan`` when the region yields no reads.
    """
    if bamfile is None:
        # Backward-compatible default: fall back to the notebook-global handle.
        bamfile = bam

    mqs = [read.mapping_quality for read in bamfile.fetch(chrom, start0, end0)]

    # np.mean([]) would warn and return nan; keep the explicit empty case.
    return np.mean(mqs) if mqs else np.nan

In [15]:
# Mean mapping quality for every window of MQ_df, in row order.
# itertuples() is used instead of iterrows(): it does not build a Series for
# each row, which matters with millions of windows. row.Index is the same
# index label that iterrows() yields.
mean_MQs = list()
for row in MQ_df.itertuples():
    mean_MQs.append(mqcalc(row.Chromosome, row.Start, row.End))
    # Progress report once every 10000 windows.
    if row.Index % 10000 == 0:
        print(row.Chromosome, row.Start, row.End)

1 0 1000
1 10000000 10001000
1 20000000 20001000
1 30000000 30001000
1 40000000 40001000
1 50000000 50001000
1 60000000 60001000
1 70000000 70001000
1 80000000 80001000
1 90000000 90001000
1 100000000 100001000
1 110000000 110001000
1 120000000 120001000
1 130000000 130001000
1 140000000 140001000
1 150000000 150001000
1 160000000 160001000
1 170000000 170001000
1 180000000 180001000
1 190000000 190001000
1 200000000 200001000
1 210000000 210001000
1 220000000 220001000
1 230000000 230001000
1 240000000 240001000
2 749000 750000
2 10749000 10750000
2 20749000 20750000
2 30749000 30750000
2 40749000 40750000
2 50749000 50750000
2 60749000 60750000
2 70749000 70750000
2 80749000 80750000
2 90749000 90750000
2 100749000 100750000
2 110749000 110750000
2 120749000 120750000
2 130749000 130750000
2 140749000 140750000
2 150749000 150750000
2 160749000 160750000
2 170749000 170750000
2 180749000 180750000
2 190749000 190750000
2 200749000 200750000
2 210749000 210750000
2 220749000 220750000

In [16]:
# Number of mean-MAPQ values collected; the loop appends one per row of MQ_df.
len(mean_MQs)

3095689

In [17]:
import pickle

# Persist the per-window mean mapping qualities (a plain list) as a pickle.
outfile = '/home/users/pjh/scripts/python_genome_package_dev/tests/binned_MQ.pickle'
with open(outfile, mode='wb') as pickle_handle:
    pickle.dump(mean_MQs, pickle_handle)