In [None]:
import os
import pandas as pd
import numpy as np
from collections import defaultdict

import matplotlib.pyplot as plt

In [None]:
# Metadata CSV read by parse_metadata; it is accessed through the
# `publication`, `sample_file_id` and `sample_disease` columns.
METADATA_PATH = '../../raw_data/cristiano_cfdnas/meta_data.csv'
# Publication title compared for exact equality against the `publication` column.
CRISTIANO_PAPER = 'Genome-wide cell-free DNA fragmentation in patients with cancer'

def parse_metadata(file_path, paper):
    """Map each sample file id to its disease label for a single publication.

    Args:
        file_path: Path of the metadata CSV; it is read with ``pd.read_csv``.
        paper: Value the ``publication`` column must equal for a row to be kept.

    Returns:
        dict: ``sample_file_id -> sample_disease`` for the retained rows. If a
        sample id occurs more than once, the last row wins.
    """
    table = pd.read_csv(file_path)
    from_paper = table.publication == paper
    selected = table[from_paper]
    return {
        sample_id: disease
        for sample_id, disease in zip(selected.sample_file_id, selected.sample_disease)
    }

# sample_file_id -> sample_disease for the rows whose publication equals CRISTIANO_PAPER.
metadata = parse_metadata(METADATA_PATH, CRISTIANO_PAPER)

In [None]:
# File-name template of the per-sample, per-DHS arrays stored in DATA_DIR.
INPUT_FILE_PATTERN = '{sid}__{dhs}_sorted.npy'
DATA_DIR = "../../data/cristiano_cfdnas_dhs_small/"
DHS_FOLDER = '../../raw_data/dhs_small'

# DHS set names are the file stems found in DHS_FOLDER. Hidden entries
# (e.g. .DS_Store, .ipynb_checkpoints) are ignored, and the stems are
# de-duplicated and sorted so that two files sharing a stem cannot make every
# sample count twice and the iteration order does not depend on os.listdir.
DHS_FILES = sorted({f.split('.')[0] for f in os.listdir(DHS_FOLDER) if not f.startswith('.')})
# DHS_FILES = ['Lymphoid_DHS']

# Both dicts are keyed by (dhs, group_name) and stay index-aligned:
# vector_sums[key][i] is the total of the array loaded for sample labels[key][i].
vector_sums, labels = defaultdict(list), defaultdict(list)
missing_inputs = []
for sid, group_name in metadata.items():
    for dhs in DHS_FILES:
        path = os.path.join(DATA_DIR, INPUT_FILE_PATTERN.format(sid=sid, dhs=dhs))
        try:
            vec = np.load(path)
        except FileNotFoundError:
            # Not every sample has an array for every DHS set; remember the gap
            # instead of dropping it silently.
            missing_inputs.append(path)
            continue
        # ravel() sums the same 1-D sequence as flatten() without forcing a copy.
        vector_sums[(dhs, group_name)].append(np.sum(vec.ravel()))
        labels[(dhs, group_name)].append(sid)

if missing_inputs:
    print(f'Skipped {len(missing_inputs)} sample/DHS combinations with no input file in {DATA_DIR}')

In [None]:
# All disease groups / DHS types that have at least one loaded sample.
groups = sorted({group for (_, group) in vector_sums.keys()})
dhs_types = sorted({dhs for (dhs, _) in vector_sums.keys()})

# Make sure the output folder exists; savefig does not create it.
COVERAGE_PLOT_DIR = '../../data/cristiano_cfdnas_dhs_small_pca'
os.makedirs(COVERAGE_PLOT_DIR, exist_ok=True)

for dhs in dhs_types:
    # Only groups with data for this DHS type get a box. The tick labels must be
    # built from the same list, otherwise they shift onto the wrong boxes
    # whenever a group is missing.
    present_groups = [g for g in groups if (dhs, g) in vector_sums]
    data = [vector_sums[(dhs, g)] for g in present_groups]

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.boxplot(data)

    # boxplot places the boxes at x = 1..N.
    ax.set_xticks(range(1, len(present_groups) + 1))
    ax.set_xticklabels(present_groups, rotation=45, ha='right')
    ax.set_title(f"Distribution of coverage — {dhs}")
    ax.set_ylabel("Coverage (number of reads within the window -2kb +2kb)")
    fig.tight_layout()
    fig.savefig(f'{COVERAGE_PLOT_DIR}/coverage_plot__{dhs}.png', dpi=200)
    plt.show()
    # Release the figure; one is created per DHS type.
    plt.close(fig)

In [None]:
# Samples whose summed vector is below this value are discarded.
LOWER_CUTOFF = 4e5

# Same (dhs, group) keying as vector_sums / labels, restricted to the samples
# that reach LOWER_CUTOFF. Keys with no surviving sample are left out.
filtered_vector_sums, filtered_labels = defaultdict(list), defaultdict(list)
for key, sums in vector_sums.items():
    sums_arr = np.array(sums)
    sid_arr = np.array(labels[key])

    keep = sums_arr >= LOWER_CUTOFF
    if not keep.any():
        continue

    # The same boolean mask is applied to both arrays so sums and ids stay aligned.
    filtered_vector_sums[key] = sums_arr[keep].tolist()
    filtered_labels[key] = sid_arr[keep].tolist()

In [None]:
for dhs in dhs_types:
    # Groups whose samples were all removed by the cutoff have no entry in
    # filtered_vector_sums. The tick labels must come from the groups actually
    # plotted, otherwise they shift onto the wrong boxes.
    present_groups = [g for g in groups if (dhs, g) in filtered_vector_sums]
    if not present_groups:
        # Nothing survived the cutoff for this DHS type; there is nothing to draw.
        continue
    data = [filtered_vector_sums[(dhs, g)] for g in present_groups]

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.boxplot(data)

    # boxplot places the boxes at x = 1..N.
    ax.set_xticks(range(1, len(present_groups) + 1))
    ax.set_xticklabels(present_groups, rotation=45, ha='right')
    ax.set_title(f"Distribution of coverage — {dhs}")
    ax.set_ylabel("Coverage (number of reads within the window -2kb +2kb)")
    fig.tight_layout()
    plt.show()
    # Release the figure; one is created per DHS type.
    plt.close(fig)

In [None]:
# Number of samples kept by the cutoff for each DHS type, pooled over all groups.
counter = defaultdict(int)
for (dhs_name, _group), kept_sums in filtered_vector_sums.items():
    counter[dhs_name] = counter[dhs_name] + len(kept_sums)
counter

In [None]:
# For each DHS type, pooled over all groups:
#   label_dict    -> ids of the samples kept by the cutoff
#   label_counter -> how many ids that is
label_counter = defaultdict(int)
label_dict = defaultdict(list)
for (dhs_name, _group), kept_sids in filtered_labels.items():
    label_dict[dhs_name].extend(kept_sids)
    label_counter[dhs_name] = label_counter[dhs_name] + len(kept_sids)
label_counter

In [None]:
# File-name template of the raw per-sample fragment files.
INP_FILE_PATTERN = '{sid}.hg38.frag.gz'
INP_DATA_FOLDER = '../../raw_data/cristiano_cfdnas/'
FILTERED_INPUT_DATA_FOLDER = '../../raw_data/cristiano_cfdnas_400k_filtered/'
os.makedirs(FILTERED_INPUT_DATA_FOLDER, exist_ok=True)

# filtered_labels lists a sample once per (dhs, group) key it passed the cutoff
# for. Link each sample only once (first-seen order) so a fresh run does not
# print an "already exists" message for every additional DHS type.
samples_to_link = list(dict.fromkeys(
    sid for sids in filtered_labels.values() for sid in sids
))

# A relative symlink target is resolved by the OS relative to the directory
# that contains the link, not relative to the current working directory.
# Express the target relative to the (symlink-resolved) destination folder so
# the links are valid wherever the notebook is run from.
source_dir = os.path.realpath(INP_DATA_FOLDER)
destination_dir = os.path.realpath(FILTERED_INPUT_DATA_FOLDER)

for sid in samples_to_link:
    inputfname = INP_FILE_PATTERN.format(sid=sid)
    source_input_path = os.path.join(INP_DATA_FOLDER, inputfname)
    destination_input_path = os.path.join(FILTERED_INPUT_DATA_FOLDER, inputfname)

    if not os.path.exists(source_input_path):
        # os.symlink would happily create a dangling link; report it instead.
        print(f'Source file not found, skipping. source_path: {source_input_path}')
        continue

    link_target = os.path.relpath(os.path.join(source_dir, inputfname), start=destination_dir)

    try:
        os.symlink(link_target, destination_input_path)
    except FileExistsError:
        print(f'Symlink already exists. source_path: {source_input_path}, destination_path: {destination_input_path}')
    except OSError as err:
        # Best effort: keep linking the remaining samples, but show why this one failed.
        print(f'error, source_path: {source_input_path}, destination_path: {destination_input_path}, reason: {err}')