In [1]:
from pathlib import Path
import h5py
import pandas as pd
import numpy as np
from joblib import Parallel, delayed
import subprocess

In [6]:
import h5py
import numpy as np
import pandas as pd
from pathlib import Path
PWD = Path().cwd()
MAZUR = Path("/data/mazurovev/all_marks/methylation/lncRNA_Peaks_corrs").resolve()
DATA = Path("../data")
METHYL = DATA / "methylation_peaks"
ANNO = Path("/home/magletdinov/shared/annotations")
PREPROCESSING = Path("../preprocessing")
METHYL_INDEX = PREPROCESSING / "methyl_index"

In [3]:
import gzip
import pandas as pd

# Функция для чтения GTF-файла и подсчета уникальных генов
def count_genes_in_gtf(gtf_file):
    # Открываем файл
    with gzip.open(gtf_file, 'rt') as f:
        # Считываем строки, относящиеся к генам
        gene_lines = [line for line in f if 'gene' in line and line.split('\t')[2] == 'gene']
        
    # Выделяем ID генов из строк
    gene_ids = set()
    for line in gene_lines:
        # Разбиваем строку на колонки
        columns = line.split('\t')
        # Последняя колонка содержит атрибуты, ищем gene_id
        attributes = columns[8]
        for attribute in attributes.split(';'):
            if 'gene_id' in attribute:
                gene_id = attribute.split('"')[1]
                gene_ids.add(gene_id)
                break
    
    return len(gene_ids)

In [4]:
def get_all_correlated_peaks(hm, to_hdf5, to_save):
    if to_save.exists():
        print(f"Skipped: {to_hdf5.stem}")
        return None
    all_results = []

    # Открываем HDF5 файл
    with h5py.File(to_hdf5, 'r') as f:
        lncRNAs = list(f['lncRNAs_names'][:])
        corrs_matrix = f['corrs_matrix'][:]

    # Читаем пики
    peaks = pd.read_csv(Path("/data/mazurovev/all_marks") / hm / "merged_peaks_first_in_biosample.bed", sep="\t", header=None)

    for i, lncRNA in enumerate(lncRNAs):
        corrs = corrs_matrix[i, :]
        nonzero_indices = np.nonzero(corrs)[0]
        nonzero_corrs = corrs[nonzero_indices]
        nonzero_peaks = ["peak_" + str(j) for j in nonzero_indices]

        # Извлекаем соответствующие пики
        res = peaks[peaks[3].isin(nonzero_peaks)].copy()
        res["corr"] = nonzero_corrs
        res["lnc_ens"] = lncRNA.decode('utf-8')  # Декодируем байтовую строку в строку

        all_results.append(res)

    # Объединяем все результаты в один DataFrame
    final_result = pd.concat(all_results, ignore_index=True)
    
    final_result.to_csv(to_save, sep="\t", compression="gzip", index=False)

In [None]:
for to_hdf5 in MAZUR.glob("*.hdf5"):
    print(to_hdf5.stem)
    to_save = METHYL / f"{to_hdf5.stem}.tsv.gz"
    get_all_correlated_peaks(hm="methylation", to_hdf5=to_hdf5, to_save=to_save)

In [12]:
def indexing_methyl(chunk, lnc_ens, corr_sign, subdf):
    to_index = METHYL_INDEX / chunk
    to_index.mkdir(exist_ok=True, parents=True)
    to_save = to_index / f"methyl-{lnc_ens}-{corr_sign}-.tsv.gz"
    if to_save.exists():
        print(f"Skipped: methyl {lnc_ens} {corr_sign}")
        return None
    subdf.to_csv(to_save, sep="\t", compression="gzip", index=False)

In [None]:
for to_df in METHYL.glob("*.tsv.gz"):
    print(to_df.stem)
    chunk = to_df.stem.split("_")[6]
    df = pd.read_csv(to_df, sep="\t")
    df.rename(columns={"0":"chrom", "1":"start", "2":"end", "3":"name"}, inplace=True)
    df["corr"] = df["corr"].apply(lambda x: np.round(x, 3))
    df["corr_sign"] = np.where(df["corr"] > 0, "plus", "minus")
    for (lnc_ens, corr_sign), subdf in df.groupby(by=["lnc_ens", "corr_sign"]):
        indexing_methyl(chunk=chunk, lnc_ens=lnc_ens, corr_sign=corr_sign, subdf=subdf)

In [None]:
#Не хватает оперативной памяти
dtype = {
    "chrom": str,
    "start": np.uint32,
    "end": np.uint32
}
all_peaks = {}
for sign in "minus","plus":
    all_peaks[sign] = pd.concat([pd.read_csv(to_df, sep="\t", dtype=dtype, header=None).loc[1:] for to_df in METHYL_INDEX.glob(f"*{sign}*")]).reset_index(drop=True)\
    .drop_duplicates().dropna()
    
all_peaks_df = pd.concat([all_peaks[sign] for sign in ("minus","plus")]).drop_duplicates()
del all_peaks

In [8]:
ENSG00000280047 = pd.concat([pd.read_csv(to_df, sep="\t").loc[1:] for to_df in METHYL_INDEX.glob("*ENSG00000280047*minus*")]).reset_index(drop=True)
ENSG00000280047

Unnamed: 0,chrom,start,end,name,corr,lnc_ens,corr_sign
0,chr13,99062606,99062607,peak_13294107,-0.89,ENSG00000280047,minus
1,chr13,99062612,99062613,peak_13294109,-0.856,ENSG00000280047,minus
2,chr2,32758365,32758366,peak_25313521,-0.869,ENSG00000280047,minus


In [10]:
!ls /data/mazurovev/all_marks/methylation/lncRNA_Peaks_corrs

lncRNA_Peaks_Correlations_corrected_non_zero_0.hdf5
lncRNA_Peaks_Correlations_corrected_non_zero_10.hdf5
lncRNA_Peaks_Correlations_corrected_non_zero_11.hdf5
lncRNA_Peaks_Correlations_corrected_non_zero_12.hdf5
lncRNA_Peaks_Correlations_corrected_non_zero_13.hdf5
lncRNA_Peaks_Correlations_corrected_non_zero_14.hdf5
lncRNA_Peaks_Correlations_corrected_non_zero_15.hdf5
lncRNA_Peaks_Correlations_corrected_non_zero_16.hdf5
lncRNA_Peaks_Correlations_corrected_non_zero_17.hdf5
lncRNA_Peaks_Correlations_corrected_non_zero_18.hdf5
lncRNA_Peaks_Correlations_corrected_non_zero_1.hdf5
lncRNA_Peaks_Correlations_corrected_non_zero_2.hdf5
lncRNA_Peaks_Correlations_corrected_non_zero_3.hdf5
lncRNA_Peaks_Correlations_corrected_non_zero_4.hdf5
lncRNA_Peaks_Correlations_corrected_non_zero_5.hdf5
lncRNA_Peaks_Correlations_corrected_non_zero_6.hdf5
lncRNA_Peaks_Correlations_corrected_non_zero_7.hdf5
lncRNA_Peaks_Correlations_corrected_non_zero_8.hdf5
lncRNA_Peaks_Correlations_corrected_non_zero_9.hdf5


In [11]:
!ls {METHYL}

lncRNA_Peaks_Correlations_corrected_non_zero_0.tsv.gz
lncRNA_Peaks_Correlations_corrected_non_zero_10.tsv.gz
lncRNA_Peaks_Correlations_corrected_non_zero_11.tsv.gz
lncRNA_Peaks_Correlations_corrected_non_zero_12.tsv.gz
lncRNA_Peaks_Correlations_corrected_non_zero_13.tsv.gz
lncRNA_Peaks_Correlations_corrected_non_zero_14.tsv.gz
lncRNA_Peaks_Correlations_corrected_non_zero_15.tsv.gz
lncRNA_Peaks_Correlations_corrected_non_zero_16.tsv.gz
lncRNA_Peaks_Correlations_corrected_non_zero_17.tsv.gz
lncRNA_Peaks_Correlations_corrected_non_zero_18.tsv.gz
lncRNA_Peaks_Correlations_corrected_non_zero_1.tsv.gz
lncRNA_Peaks_Correlations_corrected_non_zero_2.tsv.gz
lncRNA_Peaks_Correlations_corrected_non_zero_3.tsv.gz
lncRNA_Peaks_Correlations_corrected_non_zero_4.tsv.gz
lncRNA_Peaks_Correlations_corrected_non_zero_5.tsv.gz
lncRNA_Peaks_Correlations_corrected_non_zero_6.tsv.gz
lncRNA_Peaks_Correlations_corrected_non_zero_7.tsv.gz
lncRNA_Peaks_Correlations_corrected_non_zero_8.tsv.gz
lncRNA_Peaks_Correl

In [None]:
corr_df = pd.concat([pd.read_csv(to_df, sep="\t") for to_df in METHYL.glob("*.tsv.gz")])
corr_df.sort_values(by=["lnc_ens", "0", "1"], inplace=True)
corr_df.shape