In [41]:
import polars as pl
import pandas as pd
import os

In [42]:
# Build (or reuse) the circRNA -> miRNA -> target-gene table, annotated with
# each circRNA's type, host gene and source database. The joined table is
# cached as parquet so the joins only run when the cache is missing.
cache_file = 'mirna/circ_mirna_gene.parquet'

if not os.path.exists(cache_file):
    # miRNA -> target genes.
    # NOTE(review): read_pickle can execute arbitrary code while unpickling;
    # only load this file if it was produced by a trusted pipeline step.
    df_mirna_gene = (
        pl.from_pandas(pd.read_pickle("mirna/preprocessing/mirnas.pkl"), include_index=True)
        .lazy()
        .rename({"miRNA ID": 'mirna', "Gene Symbol": 'genes'})
    )

    # circRNA -> miRNA. The first len("circ_") characters of the identifier
    # are dropped unconditionally (presumably every value carries that prefix
    # -- TODO confirm) so it can be matched against the annotation's `name`.
    df_mirna_circ = (
        pl.scan_parquet("mirna/circ_mirna.parquet")
        .rename({'transcript': 'circ'})
        .with_columns(circ=pl.col("circ").str.slice(len("circ_")))
    )

    # circRNA annotation (headerless BED-like TSV); only the columns needed
    # downstream are kept, with `gene_id` exposed as `host_gene`.
    df_circ_annotation = (
        pl.scan_csv(
            "dea/annotation.bed",
            separator='\t',
            has_header=False,
            new_columns=['chr', 'start', 'end', 'name', 'score', 'strand','type', 'gene_id', 'gene_name', 'transcript_id', 'database'],
        )
        .select('name', 'type', 'gene_id', 'database')
        .rename({'gene_id': 'host_gene'})
    )

    df_joined = (
        df_mirna_circ
        .join(df_mirna_gene, on='mirna', how='inner')
        .select('circ', 'mirna', 'genes')
        .join(df_circ_annotation, left_on='circ', right_on='name', how='inner')
        .collect()
    )

    # Write to a temporary file and rename it into place: an interrupted write
    # must not leave a truncated parquet behind that later runs would accept
    # as a valid cache (only the file's existence is checked above).
    tmp_cache_file = cache_file + '.tmp'
    try:
        df_joined.write_parquet(tmp_cache_file)
        os.replace(tmp_cache_file, cache_file)
    finally:
        # After a successful replace the temp file is gone and this is a no-op.
        if os.path.exists(tmp_cache_file):
            os.remove(tmp_cache_file)

    df_joined = df_joined.lazy()
else:
    df_joined = pl.scan_parquet(cache_file)

In [51]:
# For every DESeq2 contrast, take the first few significant circRNAs from its
# result table and write each one's distinct target genes to
# `<contrast_dir>/<circ>_targets.txt` (one gene per line, no header).
deseq2_dir = '../chapters/4_results_and_discussion/figures/dea/deseq2'
PADJ_THRESHOLD = 0.05  # adjusted p-value cut-off for a circRNA to be considered
N_TOP_CIRCS = 5        # number of circRNAs exported per contrast

# Sorted so contrasts are processed in a reproducible order.
for contrast in sorted(os.listdir(deseq2_dir)):
    contrast_dir = os.path.join(deseq2_dir, contrast)
    res_file = os.path.join(contrast_dir, 'res.tsv')
    if not os.path.isfile(res_file):
        # Stray files or folders without a result table are not contrasts.
        continue

    # The first column of res.tsv has an empty header; it holds the circRNA id.
    # NOTE(review): head() keeps the first N_TOP_CIRCS significant rows in file
    # order -- these are the "top" hits only if res.tsv is already sorted by
    # significance. Confirm against the step that writes res.tsv.
    df_res = (
        pl.scan_csv(res_file, separator='\t', null_values=['NA'])
        .rename({'': 'circ'})
        .filter(pl.col('padj') < PADJ_THRESHOLD)
        .head(N_TOP_CIRCS)
        .select('circ')
    )
    circs = df_res.collect()['circ'].to_list()

    for circ in circs:
        # `genes` is exploded to one gene per row (presumably a list column --
        # TODO confirm), nulls are dropped so no blank lines are written, and
        # the distinct genes are sorted so the output is identical across runs.
        df_res_circ = (
            df_joined
            .filter(pl.col('circ') == circ)
            .explode('genes')
            .select('genes')
            .filter(pl.col('genes').is_not_null())
            .unique()
            .sort('genes')
        )
        df_res_circ.sink_csv(os.path.join(contrast_dir, f'{circ}_targets.txt'), include_header=False)