In [3]:
import polars as pl
import pandas as pd
import os

In [4]:
def parse_locstring(locstring: str):
    """Break a ``chrom:start-end:strand`` locus string into its fields.

    Args:
        locstring: Text with exactly two ``:`` separators, whose middle
            field is two integers joined by a single ``-``
            (e.g. ``"chr5:87925915-87926842:+"``).

    Returns:
        A dict with keys ``"chr"`` (str), ``"start"`` (int), ``"end"`` (int)
        and ``"strand"`` (str).

    Raises:
        ValueError: If the string does not have the expected number of
            fields or a coordinate is not an integer.
    """
    chromosome, interval, strand_symbol = locstring.split(":")
    lower, upper = map(int, interval.split("-"))
    return dict(chr=chromosome, start=lower, end=upper, strand=strand_symbol)

In [6]:
# Build (or load from cache) the joined circRNA / miRNA / gene table.
# The result, `df_joined`, is a LazyFrame over an in-memory table with the
# columns: mirna, genes, type, host_gene, database, chr, start, end.
cache_file = 'mirna/circ_mirna_gene.parquet'

if not os.path.exists(cache_file):
    # NOTE(review): pd.read_pickle can execute arbitrary code while loading;
    # only use it on a pickle produced by this project's own preprocessing.
    # The frame must be bound to `df_mirna_gene`, the name every later step
    # reads; binding it to another name raises NameError on a fresh kernel.
    df_mirna_gene = pl.from_pandas(pd.read_pickle("mirna/preprocessing/mirnas.pkl"), include_index=True).lazy()
    df_mirna_gene = df_mirna_gene.rename({"miRNA ID": 'mirna', "Gene Symbol": 'genes'})

    df_mirna_circ = pl.scan_parquet("mirna/circ_mirna.parquet")
    df_mirna_circ = df_mirna_circ.rename({'transcript': 'circ'})
    # Drop the first len("circ_") characters of each identifier
    # (presumably a literal "circ_" prefix - confirm against the parquet file).
    df_mirna_circ = df_mirna_circ.with_columns(circ=pl.col("circ").str.slice(len("circ_")))

    # Header-less, tab-separated annotation; only four of the columns are used.
    df_circ_annotation = pl.scan_csv("dea/annotation.bed", separator='\t', has_header=False, new_columns=['chr', 'start', 'end', 'name', 'score', 'strand','type', 'gene_id', 'gene_name', 'transcript_id', 'database'])
    df_circ_annotation = df_circ_annotation.select('name', 'type', 'gene_id', 'database')
    df_circ_annotation = df_circ_annotation.rename({'gene_id': 'host_gene'})

    df_joined = df_mirna_circ.join(df_mirna_gene, on='mirna', how='inner')
    df_joined = df_joined.select('circ', 'mirna', 'genes')
    df_joined = df_joined.join(df_circ_annotation, left_on='circ', right_on='name', how='inner')

    # Parse "chrom:start-end:strand" with native string/list expressions
    # instead of per-row Python callbacks; the strand field is not kept.
    locus_fields = pl.col("circ").str.split(":")
    coord_fields = locus_fields.list.get(1).str.split("-")
    df_joined = df_joined.with_columns(
        chr=locus_fields.list.get(0),
        start=coord_fields.list.get(0).cast(pl.Int64),
        end=coord_fields.list.get(1).cast(pl.Int64),
    )

    df_joined = df_joined.drop("circ")

    # Materialise once, persist, then hand back a lazy view of the in-memory
    # table so later cells can filter it repeatedly without recomputation.
    df_joined = df_joined.collect()
    df_joined.write_parquet(cache_file)
    df_joined = df_joined.lazy()
else:
    # Load eagerly for the same reason: repeated filters hit memory, not disk.
    df_joined = pl.read_parquet(cache_file).lazy()

In [12]:
# For every contrast directory, take the first `top_n` rows of its sign.tsv
# and write the gene names matched to each listed circRNA into
# "<chrom>:<start>-<end>_targets.txt" inside that contrast directory.
deseq2_dir = '../chapters/4_results_and_discussion/figures/dea/deseq2'
max_shift = 3  # maximum absolute difference allowed on each of start and end
top_n = 5      # number of leading rows of sign.tsv to process per contrast
for contrast in sorted(os.listdir(deseq2_dir)):  # sorted for a reproducible order
    contrast_dir = os.path.join(deseq2_dir, contrast)
    res_file = os.path.join(contrast_dir, 'sign.tsv')
    if not os.path.isfile(res_file):
        # Stray files or directories without a result table would otherwise
        # make the CSV scan fail and abort the whole loop.
        continue
    df_res = pl.scan_csv(res_file, separator='\t', null_values=['NA'])
    # The identifier column is expected to have an empty header name.
    df_res = df_res.rename({'': 'circ'})
    df_res = df_res.head(top_n)
    df_res = df_res.select('circ')
    circs = df_res.collect()['circ'].to_list()
    for circ in circs:
        print(f'Processing {circ}')
        location = parse_locstring(circ)
        chrom, start, end = location['chr'], location['start'], location['end']
        # Strand is used neither in the match nor in the output file name, so
        # opposite-strand entries with equal coordinates rewrite the same file.
        df_res_circ = df_joined.filter((pl.col('chr') == chrom) & ((pl.col('start') - start).abs() <= max_shift) & ((pl.col('end') - end).abs() <= max_shift))
        df_res_circ = df_res_circ.explode('genes')
        df_res_circ = df_res_circ.select('genes')
        # Distinct, non-empty gene names in sorted order (the comparison also
        # discards nulls).
        df_res_circ = df_res_circ.unique().filter(pl.col('genes') != '').sort('genes').collect()
        if len(df_res_circ) > 0:
            df_res_circ.write_csv(os.path.join(contrast_dir, f'{chrom}:{start}-{end}_targets.txt'), include_header=False)
        else:
            print('No targets found')

Processing chr5:87925915-87926842:+
Processing chr5:87925915-87926842:-
Processing chr5:87817372-87821139:-
Processing chr5:87817373-87821140:+
Processing chr9:22555041-22570441:+
No targets found
Processing chr5:87925915-87926842:+
Processing chr5:87925915-87926842:-
Processing chr2:24753235-24767638:+
No targets found
Processing chr2:24753235-24767638:-
No targets found
Processing chr17:40158496-40158638:+
Processing chr1:21494273-21549693:+
No targets found
Processing chr1:21494273-21549693:-
No targets found
Processing chr1:36249067-36255387:+
Processing chr1:36249067-36255387:-
Processing chr1:40049716-40051001:+
Processing chr1:4212835-4298842:+
No targets found
Processing chr1:4212835-4298842:-
No targets found
Processing chr1:9949250-9952116:+
Processing chr1:9949251-9952116:-
Processing chr1:10258981-10270129:+
No targets found
