In [63]:
from subprocess import Popen, PIPE, DEVNULL
from tempfile import NamedTemporaryFile
from io import StringIO
from collections import defaultdict
from pathlib import Path

import pandas as pd
import pyranges as pr
import pysam

from tqdm import tqdm

from scripts.get_labels import read_knrgl
from myutils.rmsk import read_rmsk

import seaborn as sns
import matplotlib.pyplot as plt

## Blast the L1 capture sequence

In [33]:
rmsk_file = "/iblm/netapp/data4/mcuoco/sz_slavseq/resources/hs38d1.fa.out"
rmsk = read_rmsk(rmsk_file)

rep_names = [
    "L1HS_3end",
    "L1PA2_3end",
    "L1PA3_3end",
    "L1PA4_3end",
    "L1PA5_3end",
    "L1PA6_3end",
]

rmsk = rmsk.loc[
    (rmsk["repName"].isin(rep_names))
    & (rmsk["repEnd"] > 860)
    & (rmsk["repStart"] < 765),
    :,
]
rmsk = rmsk.rename(
    columns={
        "genoName": "Chromosome",
        "genoStart": "Start",
        "genoEnd": "End",
        "strand": "Strand",
    }
)

KeyboardInterrupt: 

In [None]:
genome = "/iblm/netapp/data4/mcuoco/sz_slavseq/resources/hs38d1.fa"
L1_capture = "/iblm/logglun02/mcuoco/workflows/sz_slavseq/resources/L1_capture.fa"

In [None]:
# index the genome
cmd = f"makeblastdb -in {genome} -dbtype 'nucl' -blastdb_version 5 -parse_seqids -out {genome}"
Popen(cmd, shell=True).communicate()



Building a new DB, current time: 05/23/2023 19:00:49
New DB name:   /iblm/netapp/data4/mcuoco/sz_slavseq/resources/hs38d1.fa
New DB title:  /iblm/netapp/data4/mcuoco/sz_slavseq/resources/hs38d1.fa
Sequence type: Nucleotide
Deleted existing Nucleotide BLAST database named /iblm/netapp/data4/mcuoco/sz_slavseq/resources/hs38d1.fa
Keep MBits: T
Maximum file size: 3000000000B
Adding sequences from FASTA; added 2580 sequences in 53.692 seconds.




(None, None)

In [None]:
# define blast function
outfmt = "6 qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore sstrand qcov"


def blast(query, db, extra, outfmt=outfmt):
    with NamedTemporaryFile() as tmp:
        cmd = (
            f"blastn -query {query} {extra} -db {db} -outfmt '{outfmt}' -out {tmp.name}"
        )
        Popen(cmd, shell=True).communicate()
        df = pd.read_csv(tmp.name, sep="\t", header=None, names=outfmt.split(" ")[1:])

    df.rename(
        {"sseqid": "Chromosome", "sstart": "Start", "send": "End", "sstrand": "Strand"},
        axis=1,
        inplace=True,
    )
    df["Strand"] = df["Strand"].str.replace("minus", "-")
    df["Strand"] = df["Strand"].str.replace("plus", "+")
    return df


def overlap(a, b):
    a = pr.PyRanges(a)
    b = pr.PyRanges(b)
    return b.overlap(a).df

In [45]:
blast_df = blast(L1_capture, genome, f"-task blastn -ungapped -no_greedy")
hits = blast_df.shape[0]
ovl = overlap(rmsk, blast_df)
blast_covered_by_rmsk = ovl.shape[0]
ovl = overlap(blast_df, rmsk)
rmsk_covered_by_blast = ovl.shape[0]

print(f"hits: {hits}")
print(
    f"blast covered by rmsk: {blast_covered_by_rmsk} ({blast_covered_by_rmsk/hits:2.2%})"
)
print(
    f"rmsk covered by blast: {rmsk_covered_by_blast}/{rmsk.shape[0]} ({rmsk_covered_by_blast/rmsk.shape[0]:2.2%})"
)

hits: 19933
blast covered by rmsk: 19536 (98.01%)
rmsk covered by blast: 19929/72636 (27.44%)
