In [1]:
import os, sys, pysam, matplotlib.pyplot as plt, numpy as np, pandas as pd, seaborn as sns
from glob import glob
from Bio import SeqIO

In [2]:
def get_motifs(ref, seq, motif2pos):
    """Return positions of the requested motifs on the + strand of a sequence.

    Parameters
    ----------
    ref : str
        Reference name; used as the prefix of every reported position.
    seq : str
        Sequence to scan. Matching is exact (case-sensitive).
    motif2pos : dict
        Motif string -> offset that is added to the motif's 0-based start
        index in `seq` to obtain the reported coordinate.

    Returns
    -------
    dict
        Motif -> set of "<ref>:<start + offset>+" strings. Every motif from
        `motif2pos` is present as a key, with an empty set if it never occurs.
    """
    motifs = {m: set() for m in motif2pos}
    # nothing to look for (min() below would fail on an empty collection)
    if not motifs:
        return motifs
    # distinct motif lengths, shortest first so the inner loop can stop early
    motifs_lens = sorted(set(len(m) for m in motifs))
    # only count + strand
    strand = "+"
    # +1: the last valid start is len(seq)-len(motif), so a motif ending at
    # the very last base of the sequence is scanned as well
    for s in range(len(seq) - motifs_lens[0] + 1):
        for l in motifs_lens:
            # longer motifs no longer fit between s and the sequence end
            if s + l > len(seq):
                break
            m = seq[s:s+l]
            if m in motifs:
                pos = "%s:%s%s"%(ref, s+motif2pos[m], strand)
                motifs[m].add(pos)
    return motifs

def fasta2data(fasta, motif2pos):
    """Collect motif positions and reference names per species from a FASTA file.

    The species key is built from each record's description line: the first
    letter (lower-cased) of its second whitespace-separated token followed by
    its third token.

    Returns a tuple (s2d, s2c):
      s2d -- species key -> {motif -> set of positions from get_motifs()},
             merged over all records sharing that species key
      s2c -- species key -> list of record names, in file order
    """
    s2d, s2c = {}, {}
    for record in SeqIO.parse(fasta, "fasta"):
        ref, seq = record.name, str(record.seq)
        genus, species = record.description.split()[1:3]
        spname = "%s%s"%(genus.lower()[0], species)
        # progress line, overwritten in place thanks to the trailing \r
        sys.stderr.write(" %s %s     \r"%(spname, ref))
        motifs = get_motifs(ref, seq, motif2pos)
        if spname not in s2d:
            # first record of this species: its motif sets are used as-is
            s2d[spname] = motifs
            s2c[spname] = [ref]
            continue
        # species seen before: merge this record's positions into its sets
        merged = s2d[spname]
        for m in motif2pos:
            merged[m] = merged[m] | motifs[m]
        s2c[spname].append(ref)
    return s2d, s2c

# Reference sequences that are scanned for motifs
fasta = "/home/lpryszcz/cluster/dna_mods/ref/mock_community.fa"

# modification -> tuple of (offset, motif) pairs
mod2mers = {
    "6mA": ((2, "GATC"), ),
    "5mC": ((2, "CCAGG"), (2, "CCTGG"), (1, "CG")),
}
# flattened view: motif -> offset
motif2pos = {motif: offset for pairs in mod2mers.values() for offset, motif in pairs}

# per-species motif positions and per-species reference names
species2motifs, species2refs = fasta2data(fasta, motif2pos)

 lfermentum NZ_BJLV01000146.1     

In [8]:
# Modification table of project PRJEB22772 (the other project analysed below is PRJNA477598)
fn = "/home/lpryszcz/cluster/dna_mods/ecoli/modPhred.2020/PRJEB22772/mod.gz"
# the first 16 lines of the file are skipped -- presumably a comment header
# preceding the column names; confirm against the file
df = pd.read_csv(fn, sep="\t", skiprows=16)
# site key in the form chr:pos followed by the strand sign
df["ref"] = (
    df["chr"].astype(str)
    + ":"
    + df["pos"].astype(str)
    + df["strand"]
)
df.head()

Unnamed: 0,chr,pos,ref_base,strand,mod,modPhred/PRJEB22772/minimap2/MARC_ZFscreens_R9.4_1D-Ecoli-run_FAF05145.bam depth,modPhred/PRJEB22772/minimap2/MARC_ZFscreens_R9.4_1D-Ecoli-run_FAF05145.bam basecall_accuracy,modPhred/PRJEB22772/minimap2/MARC_ZFscreens_R9.4_1D-Ecoli-run_FAF05145.bam mod_frequency,modPhred/PRJEB22772/minimap2/MARC_ZFscreens_R9.4_1D-Ecoli-run_FAF05145.bam median_mod_prob,modPhred/PRJEB22772/minimap2/MARC_ZFscreens_R9.4_2D-Ecoli-run_FAF05711.bam depth,modPhred/PRJEB22772/minimap2/MARC_ZFscreens_R9.4_2D-Ecoli-run_FAF05711.bam basecall_accuracy,modPhred/PRJEB22772/minimap2/MARC_ZFscreens_R9.4_2D-Ecoli-run_FAF05711.bam mod_frequency,modPhred/PRJEB22772/minimap2/MARC_ZFscreens_R9.4_2D-Ecoli-run_FAF05711.bam median_mod_prob,ref
0,NC_000913.3,244,C,-,5mC,444,0.91,0.014,0.833,120,0.958,0.05,0.6,NC_000913.3:244-
1,NC_000913.3,420,C,+,5mC,464,0.978,0.713,0.967,132,0.962,0.644,0.967,NC_000913.3:420+
2,NC_000913.3,422,C,-,5mC,351,0.604,0.328,0.833,103,0.621,0.369,0.867,NC_000913.3:422-
3,NC_000913.3,423,C,-,5mC,455,0.993,0.022,0.683,123,0.984,0.057,0.767,NC_000913.3:423-
4,NC_000913.3,475,C,+,5mC,432,0.942,0.664,0.933,126,0.937,0.635,0.967,NC_000913.3:475+


In [9]:
# minimum modification frequency for a position to be counted for a sample
min_freq = 0.05
# the species is fixed for this table rather than parsed from the column names
spname = "ecoli"
# One printed row per sample, i.e. per "<bam> mod_frequency" column:
# depth, count per modification, mean frequency per modification,
# total number of positions, penetrance per motif
for spcol in [c for c in df.columns if c.endswith("mod_frequency")]:
    if spname not in species2motifs:
        print(spname, "is missing")
        continue
    # positions passing the frequency cut-off on this species' references
    _df = df[(df[spcol]>=min_freq)&(df["chr"].isin(species2refs[spname]))]
    data = []
    # mean depth of the retained positions, doubled
    # NOTE(review): presumably because depth is reported per strand -- confirm
    data.append(2*_df[spcol.replace("mod_frequency", "depth")].mean())
    # get number of every modification
    for mod in mod2mers.keys():
        data.append((_df["mod"]==mod).sum())
    # get avg mod_freq
    for mod in mod2mers.keys():
        data.append(_df.loc[_df["mod"]==mod, spcol].mean())
    # store total number of modifications
    data.append(len(_df))
    # get each motif penetrance: fraction of motif sites found among retained positions
    # (the set of retained sites is built once instead of once per motif)
    observed = set(_df["ref"])
    for m, mpos in species2motifs[spname].items():
        common = mpos & observed
        # a motif absent from the reference has no defined penetrance
        data.append(len(common)/len(mpos) if mpos else float("nan"))
    print(spname, *data)

ecoli 904.8964473568217 38897 29165 0.6316899760907011 0.42358745071146925 68062 0.9979606776824932 0.9966672221296451 0.9978508844437097 0.0011966792870675015
ecoli 250.80205956351296 38495 29869 0.5925278088063384 0.4410661220663565 68364 0.996705710102489 0.9988335277453758 0.9988427839312283 0.004253257707047144


In [5]:
# Modification table of project PRJNA477598 (the other project analysed above is PRJEB22772)
fn = "/home/lpryszcz/cluster/dna_mods/ecoli/modPhred.2020/PRJNA477598/mod.gz"
# the first 16 lines of the file are skipped -- presumably a comment header
# preceding the column names; confirm against the file
df = pd.read_csv(fn, sep="\t", skiprows=16)
# site key in the form chr:pos followed by the strand sign
df["ref"] = (
    df["chr"].astype(str)
    + ":"
    + df["pos"].astype(str)
    + df["strand"]
)
df.head()

Unnamed: 0,chr,pos,ref_base,strand,mod,modPhred/PRJNA477598/minimap2/barcode01_cneoformans_pass.bam depth,modPhred/PRJNA477598/minimap2/barcode01_cneoformans_pass.bam basecall_accuracy,modPhred/PRJNA477598/minimap2/barcode01_cneoformans_pass.bam mod_frequency,modPhred/PRJNA477598/minimap2/barcode01_cneoformans_pass.bam median_mod_prob,modPhred/PRJNA477598/minimap2/barcode01_paeruginosa_pass.bam depth,...,modPhred/PRJNA477598/minimap2/barcode08_scerevisiae_pass.bam median_mod_prob,modPhred/PRJNA477598/minimap2/nasa_chiu_r9.bam depth,modPhred/PRJNA477598/minimap2/nasa_chiu_r9.bam basecall_accuracy,modPhred/PRJNA477598/minimap2/nasa_chiu_r9.bam mod_frequency,modPhred/PRJNA477598/minimap2/nasa_chiu_r9.bam median_mod_prob,modPhred/PRJNA477598/minimap2/nasa_mason_r9.bam depth,modPhred/PRJNA477598/minimap2/nasa_mason_r9.bam basecall_accuracy,modPhred/PRJNA477598/minimap2/nasa_mason_r9.bam mod_frequency,modPhred/PRJNA477598/minimap2/nasa_mason_r9.bam median_mod_prob,ref
0,NC_000913.3,23,C,-,5mC,0,,,,0,...,,6,1.0,0.167,0.516,13,1.0,0.0,,NC_000913.3:23-
1,NC_000913.3,24,C,-,5mC,0,,,,0,...,,7,1.0,0.143,0.71,14,1.0,0.0,,NC_000913.3:24-
2,NC_000913.3,35,A,-,5mC,0,,,,0,...,,8,1.0,0.0,,16,0.938,0.062,0.774,NC_000913.3:35-
3,NC_000913.3,123,T,-,5mC,0,,,,0,...,,8,1.0,0.0,,18,0.944,0.056,0.806,NC_000913.3:123-
4,NC_000913.3,153,C,-,5mC,0,,,,0,...,,8,0.875,0.0,,19,0.947,0.105,0.919,NC_000913.3:153-


In [7]:
# One printed row per sample, i.e. per "<bam> mod_frequency" column:
# depth, count per modification, mean frequency per modification,
# total number of positions, penetrance per motif
for spcol in [c for c in df.columns if c.endswith("mod_frequency")]:
    # species key = second "_"-separated token of the column name
    spname = spcol.split("_")[1]
    if spname not in species2motifs:
        print(spname, "is missing")
        continue
    # positions passing the frequency cut-off on this species' references
    _df = df[(df[spcol]>=min_freq)&(df["chr"].isin(species2refs[spname]))]
    data = []
    # mean depth of the retained positions, doubled
    # NOTE(review): presumably because depth is reported per strand -- confirm
    data.append(2*_df[spcol.replace("mod_frequency", "depth")].mean())
    # get number of every modification
    for mod in mod2mers.keys():
        data.append((_df["mod"]==mod).sum())
    # get avg mod_freq
    for mod in mod2mers.keys():
        data.append(_df.loc[_df["mod"]==mod, spcol].mean())
    # store total number of modifications
    data.append(len(_df))
    # get each motif penetrance: fraction of motif sites found among retained positions
    # (the set of retained sites is built once instead of once per motif)
    observed = set(_df["ref"])
    for m, mpos in species2motifs[spname].items():
        common = mpos & observed
        # a motif absent from the reference has no defined penetrance
        data.append(len(common)/len(mpos) if mpos else float("nan"))
    print(spname, *data)

cneoformans 65.78276113133576 17204 72071 0.13394576842594746 0.10239215495830499 89275 0.1017274472168906 0.0021475580187045374 0.001169751599807335 0.03904561765230666
paeruginosa 147.31385435637554 65784 9990 0.26019145992946613 0.07769729729729731 75774 0.776373881015684 0.0006533812479581836 0.0007201880069744522 0.00339133192333598
ecoli 346.96027108433736 36912 29488 0.6344618010403121 0.37014775501899083 66400 0.9391863626856306 0.9385102482919513 0.9378409654488346 0.004368600288933168
ecoli 45.61268275073702 39511 43594 0.592452582825036 0.26199488461714915 83105 0.9376699435264589 0.928178636893851 0.9332121011737478 0.013627726049833762
lfermentum nan 0 0 nan nan 0 0.0 0.0 0.0 0.0
efaecalis 374.61111966733404 2052 4441 0.1252982456140351 0.08072798919162352 6493 0.13530685920577618 0.0 0.0 0.019455486773531608
lmonocytogenes 195.55796120884077 1982 4669 0.12580575176589304 0.0788935532233883 6651 0.15637065637065636 0.0010384215991692627 0.001076426264800861 0.0168515574796

In [12]:
# Mean of each sample's median_mod_prob column, reported per modification type
for col in (name for name in df.columns if name.endswith("median_mod_prob")):
    # the column name starts with the BAM path; keep only its file name
    sample = os.path.basename(col.split()[0])
    for mod in mod2mers:
        print(sample, mod, df.loc[df["mod"]==mod, col].mean())

MARC_ZFscreens_R9.4_1D-Ecoli-run_FAF05145.bam 6mA 0.8780236143837723
MARC_ZFscreens_R9.4_1D-Ecoli-run_FAF05145.bam 5mC 0.865344574507618
MARC_ZFscreens_R9.4_2D-Ecoli-run_FAF05711.bam 6mA 0.879114881637876
MARC_ZFscreens_R9.4_2D-Ecoli-run_FAF05711.bam 5mC 0.8731131579760765


In [13]:
# Scratch check: Python chains comparisons, so this is (2 < 3) and (3 < 4)
2 < 3 < 4

True