# Fixing & editing feature collection code

In [377]:
import os
import pandas as pd
import numpy as np
import sys

from collections import defaultdict
from collections import Counter
import itertools

import functools
from functools import reduce
import itertools
from collections import defaultdict
from collections import Counter
from functools import reduce

import logging
import matplotlib.pyplot as plt

In [244]:
# set folders
project_dir = "/mmfs1/gscratch/stergachislab/mwperez/ctcf-footprinting"
data_dir = "{}/candidate_footprints".format(project_dir)
feature_dir = "{}/feature_data".format(project_dir)

## helper fxns

In [245]:
def get_motif_seq(subset_sequence):
    '''Get motif sequence from subset_sequence.'''
    center_idx = 100
    motif_len = 35
    motif_seq = subset_sequence[center_idx:(center_idx+motif_len)]
    return motif_seq

In [246]:
def get_kmers(seq, k):
    '''Decompose the input sequence (str) into k-mers (str).'''
    num_kmers = len(seq) - k + 1
    seqs = [seq[i:i+k] for i in range(num_kmers)]
    return seqs

In [282]:
def clean_sequences(df):
    ''''Remove rows with an N character in sequence.'''
    print("Removing sequences w/ N: {:,}".format(df["subset_sequence"].str.contains("N").sum()))
    return df[~df["subset_sequence"].str.contains("N")]

In [283]:
def clean_chroms(df):
    ''''Remove rows no in standard chromosomes (chr1-22, chrX).'''
    clean_chroms = (df["chrom"].isin([f"chr{x}" for x in list(range(1, 23))+["X"]]))
    print("Removing weird chromosomes: {:,}".format((df.shape[0] - clean_chroms.sum())))
    return df.loc[clean_chroms]

In [266]:
def filt_msps(df, motif_len=35):
    '''Filters for motif/query instances with a motif within 
    a MSP and adds MSP length to each motif/query group.'''
    
    # position of MSPs containing a motif
    msp_mask = (df["centered_position_type"] == "msp") & (df["centered_start"] <= 0) & (df["centered_end"] >= motif_len)
    print("MSP's with a motif: " + "{:,}".format(msp_mask.sum()))
    msp_groups = [(row["motif_name"], row["query_name"]) for idx, row in df[msp_mask].iterrows()]
    
    # filter for rows with motifs within an MSP (gets both MSP's & m6a's)
    df = df[df[["motif_name", "query_name"]].apply(tuple, 1).isin(msp_groups)]
    print("Total observations from motifs within an MSP: " + "{:,}".format(df.shape[0]))
    
    # add MSP size corresponding to each group
    # df with msp sizes
    df_msp = df.loc[msp_mask, ["motif_name", "query_name", "centered_end", "centered_start"]]
    df_msp["msp_size"] = df_msp["centered_end"] - df_msp["centered_start"]

    # match msp back to it's motif & fiber
    df = df.merge(df_msp[["motif_name", "query_name", "msp_size"]], on=["motif_name", "query_name"])
    print("Merged df shape w/ msp_size: " + "{:,}".format(df.shape[0]))
    return df

__Mitchell's code__

In [274]:
import itertools

COMPLEMENT = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}

def revcomp(seq):
    return "".join(COMPLEMENT[base] for base in reversed(seq))

# make all possible 3-mers
def make_kmer_dict(kmer_size, use_canonical=False):
    kmer_dict = {}
    kmers = []
    # cartesian product (set formed from 2+ sets w/ all ordered pairs)
    for kmer_list in itertools.product(["A", "C", "G", "T"], repeat=kmer_size):
        kmer = "".join(kmer_list)
        rc_kmer = revcomp(kmer)
        if rc_kmer < kmer and use_canonical:
            kmer = rc_kmer
        # skip no m6a possible kmers
        if kmer.count("A") + kmer.count("T") == 0:
            continue
        kmers.append(kmer)
        kmer_dict[f"{kmer}_count"] = 0
        kmer_dict[f"{kmer}_m6a_count"] = 0
    return kmer_dict, kmers

def kmer_features(seq, m6a_bool, kmer_size = 3, use_canonical = False):
    stop_index = len(seq) - kmer_size + 1
    kmer_counts, kmers = make_kmer_dict(kmer_size, use_canonical=use_canonical)
    kmer_feats = {}
    for i in range(stop_index):
        # get current index
        kmer = seq[i:i+kmer_size]
        m6a_count = m6a_bool[i:i+kmer_size].sum()
        # skip no m6a possible kmers
        if kmer.count("A") + kmer.count("T") == 0:
            continue
        # get canonical kmer
        rc_kmer = revcomp(kmer)
        if rc_kmer < kmer and use_canonical:
            kmer = rc_kmer
        # counts 
        kmer_counts[f"{kmer}_count"] += 1
        kmer_counts[f"{kmer}_m6a_count"] += m6a_count
        
    # get motif_m6a_prop
    for kmer in kmers:
        AT_count = (kmer.count("A") + kmer.count("T"))
        motif_m6a_prop = weird_division(kmer_counts[f"{kmer}_m6a_count"], 
                                        (AT_count*kmer_counts[f"{kmer}_count"]))
        #if motif_m6a_prop > 1:
            #raise ValueError(f"Motif m6A proportion > 1. {kmer} value: {motif_m6a_prop}")
        kmer_feats[f"{kmer}_count"] = kmer_counts[f"{kmer}_count"]
        kmer_feats[f"{kmer}_m6a_prop"] = weird_division(kmer_counts[f"{kmer}_m6a_count"],
                                                        (AT_count * kmer_counts[f"{kmer}_count"]))

    return kmer_feats

In [248]:
def weird_division(n, d):
    # prevent divide by 0 error
    return n/d if d else 0

In [217]:
# establish input data
subseq_motif = "ACGTACGTACGTACGTACGTATCGGG"
subseq_m6a   = "10010000000100010001110000"
# convert m6a boolean string to numpy array
m6a_bool = np.array([int(x) for x in subseq_m6a])

In [218]:
# features (canonical)
feats = kmer_features(subseq_motif, m6a_bool, kmer_size = 3, use_canonical=True)
print(len(feats))
for k,v in feats.items():
    if v == 0:
        continue
    else:
        print(k, v)

56
ACG_count 10
ACG_m6a_prop 0.5
ATA_count 1
ATA_m6a_prop 1.0
ATC_count 1
ATC_m6a_prop 1.0
CGA_count 1
CGA_m6a_prop 1.0
GTA_count 9
GTA_m6a_prop 0.4444444444444444


In [219]:
# features (non-canonical)
feats = kmer_features(subseq_motif, m6a_bool, kmer_size = 3, use_canonical=False)
print(len(feats))
for k,v in feats.items():
    if v == 0:
        continue
    else:
        print(k, v)

112
ACG_count 5
ACG_m6a_prop 0.2
ATC_count 1
ATC_m6a_prop 1.0
CGT_count 5
CGT_m6a_prop 0.8
GTA_count 5
GTA_m6a_prop 0.5
TAC_count 4
TAC_m6a_prop 0.375
TAT_count 1
TAT_m6a_prop 1.0
TCG_count 1
TCG_m6a_prop 1.0


## Positive (L)

In [249]:
# CTCF L cleaned
motif_type = "L"
data_file = "{}/CTCF_m6a_fiberseq_{}_100bp_positive-cleaned.txt".format(data_dir, motif_type)

In [250]:
%%time
# read in data
n_rows = None
df = pd.read_csv(data_file, sep="\t", nrows=n_rows)
print("ft center data - rows: {:,}, columns: {:,}".format(df.shape[0], df.shape[1]))

ft center data - rows: 9,495,739, columns: 16
CPU times: user 18.3 s, sys: 2.29 s, total: 20.6 s
Wall time: 20.7 s


In [251]:
print("rows: {:,}, columns: {:,}".format(df.shape[0], df.shape[1]))
df.head()

rows: 9,495,739, columns: 16


Unnamed: 0,motif_query,motif_name,chrom,centering_position,strand,subset_sequence,reference_start,reference_end,query_name,centered_query_start,centered_query_end,query_length,centered_position_type,centered_start,centered_end,msp_size
0,chr1_1033080_+/m54329U_210813_020940/24183559/ccs,chr1_1033080_+,chr1,1033080,+,GACCTACGGGGGCGGGTGTGGGGACGCCGGACTACGCGTCAGGAGT...,1008277,1033947,m54329U_210813_020940/24183559/ccs,-24816,861,25677,m6a,-77,-76,225
1,chr1_1033080_+/m54329U_210813_020940/24183559/ccs,chr1_1033080_+,chr1,1033080,+,GACCTACGGGGGCGGGTGTGGGGACGCCGGACTACGCGTCAGGAGT...,1008277,1033947,m54329U_210813_020940/24183559/ccs,-24816,861,25677,m6a,-70,-69,225
2,chr1_1033080_+/m54329U_210813_020940/24183559/ccs,chr1_1033080_+,chr1,1033080,+,GACCTACGGGGGCGGGTGTGGGGACGCCGGACTACGCGTCAGGAGT...,1008277,1033947,m54329U_210813_020940/24183559/ccs,-24816,861,25677,m6a,-67,-66,225
3,chr1_1033080_+/m54329U_210813_020940/24183559/ccs,chr1_1033080_+,chr1,1033080,+,GACCTACGGGGGCGGGTGTGGGGACGCCGGACTACGCGTCAGGAGT...,1008277,1033947,m54329U_210813_020940/24183559/ccs,-24816,861,25677,m6a,-62,-61,225
4,chr1_1033080_+/m54329U_210813_020940/24183559/ccs,chr1_1033080_+,chr1,1033080,+,GACCTACGGGGGCGGGTGTGGGGACGCCGGACTACGCGTCAGGAGT...,1008277,1033947,m54329U_210813_020940/24183559/ccs,-24816,861,25677,m6a,-60,-59,225


In [None]:
# establish input data
subseq_motif = "ACGTACGTACGTACGTACGTATCGGG"
subseq_m6a   = "10010000000100010001110000"
# convert m6a boolean string to numpy array
m6a_bool = np.array([int(x) for x in subseq_m6a])

In [252]:
# filter for m6As within 40 bp flank
m6a_range_mask = (df["centered_start"] >= -40) & (df["centered_end"] <= 75)
print("Total m6a's within 40 bp flank of motif: " + "{:,}".format(m6a_range_mask.sum()))
df = df[m6a_range_mask]

Total m6a's within 40 bp flank of motif: 6,388,435


In [253]:
# group by motif/query name
grouping_cols = ["motif_name", "query_name"]
df_grouped = df.groupby(grouping_cols)
# get group names (keys)
group_names = list(df_grouped.groups.keys())
print("Unique motif-sequence groups: " + "{:,}".format(len(group_names)))

Unique motif-sequence groups: 238,348


In [128]:
def agg_kmer_features(x):
    '''Collect k-mer features per group.'''
    # get sequence
    center = 100
    motif_len = 35
    subseq = x["subset_sequence"].unique()[0]
    subseq_motif = subseq[center:(center+motif_len)]
    
    # m6a instances in motif
    m6a_bool = x[(x["centered_start"] >= 0) & (x["centered_end"] <= 35)]["centered_start"].values
    m6a_bool = [1 if (i in m6a_bool) else 0 for i in range(0, 35)]
    # convert m6a bool to numpy array
    m6a_bool = np.array(m6a_bool)
    
    kmer_counts = kmer_features(subseq_motif, m6a_bool, kmer_size=3, use_canonical=True)
    
    return pd.Series(kmer_counts, index=list(kmer_counts.keys()))

In [303]:
def agg_features(x):
    '''Collect motif features per group.'''
    d = defaultdict()
    print(x['motif_query_start_end'])
    # msp size
    d["msp_size"] = x["msp_size"].unique()[0]
    
    # get sequences (left, right, motif)
    center = 100
    motif_len = 35
    flank = 40
    subseq = x["subset_sequence"].unique()[0]
    subseq_motif = subseq[center:(center+motif_len)]
    subseq_l = subseq[(center-flank):center]
    subseq_r = subseq[(center+motif_len):(center+motif_len+flank)]
    
    # AT count (left, right, motif)
    d["left_AT_count"] = (subseq_l.count("A") + subseq_l.count("T"))
    d["right_AT_count"] = (subseq_r.count("A") + subseq_r.count("T"))
    d["motif_AT_count"] = (subseq_motif.count("A") + subseq_motif.count("T"))
    
    # proportion of bases that are AT
    d["left_AT_prop"] = weird_division(d["left_AT_count"], flank)
    d["right_AT_prop"] = weird_division(d["right_AT_count"], flank)
    d["motif_AT_prop"] = weird_division(d["motif_AT_count"], motif_len)
    
    # m6a instances
    d["left_m6a_count"] = (x["centered_start"] < 0).sum()
    d["right_m6a_count"] = (x["centered_start"] >= 35).sum()
    d["motif_m6a_count"] = ((x["centered_start"] >= 0) & (x["centered_start"] < 35)).sum()
    
    # proportion of ATs that are methylated (m6a_count/AT_count) return 0 if no ATs
    d["left_m6a_prop"] = weird_division(d["left_m6a_count"], d["left_AT_count"])
    d["right_m6a_prop"] = weird_division(d["right_m6a_count"], d["right_AT_count"])
    d["motif_m6a_prop"] = weird_division(d["motif_m6a_count"], d["motif_AT_count"])
    
    # sanity check proportions
    for i in ["left", "right", "motif"]:
        if (d[f"{i}_AT_prop"] > 1) or (d[f"{i}_m6a_prop"] > 1):
            print("motif: ", x["motif_name"].unique()[0])
            print("query name: ", x["query_name"].unique()[0])
            print("motif-query-start-end: ", x["motif_query_start_end"].unique()[0])
            
            print(f"{i} AT prop: ", d[f"{i}_AT_prop"])
            print(f"{i} m6a prop: ", d[f"{i}_m6a_prop"])
            print(f"{i} m6a count: ", d[f"{i}_m6a_count"])
            print(f"{i} AT count: ", d[f"{i}_AT_count"])
            
            print("centered position type(s): {}".format(x.centering_position_type.value_counts()))
            logging.error("AT or m6a proportion > 1.")
    
    # ----- kmers -----
    # m6a instances in motif
    m6a_bool = x[(x["centered_start"] >= 0) & (x["centered_end"] <= 35)]["centered_start"].values
    m6a_bool = [1 if (i in m6a_bool) else 0 for i in range(0, 35)]
    # convert m6a bool to numpy array
    m6a_bool = np.array(m6a_bool)
    # get counts & m6a prop per k-mer
    kmer_counts = kmer_features(subseq_motif, m6a_bool, kmer_size=3, use_canonical=True)
    
    # ----- rle -----
    # make np array for sequence
    np_subseq_motif = np.frombuffer(bytes(subseq_motif, "utf-8"), dtype="S1")
    # make AT mask
    AT_bases = (np_subseq_motif == b"A" ) | (np_subseq_motif == b"T")
    # get run lengths using the AT base space
    run_lengths, run_starts, run_values = rle(m6a_bool[AT_bases])
    # subset the run lengths that encode zeros (non-m6a):
    non_m6a_run_lengths = run_lengths[run_values == 0]

    d["rle_max"] = max(non_m6a_run_lengths) if non_m6a_run_lengths.size else 0
    
    d = d | kmer_counts
    
    return pd.Series(d, index=list(d.keys()))

In [256]:
%%time
ix = 500
print("Unique motif-sequence groups: {:,}".format(len(list(df.iloc[:ix].groupby(grouping_cols, dropna=False).groups.keys()))))
res = df.iloc[:ix].groupby(grouping_cols, dropna=False).apply(lambda x: agg_features(x)).reset_index()
print(res.shape)
res.head()

Unique motif-sequence groups: 32
(32, 72)
CPU times: user 67.8 ms, sys: 3.89 ms, total: 71.7 ms
Wall time: 69.6 ms


Unnamed: 0,motif_name,query_name,msp_size,left_AT_count,right_AT_count,motif_AT_count,left_AT_prop,right_AT_prop,motif_AT_prop,left_m6a_count,...,GCA_count,GCA_m6a_prop,GGA_count,GGA_m6a_prop,GTA_count,GTA_m6a_prop,TAA_count,TAA_m6a_prop,TCA_count,TCA_m6a_prop
0,chr1_1033080_+,m54329U_210323_190418/144837022/ccs,171.0,9.0,11.0,10.0,0.225,0.275,0.285714,6.0,...,0.0,0.0,3.0,0.666667,0.0,0.0,0.0,0.0,0.0,0.0
1,chr1_1033080_+,m54329U_210323_190418/28969124/ccs,291.0,9.0,12.0,10.0,0.225,0.3,0.285714,3.0,...,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,chr1_1033080_+,m54329U_210323_190418/61606199/ccs,224.0,9.0,11.0,10.0,0.225,0.275,0.285714,7.0,...,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,chr1_1033080_+,m54329U_210323_190418/65799401/ccs,383.0,9.0,11.0,10.0,0.225,0.275,0.285714,5.0,...,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,chr1_1033080_+,m54329U_210323_190418/73794045/ccs,250.0,9.0,11.0,10.0,0.225,0.275,0.285714,6.0,...,0.0,0.0,3.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0


In [225]:
%%time
res = df.groupby(grouping_cols, dropna=False).apply(lambda x: agg_features(x)).reset_index()
print("rows: {:,} | columns: {:,}".format(res.shape[0], res.shape[1]))
res.head()

rows: 238,348 | columns: 72
CPU times: user 7min 8s, sys: 2.4 s, total: 7min 10s
Wall time: 7min 13s


Unnamed: 0,motif_name,query_name,msp_size,left_AT_count,right_AT_count,motif_AT_count,left_AT_prop,right_AT_prop,motif_AT_prop,left_m6a_count,...,GCA_count,GCA_m6a_prop,GGA_count,GGA_m6a_prop,GTA_count,GTA_m6a_prop,TAA_count,TAA_m6a_prop,TCA_count,TCA_m6a_prop
0,chr10_100338605_+,m54329U_210323_190418/12585455/ccs,200.0,18.0,18.0,13.0,0.45,0.45,0.325,14.0,...,3.0,0.666667,0.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0
1,chr10_100338605_+,m54329U_210323_190418/16515087/ccs,212.0,18.0,18.0,13.0,0.45,0.45,0.325,13.0,...,3.0,0.666667,0.0,0.0,2.0,0.25,1.0,0.333333,0.0,0.0
2,chr10_100338605_+,m54329U_210323_190418/169477611/ccs,181.0,18.0,18.0,13.0,0.45,0.45,0.325,10.0,...,3.0,0.666667,0.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0
3,chr10_100338605_+,m54329U_210323_190418/173999297/ccs,134.0,18.0,18.0,13.0,0.45,0.45,0.325,10.0,...,3.0,1.0,0.0,0.0,2.0,0.25,1.0,0.333333,0.0,0.0
4,chr10_100338605_+,m54329U_210323_190418/180160021/ccs,194.0,18.0,18.0,13.0,0.45,0.45,0.325,12.0,...,3.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0


In [None]:
# group by motif/query name
grouping_cols = ["motif_name", "query_name"]
df_grouped = df.groupby(grouping_cols)

In [135]:
print(group_names[0])
grouped = df_grouped.get_group(group_names[0])

('chr10_100338605_+', 'm54329U_210323_190418/12585455/ccs')


In [137]:
grouped.head()

Unnamed: 0,motif_query,motif_name,chrom,centering_position,strand,subset_sequence,reference_start,reference_end,query_name,centered_query_start,centered_query_end,query_length,centered_position_type,centered_start,centered_end,msp_size
877799,chr10_100338605_+/m54329U_210323_190418/125854...,chr10_100338605_+,chr10,100338605,+,ATGCTGTAAATGAGAGAGGGAGAGAACCCCAACCAGCAGAACAAAT...,100333398,100343546,m54329U_210323_190418/12585455/ccs,-5220,4949,10169,m6a,-40,-39,200
877800,chr10_100338605_+/m54329U_210323_190418/125854...,chr10_100338605_+,chr10,100338605,+,ATGCTGTAAATGAGAGAGGGAGAGAACCCCAACCAGCAGAACAAAT...,100333398,100343546,m54329U_210323_190418/12585455/ccs,-5220,4949,10169,m6a,-39,-38,200
877801,chr10_100338605_+/m54329U_210323_190418/125854...,chr10_100338605_+,chr10,100338605,+,ATGCTGTAAATGAGAGAGGGAGAGAACCCCAACCAGCAGAACAAAT...,100333398,100343546,m54329U_210323_190418/12585455/ccs,-5220,4949,10169,m6a,-37,-36,200
877802,chr10_100338605_+/m54329U_210323_190418/125854...,chr10_100338605_+,chr10,100338605,+,ATGCTGTAAATGAGAGAGGGAGAGAACCCCAACCAGCAGAACAAAT...,100333398,100343546,m54329U_210323_190418/12585455/ccs,-5220,4949,10169,m6a,-34,-33,200
877803,chr10_100338605_+/m54329U_210323_190418/125854...,chr10_100338605_+,chr10,100338605,+,ATGCTGTAAATGAGAGAGGGAGAGAACCCCAACCAGCAGAACAAAT...,100333398,100343546,m54329U_210323_190418/12585455/ccs,-5220,4949,10169,m6a,-29,-28,200


In [162]:
# get sequences
subseq = grouped.subset_sequence.unique()[0]
subseq_l = subseq[(100-40):100]
subseq_r = subseq[(100+35):(100+35+40)]
subseq_motif = subseq[100:(100+35)]

201 40 35 40


In [168]:
print((subseq_l.count("A") + subseq_l.count("T")))
print((subseq_r.count("A") + subseq_r.count("T")))
print((subseq_motif.count("A") + subseq_motif.count("T")))
print((subseq[60:175].count("A") + subseq[60:175].count("T")))
print((subseq_l.count("A") + subseq_l.count("T")) + 
      (subseq_r.count("A") + subseq_r.count("T")) + 
      (subseq_motif.count("A") + subseq_motif.count("T")))

18
18
13
49
49


In [154]:
# m6a counts
m6a_pos = grouped.centered_start.tolist()
# left flank
print((grouped["centered_start"] < 0).sum())
# right flank
print((grouped["centered_start"] >= 35).sum())
# motif
print(((grouped["centered_start"] >= 0) & (grouped["centered_start"] < 35)).sum())

14
13
3


In [156]:
#subseq[(center-flank_len):center]
len(subseq)

201

In [313]:
def rle(inarray):
    ''' run length encoding. Partial credit to R rle function. 
        Multi datatype arrays catered for including non Numpy
        returns: tuple (runlengths, startpositions, values) 
        input: array
        output: tuple of (run_lengths, start_positions, values)'''
    ia = np.asarray(inarray)                # force numpy
    n = len(ia)
    if n == 0: 
        return (None, None, None)
    else:
        y = ia[1:] != ia[:-1]               # pairwise unequal (string safe)
        i = np.append(np.where(y), n - 1)   # must include last element position
        z = np.diff(np.append(-1, i))       # run lengths
        p = np.cumsum(np.append(0, z))[:-1] # positions
        return(z, p, ia[i])

In [314]:
# establish input data
subseq_motif = "ACGTACGTACGTACGTACGT"
#subseq_m6a   = "10010000000100010001"
subseq_m6a   = "11111111111111111111"

# convert m6a boolean string to numpy array
m6a_bool = np.array([int(x) for x in subseq_m6a])

# make a numpy array for sequence
np_subseq_motif = np.frombuffer(bytes(subseq_motif, "utf-8"), dtype="S1")
np_subseq_motif

array([b'A', b'C', b'G', b'T', b'A', b'C', b'G', b'T', b'A', b'C', b'G',
       b'T', b'A', b'C', b'G', b'T', b'A', b'C', b'G', b'T'], dtype='|S1')

In [315]:
# make the AT mask
AT_bases = (np_subseq_motif == b"A" )| (np_subseq_motif == b"T")

# get run lengths using the AT base space
run_lengths, run_starts, run_values = rle(m6a_bool[AT_bases])

# subset the run lengths that encode zeros (non-m6a):
non_m6a_run_lengths = run_lengths[run_values == 0]

# set rle_max to 0 if there are only m6as
if non_m6a_run_lengths.size:
    print(non_m6a_run_lengths, max(non_m6a_run_lengths))
else:
    print("All m6a's.", 0)

# if array is empty ()
# print the results
print(non_m6a_run_lengths, max(non_m6a_run_lengths))

All m6a's. 0


ValueError: max() arg is an empty sequence

## Negative (L) 5%

In [541]:
%%time
# CTCF L NOT cleaned
motif_type = "L"
data_file = "{}/CTCF_m6a_fiberseq_{}_100bp_small_5_negative.txt".format(data_dir, motif_type)

# read in data
n_rows = None
df = pd.read_csv(data_file, sep="\t", nrows=n_rows)
print("ft center data - rows: {:,}, columns: {:,}".format(df.shape[0], df.shape[1]))

ft center data - rows: 39,485,334, columns: 13
CPU times: user 1min 10s, sys: 10.1 s, total: 1min 20s
Wall time: 1min 20s


In [542]:
# filter for only m6a & msp rows
df = df[df["centered_position_type"].isin(["m6a", "msp"])]
print("MSP & m6a instances - rows: {:,}, columns: {:,}".format(df.shape[0], df.shape[1]))

# remove rows with Ns in sequence & clean chroms
df = clean_sequences(df)
df = clean_chroms(df)
print("Cleaned rows: {:,}".format(df.shape[0]))

MSP & m6a instances - rows: 32,102,125, columns: 13
Removing sequences w/ N: 216,546
Removing weird chromosomes: 510,054
Cleaned rows: 31,375,525


<font color="red">__add centered_query_start & centered_query_end to grouping cols__</font>
* due to tandem repeats
* some fibers aligning multiple times to the same motif

In [543]:
# add columns of unique motif names & motif_query names
df.insert(loc=0, column="motif_name", 
          value=df["chrom"]+"_"+df["centering_position"].astype(str)+"_"+df["strand"].astype(str))
df.insert(loc=0, column="motif_query", value=df["motif_name"]+"/"+df["query_name"])
#df.insert(loc=0, column="motif_query_start_end", 
#          value=df["motif_name"].astype(str)+"/"+df["query_name"].astype(str)+"/"+
#                df["centered_query_start"].astype(str)+"/"+df["centered_query_end"].astype(str))

In [544]:
df.head()

Unnamed: 0,motif_query,motif_name,chrom,centering_position,strand,subset_sequence,reference_start,reference_end,query_name,centered_query_start,centered_query_end,query_length,centered_position_type,centered_start,centered_end
0,chr1_11256_-/m54329U_210814_130637/153159059/ccs,chr1_11256_-,chr1,11256,-,TGCCAGCAGGCGGCGTGCCACCACTATACAGTAAGCAAGAGGGCCC...,10000,40236,m54329U_210814_130637/153159059/ccs,-4179,28937,33116,m6a,-81,-80
1,chr1_11256_-/m54329U_210814_130637/153159059/ccs,chr1_11256_-,chr1,11256,-,TGCCAGCAGGCGGCGTGCCACCACTATACAGTAAGCAAGAGGGCCC...,10000,40236,m54329U_210814_130637/153159059/ccs,-4179,28937,33116,m6a,-78,-77
2,chr1_11256_-/m54329U_210814_130637/153159059/ccs,chr1_11256_-,chr1,11256,-,TGCCAGCAGGCGGCGTGCCACCACTATACAGTAAGCAAGAGGGCCC...,10000,40236,m54329U_210814_130637/153159059/ccs,-4179,28937,33116,m6a,-75,-74
3,chr1_11256_-/m54329U_210814_130637/153159059/ccs,chr1_11256_-,chr1,11256,-,TGCCAGCAGGCGGCGTGCCACCACTATACAGTAAGCAAGAGGGCCC...,10000,40236,m54329U_210814_130637/153159059/ccs,-4179,28937,33116,m6a,-74,-73
4,chr1_11256_-/m54329U_210814_130637/153159059/ccs,chr1_11256_-,chr1,11256,-,TGCCAGCAGGCGGCGTGCCACCACTATACAGTAAGCAAGAGGGCCC...,10000,40236,m54329U_210814_130637/153159059/ccs,-4179,28937,33116,m6a,-73,-72


In [545]:
# save pre-msp filtered df so I don't need to reload it
#df_og = df.copy()

In [558]:
df = df_og.copy()

In [562]:
df.head()

Unnamed: 0,motif_query,motif_name,chrom,centering_position,strand,subset_sequence,reference_start,reference_end,query_name,centered_query_start,centered_query_end,query_length,centered_position_type,centered_start,centered_end
0,chr1_11256_-/m54329U_210814_130637/153159059/ccs,chr1_11256_-,chr1,11256,-,TGCCAGCAGGCGGCGTGCCACCACTATACAGTAAGCAAGAGGGCCC...,10000,40236,m54329U_210814_130637/153159059/ccs,-4179,28937,33116,m6a,-81,-80
1,chr1_11256_-/m54329U_210814_130637/153159059/ccs,chr1_11256_-,chr1,11256,-,TGCCAGCAGGCGGCGTGCCACCACTATACAGTAAGCAAGAGGGCCC...,10000,40236,m54329U_210814_130637/153159059/ccs,-4179,28937,33116,m6a,-78,-77
2,chr1_11256_-/m54329U_210814_130637/153159059/ccs,chr1_11256_-,chr1,11256,-,TGCCAGCAGGCGGCGTGCCACCACTATACAGTAAGCAAGAGGGCCC...,10000,40236,m54329U_210814_130637/153159059/ccs,-4179,28937,33116,m6a,-75,-74
3,chr1_11256_-/m54329U_210814_130637/153159059/ccs,chr1_11256_-,chr1,11256,-,TGCCAGCAGGCGGCGTGCCACCACTATACAGTAAGCAAGAGGGCCC...,10000,40236,m54329U_210814_130637/153159059/ccs,-4179,28937,33116,m6a,-74,-73
4,chr1_11256_-/m54329U_210814_130637/153159059/ccs,chr1_11256_-,chr1,11256,-,TGCCAGCAGGCGGCGTGCCACCACTATACAGTAAGCAAGAGGGCCC...,10000,40236,m54329U_210814_130637/153159059/ccs,-4179,28937,33116,m6a,-73,-72


__test msp filtering__

In [600]:
# original version (w/ edits)
def filt_msps(df, motif_len=35):
    '''Filters for motif/query instances with a motif within 
    a MSP and adds MSP length to each motif/query group.'''
    
    # position of MSPs containing a motif
    msp_mask = (df["centered_position_type"] == "msp") & (df["centered_start"] <= 0) & (df["centered_end"] >= motif_len)
    print("MSP's with a motif: " + "{:,}".format(msp_mask.sum()))
    msp_groups = [(row["motif_name"], row["query_name"]) for idx, row in df[msp_mask].iterrows()]
    
    # filter for rows with motifs within an MSP (gets both MSP's & m6a's)
    df = df[df[["motif_name", "query_name"]].apply(tuple, 1).isin(msp_groups)]
    print("Total observations from motifs within an MSP: " + "{:,}".format(df.shape[0]))
    
    # add MSP size corresponding to each group
    # df with msp sizes
    df_msp = df.loc[msp_mask, ["motif_name", "query_name", "centered_end", "centered_start"]]
    df_msp["msp_size"] = df_msp["centered_end"] - df_msp["centered_start"]
    dup_alignments = df_msp.shape[0]
    df_msp = df_msp.drop_duplicates(subset=["motif_name", "query_name"], keep=False)
    print("Dropped motif-query instances w/ multiple MSPs (multiple alignments). \
          {:,}".format(dup_alignments - df_msp.shape[0]))

    # match msp back to it's motif & fiber
    df = df.merge(df_msp[["motif_name", "query_name", "msp_size"]], on=["motif_name", "query_name"])
    print("Merged df shape w/ msp_size: " + "{:,}".format(df.shape[0]))
    return df

In [602]:
%%time
df_filt = filt_msps(df)

MSP's with a motif: 326,847
Total observations from motifs within an MSP: 8,240,741
Dropping motif-query instances w/ multiple MSPs (multiple alignments).
Merged df shape w/ msp_size: 8,239,145
CPU times: user 2min 16s, sys: 5 s, total: 2min 21s
Wall time: 2min 22s


In [604]:
df = df_filt

In [290]:
# filter for regions within an MSP, MSP size per motif_query, remove MSP size
#df = filt_msps(df)
#df = df[df["centered_position_type"] == "m6a"]
#print("Total m6a observations: " + "{:,}".format(df.shape[0]))

MSP's with a motif: 326,847
Total observations from motifs within an MSP: 8,240,741
Merged df shape w/ msp_size: 8,242,521
Total m6a observations: 7,904,467


In [393]:
motif_len = 35
# position of MSPs containing a motif
msp_mask = (df["centered_position_type"] == "msp") & (df["centered_start"] <= 0) & (df["centered_end"] >= motif_len)
print("MSP's with a motif: " + "{:,}".format(msp_mask.sum()))
msp_groups = [(row["motif_name"], row["query_name"]) for idx, row in df[msp_mask].iterrows()]

MSP's with a motif: 326,847


In [394]:
df[df[["motif_name", "query_name"]].apply(tuple, 1).isin(msp_groups) & df["centered_position_type"] == "msp"]

KeyboardInterrupt: 

In [605]:
# filter for rows with motifs within an MSP (gets both MSP's & m6a's)
df = df[df[["motif_name", "query_name"]].apply(tuple, 1).isin(msp_groups)]
print("Total observations from motifs within an MSP: " + "{:,}".format(df.shape[0]))

KeyboardInterrupt: 

In [290]:
# filter for regions within an MSP, MSP size per motif_query, remove MSP size
df = filt_msps(df)
df = df[df["centered_position_type"] == "m6a"]
print("Total m6a observations: " + "{:,}".format(df.shape[0]))

MSP's with a motif: 326,847
Total observations from motifs within an MSP: 8,240,741
Merged df shape w/ msp_size: 8,242,521
Total m6a observations: 7,904,467


In [293]:
(32102125-31375525) / 32102125 * 100

2.2634015660957028

In [291]:
# save filtered m6a fiberseq data (within 100bp)
save_cleaned=True
if save_cleaned:
    output_file = data_file.replace(".txt", "-cleaned.txt")
    print("Saving to cleaned m6a fiberseq data to: {}".format(output_file))
    df.to_csv(output_file, header=True, index=None, sep="\t",)

Saving to cleaned m6a fiberseq data to: /mmfs1/gscratch/stergachislab/mwperez/ctcf-footprinting/candidate_footprints/CTCF_m6a_fiberseq_L_100bp_small_5_negative-cleaned.txt


In [607]:
# filter for m6a's within 40 bp range
m6a_range_mask = (df["centered_start"] >= -40) & (df["centered_end"] <= 75)
print("Total m6a's within 40 bp flank of motif: " + "{:,}".format(m6a_range_mask.sum()))
df = df[m6a_range_mask]

Total m6a's within 40 bp flank of motif: 6,730,171


In [609]:
df.head(1)

Unnamed: 0,motif_query,motif_name,chrom,centering_position,strand,subset_sequence,reference_start,reference_end,query_name,centered_query_start,centered_query_end,query_length,centered_position_type,centered_start,centered_end,msp_size
12,chr1_11256_-/m54329U_210814_130637/153159059/ccs,chr1_11256_-,chr1,11256,-,TGCCAGCAGGCGGCGTGCCACCACTATACAGTAAGCAAGAGGGCCC...,10000,40236,m54329U_210814_130637/153159059/ccs,-4179,28937,33116,m6a,-20,-19,346


In [611]:
# group by motif/query name AND centered_query_start & centered_query_end 
# (same read aligning twice, multiple repeats in a row double counting m6A instances but NOT sequences)
grouping_cols = ["motif_name", "query_name", "centered_query_start", "centered_query_end"]
df_grouped = df.groupby(grouping_cols)
# get group names (keys)
group_names = list(df_grouped.groups.keys())
print("Unique motif-fiber-start-end groups: " + "{:,}".format(len(group_names)))

Unique motif-fiber-start-end groups: 327,004


In [612]:
# group by motif/query name AND centered_query_start & centered_query_end 
# (same read aligning twice, multiple repeats in a row double counting m6A instances but NOT sequences)
grouping_cols = ["motif_name", "query_name"]
df_grouped = df.groupby(grouping_cols)
# get group names (keys)
group_names = list(df_grouped.groups.keys())
print("Unique motif-fiber groups: " + "{:,}".format(len(group_names)))

Unique motif-fiber groups: 326,788


In [348]:
43/327086*100

0.013146389634530369

In [None]:
def rle(inarray):
    ''' run length encoding. Partial credit to R rle function. 
        Multi datatype arrays catered for including non Numpy
        returns: tuple (runlengths, startpositions, values) 
        input: array
        output: tuple of (run_lengths, start_positions, values)'''
    ia = np.asarray(inarray)                # force numpy
    n = len(ia)
    if n == 0: 
        return (None, None, None)
    else:
        y = ia[1:] != ia[:-1]               # pairwise unequal (string safe)
        i = np.append(np.where(y), n - 1)   # must include last element position
        z = np.diff(np.append(-1, i))       # run lengths
        p = np.cumsum(np.append(0, z))[:-1] # positions
        return(z, p, ia[i])

In [615]:
def agg_features(x):
    '''Collect motif features per group.'''
    d = defaultdict()
        
    # msp size
    d["msp_size"] = x["msp_size"].unique()[0]
    
    # get sequences (left, right, motif)
    center = 100
    motif_len = 35
    flank = 40
    subseq = x["subset_sequence"].unique()[0]
    subseq_motif = subseq[center:(center+motif_len)]
    subseq_l = subseq[(center-flank):center]
    subseq_r = subseq[(center+motif_len):(center+motif_len+flank)]
    
    # AT count (left, right, motif)
    d["left_AT_count"] = (subseq_l.count("A") + subseq_l.count("T"))
    d["right_AT_count"] = (subseq_r.count("A") + subseq_r.count("T"))
    d["motif_AT_count"] = (subseq_motif.count("A") + subseq_motif.count("T"))
    
    # proportion of bases that are AT
    d["left_AT_prop"] = weird_division(d["left_AT_count"], flank)
    d["right_AT_prop"] = weird_division(d["right_AT_count"], flank)
    d["motif_AT_prop"] = weird_division(d["motif_AT_count"], motif_len)
    
    # m6a instances
    d["left_m6a_count"] = (x["centered_start"] < 0).sum()
    d["right_m6a_count"] = (x["centered_start"] >= 35).sum()
    d["motif_m6a_count"] = ((x["centered_start"] >= 0) & (x["centered_start"] < 35)).sum()
    
    # proportion of ATs that are methylated (m6a_count/AT_count) return 0 if no ATs
    d["left_m6a_prop"] = weird_division(d["left_m6a_count"], d["left_AT_count"])
    d["right_m6a_prop"] = weird_division(d["right_m6a_count"], d["right_AT_count"])
    d["motif_m6a_prop"] = weird_division(d["motif_m6a_count"], d["motif_AT_count"])
    
    # sanity check proportions
    for i in ["left", "right", "motif"]:
        if (d[f"{i}_AT_prop"] > 1) or (d[f"{i}_m6a_prop"] > 1):
            print("motif: ", x["motif_name"].unique()[0])
            print("query name: ", x["query_name"].unique()[0])
            
            print(f"{i} AT prop: ", d[f"{i}_AT_prop"])
            print(f"{i} m6a prop: ", d[f"{i}_m6a_prop"])
            print(f"{i} m6a count: ", d[f"{i}_m6a_count"])
            print(f"{i} AT count: ", d[f"{i}_AT_count"])
            logging.error("AT or m6a proportion > 1.")
            print("----------")
    
    # ----- kmers -----
    # m6a instances in motif
    m6a_bool = x[(x["centered_start"] >= 0) & (x["centered_end"] <= 35)]["centered_start"].values
    m6a_bool = [1 if (i in m6a_bool) else 0 for i in range(0, 35)]
    # convert m6a bool to numpy array
    m6a_bool = np.array(m6a_bool)
    # get counts & m6a prop per k-mer
    kmer_counts = kmer_features(subseq_motif, m6a_bool, kmer_size=3, use_canonical=True)
    
    # ----- rle -----
    # make np array for sequence
    np_subseq_motif = np.frombuffer(bytes(subseq_motif, "utf-8"), dtype="S1")
    # make AT mask
    AT_bases = (np_subseq_motif == b"A" ) | (np_subseq_motif == b"T")
    if not any(AT_bases):
        d["rle_max"] = 0
    else:
        # get run lengths using the AT base space
        run_lengths, run_starts, run_values = rle(m6a_bool[AT_bases])
        if run_lengths is None:
            print(AT_bases)
            print(m6a_bool)
            print(subseq_motif)
        # subset the run lengths that encode zeros (non-m6a):
        non_m6a_run_lengths = run_lengths[run_values == 0]
        d["rle_max"] = max(non_m6a_run_lengths) if non_m6a_run_lengths.size else 0
    
    d = d | kmer_counts
    
    return pd.Series(d, index=list(d.keys()))

In [616]:
%%time
# extract features by motif/query group
print("\nAggregating features!")
res = df.groupby(grouping_cols).apply(lambda x: agg_features(x)).reset_index()
print("Features: {}".format(res.columns.tolist()[2:]))
print("Total rows: " + "{:,}".format(res.shape[0]))
print("Total columns: " + "{:,}".format(res.shape[1]))


Aggregating features!


ERROR:root:AT or m6a proportion > 1.
ERROR:root:AT or m6a proportion > 1.


motif:  chr10_100406861_+
query name:  m64076_210328_012155/70257353/ccs
left AT prop:  0.325
left m6a prop:  1.0769230769230769
left m6a count:  14
left AT count:  13
----------
motif:  chr10_100731612_-
query name:  m54329U_210326_192251/177734943/ccs
motif AT prop:  0.2571428571428571
motif m6a prop:  1.1111111111111112
motif m6a count:  10
motif AT count:  9
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr10_102120181_-
query name:  m64076_210328_012155/9241295/ccs
left AT prop:  0.075
left m6a prop:  1.3333333333333333
left m6a count:  4
left AT count:  3
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr10_110872633_+
query name:  m54329U_210326_192251/168627002/ccs
left AT prop:  0.2
left m6a prop:  1.125
left m6a count:  9
left AT count:  8
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr10_13079803_+
query name:  m64076_221119_202646/81200193/ccs
motif AT prop:  0.3142857142857143
motif m6a prop:  1.0909090909090908
motif m6a count:  12
motif AT count:  11
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr10_133254411_+
query name:  m54329U_210326_192251/49810245/ccs
left AT prop:  0.275
left m6a prop:  1.0909090909090908
left m6a count:  12
left AT count:  11
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr10_133687887_-
query name:  m54329U_210813_020940/85984192/ccs
motif AT prop:  0.6
motif m6a prop:  1.1428571428571428
motif m6a count:  24
motif AT count:  21
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr10_43997336_+
query name:  m64076_210328_012155/114559788/ccs
left AT prop:  0.375
left m6a prop:  1.0666666666666667
left m6a count:  16
left AT count:  15
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr10_70733937_+
query name:  m54329U_210323_190418/115935131/ccs
motif AT prop:  0.2571428571428571
motif m6a prop:  1.1111111111111112
motif m6a count:  10
motif AT count:  9
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr10_79178957_-
query name:  m64076_210328_012155/161941056/ccs
motif AT prop:  0.2571428571428571
motif m6a prop:  1.1111111111111112
motif m6a count:  10
motif AT count:  9
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr10_85984480_+
query name:  m54329U_210323_190418/168166993/ccs
left AT prop:  0.3
left m6a prop:  1.0833333333333333
left m6a count:  13
left AT count:  12
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr10_87364339_-
query name:  m54329U_210813_020940/138740812/ccs
left AT prop:  0.3
left m6a prop:  1.0833333333333333
left m6a count:  13
left AT count:  12
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr10_92226945_-
query name:  m64076_210328_012155/107282447/ccs
motif AT prop:  0.2857142857142857
motif m6a prop:  1.1
motif m6a count:  11
motif AT count:  10
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr11_104334163_+
query name:  m54329U_210326_192251/43190840/ccs
motif AT prop:  0.3142857142857143
motif m6a prop:  1.0909090909090908
motif m6a count:  12
motif AT count:  11
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr11_117663558_+
query name:  m54329U_210323_190418/36635686/ccs
left AT prop:  0.4
left m6a prop:  1.0625
left m6a count:  17
left AT count:  16
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr11_1442205_+
query name:  m64076_210328_012155/157616431/ccs
motif AT prop:  0.3142857142857143
motif m6a prop:  1.0909090909090908
motif m6a count:  12
motif AT count:  11
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr11_1749020_-
query name:  m54329U_210323_190418/68552641/ccs
left AT prop:  0.2
left m6a prop:  1.125
left m6a count:  9
left AT count:  8
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr11_2363736_+
query name:  m64076_210328_012155/177473142/ccs
left AT prop:  0.525
left m6a prop:  1.0476190476190477
left m6a count:  22
left AT count:  21
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr11_33891960_-
query name:  m54329U_210326_192251/5835121/ccs
motif AT prop:  0.37142857142857144
motif m6a prop:  1.0769230769230769
motif m6a count:  14
motif AT count:  13
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr11_47408602_+
query name:  m64076_210328_012155/147916933/ccs
left AT prop:  0.075
left m6a prop:  1.3333333333333333
left m6a count:  4
left AT count:  3
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr11_65646090_+
query name:  m54329U_210323_190418/1836458/ccs
left AT prop:  0.275
left m6a prop:  1.0909090909090908
left m6a count:  12
left AT count:  11
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr11_68212936_-
query name:  m64076_210328_012155/103351989/ccs
left AT prop:  0.05
left m6a prop:  1.5
left m6a count:  3
left AT count:  2
----------


ERROR:root:AT or m6a proportion > 1.
ERROR:root:AT or m6a proportion > 1.
ERROR:root:AT or m6a proportion > 1.


motif:  chr11_94236878_-
query name:  m54329U_210326_192251/81985824/ccs
left AT prop:  0.525
left m6a prop:  1.2380952380952381
left m6a count:  26
left AT count:  21
----------
motif:  chr11_94236878_-
query name:  m54329U_210326_192251/81985824/ccs
motif AT prop:  0.45714285714285713
motif m6a prop:  1.0625
motif m6a count:  17
motif AT count:  16
----------
motif:  chr11_94236878_-
query name:  m54329U_210813_020940/138741413/ccs
motif AT prop:  0.45714285714285713
motif m6a prop:  1.25
motif m6a count:  20
motif AT count:  16
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr12_102065857_-
query name:  m54329U_210326_192251/79235165/ccs
motif AT prop:  0.34285714285714286
motif m6a prop:  1.0833333333333333
motif m6a count:  13
motif AT count:  12
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr12_106986938_-
query name:  m64076_210328_012155/75367454/ccs
left AT prop:  0.25
left m6a prop:  1.1
left m6a count:  11
left AT count:  10
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr12_132216453_-
query name:  m64076_210328_012155/58264169/ccs
left AT prop:  0.25
left m6a prop:  1.1
left m6a count:  11
left AT count:  10
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr12_132717712_+
query name:  m54329U_210326_192251/112330806/ccs
left AT prop:  0.325
left m6a prop:  1.0769230769230769
left m6a count:  14
left AT count:  13
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr12_48180872_-
query name:  m64076_210328_012155/124388517/ccs
left AT prop:  0.425
left m6a prop:  1.0588235294117647
left m6a count:  18
left AT count:  17
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr13_27620350_-
query name:  m64076_210328_012155/126026758/ccs
motif AT prop:  0.2
motif m6a prop:  1.1428571428571428
motif m6a count:  8
motif AT count:  7
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr13_50753488_+
query name:  m54329U_210814_130637/7734060/ccs
motif AT prop:  0.2857142857142857
motif m6a prop:  1.1
motif m6a count:  11
motif AT count:  10
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr13_95475501_+
query name:  m54329U_210323_190418/162728460/ccs
motif AT prop:  0.37142857142857144
motif m6a prop:  1.0769230769230769
motif m6a count:  14
motif AT count:  13
----------


ERROR:root:AT or m6a proportion > 1.
ERROR:root:AT or m6a proportion > 1.
ERROR:root:AT or m6a proportion > 1.


motif:  chr14_104742570_-
query name:  m64076_210328_012155/103548505/ccs
left AT prop:  0.35
left m6a prop:  1.0714285714285714
left m6a count:  15
left AT count:  14
----------
motif:  chr14_104795359_-
query name:  m64076_221119_202646/75891342/ccs
left AT prop:  0.2
left m6a prop:  1.125
left m6a count:  9
left AT count:  8
----------
motif:  chr14_105051091_+
query name:  m64076_210328_012155/4522599/ccs
motif AT prop:  0.34285714285714286
motif m6a prop:  1.0833333333333333
motif m6a count:  13
motif AT count:  12
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr14_105592850_+
query name:  m64076_221119_202646/61604864/ccs
motif AT prop:  0.22857142857142856
motif m6a prop:  1.125
motif m6a count:  9
motif AT count:  8
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr14_19301004_+
query name:  m54329U_210814_130637/154207579/ccs
left AT prop:  0.275
left m6a prop:  1.0909090909090908
left m6a count:  12
left AT count:  11
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr14_70724413_-
query name:  m64076_210328_012155/28967336/ccs
motif AT prop:  0.37142857142857144
motif m6a prop:  1.0769230769230769
motif m6a count:  14
motif AT count:  13
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr14_77867411_-
query name:  m54329U_210323_190418/95486685/ccs
motif AT prop:  0.34285714285714286
motif m6a prop:  1.0833333333333333
motif m6a count:  13
motif AT count:  12
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr14_90533732_-
query name:  m54329U_210323_190418/140642483/ccs
left AT prop:  0.325
left m6a prop:  1.0769230769230769
left m6a count:  14
left AT count:  13
----------


ERROR:root:AT or m6a proportion > 1.
ERROR:root:AT or m6a proportion > 1.
ERROR:root:AT or m6a proportion > 1.


motif:  chr15_101978246_+
query name:  m54329U_210323_190418/166659558/ccs
left AT prop:  0.375
left m6a prop:  1.3333333333333333
left m6a count:  20
left AT count:  15
----------
motif:  chr15_101979530_+
query name:  m54329U_210323_190418/166659558/ccs
right AT prop:  0.425
right m6a prop:  1.1176470588235294
right m6a count:  19
right AT count:  17
----------
motif:  chr15_101979592_+
query name:  m54329U_210323_190418/166659558/ccs
motif AT prop:  0.3142857142857143
motif m6a prop:  1.1818181818181819
motif m6a count:  13
motif AT count:  11
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr15_30222987_+
query name:  m54329U_210813_020940/6817985/ccs
motif AT prop:  0.11428571428571428
motif m6a prop:  1.25
motif m6a count:  5
motif AT count:  4
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr15_74629104_-
query name:  m64076_210328_012155/70126056/ccs
motif AT prop:  0.4
motif m6a prop:  1.0714285714285714
motif m6a count:  15
motif AT count:  14
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr15_79896971_-
query name:  m64076_221119_202646/70189830/ccs
left AT prop:  0.275
left m6a prop:  1.0909090909090908
left m6a count:  12
left AT count:  11
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr15_94231570_+
query name:  m64076_210328_012155/106694117/ccs
left AT prop:  0.225
left m6a prop:  1.1111111111111112
left m6a count:  10
left AT count:  9
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr16_1242473_+
query name:  m54329U_210810_004956/59509081/ccs
left AT prop:  0.325
left m6a prop:  1.0769230769230769
left m6a count:  14
left AT count:  13
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr16_19353037_-
query name:  m64076_210328_012155/43385891/ccs
motif AT prop:  0.3142857142857143
motif m6a prop:  1.0909090909090908
motif m6a count:  12
motif AT count:  11
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr16_2496230_+
query name:  m64076_210328_012155/91816850/ccs
left AT prop:  0.35
left m6a prop:  1.0714285714285714
left m6a count:  15
left AT count:  14
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr16_2643299_-
query name:  m54329U_210323_190418/43714431/ccs
left AT prop:  0.375
left m6a prop:  1.0666666666666667
left m6a count:  16
left AT count:  15
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr16_3056932_+
query name:  m64076_210328_012155/108200912/ccs
left AT prop:  0.275
left m6a prop:  1.0909090909090908
left m6a count:  12
left AT count:  11
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr16_86335621_+
query name:  m64076_210328_012155/197793/ccs
left AT prop:  0.4
left m6a prop:  1.0625
left m6a count:  17
left AT count:  16
----------


ERROR:root:AT or m6a proportion > 1.
ERROR:root:AT or m6a proportion > 1.


motif:  chr16_89736437_-
query name:  m54329U_210326_192251/131992380/ccs
left AT prop:  0.375
left m6a prop:  1.0666666666666667
left m6a count:  16
left AT count:  15
----------
motif:  chr16_90003315_-
query name:  m54329U_210813_020940/130155654/ccs
left AT prop:  0.3
left m6a prop:  1.0833333333333333
left m6a count:  13
left AT count:  12
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr17_1108858_-
query name:  m64076_221119_202646/91686271/ccs
left AT prop:  0.25
left m6a prop:  1.1
left m6a count:  11
left AT count:  10
----------


ERROR:root:AT or m6a proportion > 1.
ERROR:root:AT or m6a proportion > 1.


motif:  chr17_15917516_-
query name:  m64076_221119_202646/34867044/ccs
left AT prop:  0.25
left m6a prop:  1.1
left m6a count:  11
left AT count:  10
----------
motif:  chr17_16477597_+
query name:  m54329U_210326_192251/120915766/ccs
motif AT prop:  0.2857142857142857
motif m6a prop:  1.1
motif m6a count:  11
motif AT count:  10
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr17_2716761_-
query name:  m64076_210328_012155/95749753/ccs
motif AT prop:  0.42857142857142855
motif m6a prop:  1.0666666666666667
motif m6a count:  16
motif AT count:  15
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr17_30291791_+
query name:  m54329U_210326_192251/75892354/ccs
left AT prop:  0.2
left m6a prop:  1.125
left m6a count:  9
left AT count:  8
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr17_41539277_-
query name:  m64076_210328_012155/86968523/ccs
left AT prop:  0.425
left m6a prop:  1.0588235294117647
left m6a count:  18
left AT count:  17
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr17_44768920_+
query name:  m54329U_210326_192251/164367151/ccs
left AT prop:  0.275
left m6a prop:  1.0909090909090908
left m6a count:  12
left AT count:  11
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr17_58195108_-
query name:  m64076_210328_012155/50397290/ccs
left AT prop:  0.35
left m6a prop:  1.0714285714285714
left m6a count:  15
left AT count:  14
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr17_63622673_-
query name:  m64076_221119_202646/136578640/ccs
motif AT prop:  0.11428571428571428
motif m6a prop:  1.25
motif m6a count:  5
motif AT count:  4
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr17_64326479_+
query name:  m54329U_210810_004956/74843950/ccs
left AT prop:  0.3
left m6a prop:  1.0833333333333333
left m6a count:  13
left AT count:  12
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr17_75823498_-
query name:  m54329U_210326_192251/141101309/ccs
left AT prop:  0.35
left m6a prop:  1.0714285714285714
left m6a count:  15
left AT count:  14
----------


ERROR:root:AT or m6a proportion > 1.
ERROR:root:AT or m6a proportion > 1.


motif:  chr18_13485556_-
query name:  m64076_210328_012155/27264655/ccs
motif AT prop:  0.37142857142857144
motif m6a prop:  1.0769230769230769
motif m6a count:  14
motif AT count:  13
----------
motif:  chr18_13548818_-
query name:  m64076_221119_202646/46071889/ccs
motif AT prop:  0.2857142857142857
motif m6a prop:  1.1
motif m6a count:  11
motif AT count:  10
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr18_35151830_-
query name:  m64076_210328_012155/25036942/ccs
motif AT prop:  0.2571428571428571
motif m6a prop:  1.1111111111111112
motif m6a count:  10
motif AT count:  9
----------


ERROR:root:AT or m6a proportion > 1.
ERROR:root:AT or m6a proportion > 1.


motif:  chr19_13795683_-
query name:  m64076_221119_202646/108069597/ccs
left AT prop:  0.125
left m6a prop:  1.2
left m6a count:  6
left AT count:  5
----------
motif:  chr19_14090074_+
query name:  m54329U_210813_020940/129960682/ccs
left AT prop:  0.175
left m6a prop:  1.1428571428571428
left m6a count:  8
left AT count:  7
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr19_18153403_+
query name:  m54329U_210326_192251/68749313/ccs
left AT prop:  0.25
left m6a prop:  1.1
left m6a count:  11
left AT count:  10
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr19_39826972_+
query name:  m64076_210328_012155/31720587/ccs
left AT prop:  0.25
left m6a prop:  1.1
left m6a count:  11
left AT count:  10
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr19_47274881_+
query name:  m54329U_210323_190418/59115197/ccs
motif AT prop:  0.22857142857142856
motif m6a prop:  1.125
motif m6a count:  9
motif AT count:  8
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr19_50500549_-
query name:  m64076_210328_012155/6489245/ccs
left AT prop:  0.225
left m6a prop:  1.1111111111111112
left m6a count:  10
left AT count:  9
----------


ERROR:root:AT or m6a proportion > 1.
ERROR:root:AT or m6a proportion > 1.


motif:  chr19_7555716_+
query name:  m54329U_210814_130637/74582483/ccs
motif AT prop:  0.2
motif m6a prop:  1.1428571428571428
motif m6a count:  8
motif AT count:  7
----------
motif:  chr19_7845401_-
query name:  m54329U_210326_192251/80544406/ccs
left AT prop:  0.3
left m6a prop:  1.0833333333333333
left m6a count:  13
left AT count:  12
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr1_11106889_-
query name:  m54329U_210323_190418/93194848/ccs
left AT prop:  0.45
left m6a prop:  1.0555555555555556
left m6a count:  19
left AT count:  18
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr1_154848141_+
query name:  m54329U_210813_020940/112462021/ccs
left AT prop:  0.275
left m6a prop:  1.0909090909090908
left m6a count:  12
left AT count:  11
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr1_17045313_+
query name:  m54329U_210323_190418/147196057/ccs
motif AT prop:  0.37142857142857144
motif m6a prop:  1.0769230769230769
motif m6a count:  14
motif AT count:  13
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr1_204151323_-
query name:  m64076_221119_202646/165217646/ccs
left AT prop:  0.3
left m6a prop:  1.0833333333333333
left m6a count:  13
left AT count:  12
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr1_204812038_-
query name:  m54329U_210326_192251/69666040/ccs
motif AT prop:  0.34285714285714286
motif m6a prop:  1.0833333333333333
motif m6a count:  13
motif AT count:  12
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr1_207111441_+
query name:  m54329U_210326_192251/139921507/ccs
left AT prop:  0.45
left m6a prop:  1.0555555555555556
left m6a count:  19
left AT count:  18
----------


ERROR:root:AT or m6a proportion > 1.
ERROR:root:AT or m6a proportion > 1.


motif:  chr1_209190969_-
query name:  m64076_210328_012155/122814956/ccs
motif AT prop:  0.3142857142857143
motif m6a prop:  1.0909090909090908
motif m6a count:  12
motif AT count:  11
----------
motif:  chr1_209561661_-
query name:  m54329U_210323_190418/166855364/ccs
motif AT prop:  0.3142857142857143
motif m6a prop:  1.0909090909090908
motif m6a count:  12
motif AT count:  11
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr1_220466263_+
query name:  m64076_221119_202646/67961554/ccs
left AT prop:  0.25
left m6a prop:  1.1
left m6a count:  11
left AT count:  10
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr1_236281769_-
query name:  m54329U_210326_192251/124192811/ccs
left AT prop:  0.325
left m6a prop:  1.0769230769230769
left m6a count:  14
left AT count:  13
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr1_241792439_+
query name:  m64076_210328_012155/86051316/ccs
left AT prop:  0.4
left m6a prop:  1.0625
left m6a count:  17
left AT count:  16
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr1_2685379_+
query name:  m54329U_210326_192251/104925581/ccs
left AT prop:  0.425
left m6a prop:  1.0588235294117647
left m6a count:  18
left AT count:  17
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr1_29837564_+
query name:  m64076_210328_012155/24447602/ccs
left AT prop:  0.35
left m6a prop:  1.0714285714285714
left m6a count:  15
left AT count:  14
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr1_31772271_-
query name:  m54329U_210323_190418/83755753/ccs
left AT prop:  0.25
left m6a prop:  1.1
left m6a count:  11
left AT count:  10
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr1_3313326_+
query name:  m64076_221119_202646/46074296/ccs
motif AT prop:  0.2857142857142857
motif m6a prop:  1.1
motif m6a count:  11
motif AT count:  10
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr1_37662657_-
query name:  m54329U_210323_190418/133607/ccs
motif AT prop:  0.37142857142857144
motif m6a prop:  1.0769230769230769
motif m6a count:  14
motif AT count:  13
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr1_38046893_-
query name:  m54329U_210323_190418/53937733/ccs
left AT prop:  0.05
left m6a prop:  1.5
left m6a count:  3
left AT count:  2
----------


ERROR:root:AT or m6a proportion > 1.
ERROR:root:AT or m6a proportion > 1.


motif:  chr1_42887833_-
query name:  m54329U_210323_190418/90376410/ccs
left AT prop:  0.4
left m6a prop:  1.0625
left m6a count:  17
left AT count:  16
----------
motif:  chr1_43135447_-
query name:  m54329U_210326_192251/8522321/ccs
left AT prop:  0.45
left m6a prop:  1.0555555555555556
left m6a count:  19
left AT count:  18
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr1_5444620_-
query name:  m64076_221119_202646/55443523/ccs
left AT prop:  0.275
left m6a prop:  1.0909090909090908
left m6a count:  12
left AT count:  11
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr1_8342559_-
query name:  m64076_210328_012155/38797356/ccs
left AT prop:  0.425
left m6a prop:  1.0588235294117647
left m6a count:  18
left AT count:  17
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr20_33054974_+
query name:  m54329U_210323_190418/28770645/ccs
motif AT prop:  0.4
motif m6a prop:  1.0714285714285714
motif m6a count:  15
motif AT count:  14
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr20_37384487_+
query name:  m54329U_210326_192251/3148617/ccs
motif AT prop:  0.2857142857142857
motif m6a prop:  1.1
motif m6a count:  11
motif AT count:  10
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr20_61999461_+
query name:  m54329U_210323_190418/71173383/ccs
motif AT prop:  0.3142857142857143
motif m6a prop:  1.0909090909090908
motif m6a count:  12
motif AT count:  11
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr21_45973355_+
query name:  m54329U_210814_130637/76940322/ccs
left AT prop:  0.3
left m6a prop:  1.0833333333333333
left m6a count:  13
left AT count:  12
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr21_8434710_-
query name:  m54329U_210323_190418/14091657/ccs
motif AT prop:  0.22857142857142856
motif m6a prop:  1.125
motif m6a count:  9
motif AT count:  8
----------


ERROR:root:AT or m6a proportion > 1.
ERROR:root:AT or m6a proportion > 1.
ERROR:root:AT or m6a proportion > 1.
ERROR:root:AT or m6a proportion > 1.


motif:  chr22_18892487_+
query name:  m54329U_210323_190418/105842915/ccs
motif AT prop:  0.42857142857142855
motif m6a prop:  1.1333333333333333
motif m6a count:  17
motif AT count:  15
----------
motif:  chr22_18893952_+
query name:  m54329U_210323_190418/105842915/ccs
left AT prop:  0.375
left m6a prop:  1.5333333333333334
left m6a count:  23
left AT count:  15
----------
motif:  chr22_18893952_+
query name:  m54329U_210323_190418/159580665/ccs
left AT prop:  0.375
left m6a prop:  1.3333333333333333
left m6a count:  20
left AT count:  15
----------
motif:  chr22_18893952_+
query name:  m54329U_210323_190418/159580665/ccs
motif AT prop:  0.42857142857142855
motif m6a prop:  1.3333333333333333
motif m6a count:  20
motif AT count:  15
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr22_20299234_+
query name:  m64076_210328_012155/166463083/ccs
left AT prop:  0.15
left m6a prop:  1.1666666666666667
left m6a count:  7
left AT count:  6
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr22_28772849_+
query name:  m54329U_210326_192251/124979076/ccs
left AT prop:  0.2
left m6a prop:  1.125
left m6a count:  9
left AT count:  8
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr22_38164547_-
query name:  m64076_221119_202646/11141578/ccs
left AT prop:  0.35
left m6a prop:  1.0714285714285714
left m6a count:  15
left AT count:  14
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr22_41022280_+
query name:  m54329U_210326_192251/51841675/ccs
left AT prop:  0.225
left m6a prop:  1.1111111111111112
left m6a count:  10
left AT count:  9
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr22_42813615_-
query name:  m54329U_210326_192251/39324373/ccs
left AT prop:  0.3
left m6a prop:  1.0833333333333333
left m6a count:  13
left AT count:  12
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr22_49966318_-
query name:  m54329U_210813_020940/82511169/ccs
motif AT prop:  0.3142857142857143
motif m6a prop:  1.0909090909090908
motif m6a count:  12
motif AT count:  11
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr2_105557423_+
query name:  m64076_210328_012155/105777293/ccs
motif AT prop:  0.45714285714285713
motif m6a prop:  1.0625
motif m6a count:  17
motif AT count:  16
----------


ERROR:root:AT or m6a proportion > 1.
ERROR:root:AT or m6a proportion > 1.
ERROR:root:AT or m6a proportion > 1.


motif:  chr2_1222642_+
query name:  m54329U_210814_130637/63963181/ccs
right AT prop:  0.425
right m6a prop:  1.3529411764705883
right m6a count:  23
right AT count:  17
----------
motif:  chr2_1222712_+
query name:  m54329U_210814_130637/63963181/ccs
left AT prop:  0.45
left m6a prop:  1.1666666666666667
left m6a count:  21
left AT count:  18
----------
motif:  chr2_1222712_+
query name:  m54329U_210814_130637/63963181/ccs
motif AT prop:  0.3142857142857143
motif m6a prop:  1.7272727272727273
motif m6a count:  19
motif AT count:  11
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr2_126889245_+
query name:  m64076_221119_202646/1050733/ccs
motif AT prop:  0.34285714285714286
motif m6a prop:  1.0833333333333333
motif m6a count:  13
motif AT count:  12
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr2_219235179_-
query name:  m54329U_210810_004956/180357046/ccs
left AT prop:  0.375
left m6a prop:  1.0666666666666667
left m6a count:  16
left AT count:  15
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr2_229922501_-
query name:  m54329U_210813_020940/40634295/ccs
left AT prop:  0.35
left m6a prop:  1.0714285714285714
left m6a count:  15
left AT count:  14
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr2_233190636_-
query name:  m64076_221119_202646/29950070/ccs
motif AT prop:  0.3142857142857143
motif m6a prop:  1.0909090909090908
motif m6a count:  12
motif AT count:  11
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr2_237901750_+
query name:  m64076_221119_202646/114035177/ccs
left AT prop:  0.275
left m6a prop:  1.0909090909090908
left m6a count:  12
left AT count:  11
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr2_238895597_-
query name:  m54329U_210810_004956/146344383/ccs
motif AT prop:  0.3142857142857143
motif m6a prop:  1.0909090909090908
motif m6a count:  12
motif AT count:  11
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr2_240751088_-
query name:  m54329U_210810_004956/150602889/ccs
left AT prop:  0.25
left m6a prop:  1.1
left m6a count:  11
left AT count:  10
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr2_41895771_-
query name:  m54329U_210810_004956/103810584/ccs
motif AT prop:  0.34285714285714286
motif m6a prop:  1.0833333333333333
motif m6a count:  13
motif AT count:  12
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr2_642613_+
query name:  m54329U_210323_190418/11666567/ccs
left AT prop:  0.375
left m6a prop:  1.0666666666666667
left m6a count:  16
left AT count:  15
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr2_79512958_-
query name:  m54329U_210323_190418/31064509/ccs
left AT prop:  0.15
left m6a prop:  1.1666666666666667
left m6a count:  7
left AT count:  6
----------


ERROR:root:AT or m6a proportion > 1.
ERROR:root:AT or m6a proportion > 1.
ERROR:root:AT or m6a proportion > 1.


motif:  chr2_90293982_+
query name:  m64076_210328_012155/92473259/ccs
right AT prop:  0.375
right m6a prop:  1.0666666666666667
right m6a count:  16
right AT count:  15
----------
motif:  chr2_90295551_+
query name:  m54329U_210814_130637/99485147/ccs
right AT prop:  0.375
right m6a prop:  2.466666666666667
right m6a count:  37
right AT count:  15
----------
motif:  chr2_90295551_+
query name:  m64076_210328_012155/92473259/ccs
motif AT prop:  0.4
motif m6a prop:  1.0714285714285714
motif m6a count:  15
motif AT count:  14
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr2_9474135_-
query name:  m64076_221119_202646/7799438/ccs
left AT prop:  0.225
left m6a prop:  1.1111111111111112
left m6a count:  10
left AT count:  9
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr3_127548160_+
query name:  m54329U_210326_192251/127076776/ccs
left AT prop:  0.425
left m6a prop:  1.0588235294117647
left m6a count:  18
left AT count:  17
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr3_134359425_-
query name:  m54329U_210326_192251/163643809/ccs
motif AT prop:  0.2571428571428571
motif m6a prop:  1.1111111111111112
motif m6a count:  10
motif AT count:  9
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr3_196503476_+
query name:  m64076_210328_012155/96930451/ccs
left AT prop:  0.275
left m6a prop:  1.0909090909090908
left m6a count:  12
left AT count:  11
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr3_46712307_+
query name:  m54329U_210326_192251/136251056/ccs
left AT prop:  0.3
left m6a prop:  1.0833333333333333
left m6a count:  13
left AT count:  12
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr3_61563425_-
query name:  m54329U_210814_130637/143984161/ccs
left AT prop:  0.275
left m6a prop:  1.0909090909090908
left m6a count:  12
left AT count:  11
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr3_75575788_+
query name:  m54329U_210810_004956/172296177/ccs
motif AT prop:  0.4
motif m6a prop:  1.0714285714285714
motif m6a count:  15
motif AT count:  14
----------


ERROR:root:AT or m6a proportion > 1.
ERROR:root:AT or m6a proportion > 1.


motif:  chr4_100190401_-
query name:  m54329U_210323_190418/69665648/ccs
left AT prop:  0.2
left m6a prop:  1.125
left m6a count:  9
left AT count:  8
----------
motif:  chr4_10078698_+
query name:  m64076_221119_202646/329357/ccs
left AT prop:  0.25
left m6a prop:  1.1
left m6a count:  11
left AT count:  10
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr4_123942432_+
query name:  m54329U_210326_192251/55642692/ccs
motif AT prop:  0.3142857142857143
motif m6a prop:  1.0909090909090908
motif m6a count:  12
motif AT count:  11
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr4_136918911_-
query name:  m64076_210328_012155/26673639/ccs
left AT prop:  0.375
left m6a prop:  1.0666666666666667
left m6a count:  16
left AT count:  15
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr4_163547101_+
query name:  m54329U_210326_192251/87753650/ccs
left AT prop:  0.4
left m6a prop:  1.0625
left m6a count:  17
left AT count:  16
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr4_177231285_+
query name:  m54329U_210326_192251/23593316/ccs
left AT prop:  0.325
left m6a prop:  1.0769230769230769
left m6a count:  14
left AT count:  13
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr4_189233043_-
query name:  m64076_210328_012155/107022985/ccs
right AT prop:  0.5
right m6a prop:  1.2
right m6a count:  24
right AT count:  20
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr4_4095178_+
query name:  m64076_221119_202646/136839673/ccs
motif AT prop:  0.2857142857142857
motif m6a prop:  1.1
motif m6a count:  11
motif AT count:  10
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr4_8381782_+
query name:  m54329U_210326_192251/172296676/ccs
motif AT prop:  0.22857142857142856
motif m6a prop:  1.125
motif m6a count:  9
motif AT count:  8
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr5_14713736_+
query name:  m64076_210328_012155/109447063/ccs
left AT prop:  0.3
left m6a prop:  1.0833333333333333
left m6a count:  13
left AT count:  12
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr5_151104352_+
query name:  m54329U_210323_190418/164039250/ccs
left AT prop:  0.35
left m6a prop:  1.0714285714285714
left m6a count:  15
left AT count:  14
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr5_176661212_-
query name:  m64076_221119_202646/106496455/ccs
left AT prop:  0.275
left m6a prop:  1.0909090909090908
left m6a count:  12
left AT count:  11
----------


ERROR:root:AT or m6a proportion > 1.
ERROR:root:AT or m6a proportion > 1.


motif:  chr5_178205176_+
query name:  m54329U_210814_130637/24511549/ccs
left AT prop:  0.15
left m6a prop:  1.1666666666666667
left m6a count:  7
left AT count:  6
----------
motif:  chr5_178323631_+
query name:  m64076_221119_202646/127666088/ccs
left AT prop:  0.325
left m6a prop:  1.0769230769230769
left m6a count:  14
left AT count:  13
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr5_179368937_+
query name:  m64076_210328_012155/77201539/ccs
motif AT prop:  0.3142857142857143
motif m6a prop:  1.0909090909090908
motif m6a count:  12
motif AT count:  11
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr5_302190_-
query name:  m54329U_210323_190418/6621178/ccs
left AT prop:  0.45
left m6a prop:  1.0555555555555556
left m6a count:  19
left AT count:  18
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr5_31639334_-
query name:  m54329U_210323_190418/180290970/ccs
left AT prop:  0.15
left m6a prop:  1.1666666666666667
left m6a count:  7
left AT count:  6
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr5_5535441_+
query name:  m54329U_210813_020940/90505888/ccs
left AT prop:  0.35
left m6a prop:  1.0714285714285714
left m6a count:  15
left AT count:  14
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr5_56816991_-
query name:  m64076_221119_202646/66388618/ccs
left AT prop:  0.225
left m6a prop:  1.1111111111111112
left m6a count:  10
left AT count:  9
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr5_77030758_-
query name:  m54329U_210814_130637/4391215/ccs
left AT prop:  0.2
left m6a prop:  1.125
left m6a count:  9
left AT count:  8
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr6_107141207_+
query name:  m54329U_210326_192251/45286092/ccs
left AT prop:  0.35
left m6a prop:  1.0714285714285714
left m6a count:  15
left AT count:  14
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr6_21954482_-
query name:  m64076_221119_202646/13109991/ccs
left AT prop:  0.3
left m6a prop:  1.0833333333333333
left m6a count:  13
left AT count:  12
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr6_74055529_+
query name:  m64076_210328_012155/147130615/ccs
left AT prop:  0.35
left m6a prop:  1.0714285714285714
left m6a count:  15
left AT count:  14
----------


ERROR:root:AT or m6a proportion > 1.
ERROR:root:AT or m6a proportion > 1.


motif:  chr7_101282942_+
query name:  m54329U_210326_192251/127206689/ccs
left AT prop:  0.35
left m6a prop:  1.0714285714285714
left m6a count:  15
left AT count:  14
----------
motif:  chr7_101309155_-
query name:  m54329U_210810_004956/144179752/ccs
left AT prop:  0.4
left m6a prop:  1.0625
left m6a count:  17
left AT count:  16
----------


ERROR:root:AT or m6a proportion > 1.
ERROR:root:AT or m6a proportion > 1.


motif:  chr7_1196501_+
query name:  m54329U_210323_190418/162136615/ccs
left AT prop:  0.35
left m6a prop:  1.0714285714285714
left m6a count:  15
left AT count:  14
----------
motif:  chr7_1196682_+
query name:  m54329U_210323_190418/133106259/ccs
right AT prop:  0.325
right m6a prop:  1.4615384615384615
right m6a count:  19
right AT count:  13
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr7_140673611_+
query name:  m54329U_210813_020940/174787105/ccs
left AT prop:  0.25
left m6a prop:  1.1
left m6a count:  11
left AT count:  10
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr7_149887_+
query name:  m54329U_210323_190418/115477131/ccs
motif AT prop:  0.2571428571428571
motif m6a prop:  1.1111111111111112
motif m6a count:  10
motif AT count:  9
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr7_42617981_-
query name:  m54329U_210814_130637/131924616/ccs
left AT prop:  0.3
left m6a prop:  1.0833333333333333
left m6a count:  13
left AT count:  12
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr7_47794356_+
query name:  m64076_210328_012155/10683557/ccs
left AT prop:  0.375
left m6a prop:  1.0666666666666667
left m6a count:  16
left AT count:  15
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr7_49774191_-
query name:  m54329U_210326_192251/86245471/ccs
left AT prop:  0.275
left m6a prop:  1.0909090909090908
left m6a count:  12
left AT count:  11
----------


ERROR:root:AT or m6a proportion > 1.
ERROR:root:AT or m6a proportion > 1.
ERROR:root:AT or m6a proportion > 1.
ERROR:root:AT or m6a proportion > 1.
ERROR:root:AT or m6a proportion > 1.
ERROR:root:AT or m6a proportion > 1.
ERROR:root:AT or m6a proportion > 1.
ERROR:root:AT or m6a proportion > 1.
ERROR:root:AT or m6a proportion > 1.
ERROR:root:AT or m6a proportion > 1.
ERROR:root:AT or m6a proportion > 1.
ERROR:root:AT or m6a proportion > 1.
ERROR:root:AT or m6a proportion > 1.
ERROR:root:AT or m6a proportion > 1.
ERROR:root:AT or m6a proportion > 1.
ERROR:root:AT or m6a proportion > 1.
ERROR:root:AT or m6a proportion > 1.
ERROR:root:AT or m6a proportion > 1.
ERROR:root:AT or m6a proportion > 1.
ERROR:root:AT or m6a proportion > 1.
ERROR:root:AT or m6a proportion > 1.
ERROR:root:AT or m6a proportion > 1.
ERROR:root:AT or m6a proportion > 1.
ERROR:root:AT or m6a proportion > 1.
ERROR:root:AT or m6a proportion > 1.
ERROR:root:AT or m6a proportion > 1.
ERROR:root:AT or m6a proportion > 1.


motif:  chr7_56370000_+
query name:  m54329U_210814_130637/140510731/ccs
motif AT prop:  0.2571428571428571
motif m6a prop:  1.3333333333333333
motif m6a count:  12
motif AT count:  9
----------
motif:  chr7_56370000_+
query name:  m64076_210328_012155/97453524/ccs
left AT prop:  0.275
left m6a prop:  1.0909090909090908
left m6a count:  12
left AT count:  11
----------
motif:  chr7_56370000_+
query name:  m64076_210328_012155/97453524/ccs
motif AT prop:  0.2571428571428571
motif m6a prop:  1.5555555555555556
motif m6a count:  14
motif AT count:  9
----------
motif:  chr7_56370000_+
query name:  m64076_221119_202646/176423034/ccs
left AT prop:  0.25
left m6a prop:  1.3
left m6a count:  13
left AT count:  10
----------
motif:  chr7_56370000_+
query name:  m64076_221119_202646/176423034/ccs
right AT prop:  0.3
right m6a prop:  1.3333333333333333
right m6a count:  16
right AT count:  12
----------
motif:  chr7_56370000_+
query name:  m64076_221119_202646/176423034/ccs
motif AT prop:  0.314

ERROR:root:AT or m6a proportion > 1.


motif:  chr7_7567261_+
query name:  m54329U_210810_004956/28837171/ccs
left AT prop:  0.15
left m6a prop:  1.1666666666666667
left m6a count:  7
left AT count:  6
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr8_10731513_-
query name:  m54329U_210323_190418/11010997/ccs
left AT prop:  0.275
left m6a prop:  1.0909090909090908
left m6a count:  12
left AT count:  11
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr8_21789072_+
query name:  m54329U_210326_192251/41486654/ccs
left AT prop:  0.15
left m6a prop:  1.1666666666666667
left m6a count:  7
left AT count:  6
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr8_27559083_-
query name:  m54329U_210323_190418/13699591/ccs
motif AT prop:  0.4
motif m6a prop:  1.0714285714285714
motif m6a count:  15
motif AT count:  14
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr8_80873974_+
query name:  m54329U_210326_192251/78579260/ccs
motif AT prop:  0.11428571428571428
motif m6a prop:  1.25
motif m6a count:  5
motif AT count:  4
----------


ERROR:root:AT or m6a proportion > 1.
ERROR:root:AT or m6a proportion > 1.


motif:  chr8_85804443_-
query name:  m64076_210328_012155/10421026/ccs
left AT prop:  0.275
left m6a prop:  1.0909090909090908
left m6a count:  12
left AT count:  11
----------
motif:  chr8_85806118_-
query name:  m64076_210328_012155/61408625/ccs
motif AT prop:  0.42857142857142855
motif m6a prop:  1.6
motif m6a count:  24
motif AT count:  15
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr9_119932437_-
query name:  m54329U_210323_190418/84806089/ccs
motif AT prop:  0.4
motif m6a prop:  1.0714285714285714
motif m6a count:  15
motif AT count:  14
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr9_120918914_+
query name:  m54329U_210323_190418/159842446/ccs
motif AT prop:  0.42857142857142855
motif m6a prop:  1.0666666666666667
motif m6a count:  16
motif AT count:  15
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr9_127350676_+
query name:  m64076_210328_012155/151388618/ccs
left AT prop:  0.35
left m6a prop:  1.0714285714285714
left m6a count:  15
left AT count:  14
----------


ERROR:root:AT or m6a proportion > 1.
ERROR:root:AT or m6a proportion > 1.


motif:  chr9_128846101_+
query name:  m64076_210328_012155/10291412/ccs
motif AT prop:  0.37142857142857144
motif m6a prop:  1.0769230769230769
motif m6a count:  14
motif AT count:  13
----------
motif:  chr9_128892276_-
query name:  m54329U_210326_192251/48496769/ccs
left AT prop:  0.325
left m6a prop:  1.0769230769230769
left m6a count:  14
left AT count:  13
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr9_136327387_-
query name:  m54329U_210323_190418/64751296/ccs
left AT prop:  0.1
left m6a prop:  1.25
left m6a count:  5
left AT count:  4
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr9_137200471_-
query name:  m54329U_210813_020940/106889294/ccs
left AT prop:  0.075
left m6a prop:  1.3333333333333333
left m6a count:  4
left AT count:  3
----------


ERROR:root:AT or m6a proportion > 1.
ERROR:root:AT or m6a proportion > 1.
ERROR:root:AT or m6a proportion > 1.
ERROR:root:AT or m6a proportion > 1.


motif:  chr9_137716090_+
query name:  m64076_221119_202646/34341802/ccs
left AT prop:  0.4
left m6a prop:  1.375
left m6a count:  22
left AT count:  16
----------
motif:  chr9_137716090_+
query name:  m64076_221119_202646/34341802/ccs
motif AT prop:  0.45714285714285713
motif m6a prop:  1.0625
motif m6a count:  17
motif AT count:  16
----------
motif:  chr9_137716148_+
query name:  m64076_221119_202646/34341802/ccs
left AT prop:  0.4
left m6a prop:  1.0625
left m6a count:  17
left AT count:  16
----------
motif:  chr9_137716377_+
query name:  m64076_221119_202646/34341802/ccs
left AT prop:  0.425
left m6a prop:  1.8235294117647058
left m6a count:  31
left AT count:  17
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr9_63822769_-
query name:  m64076_221119_202646/124388632/ccs
left AT prop:  0.35
left m6a prop:  1.0714285714285714
left m6a count:  15
left AT count:  14
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chr9_70044853_-
query name:  m64076_210328_012155/49611215/ccs
left AT prop:  0.175
left m6a prop:  1.1428571428571428
left m6a count:  8
left AT count:  7
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chrX_117769273_-
query name:  m54329U_210323_190418/177212055/ccs
left AT prop:  0.225
left m6a prop:  1.1111111111111112
left m6a count:  10
left AT count:  9
----------


ERROR:root:AT or m6a proportion > 1.


motif:  chrX_139691715_-
query name:  m64076_210328_012155/90701923/ccs
left AT prop:  0.225
left m6a prop:  1.1111111111111112
left m6a count:  10
left AT count:  9
----------
Features: ['msp_size', 'left_AT_count', 'right_AT_count', 'motif_AT_count', 'left_AT_prop', 'right_AT_prop', 'motif_AT_prop', 'left_m6a_count', 'right_m6a_count', 'motif_m6a_count', 'left_m6a_prop', 'right_m6a_prop', 'motif_m6a_prop', 'rle_max', 'AAA_count', 'AAA_m6a_prop', 'AAC_count', 'AAC_m6a_prop', 'AAG_count', 'AAG_m6a_prop', 'AAT_count', 'AAT_m6a_prop', 'ACA_count', 'ACA_m6a_prop', 'ACC_count', 'ACC_m6a_prop', 'ACG_count', 'ACG_m6a_prop', 'ACT_count', 'ACT_m6a_prop', 'AGA_count', 'AGA_m6a_prop', 'AGC_count', 'AGC_m6a_prop', 'AGG_count', 'AGG_m6a_prop', 'ATA_count', 'ATA_m6a_prop', 'ATC_count', 'ATC_m6a_prop', 'ATG_count', 'ATG_m6a_prop', 'CAA_count', 'CAA_m6a_prop', 'CAC_count', 'CAC_m6a_prop', 'CAG_count', 'CAG_m6a_prop', 'CCA_count', 'CCA_m6a_prop', 'CGA_count', 'CGA_m6a_prop', 'CTA_count', 'CTA_m6a_prop

In [617]:
# group grouping cols as column
res.insert(loc=0, column="motif_query", 
          value=res[grouping_cols].apply(lambda row: "/".join(row.values.astype(str)), axis=1))

In [625]:
# all proportion cols
proportion_columns = res.columns.str.contains("prop")

res_prop = res.loc[:,res.columns.str.contains("prop")]
print("rows: {:,} | cols: {:,}".format(res_prop.shape[0], res_prop.shape[1]))

rows: 326,788 | cols: 34


In [631]:
326553-235

326318

In [635]:
res[~(res.loc[:,res.columns.str.contains("prop")] > 1).any(1)]

  res[~(res.loc[:,res.columns.str.contains("prop")] > 1).any(1)]


Unnamed: 0,motif_query,motif_name,query_name,msp_size,left_AT_count,right_AT_count,motif_AT_count,left_AT_prop,right_AT_prop,motif_AT_prop,...,GCA_count,GCA_m6a_prop,GGA_count,GGA_m6a_prop,GTA_count,GTA_m6a_prop,TAA_count,TAA_m6a_prop,TCA_count,TCA_m6a_prop
0,chr10_100001134_+/m54329U_210326_192251/136053...,chr10_100001134_+,m54329U_210326_192251/136053044/ccs,154.0,16.0,14.0,16.0,0.400,0.350,0.457143,...,2.0,0.000000,3.0,0.0,0.0,0.00,0.0,0.000000,1.0,0.000
1,chr10_100009901_+/m54329U_210813_020940/108857...,chr10_100009901_+,m54329U_210813_020940/108857329/ccs,268.0,9.0,10.0,7.0,0.225,0.250,0.200000,...,0.0,0.000000,2.0,0.5,0.0,0.00,0.0,0.000000,1.0,0.500
2,chr10_100009901_+/m54329U_210814_130637/104466...,chr10_100009901_+,m54329U_210814_130637/104466624/ccs,254.0,9.0,9.0,8.0,0.225,0.225,0.228571,...,0.0,0.000000,2.0,0.5,0.0,0.00,0.0,0.000000,1.0,1.000
3,chr10_100021043_+/m64076_210328_012155/1038111...,chr10_100021043_+,m64076_210328_012155/103811166/ccs,108.0,12.0,11.0,10.0,0.300,0.275,0.285714,...,7.0,0.285714,1.0,0.0,0.0,0.00,0.0,0.000000,0.0,0.000
4,chr10_100024809_-/m64076_210328_012155/1038111...,chr10_100024809_-,m64076_210328_012155/103811166/ccs,65.0,24.0,15.0,16.0,0.600,0.375,0.457143,...,1.0,0.000000,1.0,1.0,0.0,0.00,0.0,0.000000,4.0,0.375
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
326783,chrX_99821819_+/m64076_210328_012155/19794280/ccs,chrX_99821819_+,m64076_210328_012155/19794280/ccs,51.0,24.0,25.0,18.0,0.600,0.625,0.514286,...,3.0,1.000000,1.0,0.0,0.0,0.00,1.0,0.333333,1.0,0.000
326784,chrX_99861319_-/m64076_210328_012155/20318277/ccs,chrX_99861319_-,m64076_210328_012155/20318277/ccs,103.0,25.0,20.0,19.0,0.625,0.500,0.542857,...,0.0,0.000000,1.0,0.0,2.0,0.75,2.0,0.000000,1.0,0.000
326785,chrX_99868718_+/m64076_210328_012155/143918154...,chrX_99868718_+,m64076_210328_012155/143918154/ccs,50.0,23.0,21.0,19.0,0.575,0.525,0.542857,...,0.0,0.000000,3.0,0.0,1.0,0.00,1.0,0.000000,0.0,0.000
326786,chrX_99955847_-/m54329U_210814_130637/13631698...,chrX_99955847_-,m54329U_210814_130637/136316982/ccs,69.0,24.0,24.0,19.0,0.600,0.600,0.542857,...,1.0,1.000000,2.0,0.0,1.0,0.00,0.0,0.000000,2.0,0.250


In [624]:
res_prop[(res_prop > 1).any(1)]

  res_prop[(res_prop > 1).any(1)]


Unnamed: 0,left_AT_prop,right_AT_prop,motif_AT_prop,left_m6a_prop,right_m6a_prop,motif_m6a_prop,AAA_m6a_prop,AAC_m6a_prop,AAG_m6a_prop,AAT_m6a_prop,...,CGA_m6a_prop,CTA_m6a_prop,CTC_m6a_prop,GAA_m6a_prop,GAC_m6a_prop,GCA_m6a_prop,GGA_m6a_prop,GTA_m6a_prop,TAA_m6a_prop,TCA_m6a_prop
68,0.325,0.575,0.314286,1.076923,0.173913,1.000000,0.0,0.0,0.00,0.0,...,0.0,1.0,1.0,0.00,1.0,1.000000,1.0,1.0,0.0,0.000
149,0.375,0.400,0.257143,0.000000,0.187500,1.111111,0.0,0.0,0.00,0.0,...,1.0,0.0,0.0,0.00,1.0,1.000000,0.0,0.0,0.0,0.000
419,0.075,0.325,0.171429,1.333333,0.230769,1.000000,0.0,0.0,0.00,0.0,...,1.0,0.0,1.0,0.00,1.0,1.000000,1.0,0.0,0.0,0.000
1619,0.200,0.225,0.228571,1.125000,0.222222,0.625000,0.0,0.5,0.00,0.0,...,0.5,0.5,0.5,0.50,0.0,0.000000,0.5,0.0,0.0,0.000
4909,0.400,0.550,0.314286,0.000000,0.136364,1.090909,0.0,0.0,0.00,0.0,...,0.0,0.0,1.0,0.00,0.0,1.000000,1.0,0.0,0.0,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
311065,0.175,0.550,0.428571,1.142857,0.227273,0.800000,0.0,0.0,0.50,0.0,...,0.0,0.0,1.0,1.00,0.0,0.666667,1.0,0.0,0.0,0.750
316417,0.225,0.425,0.228571,1.111111,0.294118,0.750000,0.0,0.0,0.00,0.0,...,0.0,0.0,1.0,0.00,1.0,0.666667,1.0,0.0,0.0,0.000
318284,0.225,0.350,0.285714,1.111111,0.214286,0.900000,0.0,1.0,0.00,0.0,...,1.0,0.0,1.0,1.00,0.0,0.500000,1.0,0.0,0.0,0.000
323831,0.525,0.500,0.428571,0.142857,0.050000,0.933333,0.0,0.0,1.00,0.0,...,0.0,0.0,2.0,0.00,1.0,1.000000,1.0,0.0,0.0,0.875


In [623]:
%%time
res_prop[res_prop.apply(lambda x: (x > 1).any(), axis = 1)]

Unnamed: 0,left_AT_prop,right_AT_prop,motif_AT_prop,left_m6a_prop,right_m6a_prop,motif_m6a_prop,AAA_m6a_prop,AAC_m6a_prop,AAG_m6a_prop,AAT_m6a_prop,...,CGA_m6a_prop,CTA_m6a_prop,CTC_m6a_prop,GAA_m6a_prop,GAC_m6a_prop,GCA_m6a_prop,GGA_m6a_prop,GTA_m6a_prop,TAA_m6a_prop,TCA_m6a_prop
68,0.325,0.575,0.314286,1.076923,0.173913,1.000000,0.0,0.0,0.00,0.0,...,0.0,1.0,1.0,0.00,1.0,1.000000,1.0,1.0,0.0,0.000
149,0.375,0.400,0.257143,0.000000,0.187500,1.111111,0.0,0.0,0.00,0.0,...,1.0,0.0,0.0,0.00,1.0,1.000000,0.0,0.0,0.0,0.000
419,0.075,0.325,0.171429,1.333333,0.230769,1.000000,0.0,0.0,0.00,0.0,...,1.0,0.0,1.0,0.00,1.0,1.000000,1.0,0.0,0.0,0.000
1619,0.200,0.225,0.228571,1.125000,0.222222,0.625000,0.0,0.5,0.00,0.0,...,0.5,0.5,0.5,0.50,0.0,0.000000,0.5,0.0,0.0,0.000
4909,0.400,0.550,0.314286,0.000000,0.136364,1.090909,0.0,0.0,0.00,0.0,...,0.0,0.0,1.0,0.00,0.0,1.000000,1.0,0.0,0.0,0.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
311065,0.175,0.550,0.428571,1.142857,0.227273,0.800000,0.0,0.0,0.50,0.0,...,0.0,0.0,1.0,1.00,0.0,0.666667,1.0,0.0,0.0,0.750
316417,0.225,0.425,0.228571,1.111111,0.294118,0.750000,0.0,0.0,0.00,0.0,...,0.0,0.0,1.0,0.00,1.0,0.666667,1.0,0.0,0.0,0.000
318284,0.225,0.350,0.285714,1.111111,0.214286,0.900000,0.0,1.0,0.00,0.0,...,1.0,0.0,1.0,1.00,0.0,0.500000,1.0,0.0,0.0,0.000
323831,0.525,0.500,0.428571,0.142857,0.050000,0.933333,0.0,0.0,1.00,0.0,...,0.0,0.0,2.0,0.00,1.0,1.000000,1.0,0.0,0.0,0.875


In [622]:
res_prop[res_prop > 1].count()

left_AT_prop        0
right_AT_prop       0
motif_AT_prop       0
left_m6a_prop     129
right_m6a_prop     14
motif_m6a_prop     76
AAA_m6a_prop        0
AAC_m6a_prop        1
AAG_m6a_prop        1
AAT_m6a_prop        0
ACA_m6a_prop        0
ACC_m6a_prop        0
ACG_m6a_prop        1
ACT_m6a_prop        0
AGA_m6a_prop        1
AGC_m6a_prop        1
AGG_m6a_prop        5
ATA_m6a_prop        0
ATC_m6a_prop        2
ATG_m6a_prop        4
CAA_m6a_prop        1
CAC_m6a_prop        6
CAG_m6a_prop        9
CCA_m6a_prop        2
CGA_m6a_prop        1
CTA_m6a_prop        1
CTC_m6a_prop       10
GAA_m6a_prop        2
GAC_m6a_prop        2
GCA_m6a_prop        2
GGA_m6a_prop        4
GTA_m6a_prop        0
TAA_m6a_prop        0
TCA_m6a_prop        0
dtype: int64

In [None]:
# remove rows w/  proportion > 1

In [262]:
# group by motif/query name
grouping_cols = ["motif_name", "query_name"]
df_grouped = df.groupby(grouping_cols)
# get group names (keys)
group_names = list(df_grouped.groups.keys())
print("Unique motif-sequence groups: " + "{:,}".format(len(group_names)))

Unique motif-sequence groups: 326,816


In [None]:
# extract features by motif/query group
print("\nAggregating features!")
res = df.groupby(grouping_cols).apply(lambda x: agg_features(x)).reset_index()
print("Features: {}".format(res.columns.tolist()[2:]))
print("Total rows: " + "{:,}".format(res.shape[0]))
print("Total columns: " + "{:,}".format(res.shape[1]))

## filter max-rle pin

In [419]:
# CTCF L rle-max
pin_file = "{}/CTCF_m6a_fiberseq_L_100bp_features-rle-max.pin".format(feature_dir)
pin = pd.read_csv(pin_file, sep="\t")
print("rows: {:,}, columns: {:,}".format(pin.shape[0], pin.shape[1]))

rows: 569,891, columns: 132


In [431]:
print("{:,}".format(25034022+300))
print("{:,}".format(25034022-300))

25,034,322
25,033,722


In [582]:
pin.Label.value_counts()

-1    331543
 1    238348
Name: Label, dtype: int64

In [583]:
pin.groupby("Label").rle_max.value_counts(sort=False)

Label  rle_max
-1     0.0        16714
       1.0        61394
       2.0        60860
       3.0        49040
       4.0        36095
       5.0        26452
       6.0        19715
       7.0        14999
       8.0        11211
       9.0         8475
       10.0        6309
       11.0        4713
       12.0        3610
       13.0        2934
       14.0        2478
       15.0        1900
       16.0        1419
       17.0        1076
       18.0         702
       19.0         561
       20.0         380
       21.0         254
       22.0         144
       23.0          61
       24.0          33
       25.0          10
       26.0           3
       27.0           1
 1     0.0        15056
       1.0        40295
       2.0        28968
       3.0        18886
       4.0        12970
       5.0         9280
       6.0         8278
       7.0         7853
       8.0         7482
       9.0         7957
       10.0        8580
       11.0        8667
       12.0        9159
 

In [576]:
pin.sort_values(by="rle_max")

Unnamed: 0,SpecID,Label,msp_size,left_m6a_count,right_m6a_count,motif_m6a_count,left_AT_count,right_AT_count,motif_AT_count,left_AT_prop,...,TTC_count,TTC_m6a_prop,TTG_count,TTG_m6a_prop,TTT_count,TTT_m6a_prop,rle_max,Peptide,Proteins,scannr
379538,chr1_25034022_-/m54329U_210326_192251/89260563...,-1,264.0,13.0,12.0,13.0,17.0,13.0,13.0,0.425,...,1.0,1.0,0.0,0.0,0.0,0.0,0.0,379538,379538,379538
489658,chr5_78497949_+/m54329U_210323_190418/39584337...,-1,381.0,19.0,14.0,11.0,21.0,14.0,11.0,0.525,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,489658,489658,489658
489659,chr5_78498031_-/m54329U_210323_190418/11377169...,-1,93.0,10.0,3.0,10.0,18.0,14.0,10.0,0.450,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,489659,489659,489659
119486,chr20_1467041_-/m54329U_210813_020940/59049924...,1,289.0,9.0,8.0,7.0,11.0,11.0,7.0,0.275,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,119486,119486,119486
60862,chr16_47018428_+/m54329U_210323_190418/1388070...,1,300.0,20.0,18.0,18.0,22.0,23.0,18.0,0.550,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60862,60862,60862
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
510945,chr7_12639022_+/m64076_221119_202646/157420071...,-1,154.0,8.0,4.0,0.0,26.0,28.0,25.0,0.650,...,0.0,0.0,0.0,0.0,4.0,0.0,25.0,510945,510945,510945
394256,chr20_34834538_+/m54329U_210813_020940/7012548...,-1,125.0,12.0,2.0,0.0,27.0,28.0,26.0,0.675,...,0.0,0.0,0.0,0.0,2.0,0.0,26.0,394256,394256,394256
394259,chr20_34834538_+/m64076_210328_012155/5637530/ccs,-1,179.0,11.0,5.0,0.0,27.0,28.0,26.0,0.675,...,0.0,0.0,0.0,0.0,2.0,0.0,26.0,394259,394259,394259
285090,chr12_85477349_+/m54329U_210810_004956/1633823...,-1,143.0,7.0,7.0,0.0,29.0,25.0,26.0,0.725,...,1.0,0.0,1.0,0.0,1.0,0.0,26.0,285090,285090,285090


In [53]:
# removes positive rows w/ rle_max < rle_min
def filt_rle_max(df, rle_min = 2):
    drop_rows = (df["Label"] == 1) & (df.rle_max < rle_min)
    print("Removing {:,} rows ({:0.2f}%).".format(sum(drop_rows), (sum(drop_rows)/df.shape[0])*100))
    return(df.loc[~drop_rows])

In [56]:
print("Pre-filt: {:,}".format(pin.shape[0]))
# filter positive rows by rle_max
pin = filt_rle_max(pin, rle_min=2)
print("Total: {:,}".format(pin.shape[0]))
print("Pos: {:,}".format(sum(pin["Label"] == 1)))
print("Neg: {:,}".format(sum(pin["Label"] == -1)))

Pre-filt: 569,891
Removing 55,351 rows (9.71%).
Total: 514,540
Pos: 182,997
Neg: 331,543


In [58]:
# save filtered pin
output_file = pin_file.replace(".pin", "-filt.pin")
print("Saving to output file: {}".format(output_file))
pin.to_csv(output_file, header=True, index=None, sep="\t",)

Saving to output file: /mmfs1/gscratch/stergachislab/mwperez/ctcf-footprinting/feature_data/CTCF_m6a_fiberseq_L_100bp_features-rle-max-filt.pin
