# Running mokapot (merge res_feat w/ fiberseq data)

In [2]:
import os
import pandas as pd
import numpy as np

In [3]:
import mokapot
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt

In [4]:
import logging

# Change to True enable messages and nicely format them:
log = False
if log:
    logging.basicConfig(
        level=logging.INFO,
        format="%(levelname)s: %(message)s",
    )

In [5]:
# set folders
project_dir = "/mmfs1/gscratch/stergachislab/mwperez/ctcf-footprinting"
data_dir = "{}/feature_data".format(project_dir)
fiber_dir = "{}/candidate_footprints".format(project_dir)
moka_dir = "{}/mokapot_res".format(project_dir)

## v2: CTCF L pos & neg (5%) | rle & kmer features

* CTCF_L motifs for BOTH pos & neg
* mokapot
    * res: (238,349)
    * fibers
        * merged: CTCF_100bp_L_5_v2.mokapot.m6a_fiberseq.txt (13,062,453)
        * pos: CTCF_100bp_L_5_v2.mokapot.m6a_fiberseq-positive.txt (6,388,436)
        * neg: CTCF_100bp_L_5_v2.mokapot.m6a_fiberseq-negative.txt (6,674,018)
* features
    * pin: CTCF_m6a_fiberseq_L_100bp_features-v2.pin (569,889 motif-query combined)
    * pos: CTCF_m6a_fiberseq_L_100bp_positive_features-v2.txt (238,349 motif-query)
    * neg: CTCF_m6a_fiberseq_L_100bp_small_5_negative_features-v2.txt (331,543 motif-query)
* fiberseq
    * pos: CTCF_m6a_fiberseq_L_100bp_positive.txt (12,739,713 observations)
    * neg: CTCF_m6a_fiberseq_L_100bp_small_5_negative.txt (39,485,335 observations)

### format positive data

In [6]:
# CTCF L (positive)
data_file = os.path.join(fiber_dir, "CTCF_m6a_fiberseq_L_100bp_positive.txt")

In [7]:
%%time
# read in data
n_rows = None
df = pd.read_csv(data_file, sep="\t", nrows=n_rows)
print(f"{df.shape[0]:,d}")

# filter for only m6a & msp rows
df = df[df["centered_position_type"].isin(["m6a", "msp"])]
print(f"{df.shape[0]:,d}")

12,739,712
11,860,730
CPU times: user 20.8 s, sys: 3.26 s, total: 24.1 s
Wall time: 24.2 s


In [8]:
# add col to df of unique motifs names
df.insert(loc=0, column="motif_name", 
          value=df["chrom"]+"_"+df["centering_position"].astype(str)+"_"+df["strand"].astype(str))
# add uniuqe motif/query name column
df.insert(loc=0, column="motif_query", 
          value=df["motif_name"].astype(str)+"/"+df["query_name"].astype(str))
print(f"{df.shape[0]:,d}")

11,860,730


In [9]:
print("Columns: {}".format(df.shape[1]))
print("Total rows: " + "{:,}".format(df.shape[0]))
print("Total unique motifs: {:,}".format(df.motif_name.nunique()))
print("Total unique query_names: " + "{:,}".format(df.query_name.nunique()))
print("Total unique motif-query: " + "{:,}".format(df.motif_query.nunique()))

print("MSP's: " + "{:,}".format(df[df["centered_position_type"] == "msp"].shape[0]))
print("m6a's: " + "{:,}".format(df[df["centered_position_type"] == "m6a"].shape[0]))

Columns: 15
Total rows: 11,860,730
Total unique motifs: 6,504
Total unique query_names: 328,835
Total unique motif-query: 360,604
MSP's: 396,372
m6a's: 11,464,358


In [11]:
df_pos = df

In [10]:
def clean_sequences(df):
    '''Remove rows with N characters.'''
    print("Removing "+ "{:,} rows.".format(df["subset_sequence"].str.contains("N").sum()))
    return df[~df["subset_sequence"].str.contains("N")]

In [12]:
# remove rows with Ns in sequence
df_pos = clean_sequences(df_pos)
print("rows: {:,} | cols: {:,}".format(df_pos.shape[0], df_pos.shape[1]))

Removing 114,621 rows.
rows: 11,746,109 | cols: 15


In [13]:
def filt_msps(df, motif_len=35):
    '''Filters for motif/query instances with a motif within 
    a MSP and adds MSP length to each motif/query group.'''
    
    # position of MSPs containing a motif
    msp_mask = (df["centered_position_type"] == "msp") & (df["centered_start"] <= 0) & (df["centered_end"] >= motif_len)
    print("MSPs containing a motif: " + "{:,}".format(msp_mask.sum()))
    msp_groups = [(row["motif_name"], row["query_name"]) for idx, row in df[msp_mask].iterrows()]
    
    # filter for rows with motifs within an MSP (gets both MSP's & m6a's)
    df = df[df[["motif_name", "query_name"]].apply(tuple, 1).isin(msp_groups)]
    print("Total observations from motifs within an MSP: " + "{:,}".format(df.shape[0]))
    
    # add MSP size corresponding to each group
    # df with msp sizes
    df_msp = df.loc[msp_mask, ["motif_name", "query_name", "centered_end", "centered_start"]]
    df_msp["msp_size"] = df_msp["centered_end"] - df_msp["centered_start"]

    # match msp back to it's motif & fiber
    df = df.merge(df_msp[["motif_name", "query_name", "msp_size"]], on=["motif_name", "query_name"])
    print("Merged df shape w/ msp_size: " + "{:,}".format(df.shape[0]))
    return df

In [14]:
# filter for regions with a motif within an MSP & add MSP size col
df_pos = filt_msps(df_pos)
# remove MSP rows
df_pos = df_pos[df_pos["centered_position_type"] == "m6a"]
print("Total m6a observations: " + "{:,}".format(df_pos.shape[0]))

MSP's with a motif: 238,348
Total observations from motifs within an MSP: 9,735,406
Merged df shape w/ msp_size: 9,735,406
Total m6a observations: 9,495,739


In [16]:
print("{:,}".format(df_pos.shape[0]))

9,495,739


In [17]:
# group by motif & query name
grouping_cols = ["motif_name", "query_name"]
df_grouped = df_pos.groupby(grouping_cols)

# get group names (keys)
group_names = list(df_grouped.groups.keys())
print("Unique motif-sequence groups: " + "{:,}".format(len(group_names)))

Unique motif-sequence groups: 238,348


In [18]:
df_pos.head()

Unnamed: 0,motif_query,motif_name,chrom,centering_position,strand,subset_sequence,reference_start,reference_end,query_name,centered_query_start,centered_query_end,query_length,centered_position_type,centered_start,centered_end,msp_size
0,chr1_1033080_+/m54329U_210813_020940/24183559/ccs,chr1_1033080_+,chr1,1033080,+,GACCTACGGGGGCGGGTGTGGGGACGCCGGACTACGCGTCAGGAGT...,1008277,1033947,m54329U_210813_020940/24183559/ccs,-24816,861,25677,m6a,-77,-76,225
1,chr1_1033080_+/m54329U_210813_020940/24183559/ccs,chr1_1033080_+,chr1,1033080,+,GACCTACGGGGGCGGGTGTGGGGACGCCGGACTACGCGTCAGGAGT...,1008277,1033947,m54329U_210813_020940/24183559/ccs,-24816,861,25677,m6a,-70,-69,225
2,chr1_1033080_+/m54329U_210813_020940/24183559/ccs,chr1_1033080_+,chr1,1033080,+,GACCTACGGGGGCGGGTGTGGGGACGCCGGACTACGCGTCAGGAGT...,1008277,1033947,m54329U_210813_020940/24183559/ccs,-24816,861,25677,m6a,-67,-66,225
3,chr1_1033080_+/m54329U_210813_020940/24183559/ccs,chr1_1033080_+,chr1,1033080,+,GACCTACGGGGGCGGGTGTGGGGACGCCGGACTACGCGTCAGGAGT...,1008277,1033947,m54329U_210813_020940/24183559/ccs,-24816,861,25677,m6a,-62,-61,225
4,chr1_1033080_+/m54329U_210813_020940/24183559/ccs,chr1_1033080_+,chr1,1033080,+,GACCTACGGGGGCGGGTGTGGGGACGCCGGACTACGCGTCAGGAGT...,1008277,1033947,m54329U_210813_020940/24183559/ccs,-24816,861,25677,m6a,-60,-59,225


In [21]:
# save to file
output_file = os.path.basename(data_file).replace(".txt", "-cleaned_100bp.txt")
output_file = os.path.join(moka_dir, output_file)
df_pos.to_csv(output_file, sep="\t", header=True, index=False)
print("Saving positive data to: {}".format(os.path.basename(output_file)))

Saving positive data to: CTCF_m6a_fiberseq_L_100bp_positive-cleaned_100bp.txt


### format negative data

In [22]:
# CTCF L (negative 5%)
data_file = os.path.join(fiber_dir, "CTCF_m6a_fiberseq_L_100bp_small_5_negative.txt")

In [23]:
%%time
# read in data
n_rows = None
df_neg = pd.read_csv(data_file, sep="\t", nrows=n_rows)
print(f"{df_neg.shape[0]:,d}")

# filter for only m6a & msp rows
df_neg = df_neg[df_neg["centered_position_type"].isin(["m6a", "msp"])]
print(f"{df_neg.shape[0]:,d}")

39,485,334
32,102,125
CPU times: user 1min 4s, sys: 9.59 s, total: 1min 14s
Wall time: 1min 14s


In [24]:
# add col to df of unique motifs names
df_neg.insert(loc=0, column="motif_name", 
              value=df_neg["chrom"]+"_"+df_neg["centering_position"].astype(str)+"_"+df_neg["strand"].astype(str))
# add uniuqe motif/query name column
df_neg.insert(loc=0, column="motif_query", 
              value=df_neg["motif_name"].astype(str)+"/"+df_neg["query_name"].astype(str))
print(f"{df_neg.shape[0]:,d}")

32,102,125


In [25]:
print("Columns: {}".format(df_neg.shape[1]))
print("Total rows: " + "{:,}".format(df_neg.shape[0]))
print("Total unique motifs: {:,}".format(df_neg.motif_name.nunique()))
print("Total unique query_names: " + "{:,}".format(df_neg.query_name.nunique()))
print("Total unique motif-query: " + "{:,}".format(df_neg.motif_query.nunique()))

print("MSP's: " + "{:,}".format(df_neg[df_neg["centered_position_type"] == "msp"].shape[0]))
print("m6a's: " + "{:,}".format(df_neg[df_neg["centered_position_type"] == "m6a"].shape[0]))

Columns: 15
Total rows: 32,102,125
Total unique motifs: 702,809
Total unique query_names: 430,490
Total unique motif-query: 2,112,659
MSP's: 2,717,416
m6a's: 29,384,709


In [27]:
# remove rows with Ns in sequence
df_neg = clean_sequences(df_neg)
print("rows: {:,} | cols: {:,}".format(df_neg.shape[0], df_neg.shape[1]))

Removing 0 rows.
rows: 31,885,579 | cols: 15


In [28]:
# filter for regions with a motif within an MSP & add MSP size col
df_neg = filt_msps(df_neg)
# remove MSP rows
df_neg = df_neg[df_neg["centered_position_type"] == "m6a"]
print("Total m6a observations: " + "{:,}".format(df_neg.shape[0]))

MSP's with a motif: 331,589
Total observations from motifs within an MSP: 8,380,444
Merged df shape w/ msp_size: 8,383,268
Total m6a observations: 8,040,195


In [29]:
print("Total unique motif-query: " + "{:,}".format(df_neg.motif_query.nunique()))

Total unique motif-query: 331,543


In [30]:
# group by motif & query name
grouping_cols = ["motif_name", "query_name"]
df_grouped = df_neg.groupby(grouping_cols)

# get group names (keys)
group_names = list(df_grouped.groups.keys())
print("Unique motif-sequence groups: " + "{:,}".format(len(group_names)))

Unique motif-sequence groups: 331,543


In [31]:
df_neg.head()

Unnamed: 0,motif_query,motif_name,chrom,centering_position,strand,subset_sequence,reference_start,reference_end,query_name,centered_query_start,centered_query_end,query_length,centered_position_type,centered_start,centered_end,msp_size
0,chr1_11256_-/m54329U_210814_130637/153159059/ccs,chr1_11256_-,chr1,11256,-,TGCCAGCAGGCGGCGTGCCACCACTATACAGTAAGCAAGAGGGCCC...,10000,40236,m54329U_210814_130637/153159059/ccs,-4179,28937,33116,m6a,-81,-80,346
1,chr1_11256_-/m54329U_210814_130637/153159059/ccs,chr1_11256_-,chr1,11256,-,TGCCAGCAGGCGGCGTGCCACCACTATACAGTAAGCAAGAGGGCCC...,10000,40236,m54329U_210814_130637/153159059/ccs,-4179,28937,33116,m6a,-78,-77,346
2,chr1_11256_-/m54329U_210814_130637/153159059/ccs,chr1_11256_-,chr1,11256,-,TGCCAGCAGGCGGCGTGCCACCACTATACAGTAAGCAAGAGGGCCC...,10000,40236,m54329U_210814_130637/153159059/ccs,-4179,28937,33116,m6a,-75,-74,346
3,chr1_11256_-/m54329U_210814_130637/153159059/ccs,chr1_11256_-,chr1,11256,-,TGCCAGCAGGCGGCGTGCCACCACTATACAGTAAGCAAGAGGGCCC...,10000,40236,m54329U_210814_130637/153159059/ccs,-4179,28937,33116,m6a,-74,-73,346
4,chr1_11256_-/m54329U_210814_130637/153159059/ccs,chr1_11256_-,chr1,11256,-,TGCCAGCAGGCGGCGTGCCACCACTATACAGTAAGCAAGAGGGCCC...,10000,40236,m54329U_210814_130637/153159059/ccs,-4179,28937,33116,m6a,-73,-72,346


In [32]:
# save to file
output_file = os.path.basename(data_file).replace(".txt", "-cleaned_100bp.txt")
output_file = os.path.join(moka_dir, output_file)
df_neg.to_csv(output_file, sep="\t", header=True, index=False)
print("Saving negative data to: {}".format(os.path.basename(output_file)))

Saving negative data to: CTCF_m6a_fiberseq_L_100bp_small_5_negative-cleaned_100bp.txt


### merge w/ mokapot res

__positive data__

In [33]:
print("positive rows: {:,} | cols: {:,}".format(df_pos.shape[0], df_pos.shape[1]))
print("negative rows: {:,} | cols: {:,}".format(df_neg.shape[0], df_neg.shape[1]))

positive rows: 9,495,739 | cols: 16
negative rows: 8,040,195 | cols: 16


In [35]:
print(sum((df_pos["centered_start"] >= -40) & (df_pos["centered_end"] <= 75)))
print(sum((df_neg["centered_start"] >= -40) & (df_neg["centered_end"] <= 75)))

6388435
6674017


read in mokapot results with q-values

__initial CTCF mokapot run__
* positive set
    * features: CTCF_m6a_fiberseq_L_100bp_positive_features-motif_query.txt (rows: 238,348)
    * fiberseq: CTCF_m6a_fiberseq_L_100bp_positive.txt (rows: 12,739,713)
* small (1%) negative set
    * features: CTCF_m6a_fiberseq_L_100bp_small_negative_features-motif_query.txt (rows: 65,944)
    * fiberseq: CTCF_m6a_fiberseq_L_100bp_small_negative.txt (rows: 7,878,027)

In [6]:
# mokapot results w/ features
dataset = "CTCF_L"
mokapot_file = "{}/{}.mokapot.features.txt".format(moka_dir, dataset)
assert os.path.exists(mokapot_file), f"ERROR: file not found: {mokapot_file}"

# read res
res = pd.read_csv(mokapot_file, sep="\t")
print("Mokapot res - rows: {:,} | cols: {:,}".format(res.shape[0], res.shape[1]))

Mokapot res - rows: 304,371 | cols: 131


### format fiberseq data (positive)

read in CTCF fiberseq reads (positive)

In [10]:
# positive fiberseq reads
pos_file = "{}/CTCF_m6a_fiberseq_L_100bp_positive.txt".format(fiber_dir)
assert os.path.exists(pos_file), f"ERROR: file not found: {pos_file}"

# read feature file
df_pos = pd.read_csv(pos_file, sep="\t")
print("Pos features - rows: {:,} | cols: {:,}".format(df_pos.shape[0], df_pos.shape[1]))

Pos features - rows: 12,739,712 | cols: 13


In [13]:
# filter for only m6a & msp rows
df_pos = df_pos[df_pos["centered_position_type"].isin(["m6a", "msp"])]
print("rows: {:,} | cols: {:,}".format(df_pos.shape[0], df_pos.shape[1]))

# make column of motif_name & motif_query names
df_pos.insert(loc=0, column="motif_name", value=
              df_pos["chrom"]+"_"+df_pos["centering_position"].astype(str)+"_"+df_pos["strand"].astype(str))
df_pos.insert(loc=0, column="motif_query", value=
              df_pos["motif_name"].astype(str)+"/"+df_pos["query_name"].astype(str))

print("rows: {:,} | cols: {:,}".format(df_pos.shape[0], df_pos.shape[1]))

rows: 11,860,730 | cols: 13
rows: 11,860,730 | cols: 15


In [15]:
print("Columns: {:,}".format(df_pos.shape[1]))
print("Total rows: " + "{:,}".format(df_pos.shape[0]))
print("Total unique motifs: {:,}".format(df_pos.motif_name.nunique()))
print("Total unique query_names: " + "{:,}".format(df_pos.query_name.nunique()))

print("MSP's: " + "{:,}".format(df_pos[df_pos["centered_position_type"] == "msp"].shape[0]))
print("m6a's: " + "{:,}".format(df_pos[df_pos["centered_position_type"] == "m6a"].shape[0]))

Columns: 15
Total rows: 11,860,730
Total unique motifs: 6,504
Total unique query_names/strands: 328,835
MSP's: 396,372
m6a's: 11,464,358


In [18]:
print("rows: {:,} | cols: {:,}".format(df_pos.shape[0], df_pos.shape[1]))
print("unique motif/query groups: {:,}".format(df_pos["motif_query"].nunique()))

rows: 11,860,730 | cols: 15
unique motif/query groups: 360,604


In [22]:
def clean_sequences(df):
    '''Remove rows with N characters.'''
    print("Removing "+ "{:,} rows.".format(df["subset_sequence"].str.contains("N").sum()))
    return df[~df["subset_sequence"].str.contains("N")]

In [23]:
# remove rows with Ns in sequence
df_pos = clean_sequences(df_pos)

Removing 114,621 rows.


In [24]:
def filt_msps(df, motif_len=35):
    '''Filters for motif/query instances with a motif within 
    a MSP and adds MSP length to each motif/query group.'''
    
    # position of MSPs containing a motif
    msp_mask = (df["centered_position_type"] == "msp") & (df["centered_start"] <= 0) & (df["centered_end"] >= motif_len)
    print("MSP's with a motif: " + "{:,}".format(msp_mask.sum()))
    msp_groups = [(row["motif_name"], row["query_name"]) for idx, row in df[msp_mask].iterrows()]
    
    # filter for rows with motifs within an MSP (gets both MSP's & m6a's)
    df = df[df[["motif_name", "query_name"]].apply(tuple, 1).isin(msp_groups)]
    print("Total observations from motifs within an MSP: " + "{:,}".format(df.shape[0]))
    
    # add MSP size corresponding to each group
    # df with msp sizes
    df_msp = df.loc[msp_mask, ["motif_name", "query_name", "centered_end", "centered_start"]]
    df_msp["msp_size"] = df_msp["centered_end"] - df_msp["centered_start"]

    # match msp back to it's motif & fiber
    df = df.merge(df_msp[["motif_name", "query_name", "msp_size"]], on=["motif_name", "query_name"])
    print("Merged df shape w/ msp_size: " + "{:,}".format(df.shape[0]))
    return df

In [25]:
# filter for regions with a motif within an MSP & add MSP size col
df_pos = filt_msps(df_pos)
# remove MSP rows
df_pos = df_pos[df_pos["centered_position_type"] == "m6a"]
print("Total m6a observations: " + "{:,}".format(df_pos.shape[0]))

MSP's with a motif: 238,348
Total observations from motifs within an MSP: 9,735,406
Merged df shape w/ msp_size: 9,735,406
Total m6a observations: 9,495,739


In [26]:
# group by motif & query name
grouping_cols = ["motif_name", "query_name"]
df_grouped = df_pos.groupby(grouping_cols)

# get group names (keys)
group_names = list(df_grouped.groups.keys())
print("Unique motif-sequence groups: " + "{:,}".format(len(group_names)))

Unique motif-sequence groups: 238,348


### format fiberseq data (negative)

In [64]:
# negative fiberseq reads
neg_file = "{}/CTCF_m6a_fiberseq_L_100bp_small_negative.txt".format(fiber_dir)
assert os.path.exists(neg_file), f"ERROR: file not found: {neg_file}"

# read feature file
df_neg = pd.read_csv(neg_file, sep="\t")
print("Neg features - rows: {:,} | cols: {:,}".format(df_neg.shape[0], df_neg.shape[1]))

Neg features - rows: 7,878,026 | cols: 13


In [66]:
# filter for only m6a & msp rows
df_neg = df_neg[df_neg["centered_position_type"].isin(["m6a", "msp"])]
print("rows: {:,} | cols: {:,}".format(df_pos.shape[0], df_pos.shape[1]))

# make column of motif_name & motif_query names
df_neg.insert(loc=0, column="motif_name", value=
              df_neg["chrom"]+"_"+df_neg["centering_position"].astype(str)+"_"+df_neg["strand"].astype(str))
df_neg.insert(loc=0, column="motif_query", value=
              df_neg["motif_name"].astype(str)+"/"+df_neg["query_name"].astype(str))

print("rows: {:,} | cols: {:,}".format(df_neg.shape[0], df_neg.shape[1]))

rows: 9,495,739 | cols: 16
rows: 6,405,114 | cols: 15


In [68]:
print("Columns: {:,}".format(df_neg.shape[1]))
print("Total rows: " + "{:,}".format(df_neg.shape[0]))
print("Total unique motifs: {:,}".format(df_neg.motif_name.nunique()))
print("Total unique query_names: " + "{:,}".format(df_neg.query_name.nunique()))
print("unique motif/query groups: {:,}".format(df_neg["motif_query"].nunique()))

print("MSP's: " + "{:,}".format(df_neg[df_neg["centered_position_type"] == "msp"].shape[0]))
print("m6a's: " + "{:,}".format(df_neg[df_neg["centered_position_type"] == "m6a"].shape[0]))

Columns: 15
Total rows: 6,405,114
Total unique motifs: 317,997
Total unique query_names: 86,064
unique motif/query groups: 422,061
MSP's: 543,117
m6a's: 5,861,997


In [70]:
# remove rows with Ns in sequence
df_neg = clean_sequences(df_neg)

Removing 43,252 rows.


In [71]:
# filter for regions with a motif within an MSP & add MSP size col
df_neg = filt_msps(df_neg)

# remove MSP rows
df_neg = df_neg[df_neg["centered_position_type"] == "m6a"]
print("Total m6a observations: " + "{:,}".format(df_neg.shape[0]))

MSP's with a motif: 65,952
Total observations from motifs within an MSP: 1,663,294
Merged df shape w/ msp_size: 1,663,752
Total m6a observations: 1,595,563


In [72]:
# group by motif & query name
grouping_cols = ["motif_name", "query_name"]
df_grouped = df_neg.groupby(grouping_cols)

# get group names (keys)
group_names = list(df_grouped.groups.keys())
print("Unique motif-sequence groups: " + "{:,}".format(len(group_names)))

Unique motif-sequence groups: 65,943


### merge formated fiberseq data with mokapot q-value

<font color="red">__Why are there more rows after merging? Where did they come from?__</font>

In [34]:
# fix motif_query name
res["motif_query"] = res["motif_name"]+"/"+res["query_name"]

In [73]:
print("res rows: {:,} | cols: {:,}".format(res.shape[0], res.shape[1]))
print("pos rows: {:,} | cols: {:,}".format(df_pos.shape[0], df_pos.shape[1]))
print("neg rows: {:,} | cols: {:,}".format(df_neg.shape[0], df_neg.shape[1]))

res rows: 304,371 | cols: 131
pos rows: 9,495,739 | cols: 16
neg rows: 1,595,563 | cols: 16


In [79]:
res.tail(1)

Unnamed: 0,motif_query,motif_name,query_name,msp_size,left_m6a_count,right_m6a_count,motif_m6a_count,left_AT_count,right_AT_count,motif_AT_count,...,TTA_count,TTA_m6a_prop,TTC_count,TTC_m6a_prop,TTG_count,TTG_m6a_prop,TTT_count,TTT_m6a_prop,Label,mokapot q-value
304370,chrY_56848671_+/m54329U_210810_004956/39979417...,chrY_56848671_+,m54329U_210810_004956/39979417/ccs,59.0,1.0,3.0,3.0,19.0,21.0,17.0,...,0.0,0.0,1.0,0.0,2.0,0.0,1.0,0.333333,-1,1.0


In [77]:
df_pos.head(1)

Unnamed: 0,motif_query,motif_name,chrom,centering_position,strand,subset_sequence,reference_start,reference_end,query_name,centered_query_start,centered_query_end,query_length,centered_position_type,centered_start,centered_end,msp_size
0,chr1_1033080_+/m54329U_210813_020940/24183559/ccs,chr1_1033080_+,chr1,1033080,+,GACCTACGGGGGCGGGTGTGGGGACGCCGGACTACGCGTCAGGAGT...,1008277,1033947,m54329U_210813_020940/24183559/ccs,-24816,861,25677,m6a,-77,-76,225


In [78]:
df_neg.head(1)

Unnamed: 0,motif_query,motif_name,chrom,centering_position,strand,subset_sequence,reference_start,reference_end,query_name,centered_query_start,centered_query_end,query_length,centered_position_type,centered_start,centered_end,msp_size
0,chr1_11256_-/m54329U_210810_004956/34799917/ccs,chr1_11256_-,chr1,11256,-,TGCCAGCAGGCGGCGTGCCACCACTATACAGTAAGCAAGAGGGCCC...,10000,26121,m54329U_210810_004956/34799917/ccs,-6067,14864,20931,m6a,-81,-80,317


In [86]:
print("res rows: {:,} | cols: {:,}".format(res.shape[0], res.shape[1]))
print("pos rows: {:,} | cols: {:,}".format(df_pos.shape[0], df_pos.shape[1]))
print("neg rows: {:,} | cols: {:,}".format(df_neg.shape[0], df_neg.shape[1]))

res rows: 304,371 | cols: 131
pos rows: 9,495,739 | cols: 16
neg rows: 1,595,563 | cols: 16


In [41]:
# add mokapot q-value to POS table
d = pd.merge(df_pos, res[["motif_query", "Label", "mokapot q-value"]], on="motif_query", how="inner")
print("rows: {:,} | cols: {:,}".format(d.shape[0], d.shape[1]))

rows: 9,498,533 | cols: 18


In [63]:
# save merged POS table
output_file = mokapot_file.replace("features.txt", "m6a_fiberseq-positive.txt")
d.to_csv(output_file, sep="\t", header=True, index=False)
print("Saving to: {}".format(os.path.basename(output_file)))

Saving to: CTCF_L.mokapot.m6a_fiberseq-positive.txt


In [85]:
# add mokapot q-value to NEG table
d_n = pd.merge(df_neg, res[["motif_query", "Label", "mokapot q-value"]], on="motif_query", how="inner")
print("rows: {:,} | cols: {:,}".format(d_n.shape[0], d_n.shape[1]))

rows: 1,595,563 | cols: 18


In [87]:
# save merged NEG table
output_file = mokapot_file.replace("features.txt", "m6a_fiberseq-negative.txt")
d_n.to_csv(output_file, sep="\t", header=True, index=False)
print("Saving to: {}".format(os.path.basename(output_file)))

Saving to: CTCF_L.mokapot.m6a_fiberseq-negative.txt


__merge POS & NEG data__

In [94]:
print("pos rows: {:,} | cols: {:,}".format(d.shape[0], d.shape[1]))
print("neg rows: {:,} | cols: {:,}".format(d_n.shape[0], d_n.shape[1]))

pos rows: 9,498,533 | cols: 18
neg rows: 1,595,563 | cols: 18


In [100]:
# merge negative features with positive features (append rows)
d_merged = pd.concat([d, d_n])
print("rows: {:,} | cols: {:,}".format(d_merged.shape[0], d_merged.shape[1]))

rows: 11,094,096 | cols: 18


In [102]:
# save merged table
output_file = mokapot_file.replace("features.txt", "m6a_fiberseq.txt")
d_merged.to_csv(output_file, sep="\t", header=True, index=False)
print("Saving to: {}".format(os.path.basename(output_file)))

Saving to: CTCF_L.mokapot.m6a_fiberseq.txt


In [34]:
df_pos.insert(loc=0, column="motif_query", value=(df_pos["motif_name"] + df_pos["query_name"]))
print("Pos features - rows: {:,} | cols: {:,}".format(df_pos.shape[0], df_pos.shape[1]))

Pos features - rows: 238,348 | cols: 129


In [14]:
# feature file
neg_file = "{}/CTCF_m6a_fiberseq_L_100bp_small_negative_features-motif_query.txt".format(data_dir)
assert os.path.exists(neg_file), f"ERROR: file not found: {neg_file}"

# read feature file
df_neg = pd.read_csv(neg_file, sep="\t")
print("Neg features - rows: {:,} | cols: {:,}".format(df_neg.shape[0], df_neg.shape[1]))

Neg features - rows: 65,943 | cols: 129


In [23]:
print("Total feature rows: {:,}".format(df_pos.shape[0]+df_neg.shape[0]))

Total feature rows: 304,291


In [24]:
# input pin file
pin_file = "{}/CTCF_m6a_fiberseq_L_100bp_features.pin".format(data_dir)
assert os.path.exists(pin_file), f"ERROR: file not found: {pin_file}"

# read pin
pin = pd.read_csv(pin_file, sep="\t")
print("Pin features - rows: {:,} | cols: {:,}".format(pin.shape[0], pin.shape[1]))

Pin features - rows: 304,291 | cols: 131


### match mokapot res to features

In [43]:
res = res.sort_values(by=["SpecID"], ascending=True, ignore_index=True)
res.head()

Unnamed: 0,SpecID,Label,Peptide,scannr,mokapot score,mokapot q-value,mokapot PEP,Proteins
0,0,True,0,0,0.972169,0.001651,0.004676,0
1,1,True,1,1,0.985478,0.000817,0.002941,1
2,2,True,2,2,0.975089,0.001452,0.004233,2
3,3,True,3,3,0.75493,0.010283,0.126648,3
4,4,True,4,4,0.987861,0.000692,0.002701,4


In [50]:
df_pos.shape[0]

238348

In [53]:
pin_pos = pd.merge(df_pos, pin.iloc[:df_pos.shape[0]], on=df_pos.columns.tolist()[3:], how="inner")
print("rows: {:,} | cols: {:,}".format(pin_pos.shape[0], pin_pos.shape[1]))

rows: 238,428 | cols: 134


In [57]:
# remove unnecessary columns
to_remove = ["Peptide", "Proteins", "scannr"]
pin_pos = pin_pos.drop(to_remove, axis=1)
print("rows: {:,} | cols: {:,}".format(pin_pos.shape[0], pin_pos.shape[1]))

rows: 238,428 | cols: 131


In [59]:
# change Label to bool
res.Label = res.Label.replace({True: 1, False: 0})
res.head()

Unnamed: 0,SpecID,Label,Peptide,scannr,mokapot score,mokapot q-value,mokapot PEP,Proteins
0,0,1,0,0,0.972169,0.001651,0.004676,0
1,1,1,1,1,0.985478,0.000817,0.002941,1
2,2,1,2,2,0.975089,0.001452,0.004233,2
3,3,1,3,3,0.75493,0.010283,0.126648,3
4,4,1,4,4,0.987861,0.000692,0.002701,4


In [65]:
# merge "mokapot q-value" with pin_pos
res_feat = pd.merge(pin_pos, res[["SpecID", "Label", "mokapot q-value"]], on=["SpecID", "Label"], how="inner")
print("rows: {:,} | cols: {:,}".format(pin_pos.shape[0], pin_pos.shape[1]))

rows: 238,428 | cols: 131


In [77]:
# remove unnecessary columns
to_remove = ["SpecID"]
res_feat = res_feat.drop(to_remove, axis=1)
print("rows: {:,} | cols: {:,}".format(res_feat.shape[0], res_feat.shape[1]))

rows: 238,428 | cols: 131


In [72]:
# add Label column = -1 & mokapot q-value = 0
df_neg["Label"] = -1
# add mokapot q-value's = 1
df_neg["mokapot q-value"] = 1

In [83]:
# merge negative features with positive features
res_feat = pd.concat([res_feat, df_neg])
print("rows: {:,} | cols: {:,}".format(res_feat.shape[0], res_feat.shape[1]))

rows: 304,371 | cols: 131


In [89]:
# save output
print(os.path.basename(mokapot_file))
output_file = mokapot_file.replace("psms.txt", "features.txt")
res_feat.to_csv(output_file, sep="\t", header=True, index=False)
print("Saving to: {}".format(os.path.basename(output_file)))

CTCF_L.mokapot.psms.txt
Saving to: CTCF_L.mokapot.features.txt


### make & save mokapot plot

In [92]:
print("Making accessibility model.")
print("dataset size: {:,}".format(pin.shape[0]))

train = pin
# PSM (peptide-spectrum matches, proteomics thing)
train_psms = mokapot.read_pin(train)
print("Train - rows: {:,} | columns: {:,}".format(train.shape[0], train.shape[1]))
# weighing for imbalanced data
scale_pos_weight = sum(train.Label == -1) / sum(train.Label == 1)

# hyper paramaters for XGBoost
grid = {
    # decision trees in model
    "n_estimators": [25, 50, 100],
    "scale_pos_weight": [scale_pos_weight],
    # depth of tree
    "max_depth": [3, 6, 9],
    # rows a terminal leaf need to touch to be kept
    "min_child_weight": [3, 6, 9],
    # learning rate
    "gamma": [0.1, 1, 10],
}

# set up model (ensemble classifier)
xgb_mod = GridSearchCV(
    XGBClassifier(eval_metric = "auc"),
    param_grid = grid,
    cv = 3,
    # receiver operator curve
    scoring = "roc_auc",
    verbose = 2,
)

Making accessibility model.
dataset size: 304,291
Train - rows: 304,291 | columns: 131


In [None]:
print("Running mokapot.")
train_fdr = 0.10
test_fdr = 0.05
subset_max_train = 2_000_000

# machine learning model to re-score PSMs
mod = mokapot.Model(xgb_mod, train_fdr=train_fdr, subset_max_train=subset_max_train)
# run mokapot
moka_conf, models = mokapot.brew(psms=train_psms, model=mod, test_fdr=test_fdr)

Running mokapot.




Fitting 3 folds for each of 81 candidates, totalling 243 fits


In [None]:
# run mokapot
moka_conf, models = 

### match mokapot res to fiberseq data (from ft-center)

In [1]:
input_file

NameError: name 'input_file' is not defined