In [4]:
import pandas as pd

df = pd.read_csv("unified_peptide_db.csv")
df.head()


Unnamed: 0.1,Unnamed: 0,protein_id,peptide,observed_frequency_SILAC,observed_intensity_sum_SILAC,observed_frequency_Label_free,observed_intensity_sum_Label_free,presence_SILAC,presence_Label_free,pep_len,protein_sequence
0,0,gi|151220213|ref|YP_001331036.1|,AYNPLFIYGGVGLGK,42.0,157109609.0,11.0,25094390.0,3,3,15,MSEKEIWEKVLEIAQEKLSAVSYSTFLKDTELYTIKDGEAIVLSSI...
1,1,gi|151220213|ref|YP_001331036.1|,DHTTVIHAHEK,0.0,0.0,0.0,0.0,2,2,11,MSEKEIWEKVLEIAQEKLSAVSYSTFLKDTELYTIKDGEAIVLSSI...
2,2,gi|151220213|ref|YP_001331036.1|,DHTTVIHAHEKISK,0.0,0.0,0.0,0.0,2,2,14,MSEKEIWEKVLEIAQEKLSAVSYSTFLKDTELYTIKDGEAIVLSSI...
3,3,gi|151220213|ref|YP_001331036.1|,DIIQAPK,0.0,0.0,0.0,0.0,2,2,7,MSEKEIWEKVLEIAQEKLSAVSYSTFLKDTELYTIKDGEAIVLSSI...
4,4,gi|151220213|ref|YP_001331036.1|,DIIQAPKSK,0.0,0.0,0.0,0.0,2,2,9,MSEKEIWEKVLEIAQEKLSAVSYSTFLKDTELYTIKDGEAIVLSSI...


In [5]:
import pandas as pd

# --- Load data ---
df = pd.read_csv("unified_peptide_db.csv")

# Columns:
# N,protein_id,peptide,observed_frequency_SILAC,observed_intensity_sum_SILAC,
# observed_frequency_Label_free,observed_intensity_sum_Label_free,
# presence_SILAC,presence_Label_free,pep_len,protein_sequence

# --- Define missing vs present codes according to your scheme ---
MISSING_CODES  = [1, 2, 6]
PRESENT_CODES  = [3, 4, 5]

# SILAC missing / present
df["missing_SILAC"] = df["presence_SILAC"].isin(MISSING_CODES)
df["present_SILAC"] = df["presence_SILAC"].isin(PRESENT_CODES)

# Label-free missing / present
df["missing_LF"] = df["presence_Label_free"].isin(MISSING_CODES)
df["present_LF"] = df["presence_Label_free"].isin(PRESENT_CODES)

# --- 1) Overlap of peptide sequences: missing vs present (SILAC) ---

silac_missing_peps     = set(df[df["missing_SILAC"]]["peptide"])
silac_present_peps     = set(df[df["present_SILAC"]]["peptide"])
silac_overlap_peps     = silac_missing_peps.intersection(silac_present_peps)

print("SILAC:")
print(f"  # unique peptides missing:      {len(silac_missing_peps)}")
print(f"  # unique peptides present:      {len(silac_present_peps)}")
print(f"  # unique peptides in BOTH sets: {len(silac_overlap_peps)}")

# --- 2) Overlap of peptide sequences: missing vs present (Label-free) ---

lf_missing_peps     = set(df[df["missing_LF"]]["peptide"])
lf_present_peps     = set(df[df["present_LF"]]["peptide"])
lf_overlap_peps     = lf_missing_peps.intersection(lf_present_peps)

print("\nLabel-free:")
print(f"  # unique peptides missing:      {len(lf_missing_peps)}")
print(f"  # unique peptides present:      {len(lf_present_peps)}")
print(f"  # unique peptides in BOTH sets: {len(lf_overlap_peps)}")

# --- 3) Peptides with the most contradictory behavior (SILAC) ---
#     i.e., appear as missing in some rows and present in others

silac_counts = (
    df.assign(group_silac=df["missing_SILAC"].map({True: "missing", False: "present"}))
      .groupby(["peptide", "group_silac"])
      .size()
      .unstack(fill_value=0)
)

# Only keep peptides that appear in both categories (missing & present)
# If some never appear in one of the groups, ensure columns exist
if "missing" not in silac_counts.columns:
    silac_counts["missing"] = 0
if "present" not in silac_counts.columns:
    silac_counts["present"] = 0

silac_ambiguous = silac_counts[(silac_counts["missing"] > 0) & (silac_counts["present"] > 0)]
silac_ambiguous_sorted = silac_ambiguous.sort_values(
    by=["missing", "present"], ascending=False
)

print("\nTop SILAC peptides with contradictory behavior (missing & present):")
print(silac_ambiguous_sorted.head(20))

# --- 4) Peptides with the most contradictory behavior (Label-free) ---

lf_counts = (
    df.assign(group_lf=df["missing_LF"].map({True: "missing", False: "present"}))
      .groupby(["peptide", "group_lf"])
      .size()
      .unstack(fill_value=0)
)

if "missing" not in lf_counts.columns:
    lf_counts["missing"] = 0
if "present" not in lf_counts.columns:
    lf_counts["present"] = 0

lf_ambiguous = lf_counts[(lf_counts["missing"] > 0) & (lf_counts["present"] > 0)]
lf_ambiguous_sorted = lf_ambiguous.sort_values(
    by=["missing", "present"], ascending=False
)

print("\nTop Label-free peptides with contradictory behavior (missing & present):")
print(lf_ambiguous_sorted.head(20))



SILAC:
  # unique peptides missing:      59054
  # unique peptides present:      12461
  # unique peptides in BOTH sets: 0

Label-free:
  # unique peptides missing:      60678
  # unique peptides present:      10837
  # unique peptides in BOTH sets: 0

Top SILAC peptides with contradictory behavior (missing & present):
Empty DataFrame
Columns: [missing, present]
Index: []

Top Label-free peptides with contradictory behavior (missing & present):
Empty DataFrame
Columns: [missing, present]
Index: []


In [6]:
import pandas as pd
from difflib import SequenceMatcher
from tqdm import tqdm

df = pd.read_csv("unified_peptide_db.csv")

# Missing vs Present coding (you defined):
MISSING_CODES  = [1, 2, 6]
PRESENT_CODES  = [3, 4, 5]

df["missing_SILAC"] = df["presence_SILAC"].isin(MISSING_CODES)
df["present_SILAC"] = df["presence_SILAC"].isin(PRESENT_CODES)

# Unique peptides in each group
silac_missing_peps   = list(set(df[df["missing_SILAC"]]["peptide"]))
silac_present_peps   = list(set(df[df["present_SILAC"]]["peptide"]))

print(len(silac_missing_peps), "unique missing peptides")
print(len(silac_present_peps), "unique present peptides")

# ---------- Helper: fuzzy similarity ----------
def seq_identity(a, b):
    """Return similarity between two peptide sequences (0–1)."""
    return SequenceMatcher(None, a, b).ratio()

def fuzzy_match(a, b, threshold=0.90):
    """Return True if a and b are similar peptides."""
    if a == b:
        return True
    if a in b or b in a:   # substring match
        return True
    if seq_identity(a, b) >= threshold:
        return True
    return False

# ---------- Find fuzzy overlap ----------
fuzzy_overlap = []

print("\nComputing fuzzy overlap (this may take a few seconds)...")
for pm in tqdm(silac_missing_peps):
    for pp in silac_present_peps:
        if fuzzy_match(pm, pp, threshold=0.90):
            fuzzy_overlap.append((pm, pp))

# Convert to DataFrame for inspection
overlap_df = pd.DataFrame(fuzzy_overlap, columns=["missing_peptide", "present_peptide"])
print("\nNumber of fuzzy-overlapping peptide pairs:", len(overlap_df))
overlap_df.head(20)


59054 unique missing peptides
12461 unique present peptides

Computing fuzzy overlap (this may take a few seconds)...


  2%|▌                                   | 1013/59054 [01:25<1:21:20, 11.89it/s]


KeyboardInterrupt: 

In [7]:
overlap_df

NameError: name 'overlap_df' is not defined

In [8]:
import pandas as pd
from collections import Counter
from tqdm import tqdm

# --- Load data ---
df = pd.read_csv("unified_peptide_db.csv")

# Label mapping:
# 1: predicted, neither peptide nor protein seen
# 2: predicted, protein seen but peptide not seen
# 3: predicted & seen
# 4: seen under other protein ID
# 5: seen, but not predicted
# 6: seen only in other experiment

MISSING_CODES  = [1, 2, 6]
PRESENT_CODES  = [3, 4, 5]

# --- Define missing/present flags ---
df["missing_SILAC"] = df["presence_SILAC"].isin(MISSING_CODES)
df["present_SILAC"] = df["presence_SILAC"].isin(PRESENT_CODES)

df["missing_LF"] = df["presence_Label_free"].isin(MISSING_CODES)
df["present_LF"] = df["presence_Label_free"].isin(PRESENT_CODES)

def substring_overlap(missing_peps, present_peps, label="SILAC"):
    """Find overlaps where one peptide is substring of the other."""
    missing_peps = list(set(missing_peps))
    present_peps = list(set(present_peps))

    print(f"\n=== {label}: substring-based overlap ===")
    print(f"# unique missing peptides:  {len(missing_peps)}")
    print(f"# unique present peptides:  {len(present_peps)}")

    pairs = []
    for pm in tqdm(missing_peps, desc=f"{label} missing vs present"):
        for pp in present_peps:
            if pm in pp or pp in pm:
                pairs.append((pm, pp))

    print(f"# substring-overlapping pairs (missing–present): {len(pairs)}")

    if not pairs:
        return None

    overlap_df = pd.DataFrame(pairs, columns=["missing_peptide", "present_peptide"])

    # How many missing peptides have at least one present match?
    miss_with_match = overlap_df["missing_peptide"].nunique()
    pres_with_match = overlap_df["present_peptide"].nunique()
    print(f"# missing peptides with ≥1 present substring match: {miss_with_match}")
    print(f"# present peptides with ≥1 missing substring match: {pres_with_match}")

    # Peptides with the most overlaps
    miss_counts = Counter(overlap_df["missing_peptide"])
    pres_counts = Counter(overlap_df["present_peptide"])

    top_missing = pd.DataFrame(
        miss_counts.most_common(20), columns=["missing_peptide", "n_present_matches"]
    )
    top_present = pd.DataFrame(
        pres_counts.most_common(20), columns=["present_peptide", "n_missing_matches"]
    )

    print("\nTop missing peptides with most present substring matches:")
    print(top_missing)

    print("\nTop present peptides with most missing substring matches:")
    print(top_present)

    return overlap_df, top_missing, top_present


# --- SILAC substring overlap ---
silac_missing_peps = df.loc[df["missing_SILAC"], "peptide"]
silac_present_peps = df.loc[df["present_SILAC"], "peptide"]

silac_overlap = substring_overlap(silac_missing_peps, silac_present_peps, label="SILAC")

# --- Label-free substring overlap ---
lf_missing_peps = df.loc[df["missing_LF"], "peptide"]
lf_present_peps = df.loc[df["present_LF"], "peptide"]

lf_overlap = substring_overlap(lf_missing_peps, lf_present_peps, label="Label-free")



=== SILAC: substring-based overlap ===
# unique missing peptides:  59054
# unique present peptides:  12461


SILAC missing vs present: 100%|█████████| 59054/59054 [00:21<00:00, 2749.90it/s]


# substring-overlapping pairs (missing–present): 10031
# missing peptides with ≥1 present substring match: 9687
# present peptides with ≥1 missing substring match: 7448

Top missing peptides with most present substring matches:
       missing_peptide  n_present_matches
0               LSGGQK                  4
1               ISEIEK                  3
2     MNAYDAYMKEIAQQMR                  3
3               DVDALK                  3
4    GDLAEIVKAFLVEFQEK                  2
5      MNTLIERNTTIPTSK                  2
6    MEGANEEIRELSYANAR                  2
7     LVEVFEQLKDIQGQIK                  2
8    VFLTVLGNDKEVENTFK                  2
9    TTFIIDEQGKVLDVIEK                  2
10  TVLDHLKDGGYDVIAIGK                  2
11     SGLYNLKLFAQVLGK                  2
12    AEEGLFKATVDSMLFR                  2
13              VSIEDK                  2
14              EVVELK                  2
15       NPRNAEIEVILEK                  2
16  LSQIDPERDVPYVLDTIK                  2
17     MYEFLDKLI

Label-free missing vs present: 100%|████| 60678/60678 [00:19<00:00, 3146.62it/s]

# substring-overlapping pairs (missing–present): 8937
# missing peptides with ≥1 present substring match: 8645
# present peptides with ≥1 missing substring match: 6622

Top missing peptides with most present substring matches:
         missing_peptide  n_present_matches
0      SAEDQLFTMKAYLNANR                  2
1      GDLAEIVKAFLVEFQEK                  2
2        MNTLIERNTTIPTSK                  2
3      TTFIIDEQGKVLDVIEK                  2
4     TVLDHLKDGGYDVIAIGK                  2
5                 VSIEDK                  2
6                 EVVELK                  2
7          NPRNAEIEVILEK                  2
8        MYEFLDKLISVSLPR                  2
9     YGNVMASRGSVIPLFIDK                  2
10     ELTDFSLPKIGEEFGGR                  2
11     GLISSEEMAKELVELSK                  2
12   VVIPSGPYDAKGLLISSIR                  2
13        NFYLIQRLAQFQVK                  2
14       FGIVLKEVTPLEIEK                  2
15    NSDSTVFNDAKALFDLNK                  2
16    IESGGFVLKDSLTLEQIK 




In [9]:
import pandas as pd
from tqdm import tqdm

df = pd.read_csv("unified_peptide_db.csv")

# Missing / present definitions
MISSING_CODES  = [1, 2, 6]
PRESENT_CODES  = [3, 4, 5]

df["missing_SILAC"] = df["presence_SILAC"].isin(MISSING_CODES)
df["present_SILAC"] = df["presence_SILAC"].isin(PRESENT_CODES)

df["missing_LF"] = df["presence_Label_free"].isin(MISSING_CODES)
df["present_LF"] = df["presence_Label_free"].isin(PRESENT_CODES)


# ---------- SUBSTRING OVERLAP FUNCTION ----------
def compute_substring_overlap(missing_peps, present_peps, label="SILAC"):
    missing_peps = list(set(missing_peps))
    present_peps = list(set(present_peps))

    print(f"\n=== {label}: substring-overlap analysis ===")
    print(f"# missing peptides: {len(missing_peps)}")
    print(f"# present peptides: {len(present_peps)}")

    rows = []

    # For each missing peptide, collect all present peptides that contain it (or vice-versa)
    for pm in tqdm(missing_peps, desc=f"{label} overlaps"):
        matched = []
        for pp in present_peps:
            if pm in pp or pp in pm:
                rows.append((pm, pp))
                matched.append(pp)

    overlap_df = pd.DataFrame(rows, columns=["missing_peptide", "present_peptide"])

    print(f"# overlapping missing–present pairs: {len(overlap_df)}")

    # Show summary: for each missing peptide how many present peptides matched
    summary = (
        overlap_df.groupby("missing_peptide")
                  .size()
                  .reset_index(name="num_matching_present_peptides")
                  .sort_values("num_matching_present_peptides", ascending=False)
    )

    print("\nTop missing peptides with the most present matches:")
    print(summary.head(20))

    return overlap_df, summary



# ---------- SILAC ----------
silac_missing = df.loc[df["missing_SILAC"], "peptide"]
silac_present = df.loc[df["present_SILAC"], "peptide"]

silac_overlap_df, silac_summary = compute_substring_overlap(
    silac_missing, silac_present, label="SILAC"
)

# Save full mapping
silac_overlap_df.to_csv("silac_missing_to_present_overlap.csv", index=False)
print("\nSaved: silac_missing_to_present_overlap.csv")


# ---------- LABEL-FREE ----------
lf_missing = df.loc[df["missing_LF"], "peptide"]
lf_present = df.loc[df["present_LF"], "peptide"]

lf_overlap_df, lf_summary = compute_substring_overlap(
    lf_missing, lf_present, label="Label-free"
)

lf_overlap_df.to_csv("labelfree_missing_to_present_overlap.csv", index=False)
print("\nSaved: labelfree_missing_to_present_overlap.csv")


# ---------- SHOW EXAMPLES ----------
print("\n=== Example: One missing peptide and all its present matches (SILAC) ===")
example_missing = silac_summary.iloc[0]["missing_peptide"]
print(f"Missing peptide: {example_missing}\nMatches:")
print(silac_overlap_df[silac_overlap_df["missing_peptide"] == example_missing])



=== SILAC: substring-overlap analysis ===
# missing peptides: 59054
# present peptides: 12461


SILAC overlaps: 100%|███████████████████| 59054/59054 [00:21<00:00, 2735.27it/s]


# overlapping missing–present pairs: 10031

Top missing peptides with the most present matches:
          missing_peptide  num_matching_present_peptides
5258               LSGGQK                              4
3873               ISEIEK                              3
5731     MNAYDAYMKEIAQQMR                              3
1286               DVDALK                              3
6404       NQMQNMLKGMNLPF                              2
423      APAFTEAKLQDPIPAK                              2
7575     SFTFITKTPPAPVLLK                              2
2167  EYLIAVKGPLTTPIGGGIR                              2
5300    LSYQPQNKINVVDVPTK                              2
6400    NQIQDWIKAGLVVANDK                              2
8095    TFGLIFSQRVLLALINK                              2
5873      MSNEILIVDDEDRIR                              2
9284    WNLVTNMGKFLDPLADK                              2
407    ANGLSGNNIRNGQQIVIP                              2
2874    GNDGEDVYLKDIWPSIK                        

Label-free overlaps: 100%|██████████████| 60678/60678 [00:19<00:00, 3161.50it/s]

# overlapping missing–present pairs: 8937

Top missing peptides with the most present matches:
         missing_peptide  num_matching_present_peptides
4642   LQAQFDAVKVLNDTQSK                              2
8138   VSQVIPVKITLESEPSK                              2
4639   LPYGVQKDADEVEQALR                              2
6677   SALFAPEKYNIISEIEK                              2
4868  LYLIGDTKDDFGGSQLEK                              2
2033     FGIVLKEVTPLEIEK                              2
8135   VSNLPYSIRVLLESLLR                              2
1809   ESMGIVRTTFIIDEQGK                              2
5127    MMLESIRYVDLVIPEK                              2
5660   NLGNPPFATRESLPSIR                              2
7099  SVGTFAIKTGAITSAVDR                              2
3089      IFEDVQKSAYESFK                              2
1387   EFIEMAKIPVIHSLPAK                              2
7298   TIGIFGFGRIGQLVAER                              2
4624      LPLDTLKQLYYLSK                              2
4873    L


