In [1]:
# NumPy and pandas are used throughout this notebook under the aliases `np`
# and `pd`; importing them in the first cell keeps Restart Kernel -> Run All
# working on a fresh kernel.
import numpy as np
import pandas as pd

# Regions of interest on the spike protein, given as
# (first residue number, last residue number) pairs keyed by region name.
domains = dict(
    RBD=(319, 541),
    FP=(788, 806),
    HR1=(912, 984),
    HR2=(1163, 1213),
    S1_S2_boundary=(685, 686),  # unclear functionality of the S1/S2 boundary?
)

In [2]:
# Load the table of reported linear B-cell epitopes on the SARS-CoV-2 S protein.
# Later cells read its `Start`, `End`, `Sequence`, `Source`, `Protein` and
# `Isotype` columns and add derived columns to this same frame.
df_sars2_S = pd.read_csv("linear-bcell-epitopes-SARS2-S.csv")



In [3]:
# Read glycosylation site positions: one integer residue number per line.
# Blank lines and lines starting with "#" are ignored.
with open("../glycosylation/glycosites-Watanabe.txt") as handle:
    glycosites = {
        int(entry)
        for entry in (raw_line.strip() for raw_line in handle)
        if entry and not entry.startswith("#")
    }
print(glycosites)
print("%d glycosites" % len(glycosites))


{1158, 17, 657, 149, 1173, 282, 801, 165, 1194, 1074, 61, 709, 74, 331, 1098, 717, 343, 603, 616, 234, 1134, 122}
22 glycosites


In [4]:
# Load the per-residue accessibility table. Later cells use the
# "Residue Proper Numbering", "Nude Accessibility" and
# "SiteSpecific Accessiblity" columns (the latter spelling is the one in the file).
df_accessibility = pd.read_csv("../solvent-accessibility/Woods-Glycans-MD-Site-Specific-Accessibility.csv")
# Display the frame for a quick visual check.
df_accessibility

Unnamed: 0,Residue Proper Numbering,SiteSpecific Accessiblity,Nude Accessibility,Difference
0,27,0.94,0.99,0.05
1,28,0.99,1.00,0.01
2,29,0.15,0.26,0.11
3,30,0.36,0.76,0.40
4,31,0.00,0.00,0.00
...,...,...,...,...
1115,1142,0.78,0.89,0.11
1116,1143,0.66,0.70,0.04
1117,1144,0.78,0.79,0.01
1118,1145,0.85,0.86,0.00


In [5]:
# Inspect the accessibility rows for residue number 580 and above.
df_accessibility[df_accessibility["Residue Proper Numbering"] >= 580]

Unnamed: 0,Residue Proper Numbering,SiteSpecific Accessiblity,Nude Accessibility,Difference
553,580,0.49,0.90,0.42
554,581,0.92,0.95,0.03
555,582,0.86,0.94,0.08
556,583,0.90,0.91,0.01
557,584,0.18,0.20,0.01
...,...,...,...,...
1115,1142,0.78,0.89,0.11
1116,1143,0.66,0.70,0.04
1117,1144,0.78,0.79,0.01
1118,1145,0.85,0.86,0.00


In [6]:
accessible_positions = set()

# Lookup: residue number -> value of the "Nude Accessibility" column.
aa_num_to_nude_accessibility = dict(zip(
    df_accessibility["Residue Proper Numbering"],
    df_accessibility["Nude Accessibility"],
))

# Lookup: residue number -> value of the "SiteSpecific Accessiblity" column
# (this notebook refers to it as the glycosylated accessibility).
aa_num_to_glycosylated_accessibility = dict(zip(
    df_accessibility["Residue Proper Numbering"],
    df_accessibility["SiteSpecific Accessiblity"],
))


In [7]:
# A residue counts as accessible when its glycosylated accessibility is
# strictly above this threshold ...
accessibility_threshold = 0.25
# ... and only runs of at least this many consecutive accessible residues
# are kept.
min_accessility_kmer = 3


# don't use residues which were excluded from the structure:
# restrict the scan to the span of residue numbers that have a value
first_pos = min(aa_num_to_glycosylated_accessibility.keys())
last_pos = max(aa_num_to_glycosylated_accessibility.keys())

# avoid edge effects creating artificial accessibility
accessible_start = first_pos + 3
accessible_end = last_pos - 3

# determine accessible kmers
n_accessible = 0
accessible_positions = set()
# The loop runs one position past `accessible_end`. That extra sentinel
# position is never accessible, so a run of accessible residues that reaches
# `accessible_end` is flushed like any other run instead of being dropped.
for aa_num in range(accessible_start, accessible_end + 2):
    is_accessible = (
        aa_num <= accessible_end
        # residues without a value are treated as inaccessible (score 0),
        # matching the lookup used when scoring epitope subsequences
        and aa_num_to_glycosylated_accessibility.get(aa_num, 0) > accessibility_threshold
    )
    if is_accessible:
        n_accessible += 1
        continue
    # The current run (if any) ended at aa_num - 1; keep it when long enough.
    if n_accessible >= min_accessility_kmer:
        accessible_positions.update(range(aa_num - n_accessible, aa_num))
    n_accessible = 0
print("%d accessible positions" % len(accessible_positions))


205 accessible positions


In [8]:
# Display the set of residue numbers that were marked as accessible.
accessible_positions

{64,
 65,
 66,
 68,
 69,
 70,
 71,
 72,
 73,
 74,
 75,
 76,
 77,
 78,
 144,
 145,
 146,
 147,
 148,
 149,
 150,
 151,
 152,
 153,
 154,
 176,
 177,
 178,
 179,
 180,
 181,
 182,
 183,
 184,
 185,
 186,
 187,
 211,
 212,
 213,
 214,
 215,
 245,
 246,
 247,
 248,
 249,
 250,
 251,
 252,
 253,
 254,
 255,
 256,
 257,
 258,
 259,
 260,
 261,
 323,
 324,
 325,
 333,
 334,
 335,
 439,
 440,
 441,
 444,
 445,
 446,
 447,
 448,
 449,
 450,
 456,
 457,
 458,
 459,
 460,
 461,
 462,
 463,
 466,
 467,
 468,
 470,
 471,
 472,
 473,
 474,
 477,
 478,
 479,
 480,
 481,
 482,
 483,
 484,
 485,
 486,
 487,
 488,
 489,
 490,
 498,
 499,
 500,
 501,
 502,
 503,
 527,
 528,
 529,
 530,
 531,
 532,
 533,
 534,
 535,
 536,
 537,
 580,
 581,
 582,
 583,
 622,
 623,
 624,
 625,
 626,
 627,
 628,
 629,
 630,
 631,
 632,
 633,
 634,
 635,
 636,
 637,
 638,
 639,
 640,
 641,
 642,
 643,
 644,
 654,
 655,
 656,
 657,
 658,
 659,
 660,
 661,
 674,
 675,
 676,
 677,
 678,
 679,
 680,
 681,
 682,
 683,
 684,
 685,


In [9]:

# Residue numbers listed in the polymorphism table (`aa_num` column).
df_polymorphic = pd.read_csv("../polymorphism/spike_protein_entropy_above_1_per_thousand.csv")
polymorphic_sites = set(df_polymorphic["aa_num"])
# Sanity check: position 614 is expected to be present in the table.
assert 614 in polymorphic_sites
print("%d polymorphic sites" % (len(polymorphic_sites),))

27 polymorphic sites


In [10]:
# Acceptance thresholds applied to the glycosylated accessibility scores of a
# candidate subsequence: every residue, the mean, and the best residue.
min_accessibility_in_epitope = 0.15
min_mean_accessiblity = 0.35
min_max_accessibility = 0.5

# One entry per epitope row, in row order.
longest_accessible_subsequences = []
accessible_starts = []
accessible_ends = []

for (epitope_start, epitope_end, seq) in zip(df_sars2_S.Start, df_sars2_S.End, df_sars2_S.Sequence):
    full_epitope_length = epitope_end - epitope_start + 1

    # Defaults when no window qualifies: empty sequence with start = end = 0.
    longest_start = 0
    longest_end = 0
    longest_subsequence = ""
    longest_subsequence_length = 0

    # Exhaustively try every window [i, j] of the epitope (0-based, inclusive).
    # A window only replaces the current best when it is strictly longer, so
    # among equally long windows the first one encountered wins.
    for i_base0_inclusive in range(full_epitope_length):
        start_pos_base1 = epitope_start + i_base0_inclusive
        for j_base0_inclusive in range(i_base0_inclusive, full_epitope_length):
            subsequence = seq[i_base0_inclusive:j_base0_inclusive + 1]
            length = len(subsequence)
            if length <= longest_subsequence_length:
                continue

            end_pos_base1 = epitope_start + j_base0_inclusive

            # Both window ends must lie in an accessible stretch.
            if start_pos_base1 not in accessible_positions:
                continue
            if end_pos_base1 not in accessible_positions:
                continue

            # Residues without an accessibility value score 0.
            accessibility_scores = [
                aa_num_to_glycosylated_accessibility.get(pos, 0)
                for pos in range(start_pos_base1, end_pos_base1 + 1)
            ]
            mean_accessiblity = np.mean(accessibility_scores)
            min_accessibility = np.min(accessibility_scores)
            max_accessibility = np.max(accessibility_scores)
            passes_thresholds = (
                (mean_accessiblity >= min_mean_accessiblity) and
                (min_accessibility >= min_accessibility_in_epitope) and
                (max_accessibility >= min_max_accessibility))
            if not passes_thresholds:
                continue

            longest_start, longest_end = start_pos_base1, end_pos_base1
            longest_subsequence = subsequence
            longest_subsequence_length = length

            assert len(longest_subsequence) == longest_subsequence_length, (
                longest_subsequence,
                longest_subsequence_length)

    print(epitope_start, epitope_end, seq, longest_start, longest_end, longest_subsequence)
    longest_accessible_subsequences.append(longest_subsequence)
    accessible_starts.append(longest_start)
    accessible_ends.append(longest_end)

# Attach the results to the epitope table.
df_sars2_S["accessible_subsequence"] = longest_accessible_subsequences
df_sars2_S["accessible_subsequence_start"] = accessible_starts
df_sars2_S["accessible_subsequence_end"] = accessible_ends
df_sars2_S["accessible_subsequence_length"] = df_sars2_S["accessible_subsequence"].str.len()
df_sars2_S["accessible_subsequence_is_4mer_or_longer"] = df_sars2_S["accessible_subsequence_length"] >= 4

df_sars2_S


553 570 TESNKKFLPFQQFGRDIA 0 0 
809 826 PSKPSKRSFIEDLLFNKV 809 812 PSKP
26 30 PAYTN 0 0 
186 190 FKNLR 186 187 FK
356 360 KRISN 0 0 
456 460 FRKSN 456 460 FRKSN
806 820 LPDPSKPSKRSFIED 809 812 PSKP
1196 1200 SLIDL 0 0 
886 890 WTFGA 0 0 
1046 1050 GYHLM 0 0 
459 473 SNLKPFERDISTEIY 459 473 SNLKPFERDISTEIY
503 517 VGYQPYRVVVLSFEL 503 503 V
505 519 YQPYRVVVLSFELLH 0 0 
569 583 IADTTDAVRDPQTLE 580 583 QTLE
571 585 DTTDAVRDPQTLEIL 580 583 QTLE
573 587 TDAVRDPQTLEILDI 580 583 QTLE
767 781 LTGIAVEQDKNTQEV 0 0 
805 819 ILPDPSKPSKRSFIE 809 812 PSKP
807 821 PDPSKPSKRSFIEDL 809 812 PSKP
1193 1207 LNESLIDLQELGKYE 0 0 
1195 1209 ESLIDLQELGKYEQY 0 0 
127 141 VIKVCEFQFCNDPFL 0 0 
131 145 CEFQFCNDPFLGVYY 144 145 YY
343 357 NATRFASVYAWNRKR 0 0 
619 633 EVPVAIHADQLTPTW 622 633 VAIHADQLTPTW
649 663 CLIGAEHVNNSYECD 654 661 EHVNNSYE
763 777 LNRALTGIAVEQDKN 0 0 
1089 1103 FPREGVFVSNGTHWF 1099 1101 GTH
1141 1155 LQPELDSFKEELDKY 0 0 
1181 1195 KEIDRLNEVAKNLNE 0 0 
45 59 SSVLHSTQDLFLPFF 0 0 
63 91 TWFHAIHVSGT

Unnamed: 0,Source,Virus,Protein,Start,End,Animal,Isotype,Immunization,Monoclonal Name,In vitro function,Vaccine function,Notes,Sequence,Length,Sequence_length,accessible_subsequence,accessible_subsequence_start,accessible_subsequence_end,accessible_subsequence_length,accessible_subsequence_is_4mer_or_longer
0,Poh 2020,SARS-CoV-2,S,553,570,Human,IgG,Infection,Polyclonal,,,,TESNKKFLPFQQFGRDIA,18,18,,0,0,0,False
1,Poh 2020,SARS-CoV-2,S,809,826,Human,IgG,Infection,Polyclonal,,,,PSKPSKRSFIEDLLFNKV,18,18,PSKP,809,812,4,True
2,Wang 2020,SARS-CoV-2,S,26,30,Human,IgG,Infection,Polyclonal,,,,PAYTN,5,5,,0,0,0,False
3,Wang 2020,SARS-CoV-2,S,186,190,Human,IgG,Infection,Polyclonal,,,,FKNLR,5,5,FK,186,187,2,False
4,Wang 2020,SARS-CoV-2,S,356,360,Human,IgG,Infection,Polyclonal,,,,KRISN,5,5,,0,0,0,False
5,Wang 2020,SARS-CoV-2,S,456,460,Human,IgG,Infection,Polyclonal,,,,FRKSN,5,5,FRKSN,456,460,5,True
6,Wang 2020,SARS-CoV-2,S,806,820,Human,"IgG,IgM",Infection,Polyclonal,,,IgM epitpe is 816-SFIED-820,LPDPSKPSKRSFIED,15,15,PSKP,809,812,4,True
7,Wang 2020,SARS-CoV-2,S,1196,1200,Human,IgG,Infection,Polyclonal,,,,SLIDL,5,5,,0,0,0,False
8,Wang 2020,SARS-CoV-2,S,886,890,Human,IgM,Infection,Polyclonal,,,,WTFGA,5,5,,0,0,0,False
9,Wang 2020,SARS-CoV-2,S,1046,1050,Human,IgM,Infection,Polyclonal,,,,GYHLM,5,5,,0,0,0,False


In [11]:
# Show each epitope next to the accessible subsequence found for it and that
# subsequence's start/end positions.
df_sars2_S[["Start", "Sequence", "accessible_subsequence", "accessible_subsequence_start", "accessible_subsequence_end"]]

Unnamed: 0,Start,Sequence,accessible_subsequence,accessible_subsequence_start,accessible_subsequence_end
0,553,TESNKKFLPFQQFGRDIA,,0,0
1,809,PSKPSKRSFIEDLLFNKV,PSKP,809,812
2,26,PAYTN,,0,0
3,186,FKNLR,FK,186,187
4,356,KRISN,,0,0
5,456,FRKSN,FRKSN,456,460
6,806,LPDPSKPSKRSFIED,PSKP,809,812
7,1196,SLIDL,,0,0
8,886,WTFGA,,0,0
9,1046,GYHLM,,0,0


In [12]:

# Flag rows whose accessible subsequence (inclusive start..end range) covers
# at least one glycosylation site.
df_sars2_S["accessible_subsequence_contains_glycosite"] = [
    not glycosites.isdisjoint(range(start, end + 1))
    for (start, end) in zip(df_sars2_S["accessible_subsequence_start"], df_sars2_S["accessible_subsequence_end"])
]

# Same check against the polymorphic positions.
df_sars2_S["accessible_subsequence_contains_polymorphism"] = [
    not polymorphic_sites.isdisjoint(range(start, end + 1))
    for (start, end) in zip(df_sars2_S["accessible_subsequence_start"], df_sars2_S["accessible_subsequence_end"])
]

def dist(epitope_start, epitope_end, feature_start, feature_end):
    """
    Gap between an epitope interval and a feature interval.

    Returns `feature_start - epitope_end` when the epitope ends at or before
    the feature starts, `epitope_start - feature_end` when the epitope starts
    at or after the feature ends, and 0 otherwise (the intervals share at
    least one position). Intervals that merely touch at an endpoint also
    give 0.
    """
    gap_before_feature = feature_start - epitope_end
    if gap_before_feature >= 0:
        return gap_before_feature

    gap_after_feature = epitope_start - feature_end
    if gap_after_feature >= 0:
        return gap_after_feature

    return 0
        
# For every region in `domains`, add four columns to the epitope table:
#   distance_to_<name>          gap between the epitope and the region
#   in_<name>                   True when that gap is 0
#   distance_to_<name>_padded   gap to the region widened by `padding` on each side
#   near_<name>                 True when the padded gap is 0
exact_overlap_columns = []
padded_overlap_columns = []
for feature, (feature_start, feature_end) in domains.items():
    # The RBD gets a wider margin than the other regions.
    padding = 50 if feature == "RBD" else 15

    exact_distance_column_name = "distance_to_%s" % feature
    exact_overlap_column_name = "in_%s" % feature
    padded_distance_column_name = exact_distance_column_name + "_padded"
    padded_overlap_column_name = "near_%s" % feature

    df_sars2_S[exact_distance_column_name] = [
        dist(start, end, feature_start, feature_end)
        for (start, end) in zip(df_sars2_S.Start, df_sars2_S.End)
    ]
    df_sars2_S[exact_overlap_column_name] = df_sars2_S[exact_distance_column_name] == 0

    df_sars2_S[padded_distance_column_name] = [
        dist(start, end, feature_start - padding, feature_end + padding)
        for (start, end) in zip(df_sars2_S.Start, df_sars2_S.End)
    ]
    df_sars2_S[padded_overlap_column_name] = df_sars2_S[padded_distance_column_name] == 0

    exact_overlap_columns.append(exact_overlap_column_name)
    padded_overlap_columns.append(padded_overlap_column_name)

# Row-wise OR over the per-region flags.
in_any_feature = df_sars2_S[exact_overlap_columns].values.any(axis=1)
df_sars2_S["in_any_feature"] = in_any_feature

near_any_feature = df_sars2_S[padded_overlap_columns].values.any(axis=1)
df_sars2_S["near_any_feature"] = near_any_feature

# Isotype flags derived from substring matches on the `Isotype` column.
df_sars2_S["IgA"] = df_sars2_S["Isotype"].str.contains("IgA")
df_sars2_S["IgG"] = df_sars2_S["Isotype"].str.contains("IgG")
df_sars2_S["IgG_and_IgA"] = df_sars2_S["IgG"] & df_sars2_S["IgA"]

# Persist the annotated table, then display it.
df_sars2_S.to_csv("linear-bcell-epitopes-SARS2-S-with-filters.csv")
df_sars2_S


Unnamed: 0,Source,Virus,Protein,Start,End,Animal,Isotype,Immunization,Monoclonal Name,In vitro function,...,near_HR2,distance_to_S1_S2_boundary,in_S1_S2_boundary,distance_to_S1_S2_boundary_padded,near_S1_S2_boundary,in_any_feature,near_any_feature,IgA,IgG,IgG_and_IgA
0,Poh 2020,SARS-CoV-2,S,553,570,Human,IgG,Infection,Polyclonal,,...,False,115,False,100,False,False,True,False,True,False
1,Poh 2020,SARS-CoV-2,S,809,826,Human,IgG,Infection,Polyclonal,,...,False,123,False,108,False,False,True,False,True,False
2,Wang 2020,SARS-CoV-2,S,26,30,Human,IgG,Infection,Polyclonal,,...,False,655,False,640,False,False,False,False,True,False
3,Wang 2020,SARS-CoV-2,S,186,190,Human,IgG,Infection,Polyclonal,,...,False,495,False,480,False,False,False,False,True,False
4,Wang 2020,SARS-CoV-2,S,356,360,Human,IgG,Infection,Polyclonal,,...,False,325,False,310,False,True,True,False,True,False
5,Wang 2020,SARS-CoV-2,S,456,460,Human,IgG,Infection,Polyclonal,,...,False,225,False,210,False,True,True,False,True,False
6,Wang 2020,SARS-CoV-2,S,806,820,Human,"IgG,IgM",Infection,Polyclonal,,...,False,120,False,105,False,True,True,False,True,False
7,Wang 2020,SARS-CoV-2,S,1196,1200,Human,IgG,Infection,Polyclonal,,...,True,510,False,495,False,True,True,False,True,False
8,Wang 2020,SARS-CoV-2,S,886,890,Human,IgM,Infection,Polyclonal,,...,False,200,False,185,False,False,False,False,False,False
9,Wang 2020,SARS-CoV-2,S,1046,1050,Human,IgM,Infection,Polyclonal,,...,False,360,False,345,False,False,False,False,False,False


In [13]:
# Compare each full epitope sequence with its accessible subsequence.
df_sars2_S[["Sequence", "accessible_subsequence"]]

Unnamed: 0,Sequence,accessible_subsequence
0,TESNKKFLPFQQFGRDIA,
1,PSKPSKRSFIEDLLFNKV,PSKP
2,PAYTN,
3,FKNLR,FK
4,KRISN,
5,FRKSN,FRKSN
6,LPDPSKPSKRSFIED,PSKP
7,SLIDL,
8,WTFGA,
9,GYHLM,


In [14]:
from collections import OrderedDict

# Columns that identify an accessible region; rows sharing all three are
# collapsed into one entry.
key_col_names = [
    "accessible_subsequence_start",
    "accessible_subsequence_end",
    "accessible_subsequence",
]
# Columns carried over from the member rows of each group.
inherited_col_names = [
    "accessible_subsequence_length",
    "accessible_subsequence_is_4mer_or_longer",
    "IgG",
    "IgA",
    "IgG_and_IgA",
    "accessible_subsequence_contains_polymorphism",
    "accessible_subsequence_contains_glycosite",
    "near_any_feature",
    "in_any_feature",
]
combined_col_names = key_col_names + inherited_col_names

# Column name -> list of values; insertion order fixes the output column order.
grouped_cols = OrderedDict()
grouped_cols["num_sources"] = []
grouped_cols["sources"] = []
for col_name in combined_col_names:
    grouped_cols[col_name] = []

for key, group in sorted(df_sars2_S.groupby(list(key_col_names))):
    if key[0] == 0:
        # skip epitopes without any accessible parts
        continue

    for (key_col_name, key_elt) in zip(key_col_names, key):
        grouped_cols[key_col_name].append(key_elt)

    for col_name in inherited_col_names:
        # Start from the group mean, then coerce by column kind:
        # length -> int, isotype ("Ig") columns keep the mean,
        # every other flag -> bool (True when the mean is non-zero).
        value = group[col_name].mean()
        if "length" in col_name:
            value = int(value)
        elif "Ig" not in col_name:
            value = bool(value)
        grouped_cols[col_name].append(value)

    n_entries = len(group)
    grouped_cols["num_sources"].append(n_entries)

    sources = [
        "%s %s%d-%d" % (source, protein, start, end)
        for (source, protein, start, end) in zip(
            group.Source, group.Protein, group.Start, group.End)
    ]
    # Only report groups that combine more than one epitope entry.
    if n_entries > 1:
        print(key, group[["Start", "Sequence"]])
        for source_with_coords in sources:
            print(key, source_with_coords)
    grouped_cols["sources"].append("; ".join(sources))

df_grouped = pd.DataFrame(grouped_cols)

df_grouped.to_csv("accessible-linear-bcell-epitopes-grouped-by-sequence.csv", index=False)
print("%d entries grouped by accessible sequence" % (
    len(df_grouped)))

(580, 583, 'QTLE')     Start                                Sequence
13    569                         IADTTDAVRDPQTLE
14    571                         DTTDAVRDPQTLEIL
15    573                         TDAVRDPQTLEILDI
39    565                 FGRDIADTTDAVRDPQTLEILDI
57    552  LTESNKKFLPFQQFGRDIADTTDAVRDPQTLEILDITP
(580, 583, 'QTLE') Charite 2020 S569-583
(580, 583, 'QTLE') Charite 2020 S571-585
(580, 583, 'QTLE') Charite 2020 S573-587
(580, 583, 'QTLE') Dahlke 2020 S565-587
(580, 583, 'QTLE') ReScan S552-589
(654, 661, 'EHVNNSYE')     Start             Sequence
25    649      CLIGAEHVNNSYECD
41    647  AGCLIGAEHVNNSYECDIP
(654, 661, 'EHVNNSYE') Charite 2020 S649-663
(654, 661, 'EHVNNSYE') Dahlke 2020 S647-665
(809, 812, 'PSKP')     Start                                Sequence
1     809                      PSKPSKRSFIEDLLFNKV
6     806                         LPDPSKPSKRSFIED
17    805                         ILPDPSKPSKRSFIE
18    807                         PDPSKPSKRSFIEDL
43    809

In [15]:
# Display the table of accessible regions grouped by identical subsequence.
df_grouped

Unnamed: 0,num_sources,sources,accessible_subsequence_start,accessible_subsequence_end,accessible_subsequence,accessible_subsequence_length,accessible_subsequence_is_4mer_or_longer,IgG,IgA,IgG_and_IgA,accessible_subsequence_contains_polymorphism,accessible_subsequence_contains_glycosite,near_any_feature,in_any_feature
0,1,Dahlke 2020 S63-91,68,78,IHVSGTNGTKR,11,True,0.0,1.0,0.0,True,True,False,False
1,1,Charite 2020 S131-145,144,145,YY,2,False,1.0,1.0,1.0,False,False,False,False
2,1,Dahlke 2020 S125-181,144,154,YYHKNNKSWME,11,True,0.0,1.0,0.0,True,True,False,False
3,1,Wang 2020 S186-190,186,187,FK,2,False,1.0,0.0,0.0,False,False,False,False
4,1,Dahlke 2020 S215-229,215,215,D,1,False,0.0,1.0,0.0,False,False,False,False
5,1,Dahlke 2020 S241-265,245,261,HRSYLTPGDSSSGWTAG,17,True,0.0,1.0,0.0,True,False,False,False
6,1,Wang 2020 S456-460,456,460,FRKSN,5,True,1.0,0.0,0.0,False,False,True,True
7,1,Dahlke 2020 S449-463,456,463,FRKSNLKP,8,True,0.0,1.0,0.0,False,False,True,True
8,1,Charite 2020 S459-473,459,473,SNLKPFERDISTEIY,15,True,1.0,1.0,1.0,False,False,True,True
9,1,Charite 2020 S503-517,503,503,V,1,False,1.0,0.0,0.0,False,False,True,True


In [16]:
def combine_overlapping_epitopes(df_grouped):
    """
    Merge rows of `df_grouped` whose accessible subsequences overlap.

    Rows are ordered by (accessible_subsequence_start, accessible_subsequence_end)
    and swept left to right; a row is folded into the current merged interval
    when its start is <= the merged end. The merged sequence is extended with
    the trailing characters of the newly added row that lie beyond the
    previous merged end.

    Parameters
    ----------
    df_grouped : pandas.DataFrame
        Must provide the columns written below (num_sources, sources,
        accessible_subsequence_start/end, accessible_subsequence, IgG, IgA,
        the polymorphism/glycosite flags, near_any_feature, in_any_feature).
        Rows are addressed by position, not by index label.

    Returns
    -------
    pandas.DataFrame
        One row per merged interval with the same columns as `df_grouped`.
        An empty input yields an empty frame with the same columns.
    """
    # Imported locally so the function does not depend on an import made in
    # a different notebook cell.
    from collections import OrderedDict

    result_cols = OrderedDict((col, []) for col in df_grouped.columns)
    if len(df_grouped) == 0:
        # Nothing to merge (previously this raised IndexError).
        return pd.DataFrame(result_cols)

    # (start, end, sequence, {row position}) for every input row.
    candidate_intervals = [
        (start, end, seq, {i})
        for i, (start, end, seq) in
            enumerate(
                zip(df_grouped.accessible_subsequence_start,
                    df_grouped.accessible_subsequence_end,
                    df_grouped.accessible_subsequence))
    ]
    candidate_intervals = sorted(candidate_intervals, key=lambda x: (x[0], x[1]))

    merged_intervals = []
    merged_start, merged_end, merged_sequence, merged_indices = candidate_intervals[0]
    for (curr_start, curr_end, curr_seq, curr_indices) in candidate_intervals[1:]:
        if curr_start <= merged_end:
            merged_indices.update(curr_indices)
            # Only the part of the current sequence that extends past the
            # merged interval is new.
            n_extra_chars = curr_end - merged_end
            if n_extra_chars > 0:
                merged_sequence += curr_seq[-n_extra_chars:]
            merged_end = max(merged_end, curr_end)
        else:
            merged_intervals.append((merged_start, merged_end, merged_sequence, merged_indices))
            merged_start = curr_start
            merged_end = curr_end
            merged_indices = curr_indices
            merged_sequence = curr_seq
    merged_intervals.append((merged_start, merged_end, merged_sequence, merged_indices))

    for (start, end, seq, indices) in merged_intervals:
        # Sort the row positions so the joined `sources` string has a
        # deterministic order instead of depending on set iteration order.
        rows = df_grouped.iloc[sorted(indices)]
        result_cols["num_sources"].append(rows["num_sources"].sum())
        result_cols["sources"].append("; ".join(rows["sources"].values))
        result_cols["accessible_subsequence_start"].append(start)
        result_cols["accessible_subsequence_end"].append(end)
        result_cols["accessible_subsequence"].append(seq)
        result_cols["accessible_subsequence_length"].append(len(seq))
        result_cols["accessible_subsequence_is_4mer_or_longer"].append(len(seq) >= 4)
        # An isotype is reported for the merged region when any member row has it.
        IgG = rows["IgG"].mean()
        result_cols["IgG"].append(IgG > 0)
        IgA = rows["IgA"].mean()
        result_cols["IgA"].append(IgA > 0)
        result_cols["IgG_and_IgA"].append(IgG > 0 and IgA > 0)
        result_cols["accessible_subsequence_contains_polymorphism"].append(
            rows["accessible_subsequence_contains_polymorphism"].any())
        result_cols["accessible_subsequence_contains_glycosite"].append(
            rows["accessible_subsequence_contains_glycosite"].any())
        result_cols["near_any_feature"].append(rows["near_any_feature"].any())
        # NOTE(review): `in_any_feature` requires ALL member rows to be inside
        # a feature, while `near_any_feature` uses ANY - confirm this
        # asymmetry is intended.
        result_cols["in_any_feature"].append(rows["in_any_feature"].all())
    return pd.DataFrame(result_cols)


In [17]:
# Report how many raw epitope entries collapsed into distinct accessible regions.
print("Filtered %d individual B-cell epitope entries into %d accessible regions" % (
    len(df_sars2_S),
    len(df_grouped)))


# Step 1: merge accessible regions that overlap each other.
df_grouped_overlapping = combine_overlapping_epitopes(df_grouped)
df_grouped_overlapping.to_csv("accessible-linear-bcell-epitopes-merged-overlap.csv", index=False)

print("Collapsed overlapping accessible regions into %d/%d epitopes" % (
    len(df_grouped_overlapping),
    len(df_grouped)))


# Step 2: drop regions that contain a glycosylation site.
df_no_glycosites = df_grouped_overlapping[~df_grouped_overlapping.accessible_subsequence_contains_glycosite]

print("Glycosite filter: %d/%d " % (
    len(df_no_glycosites), len(df_grouped_overlapping)))

# Step 3: drop regions that contain a polymorphic position.
df_no_polymorphisms = df_no_glycosites[~df_no_glycosites.accessible_subsequence_contains_polymorphism]

print("Polymorphism filter: %d/%d" % (
    len(df_no_polymorphisms), len(df_no_glycosites)))

# Step 4: keep regions within the padded window of any feature.
df_near_feature = df_no_polymorphisms[df_no_polymorphisms.near_any_feature]

# The `near_*` columns are computed with a padding of 50 for the RBD and 15
# for every other feature, so the message states those widths.
print("Near RBD (+/- 50aa), FP (+/- 15aa), S1/S2 boundary (+/- 15aa), HR1 (+/- 15aa), HR2 (+/- 15aa): %d/%d" % (
    len(df_near_feature), len(df_no_polymorphisms)))

# Step 5: keep regions whose accessible subsequence has at least 4 residues.
df_accessible_4mer = df_near_feature[df_near_feature.accessible_subsequence_is_4mer_or_longer]

print("4mer or longer: %d/%d" % (
    len(df_accessible_4mer), len(df_near_feature)))

df_accessible_4mer.to_csv("accessible-linear-bcell-epitopes-grouped-merged-filtered.csv")


Filtered 58 individual B-cell epitope entries into 19 accessible regions
Collapsed overlapping accessible regions into 14/19 epitopes
Glycosite filter: 11/14 
Polymorphism filter: 9/11
Near RBD (+/- 50aa), FP (+/- 10aa), S1/S2 boundary (+/- 10aa), HR1 (+/- 10aa), HR2 (+/- 10aa): 4/9
4mer or longer: 3/4


In [18]:
# Display the regions that survived every filter.
df_accessible_4mer

Unnamed: 0,num_sources,sources,accessible_subsequence_start,accessible_subsequence_end,accessible_subsequence,accessible_subsequence_length,accessible_subsequence_is_4mer_or_longer,IgG,IgA,IgG_and_IgA,accessible_subsequence_contains_polymorphism,accessible_subsequence_contains_glycosite,near_any_feature,in_any_feature
5,3,Charite 2020 S459-473; Wang 2020 S456-460; Dah...,456,473,FRKSNLKPFERDISTEIY,18,True,True,True,True,False,False,True,True
7,5,Charite 2020 S569-583; Charite 2020 S571-585; ...,580,583,QTLE,4,True,True,True,True,False,False,True,False
11,7,Poh 2020 S809-826; Wang 2020 S806-820; Charite...,809,812,PSKP,4,True,True,True,True,False,False,True,False


In [19]:
# Show the full, untruncated `sources` strings of the surviving regions.
df_accessible_4mer["sources"].values

array(['Charite 2020 S459-473; Wang 2020 S456-460; Dahlke 2020 S449-463',
       'Charite 2020 S569-583; Charite 2020 S571-585; Charite 2020 S573-587; Dahlke 2020 S565-587; ReScan S552-589',
       'Poh 2020 S809-826; Wang 2020 S806-820; Charite 2020 S805-819; Charite 2020 S807-821; Dahlke 2020 S809-827; ReScan S799-836; Dahlke 2020 S811-831'],
      dtype=object)