In [1]:
import pandas as pd

In [2]:
from malid import config, helpers
from malid.datamodels import GeneLocus

In [3]:
# Load the CoV-AbDab CSV export from the project's configured base data directory.
cov_abdab_csv_path = config.paths.base_data_dir / "CoV-AbDab_260722.csv"
cov_abdab = pd.read_csv(cov_abdab_csv_path)
# (rows, columns) before any filtering.
cov_abdab.shape

(10005, 23)

In [4]:
# Discard antibodies that have no "Binds to" annotation at all.
cov_abdab = cov_abdab.dropna(subset=["Binds to"])
cov_abdab.shape

(9234, 23)

In [5]:
# Keep only antibodies whose "Binds to" annotation mentions "sars-cov2"
# (case-insensitive, plain substring match; any variant suffix is accepted).
binds_sars_cov2 = (
    cov_abdab["Binds to"].str.lower().str.contains("sars-cov2", regex=False)
)
cov_abdab = cov_abdab[binds_sars_cov2]
cov_abdab.shape

(8802, 23)

In [6]:
# Tally the distinct "Binds to" annotation strings among the rows kept so far.
cov_abdab["Binds to"].value_counts()

SARS-CoV2_WT                                                                                                                                                                            5429
SARS-CoV1;SARS-CoV2_WT                                                                                                                                                                   382
SARS-CoV2_WT;SARS-CoV1                                                                                                                                                                   228
SARS-CoV2_WT;SARS-CoV2_Omicron-BA1;SARS-CoV2_Omicron-BA1.1;SARS-CoV2_Omicron-BA3;SARS-CoV2_Omicron-BA2;SARS-CoV2_Omicron-BA2.12.1;SARS-CoV2_Omicron-BA2.13;SARS-CoV2_Omicron-BA4/BA5     219
SARS-CoV2_WT;SARS-CoV2_Omicron-BA1;SARS-CoV2_Beta;SARS-CoV2_Delta;SARS-CoV2_Omicron-BA2;SARS-CoV1                                                                                        200
                                                       

In [7]:
# # remove weak binders
# cov_abdab = cov_abdab[
#     ~cov_abdab["Binds to"].str.lower().apply(lambda s: "sars-cov2_wt (weak)" in s)
# ]
# cov_abdab.shape

In [8]:
# Same "Binds to" tally again; the weak-binder filter in the preceding cell is
# commented out, so the frame has not changed since the last tally.
cov_abdab["Binds to"].value_counts()

SARS-CoV2_WT                                                                                                                                                                            5429
SARS-CoV1;SARS-CoV2_WT                                                                                                                                                                   382
SARS-CoV2_WT;SARS-CoV1                                                                                                                                                                   228
SARS-CoV2_WT;SARS-CoV2_Omicron-BA1;SARS-CoV2_Omicron-BA1.1;SARS-CoV2_Omicron-BA3;SARS-CoV2_Omicron-BA2;SARS-CoV2_Omicron-BA2.12.1;SARS-CoV2_Omicron-BA2.13;SARS-CoV2_Omicron-BA4/BA5     219
SARS-CoV2_WT;SARS-CoV2_Omicron-BA1;SARS-CoV2_Beta;SARS-CoV2_Delta;SARS-CoV2_Omicron-BA2;SARS-CoV1                                                                                        200
                                                       

In [9]:
# cov_abdab.dropna(subset=["Neutralising Vs"], inplace=True)
# cov_abdab.shape

In [10]:
# Count rows without (True) and with (False) a "Neutralising Vs" annotation.
cov_abdab["Neutralising Vs"].isna().value_counts()

True     4541
False    4261
Name: Neutralising Vs, dtype: int64

In [11]:
# cov_abdab = cov_abdab[
#     cov_abdab["Neutralising Vs"].str.lower().apply(lambda s: "sars-cov2" in s)
# ]
# cov_abdab.shape

In [12]:
# Tally the distinct "Neutralising Vs" annotation strings
# (value_counts skips missing values by default).
cov_abdab["Neutralising Vs"].value_counts()

SARS-CoV2_WT                                                                                                                                                                                                                             1898
SARS-CoV2_WT (weak)                                                                                                                                                                                                                       318
SARS-CoV2_WT;SARS-CoV2_Omicron-BA1                                                                                                                                                                                                        149
SARS-CoV2_WT;SARS-CoV2_Omicron-BA1;SARS-CoV2_Omicron-BA1.1;SARS-CoV2_Omicron-BA3;SARS-CoV2_Omicron-BA2;SARS-CoV2_Omicron-BA2.12.1;SARS-CoV2_Omicron-BA2.13;SARS-CoV2_Omicron-BA4/BA5                                                      102
SARS-CoV2_WT;SARS-CoV1;Pangolin-GD;RatG13       

In [13]:
# Tally the text after the first "(" in "Heavy V Gene" (the species tag,
# e.g. "Human)") to see which species are present.
cov_abdab["Heavy V Gene"].str.split("(").str[1].value_counts()

Human)          7930
Alpaca)          617
Mouse)           185
Rhesus)            2
Human)+K3788       1
Name: Heavy V Gene, dtype: int64

In [14]:
# Count rows with a missing "Heavy V Gene" (True) versus a present one (False).
cov_abdab["Heavy V Gene"].isna().value_counts()

False    8801
True        1
Name: Heavy V Gene, dtype: int64

In [15]:
# Drop rows with no "Heavy V Gene" annotation.
# Reassign instead of using inplace=True: cov_abdab is a boolean-filtered frame
# at this point, and in-place edits of such a frame can trigger
# SettingWithCopyWarning. The resulting rows are the same.
cov_abdab = cov_abdab.dropna(subset=["Heavy V Gene"])

In [16]:
# Keep rows whose "Heavy V Gene" carries the literal "(Human)" tag.
# This is a plain substring test, so values with trailing text after the tag
# (e.g. "(Human)+K3788") are kept as well.
has_human_v_gene = cov_abdab["Heavy V Gene"].str.contains("(Human)", regex=False)
cov_abdab = cov_abdab[has_human_v_gene]
cov_abdab.shape

(7931, 23)

In [17]:
# Tally "Heavy J Gene" values to spot placeholders (e.g. "ND") and unexpected entries.
cov_abdab["Heavy J Gene"].value_counts()

IGHJ4 (Human)    3404
IGHJ6 (Human)    1923
IGHJ3 (Human)    1073
IGHJ5 (Human)     912
IGHJ2 (Human)     258
ND                220
IGHJ1 (Human)     134
IGKJ4 (Human)       1
Name: Heavy J Gene, dtype: int64

In [18]:
# Count rows with a missing "Heavy J Gene" (True) versus a present one (False).
cov_abdab["Heavy J Gene"].isna().value_counts()

False    7925
True        6
Name: Heavy J Gene, dtype: int64

In [19]:
# Remove rows whose "Heavy J Gene" is the "ND" placeholder string.
# Missing (NaN) values compare as "not equal" and therefore survive this step;
# they are dropped separately.
j_gene_is_not_nd = cov_abdab["Heavy J Gene"].ne("ND")
cov_abdab = cov_abdab[j_gene_is_not_nd]
cov_abdab.shape

(7711, 23)

In [20]:
# Drop rows with no "Heavy J Gene" annotation.
# Reassign instead of using inplace=True: cov_abdab is a boolean-filtered frame
# at this point, and in-place edits of such a frame can trigger
# SettingWithCopyWarning. The resulting rows are the same.
cov_abdab = cov_abdab.dropna(subset=["Heavy J Gene"])

In [21]:
# Re-check the "Heavy J Gene" tally after removing "ND" and missing values.
cov_abdab["Heavy J Gene"].value_counts()

IGHJ4 (Human)    3404
IGHJ6 (Human)    1923
IGHJ3 (Human)    1073
IGHJ5 (Human)     912
IGHJ2 (Human)     258
IGHJ1 (Human)     134
IGKJ4 (Human)       1
Name: Heavy J Gene, dtype: int64

In [22]:
# Tally raw "Heavy V Gene" values (still including the "(Human)" tag) before cleanup.
cov_abdab["Heavy V Gene"].value_counts()

IGHV3-30 (Human)          934
IGHV1-69 (Human)          756
IGHV3-53 (Human)          451
IGHV3-30-3 (Human)        349
IGHV3-23 (Human)          346
                         ... 
IGHV3-24 (Human)            1
IGKV1-33 (Human)            1
IGHV1-45 (Human)            1
 IGHV7-4-1 (Human)          1
IGHV1-69 (Human)+K3788      1
Name: Heavy V Gene, Length: 67, dtype: int64

In [23]:
# Reduce "Heavy V Gene" to the bare gene name: keep the text before the first
# "(" (dropping the species tag and anything after it) and trim whitespace.
# assign() returns a new frame rather than writing a column into a
# boolean-filtered frame, which can trigger SettingWithCopyWarning.
cov_abdab = cov_abdab.assign(
    **{"Heavy V Gene": cov_abdab["Heavy V Gene"].str.split("(").str[0].str.strip()}
)

In [24]:
# Display the cleaned "Heavy V Gene" column to confirm only gene names remain.
cov_abdab["Heavy V Gene"]

11       IGHV3-30
12       IGHV3-30
13       IGHV3-30
14       IGHV3-30
15       IGHV1-46
           ...   
10000     IGHV3-7
10001    IGHV3-23
10002    IGHV1-46
10003    IGHV3-30
10004    IGHV3-30
Name: Heavy V Gene, Length: 7705, dtype: object

In [25]:
# Tally raw "Heavy J Gene" values (still including the "(Human)" tag) before cleanup.
cov_abdab["Heavy J Gene"].value_counts()

IGHJ4 (Human)    3404
IGHJ6 (Human)    1923
IGHJ3 (Human)    1073
IGHJ5 (Human)     912
IGHJ2 (Human)     258
IGHJ1 (Human)     134
IGKJ4 (Human)       1
Name: Heavy J Gene, dtype: int64

In [26]:
# Reduce "Heavy J Gene" to the bare gene name: keep the text before the first
# "(" (dropping the species tag) and trim whitespace.
# assign() returns a new frame rather than writing a column into a
# boolean-filtered frame, which can trigger SettingWithCopyWarning.
cov_abdab = cov_abdab.assign(
    **{"Heavy J Gene": cov_abdab["Heavy J Gene"].str.split("(").str[0].str.strip()}
)

In [27]:
# Display the cleaned "Heavy J Gene" column to confirm only gene names remain.
cov_abdab["Heavy J Gene"]

11       IGHJ6
12       IGHJ6
13       IGHJ6
14       IGHJ6
15       IGHJ6
         ...  
10000    IGHJ4
10001    IGHJ5
10002    IGHJ4
10003    IGHJ4
10004    IGHJ4
Name: Heavy J Gene, Length: 7705, dtype: object

In [28]:
# Remove rows whose CDRH3 is the "ND" placeholder string.
# Missing (NaN) CDRH3 values are not removed by this comparison.
cdrh3_is_not_nd = cov_abdab["CDRH3"].ne("ND")
cov_abdab = cov_abdab[cdrh3_is_not_nd]
cov_abdab.shape

(7705, 23)

In [29]:
# Display the CDRH3 amino-acid strings to eyeball their format.
cov_abdab["CDRH3"]

11           ARAYTGSYYYGMDV
12           ARARGGSYYYGMDV
13           ARSRGGGYYYGMDV
14           ARAHRGSYYYGMDV
15             ASDVAGHHGMDV
                ...        
10000      VKDRTDWELIRGYFGH
10001    AKGELLWFGDLLHNWFDP
10002       ARDQADYYYGSGCIV
10003      ARDEQKDRAMVTLPDY
10004       ARELDYYGSGSYPDY
Name: CDRH3, Length: 7705, dtype: object

In [30]:
# Tally the "Ab or Nb" labels among the remaining rows.
cov_abdab["Ab or Nb"].value_counts()

Ab    7680
Nb      25
Name: Ab or Nb, dtype: int64

In [31]:
# Count rows with a missing "VHorVHH" value (True) versus a present one (False).
cov_abdab["VHorVHH"].isna().value_counts()

False    7705
Name: VHorVHH, dtype: int64

In [32]:
# Count rows whose "VHorVHH" is something other than the "ND" placeholder (True)
# versus exactly "ND" (False).
(cov_abdab["VHorVHH"] != "ND").value_counts()

True     7619
False      86
Name: VHorVHH, dtype: int64

In [33]:
# Remove rows whose "VHorVHH" sequence is the "ND" placeholder string.
vh_is_not_nd = cov_abdab["VHorVHH"].ne("ND")
cov_abdab = cov_abdab[vh_is_not_nd]
cov_abdab.shape

(7619, 23)

In [34]:
# List all available columns before choosing which ones to export.
cov_abdab.columns

Index(['Name', 'Ab or Nb', 'Binds to', 'Doesn't Bind to', 'Neutralising Vs',
       'Not Neutralising Vs', 'Protein + Epitope', 'Origin', 'VHorVHH', 'VL',
       'Heavy V Gene', 'Heavy J Gene', 'Light V Gene', 'Light J Gene', 'CDRH3',
       'CDRL3', 'Structures', 'ABB Homology Model (if no structure)',
       'Sources', 'Date Added', 'Last Updated', 'Update Description',
       'Notes/Following Up?'],
      dtype='object')

In [35]:
# Tally the free-text "Origin" labels (where each antibody came from).
cov_abdab["Origin"].value_counts()

B-cells; SARS-CoV2 Human Patient                     2461
B-cells; SARS-CoV2_WT Human Patient                  2172
B-cells (SARS-CoV2 Human Patient and/or Vaccinee)    1055
B-cells; SARS-CoV2_WT Vaccinee                        519
B-cells (SARS-CoV2 Human Patient+Vaccinee)            280
                                                     ... 
Engineered from CR3023                                  1
Engineered from CR3041                                  1
Engineered from CR3042                                  1
Engineered from CR3043                                  1
B-cells; SARS-CoV2 Human Vaccinees (BNT162b2)           1
Name: Origin, Length: 86, dtype: int64

In [36]:
# Show the 25 most frequent "Origin" labels (value_counts sorts by descending count).
cov_abdab["Origin"].value_counts().head(n=25)

B-cells; SARS-CoV2 Human Patient                             2461
B-cells; SARS-CoV2_WT Human Patient                          2172
B-cells (SARS-CoV2 Human Patient and/or Vaccinee)            1055
B-cells; SARS-CoV2_WT Vaccinee                                519
B-cells (SARS-CoV2 Human Patient+Vaccinee)                    280
B-cells; SARS-CoV1 Human Patient                              248
Semi-synthetic Human Fab Library                              116
B-cells (SARS-CoV2 Human Patient/Vaccinee)                     90
B-cells; SARS-CoV2_Beta Human Patient                          81
B-cells; SARS-CoV2 Vaccinee (ChAdOx1 then mRNA-1273)           72
Phage Display (Ab, human, non-immune)                          58
B-cells; SARS-CoV2_Gamma Human Patient                         50
B-cells; SARS-CoV2 Vaccinee (2 x ChAdOx1)                      45
B-cells (Human Naive)                                          39
B-cells; SARS-CoV2_WT Vaccinee (Lymph Nodes)                   37
Directed E

In [37]:
# List every distinct "Origin" label, to design the human-origin filter against.
cov_abdab["Origin"].unique()

array(['Phage Display (Ab, human, immune - SARS-CoV2)',
       'B-cells; SARS-CoV2_Gamma Human Patient',
       'B-cells; SARS-CoV2 Vaccinee (2 x ChAdOx1)',
       'B-cells; SARS-CoV2 Vaccinee (ChAdOx1 then mRNA-1273)',
       'B-cells; SARS-CoV1 Human Patient',
       'B-cells; Transgenic Mouse (VelocImmune)',
       'B-cells; SARS-CoV2_WT Human Patient', 'Transgenic Mouse',
       'Phage Library Engineered from SARS-CoV-1 binder',
       'Phage Display (Ab, human, non-immune)',
       'B-cells; SARS-CoV2 Human Patient', 'Immunised mouse (TC-mAb)',
       'Engineered from ADI-55688',
       'B-cells (SARS-CoV2 Vaccinated Human)',
       'B-cells; SARS-CoV2 Human Patient plus vaccinated mRNA-1273 vaccine',
       'B-cells; SARS-CoV2_WT Vaccinee',
       'B-cells (SARS-CoV2 Human Patient+Vaccinee)',
       'B-cells (SARS-CoV2 Human Patient/Vaccinee)',
       'B-cells (SARS-CoV2 Human Patient and/or Vaccinee)',
       'Immunised mouse (RenMab)', 'Transgenic Mouse (H2L2)',
       'Phage D

In [38]:
# Dry run of the "keep human origin only" rule: list the Origin labels it would
# retain. A label is kept when it mentions "human", "patient" or "vaccinee"
# (case-insensitive) and does not contain "humanised".
[
    origin
    for origin in cov_abdab["Origin"].unique()
    if "humanised" not in origin.lower()
    and any(keyword in origin.lower() for keyword in ("human", "patient", "vaccinee"))
]

['Phage Display (Ab, human, immune - SARS-CoV2)',
 'B-cells; SARS-CoV2_Gamma Human Patient',
 'B-cells; SARS-CoV2 Vaccinee (2 x ChAdOx1)',
 'B-cells; SARS-CoV2 Vaccinee (ChAdOx1 then mRNA-1273)',
 'B-cells; SARS-CoV1 Human Patient',
 'B-cells; SARS-CoV2_WT Human Patient',
 'Phage Display (Ab, human, non-immune)',
 'B-cells; SARS-CoV2 Human Patient',
 'B-cells (SARS-CoV2 Vaccinated Human)',
 'B-cells; SARS-CoV2 Human Patient plus vaccinated mRNA-1273 vaccine',
 'B-cells; SARS-CoV2_WT Vaccinee',
 'B-cells (SARS-CoV2 Human Patient+Vaccinee)',
 'B-cells (SARS-CoV2 Human Patient/Vaccinee)',
 'B-cells (SARS-CoV2 Human Patient and/or Vaccinee)',
 'Phage Display (Ab, human, immune - CoV2_WT)',
 'Phage Display (sdAbs from human VH)',
 'B-cells (Human Naive)',
 'B-cells (SARS-CoV1 Human Patient)',
 'B-cells (SARS-CoV2_Beta Human Patient)',
 'Phage Display (sdAb, non-immune, human) + Engineering',
 'B-cells (Zika Human Patient)',
 'Phage Display (single-domain, human, non-immune)',
 'B-cells (SAR

In [39]:
# Complement of the "keep human origin only" rule: list the Origin labels it
# would reject, i.e. labels that contain "humanised" or mention none of
# "human", "patient", "vaccinee" (case-insensitive).
[
    origin
    for origin in cov_abdab["Origin"].unique()
    if "humanised" in origin.lower()
    or not any(
        keyword in origin.lower() for keyword in ("human", "patient", "vaccinee")
    )
]

['B-cells; Transgenic Mouse (VelocImmune)',
 'Transgenic Mouse',
 'Phage Library Engineered from SARS-CoV-1 binder',
 'Immunised mouse (TC-mAb)',
 'Engineered from ADI-55688',
 'Immunised mouse (RenMab)',
 'Transgenic Mouse (H2L2)',
 'HIV-1 induced Ab',
 'TBC',
 'Phage Display (Humanised sdAbs, immune - CoV2)',
 'Engineered from ADI-55689',
 'Engineered from ADG-2',
 'Engineered from ADI-56046',
 'ND',
 'Phage Display (Ab, based on trastuzumab VH)',
 'Computational Design',
 'Transgenic Mouse (Alloy GK)',
 'Computational Engineering',
 'Engineered from CR3022',
 'Engineered from CR3031',
 'Engineered from CR3032',
 'Engineered from CR3033',
 'Engineered from CR3034',
 'Engineered from CR3035',
 'Engineered from CR3036',
 'Engineered from CR3037',
 'Engineered from CR3038',
 'Engineered from CR3039',
 'Engineered from CR3040',
 'Engineered from CR3023',
 'Engineered from CR3041',
 'Engineered from CR3042',
 'Engineered from CR3043',
 'Engineered from CR3044',
 'Engineered from CR3045',


In [40]:
# Row count immediately before applying the human-origin filter.
cov_abdab.shape

(7619, 23)

In [41]:
# Apply "keep human origin only" filter:
def is_human_origin(origin):
    """Return True if an Origin label passes the "keep human origin only" rule.

    The label passes when, compared case-insensitively, it contains at least one
    of "human", "patient" or "vaccinee" and does not contain "humanised".
    """
    label = origin.lower()
    if "humanised" in label:
        return False
    return any(keyword in label for keyword in ("human", "patient", "vaccinee"))


cov_abdab = cov_abdab[cov_abdab["Origin"].apply(is_human_origin)]
cov_abdab.shape

(7451, 23)

In [42]:
# Tally the "Origin" labels that survived the human-origin filter.
cov_abdab["Origin"].value_counts()

B-cells; SARS-CoV2 Human Patient                                      2461
B-cells; SARS-CoV2_WT Human Patient                                   2172
B-cells (SARS-CoV2 Human Patient and/or Vaccinee)                     1055
B-cells; SARS-CoV2_WT Vaccinee                                         519
B-cells (SARS-CoV2 Human Patient+Vaccinee)                             280
B-cells; SARS-CoV1 Human Patient                                       248
Semi-synthetic Human Fab Library                                       116
B-cells (SARS-CoV2 Human Patient/Vaccinee)                              90
B-cells; SARS-CoV2_Beta Human Patient                                   81
B-cells; SARS-CoV2 Vaccinee (ChAdOx1 then mRNA-1273)                    72
Phage Display (Ab, human, non-immune)                                   58
B-cells; SARS-CoV2_Gamma Human Patient                                  50
B-cells; SARS-CoV2 Vaccinee (2 x ChAdOx1)                               45
B-cells (Human Naive)    

In [43]:
# Tally the "Protein + Epitope" labels of the remaining antibodies.
cov_abdab["Protein + Epitope"].value_counts()

S; RBD                                     4334
S; Unk                                     1754
S; NTD                                      567
S; non-RBD                                  392
S; S2                                       275
N                                            39
S; non-S1                                    33
S; S1 non-RBD                                27
Unknown                                       7
S; RBD/non-RBD                                6
S; S2 Stem Helix                              5
S; S1/S2                                      2
TBC                                           2
S                                             1
S; S2' Cleavage Site/Fusion Peptide NTD       1
S; S1                                         1
S: NTD                                        1
S: RBD                                        1
S; S2' Cleavage Site/Fusion Peptide           1
Name: Protein + Epitope, dtype: int64

In [44]:
# Tally the "Binds to" annotation strings after all row filters.
cov_abdab["Binds to"].value_counts()

SARS-CoV2_WT                                                                                                                                                                            4371
SARS-CoV1;SARS-CoV2_WT                                                                                                                                                                   361
SARS-CoV2_WT;SARS-CoV2_Omicron-BA1;SARS-CoV2_Omicron-BA1.1;SARS-CoV2_Omicron-BA3;SARS-CoV2_Omicron-BA2;SARS-CoV2_Omicron-BA2.12.1;SARS-CoV2_Omicron-BA2.13;SARS-CoV2_Omicron-BA4/BA5     219
SARS-CoV2_WT;SARS-CoV2_Omicron-BA1;SARS-CoV2_Beta;SARS-CoV2_Delta;SARS-CoV2_Omicron-BA2;SARS-CoV1                                                                                        200
SARS-CoV2_WT;SARS-CoV1                                                                                                                                                                   192
                                                       

In [45]:
# Tally the "Doesn't Bind to" annotation strings after all row filters.
cov_abdab["Doesn't Bind to"].value_counts()

SARS-CoV1                                                                     737
SARS-CoV2_WT                                                                   72
229E;HKU1;NL63;OC43                                                            70
SARS-CoV2_Delta;SARS-CoV2_Omicron-BA1                                          57
SARS-CoV2_Omicron-BA1                                                          54
                                                                             ... 
SARS-CoV2_Mu;SARS-CoV2_Omicron-BA1                                              1
SARS-CoV2_WT;SARS-CoV2_Alpha;SARS-CoV2_Delta                                    1
SARS-CoV2_WT;SARS-CoV2_Alpha;SARS-CoV2_Gamma;SARS-CoV2_Delta;SARS-CoV2_Eta      1
SARS-CoV2_WT;SARS-CoV2_Gamma                                                    1
SARS-CoV2_Omicron                                                               1
Name: Doesn't Bind to, Length: 93, dtype: int64

In [46]:
# Tally the "Not Neutralising Vs" annotation strings after all row filters.
cov_abdab["Not Neutralising Vs"].value_counts()

SARS-CoV2_WT                                                                                                                                                               988
SARS-CoV1                                                                                                                                                                  382
SARS-CoV1;SARS-CoV2_WT                                                                                                                                                     264
SARS-CoV2_WT;SARS-CoV2_Gamma                                                                                                                                               182
SARS-CoV2_Omicron-BA1;SARS-CoV2_Omicron-BA1.1;SARS-CoV2_Omicron-BA3;SARS-CoV2_Omicron-BA2;SARS-CoV2_Omicron-BA2.12.1;SARS-CoV2_Omicron-BA2.13;SARS-CoV2_Omicron-BA4/BA5    137
                                                                                                                             

In [47]:
# Columns carried into the export, in output order.
export_columns = [
    "CDRH3",
    "Heavy J Gene",
    "Heavy V Gene",
    "VHorVHH",
    "Binds to",
    "Doesn't Bind to",
    "Neutralising Vs",
    "Not Neutralising Vs",
    "Protein + Epitope",
    "Origin",
    "Sources",
]
# Rename the gene columns to j_gene / v_gene.
gene_column_names = {"Heavy J Gene": "j_gene", "Heavy V Gene": "v_gene"}

# Select, rename, and renumber rows from 0 (the filtered frame's index has gaps).
cov_abdab_export = cov_abdab[export_columns]
cov_abdab_export = cov_abdab_export.rename(columns=gene_column_names)
cov_abdab_export = cov_abdab_export.reset_index(drop=True)
cov_abdab_export

Unnamed: 0,CDRH3,j_gene,v_gene,VHorVHH,Binds to,Doesn't Bind to,Neutralising Vs,Not Neutralising Vs,Protein + Epitope,Origin,Sources
0,ARERGYSGYGAAYYFDY,IGHJ4,IGHV1-69,EVQLLESGTEVKKPGSSVKVSCKASGGTFSSYAISWVRQAPGQGLE...,SARS-CoV2_WT;SARS-CoV2_Omicron-BA1,,SARS-CoV2_WT;SARS-CoV2_Omicron-BA1,,S; RBD,"Phage Display (Ab, human, immune - SARS-CoV2)","Qianqian Zheng et al., 2022 (https://www.scien..."
1,ASWLYGDPISFDY,IGHJ4,IGHV4-39,EVQLLESGPGLVKPSETLSLTCTVSGGSISNSSYYWGWIRQPPGKG...,SARS-CoV2_WT;SARS-CoV2_Omicron-BA1,,SARS-CoV2_WT;SARS-CoV2_Omicron-BA1,,S; RBD,"Phage Display (Ab, human, immune - SARS-CoV2)","Qianqian Zheng et al., 2022 (https://www.scien..."
2,AKSQGDYGGNSGAGYFQH,IGHJ1,IGHV3-9,EVQLVESGGGLVQPGRSLRLSCAASGFTFDEYAMHWVRQAPGKGLE...,SARS-CoV2_WT (weak);SARS-CoV2_Beta (weak);SARS...,,SARS-CoV2_WT;SARS-CoV2_Gamma;SARS-CoV2_Omicron...,,S; RBD,B-cells; SARS-CoV2_Gamma Human Patient,"Marianna Agudelo et al., 2022 (https://rupress..."
3,ARLREFGDYYYYGMDV,IGHJ6,IGHV5-51,EVQLVQSGAEVKKPGESLKISCLGSGYSFTSYWIGWVRQMPGKGLE...,SARS-CoV2_WT (weak);SARS-CoV2_Beta,SARS-CoV2_WT,SARS-CoV2_Gamma;SARS-CoV2_Omicron-BA1 (weak),SARS-CoV2_WT,S; RBD,B-cells; SARS-CoV2_Gamma Human Patient,"Marianna Agudelo et al., 2022 (https://rupress..."
4,ARDTTDDYYVI,IGHJ1,IGHV4-61,QVQLQESGPGLVKPSQTLSLTCTVSGGSISSGSYYWSWIRQPAGKG...,SARS-CoV2_WT;SARS-CoV2_Beta;SARS-CoV2_Gamma,,SARS-CoV2_WT;SARS-CoV2_Gamma,SARS-CoV2_Omicron-BA1,S; RBD,B-cells; SARS-CoV2_Gamma Human Patient,"Marianna Agudelo et al., 2022 (https://rupress..."
...,...,...,...,...,...,...,...,...,...,...,...
7446,ARGESGSPYGMDV,IGHJ6,IGHV3-53,EVQLLESGGGLIQPGGSLRLSCAASGLTVSSNYMSWVRQAPGKGLE...,SARS-CoV2_WT,,SARS-CoV2_WT,,S; RBD,"Phage Display (Ab, human, immune - CoV2_WT)",EP3919126A1 (https://patents.google.com/patent...
7447,AAPSCSRTICSDGFDI,IGHJ3,IGHV1-58,EVQLVQSGPEVKKPGTSVKVACKASGFTFITPVSMQWVRQARGQRL...,SARS_CoV2_WT;SARS-CoV2_Alpha;SARS-CoV2_Beta;SA...,,SARS_CoV2_WT;SARS-CoV2_Alpha;SARS-CoV2_Beta;SA...,,S; RBD,B-cells; SARS-CoV2 Human Vaccinees (BNT162b2),"Biao Zhou et al., 2022 (https://www.nature.com..."
7448,AREGRRYGSGWYISTGYFDY,IGHJ4,IGHV1-69,QVQLVQSGAEVKKPGSSVRVSCKASGGTFSTYPISWVRQAPGQGLE...,SARS-CoV2_WT,,SARS-CoV2_WT,,S; RBD,B-cells; SARS-CoV2_WT Human Patient,"Xiaojuan Zhou et al., 2021 (https://www.scienc..."
7449,AIHGGTYYYDKNILA,IGHJ4,IGHV3-30,EVQLVESGGGVVQPGTSLRLSCAASGFSFSHYVMYWVRQAPGKGLD...,SARS-CoV2_WT;SARS-CoV2_Alpha;SARS-CoV2_Beta;SA...,SARS-CoV2_Omicron,SARS-CoV2_WT;SARS-CoV2_Alpha;SARS-CoV2_Beta;SA...,SARS-CoV2_Omicron,S; RBD,B-cells (SARS-CoV2 Vaccinated Human),CN113735970A (https://patents.google.com/paten...


The `CDRH3` values already have the `C` prefix and `W` suffix removed, which is consistent with our internal data.

In [48]:
# compute cdr3_aa_sequence_trim_len

In [49]:
# Normalise CDRH3 into cdr3_seq_aa_q_trim: delete ".", "-", " " and "*"
# characters, strip surrounding whitespace, and upper-case the result.
cdr3_trimmed = cov_abdab_export["CDRH3"]
for unwanted_char in (".", "-", " ", "*"):
    cdr3_trimmed = cdr3_trimmed.str.replace(unwanted_char, "", regex=False)
cov_abdab_export["cdr3_seq_aa_q_trim"] = cdr3_trimmed.str.strip().str.upper()

# Length (in characters) of the normalised CDR3 string.
cov_abdab_export["cdr3_aa_sequence_trim_len"] = (
    cov_abdab_export["cdr3_seq_aa_q_trim"].str.len()
)
cov_abdab_export

Unnamed: 0,CDRH3,j_gene,v_gene,VHorVHH,Binds to,Doesn't Bind to,Neutralising Vs,Not Neutralising Vs,Protein + Epitope,Origin,Sources,cdr3_seq_aa_q_trim,cdr3_aa_sequence_trim_len
0,ARERGYSGYGAAYYFDY,IGHJ4,IGHV1-69,EVQLLESGTEVKKPGSSVKVSCKASGGTFSSYAISWVRQAPGQGLE...,SARS-CoV2_WT;SARS-CoV2_Omicron-BA1,,SARS-CoV2_WT;SARS-CoV2_Omicron-BA1,,S; RBD,"Phage Display (Ab, human, immune - SARS-CoV2)","Qianqian Zheng et al., 2022 (https://www.scien...",ARERGYSGYGAAYYFDY,17
1,ASWLYGDPISFDY,IGHJ4,IGHV4-39,EVQLLESGPGLVKPSETLSLTCTVSGGSISNSSYYWGWIRQPPGKG...,SARS-CoV2_WT;SARS-CoV2_Omicron-BA1,,SARS-CoV2_WT;SARS-CoV2_Omicron-BA1,,S; RBD,"Phage Display (Ab, human, immune - SARS-CoV2)","Qianqian Zheng et al., 2022 (https://www.scien...",ASWLYGDPISFDY,13
2,AKSQGDYGGNSGAGYFQH,IGHJ1,IGHV3-9,EVQLVESGGGLVQPGRSLRLSCAASGFTFDEYAMHWVRQAPGKGLE...,SARS-CoV2_WT (weak);SARS-CoV2_Beta (weak);SARS...,,SARS-CoV2_WT;SARS-CoV2_Gamma;SARS-CoV2_Omicron...,,S; RBD,B-cells; SARS-CoV2_Gamma Human Patient,"Marianna Agudelo et al., 2022 (https://rupress...",AKSQGDYGGNSGAGYFQH,18
3,ARLREFGDYYYYGMDV,IGHJ6,IGHV5-51,EVQLVQSGAEVKKPGESLKISCLGSGYSFTSYWIGWVRQMPGKGLE...,SARS-CoV2_WT (weak);SARS-CoV2_Beta,SARS-CoV2_WT,SARS-CoV2_Gamma;SARS-CoV2_Omicron-BA1 (weak),SARS-CoV2_WT,S; RBD,B-cells; SARS-CoV2_Gamma Human Patient,"Marianna Agudelo et al., 2022 (https://rupress...",ARLREFGDYYYYGMDV,16
4,ARDTTDDYYVI,IGHJ1,IGHV4-61,QVQLQESGPGLVKPSQTLSLTCTVSGGSISSGSYYWSWIRQPAGKG...,SARS-CoV2_WT;SARS-CoV2_Beta;SARS-CoV2_Gamma,,SARS-CoV2_WT;SARS-CoV2_Gamma,SARS-CoV2_Omicron-BA1,S; RBD,B-cells; SARS-CoV2_Gamma Human Patient,"Marianna Agudelo et al., 2022 (https://rupress...",ARDTTDDYYVI,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7446,ARGESGSPYGMDV,IGHJ6,IGHV3-53,EVQLLESGGGLIQPGGSLRLSCAASGLTVSSNYMSWVRQAPGKGLE...,SARS-CoV2_WT,,SARS-CoV2_WT,,S; RBD,"Phage Display (Ab, human, immune - CoV2_WT)",EP3919126A1 (https://patents.google.com/patent...,ARGESGSPYGMDV,13
7447,AAPSCSRTICSDGFDI,IGHJ3,IGHV1-58,EVQLVQSGPEVKKPGTSVKVACKASGFTFITPVSMQWVRQARGQRL...,SARS_CoV2_WT;SARS-CoV2_Alpha;SARS-CoV2_Beta;SA...,,SARS_CoV2_WT;SARS-CoV2_Alpha;SARS-CoV2_Beta;SA...,,S; RBD,B-cells; SARS-CoV2 Human Vaccinees (BNT162b2),"Biao Zhou et al., 2022 (https://www.nature.com...",AAPSCSRTICSDGFDI,16
7448,AREGRRYGSGWYISTGYFDY,IGHJ4,IGHV1-69,QVQLVQSGAEVKKPGSSVRVSCKASGGTFSTYPISWVRQAPGQGLE...,SARS-CoV2_WT,,SARS-CoV2_WT,,S; RBD,B-cells; SARS-CoV2_WT Human Patient,"Xiaojuan Zhou et al., 2021 (https://www.scienc...",AREGRRYGSGWYISTGYFDY,20
7449,AIHGGTYYYDKNILA,IGHJ4,IGHV3-30,EVQLVESGGGVVQPGTSLRLSCAASGFSFSHYVMYWVRQAPGKGLD...,SARS-CoV2_WT;SARS-CoV2_Alpha;SARS-CoV2_Beta;SA...,SARS-CoV2_Omicron,SARS-CoV2_WT;SARS-CoV2_Alpha;SARS-CoV2_Beta;SA...,SARS-CoV2_Omicron,S; RBD,B-cells (SARS-CoV2 Vaccinated Human),CN113735970A (https://patents.google.com/paten...,AIHGGTYYYDKNILA,15


In [50]:
# Collapse rows that share the same V gene, J gene and normalised CDR3.
# Only the first occurrence is kept, so the annotations of later duplicates
# (binding, neutralisation, origin, ...) are discarded. The index is not reset.
dedup_key = ["v_gene", "j_gene", "cdr3_seq_aa_q_trim"]
is_repeat = cov_abdab_export.duplicated(subset=dedup_key)
cov_abdab_export = cov_abdab_export[~is_repeat]
cov_abdab_export

Unnamed: 0,CDRH3,j_gene,v_gene,VHorVHH,Binds to,Doesn't Bind to,Neutralising Vs,Not Neutralising Vs,Protein + Epitope,Origin,Sources,cdr3_seq_aa_q_trim,cdr3_aa_sequence_trim_len
0,ARERGYSGYGAAYYFDY,IGHJ4,IGHV1-69,EVQLLESGTEVKKPGSSVKVSCKASGGTFSSYAISWVRQAPGQGLE...,SARS-CoV2_WT;SARS-CoV2_Omicron-BA1,,SARS-CoV2_WT;SARS-CoV2_Omicron-BA1,,S; RBD,"Phage Display (Ab, human, immune - SARS-CoV2)","Qianqian Zheng et al., 2022 (https://www.scien...",ARERGYSGYGAAYYFDY,17
1,ASWLYGDPISFDY,IGHJ4,IGHV4-39,EVQLLESGPGLVKPSETLSLTCTVSGGSISNSSYYWGWIRQPPGKG...,SARS-CoV2_WT;SARS-CoV2_Omicron-BA1,,SARS-CoV2_WT;SARS-CoV2_Omicron-BA1,,S; RBD,"Phage Display (Ab, human, immune - SARS-CoV2)","Qianqian Zheng et al., 2022 (https://www.scien...",ASWLYGDPISFDY,13
2,AKSQGDYGGNSGAGYFQH,IGHJ1,IGHV3-9,EVQLVESGGGLVQPGRSLRLSCAASGFTFDEYAMHWVRQAPGKGLE...,SARS-CoV2_WT (weak);SARS-CoV2_Beta (weak);SARS...,,SARS-CoV2_WT;SARS-CoV2_Gamma;SARS-CoV2_Omicron...,,S; RBD,B-cells; SARS-CoV2_Gamma Human Patient,"Marianna Agudelo et al., 2022 (https://rupress...",AKSQGDYGGNSGAGYFQH,18
3,ARLREFGDYYYYGMDV,IGHJ6,IGHV5-51,EVQLVQSGAEVKKPGESLKISCLGSGYSFTSYWIGWVRQMPGKGLE...,SARS-CoV2_WT (weak);SARS-CoV2_Beta,SARS-CoV2_WT,SARS-CoV2_Gamma;SARS-CoV2_Omicron-BA1 (weak),SARS-CoV2_WT,S; RBD,B-cells; SARS-CoV2_Gamma Human Patient,"Marianna Agudelo et al., 2022 (https://rupress...",ARLREFGDYYYYGMDV,16
4,ARDTTDDYYVI,IGHJ1,IGHV4-61,QVQLQESGPGLVKPSQTLSLTCTVSGGSISSGSYYWSWIRQPAGKG...,SARS-CoV2_WT;SARS-CoV2_Beta;SARS-CoV2_Gamma,,SARS-CoV2_WT;SARS-CoV2_Gamma,SARS-CoV2_Omicron-BA1,S; RBD,B-cells; SARS-CoV2_Gamma Human Patient,"Marianna Agudelo et al., 2022 (https://rupress...",ARDTTDDYYVI,11
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7446,ARGESGSPYGMDV,IGHJ6,IGHV3-53,EVQLLESGGGLIQPGGSLRLSCAASGLTVSSNYMSWVRQAPGKGLE...,SARS-CoV2_WT,,SARS-CoV2_WT,,S; RBD,"Phage Display (Ab, human, immune - CoV2_WT)",EP3919126A1 (https://patents.google.com/patent...,ARGESGSPYGMDV,13
7447,AAPSCSRTICSDGFDI,IGHJ3,IGHV1-58,EVQLVQSGPEVKKPGTSVKVACKASGFTFITPVSMQWVRQARGQRL...,SARS_CoV2_WT;SARS-CoV2_Alpha;SARS-CoV2_Beta;SA...,,SARS_CoV2_WT;SARS-CoV2_Alpha;SARS-CoV2_Beta;SA...,,S; RBD,B-cells; SARS-CoV2 Human Vaccinees (BNT162b2),"Biao Zhou et al., 2022 (https://www.nature.com...",AAPSCSRTICSDGFDI,16
7448,AREGRRYGSGWYISTGYFDY,IGHJ4,IGHV1-69,QVQLVQSGAEVKKPGSSVRVSCKASGGTFSTYPISWVRQAPGQGLE...,SARS-CoV2_WT,,SARS-CoV2_WT,,S; RBD,B-cells; SARS-CoV2_WT Human Patient,"Xiaojuan Zhou et al., 2021 (https://www.scienc...",AREGRRYGSGWYISTGYFDY,20
7449,AIHGGTYYYDKNILA,IGHJ4,IGHV3-30,EVQLVESGGGVVQPGTSLRLSCAASGFSFSHYVMYWVRQAPGKGLD...,SARS-CoV2_WT;SARS-CoV2_Alpha;SARS-CoV2_Beta;SA...,SARS-CoV2_Omicron,SARS-CoV2_WT;SARS-CoV2_Alpha;SARS-CoV2_Beta;SA...,SARS-CoV2_Omicron,S; RBD,B-cells (SARS-CoV2 Vaccinated Human),CN113735970A (https://patents.google.com/paten...,AIHGGTYYYDKNILA,15


In [51]:
# Tally how many exported sequences come from each cited source.
cov_abdab_export["Sources"].value_counts()

Wooseob Kim et al., 2022 (https://www.nature.com/articles/s41586-022-04527-1)                                                                                                                                                                                                                  1456
Yunlong Cao et al. 2022 (https://www.nature.com/articles/s41586-022-04980-y#Sec4)                                                                                                                                                                                                              1095
Alice Cho et al., 2021 (https://www.nature.com/articles/s41586-021-04060-7)                                                                                                                                                                                                                     394
Zijun Wang et al., 2022 (https://www.cell.com/immunity/fulltext/S1074-7613(22)00174-1)                                      

In [52]:
# Write the filtered, de-duplicated table as a tab-separated file next to the
# input CSV. The "Sources" column is left out of the exported file.
# index=False states the intent explicitly (to_csv documents `index` as a bool;
# the previous index=None only worked because None is falsy).
export_path = config.paths.base_data_dir / "CoV-AbDab_260722.filtered.tsv"
cov_abdab_export.drop(columns=["Sources"]).to_csv(export_path, sep="\t", index=False)