In [77]:
import pandas as pd
import numpy as np

In [78]:
seqs = pd.read_csv('./gilman_RSV_Abs/5_AA-sequences.txt', sep='\t', index_col=0)

In [79]:
seqs['Sequence ID'] = seqs['Sequence ID'].apply(lambda x: x.split('|')[1].split('.')[0])
seqs.index = seqs['Sequence ID']

In [80]:
hc_seqs = seqs[~seqs['V-D-J-REGION'].isna()]
lc_seqs = seqs[seqs['V-D-J-REGION'].isna()]

In [81]:
supp_table = pd.read_excel('./aaj1879_data_files_s1_to_s3/aaj1879_Data file S1.xlsx',  index_col=0)


binding_cols = ['Prefusion subtype A Kd (M)*', 'Postfusion subtype A Kd (M)*', 'Prefusion subtype B Kd (M)*', 'Postfusion subtype B Kd (M)*']
# Converting Kd data to numeric
supp_table[binding_cols] = supp_table[binding_cols].apply(lambda x: pd.to_numeric(x, errors='coerce'))

supp_table.dropna(how='all', inplace=True)
supp_table.shape

(364, 20)

In [82]:
# Paper says low potency are 0.5-5 ug/ml, medium 0.05-0.5, and strong are <0.05
rsvA_cols = ['Prefusion subtype A Kd (M)*', 'Postfusion subtype A Kd (M)*']
rsvA = supp_table[pd.to_numeric(supp_table['Neut IC50 (ug/ml) subtype A*'], errors='coerce') < 5]
rsvA = rsvA[rsvA['Prefusion subtype A Kd (M)*'] != 'NB']
rsvA['strongest_binder'] = rsvA[rsvA_cols].apply(lambda x: str(x.idxmin()).replace('Kd (M)*', ''), axis=1)
print(rsvA.shape)


rsvB_cols = ['Prefusion subtype B Kd (M)*', 'Postfusion subtype B Kd (M)*']
rsvB = supp_table[pd.to_numeric(supp_table['Neut IC50 (ug/ml) subtype B*'], errors='coerce') < 5]
rsvB = rsvB[rsvB['Prefusion subtype B Kd (M)*'] != 'NB']
rsvB['strongest_binder'] = rsvB[rsvB_cols].apply(lambda x: str(x.idxmin()).replace('Kd (M)*', ''), axis=1)

print(rsvB.shape)

(242, 21)
(237, 21)


In [83]:
hc_dict = dict(zip(hc_seqs.index, hc_seqs['V-D-J-REGION']))
lc_dict = dict(zip(lc_seqs.index, lc_seqs['V-J-REGION']))

In [84]:
seq_df = supp_table[['GenBank Accession Number (VH)', 'GenBank Accession Number (VL)']]
seq_df.dropna(inplace=True)

seq_df['GenBank Accession Number (VH)'] = seq_df['GenBank Accession Number (VH)'].map(hc_dict)
seq_df['GenBank Accession Number (VL)'] = seq_df['GenBank Accession Number (VL)'].map(lc_dict)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  seq_df.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  seq_df['GenBank Accession Number (VH)'] = seq_df['GenBank Accession Number (VH)'].map(hc_dict)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  seq_df['GenBank Accession Number (VL)'] = seq_df['GenBank Accession Number (VL)'].map(lc_dict)


In [85]:
rsvB_seqdf = seq_df.loc[rsvB.index]
rsvA_seqdf = seq_df.loc[rsvA.index]

In [86]:
# sp|P03420|FUS_HRSVA Fusion glycoprotein F0 OS=Human respiratory syncytial virus A (strain A2) OX=11259 GN=F PE=1 SV=1
# https://rest.uniprot.org/uniprotkb/P03420.fasta
rsvAf = 'MELLILKANAITTILTAVTFCFASGQNITEEFYQSTCSAVSKGYLSALRTGWYTSVITIELSNIKENKCNGTDAKVKLIKQELDKYKNAVTELQLLMQSTPPTNNRARRELPRFMNYTLNNAKKTNVTLSKKRKRRFLGFLLGVGSAIASGVAVSKVLHLEGEVNKIKSALLSTNKAVVSLSNGVSVLTSKVLDLKNYIDKQLLPIVNKQSCSISNIETVIEFQQKNNRLLEITREFSVNAGVTTPVSTYMLTNSELLSLINDMPITNDQKKLMSNNVQIVRQQSYSIMSIIKEEVLAYVVQLPLYGVIDTPCWKLHTSPLCTTNTKEGSNICLTRTDRGWYCDNAGSVSFFPQAETCKVQSNRVFCDTMNSLTLPSEINLCNVDIFNPKYDCKIMTSKTDVSSSVITSLGAIVSCYGKTKCTASNKNRGIIKTFSNGCDYVSNKGMDTVSVGNTLYYVNKQEGKSLYVKGEPIINFYDPLVFPSDEFDASISQVNEKINQSLAFIRKSDELLHNVNAGKSTTNIMITTIIIVIIVILLSLIAVGLLLYCKARSTPVTLSKDQLSGINNIAFSN'

# sp|P13843|FUS_HRSV1 Fusion glycoprotein F0 OS=Human respiratory syncytial virus B (strain 18537) OX=11251 GN=F PE=1 SV=1
# https://rest.uniprot.org/uniprotkb/P13843.fasta
rsvBf = 'MELLIHRSSAIFLTLAVNALYLTSSQNITEEFYQSTCSAVSRGYFSALRTGWYTSVITIELSNIKETKCNGTDTKVKLIKQELDKYKNAVTELQLLMQNTPAANNRARREAPQYMNYTINTTKNLNVSISKKRKRRFLGFLLGVGSAIASGIAVSKVLHLEGEVNKIKNALLSTNKAVVSLSNGVSVLTSKVLDLKNYINNRLLPIVNQQSCRISNIETVIEFQQMNSRLLEITREFSVNAGVTTPLSTYMLTNSELLSLINDMPITNDQKKLMSSNVQIVRQQSYSIMSIIKEEVLAYVVQLPIYGVIDTPCWKLHTSPLCTTNIKEGSNICLTRTDRGWYCDNAGSVSFFPQADTCKVQSNRVFCDTMNSLTLPSEVSLCNTDIFNSKYDCKIMTSKTDISSSVITSLGAIVSCYGKTKCTASNKNRGIIKTFSNGCDYVSNKGVDTVSVGNTLYYVNKLEGKNLYVKGEPIINYYDPLVFPSDEFDASISQVNEKINQSLAFIRRSDELLHNVNTGKSTTNIMITTIIIVIIVVLLSLIAIGLLLYCKAKNTPVTLSKDQLSGINNIAFSK'


In [87]:
rsvA_seqdf.columns = ['VH_AA', 'VL_AA']
rsvA_seqdf['antigen_seq'] = rsvAf
rsvA_seqdf['antigen_name'] = 'RSV-A'

rsvB_seqdf.columns = ['VH_AA', 'VL_AA']
rsvB_seqdf['antigen_seq'] = rsvBf
rsvB_seqdf['antigen_name'] = 'RSV-B'


In [88]:
print(pd.concat([rsvA_seqdf, rsvB_seqdf]).shape)
concat_df =pd.concat([rsvA_seqdf, rsvB_seqdf])

(479, 4)


In [89]:
concat_df[binding_cols] = supp_table[binding_cols]
concat_df[['Neut IC50 (ug/ml) subtype A*',	'Neut IC50 (ug/ml) subtype B*']] = supp_table[['Neut IC50 (ug/ml) subtype A*',	'Neut IC50 (ug/ml) subtype B*']]
concat_df[['CDRH3', 'CDRL3']] = supp_table[['CDR H3 Sequence','CDR L3 Sequence']]

In [90]:
concat_df.index = concat_df['antigen_name'] + ':' + concat_df.index

In [93]:
# concat_df.to_csv('gilman_RSV_binders_v2_24-03-08.csv')

In [106]:
pd.concat([rsvA_seqdf, rsvB_seqdf]).to_csv('rsv_Gilman_rsvA-B_seqs_v2_24-03-07.csv')