In [1]:
import pandas as pd
import numpy as np

In [2]:
groups = {
    'HIV': ['ZM197.LSS', 'BG505.LSS', 'B4.1.LSS',],
    'SARS': ['XBB.1.LSS', 'BQ.1.1.LSS', 'SARS-1.LSS', 'OC43.LSS', 'HKU1.LSS',],
    'MPV/RSV/PIV': ['RSV_A.LSS', 'RSV_B.LSS', 'RSV_Post.LSS', 'MPV_A.LSS', 'MPV_B.LSS', 'MPV_Post.LSS', 'PIV_3.LSS'],
    'Noro':['SYD_2012.LSS', 'CHDC.LSS', 'GII.17.LSS'],
    'Flu': [ 'H3_HK68.LSS', 'H3_Perth19.LSS', 'H1_MI15.LSS', 'H1_NC99.LSS', 'H2_SG57.LSS', 'B_Wash19.LSS', 'H5_VN04.LSS', 'H5_IN05.LSS', 'H7_Anh13.LSS', 'H9_HK09.LSS', 'H10_JD13.LSS']
        }

donor_status_dict = {
    'HIV':['donor1', 'donor4', 'donor6', 'donor11', 'donor12'], # infected
    'Healthy':['donor2', 'donor3', 'donor5', 'donor7a', 'donor7b', 'donor8'],
    'Flu':['donor9', 'donor10', 'donor13', 'donor14', 'donor15'], # vaccinated
    'Covid':['donor16', 'donor17', 'donor18', 'donor19', 'donor20'] # convalescent
}

In [3]:
import os
output_files = os.listdir('./paired_lseq_outputs/')

In [4]:
dfs = {}
count = 0

for i in output_files:
    df = pd.read_csv('./paired_lseq_outputs/' + i, sep = '\t')

    # break
    df.index = df['BARCODE'] +  '-' + df.index.astype(str)
    donor = i.split('_')[0]
    dfs[donor] = df



In [5]:
df = pd.concat(dfs)
df.shape

(196657, 118)

In [6]:
df = df[df['N'] == 1]

In [7]:
df = df[df['C_CALL.H'] != 'IGHM']
df = df[df['C_CALL.H'] != 'IGHD']

In [8]:
lss_names = [i for i in df.columns if '.LSS' in i]
lss_df = df[lss_names]

# Bin LSS based on threshold of 2
lss_df = lss_df.apply(lambda x: x >=2)

In [9]:
# Subset to only rows with at least one LSS >= 2
binned_df = df[lss_df.sum(axis=1) > 0]

# Subset LSS df as well
lss_df = lss_df[lss_df.sum(axis=1) > 0]

In [10]:
umi_names = [i.split('.LSS')[0] for i in lss_names]

In [11]:
# Get final dataframe where both lss_df is true (>=2) and UMIs pass threshold
final_df = lss_df & binned_df[umi_names].apply(lambda x: x >= 30).values

In [12]:
# Finally, remove any cells with binding interactions
final_df = final_df.loc[final_df[final_df.sum(axis=1) > 0].index]

In [13]:
# Getting rows from non-HIV donors
non_HIV_index = [i for i in final_df.index if i[0] not in donor_status_dict['HIV']]
non_HIV_df = final_df.loc[non_HIV_index].copy()

# Finding rows in non-HIV donors that bind to at least one HIV antigen
false_HIV_idx = non_HIV_df[non_HIV_df[groups['HIV']].sum(axis=1) > 0].index
print(len(false_HIV_idx))

37


In [14]:
# Drop non-HIV donor cells that bind HIV
final_df.drop(false_HIV_idx, inplace=True)

In [15]:
final_df.shape

(2175, 29)

In [16]:
final_df.sum(axis=0).sum()

3776

### Removing polyreactive

We have 2212 Abs that bind to at least one antigen, with 3,922 total binding hits.

In [17]:


groups = {
    'HIV': ['ZM197.LSS', 'BG505.LSS', 'B4.1.LSS',],
    'SARS': ['XBB.1.LSS', 'BQ.1.1.LSS', 'SARS-1.LSS', 'OC43.LSS', 'HKU1.LSS',],
    'MPV/RSV/PIV': ['RSV_A.LSS', 'RSV_B.LSS', 'RSV_Post.LSS', 'MPV_A.LSS', 'MPV_B.LSS', 'MPV_Post.LSS', 'PIV_3.LSS'],
    'Noro':['SYD_2012.LSS', 'CHDC.LSS', 'GII.17.LSS'],
    'Flu': [ 'H3_HK68.LSS', 'H3_Perth19.LSS', 'H1_MI15.LSS', 'H1_NC99.LSS', 'H2_SG57.LSS', 'B_Wash19.LSS', 'H5_VN04.LSS', 'H5_IN05.LSS', 'H7_Anh13.LSS', 'H9_HK09.LSS', 'H10_JD13.LSS']
        }


groups = {
    'HIV': ['ZM197.LSS', 'BG505.LSS', 'B4.1.LSS',],
    'SARS': ['XBB.1.LSS', 'BQ.1.1.LSS', 'SARS-1.LSS', 'OC43.LSS', 'HKU1.LSS',],
    'MPV/RSV': ['RSV_A.LSS', 'RSV_B.LSS', 'RSV_Post.LSS', 'MPV_A.LSS', 'MPV_B.LSS', 'MPV_Post.LSS'],
    'PIV':[ 'PIV_3.LSS'],
    'Noro':['SYD_2012.LSS', 'CHDC.LSS', 'GII.17.LSS'],
    'Flu': [ 'H3_HK68.LSS', 'H3_Perth19.LSS', 'H1_MI15.LSS', 'H1_NC99.LSS', 'H2_SG57.LSS', 'B_Wash19.LSS', 'H5_VN04.LSS', 'H5_IN05.LSS', 'H7_Anh13.LSS', 'H9_HK09.LSS', 'H10_JD13.LSS']
        }

# poly_groups =  {
#     'HIV': ['ZM197.LSS', 'BG505.LSS', 'B4.1.LSS',],
#     # 'SARS': [,],
#     'MPV/RSV/PIV': ['RSV_A.LSS', 'RSV_B.LSS', 'RSV_Post.LSS', 'MPV_A.LSS', 'MPV_B.LSS', 'MPV_Post.LSS', 'PIV_3.LSS', 'XBB.1.LSS', 'BQ.1.1.LSS', 'SARS-1.LSS', 'OC43.LSS', 'HKU1.LSS', 'H3_HK68.LSS', 'H3_Perth19.LSS', 'H1_MI15.LSS', 'H1_NC99.LSS', 'H2_SG57.LSS', 'B_Wash19.LSS', 'H5_VN04.LSS', 'H5_IN05.LSS', 'H7_Anh13.LSS', 'H9_HK09.LSS', 'H10_JD13.LSS' ],
#     'Noro':['SYD_2012.LSS', 'CHDC.LSS', 'GII.17.LSS'],
#     # 'Flu': [ ]
#         }

In [18]:
grouped_dfs = {}

for viral_group in groups:
    viral_subset_df = final_df[groups[viral_group]]

    # if viral_group != 'PIV':
    #     upsetplot.plot(viral_subset_df.value_counts(), show_counts=True)
        
    #     fig = plt.gcf()
    #     name = viral_group.split('/')[0]
    #     # fig.savefig(name + '_upset_plot_10618-AA_24-01-05.png')
    # # break
    viral_subset_df = viral_subset_df.sum(axis=1)
    grouped_dfs[viral_group] = viral_subset_df
    # break

In [19]:
grouped_df = pd.DataFrame(grouped_dfs)

In [20]:
poly_df = grouped_df.apply(lambda x: x >=1) # .astype(int)

In [21]:
poly_ab_idx = poly_df[poly_df.sum(axis=1) > 1].index
len(poly_ab_idx)

528

In [22]:
final_df.drop(poly_ab_idx, inplace=True)

In [23]:
final_df.sum(axis=0).sum()

1928

In [25]:
final_df.shape

(1647, 29)

In [29]:
final_df.index = final_df.index.get_level_values(0) + '_' + final_df.index.get_level_values(1)

In [34]:
final_df.head()

Unnamed: 0,SYD_2012.LSS,CHDC.LSS,GII.17.LSS,BG505.LSS,B4.1.LSS,ZM197.LSS,XBB.1.LSS,BQ.1.1.LSS,SARS-1.LSS,OC43.LSS,...,H7_Anh13.LSS,H9_HK09.LSS,H10_JD13.LSS,RSV_A.LSS,RSV_B.LSS,RSV_Post.LSS,MPV_A.LSS,MPV_B.LSS,MPV_Post.LSS,PIV_3.LSS
donor9_CAGTAACCACGAGAGT-1649,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
donor9_GAAACTCTCTTGTCAT-3089,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
donor9_GCCAAATGTCTAACGT-3566,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
donor9_GGCGTGTGTACCTACA-3921,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
donor9_GGTATTGCACATGTGT-4032,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [47]:
final_df

Unnamed: 0,SYD_2012.LSS,CHDC.LSS,GII.17.LSS,BG505.LSS,B4.1.LSS,ZM197.LSS,XBB.1.LSS,BQ.1.1.LSS,SARS-1.LSS,OC43.LSS,...,H7_Anh13.LSS,H9_HK09.LSS,H10_JD13.LSS,RSV_A.LSS,RSV_B.LSS,RSV_Post.LSS,MPV_A.LSS,MPV_B.LSS,MPV_Post.LSS,PIV_3.LSS
donor9_CAGTAACCACGAGAGT-1649,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
donor9_GAAACTCTCTTGTCAT-3089,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
donor9_GCCAAATGTCTAACGT-3566,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
donor9_GGCGTGTGTACCTACA-3921,False,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
donor9_GGTATTGCACATGTGT-4032,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
donor3_GCATGCGAGGGATCTG-2822,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,True,True,False,False,False,False
donor3_GGTGTTAGTGCGATAG-3233,False,False,False,False,False,False,False,False,False,False,...,False,False,False,True,True,True,False,False,False,False
donor3_GTTCGGGTCTCTTATG-3493,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
donor3_TCTCTAAGTCGCATAT-4001,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False


In [50]:
seq_df = final_df.melt(ignore_index=False)
seq_df = seq_df[seq_df['value'] == True]

In [52]:
seq_df.to_csv('AbGPT-v2_10618_binding_24-03-06.csv')