In [None]:
import networkx as nx

import itertools
import requests
import urllib3

from collections import Counter
from collections import defaultdict

import numpy as np
from networkx.algorithms import shortest_paths
from scipy.cluster.hierarchy import linkage,dendrogram
from scipy import cluster
from matplotlib import pyplot as plt
import pickle


In [None]:
# Build the list of ligand IDs to exclude.
# The ID is the first whitespace-separated token of each non-blank line
# of 'ligands-to-exclude.txt'.

ligs2excl = []

with open('ligands-to-exclude.txt','r') as file:
    line_list = file.readlines()

for line in line_list:
    fields = line.split()
    # A blank line (for example a trailing empty line) carries no ID; indexing
    # fields[0] on it would raise IndexError, so skip it.
    if fields:
        ligs2excl.append(fields[0])
        

In [None]:
# Load the per-protein residue totals (indexed later as total_res_dict[protein]).
# NOTE(review): pickle.load can execute arbitrary code; only load a trusted file.

with open('total_res_dict.p','rb') as total_res_file:
    total_res_dict = pickle.load(total_res_file)

In [None]:
# Filter settings for the PDB Spheres data.
# Every value is a string because the settings are concatenated into the
# names of the input files and output directory further down.

# 'current' includes all templates currently available; 'all' includes all resolutions
datecut, resolutioncut = 'current', 'all'

# cut-offs that appear in the contacts file name
gdccut, Nccut, N4cut = '60', '15', '4'
ligsizecut, clcut = '8', '0'
            

In [None]:
# create ligand binding dictionary and calculate percentage of residues that each ligand binds
#
# Results:
#   ligand_dict[protein][ligand]  -> list of distinct residues listed for that ligand
#   fracres_dict[protein][ligand] -> len(residue list) / total_res_dict[protein]
#   ligs_leaveout[protein]        -> ligs2excl plus ligands whose fraction exceeds bind_thresh
#   all_ligs_remove               -> ligs2excl plus every ligand left out for any protein

ligand_dict = {}
fracres_dict = {}
ligs_leaveout = {}
all_ligs_remove = list(ligs2excl)
bind_thresh = 0.333

protein_names = ['E','S','ORF3a','nsp12','nsp13','nsp14','nsp15','nsp16','nsp3','nsp5','nsp7','nsp8','nsp9',\
                 'nsp1','nsp2','ORF7a','nsp4','nsp10','N','ORF8']

for protnow in protein_names:
    ligand_dict[protnow] = {}

# The contacts file does not depend on the protein, so read it once instead of
# once per protein, and let the context manager close it.
contacts_path = 'CCC.confidence_centroid_contacts.'+Nccut+'_10_'+gdccut+'_'+N4cut+'_'+clcut+'.ligs_'+ligsizecut+'.nCoV.'+datecut+'.res'+resolutioncut
with open(contacts_path,'r') as file:
    line_list = file.readlines()

for line in line_list:
    fields = line.split()
    if not fields:
        # blank line: nothing to parse
        continue

    name_parts = fields[0].split('.')
    id_parts = name_parts[0].split('_')

    # Reset for every line so a line with neither prefix is ignored rather
    # than silently attributed to the protein of the previous line.
    protein = None

    # viral protein
    if id_parts[0]=='nCoV':
        protein = id_parts[1]
        if protein=='Spike':
            protein = 'S'

    # human protein
    elif id_parts[0]=='sp':
        protein = id_parts[2]

    if protein not in ligand_dict:
        continue

    # ligand
    ligand = name_parts[6]

    # residues: comma-separated last column; the element after the final comma is dropped
    binding_residues = fields[-1].split(',')[:-1]
    if not binding_residues:
        continue

    known_residues = ligand_dict[protein].setdefault(ligand, [])
    for residue in binding_residues:
        if residue not in known_residues:
            known_residues.append(residue)

for protnow in protein_names:
    fracres_dict[protnow] = {}
    ligs_leaveout[protnow] = list(ligs2excl)

    # ligands are visited in sorted order, so fracres_dict and ligs_leaveout are filled in that order
    for lig in sorted(ligand_dict[protnow]):
        fracres_dict[protnow][lig] = float(len(ligand_dict[protnow][lig]))/float(total_res_dict[protnow])
        if fracres_dict[protnow][lig]>bind_thresh and lig not in ligs_leaveout[protnow]:
            ligs_leaveout[protnow].append(lig)

    for lig in ligs_leaveout[protnow]:
        if lig not in all_ligs_remove:
            all_ligs_remove.append(lig)

In [None]:
# Load the contact ligand residues into data structures
# Filter ligands by SMILES strings and percentage of residues they bind
# Filter PDB templates by date available, resolution, GDC value

def findOccurrences(s, ch):
    """Return, in increasing order, every index whose element of ``s`` equals ``ch``."""
    positions = []
    for index, letter in enumerate(s):
        if letter == ch:
            positions.append(index)
    return positions

# Read every line of the contacts file selected by the filter settings.
with open('CCC.confidence_centroid_contacts.'+Nccut+'_10_'+gdccut+'_'+N4cut+'_'+clcut+'.ligs_'+ligsizecut+'.nCoV.'+datecut+'.res'+resolutioncut) as M:
    wer=M.readlines()

# ncovdict[protein] is a 2-tuple of lists:
#   [0] every pair of contacts that occur together on one line
#   [1] every contact, one entry per occurrence
ncovdict=defaultdict(lambda: ([], [])) 
    
# keyed by '<protein>.<contact>':
#   ligdict  -> set of ligand ids that list this contact
#   filedict -> set of line prefixes (text before the first ':') that list this contact
ligdict=defaultdict(set)
filedict=defaultdict(set)

# running count of contacts per protein
all_contacts = {}
for protnow in ['E','S','ORF3a','nsp12','nsp13','nsp14','nsp15','nsp16','nsp3','nsp5','nsp7','nsp8','nsp9',\
                'nsp1','nsp2','ORF7a','nsp4','nsp10','N','ORF8']:
    all_contacts[protnow] = 0

for lin in wer:
    # only lines whose first token starts with 'nCoV' (before the first '_' / '.') are used
    if lin.split()[0].split('.')[0].split('_')[0]=='nCoV':
        # seventh dot-separated field of the first token
        ligand = lin.split()[0].split('.')[6]
        ncovfind=lin.find('nCoV_')
        underfind=findOccurrences(lin,'_')
        # protein name: text between the first underscore located after the start of
        # 'nCoV_' and the underscore that follows it
        virprot=lin[(underfind[min(k for k,x in enumerate(underfind) if x>ncovfind)]+1):underfind[min(k for k,x in enumerate(underfind) if x>ncovfind)+1]]
        if virprot=='Spike':
            virprot='S'     
        # skip ligands excluded for this protein (raises KeyError if virprot has no entry in ligs_leaveout)
        if ligand not in ligs_leaveout[virprot]:  
            fins=findOccurrences(lin, '.')
            spherfind=lin.find('.Sphere.')
            # ligand id: text between the first '.' located after the start of '.Sphere.'
            # and the '.' that follows it
            ligid=lin[(fins[min(k for k,x in enumerate(fins) if x>spherfind)]+1):fins[1+min(k for k,x in enumerate(fins) if x>spherfind)]]
            # last whitespace-separated column, without trailing commas/newline, split on ','
            contstr=lin.strip(',\n').split()[-1]
            conts=contstr.split(',')
            # counted before the validity check below, so skipped lines still contribute here
            all_contacts[virprot] = all_contacts[virprot] + len(conts)
            fileSrc=lin[:lin.find(':')]
            # flag contacts whose second-to-last character is not '_'
            nonodes=[1 if (not cont[-2]=='_') else 0 for cont in conts]
            # a single flagged contact discards the whole line
            if any(nonodes):
                continue
            ncovdict[virprot][1].extend(conts)
            for cont in conts:
                ligdict[virprot+'.'+cont].add(ligid)
                filedict[virprot+'.'+cont].add(fileSrc)
            # record every unordered pair of contacts from this line
            for pair in itertools.combinations(conts,2):
                ncovdict[virprot][0].append(pair)

print(all_contacts)


In [None]:
# Identify PDB Spheres contacts that contribute to each cluster
#
# cluster_spheres[protein][cluster key] -> list of filedict entries shared by both residues
# of at least one residue pair of that cluster which was recorded in ncovdict[protein][0].

directory = 'cluster-output-ncov-residues-shortestpath-CCC-'+Nccut+'-10-'+gdccut+'-'+N4cut+'-'+clcut+'.ligs_'+ligsizecut+'/date_'+datecut+'_res'+resolutioncut

# NOTE(review): pickle.load can execute arbitrary code; only load a trusted file.
with open(directory+'/cldict.p',"rb") as cldict_file:
    cldict = pickle.load(cldict_file)

cluster_spheres = {}

for protnow in ['S','nsp3','nsp5','nsp12','nsp13','nsp14','ORF3a','nsp9','nsp15','nsp16']:
    print(protnow)
    cluster_spheres[protnow] = {}

    # Hash the recorded pairs once per protein; testing membership against the raw
    # list costs a full scan for every residue pair of every pocket.
    contact_pairs = set(ncovdict[protnow][0])

    # each pocket
    for key in cldict[protnow].keys():
        shared_contacts = []
        already_added = set()   # mirrors shared_contacts for constant-time duplicate checks
        cluster_spheres[protnow][key] = shared_contacts

        # iterate through pairs of residues in pocket
        for res1, res2 in itertools.combinations(cldict[protnow][key]['residues'],2):
            # check if pair shares one or more contacts (pairs were stored in one order only)
            if (res1,res2) in contact_pairs or (res2,res1) in contact_pairs:
                for contact in filedict[protnow+'.'+res1]:
                    if contact in filedict[protnow+'.'+res2] and contact not in already_added:
                        already_added.add(contact)
                        shared_contacts.append(contact)
        

In [None]:
# open dictionary of PDB templates and their resolution (in Angstroms)
# NOTE(review): pickle.load can execute arbitrary code; only load a trusted file.

with open('template_resolution_dict.p',"rb") as resolution_file:
    resolution_dict = pickle.load(resolution_file)

print(len(resolution_dict))

# live view of the template ids that have a resolution entry
restempslist = resolution_dict.keys()


In [None]:
# open dictionary of PDB templates and their organism(s)
# NOTE(review): pickle.load can execute arbitrary code; only load a trusted file.

with open('template_organism_dict.p',"rb") as organism_file:
    organism_dict = pickle.load(organism_file)

In [None]:
# make list of unique organisms
# A template may list several organisms separated by '; '. Each organism is kept once,
# in order of first appearance.

unique_organisms_list = []
seen_organisms = set()   # mirrors unique_organisms_list for constant-time duplicate checks

for template,organism in organism_dict.items():
    # split('; ') returns the whole string as a single item when there is no separator,
    # so single- and multi-organism entries take the same path
    for org in organism.split('; '):
        if org not in seen_organisms:
            seen_organisms.add(org)
            unique_organisms_list.append(org)

print(len(unique_organisms_list))
with open('unique_organisms_list.p', 'wb') as pickle_out:
    pickle.dump(unique_organisms_list,pickle_out)

# one organism per line, no newline after the last one; an empty list yields an empty file
with open('unique_organisms.txt','w') as f:
    f.write('\n'.join(unique_organisms_list))
    

In [None]:
# open organism classification dictionary and re-classify some 'Unknown' organisms by name
# NOTE(review): pickle.load can execute arbitrary code; only load a trusted file.

with open('organism_classification_dict.p',"rb") as classification_file:
    organism_class = pickle.load(classification_file)


unknown_class = [key for key,value in organism_class.items() if value=='Unknown']
print(len(unknown_class))

# (name fragment, classification) rules, tried in this order; the first fragment
# contained in the organism name decides the classification
substring_rules = [
    ('Clostridium','Bacteria'),
    ('Bacillus','Bacteria'),
    ('Haemophilus','Bacteria'),
    ('Ruminococcus','Bacteria'),
    ('Eubacterium','Bacteria'),
    ('Leptolyngbya','Bacteria'),
    ('Kitasatospora','Bacteria'),
    ('Mannheimia','Bacteria'),
    ('Streptococcus','Bacteria'),
    ('Enterobacter','Bacteria'),
    ('Pseudomonas','Bacteria'),
    ('Escherichia coli','Bacteria'),
    ('[Chlorella]','Viridiplantae'),
    ('virus','Viruses'),
    ('[Candida]','Fungi'),
]
# full-name rules, consulted only when no fragment rule matched
exact_rules = {'Physeter catodon': 'Mammalia'}

for uc in unknown_class:
    for fragment, new_class in substring_rules:
        if fragment in uc:
            organism_class[uc] = new_class
            break
    else:
        if uc in exact_rules:
            organism_class[uc] = exact_rules[uc]

# report what is still unclassified after applying the rules
unknown_class = [key for key,value in organism_class.items() if value=='Unknown']
print(len(unknown_class))
print(unknown_class)
    

In [None]:
# make horizontal stacked bar charts showing composition of clusters PDB data in terms of organisms
# results aggregated by organism classification
#
# Results:
#   org_comp_dict[protein][pocket]        -> fraction of organism tallies per entry of organism_categories
#   unique_template_dict[protein][pocket] -> distinct protein/ligand template ids, in order of first use
#   virus_dict[protein][pocket][organism] -> tally of organisms classified as 'Viruses'


organism_categories = ['Severe acute respiratory syndrome coronavirus 2',\
          'Severe acute respiratory syndrome-related coronavirus','Viruses','Homo sapiens','Other','Unknown']

# Retired PDB templates and the templates that replaced them. Same table as the
# GDC-by-organism cell further down, so both cells resolve the same ligand templates.
retired_templates = {'6kpu': '7ddf', '6kpv': '7ddl', '6kpw': '7ddh', '6kpx': '7ddi', '6kpy': '7ddg',
                     '6kpz': '7ddj', '6kq0': '7ddk', '7ks5': '7lfe', '7kw5': '7ldx', '7kfj': '7lfp'}


def _tally_template_organisms(template, class_counts, virus_counts):
    """Count the organism(s) of ``template`` and return their display name.

    ``organism_dict[template]`` holds one organism name or several joined by '; '.
    For every organism, ``class_counts`` is incremented under its ``organism_class``
    entry, and ``virus_counts`` under the organism name when that class is 'Viruses'.
    Both dicts are updated in place.

    Returns the organism names sorted and joined by ', '.
    Raises KeyError when the template or one of its organisms is not in the lookup dicts.
    """
    org_names = organism_dict[template].split('; ')
    for org_name in org_names:
        org_cls = organism_class[org_name]
        class_counts[org_cls] = class_counts.get(org_cls, 0) + 1
        if org_cls=='Viruses':
            virus_counts[org_name] = virus_counts.get(org_name, 0) + 1
    return ', '.join(sorted(org_names))


org_comp_dict = {}
unique_template_dict = {}
virus_dict = {}

# protein
for protnow in ['nsp5','nsp12','S']: 
    org_comp_dict[protnow] = {}
    unique_template_dict[protnow] = {}
    virus_dict[protnow] = {}
    
    # pocket
    for pocket in cluster_spheres[protnow]:
        unique_template_dict[protnow][pocket] = []
        organism_counts = {}
        organism_class_counts = {}
        resolution_templates = []
        resolutions = []
        virus_dict[protnow][pocket] = {}
        
        # for each contact, collect organism and resolution data for templates
        for contact in cluster_spheres[protnow][pocket]:
            try:
                name_parts = contact.split('.')
                proteintemplate = name_parts[3].split('_')[0]
                ligandtemplate = name_parts[8].split('_')[0]
                ligandtemplate = retired_templates.get(ligandtemplate, ligandtemplate)
                
                for template in (proteintemplate, ligandtemplate):
                    if template not in unique_template_dict[protnow][pocket]:
                        unique_template_dict[protnow][pocket].append(template)
                
                # an 'xray' protein template has no organism entry: it is named
                # 'Unknown' and adds nothing to the classification tallies
                if proteintemplate == 'xray':
                    proteinorg = 'Unknown'
                else:
                    proteinorg = _tally_template_organisms(proteintemplate, organism_class_counts,
                                                           virus_dict[protnow][pocket])
                ligandorg = _tally_template_organisms(ligandtemplate, organism_class_counts,
                                                      virus_dict[protnow][pocket])
                
                for org_name in (proteinorg, ligandorg):
                    organism_counts[org_name] = organism_counts.get(org_name, 0) + 1
                
                for template in (proteintemplate, ligandtemplate):
                    if template in resolution_dict:
                        template_res = resolution_dict[template]
                        # '90.00' and '99.99' are excluded from the resolution list
                        if template_res!='90.00' and template_res!='99.99':
                            resolutions.append(template_res)
                            if template not in resolution_templates:
                                resolution_templates.append(template)
                
            except Exception:
                # best effort: report the contact that could not be processed and move on
                # (Exception rather than a bare except, so Ctrl-C still interrupts the cell)
                print(contact)
        
        
        # bar chart of organisms: the four named categories, then 'Other', then 'Unknown'
        named_categories = organism_categories[:4]
        organism_category_counts = [organism_class_counts.get(org_category, 0)
                                    for org_category in named_categories]
        
        total_other = sum(value for key, value in organism_class_counts.items()
                          if key not in named_categories and key!='Unknown')
        
        # .get: a pocket without any 'Unknown' organism must count 0 rather than raise KeyError
        organism_category_counts.extend([total_other,organism_class_counts.get('Unknown', 0)])
        
        total = sum(organism_category_counts)
        if total>0:
            organism_category_percents = [float(orgcount)/float(total) for orgcount in organism_category_counts]
        else:
            # nothing was tallied for this pocket; avoid dividing by zero
            organism_category_percents = [0.0 for orgcount in organism_category_counts]
        
        org_comp_dict[protnow][pocket] = organism_category_percents
              
with open('org_comp_dict_stacked_barcharts_'+datecut+'.pkl','wb') as comp_file:
    pickle.dump(org_comp_dict,comp_file)

In [None]:
def plot_pocket_comp(results, category_names):
    """Draw a horizontal stacked bar chart with one bar per entry of ``results``.

    Parameters
    ----------
    results : dict
        Maps a bar label to a sequence of numbers, one per category, in the
        order of ``category_names``. All sequences must have the same length.
    category_names : sequence of str
        Category names, one per stacked segment. At most six categories are
        drawn, because six colours are defined.

    Returns
    -------
    (fig, ax) : the matplotlib figure and axes that were created.
    """
    # Short legend names for the two long coronavirus category names; any other
    # name is shown unchanged.
    legend_names = {'Severe acute respiratory syndrome coronavirus 2': 'SARS-CoV-2',
                    'Severe acute respiratory syndrome-related coronavirus': 'SARS-CoV-1'}

    labels = list(results.keys())
    data = np.array(list(results.values()))
    data_cum = data.cumsum(axis=1)
    
    category_colors = ['tab:blue','tab:orange','tab:purple','tab:green','tab:red','tab:gray']

    fig, ax = plt.subplots(figsize=(9.2, 5))
    ax.invert_yaxis()
    ax.xaxis.set_visible(False)
    ax.set_xlim(0, np.sum(data, axis=1).max())

    for i, (colname, color) in enumerate(zip(category_names, category_colors)):
        widths = data[:, i]
        # each segment starts where the previous categories end
        starts = data_cum[:, i] - widths
        # The legend label is attached to the segment itself, so the legend always
        # follows category_names instead of a separate hard-coded list.
        ax.barh(labels, widths, left=starts, height=0.5,
                label=legend_names.get(colname, colname), color=color)

    ax.legend(ncol=len(category_names), bbox_to_anchor=(0, 1),
              loc='lower left', fontsize='small')


    return fig, ax

In [None]:
# pre and during pandemic bar charts
# Loads two saved composition dictionaries and compares pocket 1 of nsp5.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.

with open('org_comp_dict_stacked_barcharts_2020_02_04.pkl',"rb") as prepandemic_file:
    prepandemic_comp = pickle.load(prepandemic_file)
with open('org_comp_dict_stacked_barcharts_current.pkl',"rb") as pandemic_file:
    pandemic_comp = pickle.load(pandemic_file)

pockets_to_compare = {'nsp5, Pocket 1, current': pandemic_comp['nsp5'][1], 'nsp5, Pocket 1, pre-pandemic': prepandemic_comp['nsp5'][1]}

plot_pocket_comp(pockets_to_compare,organism_categories)
plt.show()


In [None]:
# nsp12, pocket 1: composition from the 'current' file next to the pre-pandemic one

pockets_to_compare = dict([
    ('nsp12, Pocket 1, current', pandemic_comp['nsp12'][1]),
    ('nsp12, Pocket 1, pre-pandemic', prepandemic_comp['nsp12'][1]),
])

plot_pocket_comp(pockets_to_compare,organism_categories)
plt.show()


In [None]:
# collect GDC scores for each organism classification over time
#
# Results:
#   avgGDC[org_class]             -> list of (datecut, mean GDC, number of values, std) tuples
#   GDCvalues[org_class][datecut] -> list of GDC values (floats)
# A date cut-off with no matching line adds no entry for that class.

datecut_list = ['2019_07_09','2020_03_10','2020_06_23','2020_09_29','2021_02_23','2021_04_20','2021_06_29','2021_10_05','current']

org_class_list = ['Severe acute respiratory syndrome coronavirus 2',\
                  'Severe acute respiratory syndrome-related coronavirus','Homo sapiens', 'Mammalia',\
                  'Viruses','Bacteria', 'Fungi', 'Sauropsida','Insecta', 'Viridiplantae','Archaea',\
                  'Other eukaryote','Unknown']


avgGDC = {}
GDCvalues = {}
resolutioncut = 'all'
gdccut = '60'
Nccut = '15'
N4cut = '4'
ligsizecut = '8'
clcut = '0'

# update retired PDB templates to current templates
retired_templates = {'6kpu': '7ddf', '6kpv': '7ddl', '6kpw': '7ddh', '6kpx': '7ddi', '6kpy': '7ddg',
                     '6kpz': '7ddj', '6kq0': '7ddk', '7ks5': '7lfe', '7kw5': '7ldx', '7kfj': '7lfp'}

for org_class in org_class_list:
    avgGDC[org_class] = []
    GDCvalues[org_class] = {}

for datecut in datecut_list:
    # Each date file is read once and closed, rather than re-opened for every
    # organism class and left open.
    with open('CCC.confidence_centroid_contacts.'+Nccut+'_10_'+gdccut+'_'+N4cut+'_'+clcut+'.ligs_'+ligsizecut+'.nCoV.'+datecut+'.res'+resolutioncut,'r') as file:
        line_list = file.readlines()

    # one (classes of the ligand template's organisms, GDC column) entry per line
    scored_lines = []
    for line in line_list:
        fields = line.split()
        if not fields:
            # blank line: nothing to parse
            continue
        ligandtemplate = fields[0].split('.')[8].split('_')[0]
        gdc = fields[10]
        ligandtemplate = retired_templates.get(ligandtemplate, ligandtemplate)

        # a template may list several organisms separated by '; '
        ligand_classes = {organism_class[ligorg] for ligorg in organism_dict[ligandtemplate].split('; ')}
        scored_lines.append((ligand_classes, gdc))

    for org_class in org_class_list:
        # a line counts for a class when any of its ligand organisms belongs to it
        GDClist = [float(gdc) for ligand_classes, gdc in scored_lines if org_class in ligand_classes]

        if len(GDClist)>0:
            avg_gdc = np.mean(GDClist)
            std_gdc = np.std(GDClist)
            avgGDC[org_class].append((datecut,avg_gdc,len(GDClist),std_gdc))
            GDCvalues[org_class][datecut] = GDClist

In [None]:
# overlapping histograms
# distribution of GDC scores per organism classification; only the values stored
# under the 'current' date cut-off are included

org_class_list_reorder = [
    'Severe acute respiratory syndrome-related coronavirus',
    'Severe acute respiratory syndrome coronavirus 2',
    'Homo sapiens', 'Mammalia', 'Viruses', 'Bacteria', 'Fungi', 'Sauropsida', 'Insecta',
    'Viridiplantae', 'Archaea', 'Other eukaryote', 'Unknown',
]

# same classes, same order
org_class_list = list(org_class_list_reorder)

colors_dict = {
    'Severe acute respiratory syndrome coronavirus 2': 'tab:blue',
    'Severe acute respiratory syndrome-related coronavirus': 'tab:orange',
    'Homo sapiens': 'tab:green',
    'Mammalia': 'tab:red',
    'Viruses': 'tab:purple',
    'Bacteria': 'tab:brown',
    'Fungi': 'tab:pink',
    'Sauropsida': 'tab:cyan',
    'Insecta': 'tab:olive',
    'Unknown': 'tab:gray',
    'Viridiplantae': 'm',
    'Archaea': 'lightgreen',
    'Other eukaryote': 'navy',
}

fig, ax = plt.subplots(figsize=(8,5))

# bins_dict[org_class] holds what plt.hist returned for that class
bins_dict = {}

for org_class in org_class_list_reorder:
    GDCalldates = []
    for datecut in datecut_list:
        if datecut!='current' or datecut not in GDCvalues[org_class]:
            continue
        GDCalldates.extend(GDCvalues[org_class][datecut])

    bins_dict[org_class] = plt.hist(GDCalldates,bins=41,range=(60,100),color=colors_dict[org_class],alpha=0.5)

plt.xlabel('GDC metric',fontsize=16)
plt.ylabel('Count',fontsize=16)
plt.xticks(fontsize=14,rotation=90)
plt.yticks(fontsize=14)
plt.show()

# second figure: the same histograms as lines, each divided by its total count
fig, ax = plt.subplots(figsize=(12,7.5))

for org_class in org_class_list_reorder:
    hist_counts = bins_dict[org_class][0]
    hist_edges = bins_dict[org_class][1]
    rescaled_bins = hist_counts/sum(hist_counts)
    # plotted against the left edge of every bin
    plt.plot(hist_edges[0:-1],rescaled_bins,color=colors_dict[org_class],linewidth=3)

plt.xlabel('GDC metric',fontsize=18)
plt.ylabel('Density',fontsize=18)
plt.xticks(fontsize=16,rotation=90)
plt.yticks(fontsize=16)
plt.legend(org_class_list_reorder,loc=(1.05,0.32),fontsize=14)
plt.show()

In [None]:
# collect GDC scores of each SARS-2 protein over time
#
# Results (these rebind avgGDC and GDCvalues, replacing the per-organism versions):
#   avgGDC[protein]             -> list of (datecut, mean GDC, number of values, std) tuples
#   GDCvalues[protein][datecut] -> list of GDC values (floats)
# A date cut-off with no line for a protein adds no entry for that protein.

datecut_list = ['2019_07_09','2020_03_10','2020_06_23','2020_09_29','2021_02_23','2021_04_20','2021_06_29','2021_10_05','current']

vir_prot_list = ['S','ORF3a','nsp3','nsp5','nsp12','nsp7','nsp8','nsp13','nsp14','nsp15','nsp16']


avgGDC = {}
GDCvalues = {}
resolutioncut = 'all'
gdccut = '60'
Nccut = '15'
N4cut = '4'
ligsizecut = '8'
clcut = '0'

for virprot in vir_prot_list:
    avgGDC[virprot] = []
    GDCvalues[virprot] = {}

for datecut in datecut_list:
    # Each date file is read once and closed, rather than re-opened for every
    # protein and left open.
    with open('CCC.confidence_centroid_contacts.'+Nccut+'_10_'+gdccut+'_'+N4cut+'_'+clcut+'.ligs_'+ligsizecut+'.nCoV.'+datecut+'.res'+resolutioncut,'r') as file:
        line_list = file.readlines()

    # GDC values of this date file grouped by protein, in file order
    gdc_by_protein = {virprot: [] for virprot in vir_prot_list}
    for line in line_list:
        fields = line.split()
        if not fields:
            # blank line: nothing to parse
            continue
        protein = fields[0].split('.')[0].split('_')[1]
        gdc = fields[10]
        if protein=='Spike':
            protein = 'S'
        if protein in gdc_by_protein:
            gdc_by_protein[protein].append(float(gdc))

    for virprot in vir_prot_list:
        GDClist = gdc_by_protein[virprot]
        if len(GDClist)>0:
            avg_gdc = np.mean(GDClist)
            std_gdc = np.std(GDClist)
            avgGDC[virprot].append((datecut,avg_gdc,len(GDClist),std_gdc))
            GDCvalues[virprot][datecut] = GDClist

In [None]:
# line plots of number of templates above GDC cutoff per SARS-2 protein over time (across all organisms) 
#
# GDCallorgs[datecut][protein] -> GDC column (as text) of every line of that protein
# whose GDC is at least gdccut

datecut_list = ['2019_07_09','2020_03_10','2020_06_23','2020_09_29','2021_02_23','2021_04_20','2021_06_29','2021_10_05','current']
gdccut = '60'

vir_prot_list = ['S','ORF3a','nsp3','nsp5','nsp12','nsp7','nsp8','nsp13','nsp14','nsp15','nsp16']

colors_dict = {'S':'tab:blue','nsp5':'tab:orange','nsp12':'tab:green','nsp3':'tab:red','ORF3a':'tab:purple',\
          'nsp7':'tab:brown','nsp8':'tab:pink','nsp13':'tab:cyan','nsp14':'tab:olive','nsp15':'tab:gray','nsp16':'m'}

GDCallorgs = {}
for datecut in datecut_list:
    # Each date file is read once and closed, rather than re-opened for every
    # protein and left open.
    with open('CCC.confidence_centroid_contacts.'+Nccut+'_10_'+gdccut+'_'+N4cut+'_'+clcut+'.ligs_'+ligsizecut+'.nCoV.'+datecut+'.res'+resolutioncut,'r') as file:
        line_list = file.readlines()

    GDCallorgs[datecut] = {virprot: [] for virprot in vir_prot_list}
    for line in line_list:
        fields = line.split()
        if not fields:
            # blank line: nothing to parse
            continue
        gdc = fields[10]
        protein = fields[0].split('.')[0].split('_')[1]
        if protein=='Spike':
            protein = 'S'
        if protein in GDCallorgs[datecut] and float(gdc)>=float(gdccut):
            GDCallorgs[datecut][protein].append(gdc)
                
fig, ax = plt.subplots(figsize=(8,5))

# All proteins go on the one figure created above. Each line carries its own
# label, so the legend entry and colour always belong to the same protein.
for virprot in vir_prot_list:
    pair_counts = [len(GDCallorgs[datecut][virprot]) for datecut in datecut_list]
    ax.plot(datecut_list,pair_counts,marker='o',color=colors_dict[virprot],label=virprot)

ax.set_xlabel('Date cutoff',fontsize=16)
ax.set_ylabel('Number of protein-ligand template \npairs with GDC above '+gdccut,fontsize=16)
plt.xticks(fontsize=14,rotation=90)
plt.yticks(fontsize=14)
ax.legend(loc=(1.05,0.14),fontsize=14)
plt.show()

In [None]:
# line plots of number of unique protein templates per SARS-2 protein over time (across all organisms) 
#
# protein_templates_allorgs[datecut][protein] -> distinct protein template ids found
# for that protein, in order of first appearance in the file

datecut_list = ['2019_07_09','2020_03_10','2020_06_23','2020_09_29','2021_02_23','2021_04_20','2021_06_29','2021_10_05','current']
gdccut = '60'

vir_prot_list = ['S','ORF3a','nsp3','nsp5','nsp12','nsp7','nsp8','nsp13','nsp14','nsp15','nsp16']

colors_dict = {'S':'tab:blue','nsp5':'tab:orange','nsp12':'tab:green','nsp3':'tab:red','ORF3a':'tab:purple',\
          'nsp7':'tab:brown','nsp8':'tab:pink','nsp13':'tab:cyan','nsp14':'tab:olive','nsp15':'tab:gray','nsp16':'m'}

protein_templates_allorgs = {}
for datecut in datecut_list:
    # Each date file is read once and closed, rather than re-opened for every
    # protein and left open.
    with open('CCC.confidence_centroid_contacts.'+Nccut+'_10_'+gdccut+'_'+N4cut+'_'+clcut+'.ligs_'+ligsizecut+'.nCoV.'+datecut+'.res'+resolutioncut,'r') as file:
        line_list = file.readlines()

    protein_templates_allorgs[datecut] = {virprot: [] for virprot in vir_prot_list}
    for line in line_list:
        fields = line.split()
        if not fields:
            # blank line: nothing to parse
            continue
        name_parts = fields[0].split('.')
        protein = name_parts[0].split('_')[1]
        proteintemplate = name_parts[3].split('_')[0]
        if protein=='Spike':
            protein = 'S'
        if protein in protein_templates_allorgs[datecut] and proteintemplate not in protein_templates_allorgs[datecut][protein]:
            protein_templates_allorgs[datecut][protein].append(proteintemplate)
                
fig, ax = plt.subplots(figsize=(8,5))

# All proteins go on the one figure created above. Each line carries its own
# label, so the legend entry and colour always belong to the same protein.
for virprot in vir_prot_list:
    template_counts = [len(protein_templates_allorgs[datecut][virprot]) for datecut in datecut_list]
    ax.plot(datecut_list,template_counts,marker='o',color=colors_dict[virprot],label=virprot)

ax.set_xlabel('Date cutoff',fontsize=16)
ax.set_ylabel('Number of unique protein \ntemplates',fontsize=16)
plt.xticks(fontsize=14,rotation=90)
plt.yticks(fontsize=14)
ax.legend(loc=(1.05,0.14),fontsize=14)
plt.show()