## Manually get _public_ targets

In [98]:
# Target names exactly as pasted from the site. The paste also carried
# 'Open SGC summary' lines (presumably link captions); they are blanked out
# before the text is split on whitespace into the list of names.

_pasted_targets = '''NUDT7A
Open SGC summary
ATAD
BRD1A
Open SGC summary
DCP2B
FAM83BA
Open SGC summary
MURD
NUDT4A
Open SGC summary
OXA10OTA
PARP14A
Open SGC summary
PHIPA
PTP1B
SMTGR
ACVR1A
ATAD2A
CAMK1DA
DCLRE1AA
Open SGC summary
FALZA
Open SGC summary
HAO1A
Open SGC summary
MUREECA
NUDT21A
NUDT4
NUDT5A
NUDT7A_CRUDE
smTGRNEW
STAG1A
TBXTA
VIM2
XX02KALRNA
TNCA
ALAS2A
EPB41L3A
mArh
INPP5DA
nsp13
Mac1
Mpro
NSP15_B
MUREECOLI
PGN_RS02895PGA
CD44MMA
Nprot
macro-combi
NSP16
NSP14
MID2A
SOS3ATA'''
targets = _pasted_targets.replace('Open SGC summary', '').split()

## Blast for seq to Uniprot

In [None]:
# Note that the first HSP in alignment.hsps may not be the correct one, and
# identical sequences may be merged into a single (pipe-separated) header.

import functools, re, operator, io
from typing import Set, List, Dict, Union, Tuple
import pandas as pd
import requests
from Bio.Blast import NCBIXML
from Bio.Blast import NCBIWWW
from Bio.Blast import Record
from fragalysis_api import GetTargetsData

# Look up every target by name and record its id and sequence.
ids: Dict[str, int] = {}
seqs: Dict[str, str] = {}
for target_name in targets:
    ts = GetTargetsData()
    ts.set_target_name_url(target_name)
    ts.get_target_json()
    # Only the first search result, and the first sequence listed for it,
    # are used.
    top_result = ts.target_json['results'][0]
    ids[target_name] = top_result['id']
    seqs[target_name] = top_result['sequences'][0]['sequence']
    
# get uniprot id
# BLAST every non-empty target sequence against Swiss-Prot. For each target
# keep the top alignment and the subject residue span of that alignment's
# first HSP.
alignments: Dict[str, Record.Alignment] = {}
ranges: Dict[str, Tuple[int, int]] = {}
for target_name, seq in seqs.items():
    if not seq:
        # Nothing to search with.
        continue
    if target_name in alignments:
        # NOTE(review): `alignments` is re-created just above, so this guard
        # can only fire when the initialisation is skipped on a re-run --
        # confirm that resuming is the intent.
        continue
    blast: io.StringIO = NCBIWWW.qblast(program='blastp',
                                        database='swissprot',  # 'refseq_protein',
                                        sequence=seq,
                                        )

    # One query was submitted, so only the first record is of interest.
    blast_record: Record.Blast = next(NCBIXML.parse(blast))
    if not blast_record.alignments:
        # No hit: skip this target instead of dying on alignments[0] and
        # abandoning the remaining targets.
        print(f'No BLAST hit for {target_name}; skipping')
        continue
    alignment: Record.Alignment = blast_record.alignments[0]
    hsp: Record.HSP = alignment.hsps[0]
    alignments[target_name] = alignment
    ranges[target_name] = (hsp.sbjct_start, hsp.sbjct_end)

## Get Uniprot data

In [111]:
# get uniprot data
# The accession is the second pipe-delimited field of the BLAST hit title,
# minus its trailing '.<version>' suffix. Compiled once, outside the loop.
accession_pattern = re.compile(r'.*?\|([^|]*)\.\d+\|')
uniprot_data: Dict[str, dict] = {}
for target_name, alignment in alignments.items():
    title_match = accession_pattern.match(alignment.title)
    if title_match is None:
        # Fail with the offending title rather than an opaque
        # "'NoneType' object has no attribute 'group'".
        raise ValueError(
            f'Cannot parse an accession from BLAST title {alignment.title!r} '
            f'(target {target_name})'
        )
    uniprot_id: str = title_match.group(1)
    # A timeout keeps one stalled connection from hanging the whole run.
    response = requests.get(f'https://rest.uniprot.org/uniprotkb/{uniprot_id}.json',
                            timeout=60)
    response.raise_for_status()
    uniprot_data[target_name] = response.json()

## Get molecule counts

In [211]:
# The fragalysis-api is utterly non-standard and prints stuff

from fragalysis_api import GetMoleculesData
from IPython.display import display, clear_output

# For every target, keep the molecule records and the reported count, both
# keyed by target name.
mol_data: Dict[str, dict] = {}
mol_counts: Dict[str, int] = {}
for target_name in ids:
    search = GetMoleculesData()
    search.set_target_id(target_name)
    search.set_molecule_url()
    search.set_mol_data()
    listing = search.get_mol_data
    mol_counts[target_name] = listing['count']
    mol_data[target_name] = listing['results']
    # Wipe whatever was printed to the cell output during this iteration.
    clear_output()

## combine into table

In [213]:
def get_inner(key):
    """Map each target to the top-level ``key`` of its UniProt entry (None if absent)."""
    picked = {}
    for target, entry in uniprot_data.items():
        picked[target] = entry.get(key)
    return picked


def get_org_inner(key):
    """Map each target to ``key`` of its entry's 'organism' record (None if absent)."""
    picked = {}
    for target, entry in uniprot_data.items():
        picked[target] = entry['organism'].get(key)
    return picked


def get_fun_inner(fun):
    """Map each target to ``fun`` applied to its UniProt entry."""
    computed = {}
    for target, entry in uniprot_data.items():
        computed[target] = fun(entry)
    return computed

def get_recommended(d):
    """Return the recommended full protein name of a UniProt entry.

    :param d: UniProt entry as parsed JSON (nested dicts and lists).
    :return: the value at proteinDescription / recommendedName / fullName /
        value, or ``''`` when any step of that path is missing.
    """
    try:
        return d['proteinDescription']['recommendedName']['fullName']['value']
    except (KeyError, IndexError, TypeError):
        # Only failed lookups mean "no name"; any other error is a real bug
        # and is left to propagate.
        return ''
    
def get_alternativeNames(d):
    """Return the alternative full protein names of a UniProt entry.

    :param d: UniProt entry as parsed JSON (nested dicts and lists).
    :return: tuple with the fullName value of every entry under
        proteinDescription / alternativeNames, or ``()`` when that path, or
        the name of any single entry, is missing.
    """
    try:
        alternatives = d['proteinDescription']['alternativeNames']
        return tuple(alt['fullName']['value'] for alt in alternatives)
    except (KeyError, IndexError, TypeError):
        # Only failed lookups mean "no names"; any other error is a real bug
        # and is left to propagate.
        return ()
    
def get_gene(d):
    """Return the name of the first gene listed in a UniProt entry.

    :param d: UniProt entry as parsed JSON (nested dicts and lists).
    :return: the value at genes[0] / geneName / value, or ``''`` when the
        entry lists no gene or the first gene has no geneName.
    """
    try:
        return d['genes'][0]['geneName']['value']
    except (KeyError, IndexError, TypeError):
        # Only failed lookups mean "no gene"; any other error is a real bug
        # and is left to propagate.
        return ''
    
    
def get_alt_genes(d):
    """Return the synonyms of the first gene listed in a UniProt entry.

    :param d: UniProt entry as parsed JSON (nested dicts and lists).
    :return: tuple with the value of every entry under genes[0] / synonyms,
        or ``()`` when that path, or the value of any synonym, is missing.
    """
    try:
        return tuple(synonym['value'] for synonym in d['genes'][0]['synonyms'])
    except (KeyError, IndexError, TypeError):
        # Only failed lookups mean "no synonyms"; any other error is a real
        # bug and is left to propagate.
        return ()
    
def get_ec(d):
    """Return the EC numbers attached to the recommended name of a UniProt entry.

    :param d: UniProt entry as parsed JSON (nested dicts and lists).
    :return: tuple with the value of every entry under proteinDescription /
        recommendedName / ecNumbers, or ``()`` when that path, or the value of
        any single entry, is missing.
    """
    try:
        ec_entries = d['proteinDescription']['recommendedName']['ecNumbers']
        return tuple(ec['value'] for ec in ec_entries)
    except (KeyError, IndexError, TypeError):
        # Only failed lookups mean "no EC numbers"; any other error is a real
        # bug and is left to propagate.
        return ()
    
def get_comment_fun(d):
    """Return the texts of all FUNCTION comments of a UniProt entry.

    :param d: UniProt entry as parsed JSON (nested dicts and lists).
    :return: tuple with the value of every text of every comment whose
        commentType is 'FUNCTION', in entry order; ``()`` when the entry has
        no comments or any expected key is missing along the way.
    """
    try:
        return tuple(
            text['value']
            for comment in d['comments']
            if comment['commentType'] == 'FUNCTION'
            for text in comment['texts']
        )
    except (KeyError, IndexError, TypeError):
        # Only failed lookups mean "nothing to report"; any other error is a
        # real bug and is left to propagate.
        return ()
    
def get_comment_cat(d):
    """Return the reaction names of all CATALYTIC ACTIVITY comments of a UniProt entry.

    :param d: UniProt entry as parsed JSON (nested dicts and lists).
    :return: tuple with reaction / name of every comment whose commentType is
        'CATALYTIC ACTIVITY', in entry order; ``()`` when the entry has no
        comments or any expected key is missing along the way.
    """
    try:
        return tuple(
            comment['reaction']['name']
            for comment in d['comments']
            if comment['commentType'] == 'CATALYTIC ACTIVITY'
        )
    except (KeyError, IndexError, TypeError):
        # Only failed lookups mean "nothing to report"; any other error is a
        # real bug and is left to propagate.
        return ()
    
def get_comment_inter(d):
    """Return the texts of all SUBUNIT comments of a UniProt entry.

    :param d: UniProt entry as parsed JSON (nested dicts and lists).
    :return: tuple with the value of every text of every comment whose
        commentType is 'SUBUNIT', in entry order; ``()`` when the entry has no
        comments or any expected key is missing along the way.
    """
    try:
        return tuple(
            text['value']
            for comment in d['comments']
            if comment['commentType'] == 'SUBUNIT'
            for text in comment['texts']
        )
    except (KeyError, IndexError, TypeError):
        # Only failed lookups mean "nothing to report"; any other error is a
        # real bug and is left to propagate.
        return ()
    
def get_domains(data=None, spans=None):
    """List, per target, the UniProt 'Domain' features overlapping its matched span.

    :param data: mapping of target name to UniProt entry (parsed JSON);
        defaults to the module-level ``uniprot_data``.
    :param spans: mapping of target name to an inclusive ``(start, end)``
        residue span; defaults to the module-level ``ranges``.
    :return: dict mapping each target to a tuple of domain descriptions,
        ordered from the shortest domain to the longest. A target whose
        entry has no 'features' key gets ``()``.
    :raises KeyError: if a target in ``data`` has no entry in ``spans``.
    """
    if data is None:
        data = uniprot_data
    if spans is None:
        spans = ranges
    descriptions = {}
    for target, entry in data.items():
        span_start, span_end = spans[target]
        overlapping = []
        # An entry without 'features' yields no domains instead of a KeyError.
        for feat in entry.get('features', ()):
            if feat['type'] != 'Domain':
                continue
            start = feat['location']['start']['value']
            end = feat['location']['end']['value']
            # Closed intervals overlap iff each one starts before the other ends.
            if span_start <= end and span_end >= start:
                overlapping.append((feat['description'], end - start))
        # Stable sort on length, so equally long domains keep entry order.
        overlapping.sort(key=operator.itemgetter(1))
        descriptions[target] = tuple(name for name, _ in overlapping)
    return descriptions

# Assemble the summary table: every column is a dict keyed by target name.
# NOTE: this rebinds `targets` from the list of target names to the DataFrame.
# `uniprot_id` holds the entry's primaryAccession.
table_columns = dict(
    target_id=ids,
    sequence_length={name: len(sequence) for name, sequence in seqs.items()},
    N_hits=mol_counts,
    uniprot_ranges=ranges,
    uniprot_id=get_inner('primaryAccession'),
    protein_name=get_fun_inner(get_recommended),
    gene_name=get_fun_inner(get_gene),
    organism_scientific_name=get_org_inner('scientificName'),
    organism_common_name=get_org_inner('commonName'),
    organism_taxon_id=get_org_inner('taxonId'),
    protein_alt_names=get_fun_inner(get_alternativeNames),
    alt_gene_names=get_fun_inner(get_alt_genes),
    domains=get_domains(),
    catalysis=get_fun_inner(get_comment_cat),
    EC_numbers=get_fun_inner(get_ec),
    function=get_fun_inner(get_comment_fun),
    interaction=get_fun_inner(get_comment_inter),
)
targets = pd.DataFrame(table_columns)

In [215]:
import pickle, json

# FASTA of the target sequences: a '>name' header line, then the sequence.
with open('targets.fasta', 'w') as fh:
    for t, s in seqs.items():
        print(f'>{t}', s, sep='\n', file=fh)

# BLAST alignments and molecule records, pickled as-is.
for pickle_path, payload in (('targets_blast.p', alignments),
                             ('targets_mol.p', mol_data)):
    with open(pickle_path, 'wb') as fh:
        pickle.dump(payload, fh)

# The summary table, in three formats.
targets.to_pickle('targets_df.p')
targets.to_csv('targets.csv')
targets.to_markdown('targets.md')