# Download uniprot IDs from curated human proteome

<a rel="license" href="https://creativecommons.org/licenses/by-sa/4.0/"><img alt="Creative Commons Licence" style="border-width:0" src="https://licensebuttons.net/l/by-sa/4.0/88x31.png" title='This work is licensed under a Creative Commons Attribution-ShareAlike 4.0 International License.' align="right"/></a>

Author: Dr Antonia Mey   
Email: antonia.mey@ed.ac.uk

In [40]:
import json
import os
import urllib
import urllib.request

import numpy as np
import pandas as pd

In [46]:
def download_uniprot_json_file(uni_prot_id, workdir = '.'):
    """Download the UniProt JSON record for one ID and save it to disk.

    The record is requested from
    ``https://www.uniprot.org/uniprot/<uni_prot_id>.json`` and the complete
    response body is written to ``<workdir>/<uni_prot_id>.json``.

    Parameters
    ----------
    uni_prot_id : str
        Identifier used to build both the request URL and the file name.
    workdir : str, optional
        Directory the JSON file is written to (default: current directory).
        It is created if it does not exist yet.

    Returns
    -------
    object
        The response object returned by ``urllib.request.urlopen``. Its body
        has already been read and the response has been closed, so it is only
        useful for inspecting metadata such as the status or headers.

    Raises
    ------
    Exception
        If building the URL, downloading, or writing the file fails. The
        underlying error is attached as the exception's cause.
    """
    try:
        url = 'https://www.uniprot.org/uniprot/' + uni_prot_id + '.json'
        target = os.path.join(workdir, uni_prot_id) + '.json'
        os.makedirs(workdir, exist_ok=True)
        # Read the whole body: taking only the first line would truncate
        # any JSON document that spans several lines.
        with urllib.request.urlopen(url) as response:
            payload = response.read()
        # Context manager guarantees the file is closed even if the write fails.
        with open(target, 'w') as handle:
            handle.write(payload.decode('utf-8'))
    except Exception as e:
        # Keep the original exception type/message contract, but chain the
        # cause so the real traceback is not lost.
        raise Exception('Failed to obtain UNIPROT data. %s'%e) from e

    return response

In [70]:
def get_kown_things(known_things,data):
    """Collect unseen Domain/Region/Motif descriptions from one record.

    Parameters
    ----------
    known_things : dict
        Maps 'Domain', 'Region' and 'Motif' to lists of descriptions seen so
        far. The lists are extended in place.
    data : dict
        Parsed record. Only its optional 'features' entry is read; each
        feature is indexed by 'type' and, for the three tracked types, by
        'description'.

    Returns
    -------
    dict
        The same ``known_things`` object that was passed in.
    """
    # Nothing to harvest when the record carries no feature list.
    if 'features' not in data.keys():
        return known_things

    tracked_types = ('Domain', 'Region', 'Motif')
    for feature in data['features']:
        feature_type = feature['type']
        # Every other feature type is ignored.
        if feature_type not in tracked_types:
            continue
        description = feature['description']
        seen = known_things[feature_type]
        if description not in seen:
            seen.append(description)
    return known_things

In [45]:
# Write every entry of `unique_ids` to a text file, one per line.
# NOTE: `unique_ids` is not defined in the cells shown here; it must already
# exist before this cell runs.
# The context manager closes the file even if a write fails part-way.
with open('data/unique_ids_from_spreadsheet.txt','w') as f:
    f.writelines(f'{i}\n' for i in unique_ids)

In [None]:
# Initialise the dictionary that collects every annotation seen so far,
# with one (initially empty) list each for Domain, Region and Motif.
known_things = {'Domain': [], 'Region': [], 'Motif': []}

In [72]:
# Loop over all UniProt IDs: download each record into 'unique_ids/' and
# harvest its Domain/Region/Motif annotations into `known_things`.
# This will take a few hours as it isn't parallelised!
# NOTE: `unique_ids` must already be defined before this cell runs.
os.makedirs('unique_ids', exist_ok=True)  # download target must exist
counter = 0
info = known_things  # keeps `info` defined even when unique_ids is empty
for counter, ids in enumerate(unique_ids, start=1):
    download_uniprot_json_file(ids, workdir='unique_ids')
    # Process the downloaded file; `with` closes it even if parsing fails.
    with open(os.path.join('unique_ids', ids + '.json')) as f:
        data = json.load(f)
    info = get_kown_things(known_things,data)
    # Progress report every 500 entries.
    if counter%500==0:
        print(f'At entry {counter}/{len(unique_ids)}')

At entry 12000/16238
At entry 12500/16238
At entry 13000/16238
At entry 13500/16238
At entry 14000/16238
At entry 14500/16238
At entry 15000/16238
At entry 15500/16238
At entry 16000/16238


In [76]:
# Keys view of `info`, the dictionary returned by get_kown_things in the loop
# above. It is not referenced again in the cells shown here.
info_keys = info.keys()

In [None]:
# Saving all annotations for domain, region and motif.
# sorted(set(...)) yields the unique descriptions in a reproducible order,
# so the files written below are identical from run to run.
domain = sorted(set(info['Domain']))
region = sorted(set(info['Region']))
motif = sorted(set(info['Motif']))

# Now let's save that information, one description per line.
np.savetxt('data/motif.csv', motif, fmt='%s')
np.savetxt('data/region.csv', region, fmt='%s')  # comma before fmt was missing
np.savetxt('data/domain.csv', domain, fmt='%s')

In [89]:
# Now we want to create an exclusion list file
# 1. We exclude all domains
# 2. We exclude certain regions
# 3. We keep all motifs

# Tags matched exactly as written (case-sensitive substring test).
case_sensitive_tags = ('PUM-HD', 'HEAT', 'ARM', 'bHLH')
# Tags matched regardless of case (compared against the lower-cased text).
# NOTE(review): 'coil coil' cannot match a description spelled "coiled coil";
# confirm whether 'coiled coil' was intended before changing the exclusion list.
case_insensitive_tags = ('coil coil', 'dna binding', 'dna-binding', 'leucine-zipper')
# Filters that were tried earlier and are currently disabled:
# 'BIG', 'helical' (skipping 'nonhelical') and 'helix'.

region_filter = []
for d in region:
    lowered = d.lower()
    matches_exact = any(tag in d for tag in case_sensitive_tags)
    matches_any_case = any(tag in lowered for tag in case_insensitive_tags)
    if matches_exact or matches_any_case:
        # Append once per region, even when several tags match. A stray
        # append left over from the disabled 'BIG' test used to add every
        # bHLH region twice.
        region_filter.append(d)
np.savetxt('data/region_filter.csv', region_filter, fmt='%s')

In [102]:
# Legacy sanity check of the domain descriptions.
# Each pair is (lower-case tag to search for, label to print); a label of
# None means the matching description itself is printed.
tag_labels = (
    ('pum-hd', 'Pumilio'),
    ('heat', 'HEAT'),
    ('arm', None),
    ('bhlh', 'bHLH'),
)
for d in domain:
    lowered = d.lower()
    for tag, label in tag_labels:
        if tag in lowered:
            print(d if label is None else label)

bHLH
BIG
Pumilio


# END
------------