In [1]:
#!/usr/bin/env python

import json
import os
import re
import time
import urllib

import numpy
import pandas as pd
import requests

import matplotlib.cm as cm
import matplotlib.pyplot as plt

import Bio
from Bio import AlignIO
from Bio import Entrez
from Bio import SeqIO
from Bio import pairwise2
from Bio.Align.Applications import ClustalwCommandline
from Bio.pairwise2 import format_alignment

from localcider import plots
from localcider.sequenceParameters import SequenceParameters

# Query the human proteome against the D2P2 database and store disordered sequences, PFAM domains, and list of PFAM information

## Use only if you want to generate all the raw data from source. This takes ~ 4.5 hours to run ~20,000 API requests at roughly 1 request/second

## The input_filename should be input fasta file for which you want to query disorder+pfams

### Currently, the stored data are the consensus disorder ranges and the PFAM domains (excluding Pfam-B) with their positions, e-scores, names, and descriptions.

In [3]:
# --- Configuration -----------------------------------------------------------
filename = "../data/raw_sequence_data/nuclear_proteins.fasta"
filename_write_data = "../data/raw_data/20190709_human_pfam_gene_disorder_all.tsv"

# Index of the first FASTA record to query. The output file is opened in append
# mode, so after an interrupted run this can be raised to resume where it stopped.
START_INDEX = 15460

D2P2_URL = 'http://d2p2.pro/api/seqid'
REQUEST_TIMEOUT = 30   # seconds allowed for a single HTTP request
MAX_ATTEMPTS = 5       # total tries per protein before the error is re-raised


def _query_d2p2(seq_id, attempts=MAX_ATTEMPTS, timeout=REQUEST_TIMEOUT):
    """Return the decoded D2P2 JSON reply for one sequence id.

    Connection failures, timeouts and HTTP error statuses are retried with an
    exponential back-off (2, 4, 8, ... seconds). The last failure is re-raised
    once `attempts` tries have been used up.
    """
    query = 'seqids=["{}"]'.format(seq_id)
    for attempt in range(1, attempts + 1):
        try:
            # A string passed as the second positional argument is sent as the
            # URL query string.
            reply = requests.get(D2P2_URL, query, timeout=timeout)
            reply.raise_for_status()
            return reply.json()
        except (requests.exceptions.ConnectionError,
                requests.exceptions.Timeout,
                requests.exceptions.HTTPError):
            if attempt == attempts:
                raise
            time.sleep(2 ** attempt)


def _base_record(seq_id, seq, description, gene_name, idr_start=-1, idr_end=-1):
    """Build one output row; -1/-1 marks a protein with no disorder range.

    The key order is the TSV column order, so it must not be rearranged.
    """
    return {
        'id': seq_id,
        'seq': seq,
        'description': description,
        'idr_start': idr_start,
        'idr_end': idr_end,
        'gene_name': gene_name,
    }


def _pfam_strings(pfam_hits):
    """Serialise PFAM hits as 'acc_start_end_escore_name_desc_' strings.

    Only hits whose first seven characters of field 2 contain 'PF' are kept.
    When nothing is kept the single placeholder 'None_' is returned.
    """
    entries = []
    for hit in pfam_hits:
        accession = hit[2][0:7]
        if 'PF' not in accession:
            continue
        fields = (accession, int(hit[7]), int(hit[8]), float(hit[5]),
                  str(hit[3]), str(hit[4]))
        entries.append(''.join(str(field) + '_' for field in fields))
    return entries if entries else ['None_']


def _write_row(handle, row_id, record):
    """Write one record as a tab-terminated TSV line, PFAM entries one per cell."""
    cells = [row_id]
    for value in record.values():
        if isinstance(value, list):
            cells.extend(value)
        else:
            cells.append(value)
    handle.write(''.join(str(cell) + '\t' for cell in cells) + '\n')


# --- Run ---------------------------------------------------------------------
# exist_ok makes the call safe if the directory appears between check and create.
os.makedirs(os.path.dirname(filename_write_data), exist_ok=True)

records = list(SeqIO.parse(filename, "fasta"))

output_data = {}

with open(filename_write_data, 'a+') as file_write:

    for item in records[START_INDEX:]:

        # The record id is split on '|': field 1 is used as the query id and
        # field 2 as the gene name.
        id_fields = item.id.split('|')
        Id_current = str(id_fields[1])
        gene_name = str(id_fields[2])
        description = str(item.description.split('|')[-1])
        seq = str(item.seq)

        response = _query_d2p2(Id_current)
        ID_list = []

        if response.get(Id_current):
            annotation = response[Id_current][0][2]

            # One row per consensus disorder range, suffixed _1, _2, ...
            for count, pos in enumerate(annotation['disorder']['consranges'], start=1):
                Id = Id_current + '_' + str(count)
                output_data[Id] = _base_record(Id_current, seq, description,
                                               gene_name, pos[0], pos[1])
                ID_list.append(Id)

            # No disorder range at all: keep the protein as a single -1/-1 row.
            if not ID_list:
                output_data[Id_current] = _base_record(Id_current, seq,
                                                       description, gene_name)
                ID_list.append(Id_current)

            pfam_list = _pfam_strings(annotation['structure']['pfam'])
            for ID in ID_list:
                output_data[ID]['pfam_list'] = list(pfam_list)

        else:
            # Empty reply for this id: single -1/-1 row with a bare 'None' entry.
            output_data[Id_current] = _base_record(Id_current, seq,
                                                   description, gene_name)
            output_data[Id_current]['pfam_list'] = ['None']
            ID_list.append(Id_current)

        for ID in ID_list:
            _write_row(file_write, ID, output_data[ID])

        # Persist finished proteins immediately so a later failure loses nothing.
        file_write.flush()

ConnectionError: HTTPConnectionPool(host='d2p2.pro', port=80): Max retries exceeded with url: /api/seqid?seqids=[%22Q7RTT3%22] (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x1244eeac8>: Failed to establish a new connection: [Errno 60] Operation timed out'))

## Read in XLSX (SAVE TSV AS XLSX) file of PFAM data
### The above output is stored as a TSV file - just re-save it as an .xlsx file

In [None]:
# Workbook produced by re-saving the TSV output as .xlsx.
file_name = 'raw_data/20190709_human_pfam_gene_disorder_all.xlsx'
# The sheet has no header row. Columns A:DQ are the first 121 columns.
# `read_excel` has no `delimiter` option (a workbook is not delimited text), and
# current pandas rejects the unknown keyword, so it is not passed.
df = pd.read_excel(file_name, usecols="A:DQ", header=None)
len(df.columns)

# Collapse all PFAM columns into one column, join all IDRs of a given protein into a single row, and write the resulting data frame (sorted by ID) to an output file

In [None]:
# Columns 7..120 each hold at most one PFAM entry; unused cells are NaN.
p = df.columns[7:121]
# Join only the populated cells of each row. Dropping NaN before joining avoids
# `str.strip('nan,')`, which strips the characters n, a and ',' (not the
# substring 'nan,') and can therefore eat into real entries. It also avoids
# writing through `.values`, which is read-only under pandas copy-on-write.
df['pfam_all'] = df[p].apply(lambda row: ','.join(row.dropna().astype(str)), axis=1)
# Keep the seven fixed columns plus the collapsed PFAM column, as an independent
# frame so that renaming its columns cannot touch `df`.
less_data = df[list(df.columns[0:7]) + ['pfam_all']].copy()
less_data.columns = ['IDR_id','ID','sequence','description','idr_start','idr_end','gene','pfam_all']
less_data.head()

In [None]:
def _join_unique(values):
    """Comma-join the distinct entries of one grouped column."""
    return ','.join(values.unique())


# One row per protein ID; every other column becomes the comma-joined list of
# its distinct (stringified) values within that protein.
sorted_data = less_data.astype(str).groupby('ID').agg(_join_unique)
# Character count of the second remaining column for each protein, as floats.
length_proteins = numpy.fromiter(
    (len(entry) for entry in sorted_data.iloc[:, 1]),
    dtype=float,
)
sorted_data['lengths'] = length_proteins
sorted_data

In [None]:
# Destination workbook for the per-protein table.
write_sorted_data = 'raw_data/20190709_clean_output_test_repeat.xlsx'
sorted_data.to_excel(excel_writer=write_sorted_data)