In [129]:
import gzip
import pandas as pd
import numpy as np
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio import SwissProt
from Bio import SeqIO

from functools import partial

In [None]:
# Root directory for the data files read and written below. Paths are built by
# plain string concatenation (DATA_FOLDER + "name"), so the trailing slash is required.
DATA_FOLDER = "../data/"

In [116]:
# Load the humsavar ground-truth table, indexed by mutant ID. The IDs are split
# on "-" into four fields: accession, position, reference residue, variant residue.
hum = pd.read_csv(DATA_FOLDER + "humsavar_gt.tab.gz", sep="\t", index_col="MUTANT")

# One row per variant, one column per ID component.
# DataFrame.from_items was removed in pandas 1.0; the plain constructor builds
# the same frame on old and new pandas alike.
# The index name is dropped on purpose: the merge cell further down relies on
# reset_index() producing a column literally called "index".
uniprots = pd.DataFrame(
    hum.index.str.split("-").tolist(),
    index=hum.index.rename(None),
    columns=["uniprot", "position", "amino", "amino_var"],
)
# Positions arrive as strings from the split; later slicing arithmetic needs ints.
uniprots["position"] = uniprots["position"].astype(int)

In [117]:
# Flatten the proteome FASTA into a two-column CSV: accession, sequence.
# The output file is managed by a `with` block so it is flushed and closed
# before the next cell reads it back (the handle was previously left open).
# NOTE(review): the FASTA is read from "./data/" while every other path uses
# DATA_FOLDER ("../data/") — confirm which location is intended.
with open(DATA_FOLDER + "human_prot_sequences.txt", "w") as sequences_file:
    for record in SeqIO.parse("./data/uniprot-proteome%3AUP000005640.fasta", "fasta"):
        # The accession is taken as the second "|"-separated field of the record id.
        uni_id = record.id.split("|")[1]
        sequences_file.write("%s,%s\n" % (uni_id, record.seq))

In [118]:
# Read the accession/sequence pairs back in; the file has no header row,
# so the column names are supplied explicitly.
sequences = pd.read_csv(
    DATA_FOLDER + "human_prot_sequences.txt",
    header=None,
    names=["uniprot", "sequence"],
)

In [119]:
# Attach the full protein sequence to every variant.
# reset_index() / set_index("index") carries the mutant ID through the merge,
# which would otherwise discard the index.
# merge() defaults to an inner join, so variants whose accession has no
# matching sequence are dropped here.
uniprots = (
    uniprots
    .reset_index()
    .merge(sequences, on="uniprot")
    .set_index("index")
)

In [120]:
def get_subsequence(row, width=1):
    """Return the sequence window around a variant with the variant residue substituted.

    Parameters
    ----------
    row : mapping-like
        Must provide 'sequence' (the full sequence string), 'position'
        (1-based position of the variant) and 'amino_var' (replacement residue).
    width : int, default 1
        Number of flanking residues to keep on each side of the variant.

    Returns
    -------
    str
        Up to ``width`` residues before the position, then the variant residue,
        then up to ``width`` residues after it. A flank is shorter than
        ``width`` when the variant lies close to either end of the sequence.
    """
    seq = row['sequence']
    pos = row['position'] - 1  # convert the 1-based position to a 0-based index
    # Clamp the window start at 0: a negative start would be interpreted by
    # Python as an offset from the END of the string, silently dropping (or
    # corrupting) the left flank for variants near the N-terminus.
    start = max(pos - width, 0)
    return seq[start:pos] + row['amino_var'] + seq[pos + 1:pos + width + 1]

# Build the variant-centred window for every row, keeping 10 flanking residues
# on each side; extra keyword arguments to DataFrame.apply are forwarded to the function.
uniprots['var_slice'] = uniprots.apply(get_subsequence, axis=1, width=10)

In [124]:
# Preview the first rows to sanity-check the merged sequences and the extracted var_slice windows.
uniprots.head()

Unnamed: 0_level_0,uniprot,position,amino,amino_var,sequence,var_slice
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
P11362-174-V-A,P11362,174,V,A,MWSWKCLLFWAVLVTATLCTARPSPTLPEQAQPWGAPVEVESFLVH...,KLHAVPAAKTAKFKCPSSGTP
P11362-274-E-G,P11362,274,E,G,MWSWKCLLFWAVLVTATLCTARPSPTLPEQAQPWGAPVEVESFLVH...,NKTVALGSNVGFMCKVYSDPQ
P11362-330-N-I,P11362,330,N,I,MWSWKCLLFWAVLVTATLCTARPSPTLPEQAQPWGAPVEVESFLVH...,DKEMEVLHLRIVSFEDAGEYT
P11362-332-S-C,P11362,332,S,C,MWSWKCLLFWAVLVTATLCTARPSPTLPEQAQPWGAPVEVESFLVH...,EMEVLHLRNVCFEDAGEYTCL
P11362-719-M-V,P11362,719,M,V,MWSWKCLLFWAVLVTATLCTARPSPTLPEQAQPWGAPVEVESFLVH...,LFKLLKEGHRVDKPSNCTNEL


In [125]:
# Remove the column-width limit so long string values are shown untruncated.
# NOTE(review): this is a global option and affects every later cell; newer pandas
# releases expect None rather than -1 for "no limit" — confirm against the installed version.
pd.set_option('display.max_colwidth', -1)
# Inspect one variant, selected by its positional row number after the index is reset.
uniprots.reset_index().iloc[6837]

index        P18283-37-A-L                                                                                                                                                                                 
uniprot      P18283                                                                                                                                                                                        
position     37                                                                                                                                                                                            
amino        A                                                                                                                                                                                             
amino_var    L                                                                                                                                                                          

In [96]:
# Persist only the sequence windows, keyed by mutant ID, as a gzip-compressed
# tab-separated file.
uniprots[["var_slice"]].to_csv(
    DATA_FOLDER + "humsavar_gt_protein_slices.tab.gz",
    sep="\t",
    index=True,
    index_label="MUTANT",
    compression="gzip",
)

In [135]:
# Reload the saved windows; MUTANT comes back as an ordinary column because
# no index_col is given, so the frame gets a fresh positional index.
uniprots = pd.read_csv(
    DATA_FOLDER + "humsavar_gt_protein_slices.tab.gz",
    sep="\t",
)


def get_protparam(row, func_name):
    """Compute one ProteinAnalysis descriptor for a sequence, or NaN on failure.

    Parameters
    ----------
    row : str
        Sequence handed to ``ProteinAnalysis``. Despite the name this is a
        single value (the function is applied element-wise to ``var_slice``),
        not a DataFrame row.
    func_name : str
        Name of a zero-argument ``ProteinAnalysis`` method to call.

    Returns
    -------
    The value returned by the method, or ``np.nan`` if looking it up or
    calling it raises an ordinary exception.
    """
    protein_analysis = ProteinAnalysis(row)
    try:
        return getattr(protein_analysis, func_name)()
    except Exception:
        # Best effort: a sequence the analysis cannot handle yields NaN rather
        # than aborting the whole column. Catching Exception (not a bare
        # except) lets KeyboardInterrupt and SystemExit propagate, so a long
        # apply() can still be interrupted.
        return np.nan
        

# ProteinAnalysis methods to evaluate; each result is stored in a column of the same name.
params = ["aromaticity", "isoelectric_point", "gravy"]

for param in params:
    # Series.apply forwards extra keyword arguments to the applied function.
    uniprots[param] = uniprots["var_slice"].apply(get_protparam, func_name=param)

In [136]:
# Save the feature table.
# NOTE(review): despite the ".tab.gz" name no separator is passed, so the file is
# comma-separated, and the positional index is written as an unnamed first column
# (both visible in the output below) — confirm this is what downstream readers expect.
uniprots.to_csv(DATA_FOLDER + "protparam_features.tab.gz")

Unnamed: 0,MUTANT,var_slice,aromaticity,isoelectric_point,gravy
0,P11362-174-V-A,KLHAVPAAKTAKFKCPSSGTP,0.047619,10.037781,-0.309524
1,P11362-274-E-G,NKTVALGSNVGFMCKVYSDPQ,0.095238,8.175232,-0.114286
2,P11362-330-N-I,DKEMEVLHLRIVSFEDAGEYT,0.095238,4.352844,-0.419048
3,P11362-332-S-C,EMEVLHLRNVCFEDAGEYTCL,0.095238,4.245789,0.009524
4,P11362-719-M-V,LFKLLKEGHRVDKPSNCTNEL,0.047619,8.184509,-0.747619
