In [2]:
import gzip
import pandas as pd
import numpy as np
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio import SwissProt
from Bio import SeqIO

from functools import partial

In [3]:
DATA_FOLDER = "../data/"

In [4]:
GT = pd.read_csv(DATA_FOLDER + "processed/humsavar_gt.tab.gz", sep="\t", index_col="MUTANT")
uniprots = pd.DataFrame.from_items(zip(GT.index, GT.index.str.split("-"))).T
uniprots.columns = ["uniprot", "position", "amino", "amino_var"]
uniprots.position = uniprots.position.astype(int)

In [5]:
uniprots.head()

Unnamed: 0,uniprot,position,amino,amino_var
P11362-174-V-A,P11362,174,V,A
Q8WZA1-504-V-I,Q8WZA1,504,V,I
P46100-243-C-F,P46100,243,C,F
P11473-362-T-I,P11473,362,T,I
Q9NXN4-106-G-S,Q9NXN4,106,G,S


In [50]:
# Reads FASTA file with human proteome, and sequences a to file for easier Pandas reading.
sequences = open(DATA_FOLDER + "human_prot_sequences.txt", "w")
for e, record in enumerate(SeqIO.parse(DATA_FOLDER + "uniprot-proteome%3AUP000005640.fasta", "fasta")):
    uni_id = record.id.split("|")[1]
    sequences.write("%s,%s\n" % (uni_id, record.seq))

In [7]:
sequences = pd.read_csv(DATA_FOLDER + "interim/human_prot_sequences.txt", header=None, names=["uniprot", "sequence"])

In [9]:
sequences.head()

Unnamed: 0,uniprot,sequence
0,Q9Y3L3,MMKRQLHRMRQLAQTGSLGRTPETAEFLGEDLLQVEQRLEPAKRAA...
1,P04217,MSMLVVFLLLWGVTWGPVTEAAIFYETQPSLWAESESLLKPLANVT...
2,Q5SQ80,MKLFGFGSRRGQTAQGSIDHVYTGSGYRIRDSELQKIHRAAVKGDA...
3,Q4UJ75,MKLFGFGSRRGQTAQGSIDHVYTGSGYRIRDSELQKIHRAAVKGDA...
4,A0PJZ0,MKLFGFRSRRGQTVLGSIDHLYTGSGYRIRYSELQKIHKAAVKGDA...


In [10]:
uniprots = uniprots.reset_index().merge(sequences, on="uniprot").set_index("index")

In [12]:
# Extracts subsequence and applies variation. 
# If var == False, it doesn't replace the aminoacid. 
# With width == -1, then the whole sequence is returned.
def get_subsequence(row, width=1, var=True):
    seq = row['sequence']
    pos = row['position']-1
    if (width != -1 and var):
        return seq[pos-width:pos] + row['amino_var'] + seq[pos+1:pos+width+1]
    if (width != -1 and  not var):
        return seq[pos-width:pos] + seq[pos-1:pos+width+1]
    if (width == -1 and var):
        return seq[:pos] + row['amino_var'] + seq[pos+1:]
    
uniprots['slice'] = uniprots.apply(partial(get_subsequence, width=7, var=False), axis=1)
uniprots['var_slice'] = uniprots.apply(partial(get_subsequence, width=7), axis=1)

In [13]:
pd.set_option('display.max_colwidth', -1)
uniprots.reset_index().iloc[6837]

index        P18283-37-A-L                                                                                                                                                                                 
uniprot      P18283                                                                                                                                                                                        
position     37                                                                                                                                                                                            
amino        A                                                                                                                                                                                             
amino_var    L                                                                                                                                                                          

In [14]:
uniprots[["var_slice", "slice"]].to_csv(DATA_FOLDER + "interim/humsavar_gt_protein_slices.tab.gz", sep="\t", index=True, index_label="MUTANT", compression="gzip")

In [15]:
uniprots = pd.read_csv(DATA_FOLDER + "interim/humsavar_gt_protein_slices.tab.gz", sep="\t")

def get_protparam(row, func_name):
    protein_analysis = ProteinAnalysis(row)
    try:
        param = getattr(protein_analysis, func_name)()
        return param
    except:
        return np.nan
        

params = ["aromaticity", "isoelectric_point", "gravy", "instability_index"]


for param in params:
    uniprots[('var_' + param)] = uniprots.var_slice.apply(partial(get_protparam, func_name=param))
    uniprots[param] = uniprots.slice.apply(partial(get_protparam, func_name=param))
    uniprots[param + "_diff"] =  abs(uniprots[param] - uniprots[('var_' + param)])
    uniprots[param + "_log_ratio"] = np.log((uniprots[param] + 1) / (uniprots[('var_' + param)] + 1))
    
uniprots.columns = uniprots.columns.map(lambda x: x.upper())



In [16]:
uniprots

Unnamed: 0,MUTANT,VAR_SLICE,SLICE,VAR_AROMATICITY,AROMATICITY,AROMATICITY_DIFF,AROMATICITY_LOG_RATIO,VAR_ISOELECTRIC_POINT,ISOELECTRIC_POINT,ISOELECTRIC_POINT_DIFF,ISOELECTRIC_POINT_LOG_RATIO,VAR_GRAVY,GRAVY,GRAVY_DIFF,GRAVY_LOG_RATIO,VAR_INSTABILITY_INDEX,INSTABILITY_INDEX,INSTABILITY_INDEX_DIFF,INSTABILITY_INDEX_LOG_RATIO
0,P11362-174-V-A,AVPAAKTAKFKCPSS,AVPAAKTTVKFKCPSS,0.066667,0.062500,0.004167,-0.003914,9.790833,9.790833,0.000000,0.000000,-0.033333,0.07500,1.083333e-01,0.106222,63.513333,58.368750,5.144583,-0.083104
1,P11362-274-E-G,VALGSNVGFMCKVYS,VALGSNVVEFMCKVYS,0.133333,0.125000,0.008333,-0.007380,8.168396,5.964661,2.203735,-0.274913,0.953333,0.96250,9.166667e-03,0.004682,-7.013333,-0.643750,6.369583,
2,P11362-330-N-I,MEVLHLRIVSFEDAG,MEVLHLRRNVSFEDAG,0.066667,0.062500,0.004167,-0.003914,4.649353,5.426453,0.777100,0.128882,0.506667,-0.30625,8.129167e-01,-0.775543,29.353333,76.962500,47.609167,0.943322
3,P11362-332-S-C,VLHLRNVCFEDAGEY,VLHLRNVVSFEDAGEY,0.133333,0.125000,0.008333,-0.007380,4.651062,4.651062,0.000000,0.000000,-0.020000,0.03750,5.750000e-02,0.057017,38.213333,36.450000,1.763333,-0.046010
4,P11362-719-M-V,LLKEGHRVDKPSNCT,LLKEGHRRMDKPSNCT,0.000000,0.000000,0.000000,0.000000,8.206604,9.311096,1.104492,0.113300,-1.013333,-1.37500,3.616667e-01,3.336659,15.620000,60.462500,44.842500,1.307820
5,P11362-719-M-R,LLKEGHRRDKPSNCT,LLKEGHRRMDKPSNCT,0.000000,0.000000,0.000000,0.000000,9.311096,9.311096,0.000000,0.000000,-1.593333,-1.37500,2.183333e-01,-0.458830,63.826667,60.462500,3.364167,-0.053290
6,P11362-623-D-Y,SKKCIHRYLAARNVL,SKKCIHRRDLAARNVL,0.066667,0.000000,0.066667,-0.064539,10.306213,10.916077,0.609863,0.052536,-0.213333,-0.61875,4.054167e-01,-0.724349,20.760000,60.600000,39.840000,1.040588
7,P11362-621-H-R,LASKKCIRRDLAARN,LASKKCIIHRDLAARN,0.000000,0.000000,0.000000,0.000000,10.916443,10.053650,0.862793,-0.075159,-0.606667,-0.20625,4.004167e-01,0.702111,55.746667,24.800000,30.946667,-0.788222
8,P11362-165-L-S,PEKMEKKSHAVPAAK,PEKMEKKKLHAVPAAK,0.000000,0.000000,0.000000,0.000000,9.556946,9.835632,0.278687,0.026056,-1.220000,-1.10000,1.200000e-01,-0.788457,68.333333,59.381250,8.952083,-0.138247
9,P11362-167-A-S,KMEKKLHSVPAAKTV,KMEKKLHHAVPAAKTV,0.000000,0.000000,0.000000,0.000000,10.001404,10.001404,0.000000,0.000000,-0.513333,-0.51875,5.416667e-03,-0.011193,51.086667,48.518750,2.567917,-0.050558


In [17]:
uniprots.drop("SLICE", 1).drop("VAR_SLICE",1).to_csv(DATA_FOLDER + "interim/protparam_features.tab.gz", 
                                                     sep="\t", index=False, compression="gzip")