In [1]:
import gzip
import pandas as pd
import numpy as np
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from Bio import SwissProt
from Bio import SeqIO

from functools import partial

In [2]:
DATA_EXTERNAL = "../data/external/"
DATA_INTERIM = "../data/interim/"

In [3]:
humsavar = pd.read_csv("../data/interim/humsavar_full.csv.gz", index_col="MUTANT")
uniprots = pd.DataFrame.from_items(zip(humsavar.index, humsavar.index.str.split("-"))).T
uniprots.columns = ["uniprot", "position", "amino", "amino_var"]
uniprots.position = uniprots.position.astype(int)

In [4]:
uniprots.head()

Unnamed: 0,uniprot,position,amino,amino_var
P04217-52-H-R,P04217,52,H,R
P04217-395-H-R,P04217,395,H,R
Q9NQ94-555-V-M,Q9NQ94,555,V,M
Q9NQ94-558-A-S,Q9NQ94,558,A,S
A8K2U0-207-G-R,A8K2U0,207,G,R


In [9]:
# Reads FASTA file with human proteome, and sequences a to file for easier Pandas reading.
sequences = open(DATA_INTERIM + "human_prot_sequences.txt", "w")
for e, record in enumerate(SeqIO.parse(DATA_EXTERNAL + "uniprot-proteome%3AUP000005640.fasta", "fasta")):
    uni_id = record.id.split("|")[1]
    sequences.write("%s,%s\n" % (uni_id, record.seq))

In [10]:
sequences = pd.read_csv(DATA_INTERIM + "human_prot_sequences.txt", header=None, names=["uniprot", "sequence"])

In [11]:
sequences.head()

Unnamed: 0,uniprot,sequence
0,Q9Y3L3,MMKRQLHRMRQLAQTGSLGRTPETAEFLGEDLLQVEQRLEPAKRAA...
1,P04217,MSMLVVFLLLWGVTWGPVTEAAIFYETQPSLWAESESLLKPLANVT...
2,Q5SQ80,MKLFGFGSRRGQTAQGSIDHVYTGSGYRIRDSELQKIHRAAVKGDA...
3,Q4UJ75,MKLFGFGSRRGQTAQGSIDHVYTGSGYRIRDSELQKIHRAAVKGDA...
4,A0PJZ0,MKLFGFRSRRGQTVLGSIDHLYTGSGYRIRYSELQKIHKAAVKGDA...


In [12]:
uniprots = uniprots.reset_index().merge(sequences, on="uniprot").set_index("index")

In [13]:
# Extracts subsequence and applies variation. 
# If var == False, it doesn't replace the aminoacid. 
# With width == -1, then the whole sequence is returned.
def get_subsequence(row, width=1, var=True):
    seq = row['sequence']
    pos = row['position']-1
    if (width != -1 and var):
        return seq[pos-width:pos] + row['amino_var'] + seq[pos+1:pos+width+1]
    if (width != -1 and  not var):
        return seq[pos-width:pos] + seq[pos-1:pos+width+1]
    if (width == -1 and var):
        return seq[:pos] + row['amino_var'] + seq[pos+1:]
    
uniprots['slice'] = uniprots.apply(partial(get_subsequence, width=7, var=False), axis=1)
uniprots['var_slice'] = uniprots.apply(partial(get_subsequence, width=7), axis=1)

In [14]:
uniprots.head()

Unnamed: 0_level_0,uniprot,position,amino,amino_var,sequence,slice,var_slice
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
P04217-52-H-R,P04217,52,H,R,MSMLVVFLLLWGVTWGPVTEAAIFYETQPSLWAESESLLKPLANVT...,VTLTCQAAHLETPDFQ,VTLTCQARLETPDFQ
P04217-395-H-R,P04217,395,H,R,MSMLVVFLLLWGVTWGPVTEAAIFYETQPSLWAESESLLKPLANVT...,PSERLELLHVDGPPPR,PSERLELRVDGPPPR
Q9NQ94-555-V-M,Q9NQ94,555,V,M,MESNHKSGDGLSGTQKEAALRALVQRTGYSLVQENGQRKYGGPPPG...,VPNATAPPVSAAQLKQ,VPNATAPMSAAQLKQ
Q9NQ94-558-A-S,Q9NQ94,558,A,S,MESNHKSGDGLSGTQKEAALRALVQRTGYSLVQENGQRKYGGPPPG...,ATAPVSAAAQLKQAVT,ATAPVSASQLKQAVT
A8K2U0-207-G-R,A8K2U0,207,G,R,MWAQLLLGMLALSPAIAEELPNYLVTLPARLNFPSVQKVCLDLSPG...,VAEGKTFFGTFSVEEY,VAEGKTFRTFSVEEY


In [15]:
uniprots[["var_slice", "slice"]].to_csv(DATA_INTERIM + "humsavar_full_protein_slices.csv.gz", index=True, index_label="MUTANT", compression="gzip")

In [35]:
uniprots = pd.read_csv(DATA_INTERIM + "humsavar_full_protein_slices.csv.gz")

def get_protparam(row, func_name):
    protein_analysis = ProteinAnalysis(row)
    try:
        param = getattr(protein_analysis, func_name)()
        if type(param) == list:
            return np.average(param)
        else:
            return param
    except:
        return np.nan
        

params = ["aromaticity", "isoelectric_point", "gravy", "instability_index", "flexibility"]

for param in params:
    uniprots[('var_' + param)] = uniprots.var_slice.apply(partial(get_protparam, func_name=param))
    uniprots[param] = uniprots.slice.apply(partial(get_protparam, func_name=param))
    uniprots[param + "_diff"] =  abs(uniprots[param] - uniprots[('var_' + param)])
    uniprots[param + "_log_ratio"] = np.log(uniprots[param] + 1) / np.log(uniprots[('var_' + param)] + 1) + 1
    uniprots.drop([param, 'var_' + param], 1, inplace=True)
    
uniprots.columns = uniprots.columns.map(lambda x: x.upper())

  avg = a.mean(axis)


In [36]:
uniprots.drop(["SLICE", "VAR_SLICE"], 1).to_csv(DATA_INTERIM + "protparam_features.csv.gz", index=False, compression="gzip")