In [1]:
import pandas as pd
import re
import os
import math
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
from scipy import optimize
from scipy import stats
import sympy as sy
from numpy import matrix
import signal
from scipy.optimize import brentq as root
%matplotlib inline

In [2]:
def normalize_column(df2,reads,channels,string):
    """Extract one search's reporter channels and scale every row by its '37' value.

    Rows of ``df2`` whose 'Search ID' equals ``string`` are selected, the
    ``channels`` columns are relabelled with ``reads``, and each column is
    divided element-wise by the original values of the column labelled '37'.

    Returns ``(normalized, Seq)``: the rescaled frame and the matching
    'Sequence' / 'Protein Group Accessions' columns, both re-indexed from 0.
    """
    selected=df2[df2['Search ID'] == string]

    normalized=selected[channels]
    normalized.columns=reads
    normalized=normalized.reset_index(drop=True)

    # Snapshot the reference column first, so that overwriting '37' inside the
    # loop does not change the divisor used for the remaining columns.
    reference=list(normalized['37'].values)
    for label in reads:
        normalized[label]=normalized[label].values/reference

    Seq=selected[["Sequence", "Protein Group Accessions"]].reset_index(drop=True)
    return normalized,Seq

def fctSigmoidTR0(x,Pl,a,b):
    """Sigmoid model: plateau ``Pl`` plus ``(1 - Pl)`` times a logistic in ``a/x - b``."""
    decay=np.exp(-(a/x-b))
    span=(1 - Pl)*1
    return span/(1+decay) + Pl

def fctSigmoidTR1(x,Pl,a,b):
    """First derivative of ``fctSigmoidTR0`` with respect to ``x``."""
    decay=np.exp(-(a/x - b))
    # a/x**2 is the magnitude of d(a/x - b)/dx.
    numerator=(1 - Pl) * (decay * (a/x**2))
    return -(numerator/(1 + decay)**2)

def fctSigmoidTR2(x,Pl,a,b):
    """Second derivative of ``fctSigmoidTR0`` with respect to ``x``.

    The arithmetic is kept in the same operation order as the expanded
    one-line formula; only the repeated sub-expressions are named.
    """
    E=np.exp(-(a/x - b))
    g=a/x**2
    # a*(2x)/(x**2)**2 == 2a/x**3, the magnitude of dg/dx.
    gSlope=a * (2 * x)/(x**2)**2
    onePlusE=1 + E
    amplitude=(1 - Pl) * 1

    quotientTerm=amplitude * (E * g * g - E * gSlope)/onePlusE**2
    chainTerm=amplitude * (E * g) *(2 * (E * g * onePlusE))/(onePlusE**2)**2
    return -(quotientTerm - chainTerm)


# Initial guess handed to the curve fit, in the parameter order of
# fctSigmoidTR0: [Pl, a, b].
startPars=[0, 550, 10]

def rSquared(y,z):
    """Return ``1 - SS_res/SS_tot`` for observations ``y`` and predictions ``z``.

    Both sums are taken with ``np.nansum``, so NaN terms are skipped.
    """
    observed=np.array(y)
    predicted=np.array(z)
    spread=observed - np.nanmean(observed)
    ssTot=np.nansum(spread**2)
    ssRes=np.nansum((observed - predicted)**2)
    return 1 - (ssRes/ssTot)

def fitSigmoidTR(xVec, yVec, startPars, maxAttempts, fixT0=True, method=None):
    """Fit ``fctSigmoidTR0`` to ``(xVec, yVec)`` by bounded least squares.

    Points whose ``yVec`` entry is NaN are ignored. The first attempt starts
    from ``startPars``; each later attempt rescales all start values by one
    common random factor in [0.8, 1.2].

    Parameters
    ----------
    xVec, yVec : sequences of equal length
        Abscissae and observations.
    startPars : sequence of 3 numbers
        Initial guess ``[Pl, a, b]``.
    maxAttempts : int
        Maximum number of ``curve_fit`` calls.
    fixT0 : bool
        Accepted for compatibility; not used.
    method : str or None
        Passed through to ``scipy.optimize.curve_fit``.

    Returns
    -------
    (params, r_squared)
        The fitted ``[Pl, a, b]`` array and the coefficient of determination
        over the valid points. With two or fewer valid points nothing is
        fitted and ``(nan, 0)`` is returned.

    Raises
    ------
    RuntimeError
        If none of the ``maxAttempts`` attempts produced usable parameters.
    """
    yAll = np.asarray(yVec, dtype=float)
    valid = ~np.isnan(yAll)
    if valid.sum() <= 2:
        return float('nan'), 0

    y = yAll[valid]
    x = np.asarray(xVec, dtype=float)[valid]

    popt = None
    varyPars = 0
    for _ in range(maxAttempts):
        scale = 1 + varyPars*np.random.uniform(-0.2, 0.2)
        # Every attempt after the first perturbs the start values, including
        # the ones that follow a failure; retrying an identical start would
        # simply fail again.
        varyPars = 1
        parTmp = [p*scale for p in startPars]
        try:
            candidate = optimize.curve_fit(fctSigmoidTR0, x, y, parTmp,
                                           check_finite=False,
                                           bounds=([0.0,1e-5,1e-5], [1.5, 15000, 250]),
                                           method=method)[0]
        except (RuntimeError, ValueError):
            # A tuple is required to catch both types; the comma form caught
            # RuntimeError only.
            continue
        if not np.all(np.isnan(candidate)):
            popt = candidate
            break

    if popt is None:
        raise RuntimeError('sigmoid fit did not converge in %d attempts' % maxAttempts)

    residuals = y - fctSigmoidTR0(x, popt[0], popt[1], popt[2])
    ss_res = np.nansum(residuals**2)
    # The total sum of squares is taken around the mean of the observations.
    ss_tot = np.nansum((y - np.nanmean(y))**2)
    if ss_tot == 0:
        # Constant observations: avoid 0/0 turning an exact fit into NaN.
        r_squared = 1.0 if ss_res == 0 else float('-inf')
    else:
        r_squared = 1 - (ss_res / ss_tot)
    return popt, r_squared

def plot_fit(df,joint, xVec,startPars,title,plot=False):
    """Fit the sigmoid to the column-wise median of a filtered set of rows.

    Rows ``joint`` of ``df`` are kept only if the column at position 6 lies
    strictly between 0.4 and 0.6, the column at position 8 is below 0.3 and
    the column at position 9 is below 0.2. The column-wise median of the
    remaining rows is fitted with ``fitSigmoidTR`` against ``xVec``.

    Parameters
    ----------
    df : DataFrame with one column per entry of ``xVec``.
    joint : list of index labels of ``df`` to consider.
    xVec : sequence of x values, one per column.
    startPars : initial guess ``[Pl, a, b]``.
    title : label used in the plot title and the saved file name.
    plot : when true, draw the points and the fit and save the figure as
        'Figures/<title> Fit Plot.png'.

    Returns
    -------
    (medians, fitted, params, rsquared, r2)
        The median values as a list, the fitted curve at ``xVec``, the fitted
        parameters, the R-squared reported by ``fitSigmoidTR`` and the one
        recomputed by ``rSquared``.
    """
    # Label-based selection; `.ix` no longer exists in current pandas.
    anorm=df.loc[joint]
    midValue=anorm.iloc[:,6]
    keep=((midValue < 0.6) & (midValue > 0.4)) & ((anorm.iloc[:,8] < 0.3) & (anorm.iloc[:,9] < 0.2))
    anorm=anorm[keep]
    a=anorm.median(0)
    afitmodel,rsquared=fitSigmoidTR(xVec, a, startPars, 500, fixT0=True)
    # Evaluate on the xVec argument, not on a notebook-level global.
    afit=[fctSigmoidTR0(t,afitmodel[0],afitmodel[1],afitmodel[2]) for t in xVec]
    r2=rSquared(a,afit)
    if plot:
        plt.scatter(xVec,a)
        plt.plot(xVec,afit)
        plt.xlim(35,66)
        plt.xlabel('Temperature')
        plt.ylabel('Fold Change')
        plt.title(title+'\n'+'R-Squared (noSUM): '+str(rsquared)+'\n'+'R-Squared (SUM): '+str(r2))
        plt.savefig('Figures/'+title+ ' Fit Plot.png')
        plt.show()
    return a.tolist(),afit,afitmodel,rsquared,r2

def scale_factor(val1,fit1,r1,val2,fit2,r2):
    """Per-point factors that map ``val1`` and ``val2`` onto the better-fitting curve.

    The reference curve is ``fit1`` when ``r1 > r2`` and ``fit2`` otherwise
    (ties therefore pick ``fit2``). Returns two lists holding
    ``reference[i] / value[i]`` for ``val1`` and ``val2`` respectively.
    """
    normcurve = fit1 if r1 > r2 else fit2

    def ratios(values):
        return [normcurve[pos]/values[pos] for pos in range(0,len(values))]

    return ratios(val1),ratios(val2)

def meltingPoint(model,xRange):
    """x value in ``[min(xRange), max(xRange)]`` where the fitted curve equals 0.5.

    ``model`` is a ``(params, r_squared)`` pair; ``r_squared == 0`` marks a
    missing fit. Returns NaN for a missing fit or when a ``ValueError`` is
    raised while looking for the root.
    """
    missing=float('nan')
    try:
        if (model[1] == 0):
            return missing
        Pl,a,b=model[0][0],model[0][1],model[0][2]
        return root(lambda t: fctSigmoidTR0(t,Pl,a,b)-0.5,min(xRange),max(xRange))
    except ValueError:
        return missing

def inflectionPoint(model,xRange):
    """x value in ``[min(xRange), max(xRange)]`` where the second derivative is zero.

    ``model`` is a ``(params, r_squared)`` pair; ``r_squared == 0`` marks a
    missing fit. Returns NaN for a missing fit or when a ``ValueError`` is
    raised while looking for the root.
    """
    try:
        if (model[1] == 0):
            return float('nan')
        Pl=model[0][0]
        a=model[0][1]
        b=model[0][2]
        lower,upper=min(xRange),max(xRange)
        return root(lambda t: fctSigmoidTR2(t,Pl,a,b),lower,upper)
    except ValueError:
        return float('nan')

def meltingCurveSlope(model, xInfl):
    """First derivative of the fitted curve evaluated at ``xInfl``.

    ``model`` is a ``(params, r_squared)`` pair; ``r_squared == 0`` marks a
    missing fit, for which NaN is returned. A ``ValueError`` raised during
    evaluation also yields NaN.
    """
    try:
        if (model[1] == 0):
            slope=float('nan')
        else:
            slope=fctSigmoidTR1(xInfl,model[0][0],model[0][1],model[0][2])
    except ValueError:
        slope=float('nan')
    return slope

def timeout(signum, frame):
    """Signal handler that interrupts the running code with ``Exception('changing method')``."""
    interruption=Exception('changing method')
    raise interruption

def _fitWithTimeLimit(xVals, yVals, startPars, methods=(None, 'dogbox', 'trf'), seconds=2):
    """Try ``fitSigmoidTR`` with each method in turn; return the first result or None.

    Each attempt is interrupted through SIGALRM after ``seconds`` seconds.
    """
    for method in methods:
        signal.signal(signal.SIGALRM, timeout)
        signal.alarm(seconds)
        try:
            return fitSigmoidTR(xVals, yVals, startPars, 500, fixT0=True, method=method)
        except Exception:
            # Timed out or the fit failed: fall through to the next method.
            pass
        finally:
            # Always disarm the alarm; a pending one would otherwise fire
            # later, outside any handler-aware code.
            signal.alarm(0)
    return None


def fitting(df1,temps,startPars,title,df_info):
    """Fit the sigmoid to every row of ``df1`` and write the results to CSV.

    Parameters
    ----------
    df1 : DataFrame with one row per curve and one column per entry of ``temps``.
    temps : sequence of x values matching the columns of ``df1``.
    startPars : initial guess ``[Pl, a, b]``.
    title : prefix for all output column names and for the CSV file name.
    df_info : DataFrame joined row by row (after an index reset) in front of
        the results in the CSV.

    Rows with two or fewer non-NaN values are skipped. For the others the fit
    is tried with the default method, then 'dogbox', then 'trf'; a row for
    which all three fail is reported as 'Notfound' and left as NaN.

    Side effects: prints progress every 1000 rows, installs ``timeout`` as
    the SIGALRM handler and writes '../Result_dup/<title>_fitting_aggr.csv'.

    Returns
    -------
    (df_fit, df_param, df_R, df_min, df_infl, df_slope)
        Fitted values and parameters as DataFrames, then per-row lists with
        the squared correlation between data and fit, the 0.5 crossing, the
        inflection point and the slope at the inflection point.
    """
    nRows=len(df1.index)
    rowIds=range(0,nRows)
    df_fit=pd.DataFrame(columns=[title+'_'+str(col) for col in df1.columns],index=rowIds)
    df_param=pd.DataFrame(columns=[title+'_'+name for name in ['Pl','a','b']],index=rowIds)
    df_R=[float('nan')]*nRows
    df_min=[float('nan')]*nRows
    df_infl=[float('nan')]*nRows
    df_slope=[float('nan')]*nRows
    colnames=[title+'_'+name for name in ['R','min','infl','slope']]

    for i in rowIds:
        if i%1000==0:
            print(title+'_'+str(i))
        # Rows are visited by position, whatever labels df1 carries.
        pts=df1.iloc[i].tolist()
        validValues=[0 if math.isnan(float(k)) else 1 for k in pts]
        if sum(validValues) <= 2:
            continue
        pts=[pts[k] for k in range(0,len(pts)) if validValues[k] == 1]
        temps1=[temps[k] for k in range(0,len(temps)) if validValues[k] == 1]

        dffitmodel=_fitWithTimeLimit(temps1, pts, startPars)
        if dffitmodel is None:
            print('Notfound '+title+'_'+str(i))
            continue
        if np.isnan(dffitmodel[1]):
            continue

        df_param.loc[i]=list(dffitmodel[0])
        # Fitted values only where the row had data; the rest stay NaN.
        dffit=[float('nan')] * len(temps)
        for j in range(0,len(temps)):
            if validValues[j] == 1:
                dffit[j]=fctSigmoidTR0(temps[j],dffitmodel[0][0],dffitmodel[0][1],dffitmodel[0][2])
        df_fit.loc[i]=dffit
        dffit=[dffit[k] for k in range(0,len(dffit)) if validValues[k] == 1]
        slope1, intercept1, r_value1, p_value1, std_err1 = sp.stats.linregress(pts,dffit)
        df_R[i]=r_value1**2
        df_min[i]=meltingPoint(dffitmodel,temps1)
        df_infl[i]=inflectionPoint(dffitmodel,temps1)
        df_slope[i]=meltingCurveSlope(dffitmodel, df_infl[i])

    df_res=pd.concat([df_fit,df_param],axis=1)
    df_res[colnames[0]]=df_R
    df_res[colnames[1]]=df_min
    df_res[colnames[2]]=df_infl
    df_res[colnames[3]]=df_slope
    df_info=df_info.reset_index(drop=True)
    df_res=pd.concat([df_info,df_res],axis=1)
    df_res.to_csv('../Result_dup/'+title+'_'+'fitting_aggr.csv',index=False)
    return df_fit,df_param,df_R,df_min,df_infl,df_slope

In [3]:
# Load the PSM export and keep only the columns and rows used below.
data=pd.read_csv('GMRJ1511B and GMRJ1512A 24vs28 Joint Accessions_30_New Export_2016-05-19_psms.txt',sep='\t')

# Wanted columns, written as 1-based positions and shifted to 0-based.
mycols=[position-1 for position in [2,4,5,7,10,19, 20, 22, 24, 26,28,30, 32, 34, 36, 39, 45, 47, 49,51]]
dataReduced=data[data.columns[mycols]]

# 'Search ID' values of the control / treated samples.
ctrl1Name,treated1Name='E','G'
ctrl2Name,treated2Name='F','H'
ctrl3Name,treated3Name='I','J'

# Keep rows flagged 'Used' with at most 30 % isolation interference.
isUsed=dataReduced['Quan Usage'] == 'Used'
isClean=dataReduced['Isolation Interference [%]'] <= 30
dataReduced=dataReduced[isUsed & isClean].reset_index(drop=True)

# Labels given to the ten reporter channels, in matching order.
reads = ["37", "40", "43", "46", "49", "52", "55", "58", "61", "64"]
channels = ["126", "127_N", "127_C", "128_N", "128_C", "129_N", "129_C", "130_N", "130_C", "131"]

In [4]:
# Normalize each sample; every call yields (values, sequence info).
(ctrl1,ctrl1seq),(treated1,treated1seq),(ctrl2,ctrl2seq),(treated2,treated2seq)=[
    normalize_column(dataReduced,reads,channels,searchId)
    for searchId in (ctrl1Name,treated1Name,ctrl2Name,treated2Name)
]

In [5]:
# One row per protein group: accession, protein description and gene name.
uniqueGroups=dataReduced.drop_duplicates('Protein Group Accessions')
Protein_desc=uniqueGroups['Protein Descriptions']
Desc=[]
gene_name=[]
for i in Protein_desc:
    # Split on the ' OS=' field marker. A bare 'OS' also occurs inside protein
    # names and would cut the description in the wrong place.
    temp=i.split(' OS=')
    Desc.append(temp[0])
    # Look for the gene name in the fields following the first ' OS='.
    result=re.search(r"GN=(\S+)", temp[1]) if len(temp) > 1 else None
    # Without a gene name, fall back to the full description.
    gene_name.append(result.group(1) if result else i)
Pgroups=pd.DataFrame([uniqueGroups['Protein Group Accessions'].tolist(),Desc,gene_name]).T
Pgroups.columns=["Protein Group Accessions", "Protein_Description", "Gene_Name"]

In [6]:
# One row per distinct peptide sequence, with underscore-style column names.
uniqueSequences=dataReduced.drop_duplicates('Sequence')
Seqs=uniqueSequences[['Unique Sequence ID','Sequence',"Protein Group Accessions"]].rename(columns={
    'Unique Sequence ID': "Unique_Sequence_ID",
    "Protein Group Accessions": "Protein_Group_Accessions",
})

In [7]:
# Numeric x values 37, 40, ..., 64 (the same values as the labels in `reads`).
temps=list(range(37,65,3))
# Row labels present in each normalized sample.
ctrl1Read=ctrl1.index.tolist()
treated1Read=treated1.index.tolist()
ctrl2Read=ctrl2.index.tolist()
treated2Read=treated2.index.tolist()

In [8]:
# Row labels shared by each control / treated pair.
joint1,joint2=[
    list(set(ctrlRows).intersection(treatedRows))
    for ctrlRows,treatedRows in ((ctrl1Read,treated1Read),(ctrl2Read,treated2Read))
]

In [9]:
# Fit the median curve of each sample over the rows shared by its
# control/treated pair. Each call returns (medians, fitted curve, parameters,
# R-squared from the fit, recomputed R-squared). `rsquared` is rebound by
# every call, so only the value from the last call survives; the recomputed
# values are kept separately in ar, br, cr and dr.
a,afit,afitmodel,rsquared,ar=plot_fit(ctrl1,joint1,temps,startPars,'ctrl1')
b,bfit,bfitmodel,rsquared,br=plot_fit(treated1,joint1,temps,startPars,'treated1')
c,cfit,cfitmodel,rsquared,cr=plot_fit(ctrl2,joint2,temps,startPars,'ctrl2')
d,dfit,dfitmodel,rsquared,dr=plot_fit(treated2,joint2,temps,startPars,'treated2')

In [10]:
# Scale factors per x value for each control / treated pair, tabulated by sample.
acoeff,bcoeff=scale_factor(a,afit,ar,b,bfit,br)
ccoeff,dcoeff=scale_factor(c,cfit,cr,d,dfit,dr)
coeffRows=[acoeff,bcoeff,ccoeff,dcoeff]
coeffLabels=[letter+'.coeff' for letter in 'abcd']
allcoeff=pd.DataFrame(coeffRows,index=coeffLabels,columns=reads)

In [11]:
# Apply the scale factors column by column.
ctrl1norm=ctrl1.mul(acoeff,axis='columns')
ctrl2norm=ctrl2.mul(ccoeff,axis='columns')
treated1norm=treated1.mul(bcoeff,axis='columns')
treated2norm=treated2.mul(dcoeff,axis='columns')

In [12]:
def _medianBySequence(seqInfo,normalized):
    """Median of the normalized values over all rows sharing a 'Sequence'."""
    # Only 'Sequence' is attached: the accession strings cannot be averaged,
    # and current pandas raises instead of silently dropping such columns.
    return pd.concat([seqInfo[['Sequence']],normalized],axis=1).groupby('Sequence').median()

ctrl1Unorm=_medianBySequence(ctrl1seq,ctrl1norm)
treated1Unorm=_medianBySequence(treated1seq,treated1norm)
ctrl2Unorm=_medianBySequence(ctrl2seq,ctrl2norm)
treated2Unorm=_medianBySequence(treated2seq,treated2norm)
combinedUnorm=pd.concat([ctrl1Unorm,treated1Unorm,ctrl2Unorm,treated2Unorm],axis=1)

# Attach each sequence's protein group by label, so the result does not
# depend on the row order of the concatenated frame.
seqToGroup=Seqs.drop_duplicates('Sequence').set_index('Sequence')['Protein_Group_Accessions']
combinedUnorm['Protein_Group_Accessions']=seqToGroup.reindex(combinedUnorm.index).tolist()
combinedUnorm=combinedUnorm.groupby('Protein_Group_Accessions').median()

In [13]:
# Split the combined protein-level table back into its four samples of ten
# columns each. Note that this rebinds the *norm names to protein-level data.
ctrl1norm,treated1norm,ctrl2norm,treated2norm=[
    combinedUnorm.iloc[:,first:first+10] for first in range(0,40,10)
]

In [14]:
# Keep the protein groups present in the combined table, ordered by accession,
# and save them.
UniProt_Accession = set(combinedUnorm.index).intersection(Pgroups['Protein Group Accessions'])
isQuantified=Pgroups['Protein Group Accessions'].isin(UniProt_Accession)
Pgroups=Pgroups[isQuantified]
Pgroups=Pgroups.sort_values('Protein Group Accessions')
Pgroups.to_csv('../Result_dup/Pgroups.csv',index=False)

In [15]:
# Fit every row of each sample's table. Each call prints its progress, writes
# '../Result_dup/<title>_fitting_aggr.csv' and returns the fitted values, the
# parameters and the per-row R, min, infl and slope lists.
ctrl1_fit,ctrl1_param,ctrl1_R,ctrl1_min,ctrl1_infl,ctrl1_slope=fitting(ctrl1norm,temps,startPars,'ctrl1',Pgroups)
treated1_fit,treated1_param,treated1_R,treated1_min,treated1_infl,treated1_slope=fitting(treated1norm,temps,startPars,'treated1',Pgroups)
ctrl2_fit,ctrl2_param,ctrl2_R,ctrl2_min,ctrl2_infl,ctrl2_slope=fitting(ctrl2norm,temps,startPars,'ctrl2',Pgroups)
treated2_fit,treated2_param,treated2_R,treated2_min,treated2_infl,treated2_slope=fitting(treated2norm,temps,startPars,'treated2',Pgroups)

ctrl1_0
ctrl1_1000
ctrl1_2000
ctrl1_3000
Notfound ctrl1_3043




ctrl1_4000
Notfound ctrl1_4823
ctrl1_5000
ctrl1_6000
ctrl1_7000
ctrl1_8000
treated1_0
treated1_1000
Notfound treated1_1095
Notfound treated1_1593
Notfound treated1_1685
treated1_2000
Notfound treated1_2054
Notfound treated1_2195




Notfound treated1_2983
treated1_3000
Notfound treated1_3010
Notfound treated1_3051
Notfound treated1_3319
Notfound treated1_3689
treated1_4000
Notfound treated1_4384
Notfound treated1_4755
treated1_5000
treated1_6000
Notfound treated1_6320
Notfound treated1_6420




Notfound treated1_6695
treated1_7000
Notfound treated1_7426
Notfound treated1_7718
treated1_8000
Notfound treated1_8363
ctrl2_0
Notfound ctrl2_174
Notfound ctrl2_916
ctrl2_1000
Notfound ctrl2_1234
Notfound ctrl2_1526
Notfound ctrl2_1538
Notfound ctrl2_1721
ctrl2_2000
ctrl2_3000
Notfound ctrl2_3085
Notfound ctrl2_3108
Notfound ctrl2_3212
Notfound ctrl2_3767
Notfound ctrl2_3874
ctrl2_4000
Notfound ctrl2_4960
ctrl2_5000
Notfound ctrl2_5740
ctrl2_6000
Notfound ctrl2_6695
ctrl2_7000
ctrl2_8000
treated2_0
Notfound treated2_151
Notfound treated2_209
Notfound treated2_412
Notfound treated2_672
Notfound treated2_916
treated2_1000
Notfound treated2_1751
Notfound treated2_1798
Notfound treated2_1984
treated2_2000
Notfound treated2_2054
Notfound treated2_2551
Notfound treated2_2574
Notfound treated2_2958
treated2_3000
Notfound treated2_3050
Notfound treated2_3077
Notfound treated2_3117
Notfound treated2_3295
Notfound treated2_3399
Notfound treated2_3401
Notfound treated2_3585
Notfound treated2_386