In [1]:
import scipy as sp
from scipy import stats
import numpy as np 
import pandas as pd
from FileExporter import create_download_link
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.cm as cm
from msresist.mining_bioreplicates import MergeDfbyMean, AvsBacrossCond, AvsBvsCacrossCond
from msresist.plsr import FilteringOutPeptides, ClusterAverages, GridSearch_CV, MeasuredVsPredicted_LOOCVplot, Q2Y_across_components, R2Y_across_components
from msresist.comp_estimator import ComHyperPar, MyOwnKMEANS
from sklearn.cross_decomposition import PLSRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import scale

Select one of the two normalization methods and comment out the one not used  

### Normalization 1: Mean Center

First center to the mean, then take the log for k-means. Taking the log requires transforming to absolute values first, since log(x) raises a math error for x < 0.

In [2]:
# Raw PSM tables, one CSV per replicate file; row 0 of each file is the header.
raw_csv_paths = (
    './msresist/data/Raw/20180817_JG_AM_TMT10plex_R1_psms_raw.csv',
    './msresist/data/Raw/20190214_JG_AM_PC9_AXL_TMT10_AC28_R2_PSMs_raw.csv',
    './msresist/data/Raw/CombinedBR3_TR1&2_raw.csv',
)
A, B, C = (pd.read_csv(csv_path, header=0) for csv_path in raw_csv_paths)

In [5]:
# Mean-centered data. Columns 0 and 1 hold the peptide sequence and the protein
# description, so only columns 2 onward are centered. with_std=False subtracts
# the mean without rescaling, and axis=1 centers each row across its columns.
A_d = pd.DataFrame(scale(A.iloc[:, 2:], with_std=False, axis=1))
B_d = pd.DataFrame(scale(B.iloc[:, 2:], with_std=False, axis=1))
C_d = pd.DataFrame(scale(C.iloc[:, 2:], with_std=False, axis=1))

# Peptide sequences
Aseqs, Bseqs, Cseqs = A.iloc[:, 0], B.iloc[:, 0], C.iloc[:, 0]

# Protein names: keep only the text before the first "OS" in each description.
# NOTE(review): this splits on any "OS", including one inside the protein name
# itself; later cells match on these exact strings, so it is left as-is.
Anames = [desc.split("OS")[0] for desc in A.iloc[:, 1]]
Bnames = [desc.split("OS")[0] for desc in B.iloc[:, 1]]
Cnames = [desc.split("OS")[0] for desc in C.iloc[:, 1]]

# Reassemble each replicate as [sequence | protein name | centered values].
# NOTE(review): the name frame and the centered frame both carry default integer
# labels, so each result has two columns labelled 0 and the original condition
# headers are not carried over — confirm this is intended.
Aframes = [Aseqs, pd.DataFrame(Anames), A_d]
Bframes = [Bseqs, pd.DataFrame(Bnames), B_d]
Cframes = [Cseqs, pd.DataFrame(Cnames), C_d]

# A, B and C are overwritten with their mean-centered versions.
A = pd.concat(Aframes, axis=1)
B = pd.concat(Bframes, axis=1)
C = pd.concat(Cframes, axis=1)

In [7]:
# Export the mean-centered replicate A table under the name "A_MeanCentered".
# NOTE(review): presumably renders a download link in the cell output — confirm
# against FileExporter.create_download_link.
create_download_link(A, "A_MeanCentered")

In [8]:
# Stack the three replicate tables row-wise.
frames = [A, B, C]
ABC = pd.concat(frames)

# Normalise the sequence in column 0:
#   - if its only "y" is the very first character, the sequence is kept whole;
#   - otherwise its first character is dropped.
# In both cases only the text before the first "-" is retained.
ABC_seqs = [
    (raw_seq if raw_seq[0] == "y" and "y" not in raw_seq[1:] else raw_seq[1:]).split("-")[0]
    for raw_seq in ABC.iloc[:, 0]
]
ABC['peptide-phosphosite'] = ABC_seqs
print("shape of concatenated matrix:", ABC.shape)

shape of concatenated matrix: (1496, 12)


In [9]:
# Export the concatenated, mean-centered table under the name "ABC_MeanCentered".
# NOTE(review): presumably renders a download link in the cell output — confirm
# against FileExporter.create_download_link.
create_download_link(ABC, "ABC_MeanCentered")

In [10]:
# Halt execution here so the cells below do not run after this section.
# NOTE(review): presumably a deliberate guard so "Run All" stops before the
# Fold-Change section overwrites A, B and C — confirm; remove it to run that section.
raise SystemExit

SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


### Normalization 2: Fold-Change

In [None]:
# Biological replicates. Note that C already contains TR1 and TR2.
# TODO: ask Jacq how she merged them.
A = pd.read_csv('./msresist/data/Fold-Change/20180817_JG_AM_TMT10plex_R1_psms.csv', header=0)
B = pd.read_csv('./msresist/data/Fold-Change/20190214_JG_AM_PC9_AXL_TMT10_AC28_R2_PSMs.csv', header=0)
C = pd.read_csv('./msresist/data/Fold-Change/CombinedBR3_TR1&2.csv', header=0)

# Header (column labels of replicate A, reused by later cells)
header = A.columns

# Filter based on fold change
# NOTE(review): A_f is not referenced again in the cells visible here — confirm
# whether the filtered frame was meant to replace A.
A_f = FilteringOutPeptides(A, header)

# Data: columns 2..11 of each replicate; t holds the column labels taken from A.
a, b, c = A.iloc[:, 2:12], B.iloc[:, 2:12], C.iloc[:, 2:12]
t = list(a.columns)

# Peptide sequences
Aseqs, Bseqs, Cseqs = A.iloc[:, 0], B.iloc[:, 0], C.iloc[:, 0]

# Protein names: keep only the text before the first "OS" in each description.
Anames = [desc.split("OS")[0] for desc in A.iloc[:, 1]]
Bnames = [desc.split("OS")[0] for desc in B.iloc[:, 1]]
Cnames = [desc.split("OS")[0] for desc in C.iloc[:, 1]]

A['Master Protein Descriptions'] = Anames
B['Master Protein Descriptions'] = Bnames
C['Master Protein Descriptions'] = Cnames

# Cell viability: third column of the phenotype table, first 10 rows.
Y = np.array(pd.read_csv('./msresist/data/Phenotypic_data/ydata.csv', header=0))
Y_cv = Y[:10, 2]

## Filter based on variability

- Get a dataframe with all duplicates and a dataframe with all triplicates. Leave out any peptide that only shows up once. 
- Use two separate pivot_table functions: 
    1. Get range of duplicated peptides for each condition. Ideally, however, we would find a way to compute Pearson's correlation across all conditions between both duplicated peptides. 
    2. Get stdev for trips. 
    - This will generate a new col with this information.
- Set threshold for each, then compute final merged ABC where all recurrent peptides are averaged

Concatenate all biological replicates. 

In [None]:
# Stack the three replicate tables row-wise.
frames = [A, B, C]
ABC = pd.concat(frames)

# Normalise the sequence in column 0:
#   - if its only "y" is the very first character, the sequence is kept whole;
#   - otherwise its first character is dropped.
# In both cases only the text before the first "-" is retained.
ABC_seqs = [
    (raw_seq if raw_seq[0] == "y" and "y" not in raw_seq[1:] else raw_seq[1:]).split("-")[0]
    for raw_seq in ABC.iloc[:, 0]
]
ABC['peptide-phosphosite'] = ABC_seqs
print("shape of concatenated matrix:", ABC.shape)

Build a dataframe indicating the number of recurrences of each peptide: 1, 2 or 3. From this dataframe we manually created two csv files (`ABC_Duplicates.csv` and `ABC_Triplicates.csv`) containing all duplicates and all triplicates, respectively. Finally, generate a counter to see the number of non-recurrent peptides, duplicates and triplicates:

In [None]:
# Number of rows in ABC that share each (protein, peptide-phosphosite) pair,
# sorted from least to most recurrent.
dups = pd.pivot_table(ABC, index=['Master Protein Descriptions', 'peptide-phosphosite'], aggfunc="size").sort_values()

# Tally how many pairs occur once, twice, three times, ... in a single pass
# (the previous list.count()-per-element version was quadratic). Keys are the
# recurrence counts in ascending order.
dups_counter = dups.value_counts().sort_index().to_dict()
print("total number of recurrences:", dups_counter)

# Keep the per-pair counts as a one-column DataFrame for export.
dups = pd.DataFrame(dups)
#create_download_link(dups, "ABC_RecurrentPeptides")

Import duplicates and triplicates (cols = Prot, Seq, Recurrencies):

In [None]:
# Manually curated lists of peptides seen in two and in three replicates.
recurrence_csv_paths = (
    './msresist/data/Fold-Change/ABC_Duplicates.csv',
    './msresist/data/Fold-Change/ABC_Triplicates.csv',
)
duplicated_, triplicated_ = (pd.read_csv(csv_path, header=0) for csv_path in recurrence_csv_paths)

Find all duplicates in the concatenated dataframe ABC.

Found that ADP-ribosylation factor-like protein 11 appears twice, and that the same two entries (identical values across conditions) also appear under the protein name "(blank)". Check with:

In [None]:
# Rows of ABC whose protein description mentions ADP-ribosylation factor-like protein 11.
is_arl11 = ABC["Master Protein Descriptions"].str.contains("ADP-ribosylation factor-like protein 11")
ABC[is_arl11]

In [None]:
# Rows of ABC whose protein description contains the literal text "(blank)".
# regex=False is required: as a regular expression "(blank)" is a capture group
# that matches any name containing "blank" and triggers a match-group warning.
ABC[ABC["Master Protein Descriptions"].str.contains("(blank)", regex=False)]

In [None]:
# Index ABC once by (sequence, protein name) so every lookup below reuses it
# instead of rebuilding the index per peptide. drop=False keeps both key
# columns in the frame, and reset_index() adds the old row index as column 0.
ABC_by_peptide = ABC.reset_index().set_index(["peptide-phosphosite", "Master Protein Descriptions"], drop=False)

dupslist = []   #should be shape 492
for dup_name, dupseq in zip(duplicated_.iloc[:, 0], duplicated_.iloc[:, 1]):
    # Entries without a protein name are skipped before any lookup is attempted.
    if dup_name == "(blank)":
        continue
    pepts = ABC_by_peptide.loc[dupseq, dup_name]
    names = pepts.iloc[:, 2]
    # Accept only peptides found exactly twice under a single protein name.
    if len(pepts) == 2 and len(set(names)) == 1:
        dupslist.extend(pepts.iloc[i, :] for i in range(len(pepts)))
    else:
        print("check this")
        print(pepts)
duplicates = pd.DataFrame(dupslist)

Find all triplicates in the concatenated dataframe ABC:

In [None]:
# Index ABC once by (sequence, protein name) so every lookup below reuses it
# instead of rebuilding the index per peptide. drop=False keeps both key
# columns in the frame, and reset_index() adds the old row index as column 0.
ABC_by_peptide = ABC.reset_index().set_index(["peptide-phosphosite", "Master Protein Descriptions"], drop=False)

# NOTE(review): the original comment here read "should be shape 492", identical
# to the duplicates cell — confirm the expected size for triplicates.
tripslist = []
for trip_name, tripseq in zip(triplicated_.iloc[:, 0], triplicated_.iloc[:, 1]):
    # Entries without a protein name are skipped before any lookup is attempted.
    if trip_name == "(blank)":
        continue
    pepts = ABC_by_peptide.loc[tripseq, trip_name]
    names = pepts.iloc[:, 2]
    # Accept only peptides found exactly three times under a single protein name.
    if len(pepts) == 3 and len(set(names)) == 1:
        tripslist.extend(pepts.iloc[i, :] for i in range(len(pepts)))
    else:
        print("check this")
        # Show the offending rows, as the duplicates cell does.
        print(pepts)
triplicates = pd.DataFrame(tripslist)

Need to find a way to get Pearson's correlation instead of using ptp... For the mean we take values of the same condition, whereas for Pearson we want to compare the entire list of values between replicates...

In [None]:
# Per-condition aggregations: the mean plus a spread statistic
# (standard deviation for triplicates, range via np.ptp for duplicates).
# 'mean' and 'std' are passed by name so pandas' own reductions are used on
# every version. pandas has historically substituted them for np.mean/np.std
# (sample std, ddof=1) and has deprecated that substitution, after which np.std
# would silently switch to ddof=0.
func_tri = {cond: ['mean', 'std'] for cond in t}
func_dup = {cond: ['mean', np.ptp] for cond in t}  # np.corrcoef

# One row per (protein, peptide); columns are (condition, statistic) pairs.
ABC_dups_avg = pd.pivot_table(duplicates, values=t, index=['Master Protein Descriptions', 'peptide-phosphosite'], aggfunc=func_dup)
ABC_dups_avg = ABC_dups_avg.reset_index()[header]

ABC_trips_avg = pd.pivot_table(triplicates, values=t, index=['Master Protein Descriptions', 'peptide-phosphosite'], aggfunc=func_tri)
ABC_trips_avg = ABC_trips_avg.reset_index()[header]

### Apply filter: Set range to 0.6 and stdev to 0.3

Overlapping peptides across 2 BRs:

In [None]:
ABC_dups_avg = ABC_dups_avg.set_index(['Master Protein Descriptions', 'peptide-phosphosite'])

# Boolean column masks picking each statistic out of the second column level.
dup_stat_level = ABC_dups_avg.columns.get_level_values(1)
dup_ptp_cols = dup_stat_level == 'ptp'
dup_mean_cols = dup_stat_level == 'mean'
max_dup_range = 0.6

# Keep the per-condition means of every peptide whose range is within the
# threshold in all conditions.
dups_final, protnames, seqs = [], [], []
for row_pos, row_key in enumerate(ABC_dups_avg.index):
    ptp = ABC_dups_avg.iloc[row_pos, dup_ptp_cols]
    if not all(v <= max_dup_range for v in ptp):
        continue
    dups_final.append(ABC_dups_avg.iloc[row_pos, dup_mean_cols])
    seqs.append(row_key[1])       # index level 1: peptide-phosphosite
    protnames.append(row_key[0])  # index level 0: protein description

dups_final = pd.DataFrame(dups_final).reset_index().iloc[:, 1:]

# Reassemble as [sequence | protein name | means] and restore the header labels.
frames = [pd.DataFrame(seqs), pd.DataFrame(protnames), dups_final]
dups_final = pd.concat(frames, axis=1)
dups_final.columns = header
dups_final = dups_final.sort_values(by="Master Protein Descriptions")

Overlapping peptides across all 3 BRs:

In [None]:
ABC_trips_avg = ABC_trips_avg.set_index(['Master Protein Descriptions', 'peptide-phosphosite'])

# Boolean column masks picking each statistic out of the second column level.
trip_stat_level = ABC_trips_avg.columns.get_level_values(1)
trip_std_cols = trip_stat_level == 'std'
trip_mean_cols = trip_stat_level == 'mean'
max_trip_std = 0.3

# Keep the per-condition means of every peptide whose standard deviation is
# within the threshold in all conditions.
trips_final, protnames, seqs = [], [], []
for row_pos, row_key in enumerate(ABC_trips_avg.index):
    std = ABC_trips_avg.iloc[row_pos, trip_std_cols]
    if not all(v <= max_trip_std for v in std):
        continue
    trips_final.append(ABC_trips_avg.iloc[row_pos, trip_mean_cols])
    seqs.append(row_key[1])       # index level 1: peptide-phosphosite
    protnames.append(row_key[0])  # index level 0: protein description

trips_final = pd.DataFrame(trips_final).reset_index().iloc[:, 1:]

# Reassemble as [sequence | protein name | means] and restore the header labels.
frames = [pd.DataFrame(seqs), pd.DataFrame(protnames), trips_final]
trips_final = pd.concat(frames, axis=1)
trips_final.columns = header
trips_final = trips_final.sort_values(by="Master Protein Descriptions")

In [None]:
# Combine the filtered duplicate and triplicate averages into one table,
# ordered by protein description.
frame = [dups_final, trips_final]
X_f = pd.concat(frame).sort_values(by="Master Protein Descriptions")

In [None]:
# Number of rows in X_f that share each (protein, peptide-phosphosite) pair.
X_f_size = pd.pivot_table(X_f, index=['Master Protein Descriptions', 'peptide-phosphosite'], aggfunc="size")

# Tally how many pairs occur once, twice, ... in a single pass (the previous
# list.count()-per-element version was quadratic). Keys are the recurrence
# counts in ascending order.
Xf_counter = X_f_size.value_counts().sort_index().to_dict()
print("total number of recurrences:", Xf_counter)

In [None]:
# Export the filtered, merged table under the name "ABC_FilteredByVarAndFold".
# NOTE(review): presumably renders a download link in the cell output — confirm
# against FileExporter.create_download_link.
create_download_link(X_f, "ABC_FilteredByVarAndFold")