In [1]:
import warnings
from collections import Counter

import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd
import scipy as sp
from scipy import stats
from sklearn.cross_decomposition import PLSRegression
from sklearn.pipeline import Pipeline

from FileExporter import create_download_link
from msresist.comp_estimator import ComHyperPar, MyOwnKMEANS
from msresist.mining_bioreplicates import MergeDfbyMean, AvsBacrossCond, AvsBvsCacrossCond
from msresist.plsr import FilteringOutPeptides, ClusterAverages, GridSearch_CV, MeasuredVsPredicted_LOOCVplot, Q2Y_across_components, R2Y_across_components

In [2]:
# Biological replicates (BRs). Note that C already contains TR1 and TR2
# (TODO: ask Jacq how she merged them).
A = pd.read_csv('./msresist/data/20180817_JG_AM_TMT10plex_R1_psms.csv', header=0)
B = pd.read_csv('./msresist/data/20190214_JG_AM_PC9_AXL_TMT10_AC28_R2_PSMs.csv', header=0)
C = pd.read_csv('./msresist/data/CombinedBR3_TR1&2.csv', header=0)
# Previously exported filtered matrix; its first CSV column is dropped
# (presumably the row index written at export time -- TODO confirm).
X_f = pd.read_csv('./msresist/data/ABC_fVar_fVal.csv', header=0).iloc[:, 1:]

# Cell viability: third column of ydata.csv, restricted to its first 10 entries.
Y = np.array(pd.read_csv('./msresist/data/ydata.csv', header=0))
Y_cv = Y[:, 2]
Y_cv = Y_cv[:10]

# Data: columns 2..11 of every replicate; `t` holds their names as found in A.
a, b, c = A.iloc[:, 2:12], B.iloc[:, 2:12], C.iloc[:, 2:12]
t = list(a.columns)

# Peptide sequences (first column of every replicate).
Aseqs, Bseqs, Cseqs = A.iloc[:, 0], B.iloc[:, 0], C.iloc[:, 0]

# Protein names: keep only the text before the first "OS" of the second column
# so that the same protein gets the same label in every replicate.
# NOTE(review): splitting on the bare substring "OS" would also cut a name that
# itself contains "OS" -- confirm that no description in the data is affected.
Anames = [description.split("OS")[0] for description in A.iloc[:, 1]]
Bnames = [description.split("OS")[0] for description in B.iloc[:, 1]]
Cnames = [description.split("OS")[0] for description in C.iloc[:, 1]]

A['Master Protein Descriptions'] = Anames
B['Master Protein Descriptions'] = Bnames
C['Master Protein Descriptions'] = Cnames

### Filter based on variability

Use `ABC_nodups` to locate the duplicates ("dups") and triplicates ("trips") after setting the phosphosite sequences as the index. Use `if` statements to build one DataFrame with all dups and another with all trips. Then apply a separate `pivot_table` call to each of them: compute the range for dups and the standard deviation for trips. This generates a new column holding that information. We are then going to fit ABC_avg, in which only overlapping peptides show up. For every peptide there will be a column with either the range or the standard deviation. Access this information, set a threshold, and filter the peptides based on it.

In [3]:
# Stack the three replicates into a single frame (original row labels are kept).
frames = [A, B, C]
ABC = pd.concat(frames)

# Normalise every sequence: keep the text before the first "-" and drop the
# leading character, unless that character is the only "y" in the sequence.
# (Presumably the leading lowercase letter is a label marker rather than a
# phosphosite -- TODO confirm.)
ABC_seqs = [
    seq.split("-")[0] if seq[0] == "y" and "y" not in seq[1:] else seq[1:].split("-")[0]
    for seq in ABC.iloc[:, 0]
]
ABC['peptide-phosphosite'] = ABC_seqs
print("shape of concatenated matrix:", ABC.shape)

# Count the number of duplicates across data sets: `dups` holds how many rows
# share each (protein, peptide) pair, `dups_counter` how many pairs occur
# once, twice, three times, ...
dups = pd.pivot_table(ABC, index=['Master Protein Descriptions', 'peptide-phosphosite'], aggfunc="size")
# Counter tallies in one pass and keeps first-seen key order, so the printed
# dict is the same as the previous per-key list.count() version.
dups_counter = dict(Counter(list(dups)))
print("total number of recurrences:", dups_counter)

dups = pd.DataFrame(dups)
create_download_link(dups, "dups")

In [4]:
# Rows whose (protein, peptide) pair occurs more than once in ABC, ordered by protein.
recurrence_keys = ['Master Protein Descriptions', 'peptide-phosphosite']
is_recurrent = ABC.duplicated(recurrence_keys, keep=False)
recurrent_rows = ABC[is_recurrent].sort_values(by="Master Protein Descriptions")

# Distinct values of the first column, taken before that frame gets re-indexed.
ABC_nodups = recurrent_rows.iloc[:, 0].drop_duplicates()

# Same rows, addressable by peptide sequence.
DupsAndTrips = recurrent_rows.set_index("peptide-phosphosite")

In [5]:
header = ['peptide-phosphosite', 'Master Protein Descriptions', 'PC9', 'Erl', 'R428', 'Erl/R428', 'Erl+HGF', 'Erl+FGF', 'Erl+IGF', 'KO Erl', 'KO R428', 'KO Erl/R428']

# Split the recurring peptides into those seen twice ("dups") and those seen
# three times ("trips"). Each list collects one row (a Series) per measurement.
dups, trips = [], []
for peptide in ABC_nodups:
    pepts = DupsAndTrips.loc[peptide]
    names = pepts.iloc[:, 0]
    rows = [pepts.iloc[k, :] for k in range(len(pepts))]
    single_protein = len(set(names)) == 1

    if len(pepts) == 2 and single_protein:
        dups.extend(rows)
    elif len(pepts) == 3 and single_protein:
        trips.extend(rows)
    # Hand-curated peptides that are not a plain 2- or 3-row match for a single
    # protein name. Any other peptide reaching this point is left out.
    elif peptide == "VVESAyEVIk":
        trips.extend(rows)
    elif peptide == "yDHLFk":
        dups.extend(rows)
    elif peptide == "IEDNEyTAR":
        # Positional split: rows 0-2 are treated as a triplicate and rows 3-4 as
        # a duplicate. This relies on DupsAndTrips being sorted by protein
        # description (presumably the Lck rows come before the Lyn rows -- TODO confirm).
        trips.extend([rows[0], rows[1], rows[2]])
        dups.extend([rows[3], rows[4]])

# One row per measurement; the peptide (row label) becomes the first column,
# after which the columns are renamed positionally.
dups = pd.DataFrame(dups).reset_index()
trips = pd.DataFrame(trips).reset_index()
dups.columns = header
trips.columns = header

# print(dups.shape)    #TO DO: Assert function to check that these shapes match with the ones generated by dups_counter (pivot_table)
# print(trips.shape)

# create_download_link(dups, "ABC_dups")

In [6]:
# create_download_link(trips, "ABC_trips")

We need to find a way to compute Pearson's correlation instead of using ptp (the range). For the mean, we take the values of the same condition across replicates, whereas for Pearson's correlation we want to compare the entire list of values between replicates.

In [7]:
# Per-condition aggregations: mean plus a spread measure
# (range for duplicates, standard deviation for triplicates).
# NOTE(review): pandas may substitute its own `std` for np.std inside
# aggregations, which can change the degrees of freedom used -- confirm which is intended.
func_tri = {condition: (np.mean, np.std) for condition in t}
func_dup = {condition: (np.mean, np.ptp) for condition in t}

peptide_keys = ['Master Protein Descriptions', 'peptide-phosphosite']

# Collapse the repeated measurements of each (protein, peptide) pair, then put
# the identifier and condition columns back in `header` order.
ABC_dups_avg = pd.pivot_table(dups, values=t, index=peptide_keys, aggfunc=func_dup).reset_index()[header]
ABC_trips_avg = pd.pivot_table(trips, values=t, index=peptide_keys, aggfunc=func_tri).reset_index()[header]

### Apply filter: Set range to 0.6 and stdev to 0.3

Overlapping peptides across 2 BRs:

In [8]:
# Largest range tolerated in any condition for a peptide seen in 2 BRs.
DUPS_MAX_RANGE = 0.6

# Work on an indexed copy so ABC_dups_avg itself is left untouched and this
# cell can be re-run without failing on an already-consumed column.
dups_indexed = ABC_dups_avg.set_index(['Master Protein Descriptions', 'peptide-phosphosite'])
dups_stat = dups_indexed.columns.get_level_values(1)
dups_ranges = dups_indexed.loc[:, dups_stat == 'ptp']
dups_means = dups_indexed.loc[:, dups_stat == 'mean']

# Keep a peptide only if its range is within the threshold in every condition
# (a missing range compares as False, so such a peptide is dropped).
dups_ok = (dups_ranges <= DUPS_MAX_RANGE).all(axis=1).values

dups_final = dups_means.loc[dups_ok].copy()
# Drop the statistic level so the remaining columns are plain condition names.
dups_final.columns = dups_final.columns.droplevel(1)
# Bring protein and peptide back as columns, ordered as in `header`.
dups_final = dups_final.reset_index()[header]
dups_final = dups_final.sort_values(by="Master Protein Descriptions")

Overlapping peptides across all 3 BRs:

In [9]:
# Largest standard deviation tolerated in any condition for a peptide seen in 3 BRs.
TRIPS_MAX_STD = 0.3

# Work on an indexed copy so ABC_trips_avg itself is left untouched and this
# cell can be re-run without failing on an already-consumed column.
trips_indexed = ABC_trips_avg.set_index(['Master Protein Descriptions', 'peptide-phosphosite'])
trips_stat = trips_indexed.columns.get_level_values(1)
trips_stds = trips_indexed.loc[:, trips_stat == 'std']
trips_means = trips_indexed.loc[:, trips_stat == 'mean']

# Keep a peptide only if its standard deviation is within the threshold in
# every condition (a missing value compares as False, so such a peptide is dropped).
trips_ok = (trips_stds <= TRIPS_MAX_STD).all(axis=1).values

trips_final = trips_means.loc[trips_ok].copy()
# Drop the statistic level so the remaining columns are plain condition names.
trips_final.columns = trips_final.columns.droplevel(1)
# Bring protein and peptide back as columns, ordered as in `header`.
trips_final = trips_final.reset_index()[header]
trips_final = trips_final.sort_values(by="Master Protein Descriptions")

In [10]:
# Combine the peptides that passed the 2-BR and 3-BR variability filters.
frame = [dups_final, trips_final]
X_f = pd.concat(frame)
X_f = X_f.sort_values(by="Master Protein Descriptions")

# Further peptide filtering implemented in msresist.plsr.
X_f = FilteringOutPeptides(X_f, header)

# How many (protein, peptide) pairs occur once, twice, ... in the final matrix.
X_f_size = pd.pivot_table(X_f, index=['Master Protein Descriptions', 'peptide-phosphosite'], aggfunc="size")
# Counter tallies in a single pass instead of one list.count() scan per entry.
dups_counter = dict(Counter(list(X_f_size)))

# Kept as the last expression of the cell so that whatever the helper returns
# is rendered by the notebook (presumably a download link -- confirm in FileExporter).
create_download_link(X_f, "ABC_fVar_fVal")

Trying the same using comprehension...

In [11]:
# duplicates, triplicates, toss = [], [], []
# for i in ABC_nodups:
#     pepts = DupsAndTrips.loc[i]
#     names = pepts.iloc[:,0]
#     duplicates.append([list(pepts.iloc[v, :]) for v in range(len(pepts)) if len(pepts) == 2 and len(set(names)) == 1])
#     triplicates.append([pepts.iloc[v, :] for v in range(len(pepts)) if len(pepts) == 3 and len(set(names)) == 1])
#     if i == "VVESAyEVIk":
#         triplicates.append([pepts.iloc[v, :] for v in range(len(pepts))])
#     if i == "yDHLFk":
#         duplicates.append([pepts.iloc[v, :] for v in range(len(pepts))])
#     if i == "IEDNEyTAR":
#         triplicates.append([pepts.iloc[v, :] for v in range(len(pepts)) if pepts.iloc[v, 0] == "Tyrosine-protein kinase Lck"])
#         duplicates.append([pepts.iloc[v, :] for v in range(len(pepts)) if pepts.iloc[v, 0] == "Tyrosine-protein kinase Lyn"])
        
# duplicates = pd.DataFrame(duplicates)
# triplicates = pd.DataFrame(triplicates)

# duplicates


# print(duplicates.shape)
# print(triplicates.shape)

## Run model with refined data

In [18]:
# Split the filtered matrix by column position: identifiers first, then measurements.
X_seqs = X_f.iloc[:, 0]
X_protnames = X_f.iloc[:, 1]
# Transposed so that the rows of X_data correspond to columns 2..11 of X_f.
X_data = X_f.iloc[:, 2:12].transpose()

In [20]:
# Silence warnings only while the search runs; the previous filters are
# restored on exit so later cells still report their own warnings.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    CVresults_max, CVresults_min, best_params = ComHyperPar(X_data, Y_cv, X_protnames, X_seqs)

# display(CVresults_min)
create_download_link(CVresults_min, "23432")

## Merging Biological Replicates

There are overlapping peptides across all 3 BRs, e.g. gSTAENAEyLR-1, qNVPVINITyDSTPEDVk, gSHQISLDNPDyQQDFFPk, ... The same protein can have different descriptions in different replicates, which is why I'm renaming them to `.split("OS")[0]`.

In [None]:
# A[A["Master Protein Descriptions"].str.contains("UFO")]

In [None]:
# B[B["Master Protein Descriptions"].str.contains("UFO")]

In [None]:
# C[C["Master Protein Descriptions"].str.contains("UFO")]

In [None]:
# Merge the three replicates over the condition columns listed in `t`
# (presumably by averaging overlapping peptides -- confirm in msresist.mining_bioreplicates).
ABC_avg = MergeDfbyMean(A, B, C, t)

# ABC_avg[ABC_avg["Master Protein Descriptions"].str.contains("UFO")]

Note that the shape of ABC_avg is (994, *10*), meaning that the first two columns are not really counted as data columns when working with the frame in pandas.

## Variability between overlapping peptides across Biological Replicates

In [None]:
# AvsB = AvsBacrossCond(A,B, t)
# plt.savefig("ABrawAcrossCond_NoOL.png")

In [None]:
# AvsC = AvsBacrossCond(A,C, t)
# plt.savefig("ACrawAcrossCond_NoOL.png")

In [None]:
# BvsC = AvsBacrossCond(B,C, t)
# plt.savefig("BCrawAcrossCond_NoOL.png")

In [None]:
# AvsBvsC = AvsBvsCacrossCond(A,B,C,t)
# plt.savefig("ABCrawAcrossCond_NoOL.png")

## Spectral Biclustering

In [None]:
# from sklearn.cluster import SpectralBiclustering
# from sklearn.datasets import make_checkerboard
# from sklearn.datasets import samples_generator as sg
# from sklearn.metrics import consensus_score
# from sklearn.datasets import make_biclusters

In [None]:
# n_clusters = (5, 10)
# a, rows, columns = make_checkerboard(shape = (300, 10), n_clusters = n_clusters, shuffle = False, random_state = 0, noise = 10)


In [None]:
# fig, ax = plt.subplots(figsize=(10,20))
# ax.matshow(a, cmap=plt.cm.Blues)
# ax.set_title("Original dataset")

In [None]:
# a, row_idx, col_idx = sg._shuffle(a, random_state=0)
# plt.matshow(a, cmap=plt.cm.Blues)
# plt.title("Shuffled dataset")

In [None]:
# A

In [None]:
# model = SpectralBiclustering(n_clusters = n_clusters, method = 'log', random_state = 0)
# model.fit(a)

In [None]:
# score = consensus_score(model.biclusters_, (rows[:, row_idx], columns[:, col_idx]))
# print("consensus score: {:.1f}".format(score))

In [None]:
# fit_data = a[np.argsort(model.row_labels_)]
# fit_data = fit_data[:, np.argsort(model.column_labels_)]

In [None]:
# plt.matshow(fit_data, cmap=plt.cm.Blues)
# plt.title("After biclustering; rearranged to show biclusters")

In [None]:
# plt.matshow(np.outer(np.sort(model.row_labels_) + 1, np.sort(model.column_labels_) + 1), cmap=plt.cm.Blues)
# plt.title("Checkerboard structure of rearranged data")

# plt.show()