# FIGURE 2

In [1]:
import numpy as np
import pandas as pd
import math
from sklearn.cross_decomposition import PLSRegression
from sklearn.mixture import GaussianMixture
from sklearn.pipeline import Pipeline
from msresist.parameter_tuning import MSclusPLSR_tuning
from msresist.plsr import Q2Y_across_components, R2Y_across_components
from msresist.figures.figure2 import plotR2YQ2Y, plotMixedClusteringPLSR_GridSearch, plotMeasuredVsPredicted, plotScoresLoadings, plotclusteraverages
from msresist.clustering import MassSpecClustering
from msresist.sequence_analysis import preprocess_seqs, FormatName, pYmotifs
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.cm as cm
from msresist.pre_processing import preprocessing, MergeDfbyMean, LinearFoldChange, FoldChangeToControl, MapOverlappingPeptides, BuildMatrix, TripsMeanAndStd, CorrCoefFilter
from msresist.FileExporter import create_download_link
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so deprecation and numerical warnings
# from the libraries used below are hidden as well — confirm this is intended.
warnings.simplefilter("ignore")

### Importing and Pre-processing Data

### Phosphorylation measurements:

In [2]:
# Optional pandas display overrides (currently disabled). Uncomment to raise the
# column-width / row / column limits used when rendering tables below.
# pd.set_option('display.max_colwidth', 1000)
# pd.set_option('display.max_rows', 1000)
# pd.set_option('display.max_columns', 1000)

In [3]:
# Load the phosphorylation data set, run the sequence pre-processing step and
# order the peptides alphabetically by protein name.
ABC = preprocess_seqs(
    preprocessing(motifs=True, Vfilter=True, FCfilter=True, log2T=False),
    "Y",
).sort_values(by="Protein")

# The first six columns are the per-peptide annotations handed to the
# clustering step as `info`; every remaining column is one treatment.
header = ABC.columns
treatments = header[6:]

info = ABC.iloc[:, :6]
data = ABC.iloc[:, 6:].T  # transposed: one row per treatment column

In [4]:
# Display the pre-processed table as a visual sanity check.
ABC

Unnamed: 0,Protein,Sequence,UniprotAcc,Position,r2_Std,BioReps,PC9,Erl,R428,Erl/R428,Erl+HGF,Erl+FGF,Erl+IGF,KO Erl,KO R428,KO Erl/R428
0,"1-phosphatidylinositol 4,5-bisphosphate phosph...",RNPGFyVEANP,P19174,Y783-p,,1,1.0,0.491785,0.766856,0.399749,0.535976,0.681548,0.386105,0.328710,0.787673,0.376014
3,14-3-3 protein epsilon,GDYHRyLAEFA,P62258,Y131-p,,1,1.0,1.154295,0.958756,0.613405,0.817597,0.872520,0.843452,0.500112,0.570839,0.488875
4,14-3-3 protein zeta/delta,LLSVAyKNVVG,P63104,Y48-p,,1,1.0,0.753478,0.604044,0.165229,0.149529,0.147568,0.141626,0.144489,0.164610,0.139950
15,60S ribosomal protein L22-like 1,TYELRyFQISQ,Q6P5R6,Y108-p,,1,1.0,0.928301,1.005567,1.089793,0.749455,0.932451,1.040302,0.793288,2.625435,0.920942
16,ADP-ribosylation factor-like protein 11,KtTLLyKLKGH,Q969Q4,Y30-p,0.94,2,1.0,0.756499,0.773123,0.475867,0.805728,0.594572,0.643155,0.646133,0.709589,0.668610
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
781,Zinc finger CCCH-type antiviral protein 1,LLSSDyRIING,Q7Z2W4,Y410-p,,1,1.0,0.660468,0.341765,0.085761,0.271659,0.220393,0.261110,0.112587,0.109253,0.079204
784,Zinc finger protein 185,LFVKEyVNASE,O15231,Y581-p,0.96,2,1.0,0.278890,0.300255,0.057147,0.231330,0.128414,0.150129,0.067498,0.091185,0.058823
785,Zinc finger protein 185,PVSARySNVSS,O15231,Y597-p,,1,1.0,0.682262,0.691944,0.771137,0.883649,0.677894,0.592969,0.590719,0.577119,0.412938
783,Zinc finger protein 185,GALADyEGKDV,O15231,Y408-p,,1,1.0,0.525483,0.481943,0.146341,0.607250,0.407718,0.541902,0.294049,0.398032,0.186708


In [5]:
# Show only the peptides whose protein name contains "Receptor-inter".
is_match = ABC["Protein"].str.contains("Receptor-inter")
ABC[is_match]

Unnamed: 0,Protein,Sequence,UniprotAcc,Position,r2_Std,BioReps,PC9,Erl,R428,Erl/R428,Erl+HGF,Erl+FGF,Erl+IGF,KO Erl,KO R428,KO Erl/R428
629,Receptor-interacting serine/threonine-protein ...,QDEANyHLYGS,Q13546,Y384-p,,1,1.0,0.710479,0.703052,0.408664,0.613758,0.681162,0.998074,0.64517,0.845214,0.570437


In [6]:
# Checkpoint after data import / pre-processing.
# While the flag is True, "Run All" stops at this cell (the behaviour of the
# original bare `raise SystemExit`), so none of the phenotype, clustering or
# PLSR cells below are executed. Set it to False to run the whole notebook.
STOP_AFTER_PREPROCESSING = True

if STOP_AFTER_PREPROCESSING:
    raise SystemExit

SystemExit: 

### Phenotypes

#### Cell Viability:

In [None]:
# Two cell-viability experiments; only the leading rows/columns of each file are used.
Y_cv1 = pd.read_csv('./msresist/data/Phenotypic_data/CV_raw3.csv').iloc[:30, :11]
Y_cv2 = pd.read_csv('./msresist/data/Phenotypic_data/CV_raw4.csv').iloc[:29, :11]

# Express every column except the first one relative to its own first row.
for col_pos in range(1, Y_cv2.columns.size):
    for cv_frame in (Y_cv1, Y_cv2):
        cv_frame.iloc[:, col_pos] /= cv_frame.iloc[0, col_pos]

# Combine both experiments keyed on "Elapsed" (MergeDfbyMean presumably averages
# rows sharing the same value — verify against msresist.pre_processing), restore
# the original column order, then keep the first row with Elapsed == 72 without
# its "Elapsed" entry.
stacked_cv = pd.concat([Y_cv1, Y_cv2], axis=0)
merged_cv = MergeDfbyMean(stacked_cv, Y_cv1.columns, "Elapsed")
merged_cv = merged_cv.reset_index()[Y_cv1.columns]
Y_cv = merged_cv[merged_cv["Elapsed"] == 72].iloc[0, 1:]

# Chained MS mixed clustering + PLSR analysis

In [None]:
# Clustering settings shared with the pipeline cell further down.
ncl = 3
GMMweight = 0

# Fit the clustering model on the measurements and project them onto the
# resulting cluster centers.
clustering_model = MassSpecClustering(
    info,
    ncl,
    GMMweight=GMMweight,
    distance_method="Binomial",
)
MSC = clustering_model.fit(data, Y_cv)
centers = MSC.transform(data)

# Single-panel figure for the R2Y / Q2Y plot.
fig, ax = plt.subplots(figsize=(7, 6))
plotR2YQ2Y(ax, ncl, centers, Y_cv)

In [None]:
# Disabled: grid-search plot for the mixed clustering + PLSR model.
# Uncomment both lines to run it.
# fig, ax = plt.subplots(1, 1, figsize=(8,6))
# plotMixedClusteringPLSR_GridSearch(ax, data, info, Y_cv)

In [None]:
# Number of PLSR components.
ncomp = 2

# Chain the clustering step and the PLS regression into one estimator so that
# both are fitted together on the same data.
pipeline_steps = [
    ('mixedCl', MassSpecClustering(info, ncl, GMMweight=GMMweight, distance_method="Binomial")),
    ('plsr', PLSRegression(ncomp)),
]
mixedCl_plsr = Pipeline(pipeline_steps)
fit = mixedCl_plsr.fit(data, Y_cv)

In [None]:
# Single-panel figure comparing measured and predicted values for the fitted pipeline.
fig, ax = plt.subplots(figsize=(6, 5))
plotMeasuredVsPredicted(ax, mixedCl_plsr, data, Y_cv)

In [None]:
# Two side-by-side panels for the scores/loadings plot.
fig, ax = plt.subplots(ncols=2, figsize=(12, 6))

# One rainbow colour per cluster, evenly spaced over the colormap.
color_positions = np.linspace(0, 1, num=ncl)
colors_ = cm.rainbow(color_positions)

plotScoresLoadings(ax, fit, centers, Y_cv, ncl, colors_, treatments)

In [None]:
# Pull the fitted clustering step out of the pipeline and list its cluster
# members, then preview the first ten rows.
fitted_clustering = mixedCl_plsr.named_steps["mixedCl"]
clustermembers = fitted_clustering.clustermembers(ABC.T)
clustermembers.iloc[:10]

In [None]:
# Export the cluster-membership table under the name "DataClustering".
# NOTE(review): presumably renders a clickable download link as the cell
# output — confirm against msresist.FileExporter.
create_download_link(clustermembers, "DataClustering")

In [None]:
# Single-panel figure for the per-cluster averages, reusing the cluster colours.
fig, ax = plt.subplots(figsize=(8, 5))
plotclusteraverages(ax, ABC, mixedCl_plsr, colors_)