# FIGURE 2

In [1]:
import numpy as np
import pandas as pd
import math
from sklearn.cross_decomposition import PLSRegression
from sklearn.mixture import GaussianMixture
from sklearn.pipeline import Pipeline
from msresist.parameter_tuning import MSclusPLSR_tuning
from msresist.plsr import Q2Y_across_components, R2Y_across_components
from msresist.figures.figure2 import plotR2YQ2Y, plotMixedClusteringPLSR_GridSearch, plotMeasuredVsPredicted, plotScoresLoadings, plotclusteraverages
from msresist.clustering import MassSpecClustering
from msresist.sequence_analysis import preprocess_seqs, FormatName, pYmotifs
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from msresist.pre_processing import preprocessing, MergeDfbyMean, LinearFoldChange, FoldChangeToControl, MapOverlappingPeptides, BuildMatrix, TripsMeanAndStd, CorrCoefFilter
from msresist.FileExporter import create_download_link
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including pandas/scikit-learn deprecation and convergence warnings.
# Consider narrowing it to the specific categories that are known to be noise.
warnings.simplefilter("ignore")



In [2]:
# pd.set_option('display.max_colwidth', 1000)
# pd.set_option('display.max_rows', 1000)
# pd.set_option('display.max_columns', 1000)

In [3]:
# Load the pre-processed phosphorylation table used for the quick look below.
# All five preprocessing options are switched on here, Vfilter included.
ABC = preprocessing(
    AXLwt=True,
    motifs=True,
    Vfilter=True,
    FCfilter=True,
    log2T=True,
)

In [4]:
# Preview the pre-processed table (long frames are rendered truncated).
ABC

Unnamed: 0,Protein,Sequence,UniprotAcc,Abbv,Position,BioReps,r2_Std,PC9,Erl,R428,Erl/R428,Erl+HGF,Erl+FGF,Erl+IGF,KO Erl,KO R428,KO Erl/R428
0,"1-phosphatidylinositol 4,5-bisphosphate phosph...",RNPGFyVEANP,P19174,PLCG1,Y783-p,1,,0.891610,-0.132290,0.508638,-0.431225,-0.008149,0.338497,-0.481326,-0.713501,0.547278,-0.519532
1,"10 kDa heat shock protein, mitochondrial",PEYGGtKVVLD,P61604,HSPE1,T79-p,1,,-0.358784,0.566052,-0.052941,-0.706288,-0.172295,0.334636,0.246512,0.255195,0.644213,-0.756300
3,14-3-3 protein epsilon,GDYHRyLAEFA,P62258,YWHAE,Y131-p,1,,0.413272,0.620284,0.352508,-0.291815,0.122734,0.216532,0.167650,-0.586405,-0.395571,-0.619189
4,14-3-3 protein zeta/delta,LLSVAyKNVVG,P63104,YWHAZ,Y48-p,1,,2.028584,1.620221,1.301309,-0.568873,-0.712916,-0.731959,-0.791261,-0.762381,-0.574292,-0.808433
5,182 kDa tankyrase-1-binding protein,GPPARsPsQDF,Q9C0C2,TNKS1BP1,S1552-p,1,,-0.789124,-0.270084,-0.676451,-0.143864,0.154265,-0.435783,-0.021593,0.246977,0.358189,1.577468
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
681,Zinc finger protein 185,GALADyEGKDV,O15231,ZNF185,Y408-p,1,,1.316789,0.388504,0.263724,-1.455803,0.597152,0.022433,0.432893,-0.449084,-0.012254,-1.104353
682,Zinc finger protein 185,LFVKEyVNASE,O15231,ZNF185,Y581-p,2,0.96,2.694753,0.852521,0.959014,-1.434426,0.582779,-0.266371,-0.040972,-1.194264,-0.760311,-1.392723
683,Zinc finger protein 185,PVSARySNVSS,O15231,ZNF185,Y597-p,1,,0.577963,0.026361,0.046690,0.203022,0.399508,0.017095,-0.176009,-0.181494,-0.215097,-0.698038
684,cAMP-dependent protein kinase type II-alpha re...,FNRRVsVCAET,P13861,PRKAR2A,S99-p,1,,0.770174,0.664380,0.215155,-0.371144,0.243069,-0.974646,-0.135812,-0.420009,-0.759263,0.768097


In [5]:
# ABCg = ABC.groupby(["Protein", "Sequence", "UniprotAcc"])

In [6]:
# full = ABCg.get_group(("Alpha-enolase", "ASTGIyEALEL", "P06733", ""))
# display(full)
# assert len(set(full["UniprotAcc"]))==1 and len(set(full))

In [7]:
# avg = full.iloc[3:13].mean()
# avg

In [8]:
# Table dimensions as (rows, columns).
ABC.shape

(369, 17)

In [9]:
# Show the peptides whose protein name contains "Epidermal growth factor".
# regex=False: the query is a plain substring, not a regular expression.
# na=False: a missing protein name counts as "no match"; without it a NaN
# would end up in the mask and boolean indexing would raise.
ABC[ABC["Protein"].str.contains("Epidermal growth factor", regex=False, na=False)]

Unnamed: 0,Protein,Sequence,UniprotAcc,Abbv,Position,BioReps,r2_Std,PC9,Erl,R428,Erl/R428,Erl+HGF,Erl+FGF,Erl+IGF,KO Erl,KO R428,KO Erl/R428
237,Epidermal growth factor receptor,QDPHStAVGNP,P00533,EGFR,T1131-p,1,,2.66787,-0.401854,2.682426,-1.642911,-0.489795,-1.480688,-0.886165,-1.403297,1.969663,-1.015251
238,Epidermal growth factor receptor,RDPQRyLVIQG,P00533,EGFR,Y978-p,1,,1.332499,0.011125,1.260392,-0.950429,-0.2544,-0.517259,-0.248404,-0.455423,0.932603,-1.110704
239,Epidermal growth factor receptor kinase substr...,HPADGyAFSSN,Q12929,EPS8,Y491-p,1,,0.945233,0.259246,-0.07293,-0.759404,-0.169712,0.237436,0.097118,-0.537039,-0.25169,0.251743
240,Epidermal growth factor receptor kinase substr...,RADPPyTHTIQ,Q12929,EPS8,Y602-p,2,0.97,1.347429,0.203644,-0.041649,-1.209415,0.134457,-0.140056,-0.274211,0.153164,0.283055,-0.456416
241,Epidermal growth factor receptor kinase substr...,SSVSEyHPADG,Q12929,EPS8,Y485-p,1,,0.925432,0.810116,0.350853,-0.619328,-0.449975,-0.205151,-0.538812,-0.037616,-0.03525,-0.200269


In [10]:
# Deliberate stop: raising SystemExit here ends a top-to-bottom run, so none of
# the cells below execute under "Restart & Run All".
# NOTE(review): remove this cell (or guard it behind a flag) once the
# exploratory inspection above is no longer needed.
raise SystemExit

SystemExit: 

### Importing and Pre-processing Data

### Phosphorylation measurements:

In [None]:
# Rebuild the phosphorylation table for the modelling steps, this time with
# Vfilter=False, pass it through preprocess_seqs with "Y", and sort the rows
# by protein name.
# NOTE: this rebinds ABC, replacing the Vfilter=True table loaded at the top
# of the notebook.
ABC = preprocessing(
    AXLwt=True,
    motifs=True,
    Vfilter=False,
    FCfilter=True,
    log2T=True,
)
ABC = preprocess_seqs(ABC, "Y").sort_values(by="Protein")

# The first 7 columns are kept as per-peptide annotations, everything after
# them as measurements.
# NOTE(review): the split index is hard-coded -- confirm it still matches the
# column layout produced with Vfilter=False.
header = ABC.columns
info = ABC.iloc[:, :7]
treatments = header[7:]

# Transposed: each row of `data` is one measurement column of ABC.
data = ABC.iloc[:, 7:].T

In [None]:
# Spot-check the rows whose protein name contains "Epidermal" in the table
# currently bound to ABC.
# regex=False: the query is a plain substring, not a regular expression.
# na=False: a missing protein name counts as "no match"; without it a NaN
# would end up in the mask and boolean indexing would raise.
ABC[ABC["Protein"].str.contains("Epidermal", regex=False, na=False)]

In [None]:
# Deliberate stop: raising SystemExit here ends a top-to-bottom run, so the
# phenotype loading, clustering and PLSR cells below are never reached in a
# linear execution.
# NOTE(review): remove this cell (or guard it behind a flag) so the figure can
# be regenerated with "Restart & Run All".
raise SystemExit

### Phenotypes

#### Cell Viability:

In [None]:
# Two raw cell-viability files; only the leading rows and the first 11
# columns of each are kept.
Y_cv1 = pd.read_csv('./msresist/data/Phenotypic_data/CV_raw3.csv').iloc[:30, :11]
Y_cv2 = pd.read_csv('./msresist/data/Phenotypic_data/CV_raw4.csv').iloc[:29, :11]

# Column 0 is skipped; every other column is divided by its own first-row
# value, in both frames (Y_cv1 first, then Y_cv2, for each column index).
for ii in range(1, Y_cv2.columns.size):
    for replicate in (Y_cv1, Y_cv2):
        replicate.iloc[:, ii] /= replicate.iloc[0, ii]

# Stack both frames, merge them with MergeDfbyMean keyed on "Elapsed", and
# restore the original column order.
Y_cv = MergeDfbyMean(pd.concat([Y_cv1, Y_cv2]), Y_cv1.columns, "Elapsed")
Y_cv = Y_cv.reset_index()[Y_cv1.columns]

# Keep the first row where Elapsed == 72, without its leading column.
# NOTE(review): presumably "Elapsed" is that leading column and is measured in
# hours -- confirm against the raw files.
Y_cv = Y_cv[Y_cv["Elapsed"] == 72].iloc[0, 1:]

# Chained MS mixed clustering + PLSR analysis

In [None]:
# Clustering settings; the pipeline cell further down reuses these names.
ncl = 2
GMMweight = 0
distance_method = "Binomial"

# Fit the clustering model on the measurements (with Y_cv passed as the
# target) and transform the same data to obtain `centers`.
MSC = MassSpecClustering(
    info,
    ncl,
    GMMweight=GMMweight,
    distance_method=distance_method,
).fit(data, Y_cv)
centers = MSC.transform(data)

# Single-panel figure handed to the R2Y/Q2Y plotting helper.
fig, ax = plt.subplots(figsize=(7, 6))
plotR2YQ2Y(ax, ncl, centers, Y_cv)

In [None]:
# fig, ax = plt.subplots(1, 1, figsize=(8,6))
# plotMixedClusteringPLSR_GridSearch(ax, data, info, Y_cv)

In [None]:
# Number of PLSR components.
ncomp = 2

# Two-step pipeline: the clustering step ("mixedCl") feeds the PLS regression
# step ("plsr"). The step names are looked up again by later cells.
mixedCl_plsr = Pipeline(
    steps=[
        (
            'mixedCl',
            MassSpecClustering(info, ncl, GMMweight=GMMweight, distance_method=distance_method),
        ),
        ('plsr', PLSRegression(n_components=ncomp)),
    ]
)
fit = mixedCl_plsr.fit(data, Y_cv)

In [None]:
# Single-panel figure passed, together with the fitted pipeline, the
# measurements and Y_cv, to the measured-vs-predicted plotting helper.
fig, ax = plt.subplots(figsize=(6, 5))
plotMeasuredVsPredicted(ax, mixedCl_plsr, data, Y_cv)

In [None]:
# Two side-by-side panels for the scores/loadings plotting helper.
# NOTE(review): `centers` comes from the separately fitted MSC object, not
# from the clustering step inside `fit` -- confirm the two agree.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))
plotScoresLoadings(ax, fit, centers, Y_cv, ncl, treatments)

In [None]:
# Ask the pipeline's clustering step for the cluster members, passing the
# transposed table, then preview the first ten rows.
clustermembers = mixedCl_plsr.named_steps["mixedCl"].clustermembers(ABC.T)
clustermembers.iloc[:10]

In [None]:
# Build a download link for the cluster-membership table.
# NOTE(review): "DataClustering" is presumably the exported file's name or
# title -- confirm in msresist.FileExporter.
create_download_link(clustermembers, "DataClustering")

In [None]:
# Single-panel figure passed, together with the table, the fitted pipeline and
# the cluster count, to the cluster-averages plotting helper.
fig, ax = plt.subplots(figsize=(8, 5))
plotclusteraverages(ax, ABC, mixedCl_plsr, ncl)