In [1]:
import numpy as np
import pandas as pd
from sklearn.cross_decomposition import PLSRegression
from sklearn.mixture import GaussianMixture
from sklearn.pipeline import Pipeline
from msresist.parameter_tuning import MSclusPLSR_tuning
from msresist.plsr import Q2Y_across_components, R2Y_across_components
from msresist.figures.figure3 import cv_pre, cm_pre, plotR2YQ2Y, plotMeasuredVsPredicted, plotScoresLoadings, plotclusteraverages, plotGridSearch
from msresist.clustering import MassSpecClustering
import matplotlib.pyplot as plt
from msresist.pre_processing import preprocessing
from msresist.FileExporter import create_download_link
import warnings
warnings.simplefilter("ignore")

In [2]:
# pd.set_option('display.max_colwidth', 1000)
# pd.set_option('display.max_rows', 1000000)
# pd.set_option('display.max_columns', 1000)

## Re-implementation with AXL mutants

### Phosphorylation measurements:

#### + Erlotinib + AXL-activating Antibody (AF154)

In [3]:
# Load the data set through msresist's preprocessing helper; every option
# below is forwarded unchanged.
X = preprocessing(Axlmuts_ErlAF154=True, Vfilter=True, FCfilter=True, log2T=True, mc_row=True)

# d: the float64 columns of X, transposed so each of those columns becomes a row.
# i: the object-dtype columns of X (presumably the peptide annotations — TODO confirm).
d = X.select_dtypes(include=['float64']).T
i = X.select_dtypes(include=['object'])

# Row labels for d. "Y750F" used to carry a trailing space ("Y750F "), which
# ended up in d.index and would break any exact-match lookup on that label.
all_lines = ["PC9", "KO", "KD", "KI", "Y634F", "Y643F", "Y698F", "Y726F", "Y750F", "Y821F"]
mut_lines = all_lines[1:]  # all labels except the first ("PC9")
g_lines = all_lines[2:]  # all labels except the first two ("PC9", "KO")

# Positional relabel: pandas raises if len(all_lines) differs from the row count of d.
d.index = all_lines

0/771 peptides were not found in the proteome.
771


In [4]:
# Display the table returned by preprocessing(...) for a visual sanity check.
X

Unnamed: 0,Protein,Sequence,UniprotAcc,Position,BioReps,r2_Std,Gene,PC9 A,KO A,Kd A,KI A,M4 A,M5 A,M7 A,M10 A,M11 A,M15 A
0,26S proteasome regulatory subunit 4,DKKKKyEPPVP,P62191,Y25-p,1,,PSMC1,-0.108045,-0.476332,-0.321637,-0.239781,0.126419,0.274226,0.243800,0.210842,0.326859,-0.036352
1,40S ribosomal protein S10,NRIAIyELLFK,P46783,Y12-p,1,,RPS10,-0.169514,0.097204,-1.712342,0.173058,0.152218,-0.295116,0.846713,1.092283,-0.347285,0.162782
2,40S ribosomal protein SA,LTEASyVNLPT,P08865,Y139-p,1,,RPSA,-0.905683,-0.264555,0.166887,0.990224,-0.256346,0.239761,-0.126352,0.681401,-0.332600,-0.192736
3,ARF GTPase-activating protein GIT1,DDQHDyDSVAS,Q9Y2X7,Y383-p,1,,GIT1,0.670858,0.718143,0.225202,-0.528652,-0.461699,-0.161058,0.103659,-0.139631,-0.028111,-0.398711
4,ATPase WRNIP1,AGEEHyNCISA,Q96S55,Y500-p,1,,WRNIP1,0.242877,0.226631,-0.140038,-0.402742,-0.033013,0.011860,0.085925,0.126432,-0.122947,0.005017
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
289,Vigilin,INRMDyVEINI,Q00341,Y437-p,1,,HDLBP,-0.263325,-0.061066,0.226309,0.144577,-0.166762,-0.106788,0.162886,-0.096486,0.160574,0.000081
290,Vinculin,FLDSGyRILGA,P18206,Y822-p,3,0.25,VCL,-0.058125,-0.354182,-0.021402,-0.067675,0.268052,-0.249421,0.026603,0.151849,0.202528,0.101774
291,Vinculin,GNQAAyEHFET,P18206,Y692-p,3,0.3,VCL,-0.011366,-0.298127,-0.159582,-0.227588,0.064568,-0.240977,0.138258,0.164782,0.333162,0.236871
292,Zinc finger Ran-binding domain-containing prot...,IEREEsDGEYD,O95218,S120-p,1,,ZRANB2,-0.207557,-1.353244,0.642844,1.194481,-0.846227,0.561126,0.186066,0.657013,0.829678,-1.664181


### Phenotypes of AXL mutants

#### Cell Viability

In [5]:
# Condition labels for the cell-viability data, in the order passed to cv_pre.
# NOTE: this rebinds the notebook-wide name `all_lines`.
all_lines = ["PC9-A/E", "AXL KO-A/E", "Kdead-A/E", "Kin-A/E", "M4-A/E", "M5-A/E", "M7-A/E", "M10-A/E", "M11-A/E", "M15-A/E"]
lines = all_lines[1:]  # all labels except the first ("PC9-A/E")
# Slice from all_lines, not from `lines`: the original `lines[2:]` skipped
# three entries and silently dropped "Kdead-A/E", unlike the other label
# cells in this notebook, which take all_lines[2:].
glines = all_lines[2:]

In [6]:
# Arguments handed to cv_pre below (tr is also reused by the migration cell).
# itp / ftp presumably stand for initial / final time point — TODO confirm.
tr = 'A/E'
itp, ftp = 12, 120

# One phase-readout CSV per replicate file (BR1, BR2, BR3).
cv1, cv2, cv3 = (
    pd.read_csv("msresist/data/Phenotypic_data/AXLmutants/CellViability/Phase/BR1_Phase.csv"),
    pd.read_csv("msresist/data/Phenotypic_data/AXLmutants/CellViability/Phase/BR2_Phase.csv"),
    pd.read_csv("msresist/data/Phenotypic_data/AXLmutants/CellViability/Phase/BR3_Phase.csv"),
)

# v: the viability values produced by cv_pre for the labels in all_lines.
v = cv_pre(cv1, cv2, cv3, tr, itp, ftp, all_lines)

In [7]:
# Display the viability values returned by cv_pre.
v

lines
PC9-A/E       7.246282
AXL KO-A/E    2.375132
Kdead-A/E     3.653204
Kin-A/E       5.444662
M4-A/E        4.629951
M5-A/E        4.688294
M7-A/E        6.010366
M10-A/E       5.538995
M11-A/E       5.962925
M15-A/E       4.055604
Name: viability, dtype: float64

#### Cell Death

#### Cell Migration

In [8]:
# Condition labels as spelled in the migration data; rebinds the notebook-wide
# names all_lines / lines / glines.
all_lines = [
    "PC9 A/E",
    "KO A/E",
    "KD A/E",
    "KIN A/E",
    "M4 A/E",
    "M5 A/E",
    "M7 A/E",
    "M10 A/E",
    "M11 A/E",
    "M15 A/E",
]
lines = all_lines[1:]  # drop the first label
glines = lines[1:]  # drop the first two labels of all_lines

In [9]:
# Rebinds ftp (previously set in the viability cell); tr is reused from there.
ftp = 24

# Three collagen EMT read-outs from the same experiment (RWD, GreenRWD, WC).
# Only rwd is consumed in this cell.
rwd, rwdg, wc = (
    pd.read_csv("msresist/data/Phenotypic_data/AXLmutants/EMT/03062020-AXLmuts_EMT_RWD_Collagen_BR1.csv"),
    pd.read_csv("msresist/data/Phenotypic_data/AXLmutants/EMT/03062020-AXLmuts_EMT_GreenRWD_Collagen_BR1.csv"),
    pd.read_csv("msresist/data/Phenotypic_data/AXLmutants/EMT/03062020-AXLmuts_EMT_WC_Collagen_BR1.csv"),
)

m = cm_pre(rwd, tr, ftp, all_lines)
# Positional relabel so m carries the same index labels as v; this relies on
# both results being in the same condition order.
m.index = v.index

In [10]:
# Display the migration values returned by cm_pre, relabelled with v's index.
m

lines
PC9-A/E       94.584855
AXL KO-A/E    54.870760
Kdead-A/E     66.549360
Kin-A/E       96.111215
M4-A/E        91.605670
M5-A/E        85.836235
M7-A/E        87.085310
M10-A/E       82.729170
M11-A/E       79.769890
M15-A/E       66.557255
Name: 12, dtype: float64

#### Build Y Matrix

In [11]:
# Place the two phenotype Series side by side; `keys` supplies the column
# labels directly instead of renaming the columns afterwards.
y = pd.concat([v, m], axis=1, keys=["Viability", "Migration"])

y

Unnamed: 0_level_0,Viability,Migration
lines,Unnamed: 1_level_1,Unnamed: 2_level_1
PC9-A/E,7.246282,94.584855
AXL KO-A/E,2.375132,54.87076
Kdead-A/E,3.653204,66.54936
Kin-A/E,5.444662,96.111215
M4-A/E,4.629951,91.60567
M5-A/E,4.688294,85.836235
M7-A/E,6.010366,87.08531
M10-A/E,5.538995,82.72917
M11-A/E,5.962925,79.76989
M15-A/E,4.055604,66.557255


# Co-clustering and PLSR model

## Cross-validation Strategy 1: Leaving one condition out across fixed clusters

### Fitting PLSR each time

In [12]:
# Import the package itself rather than `*`: a wildcard import here could
# silently shadow names brought in by the import cell, and no cell visible in
# this notebook refers to a pomegranate name directly.
# NOTE(review): kept in case the import itself is needed by the clustering
# step — confirm, then move it to the import cell or drop it.
import pomegranate  # noqa: F401

In [13]:
# Settings shared by this fit and by the pipeline built in Strategy 2.
ncl = 10
GMMweight = 0.5
distance_method = "Binomial"

# NOTE(review): the output saved with this cell ends in
# "AssertionError: gmm_score is either NaN or -Inf", so the cells that depend
# on MSC / centers were not executed in the saved run.
MSC = MassSpecClustering(
    i,
    ncl,
    GMMweight=GMMweight,
    distance_method=distance_method,
).fit(d, y)
centers = MSC.transform(d)

run:  0
iter:  0
-3.3934673850613772
iter:  1
-4.444276657918609
iter:  2
-5.174447201107639
iter:  3
-5.683661709915539
iter:  4
-5.924700817283782
iter:  5
-6.026487606575207
iter:  6
-6.084827333959319
iter:  7
-6.1351292940421
iter:  8
-6.156033344065953
iter:  9
-6.168211298909666
iter:  10
-6.186041101360208
iter:  11
-6.197492496511984
iter:  12
run:  1
iter:  0
-3.030604313668632
iter:  1
-4.541272547530061
iter:  2
-5.049905527257581
iter:  3
-5.5550706123126705
iter:  4
-5.722809956542589
iter:  5
-5.7451386019860315
iter:  6
-5.791544425286014
iter:  7
-5.812716186344314
iter:  8
-5.831140920518232
iter:  9
-5.839076860557637
iter:  10
run:  2
iter:  0
-3.3436897264691225
iter:  1
-4.797900112258135
iter:  2
-5.4244416043001005
iter:  3
-5.8480357559605345
iter:  4
-6.04692631025874
iter:  5
-6.129957613276732
iter:  6
-6.179791815910414
iter:  7
-6.2367475508341155
iter:  8
-6.253307079606141
iter:  9
run:  3
iter:  0
-3.1369751960479637
iter:  1
-4.804598424049658
iter:  2

AssertionError: gmm_score is either NaN or -Inf, motif = DKKKKYEPPVP

In [None]:
# Display the result of MSC.transform(d).
centers

#### R2Y/Q2Y

In [None]:
# Two-component PLSR; the same `plsr` object is reused by the next plotting cells.
plsr = PLSRegression(n_components=2)

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(7, 6))
# The last two positional arguments are 1 and ncl + 1 (presumably `cv` and `b`
# of plotR2YQ2Y — TODO confirm against its signature).
plotR2YQ2Y(ax, plsr, centers, y, 1, ncl + 1)

#### Measured vs Predicted

In [None]:
# One row with two panels, handed to the helper as `ax`.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
plotMeasuredVsPredicted(ax, plsr, centers, y)

#### Scores & Loadings

In [None]:
# One row with two panels; plsr is (re)fitted on the cluster centers before
# being handed to the plotting helper.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))
plotScoresLoadings(
    ax,
    plsr.fit(centers, y),
    centers,
    y,
    ncl,
    all_lines,
    1,
)

In [None]:
# MSC.clustermembers(X.T).head()

#### Cluster Averages

In [None]:
# Single panel; the transform of d is recomputed here and transposed before plotting.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 6))

plotclusteraverages(
    ax,
    MSC.transform(d).T,
    all_lines,
)

## Cross-validation Strategy 2: Across entire pipeline

### Fitting entire model pipeline each time

In [None]:
ncomp = 2

# Two-step pipeline: co-clustering followed by a PLSR with ncomp components.
# It reuses ncl, GMMweight and distance_method from the Strategy 1 cell.
CoCl_plsr = Pipeline(
    steps=[
        ('CoCl', MassSpecClustering(i, ncl, GMMweight=GMMweight, distance_method=distance_method)),
        ('plsr', PLSRegression(ncomp)),
    ]
)
fit = CoCl_plsr.fit(d, y)
# Rebinds `centers` to the transform produced by the pipeline's own clustering step.
centers = CoCl_plsr.named_steps["CoCl"].transform(d)

#### R2Y/Q2Y

In [None]:
# fig, ax = plt.subplots(1, 1, figsize=(7,6))
# plotR2YQ2Y(ax, CoCl_plsr, d, y, cv=2, b=ncl+1)

#### GridSearch

Run the grid search (the cell below is currently commented out):

In [None]:
# scores = MSclusPLSR_tuning(d, i, y, "Binomial")

# scores = pd.DataFrame(scores)
# hp = scores.sort_values(by="mean_test_scores", ascending=False)
# hp.insert(0, "Ranking", list(np.arange(1, hp.shape[0]+1)))
# hp2 = scores[scores["#Components"]==2].sort_values(by="mean_test_scores", ascending=False)
# hp2.insert(0, "Ranking", list(np.arange(1, hp2.shape[0]+1)))

Import grid-search results from a saved CSV:

In [None]:
# Load grid-search results from a CSV stored in the repository.
# NOTE(review): the file name mentions "pam250" while this notebook sets
# distance_method = "Binomial" — confirm this is the intended results file.
gs = pd.read_csv("msresist/data/Model/20200320-GridSearch_pam250_CVWC_wPC9.csv")

In [None]:
# First ten grid-search rows whose "#Components" value equals 2.
gs.loc[gs["#Components"] == 2].head(10)

In [None]:
# Single panel for the grid-search results loaded into gs.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(7, 6))
plotGridSearch(ax, gs)

#### Measured vs Predicted

In [None]:
# Here the whole pipeline and the raw matrix d are passed to the helper,
# not a bare PLSR and precomputed centers.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 5))
plotMeasuredVsPredicted(ax, CoCl_plsr, d, y)

#### Scores & Loadings

In [None]:
# `fit` is the return value of CoCl_plsr.fit(d, y); `centers` comes from the
# pipeline's clustering step.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))

plotScoresLoadings(
    ax,
    fit,
    centers,
    y,
    ncl,
    all_lines,
    2,
)

In [None]:
# Cluster membership table from the pipeline's clustering step, offered as a download.
# NOTE(review): the file name contains "/" ("W1/2"), which is not a valid
# character in a file name on common platforms — confirm how
# create_download_link handles it.
clustermembers = CoCl_plsr.named_steps["CoCl"].clustermembers(X.T)
create_download_link(
    clustermembers,
    "20200115-AXLaf154_BMP_W1/2.csv",
)

#### Cluster Averages

In [None]:
# Single panel; uses the centers computed by the pipeline, transposed.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(8, 6))

plotclusteraverages(
    ax,
    centers.T,
    all_lines,
)