In [1]:
import numpy as np
import pandas as pd
import math
from sklearn.cross_decomposition import PLSRegression
from sklearn.mixture import GaussianMixture
from sklearn.pipeline import Pipeline
from msresist.parameter_tuning import MSclusPLSR_tuning
from msresist.plsr import Q2Y_across_components, R2Y_across_components
from msresist.figures.figure2 import plotR2YQ2Y, plotMixedClusteringPLSR_GridSearch, plotMeasuredVsPredicted, plotScoresLoadings, plotclusteraverages
from msresist.clustering import MassSpecClustering
from msresist.sequence_analysis import preprocess_seqs, FormatName, pYmotifs
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.cm as cm
from msresist.pre_processing import preprocessing, MergeDfbyMean, LinearFoldChange, FoldChangeToControl, MapOverlappingPeptides, BuildMatrix, TripsMeanAndStd, CorrCoefFilter
from msresist.FileExporter import create_download_link
import warnings
warnings.simplefilter("ignore")

In [2]:
# Raise pandas' display limits so wide tables and long cell contents are
# rendered without truncation in the notebook output.
display_limits = {
    "display.max_colwidth": 1000,
    "display.max_rows": 1000000,
    "display.max_columns": 1000,
}
for option_name, option_value in display_limits.items():
    pd.set_option(option_name, option_value)

## Re-implementation with AXL mutants

### Phosphorylation measurements:

#### + Erlotinib + AXL-activating Antibody (AF154)

In [None]:
# Labels applied positionally to the measurement columns that follow the first
# seven columns of the preprocessed table (the order is taken from the original
# rename -- TODO confirm it matches the raw data's column order).
# The trailing space in "Y750F " is kept because the label is also a column key.
cell_line_labels = ["PC9", "Knock-Out", "Kin. Dead", "Knock-In", "Y634F", "Y643F", "Y698F", "Y726F", "Y750F ", "Y821F"]

# Subset (and order) of cell lines carried into the model; swap in the
# commented line to leave out the parental PC9 column.
treatments = ["PC9", "Knock-Out", "Kin. Dead", "Knock-In", "Y634F", "Y643F", "Y698F", "Y726F", "Y750F ", "Y821F"]
# treatments = ["Knock-Out", "Kin. Dead", "Knock-In", "Y634F", "Y643F", "Y698F", "Y726F", "Y750F ", "Y821F"]

X = preprocessing(Axlmuts_ErlF154=True, motifs=True, Vfilter=True, FCfilter=True, log2T=True, mc_row=True)
X = preprocess_seqs(X, "Y").sort_values(by="Protein")

# The first seven columns keep their names; everything after them is relabelled.
annotation_cols = list(X.columns[:7])
X.columns = annotation_cols + cell_line_labels
X = X[annotation_cols + treatments]

# d: the float64 columns, transposed so each row of d is one column of X.
d = X.select_dtypes(include=["float64"]).T
# info: the object-dtype columns of X.
info = X.select_dtypes(include=["object"])

In [None]:
# Display the row labels of `d`. Because `d` is the transpose of X's float64
# columns, these are the names of those columns.
d.index

### Phenotypes

#### Cell Viability of AXL mutant Cell Lines

In [None]:
# Cell-viability time courses, one CSV per biological replicate (BR1-BR3).
# The first CSV column is dropped on load; the first remaining column of BR1 is
# used below as the "Elapsed" time axis.
cv1 = pd.read_csv("msresist/data/Phenotypic_data/AXLmutants/20200130-AXLmutantsPhase_MeanTRs_BR1.csv").iloc[:, 1:]
cv2 = pd.read_csv('msresist/data/Phenotypic_data/AXLmutants/20200130-AXLmutantsPhase_MeanTRs_BR2.csv').iloc[:, 1:]
cv3 = pd.read_csv('msresist/data/Phenotypic_data/AXLmutants/20200130-AXLmutantsPhase_MeanTRs_BR3.csv').iloc[:, 1:]

# Keep only the "-A/E" columns. Each replicate is filtered with a mask built
# from its OWN header (BR3 used to be filtered with BR2's mask, which is only
# correct when both files have identical column layouts).
cv1_ab = cv1.loc[:, cv1.columns.str.contains('-A/E')]
cv2_ab = cv2.loc[:, cv2.columns.str.contains('-A/E')]
cv3_ab = cv3.loc[:, cv3.columns.str.contains('-A/E')]

# Normalize every column to its first time point. The division builds new
# frames rather than writing into slices of the raw data, and covers all
# columns of each replicate regardless of how many BR2 has.
cv1_ab = cv1_ab.div(cv1_ab.iloc[0], axis="columns")
cv2_ab = cv2_ab.div(cv2_ab.iloc[0], axis="columns")
cv3_ab = cv3_ab.div(cv3_ab.iloc[0], axis="columns")

# Stack all three replicates. BR3 was normalized above but previously left out
# of this concat, so it never contributed to the replicate mean.
cv = pd.concat([cv1_ab, cv2_ab, cv3_ab], axis=0)
# BR1's time column is aligned by row label onto the stacked rows, so every
# replicate is assumed to share BR1's time points row-for-row -- TODO confirm.
cv.insert(0, "Elapsed",  cv1.iloc[:, 0])
cv =  MergeDfbyMean(cv, cv1_ab.columns, "Elapsed").reset_index()

# Take the row whose elapsed time equals 96 and order the cell lines to match
# the order of `treatments` used for the phosphorylation data.
v = cv[cv["Elapsed"] == 96].iloc[0, 1:]
v = v[["PC9-A/E", "AXL KO-A/E", "Kdead-A/E", "Kin-A/E", "M4-A/E", "M5-A/E", "M7-A/E", "M10-A/E", "M11-A/E", "M15-A/E"]]
# v = v[["AXL KO-A/E", "Kdead-A/E", "Kin-A/E", "M4-A/E", "M5-A/E", "M7-A/E", "M10-A/E", "M11-A/E", "M15-A/E"]]

v

# Co-clustering and PLSR model

## Cross-validation Strategy 1: By clusters

In [None]:
# Clustering settings; later cells reuse them when building the
# clustering + PLSR pipeline and running the grid search.
ncl = 5
GMMweight = 0.75
distance_method = "Binomial"

# Fit the clustering on the data (with v passed as the target), then
# transform d into the `centers` matrix used as PLSR input below.
MSC = MassSpecClustering(info, ncl, GMMweight=GMMweight, distance_method=distance_method)
MSC = MSC.fit(d, v)
centers = MSC.transform(d)

plsr = PLSRegression(n_components=2)
fig, ax = plt.subplots(figsize=(7, 6))
plotR2YQ2Y(ax, plsr, centers, v, 1, ncl + 1)

In [None]:
# Two-component PLSR on the cluster centers; `pls2` is reused by the
# scores/loadings cell that follows.
pls2 = PLSRegression(n_components=2)
fig, ax = plt.subplots(figsize=(6, 5))
plotMeasuredVsPredicted(ax, pls2, centers, v)

In [None]:
# Two side-by-side axes are handed to the scores/loadings plot,
# which is given `pls2` fitted on the cluster centers.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))
fitted_pls2 = pls2.fit(centers, v)
plotScoresLoadings(ax, fitted_pls2, centers, v, ncl, treatments, CV=1)

In [None]:
# Attach each row's cluster assignment to X, shifted to 1-based numbering.
# DataFrame.insert raises ValueError when the column already exists, so on a
# re-run of this cell the existing "Cluster" column is overwritten in place
# instead of being inserted a second time.
cluster_numbers = [label + 1 for label in MSC.labels_]
if "Cluster" in X.columns:
    X["Cluster"] = cluster_numbers
else:
    X.insert(7, "Cluster", cluster_numbers)

In [None]:
# Show the rows whose protein name contains "ypoxia"; the leading letter is
# left out of the pattern, so the match does not depend on its case.
hypoxia_rows = X["Protein"].str.contains("ypoxia")
X[hypoxia_rows]

In [None]:
# X[X["Abbv"].str.contains("EP")]

In [None]:
# MSC.clustermembers(X.T)

In [None]:
# Plot the transposed output of the fitted clustering's transform,
# labelled with the treatment names.
fig, ax = plt.subplots(figsize=(8, 6))
cluster_centers = MSC.transform(d)
plotclusteraverages(ax, cluster_centers.T, treatments)

## Cross-validation Strategy 2: Across entire pipeline

In [None]:
# Hyper-parameter search over the whole clustering + PLSR pipeline.
scores = MSclusPLSR_tuning(d, info, v, distance_method)

# Keep only the two-component rows, best mean test score first, and number
# them from 1 in a leading "Ranking" column.
is_two_component = scores["#Components"] == 2
hp = scores[is_two_component].sort_values(by="mean_test_scores", ascending=False)
ranking = list(np.arange(1, hp.shape[0] + 1))
hp.insert(0, "Ranking", ranking)

In [None]:
# Offer `hp` (at this point the ranked two-component grid-search rows) as a CSV download.
# NOTE(review): the file name says "NoPC9", but the active `treatments` list
# still includes "PC9" -- confirm the name matches the data being exported.
create_download_link(hp, "20200227-GridSearch_AxlM_CellViab96h_NoPC9_Binomial_GMMiterLow.csv")

In [None]:
# Rebind `hp` to the complete grid-search table (no filter on "#Components"),
# sorted by mean test score in descending order. This replaces the ranked
# two-component table previously held in `hp`.
hp = scores.sort_values(by="mean_test_scores", ascending=False)

In [None]:
# Offer `hp` (at this point the full, sorted grid-search table) as a CSV download.
# NOTE(review): this reuses the exact file name of the earlier two-component
# export, so the two downloads cannot be told apart by name -- consider
# distinct names.
create_download_link(hp, "20200227-GridSearch_AxlM_CellViab96h_NoPC9_Binomial_GMMiterLow.csv")

In [None]:
ncomp = 2

# Chain clustering and PLSR into a single estimator, so that cross-validation
# applied to the pipeline re-fits both steps together.
clustering_step = MassSpecClustering(info, ncl, GMMweight=GMMweight, distance_method=distance_method)
regression_step = PLSRegression(ncomp)
mixedCl_plsr = Pipeline([("mixedCl", clustering_step), ("plsr", regression_step)])

fit = mixedCl_plsr.fit(d, v)
# Transform d with the pipeline's fitted clustering step; this rebinds
# `centers`, replacing the value from the strategy-1 cells.
centers = mixedCl_plsr.named_steps["mixedCl"].transform(d)

In [None]:
# R2Y/Q2Y plot for the full pipeline: the raw matrix `d` is passed in,
# rather than precomputed cluster centers.
fig, ax = plt.subplots(figsize=(7, 6))
plotR2YQ2Y(ax, mixedCl_plsr, d, v, cv=2, b=ncl + 1)

In [None]:
# Measured-vs-predicted plot using the whole clustering + PLSR pipeline on `d`.
fig, ax = plt.subplots(figsize=(6, 5))
plotMeasuredVsPredicted(ax, mixedCl_plsr, d, v)

In [None]:
# Two side-by-side axes are handed to the scores/loadings plot, together with
# the fitted pipeline (`fit`) and the centers from its clustering step.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12, 6))
plotScoresLoadings(ax, fit, centers, v, ncl, treatments, CV=2)

In [None]:
# Ask the pipeline's clustering step for its cluster members, passing the
# transposed annotated table, and offer the result as a CSV download.
clustermembers = mixedCl_plsr.named_steps.mixedCl.clustermembers(X.T)
# NOTE(review): the file name contains "/" ("W1/2"), which is a path separator
# and may not survive as a download file name -- verify how
# create_download_link handles it.
create_download_link(clustermembers, "20200115-AXLaf154_BMP_W1/2.csv")

In [None]:
# Plot the transposed `centers` from the pipeline's clustering step,
# labelled with the treatment names.
fig, ax = plt.subplots(figsize=(8, 6))
plotclusteraverages(ax, centers.T, treatments)