In [1]:
import numpy as np
import pandas as pd
import math
from sklearn.cross_decomposition import PLSRegression
from sklearn.mixture import GaussianMixture
from sklearn.pipeline import Pipeline
from msresist.parameter_tuning import MSclusPLSR_tuning
from msresist.plsr import Q2Y_across_components, R2Y_across_components
from msresist.figures.figure2 import plotR2YQ2Y, plotMixedClusteringPLSR_GridSearch, plotMeasuredVsPredicted, plotScoresLoadings, plotclusteraverages
from msresist.clustering import MassSpecClustering
from msresist.sequence_analysis import preprocess_seqs, FormatName, pYmotifs
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.cm as cm
from msresist.pre_processing import preprocessing, MergeDfbyMean, LinearFoldChange, FoldChangeToControl, MapOverlappingPeptides, BuildMatrix, TripsMeanAndStd, CorrCoefFilter
from msresist.FileExporter import create_download_link
import warnings
# Install a blanket "ignore" filter: from here on no warning of any category is shown.
# NOTE(review): this also hides deprecation/convergence warnings from the libraries used below.
warnings.simplefilter("ignore")

In [2]:
# Allow up to 1000 characters per cell before pandas truncates the displayed value.
pd.options.display.max_colwidth = 1000
# Optional: uncomment to lift the row / column display limits as well.
# pd.options.display.max_rows = 1000
# pd.options.display.max_columns = 1000

## Re-implementation with AXL mutants

### Phosphorylation measurements:

Pass `Axlmuts_ErlF154=True` (as in the cell below) or the *Axlmutants_Erl* flag to `preprocessing` in order to use these data.

In [3]:
# Load the Axlmuts_ErlF154 data set with motif extraction, both filters and the log2
# transform switched on, run it through preprocess_seqs with "Y", and order rows by protein.
ABC = preprocess_seqs(
    preprocessing(Axlmuts_ErlF154=True, motifs=True, Vfilter=True, FCfilter=True, log2T=True),
    "Y",
).sort_values(by="Protein")

# Split the table: the first 6 columns are kept as `info`, everything after them is the
# measurement matrix (presumably one column per treatment — TODO confirm).
header = ABC.columns
info = ABC.iloc[:, :6]
data = ABC.iloc[:, 6:].T  # transposed: the measurement columns become the rows
treatments = header[6:]

In [4]:
# Display the preprocessed table as the cell output.
# NOTE(review): consider `ABC.head()` / `ABC.shape` to keep the rendered output short.
ABC

(125, 16)

## Principal Component Analysis

In [None]:
from sklearn.decomposition import PCA

In [None]:
# Cumulative variance explained by the first `n_pcs` principal components.
# NOTE(review): this PCA is fitted on `data.T`, while the scores/loadings PCA in the next
# cells is fitted on `data` — confirm that the two orientations are intentional.
n_pcs = 10
explained = PCA(n_components=n_pcs).fit(data.T).explained_variance_ratio_

# Running total of the per-component ratios (still expressed as fractions of 1).
acc_expl = list(np.cumsum(explained))

fig, ax = plt.subplots(1, 1, figsize=(7,6))
# explained_variance_ratio_ is a fraction; scale by 100 so the bars agree with the
# "% Variance Explained" axis label.
ax.bar(range(n_pcs), 100 * np.asarray(acc_expl))
ax.set_ylabel("% Variance Explained")
ax.set_xlabel("Components")
# Label the bars 1..n_pcs instead of the 0-based positions.
ax.set_xticks(range(n_pcs))
ax.set_xticklabels([i + 1 for i in range(n_pcs)]);

In [None]:
# Two-component PCA fitted on `data` (rows are treated as samples, columns as features).
pca_2d = PCA(n_components=2)
fit = pca_2d.fit(data)  # fit() returns the estimator itself, so `fit` is the fitted PCA

In [None]:
# Project `data` onto the two fitted components once and reuse the result
# (previously the same transform was computed separately for each component).
pca_scores = fit.transform(data)
PC1_scores, PC2_scores = pca_scores[:, 0], pca_scores[:, 1]

# Each row of components_ is one principal axis, holding one weight per input column.
PC1_loadings, PC2_loadings = fit.components_[0], fit.components_[1]

In [None]:
def _symmetric_limits(values, pad):
    """Return [-m - pad, m + pad], where m is the largest absolute value in `values`."""
    bound = np.abs(values).max() + pad
    return [-bound, bound]


# Rainbow palette with one entry per score; used below as the colour map of the loadings.
colors_ = cm.rainbow(np.linspace(0, 1, PC1_scores.size))

fig, ax = plt.subplots(1, 2, figsize=(20,10))

# Scores: one point per row of `data`, labelled with the matching ABC column name.
ax[0].scatter(PC1_scores, PC2_scores)
for j, txt in enumerate(list(ABC.columns)[6:]):
    ax[0].annotate(txt, (PC1_scores[j], PC2_scores[j]))
ax[0].set_title('PCA Scores')
ax[0].set_xlabel('Principal Component 1')
ax[0].set_ylabel('Principal Component 2')
ax[0].axhline(y=0, color='0.25', linestyle='--')
ax[0].axvline(x=0, color='0.25', linestyle='--')

# Centre the axes on the origin. The bound is the largest |value| rather than max(value),
# so points on the negative side are never clipped.
spacer = 0.5
ax[0].set_xlim(_symmetric_limits(PC1_scores, spacer))
ax[0].set_ylim(_symmetric_limits(PC2_scores, spacer))

# Loadings: one point per row of ABC, labelled with the second ';'-separated field of the
# first ABC column (assumes every entry contains at least one ';' — TODO confirm).
for i, txt in enumerate(list(ABC.iloc[:, 0])):
    ax[1].annotate(txt.split(";")[1], (PC1_loadings[i], PC2_loadings[i]))
ax[1].scatter(PC1_loadings, PC2_loadings, c=np.arange(PC1_loadings.size), cmap=colors.ListedColormap(colors_))
ax[1].set_title('PCA Loadings')
ax[1].set_xlabel('Principal Component 1')
ax[1].set_ylabel('Principal Component 2')
ax[1].axhline(y=0, color='0.25', linestyle='--')
ax[1].axvline(x=0, color='0.25', linestyle='--')
spacer = 0.04
ax[1].set_xlim(_symmetric_limits(PC1_loadings, spacer))
ax[1].set_ylim(_symmetric_limits(PC2_loadings, spacer));

### Phenotypes

#### Cell Viability (CellTiter-Glo luminescence assay):

In [None]:
# Raw CellTiterGlo read-out exactly as stored in the CSV file.
cv_raw = pd.read_csv('./msresist/data/Phenotypic_data/101819-CellTiterGlo_AxlMutants_F154_BR1_raw.csv')

# Divide the first ten columns row-by-row by column 0 (which therefore becomes 1 wherever
# it is non-zero), then transpose so those ten columns end up as the rows of Y_cv.
cv_reference = cv_raw.iloc[:, 0]
Y_cv = cv_raw.iloc[:, 0:10].div(cv_reference, axis=0).T

# Alternative kept for reference — centre each row on its mean instead of scaling it:
# Y_cv = cv_raw.iloc[:, 0:10].sub(cv_raw.iloc[:, 0:10].mean(axis=1), axis=0).T

# Chained MS mixed clustering + PLSR analysis

In [None]:
# Clustering settings; the pipeline cell further down reads the same two names.
ncl, GMMweight = 3, 0

# Fit the clustering model on its own and keep its transform of `data` as `centers`.
MSC = MassSpecClustering(
    info, ncl, GMMweight=GMMweight, distance_method="Binomial"
).fit(data, Y_cv)
centers = MSC.transform(data)

# Single-panel figure handed to plotR2YQ2Y together with the transformed data and Y_cv.
fig, ax = plt.subplots(figsize=(7,6))
plotR2YQ2Y(ax, ncl, centers, Y_cv)

In [None]:
# fig, ax = plt.subplots(1, 1, figsize=(8,6))
# plotMixedClusteringPLSR_GridSearch(ax, data, info, Y_cv)

In [None]:
ncomp = 2

# Chain a fresh clustering step with a PLS regression so both are fitted in one call.
mixedCl_plsr = Pipeline(
    steps=[
        ('mixedCl', MassSpecClustering(info, ncl, GMMweight=GMMweight, distance_method="Binomial")),
        ('plsr', PLSRegression(n_components=ncomp)),
    ]
)
# Note: this rebinds `fit`, which held the PCA model in the earlier PCA cells.
fit = mixedCl_plsr.fit(data, Y_cv)

In [None]:
# fig, ax = plt.subplots(1,1,figsize=(6,5))
# plotMeasuredVsPredicted(ax, mixedCl_plsr, data, Y_cv)

In [None]:
# One rainbow colour per cluster.
colors_ = cm.rainbow(np.linspace(0, 1, num=ncl))
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(12,6))

# NOTE(review): `fit` is the fitted pipeline, while `centers` comes from the separately
# fitted `MSC` model — confirm the two clusterings are meant to be interchangeable here.
plotScoresLoadings(ax, fit, centers, Y_cv, ncl, colors_, treatments)

In [None]:
# Take the fitted 'mixedCl' step out of the pipeline and call its clustermembers() method
# on the transposed ABC table.
clustering_step = mixedCl_plsr.named_steps["mixedCl"]
clustermembers = clustering_step.clustermembers(ABC.T)

# Preview the first ten rows only.
clustermembers.head(10)

In [None]:
# Build a download link for the cluster-membership table. "DataClustering" is passed as the
# second argument (presumably the file name or title — confirm in msresist.FileExporter).
create_download_link(clustermembers, "DataClustering")

In [None]:
# Single-panel figure; plotclusteraverages receives the full ABC table, the fitted
# pipeline and the colour palette defined in the scores/loadings cell.
fig, ax = plt.subplots(figsize=(8, 5))
plotclusteraverages(ax, ABC, mixedCl_plsr, colors_)