# LUAD CPTAC analysis

In [1]:
import pandas as pd
import numpy as np
from msresist.pre_processing import preprocessing
from msresist.clustering import MassSpecClustering
import matplotlib.pyplot as plt

In [2]:
# pd.set_option('display.max_colwidth', 1000)
# Raise the pandas display limits for rows and columns.
for display_option, display_limit in (
    ('display.max_rows', 1000000),
    ('display.max_columns', 1000),
):
    pd.set_option(display_option, display_limit)

## 1 - Import and Preprocess Data

In [3]:
# X = preprocessing(CPTAC=True, log2T=True)
# Read the preprocessed CPTAC table from CSV (the preprocessing call above is
# kept commented out).
X = pd.read_csv("msresist/data/MS/CPTAC/CPTAC-preprocessedMotfis.csv")

# d: the float64 columns of X, transposed so each of those columns becomes a row.
d = X.select_dtypes(include="float64").transpose()
# i: the object-dtype columns of X, one row per row of X.
i = X.select_dtypes(include="object")

In [4]:
# Print the (rows, columns) shape, then display the first five rows.
print(X.shape)
X.head(n=5)

(42383, 220)


Unnamed: 0.1,Unnamed: 0,Sequence,Protein,Gene,Position,C3N.01799,C3N.01799.N,C3L.01890,C3L.01890.N,C3N.00572,C3N.00572.N,C3N.02423,C3N.02423.N,C3N.02729,C3N.02729.N,C3L.00263,C3L.00263.N,C3N.01410,C3N.01410.N,C3N.00578,C3N.00578.N,C3N.02587,C3N.02587.N,C3L.00893,C3L.00893.N,C3N.01488,C3N.01488.N,C3N.01413,C3N.01413.N,C3N.01030,C3N.01030.N,C3N.02588,C3N.02588.N,C3N.00552,C3N.00552.N,C3L.01889,C3L.01889.N,C3N.00169,C3N.00169.N,C3L.00422,C3L.00422.N,C3L.00083,C3L.00083.N,C3N.00551,C3N.00551.N,C3N.01842,C3N.02089,C3N.02089.N,C3L.01682,C3L.01682.N,C3N.01016,C3N.01016.N,C3N.00580,C3N.00580.N,X11LU013,C3N.02000,C3N.02000.N,C3N.01489,C3N.01489.N,C3N.00737,C3N.00737.N,C3N.01405,C3N.01405.N,C3N.02587.1,C3N.01416,C3N.01416.N,C3N.02149,C3N.02149.N,C3N.02424,C3N.02424.N,C3L.00279,C3L.00279.N,C3N.02379,C3L.00095,C3L.00095.N,C3N.02572,C3N.02572.N,C3N.00559,C3N.00559.N,C3N.00545,C3N.00545.N,X11LU016,C3L.02345,C3L.02345.N,C3L.00093,C3L.00093.N,C3L.00510,C3L.00510.N,C3L.00913,C3L.00913.N,X11LU022,C3L.00412,C3L.00412.N,C3N.01415,C3N.01415.N,C3N.00549,C3N.00549.N,C3N.02433,C3N.02433.N,C3N.01021,C3N.01021.N,C3N.00203,C3N.00203.N,C3L.01330,C3L.01330.N,C3N.00574,C3N.00574.N,X11LU035,C3N.01071,C3N.01071.N,C3N.00550,C3N.00550.N,C3N.00167,C3N.00167.N,C3L.00001,C3L.00001.N,C3N.01074,C3L.01632,C3L.01632.N,C3N.02155,C3N.02155.N,C3N.00217,C3N.00217.N,C3N.02002,C3N.02002.N,C3N.02145,C3N.02145.N,C3N.02586,C3N.02586.N,C3L.02219,C3L.02219.N,C3N.00180,C3N.00180.N,C3L.01862,C3N.00704,C3N.00704.N,C3N.02158,C3N.02158.N,C3N.02421,C3N.02421.N,C3N.00556,C3N.00556.N,C3N.00294,C3N.01072,C3N.01072.N,C3L.00094,C3L.00094.N,C3L.00144,C3L.00144.N,C3N.00223,C3N.00223.N,C3N.02380,C3L.01924,C3L.01924.N,C3N.00433,C3N.00433.N,C3L.01683,C3L.01683.N,C3N.00293,C3N.00293.N,C3N.02380.N,C3L.00009,C3L.00009.N,C3N.00546,C3N.00546.N,C3N.01414,C3N.01414.N,C3N.02087,C3N.02087.N,C3L.00140,C3L.00140.N,C3N.02529,C3N.02529.N,C3L.00368,C3L.00368.N,C3N.02067,C3N.02067.N,C3N.02422,C3L.00080,C3L.00080.N,C3N.00547,C3N.00547.N,C3N.01023,C
3N.01023.N,C3N.01024,C3N.01024.N,C3L.00973,C3L.00973.N,C3L.02348,C3L.02348.N,C3L.02508,C3L.02508.N,C3N.00560,C3N.00560.N,C3L.00604,C3L.00604.N,C3N.00199,C3N.00199.N,C3N.00579,C3N.00579.N,C3N.02582,C3N.02582.N,C3N.02379.1,C3L.02350,C3L.02350.N,C3N.00738,C3N.00738.N,C3N.00959,C3N.00959.N,C3N.02003,C3N.02003.N,C3N.02379.N,C3N.00175,C3N.00175.N,C3N.01823,C3N.01823.N,C3L.02549,C3L.02549.N,C3L.02365,C3L.02365.N
0,0,AAAAAsGGGVS,NP_002577.2,PBX2,S146-p,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.333072,-1.157027,1.091254,-0.108078,-1.077948,-1.782146,1.253047,-0.931758,,,,,,,,,,,,,,,,,,,,,,,,,,
1,1,AAAAAsQQGSA,NP_001308122.1,TBL1XR1,S119-p,,,,,,,,,,,,,,,,,,,,,,,,,0.040414,-1.014198,0.822105,-0.375466,1.29633,-0.256019,1.091794,-0.452485,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.512168,-1.336851,0.45291,-1.263757,1.508932,-0.70834,-0.02606,-2.216061,0.342936,-0.142478,-0.438031,0.422947,-1.116016,0.58994,-0.756644,-0.000108,-1.592715,0.742549,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.368677,-0.607873,0.7757,-1.187604,0.584401,-0.55509,0.711051,-0.864431,-0.042739,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.687655,0.14593,0.43872,-0.999488,0.311292,-0.755449,0.130682,0.111021,,,,,,,,,,,,,,,,,,-0.459554,-0.776329,0.071547,-0.600298,0.172339,-1.761406,0.421872,-0.372907,-0.979491,-0.083036,-0.361962,1.275506,-0.568015,0.25423,-0.378522,0.987749,-0.189486
2,2,AAAAAsTQAQQ,NP_001035540.1,MINDY2,S575-p,0.012034,-0.630863,-1.159479,-0.740044,0.107228,-0.016971,-0.023379,-0.481045,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,3,AAAAGsASPRS,NP_064520.2,WRNIP1,S151-p,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.855284,-0.359693,-0.471123,-0.621433,0.276805,-1.309244,0.315092,-1.015324,-0.397338,-0.400324,0.700924,-1.157659,-0.237313,2.361092,-0.526513,-0.5873,-0.60944,-0.526462,0.250665,1.10509,-0.835017,-0.019921,0.621453,0.121022,-1.376484,-1.10228,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,4,AAAAGsGPSPP,NP_060895.1,PI4K2A,S44-p,,,,,,,,,0.83783,-0.993838,0.317079,-2.318871,-1.158439,-1.42635,-0.227882,-0.705463,,,,,,,,,,,,,,,,,0.443256,0.08327,-0.257535,-0.60158,-0.241584,-1.26658,0.435344,-0.201686,0.639204,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-0.138453,-0.342698,0.405259,-0.421219,0.270056,-1.342466,0.511572,-0.191681,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-0.148911,-0.384748,0.031572,-0.554645,-0.03364,-1.268642,0.26679,-0.0633,-0.6475,,,,,,,,


## 2 - Run Model

Test the speed of generating the motif PAM250 scores:

In [5]:
# Disabled benchmark: times MotifPam250Scores on the first n upper-cased
# entries of X["Sequence"]. To re-enable, uncomment the lines below and make
# sure `%%time` is the first line of the cell (cell magics must come first).
# %%time
# from msresist.sequence_analysis import MotifPam250Scores
# n = 5000
# seqs = [s.upper() for s in X["Sequence"][:n]]
# print(len(seqs))
# MotifPam250Scores(seqs)

Run model:

In [6]:
# Both options below are disabled. Each one overwrites d and i in place, so
# if enabled, re-run the data-loading cell first to start from the full data.

# Reduce data set?
# Keeps the first `npept` columns of d and the first `npept` rows of i.
# npept = 7000
# d = d.iloc[:, :npept]
# i = i.iloc[:npept, :]

#Drop NaN?
# Keeps the first 8 rows of d, drops every column that still contains a NaN,
# then selects the rows of i at the positions given by the surviving columns.
# d = d.iloc[:8, :].dropna(axis=1)
# i = i.iloc[d.columns, :]

In [7]:
%%time
distance_method = "PAM250"
ncl = 2
SeqWeight = 5

MSC = MassSpecClustering(i, ncl, GMMweight=SeqWeight, distance_method=distance_method, n_runs=1).fit(d, "NA")

start initialization...
gmm initialized
N_ITER:  0
SeqW:  4125 DataW:  17411 BothWin:  20847 MixWins:  0
-144.71976005427467
N_ITER:  1
SeqW:  10377 DataW:  7491 BothWin:  24515 MixWins:  0
-129.12381936293255
N_ITER:  2
SeqW:  14120 DataW:  2072 BothWin:  26191 MixWins:  0
-97.47430403623926
N_ITER:  3
SeqW:  14788 DataW:  565 BothWin:  27030 MixWins:  0
-72.65732328126562
N_ITER:  4
SeqW:  14998 DataW:  187 BothWin:  27198 MixWins:  0
-64.2469568556254
N_ITER:  5
SeqW:  15039 DataW:  110 BothWin:  27234 MixWins:  0
-62.53474752623692
N_ITER:  6
SeqW:  15040 DataW:  97 BothWin:  27246 MixWins:  0
-62.259936933233575
N_ITER:  7
SeqW:  15044 DataW:  95 BothWin:  27244 MixWins:  0
-62.25281015240406
N_ITER:  8
SeqW:  15044 DataW:  94 BothWin:  27245 MixWins:  0
-62.25157424471896
N_ITER:  9
SeqW:  15044 DataW:  94 BothWin:  27245 MixWins:  0
-62.25157424471896
CPU times: user 2h 34min 1s, sys: 17.3 s, total: 2h 34min 18s
Wall time: 2h 37min 46s


In [9]:
# Apply the fitted model's transform to the same matrix it was fit on.
# NOTE(review): presumably yields the cluster centers (per the variable name) -- confirm.
centers = MSC.transform(d)