# Sequence Analysis

Cluster global mass spectrometry data by both the phosphorylation levels of the phosphopeptides and their amino acid sequences.

In [1]:
import pandas as pd
import numpy as np
from msresist.pre_processing import preprocessing
from msresist.sequence_analysis import MassSpecClustering, preprocess_seqs
from msresist.FileExporter import create_download_link
from msresist.parameter_tuning import GridSearch_CV
from sklearn.utils.estimator_checks import check_estimator
import warnings
# Install a blanket "ignore" filter so that warnings raised by later cells are
# not shown. NOTE(review): this also hides deprecation warnings.
warnings.simplefilter("ignore")

Import and pre-process data:

In [2]:
# Raise pandas' display limits so that wide cells and long tables are rendered
# without truncation.
pd.set_option("display.max_rows", 300)
pd.set_option("display.max_colwidth", 1000)

# Load the data set with the project's preprocessing routine and pass the
# result through preprocess_seqs.
# NOTE(review): "Y" presumably selects tyrosine-centred sequences -- confirm
# against msresist.sequence_analysis.preprocess_seqs.
ABC = preprocess_seqs(
    preprocessing(motifs=True, Vfilter=True, FCfilter=True, log2T=True),
    "Y",
)

Define parameters/arguments:

In [3]:
# Arguments passed positionally to MassSpecClustering in the next cell.

# Number of clusters; also used to label the columns of the cluster table.
ncl: int = 5

# NOTE(review): presumably the relative weight given to the GMM term versus the
# sequence term -- confirm against MassSpecClustering.
GMMweight: float = 1

# NOTE(review): presumably the covariance structure of the mixture model -- confirm.
covariance_type: str = "tied"

# NOTE(review): presumably the phospho-residue selector ("Y") -- confirm.
pYTS: str = "Y"

# NOTE(review): presumably the cap on fitting iterations -- confirm.
max_n_iter: int = 20

Fit data to model and display results:

In [4]:
# Build the clustering model from the parameters defined above, then fit it to
# the pre-processed data set. The arguments are positional, so their order must
# match MassSpecClustering's signature.
MSC = MassSpecClustering(ncl, GMMweight, pYTS, covariance_type, max_n_iter)
MSC = MSC.fit(ABC)

# Show the score attribute of the object returned by fit as this cell's output.
MSC.score_

clusters= 5 GMMweight= 1
convergence has been reached at iteration 5


-7.953954824996154

In [5]:
# Turn the fitted model's Cl_seqs_ into a table and transpose it so that each
# cluster occupies one column.
clusters = pd.DataFrame(MSC.Cl_seqs_).transpose()

# Label the columns "cluster 1" ... "cluster <ncl>" (1-based).
clusters.columns = ["cluster {}".format(number) for number in range(1, ncl + 1)]

In [6]:
# Render the cluster table (one column per cluster) as this cell's output.
clusters

Unnamed: 0,cluster 1,cluster 2,cluster 3,cluster 4,cluster 5
0,AAAIAYGLDRT,AVRLHYGLPVV,AEAAIYHLQLF,EDLPLYQHQAT,ATKVVYSAPRS
1,AAEPEYPKGIR,DARDLYDAGVK,ASHPNYPYSDE,LDSNLYRIGQS,AVCSTYLQSRY
2,AANPAYGRYNP,IYETDYYRKGG,EKQLLYSENKT,QGWQRYYFEGI,AVGFEYQGKTE
3,AEDAVYELQSK,KKLHEYNTQFQ,HVEAVYIDIAD,RKTVTYEDPQA,AWPSPYKDYEV
4,AEEVEYYYRRA,NAKPRYFYTSA,KRFGPYYTEPV,RQKSTYTSYPK,DKSREYDQLYE
5,AENPEYLGLDV,NGKQIYVGRAQ,KVGINYQPPTV,SELLRYYTSAS,DVAEKYLDIPK
6,AERDLYLENPE,NLDRAYEFAER,KTTILYKLKLG,VQNPVYHNQPL,DVDAAYMNKVE
7,AGEEHYNCISA,QQQMIYDSPPS,KTTLLYKLKGH,ASQKDYSSGFG,ELHRKYGTDLS
8,AGMTGYGMPRQ,QTFTTYSDNQP,PETPGYVGFAN,FRDSTYDLPRS,EVAEKYLDIPK
9,AIDVGYRHIDC,RKEREYVIPKR,PNQGRYYEGYY,GLGVKYYIDPS,GCFDPYSDDPR


## Hyperparameter Search

In [7]:
# cv = int(ABC.shape[0]/2)
# cv = 3
# param_grid = {"ncl": list(range(2,5)),  "GMMweight": list(np.linspace(0,5,6))}

In [8]:
# CVresults_max = GridSearch_CV(MSC, param_grid, cv, ABC)
# CVresults_max
# CVresults_max.nlargest(20, "mean_test_scores")