In [1]:
import numpy as np
import pandas as pd
import random
from Bio import motifs
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
from sklearn.cluster import KMeans
from msresist.pre_processing import preprocessing
from msresist.sequence_analysis import BackgroundSeqs, ForegroundSeqs, position_weight_matrix, counts, BinomialMatrix, ExtractMotif
import warnings
warnings.simplefilter("ignore")

In [2]:
# Let long column values display in full instead of being truncated.
pd.set_option('display.max_colwidth', 1000)

# Load the pre-processed data set, then drop every row whose
# "peptide-phosphosite" entry contains a "-".
ABC = preprocessing(motifs=True, Vfilter=True, FCfilter=True, log2T=True)
ABC = ABC.loc[~ABC["peptide-phosphosite"].str.contains("-")]

# Column labels; positions 2..11 are the ones used as the numeric data below.
# NOTE(review): presumably these ten columns are the treatments — confirm
# against the output of preprocessing().
header = ABC.columns
treatments = header[2:12]

# Transposed view of columns 2..11 (rows become the selected columns).
data = ABC.iloc[:, 2:12].transpose()

# Columns 0 and 1 as plain Python lists.
Allseqs = [entry for entry in ABC.iloc[:, 0]]
protnames = [entry for entry in ABC.iloc[:, 1]]

Amino acid frequency:

In [3]:
# Frequency assigned to each of the 20 standard amino acids (one-letter codes).
# Passed as `pseudocounts` when the count matrices are normalized further down.
AAfreq = dict(
    [
        ("A", 0.074), ("R", 0.042), ("N", 0.044), ("D", 0.059), ("C", 0.033),
        ("Q", 0.058), ("E", 0.037), ("G", 0.074), ("H", 0.029), ("I", 0.038),
        ("L", 0.076), ("K", 0.072), ("M", 0.018), ("F", 0.04), ("P", 0.05),
        ("S", 0.081), ("T", 0.062), ("W", 0.013), ("Y", 0.033), ("V", 0.068),
    ]
)

Define clusters by k-means:

In [4]:
# Cluster the rows of `data.T` with k-means. The seed is fixed so that the
# cluster assignments (and every motif derived from them below) are the same
# on each run; without `random_state` the initialization is random.
N_CLUSTERS = 4
KMEANS_SEED = 0
kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=KMEANS_SEED).fit(data.T)

# Attach each row's cluster label to a copy of the full table.
X = ABC.assign(cluster=kmeans.labels_)

# seqs[k] = list of first-column values of the rows assigned to cluster k.
seqs = [
    list(X.loc[X["cluster"] == label].iloc[:, 0])
    for label in range(kmeans.labels_.max() + 1)
]

Generate Seq Instances:

In [5]:
# One list of Seq objects per cluster: every sequence string is upper-cased
# and wrapped as a Seq with the IUPAC protein alphabet.
instances1 = [
    [Seq(peptide.upper(), IUPAC.protein) for peptide in cluster_peptides]
    for cluster_peptides in seqs
]

In [6]:
# Second, independent construction of the per-cluster Seq lists from `seqs`
# (upper-cased strings wrapped with the IUPAC protein alphabet).
instances2 = []
for cluster_peptides in seqs:
    instances2.append(
        [Seq(peptide.upper(), IUPAC.protein) for peptide in cluster_peptides]
    )

In [15]:
# Sanity check: the two separately built per-cluster Seq lists compare equal.
instances1 == instances2

True

In [8]:
# Deliberate stop: raising SystemExit ends execution of this cell.
# NOTE(review): this looks like a leftover debugging break; presumably it also
# prevents the cells below from running in a top-to-bottom run — confirm and
# remove once the debugging is finished.
raise SystemExit()

SystemExit: 

In [None]:
# Debug check: print the type of the first sequence object found in the
# per-cluster lists. `instances1` is used because no variable called
# `instances` is ever assigned in this notebook (the original raised NameError).
# next() stops at the first element, so no exception is needed to leave the
# nested iteration; nothing is printed when every cluster is empty.
first_seq = next((seq for cluster in instances1 for seq in cluster), None)
if first_seq is not None:
    print(type(first_seq))

In [None]:
# Number of clusters for which Seq lists were built. Uses `instances1`:
# `instances` is never assigned in this notebook and raised NameError.
print(len(instances1))

Create Motif objects for each cluster, build a PSSM for each cluster and print the information content of the motif compared to the background (relative entropy):

In [None]:
# For every cluster: build a motif from its Seq list, normalize the counts
# into a position-weight matrix (AAfreq passed as pseudocounts), convert that
# to a log-odds PSSM and print summary statistics.
# The loop runs over `instances1`; the previously used name `instances` is
# never assigned in this notebook and raised NameError.
# `m` stays bound to the last cluster's motif after the loop; a later cell
# reads `m.consensus`.
for i, cluster_instances in enumerate(instances1):
    m = motifs.create(cluster_instances)
#     m.weblogo("cluster %0.f motif.png" % (i+1))
    pwm = m.counts.normalize(pseudocounts=AAfreq)
    pssm = pwm.log_odds()
    print("cluster %0.f: consensus motif = %s, mean = %0.2f, standard deviation = %0.2f, max = %0.2f, min = %0.2f" % (i+1, m.consensus, pssm.mean(), pssm.std(), pssm.max, pssm.min))

Re-implement using all sequences instead of clusters:

In [None]:
# Same analysis as the per-cluster one, but on every sequence at once.
# Wrap each upper-cased sequence string as a protein-alphabet Seq.
instances_ = [Seq(peptide.upper(), IUPAC.protein) for peptide in Allseqs]

# Motif -> normalized counts (AAfreq as pseudocounts) -> log-odds PSSM.
m_ = motifs.create(instances_)
# m_.weblogo("allseqs_motif.png")
pwm_ = m_.counts.normalize(pseudocounts=AAfreq)
pssm_ = pwm_.log_odds()

# Report the consensus and the PSSM summary statistics.
print(
    "All sequences: consensus motif = %s, mean = %0.2f, standard deviation = %0.2f, max = %0.2f, min = %0.2f"
    % (m_.consensus, pssm_.mean(), pssm_.std(), pssm_.max, pssm_.min)
)

## Re-implementation of Schwartz & Gygi, Nat. Biotech. 2005 and Cheng et al., Bioinfo. 2018

Build Background data set and position-weight matrix:

In [None]:
# Build the background sequence set with BackgroundSeqs, passing "Y".
# NOTE(review): presumably "Y" selects tyrosine-centered sequences — confirm
# in msresist.sequence_analysis.
bg_seqs1 = BackgroundSeqs("Y")

In [None]:
# Position-weight matrix computed from the background sequences; used below as
# the third argument to BinomialMatrix.
bg_pwm1 = position_weight_matrix(bg_seqs1)

Build Phosphorylation (foreground) data set:

In [None]:
# Build the foreground sequence set from all sequences, passing the same "Y"
# argument that was given to BackgroundSeqs.
# NOTE(review): the return type is not visible here; it is later passed to
# len(), counts() and motifs.create().
seqs1 = ForegroundSeqs(Allseqs, "Y")

Calculate Binomial Probability Matrix:

In [None]:
# Count matrix of the foreground sequences.
# NOTE(review): presumably per-residue, per-position occurrence counts —
# verify against counts() in msresist.sequence_analysis.
freq_matrix = counts(seqs1)

In [None]:
# Binomial probability matrix built from the number of foreground sequences,
# their count matrix and the background position-weight matrix.
n_foreground = len(seqs1)
binomp1 = BinomialMatrix(n_foreground, freq_matrix, bg_pwm1)

In [None]:
# Display the binomial probability matrix as the cell output.
binomp1

In [None]:
# Extract a motif from the binomial probability and count matrices using a
# p-value cutoff of 10**(-4) and an occurrence cutoff of 7.
# NOTE(review): how the cutoffs are applied lives in ExtractMotif — confirm
# there.
motif1 = ExtractMotif(
    binomp1,
    freq_matrix,
    pvalCut=10 ** (-4),
    occurCut=7,
)

In [None]:
# Print the extracted motif, then show (as cell output) the consensus of a
# motif built directly from the foreground sequences for comparison.
print(motif1)
fg_motif = motifs.create(seqs1)
fg_motif.consensus

In [None]:
# For each position i and consensus residue aa, take the value in column i of
# the binomp1 row(s) whose "Residue" equals aa; then print the values and
# display their mean as the cell output.
# NOTE(review): `m` is not defined in this cell — it is whatever the
# per-cluster motif loop left bound (the last cluster's motif), while binomp1
# was built from all foreground sequences. Confirm this is the intended motif.
probs = [
    float(binomp1[binomp1["Residue"] == aa][i])
    for i, aa in enumerate(m.consensus)
]
print(probs)
np.mean(probs)