In [17]:
import numpy as np
import pandas as pd
import random
from Bio import motifs
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
from sklearn.cluster import KMeans
from msresist.pre_processing import preprocessing
import warnings
warnings.simplefilter("ignore")

In [18]:
pd.set_option('display.max_colwidth', 1000)

ABC = preprocessing(motifs=True, Vfilter=True, FCfilter=True, log2T=True)
ABC = ABC[~ABC["peptide-phosphosite"].str.contains("-")]

header = ABC.columns
treatments = ABC.columns[2:12]

data = ABC.iloc[:,2:12].T
protnames = list(ABC.iloc[:,1])
Allseqs = list(ABC.iloc[:,0])

Amino acid frequency:

In [19]:
AAfreq = {"A":0.074, "R":0.042, "N":0.044, "D":0.059, "C":0.033, "Q":0.058, "E":0.037, "G":0.074, "H":0.029, "I":0.038, "L":0.076, "K":0.072, "M":0.018, "F":0.04, "P":0.05, "S":0.081, "T":0.062, "W":0.013, "Y":0.033, "V":0.068}

Define clusters by k-means:

In [20]:
kmeans = KMeans(4).fit(data.T)
X = ABC.assign(cluster=kmeans.labels_)

seqs = []
for i in range(0, max(kmeans.labels_) + 1):
    seqs.append(list(X[X["cluster"] == i].iloc[:, 0]))

Generate Seq Instances:

In [21]:
instances = []
for i in range(len(seqs)):
    currentcl = []
    for seq in seqs[i]:
        currentcl.append(Seq(seq.upper(), IUPAC.protein))
    instances.append(currentcl)

Create Motif objects for each cluster, build a PSSM for each cluster and print the information content of the motif compared to the background (relative entropy):

In [22]:
for i in range(len(instances)):
    m = motifs.create(instances[i])
#     m.weblogo("cluster %0.f motif.png" % (i+1))
    pwm = m.counts.normalize(pseudocounts=AAfreq)
    pssm = pwm.log_odds()
    print("cluster %0.f: consensus motif = %s, mean = %0.2f, standard deviation = %0.2f, max = %0.2f, min = %0.2f" % (i+1, m.consensus, pssm.mean(), pssm.std(), pssm.max, pssm.min))

cluster 1: consensus motif = RSSPPYVSLPE, mean = 6.81, standard deviation = 2.82, max = 16.07, min = -81.62
cluster 2: consensus motif = SKEEKYGTVRS, mean = 7.85, standard deviation = 3.10, max = 18.84, min = -82.47
cluster 3: consensus motif = AEEDRYDEESD, mean = 7.55, standard deviation = 3.15, max = 19.01, min = -86.47
cluster 4: consensus motif = KSKGEYDVLVP, mean = 9.02, standard deviation = 3.30, max = 20.33, min = -85.88


Re-implement using all sequences instead of clusters:

In [23]:
instances_ = []
for seq in Allseqs:
    instances_.append(Seq(seq.upper(), IUPAC.protein))

m_ = motifs.create(instances_)
# m_.weblogo("allseqs_motif.png")
pwm_ = m_.counts.normalize(pseudocounts=AAfreq)
pssm_ = pwm_.log_odds()
print("All sequences: consensus motif = %s, mean = %0.2f, standard deviation = %0.2f, max = %0.2f, min = %0.2f" % (m_.consensus, pssm_.mean(), pssm_.std(), pssm_.max, pssm_.min))

All sequences: consensus motif = SSEGRYDTLRE, mean = 6.26, standard deviation = 2.65, max = 14.81, min = -65.60


## Re-implementation from Schwartz & Gygi Nat. Biotech 2005 and Cheng et al Bioinfo. 2018

Build Background data set and position-weight matrix:

In [24]:
bg_seqs = []
# for seq in Allseqs:
#     shuffAA = seq[:5] + seq[6:]
#     shuffled = ''.join(random.sample(shuffAA, 5)) + seq[5] + ''.join(random.sample(shuffAA, 5))
#     shuffled = ''.join(random.sample(seq,11))
#     bg_seqs.append(Seq(shuffled.upper(), IUPAC.protein))

AAlist = ["A", "R", "N", "D", "C", "Q", "E", "G", "H", "I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V"]
for i in range(len(Allseqs)*10):
    seq = ''.join(random.sample(AAlist, 11))
    bg_seqs.append(Seq(seq, IUPAC.protein))

In [25]:
bg_m = motifs.create(bg_seqs)
bg_pwm = pd.DataFrame(bg_m.counts.normalize(pseudocounts=AAfreq)).T

Build Phosphorylation data set and position-weight matrix:

In [26]:
seqs = []
for seq in Allseqs:
    seqs.append(Seq(seq.upper(), IUPAC.protein))

In [27]:
m = motifs.create(seqs)
pwm = pd.DataFrame(m.counts.normalize(pseudocounts=AAfreq)).T

Calculate Binomial Probability Matrix:

In [28]:
from scipy.stats import binom

n = len(seqs)
k = pd.DataFrame(m.counts).T.reset_index(drop=False)
p = bg_pwm

binomp = []
for i, r in k.iterrows():
    CurrentResidue = []
    for j,v in enumerate(r[1:]):
        CurrentResidue.append(binom.sf(k=v, n=n, p=p.iloc[i, j], loc=0))
    binomp.append(CurrentResidue)
    
binomp = pd.DataFrame(binomp)
binomp.insert(0, "Residue", list(k.iloc[:,0]))

In [29]:
binomp

Unnamed: 0,Residue,0,1,2,3,4,5,6,7,8,9,10
0,A,0.000641,0.035002,0.5316955,0.0011,0.156848,1.0,0.005934379,0.415715,0.9907035,0.05171859,0.003017
1,C,0.999961,0.999265,0.9999952,1.0,1.0,1.0,1.0,0.999994,1.0,0.9999432,0.999798
2,D,0.495411,0.006909,0.004265825,0.000224,0.001936,1.0,8.246268e-08,0.270433,0.8654275,0.9366898,0.008671
3,E,0.050297,0.016676,1.22426e-07,0.029388,0.000207,1.0,0.0006321408,0.002598,0.003950861,0.06030827,1e-05
4,F,0.72939,0.994977,0.9939063,0.999697,0.999719,1.0,0.9883397,0.885101,0.9837856,0.9593501,0.986665
5,G,0.000586,0.022248,0.0002309641,6.9e-05,0.054512,1.0,0.003617345,0.650728,0.3955662,0.7445757,0.003714
6,H,0.998769,0.999067,0.9749775,0.976756,0.977626,1.0,0.2168554,0.99673,0.9388931,0.9933917,0.98976
7,I,0.907372,0.911036,0.9997753,0.866787,0.541152,1.0,0.6203763,0.798375,0.01510221,0.9360438,0.777206
8,K,0.110611,0.002777,0.00972727,0.955462,0.078253,1.0,0.9538386,0.763637,0.552957,0.681746,0.055465
9,L,0.026245,0.257462,0.1693965,0.086216,0.020356,1.0,0.1764181,0.009288,3.801208e-10,0.1449148,0.307273


In [30]:
motif = list("X"*11)
positions = list(binomp.columns[1:])
AA = list(binomp.iloc[:, 0])
binomp = binomp.iloc[:, 1:]
k = k.iloc[:, 1:]
pvalCut = 10**(-6)
occurCut = 20

In [31]:
for i in range(len(positions)):
    DoS = binomp.iloc[:, i].min()
    j = binomp[binomp.iloc[:, i] == DoS].index[0]
    aa = AA[j]
    if DoS < pvalCut and k.iloc[j, i] >= occurCut:
        motif[i] = aa
    else:
        motif[i] = "x"

motif1 = ''.join(motif)

In [32]:
print(motif1)
print(m.consensus)

xxExxYDxLRx
SSEGRYDTLRE


In [52]:
# s = pd.DataFrame(Allseqs)
# s[s[0].str.contains("yD")]
for i in list(s[0]):
    if i[2] == "E" and i[5] == "y" and i[6] == "D":
        print(i)

RPEGVyDIPPT
SKEDAyDGVTS
TAEFLyDEVHP


In [None]:
i[2] == "E"
i[6] == "D"
i[8] == "L"
i[9] == "R"