In [1]:
import numpy as np
import pandas as pd
import random
from Bio import motifs
from Bio.Seq import Seq
from Bio.Alphabet import IUPAC
from sklearn.cluster import KMeans
from msresist.pre_processing import preprocessing
import warnings
warnings.simplefilter("ignore")

In [2]:
pd.set_option('display.max_colwidth', 1000)

ABC = preprocessing(motifs=True, Vfilter=True, FCfilter=True, log2T=True)
ABC = ABC[~ABC["peptide-phosphosite"].str.contains("-")]

header = ABC.columns
treatments = ABC.columns[2:12]

data = ABC.iloc[:,2:12].T
protnames = list(ABC.iloc[:,1])
Allseqs = list(ABC.iloc[:,0])

Amino acid frequency:

In [None]:
AAfreq = {"A":0.074, "R":0.042, "N":0.044, "D":0.059, "C":0.033, "Q":0.058, "E":0.037, "G":0.074, "H":0.029, "I":0.038, "L":0.076, "K":0.072, "M":0.018, "F":0.04, "P":0.05, "S":0.081, "T":0.062, "W":0.013, "Y":0.033, "V":0.068}

Define clusters by k-means:

In [3]:
kmeans = KMeans(4).fit(data.T)
X = ABC.assign(cluster=kmeans.labels_)

seqs = []
for i in range(0, max(kmeans.labels_) + 1):
    seqs.append(list(X[X["cluster"] == i].iloc[:, 0]))

Generate Seq Instances:

In [4]:
instances = []
for i in range(len(seqs)):
    currentcl = []
    for seq in seqs[i]:
        currentcl.append(Seq(seq.upper(), IUPAC.protein))
    instances.append(currentcl)

Create Motif objects for each cluster, build a PSSM for each cluster and print the information content of the motif compared to the background (relative entropy):

In [5]:
for i in range(len(instances)):
    m = motifs.create(instances[i])
#     m.weblogo("cluster %0.f motif.png" % (i+1))
    pwm = m.counts.normalize(pseudocounts=AAfreq)
    pssm = pwm.log_odds()
    print("cluster %0.f: consensus motif = %s, mean = %0.2f, standard deviation = %0.2f, max = %0.2f, min = %0.2f" % (i+1, m.consensus, pssm.mean(), pssm.std(), pssm.max, pssm.min))

cluster 1: consensus motif = AEEDRYDEESD, mean = 7.55, standard deviation = 3.15, max = 19.01, min = -86.47
cluster 2: consensus motif = KSKGEYDVLVP, mean = 9.02, standard deviation = 3.30, max = 20.33, min = -85.88
cluster 3: consensus motif = RSSPPYVSLPE, mean = 6.81, standard deviation = 2.82, max = 16.07, min = -81.62
cluster 4: consensus motif = SKEEKYGTVRS, mean = 7.85, standard deviation = 3.10, max = 18.84, min = -82.47


Re-implement using all sequences instead of clusters:

In [6]:
instances_ = []
for seq in Allseqs:
    instances_.append(Seq(seq.upper(), IUPAC.protein))

m_ = motifs.create(instances_)
# m_.weblogo("allseqs_motif.png")
pwm_ = m_.counts.normalize(pseudocounts=AAfreq)
pssm_ = pwm_.log_odds()
print("All sequences: consensus motif = %s, mean = %0.2f, standard deviation = %0.2f, max = %0.2f, min = %0.2f" % (m_.consensus, pssm_.mean(), pssm_.std(), pssm_.max, pssm_.min))

All sequences: consensus motif = SSEGRYDTLRE, mean = 6.26, standard deviation = 2.65, max = 14.81, min = -65.60


## Re-implementation from Schwartz & Gygi Nat. Biotech 2005 and Cheng et al Bioinfo. 2018

Build Background data set and position-weight matrix:

In [7]:
bg_seqs = []
# for seq in Allseqs:
#     shuffAA = seq[:5] + seq[6:]
#     shuffled = ''.join(random.sample(shuffAA, 5)) + seq[5] + ''.join(random.sample(shuffAA, 5))
#     shuffled = ''.join(random.sample(seq,11))
#     bg_seqs.append(Seq(shuffled.upper(), IUPAC.protein))

AAlist = ["A", "R", "N", "D", "C", "Q", "E", "G", "H", "I", "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V"]
for i in range(len(Allseqs)*10):
    seq = ''.join(random.sample(AAlist, 11))
    bg_seqs.append(Seq(seq, IUPAC.protein))

In [8]:
bg_m = motifs.create(bg_seqs)
bg_pwm = pd.DataFrame(bg_m.counts.normalize(pseudocounts=AAfreq)).T

Build Phosphorylation data set and position-weight matrix:

In [9]:
seqs = []
for seq in Allseqs:
    seqs.append(Seq(seq.upper(), IUPAC.protein))

In [10]:
m = motifs.create(seqs)
pwm = pd.DataFrame(m.counts.normalize(pseudocounts=AAfreq)).T

Calculate Binomial Probability Matrix:

In [11]:
from scipy.stats import binom

n = len(seqs)
k = pd.DataFrame(m.counts).T.reset_index(drop=False)
p = bg_pwm

binomp = []
for i, r in k.iterrows():
    CurrentResidue = []
    for j,v in enumerate(r[1:]):
        CurrentResidue.append(binom.sf(k=v, n=n, p=p.iloc[i, j], loc=0))
    binomp.append(CurrentResidue)
    
binomp = pd.DataFrame(binomp)
binomp.insert(0, "Residue", list(k.iloc[:,0]))

In [12]:
binomp

Unnamed: 0,Residue,0,1,2,3,4,5,6,7,8,9,10
0,A,2.4e-05,0.080847,0.273666,0.001552,0.1882676,1.0,0.01502284,0.247065,0.9926016,0.073578,0.033198
1,C,0.999886,0.998383,0.999986,0.999999,0.9999999,1.0,1.0,1.0,1.0,0.999997,0.999745
2,D,0.328954,0.024721,0.001635,0.000248,0.004265825,1.0,2.676856e-10,0.294732,0.7716881,0.85173,0.012234
3,E,0.067883,0.04963,4e-06,0.006897,0.002028029,1.0,6.111042e-06,0.000909,0.0005999917,0.205295,0.000273
4,F,0.844425,0.984746,0.998067,0.999739,0.9985972,1.0,0.9906856,0.970869,0.9738602,0.981363,0.994617
5,G,0.000487,0.013183,0.001596,0.000106,0.05741238,1.0,0.02472412,0.315761,0.2494848,0.8778,0.00131
6,H,0.998587,0.999652,0.994939,0.992862,0.9960654,1.0,0.3364869,0.995814,0.9145477,0.981927,0.991402
7,I,0.957078,0.887161,0.99915,0.889355,0.4299057,1.0,0.3741711,0.555519,0.1292412,0.9388,0.955392
8,K,0.027113,0.002034,0.041574,0.930315,0.05038896,1.0,0.9580251,0.871727,0.4165319,0.484905,0.006117
9,L,0.045376,0.377535,0.18243,0.315455,0.01692549,1.0,0.3561996,0.001446,1.083465e-09,0.041494,0.267486


In [13]:
motif = list("X"*11)
positions = list(binomp.columns[1:])
AA = list(binomp.iloc[:, 0])
binomp = binomp.iloc[:, 1:]
k = k.iloc[:, 1:]
pvalCut = 10**(-6)
occurCut = 20

In [14]:
for i in range(len(positions)):
    DoS = binomp.iloc[:, i].min()
    j = binomp[binomp.iloc[:, i] == DoS].index[0]
    aa = AA[j]
    if DoS < pvalCut and k.iloc[j, i] >= occurCut:
        motif[i] = aa
    else:
        motif[i] = "x"

motif1 = ''.join(motif)

In [15]:
print(motif1)
print(m.consensus)

xxxxRYDxLxx
SSEGRYDTLRE


In [16]:
s = pd.DataFrame(Allseqs)
s[s[0].str.contains("RyD")]

Unnamed: 0,0
249,PsPVRyDNLSR
