# Computing the structural disorder of the proteins in our dataset

In [None]:
import os
import sys
import pandas as pd
import Bio

In [2]:
def read_pdb(pdbcode, pdbfilenm):
    """
    Read a PDB structure from a file.

    Failures are reported on stderr instead of being raised, so a caller
    looping over many files can carry on past a bad one.

    :param pdbcode: A PDB ID string
    :param pdbfilenm: The PDB file
    :return: a Bio.PDB.Structure object or None if something went wrong
    """
    try:
        # A bare ``import Bio`` does not load the ``Bio.PDB`` subpackage, so
        # import the parser explicitly instead of relying on some other
        # module having pulled it in already.
        from Bio.PDB import PDBParser

        pdbparser = PDBParser(QUIET=True)   # suppress PDBConstructionWarning
        return pdbparser.get_structure(pdbcode, pdbfilenm)
    except Exception as err:
        print(str(err), file=sys.stderr)
        return None

In [3]:
# Load one example structure to try the helpers on.
# NOTE: read_pdb returns None on failure, so `struct` may be None here.
struct = read_pdb('A0A0B4J1F3','../data/pdbs/A0A0B4J1F3.pdb')

In [4]:
len(list(struct.get_residues()))

663

In [5]:
def get_plddts(structure: "Bio.PDB.Structure.Structure"):
    """
    Collect one score per residue from the alpha-carbon B-factor column.

    The value of a residue is stored at index ``residue number - 1``; the
    B-factor appears to hold the pLDDT in these files. Positions without a
    CA atom keep the placeholder value 0.

    :param structure: object exposing ``get_residues()`` (e.g. a Bio.PDB structure)
    :return: list of scores, indexed by residue number - 1
    :raises ValueError: if a residue is numbered below 1
    """
    residues = list(structure.get_residues())   # walk the structure only once
    if not residues:
        return []

    # [1] because the id is structured as (' ', {id starting from 1}, ' ')
    numbers = [int(residue.id[1]) for residue in residues]
    if min(numbers) < 1:
        # A zero/negative number would wrap around and overwrite the tail.
        raise ValueError(f'residue numbering must start at 1, got {min(numbers)}')

    # Numbering with gaps or an offset can exceed the residue count, so size
    # the list for the largest residue number as well.
    plddts = [0] * max(len(residues), max(numbers))
    for number, residue in zip(numbers, residues):
        if 'CA' in residue:   # carbon alpha atom
            plddts[number - 1] = residue['CA'].bfactor

    return plddts

get_plddts(struct)

[43.33,
 34.15,
 39.88,
 37.2,
 44.85,
 37.09,
 42.09,
 46.55,
 39.62,
 43.41,
 40.28,
 45.23,
 36.89,
 42.39,
 36.39,
 39.97,
 31.83,
 37.72,
 32.6,
 43.22,
 32.4,
 31.89,
 36.71,
 50.63,
 70.73,
 84.01,
 88.36,
 91.68,
 93.62,
 94.94,
 96.11,
 94.88,
 94.8,
 92.6,
 91.13,
 89.61,
 91.54,
 92.2,
 93.69,
 92.79,
 93.47,
 92.17,
 93.24,
 94.75,
 95.25,
 96.31,
 96.61,
 95.78,
 94.67,
 93.33,
 90.02,
 85.66,
 82.09,
 81.68,
 83.56,
 85.31,
 90.42,
 93.7,
 94.54,
 96.37,
 96.78,
 95.82,
 92.75,
 91.81,
 94.93,
 95.24,
 93.88,
 94.81,
 93.47,
 91.83,
 88.34,
 83.86,
 73.11,
 64.44,
 59.61,
 65.42,
 67.29,
 78.1,
 86.82,
 91.58,
 92.49,
 94.1,
 95.35,
 93.61,
 92.77,
 89.66,
 90.7,
 92.77,
 93.48,
 92.31,
 92.23,
 94.48,
 94.71,
 95.43,
 96.89,
 97.47,
 97.28,
 95.46,
 94.13,
 92.11,
 88.55,
 83.22,
 75.97,
 73.73,
 83.82,
 85.78,
 89.77,
 89.76,
 93.92,
 94.97,
 94.15,
 95.59,
 95.7,
 94.88,
 96.03,
 94.84,
 95.2,
 93.85,
 86.09,
 85.73,
 88.71,
 89.33,
 93.43,
 93.74,
 94.02,
 94.38,
 93.

In [6]:
def mark_disordered(plddts, threshold=50):
    """Return a 0/1 list: 1 where the score is strictly below ``threshold``, else 0."""
    return [int(score < threshold) for score in plddts]

# Per-residue 0/1 disorder flags of the example structure (default threshold).
disordered = mark_disordered(get_plddts(struct))
disordered

[1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,


In [7]:
import numpy as np

# Sanity check: random 0/1 labels (randint's upper bound 2 is exclusive)
# compared against the disorder flags of the example protein.
labels = [np.random.randint(0, 2) for _ in range(len(disordered))]
# 2x2 correlation matrix; the off-diagonal entry is the correlation of interest.
np.corrcoef(labels, disordered)

array([[ 1.        , -0.01305589],
       [-0.01305589,  1.        ]])

In [6]:
# Protein table; later cells read its 'id', 'sites' and 'sequence' columns.
prot_info = pd.read_json('../data/phosphosite_sequences/phosphosite_df.json')

In [9]:
# Sequence of the example protein; .item() requires exactly one matching row.
seq = prot_info.loc[prot_info['id'] == 'A0A0B4J1F3', 'sequence']
len(seq.item())

663

In [10]:
len(disordered)

663

In [11]:
# Predictions table; only its 'id' column is used below.
dummy_preds = pd.read_json('../data/preds.json')
dummy_preds

Unnamed: 0,id,sequence,label,probabilities,predictions
0,A0A024R4G9,MTVLEAVLEIQAITGSRLLSMVPGPARPPGSCWDPTQCTRTWLLSH...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,A0A087WP46,MARDGAEQPDSGPLPRPSPCPQEDRASNLMPPKPPRTWGLQLQGPS...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
11,A0A096LP55,MGLEDEQKMLTESGDPEEEEEEEEELVDPLTTVREQCEQLEKCVKA...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
12,A0A096MIZ1,MAEAEVHKERLQAIAEKRKRQTEIEGKRQQLDEQVLLLQHSKSKVL...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
16,A0A096MK47,MTSCVLAGSIETTPKVSPGDSEAKPLIFTFVPTLRRLPTHIQLADT...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...
42216,XP_576354,MGLPTLEFSDSYLDSPDFRERLQCHEIELERTNKFIKELLKDGSLL...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
42229,XP_577705,MIQVDGNWSGSISTDKMPTETEEELPTPTPKPSISEEGNFHSQYQV...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
42230,XP_577708,MSRSEETEMHGLNGEKQIGMVERLTCQKLQTPSRMTMENSEQFPSS...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
42241,XP_897723,MPAQSEEELPLPSPQPGNSEDRNFHSQYKVIRTIGHGTFAKVLLAQ...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [12]:
import glob

# Ids of the structure files on disk that also occur in the predictions table.
ids = set(dummy_preds.id.unique())
pdb_ids = []
for path in glob.glob('../data/pdbs/*.pdb'):
    # `pdb_id` rather than `id`, so the builtin id() stays usable in the notebook.
    pdb_id = os.path.basename(path)[:-4]   # drop the '.pdb' extension
    if pdb_id in ids:
        pdb_ids.append(pdb_id)


In [13]:
len(pdb_ids)

8329

In [14]:
# Ids of all structure files on disk (file name without the '.pdb' extension).
# A comprehension keeps the builtin id() from being shadowed by a loop variable.
pdb_ids = [os.path.basename(path)[:-4] for path in glob.glob('../data/pdbs/*.pdb')]

In [15]:
# Residue count of every structure on disk, kept in step with its id.
pdb_ids = []
lengths = []
for path in glob.glob('../data/pdbs/*.pdb'):
    pdb_id = os.path.basename(path)[:-4]   # drop the '.pdb' extension
    struct = read_pdb(pdb_id, path)
    print(pdb_id)   # progress output
    if struct is None:
        # read_pdb returns None on failure; skip so both lists stay aligned.
        continue
    # The id must be recorded with the length: the frame built from these
    # two lists needs them to be equally long.
    pdb_ids.append(pdb_id)
    lengths.append(sum(1 for _ in struct.get_residues()))

A0A024R4G9
A0A075B759
A0A087WP46
A0A087WPF7
A0A087WQ89
A0A087WQP5
A0A087WR82
A0A096LP49
A0A096LP55
A0A096MIZ1
A0A096MJJ4
A0A096MJN4
A0A096MK47
A0A096MKA8
A0A0A6YY25
A0A0B4J1F3
A0A0B4J1G0
A0A0B4J1L0
A0A0B4J1M2
A0A0B4J2A2
A0A0B4J2F2
A0A0C4DFX5
A0A0G2JFH3
A0A0G2JTM7
A0A0G2JTR4
A0A0G2JTZ2
A0A0G2JUG7
A0A0G2JV04
A0A0G2JV12
A0A0G2JVL5
A0A0G2JVZ7
A0A0G2JXN2
A0A0G2JY22
A0A0G2JZ71
A0A0G2JZA1
A0A0G2K094
A0A0G2K0D3
A0A0G2K1J3
A0A0G2K1Q8
A0A0G2K344
A0A0G2K369
A0A0G2K3F3
A0A0G2K472
A0A0G2K475
A0A0G2K543
A0A0G2K719
A0A0G2K744
A0A0G2K7U7
A0A0G2K9M5
A0A0G2KA14
A0A0G2QC33
A0A0J9YU71
A0A0J9YUW3
A0A0J9YWL9
A0A0M3U1B0
A0A0R4J0D1
A0A0U1RPR8
A0A140LIF8
A0A1B0GTR3
A0A1D5RMD1
A0A1W2P872
A0A1W2P884
A0A286YCX6
A0A286YDN9
A0A2U3TZ82
A0A338P6K9
A0A494BA31
A0A494BBH0
A0A571BF63
A0A5F8MPU3
A0AUP1
A0AUZ9
A0AV02
A0AV96
A0AVF1
A0AVI2
A0AVI4
A0AVK6
A0AVT1
A0FGR8
A0FGR9
A0FK58
A0FKI7
A0JLT2
A0JN27
A0JN29
A0JN30
A0JNC2
A0JNI5
A0JNT9
A0JNU3
A0JNW5
A0JNY3
A0JP26
A0JP43
A0JPJ0
A0JPM9
A0JPN0
A0JPN3
A0JPN4
A0JPP5
A0JPP6
A0JPP8

KeyboardInterrupt: 

In [20]:
# One row per structure: its id and its residue count.
pdb_lengths = pd.DataFrame({'id': pdb_ids, 'length': lengths})

In [21]:
pdb_lengths

Unnamed: 0,id,length
0,A0A024R4G9,117
1,A0A075B759,164
2,A0A087WP46,1202
3,A0A087WPF7,1261
4,A0A087WQ89,206
...,...,...
37099,V9GX81,722
37100,V9GXG1,899
37101,W4VSP2,855
37102,W4VSP4,379


In [23]:
# Save the lengths with 'id' as the index column of the CSV.
pdb_lengths.set_index('id').to_csv('../data/pdb_lengths.csv')

In [28]:
# Keep only the proteins that have a structure, adding the structure length.
pdb_prot_info = prot_info.set_index('id').join(
    pdb_lengths.set_index('id'),
    on='id',
    how='inner',
)

In [29]:
pdb_prot_info

Unnamed: 0_level_0,sites,sequence,length
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A0A024R4G9,"[14, 16, 20]",MTVLEAVLEIQAITGSRLLSMVPGPARPPGSCWDPTQCTRTWLLSH...,117
A0A075B759,"[40, 79, 93, 119]",MVNSVVFFEITRDGKPLGRISIKLFADKIPKTAENFRALSTGEKGF...,164
A0A087WP46,"[359, 972, 973, 974, 988, 997, 1000, 1005, 101...",MARDGAEQPDSGPLPRPSPCPQEDRASNLMPPKPPRTWGLQLQGPS...,1202
A0A087WPF7,"[32, 43, 622, 626, 798, 941, 956, 1031, 1038, ...",MDGPTRGHGLRKKRRSRSQRDRERRSRAGLGTGAAGGIGAGRTRAP...,1261
A0A087WQ89,"[18, 26]",METPIQREIRRSCEREESLRRSRGLSPGRAGEELIELRVRPVLSRP...,206
...,...,...,...
V9GX81,"[702, 705]",MAGGVWGRGRDGRDGPVGSLTLTALAEGIRASQGQPVGPSSTGPQS...,722
V9GXG1,"[24, 26, 406]",MPYAEITVNLGKVTLGEENRKKMTNSCLKRHENSSLVQAVCALLNS...,899
W4VSP2,"[174, 175, 184, 272, 277]",MKLLIAFSPLVVLILFQEHISCYYLTKYASSGYYQDADFVIGGLFS...,855
W4VSP4,[130],MMDPLLEANATFALNLLKILGEDRSKNVFLSPISISSALVMVLLGA...,379


In [32]:
# True where the sequence has exactly as many residues as the structure.
pdb_prot_info['length_matches'] = pdb_prot_info['sequence'].map(len) == pdb_prot_info['length']

In [33]:
pdb_prot_info

Unnamed: 0_level_0,sites,sequence,length,length_matches
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
A0A024R4G9,"[14, 16, 20]",MTVLEAVLEIQAITGSRLLSMVPGPARPPGSCWDPTQCTRTWLLSH...,117,True
A0A075B759,"[40, 79, 93, 119]",MVNSVVFFEITRDGKPLGRISIKLFADKIPKTAENFRALSTGEKGF...,164,True
A0A087WP46,"[359, 972, 973, 974, 988, 997, 1000, 1005, 101...",MARDGAEQPDSGPLPRPSPCPQEDRASNLMPPKPPRTWGLQLQGPS...,1202,False
A0A087WPF7,"[32, 43, 622, 626, 798, 941, 956, 1031, 1038, ...",MDGPTRGHGLRKKRRSRSQRDRERRSRAGLGTGAAGGIGAGRTRAP...,1261,True
A0A087WQ89,"[18, 26]",METPIQREIRRSCEREESLRRSRGLSPGRAGEELIELRVRPVLSRP...,206,True
...,...,...,...,...
V9GX81,"[702, 705]",MAGGVWGRGRDGRDGPVGSLTLTALAEGIRASQGQPVGPSSTGPQS...,722,True
V9GXG1,"[24, 26, 406]",MPYAEITVNLGKVTLGEENRKKMTNSCLKRHENSSLVQAVCALLNS...,899,True
W4VSP2,"[174, 175, 184, 272, 277]",MKLLIAFSPLVVLILFQEHISCYYLTKYASSGYYQDADFVIGGLFS...,855,True
W4VSP4,[130],MMDPLLEANATFALNLLKILGEDRSKNVFLSPISISSALVMVLLGA...,379,True


In [35]:
# Count the rows whose length_matches flag is False.
print(f'Number of mismatched proteins {(~pdb_prot_info["length_matches"]).sum()}')

Number of mismatched proteins 598


In [43]:
# Length of each sequence from the protein table.
pdb_prot_info['psp_length'] = pdb_prot_info['sequence'].map(len)

In [48]:
# Length comparison table: sequence length, structure length, and whether they agree.
match_df = pdb_prot_info.rename(columns={'length' : 'pdb_structure_length'})[
    ['psp_length', 'pdb_structure_length', 'length_matches']
]
match_df.to_csv('../data/pdb_match_df.csv')

In [49]:
# Rows where the two lengths disagree.
mismatched = match_df.loc[~match_df['length_matches']]
mismatched

Unnamed: 0_level_0,psp_length,pdb_structure_length,length_matches
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A0A087WP46,1171,1202,False
A0A096MK47,966,807,False
A0A0G2JVZ7,1511,1834,False
A0A0G2K3F3,579,704,False
A0A140LIF8,395,407,False
...,...,...,...
Q9WVE9,1217,1713,False
Q9WVP6,642,641,False
Q9WVR1,647,648,False
Q9Y4E1,1341,1320,False


In [61]:
# Number of ids that occur both in reps_cnn_preds and among the length mismatches.
# NOTE(review): reps_cnn_preds is not defined in any visible cell; running the
# notebook top to bottom would raise NameError here. Confirm where it is loaded.
len(set(reps_cnn_preds['id']) & set(mismatched.index))

47

In [62]:
reps_cnn_preds.shape[0]

3714

In [50]:
# Save both lengths of the mismatching proteins (the index is written as well).
mismatched[['psp_length', 'pdb_structure_length']].to_csv('../data/mismatch.csv')

In [42]:
pdb_prot_info

Unnamed: 0_level_0,sites,sequence,length,length_matches,psp_length
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A0A024R4G9,"[14, 16, 20]",MTVLEAVLEIQAITGSRLLSMVPGPARPPGSCWDPTQCTRTWLLSH...,117,True,117
A0A075B759,"[40, 79, 93, 119]",MVNSVVFFEITRDGKPLGRISIKLFADKIPKTAENFRALSTGEKGF...,164,True,164
A0A087WP46,"[359, 972, 973, 974, 988, 997, 1000, 1005, 101...",MARDGAEQPDSGPLPRPSPCPQEDRASNLMPPKPPRTWGLQLQGPS...,1202,False,1171
A0A087WPF7,"[32, 43, 622, 626, 798, 941, 956, 1031, 1038, ...",MDGPTRGHGLRKKRRSRSQRDRERRSRAGLGTGAAGGIGAGRTRAP...,1261,True,1261
A0A087WQ89,"[18, 26]",METPIQREIRRSCEREESLRRSRGLSPGRAGEELIELRVRPVLSRP...,206,True,206
...,...,...,...,...,...
V9GX81,"[702, 705]",MAGGVWGRGRDGRDGPVGSLTLTALAEGIRASQGQPVGPSSTGPQS...,722,True,722
V9GXG1,"[24, 26, 406]",MPYAEITVNLGKVTLGEENRKKMTNSCLKRHENSSLVQAVCALLNS...,899,True,899
W4VSP2,"[174, 175, 184, 272, 277]",MKLLIAFSPLVVLILFQEHISCYYLTKYASSGYYQDADFVIGGLFS...,855,True,855
W4VSP4,[130],MMDPLLEANATFALNLLKILGEDRSKNVFLSPISISSALVMVLLGA...,379,True,379


In [19]:
# Per-residue disorder flags for every structure on disk.
pdb_data = {'id' : [], 'is_disordered' : []}
for path in glob.glob('../data/pdbs/*.pdb'):
    # `pdb_id` rather than `id`, so the builtin id() stays usable in the notebook.
    pdb_id = os.path.basename(path)[:-4]   # drop the '.pdb' extension
    struct = read_pdb(pdb_id, path)
    if struct is None:
        # read_pdb returns None on failure; skip instead of crashing in get_plddts.
        continue
    disordered_labels = mark_disordered(get_plddts(struct))
    pdb_data['id'].append(pdb_id)
    pdb_data['is_disordered'].append(disordered_labels)

pdb_df = pd.DataFrame.from_dict(pdb_data, 'columns')
pdb_df

KeyboardInterrupt: 

In [31]:
# Inspect one protein whose structure length differs from its sequence length.
struct = read_pdb('A0A087WP46', '../data/pdbs/A0A087WP46.pdb')
len(get_plddts(struct))

1202

In [32]:
# Persist the disorder flags. Requires pdb_df from the loop above; if that
# loop was interrupted, pdb_df does not exist and this raises NameError.
# NOTE(review): the '50' in the file name presumably refers to the threshold; confirm.
pdb_df.to_json('../data/disordered_50_df.json')

NameError: name 'pdb_df' is not defined

In [2]:
# Reload the saved disorder flags rather than recomputing them.
pdb_df = pd.read_json('../data/disordered_50_df.json')

In [3]:
pdb_df

Unnamed: 0,id,is_disordered
0,A0A024R4G9,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,A0A075B759,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,A0A087WP46,"[1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,A0A087WPF7,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, ..."
4,A0A087WQ89,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...
37099,V9GX81,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
37100,V9GXG1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
37101,W4VSP2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
37102,W4VSP4,"[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [7]:
prot_info

Unnamed: 0,id,sites,sequence
0,A0A024R4G9,"[14, 16, 20]",MTVLEAVLEIQAITGSRLLSMVPGPARPPGSCWDPTQCTRTWLLSH...
1,A0A075B759,"[40, 79, 93, 119]",MVNSVVFFEITRDGKPLGRISIKLFADKIPKTAENFRALSTGEKGF...
2,A0A087WP46,"[359, 972, 973, 974, 988, 997, 1000, 1005, 101...",MARDGAEQPDSGPLPRPSPCPQEDRASNLMPPKPPRTWGLQLQGPS...
3,A0A087WPF7,"[32, 43, 622, 626, 798, 941, 956, 1031, 1038, ...",MDGPTRGHGLRKKRRSRSQRDRERRSRAGLGTGAAGGIGAGRTRAP...
4,A0A087WQ53,[58],MGQNNNVTEFILLGLTQDPAGQKVLFVMFLLIYIVKIVGNLLIVGT...
...,...,...,...
42252,XP_997087,"[347, 907, 915, 918, 927]",MENFLALMNSISDTWMSPSCMDIAMDMGIAFVCGAGLFFLLLPFLK...
42253,YP_009725299,"[504, 660, 661, 794, 1826]",APTKVTFGDDTVIEVQGYKSVNITFELDERIDKVLNEKCSAYTVEL...
42254,YP_009725305,[5],NNELSPVALRQMSCAAGTTQTACTDDNALAYYNTTKGGRFVLALLS...
42255,YP_009725309,[56],AENVTGLFKDCSKVITGLHPTQAPTHLSVDTKFKTEGLCVDIPGIP...


In [15]:
# Attach 'sites' and 'sequence' to every structure row by matching on id.
# join() defaults to a left join, so every row of pdb_df is kept.
joined = pdb_df.join(prot_info.set_index('id'), on='id')
joined

Unnamed: 0,id,is_disordered,sites,sequence
0,A0A024R4G9,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[14, 16, 20]",MTVLEAVLEIQAITGSRLLSMVPGPARPPGSCWDPTQCTRTWLLSH...
1,A0A075B759,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[40, 79, 93, 119]",MVNSVVFFEITRDGKPLGRISIKLFADKIPKTAENFRALSTGEKGF...
2,A0A087WP46,"[1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[359, 972, 973, 974, 988, 997, 1000, 1005, 101...",MARDGAEQPDSGPLPRPSPCPQEDRASNLMPPKPPRTWGLQLQGPS...
3,A0A087WPF7,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, ...","[32, 43, 622, 626, 798, 941, 956, 1031, 1038, ...",MDGPTRGHGLRKKRRSRSQRDRERRSRAGLGTGAAGGIGAGRTRAP...
4,A0A087WQ89,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[18, 26]",METPIQREIRRSCEREESLRRSRGLSPGRAGEELIELRVRPVLSRP...
...,...,...,...,...
37099,V9GX81,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[702, 705]",MAGGVWGRGRDGRDGPVGSLTLTALAEGIRASQGQPVGPSSTGPQS...
37100,V9GXG1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[24, 26, 406]",MPYAEITVNLGKVTLGEENRKKMTNSCLKRHENSSLVQAVCALLNS...
37101,W4VSP2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[174, 175, 184, 272, 277]",MKLLIAFSPLVVLILFQEHISCYYLTKYASSGYYQDADFVIGGLFS...
37102,W4VSP4,"[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[130],MMDPLLEANATFALNLLKILGEDRSKNVFLSPISISSALVMVLLGA...


In [9]:
# Table with a 'length_matches' column, used below to filter proteins.
# NOTE(review): no visible cell writes 'pdb_mismatch.csv' (the cells above
# write 'pdb_match_df.csv' and 'mismatch.csv'). Confirm which file is meant.
pdb_mismatch = pd.read_csv('../data/pdb_mismatch.csv')

In [16]:
# Keep only the proteins whose sequence and structure lengths agree.
# Select by id, not by row position: `joined` and `pdb_mismatch` are separate
# frames, and a boolean mask taken from one only lines up with the other if
# their rows happen to be in the same order.
# NOTE(review): assumes pdb_mismatch has an 'id' column; confirm against the CSV.
matching_ids = pdb_mismatch.loc[pdb_mismatch['length_matches'], 'id']
matching_prots = joined[joined['id'].isin(matching_ids)]
matching_prots

Unnamed: 0,id,is_disordered,sites,sequence
0,A0A024R4G9,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[14, 16, 20]",MTVLEAVLEIQAITGSRLLSMVPGPARPPGSCWDPTQCTRTWLLSH...
1,A0A075B759,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[40, 79, 93, 119]",MVNSVVFFEITRDGKPLGRISIKLFADKIPKTAENFRALSTGEKGF...
3,A0A087WPF7,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, ...","[32, 43, 622, 626, 798, 941, 956, 1031, 1038, ...",MDGPTRGHGLRKKRRSRSQRDRERRSRAGLGTGAAGGIGAGRTRAP...
4,A0A087WQ89,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[18, 26]",METPIQREIRRSCEREESLRRSRGLSPGRAGEELIELRVRPVLSRP...
5,A0A087WQP5,"[0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[98, 102, 107]",MGCCGCGGCGGCGGCGCGGCGCGGCGCGGCGCGGCGCGGCGCGGCG...
...,...,...,...,...
37099,V9GX81,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[702, 705]",MAGGVWGRGRDGRDGPVGSLTLTALAEGIRASQGQPVGPSSTGPQS...
37100,V9GXG1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[24, 26, 406]",MPYAEITVNLGKVTLGEENRKKMTNSCLKRHENSSLVQAVCALLNS...
37101,W4VSP2,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[174, 175, 184, 272, 277]",MKLLIAFSPLVVLILFQEHISCYYLTKYASSGYYQDADFVIGGLFS...
37102,W4VSP4,"[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[130],MMDPLLEANATFALNLLKILGEDRSKNVFLSPISISSALVMVLLGA...


In [36]:
import numpy as np
from ast import literal_eval
from sklearn.metrics import matthews_corrcoef

In [61]:
def analyze_correlation(df, residues=None):
    """
    Print how disorder flags relate to phosphorylation for the given residues.

    Only sequence positions whose amino acid is in ``residues`` are compared.
    Two lines are printed: the Matthews correlation coefficient between the
    disorder flag and the phosphorylation label over those positions, and the
    fraction (0..1, despite the label "Percentage") of selected phosphorylated
    sites that are flagged as disordered.

    :param df: frame with the columns 'sequence', 'sites' (1-based positions,
        as ints or numeric strings) and 'is_disordered' (one flag per residue)
    :param residues: one-letter amino-acid codes to consider; defaults to
        {'S', 'T', 'Y'}
    :return: None
    """
    if residues is None:
        # Built per call so the default is never a shared mutable object.
        residues = {'S', 'T', 'Y'}

    disorder_flags = []
    phospho_labels = []
    site_disordered = []

    for _, row in df.iterrows():
        sequence = row['sequence']
        is_disordered = np.asarray(row['is_disordered'])
        mask = np.array([char in residues for char in sequence], dtype=bool)

        # int() instead of eval(): same result for numeric strings, also works
        # for plain ints, and never executes arbitrary text from the data.
        positions = (int(site) - 1 for site in row['sites'])
        # dtype=int keeps an empty selection usable as an index array.
        sites = np.array([pos for pos in positions if sequence[pos] in residues], dtype=int)

        labels = np.zeros(len(sequence))
        labels[sites] = 1

        site_disordered.extend(is_disordered[sites])
        disorder_flags.extend(is_disordered[mask])
        phospho_labels.extend(labels[mask])

    buf_x = np.array(disorder_flags)
    buf_y = np.array(phospho_labels)
    mcc = matthews_corrcoef(buf_x, buf_y)

    # No selected site at all would otherwise be a ZeroDivisionError.
    if site_disordered:
        fraction = sum(site_disordered) / len(site_disordered)
    else:
        fraction = float('nan')

    print(f'Correlation between disordered sites and phosphorylation of {residues} is \nMCC: {mcc}')
    print(f'Percentage of phosphorylated sites being disordered: {fraction}')


In [62]:
# Residue groups to analyse: S/T/Y together, S/T together, then each alone.
# Named `residue_set` rather than `type`, so the builtin type() is not
# overwritten for the rest of the notebook session.
residue_sets = [
    {'S', 'T', 'Y'},
    {'S', 'T'},
    {'S'},
    {'T'},
    {'Y'},
]
for residue_set in residue_sets:
    analyze_correlation(matching_prots, residue_set)

Correlation between disordered sites and phosphorylation of {'T', 'S', 'Y'} is 
MCC: 0.1248780426226746
Percentage of phosphorylated sites being disordered: 0.53432716937754
Correlation between disordered sites and phosphorylation of {'T', 'S'} is 
MCC: 0.13046793136861168
Percentage of phosphorylated sites being disordered: 0.5806488071061507
Correlation between disordered sites and phosphorylation of {'S'} is 
MCC: 0.14302940844669923
Percentage of phosphorylated sites being disordered: 0.6264375819331666
Correlation between disordered sites and phosphorylation of {'T'} is 
MCC: 0.07890527459587142
Percentage of phosphorylated sites being disordered: 0.4533157065406084
Correlation between disordered sites and phosphorylation of {'Y'} is 
MCC: 0.08193402517427602
Percentage of phosphorylated sites being disordered: 0.26283431315724104
