In [7]:
import sys
import os
import pandas as pd
import numpy as np

grandparent_dir = os.path.dirname(os.path.dirname(os.getcwd()))

sys.path.append(grandparent_dir)

from funcs import preprocessing

dataset = 'MV2014'

print('Loading raw data for', dataset, '...')
data = pd.read_csv('/Users/maryamkoddus/Documents/maryam-ko-QMUL-MSc-Project/01_input_data/raw_data/e22-sup-tables1.csv', header=0)
print('Raw data loaded.')

Loading raw data for MV2014 ...
Raw data loaded.


In [8]:
data.columns = data.columns.str.strip()
print(f"Dataset Columns: {data.columns}")
print(data.head())  # Print first few rows to inspect data

# filter data to keep only those with localization probability >= 0.85
data = data[data['Localization prob'] >= 0.85] 

# Filtering out semi-colons from 'Amino acid', 'Positions within proteins', and 'Gene names' columns
data = data[~data['Amino acid'].str.contains(';', na=False)]
data = data[~data['Positions within proteins'].str.contains(';', na=False)]
data = data[~data['Gene names'].str.contains(';', na=False)]


Dataset Columns: Index(['id', 'Proteins', 'Protein names', 'Gene names', 'Amino acid',
       'Positions within proteins', 'Sequence window', 'Modified sequence',
       'Known site (PhosphoSite Plus)?', 'Known site (Uniprot)?',
       'Regulatory site (PhosphoSitePlus)?', 'Localization prob',
       'Score difference', 'Ratio M/L normalized A1___1 (2 min)',
       'Ratio M/L normalized A1___2 (2 min)',
       'Ratio M/L normalized A1___3 (2 min)',
       'Ratio H/M normalized A2___1 (2 min)',
       'Ratio H/M normalized A2___2 (2 min)',
       'Ratio H/M normalized A2___3 (2 min)',
       'Ratio L/H normalized A3___1 (2 min)',
       'Ratio L/H normalized A3___2 (2 min)',
       'Ratio L/H normalized A3___3 (2 min)',
       'Ratio H/L normalized A1___1 (10 min)',
       'Ratio H/L normalized A1___2 (10 min)',
       'Ratio H/L normalized A1___3 (10 min)',
       'Ratio L/M normalized A2___1 (10 min)',
       'Ratio L/M normalized A2___2 (10 min)',
       'Ratio L/M normalized A2___3 

In [9]:
# filter data
data['Sequence window'] = data['Sequence window'].str.replace('_', '')

preprocessing.match_seq_to_genename(data, 'Sequence window')
print('Amino acid sequences matched to gene names.')

Amino acid sequences matched to gene names.
Amino acid sequences matched to gene names.


In [10]:
print("Before creating Phosphosite, columns:", data.columns)

# Ensure GeneName exists before proceeding
if 'GeneName' not in data.columns:
    raise ValueError("GeneName column is missing! Check match_seq_to_genename function.")


data['Phosphosite'] = data['Amino acid'].astype(str) + '(' + data['Positions within proteins'].astype(str) + ')'

# Debugging: Check if 'Phosphosite' column is present after creation
if 'Phosphosite' not in data.columns:
    print("Error: 'Phosphosite' column not created!")
else:
    print("Phosphosite column created successfully.")

print("After creating Phosphosite, columns:", data.columns)  # Check if 'Phosphosite' is added


Before creating Phosphosite, columns: Index(['id', 'Proteins', 'Protein names', 'Gene names', 'Amino acid',
       'Positions within proteins', 'Sequence window', 'Modified sequence',
       'Known site (PhosphoSite Plus)?', 'Known site (Uniprot)?',
       'Regulatory site (PhosphoSitePlus)?', 'Localization prob',
       'Score difference', 'Ratio M/L normalized A1___1 (2 min)',
       'Ratio M/L normalized A1___2 (2 min)',
       'Ratio M/L normalized A1___3 (2 min)',
       'Ratio H/M normalized A2___1 (2 min)',
       'Ratio H/M normalized A2___2 (2 min)',
       'Ratio H/M normalized A2___3 (2 min)',
       'Ratio L/H normalized A3___1 (2 min)',
       'Ratio L/H normalized A3___2 (2 min)',
       'Ratio L/H normalized A3___3 (2 min)',
       'Ratio H/L normalized A1___1 (10 min)',
       'Ratio H/L normalized A1___2 (10 min)',
       'Ratio H/L normalized A1___3 (10 min)',
       'Ratio L/M normalized A2___1 (10 min)',
       'Ratio L/M normalized A2___2 (10 min)',
       'Ratio L

In [11]:
# Keep only 'Phosphosite' and ratio columns
keepcols = ['Phosphosite'] + ['GeneName'] + [col for col in data.columns if 'Ratio' in col]
data = data[keepcols]

print("Data after subsetting columns:", data)
print("Cols after subsetting:", data.columns)

Data after subsetting columns:       Phosphosite       GeneName  Ratio M/L normalized A1___1 (2 min)  \
7          S(226)       Mediator                             0.176067   
14         S(137)        Protein                                  NaN   
31         S(491)            SH3                            -1.560728   
32         S(528)            SH3                             1.245496   
58          S(62)        Protein                            -0.234720   
...           ...            ...                                  ...   
10324      Y(928)         Nestin                                  NaN   
10329        Y(6)     Caveolin-1                                  NaN   
10330       Y(14)     Caveolin-1                                  NaN   
10332       Y(87)  Transformer-2                                  NaN   
10339      Y(464)     Kanadaptin                            -0.330157   

       Ratio M/L normalized A1___2 (2 min)  \
7                                      NaN   


In [12]:
# log2 transform the ratio columns (Ratio columns)
Ratio_columns = [col for col in data.columns if 'Ratio' in col]
data[Ratio_columns] = data[Ratio_columns].apply(pd.to_numeric, errors='coerce')
print("After transformation:")
print(data.head())  # Show the first few rows after processing

After transformation:
   Phosphosite  GeneName  Ratio M/L normalized A1___1 (2 min)  \
7       S(226)  Mediator                             0.176067   
14      S(137)   Protein                                  NaN   
31      S(491)       SH3                            -1.560728   
32      S(528)       SH3                             1.245496   
58       S(62)   Protein                            -0.234720   

    Ratio M/L normalized A1___2 (2 min)  Ratio M/L normalized A1___3 (2 min)  \
7                                   NaN                                  NaN   
14                                  NaN                                  NaN   
31                                  NaN                                  NaN   
32                                  NaN                                  NaN   
58                                  NaN                                  NaN   

    Ratio H/M normalized A2___1 (2 min)  Ratio H/M normalized A2___2 (2 min)  \
7                         

In [13]:
data = preprocessing.create_phos_ID(data) # call function to create phosphosite_ID column
print('Phosphosite IDs created.')

Phosphosite IDs created.
Phosphosite IDs created.


In [14]:
ata = preprocessing.clean_phosID_col(data)
print("After cleaning phosphosite_ID column:")
print(data.head())


Phosphosites with multiple measurements have been averaged
         phosphosite_ID  Ratio M/L normalized A1___1 (2 min)  \
0            17S_S(642)                            -0.245933   
1            17S_S(676)                            -0.181199   
2           182_S(1297)                            -0.008421   
3           182_S(1328)                                  NaN   
4           182_S(1331)                                  NaN   
...                 ...                                  ...   
1245         mRNA_S(70)                                  NaN   
1246         mRNA_S(75)                                  NaN   
1247         mRNA_S(80)                            -0.193734   
1248  tRNA:m(4)X_S(397)                                  NaN   
1249        tRNA_S(318)                                  NaN   

      Ratio M/L normalized A1___2 (2 min)  \
0                                     NaN   
1                                     NaN   
2                                    

In [15]:
final_columns = ['phosphosite_ID'] + [col for col in data.columns if 'Ratio' in col]
data = data[final_columns]
print("Final dataset preview:")
print(data.head())  # Display first few rows
print(data.tail())  # Display last few rows

Final dataset preview:
     phosphosite_ID  Ratio M/L normalized A1___1 (2 min)  \
7   Mediator_S(226)                             0.176067   
14   Protein_S(137)                                  NaN   
31       SH3_S(491)                            -1.560728   
32       SH3_S(528)                             1.245496   
58    Protein_S(62)                            -0.234720   

    Ratio M/L normalized A1___2 (2 min)  Ratio M/L normalized A1___3 (2 min)  \
7                                   NaN                                  NaN   
14                                  NaN                                  NaN   
31                                  NaN                                  NaN   
32                                  NaN                                  NaN   
58                                  NaN                                  NaN   

    Ratio H/M normalized A2___1 (2 min)  Ratio H/M normalized A2___2 (2 min)  \
7                              0.040963                

In [16]:
data.to_csv(f'/Users/maryamkoddus/Documents/maryam-ko-QMUL-MSc-Project/01_input_data/PreprocessedDatasets/MV2014.csv', index=False)


print(dataset, 'has been saved to CSV successfully!', data)

MV2014 has been saved to CSV successfully!             phosphosite_ID  Ratio M/L normalized A1___1 (2 min)  \
7          Mediator_S(226)                             0.176067   
14          Protein_S(137)                                  NaN   
31              SH3_S(491)                            -1.560728   
32              SH3_S(528)                             1.245496   
58           Protein_S(62)                            -0.234720   
...                    ...                                  ...   
10324        Nestin_Y(928)                                  NaN   
10329      Caveolin-1_Y(6)                                  NaN   
10330     Caveolin-1_Y(14)                                  NaN   
10332  Transformer-2_Y(87)                                  NaN   
10339    Kanadaptin_Y(464)                            -0.330157   

       Ratio M/L normalized A1___2 (2 min)  \
7                                      NaN   
14                                     NaN   
31            