In [6]:
import sys
import os
import pandas as pd
import numpy as np

grandparent_dir = os.path.dirname(os.path.dirname(os.getcwd()))

sys.path.append(grandparent_dir)

from funcs import preprocessingks

dataset = 'KS2014'

print('Loading raw data for', dataset, '...')
data = pd.read_csv('/Users/maryamkoddus/Documents/maryam-ko-QMUL-MSc-Project/01_input_data/raw_data/mmc3.csv', header=0)
print('Raw data loaded.')

print(f"Dataset Columns: {data.columns}")
print(data.head())  # Print first few rows to inspect data

Loading raw data for KS2014 ...


  data = pd.read_csv('/Users/maryamkoddus/Documents/maryam-ko-QMUL-MSc-Project/01_input_data/raw_data/mmc3.csv', header=0)


Raw data loaded.
Dataset Columns: Index(['id', 'Uniprot ID', 'Protein names', 'Gene names', 'Sequence window',
       'Number of Phospho (STY)', 'Position', 'Amino acid', 'Class1',
       'Localization prob', 'Score diff', 'PEP', 'Diagnostic peak', 'Score',
       'Delta score', 'Mass error [ppm]', 'Intensity', 'Intensity C1',
       'Intensity C2', 'Intensity C3', 'Intensity C4', 'Intensity C5',
       'Intensity C6', 'Intensity E15_1', 'Intensity E15_2', 'Intensity E15_3',
       'Intensity E15_4', 'Intensity N1', 'Intensity N2', 'Intensity N3',
       'Intensity N4', 'Intensity pY_C4', 'Intensity pY_C5', 'Intensity pY_C6',
       'Intensity pY_E5_1', 'Intensity pY_E5_2', 'Intensity pY_E5_3',
       'Intensity pY_E5_4', 'Intensity pY_N1', 'Intensity pY_N2',
       'Intensity pY_N3', 'Intensity pY_N4', 'Intensity pY_PV1',
       'Intensity pY_PV2', 'Intensity pY_PV3', 'Intensity pY_PV4'],
      dtype='object')
        id                                         Uniprot ID  \
0  12045.0

In [7]:
# Ensure 'Gene names' column exists
if 'Gene names' not in data.columns:
    raise ValueError("The 'Gene names' column is missing from the dataset.")

In [8]:
# filter data to keep only those with localization probability >= 0.85
data = data[data['Localization prob'] >= 0.85] 

# Filtering out semi-colons from 'Amino acid', 'Positions', and 'Gene names' columns
data = data[~data['Amino acid'].str.contains(';', na=False)]
data = data[~data['Gene names'].str.contains(';', na=False)]

# filter data
data['Sequence window'] = data['Sequence window'].str.replace('_', '')


In [9]:
#Define the path to your FASTA file
fasta_path = "/Users/maryamkoddus/Documents/maryam-ko-QMUL-MSc-Project/01_input_data/raw_data/UP000005640_9606.fasta"

# Attempt to match sequences to FASTA sequences
data = preprocessingks.match_seq_to_genename(data, 'Sequence window')

# Check if data is None
if data is None:
    print("Error: match_seq_to_genename returned None")
else:
    # Use the gene names from the initial dataset
    data['GeneName'] = data['Gene names']
    print('Using gene names from the initial dataset.')

Amino acid sequences matched to FASTA sequences.
Error: match_seq_to_genename returned None


In [10]:
data['Phosphosite'] = data['Amino acid'].astype(str) + '(' + data['Position'].astype(str) + ')'

# Keep only 'Phosphosite' and ratio columns
keepcols = ['Phosphosite'] + ['GeneName'] + [col for col in data.columns if 'Intensity' in col]
data = data[keepcols]

print("Data after subsetting columns:", data)
print("Cols after subsetting:", data.columns)


TypeError: 'NoneType' object is not subscriptable

In [None]:
# log2 transform the ratio columns 
Intensity_columns = [col for col in data.columns if 'Intensity' in col]
data[Intensity_columns] = data[Intensity_columns].apply(pd.to_numeric, errors="coerce")
data[Intensity_columns] = np.log2(data[Intensity_columns] + 1)  # Avoid log(0) errors
print("After transformation:")
print(data.head())  # Show the first few rows after processing

In [None]:
data = preprocessingks.create_phos_ID(data) # call function to create phosphosite_ID column
print('Phosphosite IDs created.')
data = data.drop(columns=['Gene names', 'Amino acid', 'Position'])

data = preprocessingks.clean_phosID_col(data)
print("After cleaning phosphosite_ID column:")
print(data.head())

In [None]:
final_columns = ['phosphosite_ID'] + [col for col in data.columns if 'Intensity' in col]
data = data[final_columns]
print("Final dataset preview:")
print(data.head())  # Display first few rows
print(data.tail())  # Display last few rows

In [None]:
data.to_csv(f'/Users/maryamkoddus/Documents/maryam-ko-QMUL-MSc-Project/01_input_data/PreprocessedDatasets/KS2014.csv', index=False)


print(dataset, 'has been saved to CSV successfully!', data)
                              