In [None]:
#|default_exp app_v2

In [1]:
#| export
from fastai.tabular.all import *

import seaborn as sns

import tqdm

# Keep rendered DataFrames compact: at most 20 rows and 8 columns are shown.
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 8)

In [2]:
#| export
# Install fastkaggle on demand.
# This cell is exported by nbdev, so it must be plain Python: the IPython shell
# escape `!pip ...` is not valid syntax in the generated .py module. Running pip
# through `sys.executable -m pip` also guarantees the package lands in the
# interpreter that is executing this code.
try:
    import fastkaggle
except ModuleNotFoundError:
    import importlib
    import subprocess
    import sys

    subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-Uq', 'fastkaggle'])
    # Let the import system notice the package that was just installed.
    importlib.invalidate_caches()
    import fastkaggle

from fastkaggle import *

#### Downloading Datasets

In [3]:
#| export
# Identifier of the Kaggle competition whose data this notebook works with.
comp = 'amp-parkinsons-disease-progression-prediction'
# `path` is used below as the directory holding the competition CSVs.
# NOTE(review): setup_comp comes from fastkaggle; presumably it fetches the
# competition data when it is not already present and uses `install` to name
# packages to install in the Kaggle environment — confirm against the fastkaggle docs.
path = setup_comp(comp, install='fastai')

#### Create Dataframes

Training Data

In [4]:
#| export
def _read_comp_csv(csv_name):
    "Read one CSV file located directly under the competition data directory `path`."
    return pd.read_csv(path/csv_name)

# Peptide- and protein-level measurements plus the two clinical tables.
train_peptides = _read_comp_csv('train_peptides.csv')
train_proteins = _read_comp_csv('train_proteins.csv')
train_clinical_data = _read_comp_csv('train_clinical_data.csv')
supplemental_clinical_data = _read_comp_csv('supplemental_clinical_data.csv')

In [29]:
# Preview the first five peptide rows.
train_peptides.head(n=5)

Unnamed: 0,visit_id,visit_month,patient_id,UniProt,Peptide,PeptideAbundance
0,55_0,0,55,O00391,NEQEQPLGQWHLS,11254.3
1,55_0,0,55,O00533,GNPEPTFSWTK,102060.0
2,55_0,0,55,O00533,IEIPSSVQQVPTIIK,174185.0
3,55_0,0,55,O00533,KPQSAVYSTGSNGILLC(UniMod_4)EAEGEPQPTIK,27278.9
4,55_0,0,55,O00533,SMEQNGPGLEYR,30838.7


In [41]:
# Number of missing values in each column of the main clinical table.
train_clinical_data.isna().sum()

In [42]:
# Number of missing values in each column of the supplemental clinical table.
supplemental_clinical_data.isna().sum()

#### Fill Missing Values and Preprocess with TabularPandas

In [43]:
# Stack the main and supplemental clinical tables row-wise into one frame,
# discarding the original indices in favour of a fresh 0..n-1 index.
clinical_frames = [train_clinical_data, supplemental_clinical_data]
combined_clinical_data = pd.concat(clinical_frames, axis=0, ignore_index=True)

In [44]:
# Columns handed to TabularPandas as categorical features.
cat_names = [
    'patient_id',
    'upd23b_clinical_state_on_medication',
]
# Columns handed to TabularPandas as continuous features.
cont_names = [
    'visit_month',
    'updrs_1',
    'updrs_2',
    'updrs_3',
    'updrs_4',
]

In [45]:
# Preprocessing transforms passed to TabularPandas, in this order.
procs = [
    Categorify,
    FillMissing,
    Normalize,
]

In [46]:
# Wrap the combined clinical table in a TabularPandas object.
# No splits are given, so there is no separate validation subset.
to = TabularPandas(
    combined_clinical_data,
    procs=procs,
    cat_names=cat_names,
    cont_names=cont_names,
    splits=None,
)

In [51]:
# Pull the processed feature columns out of the TabularPandas object and give
# them a clean 0..n-1 index.
# NOTE(review): the procs appear to have been applied already when `to` was
# constructed; this cell only extracts their result — confirm.
processed_features = to.train.xs
preprocessed_data = processed_features.reset_index(drop=True)

In [53]:
# Preview the first five rows of the processed feature table.
preprocessed_data.head(n=5)


Unnamed: 0,patient_id,upd23b_clinical_state_on_medication,updrs_1_na,updrs_2_na,...,updrs_1,updrs_2,updrs_3,updrs_4
0,2,0,1,1,...,0.715326,-0.100438,-0.433622,-0.390889
1,2,0,1,1,...,0.715326,0.076453,0.285735,-0.390889
2,2,0,1,1,...,0.315193,0.607123,0.933157,-0.390889
3,2,2,1,1,...,0.315193,0.430233,0.645414,-0.390889
4,2,2,1,1,...,0.715326,0.607123,1.436707,-0.390889


In [None]:
# Total peptide abundance for every (visit_id, UniProt, visit_month) combination.
peptide_group_keys = ['visit_id', 'UniProt', 'visit_month']
peptide_agg = (
    train_peptides
    .groupby(peptide_group_keys, as_index=False)['PeptideAbundance']
    .sum()
)

In [None]:
# Attach the summed peptide abundance to each protein row. The left join keeps
# every protein row, including any without a matching peptide aggregate.
merged_protein_data = pd.merge(
    train_proteins,
    peptide_agg,
    how='left',
    on=['visit_id', 'UniProt', 'visit_month'],
)

In [None]:
# Add a column holding summed peptide abundance divided by the protein's NPX value.
abundance = merged_protein_data['PeptideAbundance']
npx = merged_protein_data['NPX']
merged_protein_data['Peptide_NPX_ratio'] = abundance.div(npx)

In [None]:
# Merge the protein-level data with the preprocessed clinical data.
#
# `preprocessed_data` only contains the TabularPandas feature columns, so it has
# no `visit_id` column, and its `visit_month` is one of the continuous columns
# run through Normalize, so it no longer equals the raw month on the protein
# side. Merging on ['visit_id', 'visit_month'] therefore cannot work. Instead,
# re-attach the raw `visit_id` from the clinical table and join on that key alone.
#
# NOTE(review): this relies on `preprocessed_data` having the same row order as
# `combined_clinical_data` (no splits, index reset) and on the clinical table
# having a `visit_id` column — the length check guards the positional assignment.
assert len(preprocessed_data) == len(combined_clinical_data), \
    'preprocessed_data and combined_clinical_data must have the same number of rows'
clinical_features = preprocessed_data.assign(
    visit_id=combined_clinical_data['visit_id'].to_numpy()
)
# Columns present on both sides (e.g. `visit_month`) keep their raw name on the
# protein side; the processed clinical versions get a `_proc` suffix.
merged_data = clinical_features.merge(
    merged_protein_data,
    on='visit_id',
    suffixes=('_proc', ''),
)

In [None]:
# Preview the merged data. A bare last expression gives the notebook's rich
# table rendering, matching the other preview cells, instead of print()'s plain text.
merged_data.head()


In [16]:
# Write the exported cells of this notebook out under the `app_v2` directory.
import nbdev.export

nbdev.export.nb_export('pb_parkinsons_progression_1.ipynb', 'app_v2')
print("export successful")

export successful


In [18]:
import re

# Copy the exported module to a "clean" file, dropping every line that starts
# with a notebook cell marker of the form
# `# %% ../pb_parkinsons_progression_1.ipynb <number>`.
cell_marker = re.compile(r'# %% ../pb_parkinsons_progression_1\.ipynb \d+')

# The encoding is given explicitly so the result does not depend on the
# platform's default locale encoding.
with open('./app_v2/app_v2.py', 'r', encoding='utf-8') as src:
    lines = src.readlines()

with open('./app_v2/app_v2_clean.py', 'w', encoding='utf-8') as dst:
    dst.writelines(line for line in lines if not cell_marker.match(line))