In [None]:
#|default_exp app_v2

In [None]:
#| export
from fastai.tabular.all import *

import seaborn as sns

import tqdm

# Keep rendered frames compact: at most 20 rows and 8 columns are shown
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 8)

In [None]:
#| export
# Install fastkaggle on demand, then pull its helpers into the namespace.
try:
    import fastkaggle
except ModuleNotFoundError:
    import subprocess
    import sys

    # `sys.executable -m pip` installs into the interpreter that is running this
    # code. Unlike the IPython-only `!pip ...` shell escape it is plain Python,
    # so this exported cell stays valid when it is written to a .py module.
    # Raises subprocess.CalledProcessError if the install fails.
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-Uq', 'fastkaggle'])

from fastkaggle import *

#### Downloading Datasets

In [None]:
#| export
# Competition slug; `path` is what setup_comp hands back and is the directory
# the CSV files are read from further down
comp = "amp-parkinsons-disease-progression-prediction"
path = setup_comp(comp, install="fastai")

#### Create Dataframes

Training Data

In [None]:
#| export
# Read the four competition CSVs, in this order, from the competition directory
(train_peptides,
 train_proteins,
 train_clinical_data,
 supplemental_clinical_data) = (
    pd.read_csv(path/csv_name)
    for csv_name in (
        'train_peptides.csv',
        'train_proteins.csv',
        'train_clinical_data.csv',
        'supplemental_clinical_data.csv',
    )
)

#### Fill Missing Values and Preprocess with TabularPandas

In [None]:
#| export
# Stack the supplemental clinical rows under the training clinical rows and
# renumber the index from 0
combined_clinical_data = pd.concat(
    objs=[train_clinical_data, supplemental_clinical_data],
    axis=0,
    ignore_index=True,
)

In [None]:
#| export
# Columns handed to TabularPandas as categorical
cat_names = ['patient_id', 'upd23b_clinical_state_on_medication']
# Columns handed to TabularPandas as continuous: visit_month plus updrs_1..updrs_4
cont_names = ['visit_month', *(f'updrs_{part}' for part in range(1, 5))]

In [None]:
#| export
# Define the preprocessing steps; this list is passed unchanged to TabularPandas.
# NOTE(review): presumably Categorify encodes the categorical columns, FillMissing
# imputes missing continuous values and Normalize standardises the continuous
# columns, applied in this order — confirm against the fastai docs
procs = [Categorify, FillMissing, Normalize]

In [None]:
#| export
# Wrap the combined clinical frame in a TabularPandas; no train/valid split is requested
to = TabularPandas(
    combined_clinical_data,
    procs=procs,
    cat_names=cat_names,
    cont_names=cont_names,
    splits=None,
)

In [None]:
#| export
# Take the feature columns (`xs`) of the training subset of `to` and give them
# a fresh 0..n-1 index (the previous index is discarded, not kept as a column).
# NOTE(review): the procs appear to be applied when `to` is constructed rather
# than on this line — confirm against the fastai docs
preprocessed_data = to.train.xs.reset_index(drop=True)

In [None]:
#| export
# Peek at the first five preprocessed rows
preprocessed_data.head(n=5)


#### Merge Peptide data with Protein data by Sum of Peptide Abundance

In [None]:
#| export
# Total PeptideAbundance per (visit_id, UniProt, visit_month); the group keys
# come back as ordinary columns
peptide_agg = (
    train_peptides
    .groupby(['visit_id', 'UniProt', 'visit_month'])
    .agg(PeptideAbundance=('PeptideAbundance', 'sum'))
    .reset_index()
)

In [None]:
#| export
# Left-join the per-protein peptide totals onto the protein rows, so every
# protein row is kept even when no peptide total matches
merged_protein_data = pd.merge(
    train_proteins,
    peptide_agg,
    how='left',
    on=['visit_id', 'UniProt', 'visit_month'],
)

In [None]:
# Peek at the first five rows of the protein/peptide join
merged_protein_data.head(n=5)

In [None]:
#| export
# Store summed peptide abundance divided by NPX as a new column
merged_protein_data['Peptide_NPX_ratio'] = merged_protein_data['PeptideAbundance'].div(
    merged_protein_data['NPX']
)

In [None]:
# Scatterplot of the peptide-abundance / NPX ratio against NPX.
# `Peptide_NPX_ratio` is already computed by the exported cell earlier in the
# notebook, so this cell only plots and does not mutate the frame again.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=merged_protein_data, x='NPX', y='Peptide_NPX_ratio', alpha=0.5, ax=ax)
ax.set(
    xlabel='NPX',
    ylabel='Peptide Abundance / NPX Ratio',
    title='Scatterplot of Peptide Abundance to NPX Ratio',
)
plt.show()

In [None]:
# Peek at the first five rows again, now that Peptide_NPX_ratio has been added
merged_protein_data.head(n=5)

In [None]:
#| export
# Merge the protein-level data with the preprocessed clinical data.
#
# `preprocessed_data` comes out of TabularPandas, where `patient_id` is a
# categorical column and `visit_month` a continuous one, so those two columns no
# longer hold the raw identifiers/months that `merged_protein_data` carries.
# Joining on them directly cannot line rows up correctly. The raw keys are
# therefore restored from `combined_clinical_data` and used for the join; the
# processed versions are kept under `patient_id_code` / `visit_month_scaled`.
#
# NOTE(review): this relies on `preprocessed_data` having the same row order as
# `combined_clinical_data` (TabularPandas was built with splits=None and both
# frames carry a fresh 0..n-1 index) — confirm against the fastai docs.
assert len(preprocessed_data) == len(combined_clinical_data), \
    "preprocessed_data and combined_clinical_data must be row-aligned"
merged_data = (
    preprocessed_data
    .rename(columns={'patient_id': 'patient_id_code', 'visit_month': 'visit_month_scaled'})
    .assign(
        patient_id=combined_clinical_data['patient_id'].to_numpy(),
        visit_month=combined_clinical_data['visit_month'].to_numpy(),
    )
    .merge(merged_protein_data, on=['patient_id', 'visit_month'])
)

In [None]:
# Display the merged frame; the pandas display options set near the top of the
# notebook limit how many rows and columns are rendered
merged_data

In [None]:
# Check the merged data: leaving the frame as the cell's last expression lets
# the notebook render it as a table instead of print()'s plain-text dump
merged_data.head()


In [2]:
from nbdev.export import nb_export

# Export this notebook with 'app_v2' as the export target, then confirm on stdout
nb_export('pb_parkinsons_progression_1.ipynb', 'app_v2')
print("export successful")

export successful


In [3]:
import re

# Lines that start with a "# %% ../<notebook> <number>" cell marker
cell_marker = re.compile(r'# %% ../pb_parkinsons_progression_1\.ipynb \d+')

# Read the exported module and keep every line that is not a cell marker
with open('./app_v2/app_v2.py', 'r') as exported:
    kept_rows = [row for row in exported if cell_marker.match(row) is None]

# Write the surviving lines, in their original order, to the cleaned copy
with open('./app_v2/app_v2_clean.py', 'w') as cleaned:
    cleaned.writelines(kept_rows)