In [1]:
#Install and load required packages
library(forecast)
library(caret)
library(dplyr)
library(tidyr)
library(lubridate)


Registered S3 method overwritten by 'quantmod':
  method            from
  as.zoo.data.frame zoo 

Loading required package: ggplot2

Loading required package: lattice


Attaching package: ‘caret’


The following object is masked from ‘package:httr’:

    progress



Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



Attaching package: ‘lubridate’


The following objects are masked from ‘package:base’:

    date, intersect, setdiff, union




In [2]:
#Load and preprocess the data
train_peptides <- read.csv("/kaggle/input/amp-parkinsons-disease-progression-prediction/train_peptides.csv")
train_proteins <- read.csv("/kaggle/input/amp-parkinsons-disease-progression-prediction/train_proteins.csv")
train_clinical_data <- read.csv("/kaggle/input/amp-parkinsons-disease-progression-prediction/train_clinical_data.csv")
supplemental_clinical_data <- read.csv("/kaggle/input/amp-parkinsons-disease-progression-prediction/supplemental_clinical_data.csv")


In [3]:

head(train_peptides)
head(train_proteins)
head(train_clinical_data)
head(supplemental_clinical_data)

Unnamed: 0_level_0,visit_id,visit_month,patient_id,UniProt,Peptide,PeptideAbundance
Unnamed: 0_level_1,<chr>,<int>,<int>,<chr>,<chr>,<dbl>
1,55_0,0,55,O00391,NEQEQPLGQWHLS,11254.3
2,55_0,0,55,O00533,GNPEPTFSWTK,102060.0
3,55_0,0,55,O00533,IEIPSSVQQVPTIIK,174185.0
4,55_0,0,55,O00533,KPQSAVYSTGSNGILLC(UniMod_4)EAEGEPQPTIK,27278.9
5,55_0,0,55,O00533,SMEQNGPGLEYR,30838.7
6,55_0,0,55,O00533,TLKIENVSYQDKGNYR,23216.5


Unnamed: 0_level_0,visit_id,visit_month,patient_id,UniProt,NPX
Unnamed: 0_level_1,<chr>,<int>,<int>,<chr>,<dbl>
1,55_0,0,55,O00391,11254.3
2,55_0,0,55,O00533,732430.0
3,55_0,0,55,O00584,39585.8
4,55_0,0,55,O14498,41526.9
5,55_0,0,55,O14773,31238.0
6,55_0,0,55,O14791,4202.71


Unnamed: 0_level_0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication
Unnamed: 0_level_1,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<chr>
1,55_0,55,0,10,6,15,,
2,55_3,55,3,10,7,25,,
3,55_6,55,6,8,10,34,,
4,55_9,55,9,8,9,30,0.0,On
5,55_12,55,12,10,10,41,0.0,On
6,55_18,55,18,7,13,38,0.0,On


Unnamed: 0_level_0,visit_id,patient_id,visit_month,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication
Unnamed: 0_level_1,<chr>,<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
1,35_0,35,0,5.0,3.0,16,0.0,
2,35_36,35,36,6.0,4.0,20,0.0,
3,75_0,75,0,4.0,6.0,26,0.0,
4,75_36,75,36,1.0,8.0,38,0.0,On
5,155_0,155,0,,,0,,
6,337_0,337,0,5.0,7.0,6,0.0,On


In [4]:
# Merge train_peptides and train_proteins datasets
train_data <- train_peptides %>%
  inner_join(train_proteins, by = c("visit_id", "visit_month", "patient_id", "UniProt"))

print(colSums(is.na(train_data)))
class(train_data)

        visit_id      visit_month       patient_id          UniProt 
               0                0                0                0 
         Peptide PeptideAbundance              NPX 
               0                0                0 


In [5]:

# Merge train_data with train_clinical_data
train_data <- train_data %>%
  inner_join(train_clinical_data, by = c("visit_id", "visit_month", "patient_id"))
print(colSums(is.na(train_data)))
View(train_data)


                           visit_id                         visit_month 
                                  0                                   0 
                         patient_id                             UniProt 
                                  0                                   0 
                            Peptide                    PeptideAbundance 
                                  0                                   0 
                                NPX                             updrs_1 
                                  0                                   0 
                            updrs_2                             updrs_3 
                                  0                                9120 
                            updrs_4 upd23b_clinical_state_on_medication 
                             446214                                   0 


visit_id,visit_month,patient_id,UniProt,Peptide,PeptideAbundance,NPX,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication
<chr>,<int>,<int>,<chr>,<chr>,<dbl>,<dbl>,<int>,<int>,<int>,<int>,<chr>
55_0,0,55,O00391,NEQEQPLGQWHLS,11254.30,11254.30,10,6,15,,
55_0,0,55,O00533,GNPEPTFSWTK,102060.00,732430.00,10,6,15,,
55_0,0,55,O00533,IEIPSSVQQVPTIIK,174185.00,732430.00,10,6,15,,
55_0,0,55,O00533,KPQSAVYSTGSNGILLC(UniMod_4)EAEGEPQPTIK,27278.90,732430.00,10,6,15,,
55_0,0,55,O00533,SMEQNGPGLEYR,30838.70,732430.00,10,6,15,,
55_0,0,55,O00533,TLKIENVSYQDKGNYR,23216.50,732430.00,10,6,15,,
55_0,0,55,O00533,VIAVNEVGR,170878.00,732430.00,10,6,15,,
55_0,0,55,O00533,VMTPAVYAPYDVK,148771.00,732430.00,10,6,15,,
55_0,0,55,O00533,VNGSPVDNHPFAGDVVFPR,55202.10,732430.00,10,6,15,,
55_0,0,55,O00584,ELDLNSVLLK,27229.30,39585.80,10,6,15,,


In [6]:
# Calculate the sum of peptide abundance per patient visit
train_data <- train_data %>%
  group_by(visit_id, visit_month, patient_id) %>%
  mutate(peptide_abundance_sum = sum(PeptideAbundance))
head(train_data)

visit_id,visit_month,patient_id,UniProt,Peptide,PeptideAbundance,NPX,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication,peptide_abundance_sum
<chr>,<int>,<int>,<chr>,<chr>,<dbl>,<dbl>,<int>,<int>,<int>,<int>,<chr>,<dbl>
55_0,0,55,O00391,NEQEQPLGQWHLS,11254.3,11254.3,10,6,15,,,696531287
55_0,0,55,O00533,GNPEPTFSWTK,102060.0,732430.0,10,6,15,,,696531287
55_0,0,55,O00533,IEIPSSVQQVPTIIK,174185.0,732430.0,10,6,15,,,696531287
55_0,0,55,O00533,KPQSAVYSTGSNGILLC(UniMod_4)EAEGEPQPTIK,27278.9,732430.0,10,6,15,,,696531287
55_0,0,55,O00533,SMEQNGPGLEYR,30838.7,732430.0,10,6,15,,,696531287
55_0,0,55,O00533,TLKIENVSYQDKGNYR,23216.5,732430.0,10,6,15,,,696531287


In [7]:
# Calculate the mean protein expression per patient visit
train_data <- train_data %>%
  group_by(visit_id, visit_month, patient_id) %>%
  mutate(mean_protein_expression = mean(NPX))
View(train_data)

visit_id,visit_month,patient_id,UniProt,Peptide,PeptideAbundance,NPX,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication,peptide_abundance_sum,mean_protein_expression
<chr>,<int>,<int>,<chr>,<chr>,<dbl>,<dbl>,<int>,<int>,<int>,<int>,<chr>,<dbl>,<dbl>
55_0,0,55,O00391,NEQEQPLGQWHLS,11254.30,11254.30,10,6,15,,,696531287,20287892
55_0,0,55,O00533,GNPEPTFSWTK,102060.00,732430.00,10,6,15,,,696531287,20287892
55_0,0,55,O00533,IEIPSSVQQVPTIIK,174185.00,732430.00,10,6,15,,,696531287,20287892
55_0,0,55,O00533,KPQSAVYSTGSNGILLC(UniMod_4)EAEGEPQPTIK,27278.90,732430.00,10,6,15,,,696531287,20287892
55_0,0,55,O00533,SMEQNGPGLEYR,30838.70,732430.00,10,6,15,,,696531287,20287892
55_0,0,55,O00533,TLKIENVSYQDKGNYR,23216.50,732430.00,10,6,15,,,696531287,20287892
55_0,0,55,O00533,VIAVNEVGR,170878.00,732430.00,10,6,15,,,696531287,20287892
55_0,0,55,O00533,VMTPAVYAPYDVK,148771.00,732430.00,10,6,15,,,696531287,20287892
55_0,0,55,O00533,VNGSPVDNHPFAGDVVFPR,55202.10,732430.00,10,6,15,,,696531287,20287892
55_0,0,55,O00584,ELDLNSVLLK,27229.30,39585.80,10,6,15,,,696531287,20287892


In [8]:
# Check for missing (NULL or NA) values in the dataset
missing_values <- colSums(is.na(train_data))
print(colSums(is.na(train_data)))


                           visit_id                         visit_month 
                                  0                                   0 
                         patient_id                             UniProt 
                                  0                                   0 
                            Peptide                    PeptideAbundance 
                                  0                                   0 
                                NPX                             updrs_1 
                                  0                                   0 
                            updrs_2                             updrs_3 
                                  0                                9120 
                            updrs_4 upd23b_clinical_state_on_medication 
                             446214                                   0 
              peptide_abundance_sum             mean_protein_expression 
                                  0                

In [9]:
#4 Handling missing values
train_data$updrs_3 <- ifelse(is.na(train_data$updrs_3), mean(train_data$updrs_3, na.rm = TRUE), train_data$updrs_3)
train_data$updrs_4 <- ifelse(is.na(train_data$updrs_4), mean(train_data$updrs_4, na.rm = TRUE), train_data$updrs_4)
missing_values <- colSums(is.na(train_data))
print(missing_values)

                           visit_id                         visit_month 
                                  0                                   0 
                         patient_id                             UniProt 
                                  0                                   0 
                            Peptide                    PeptideAbundance 
                                  0                                   0 
                                NPX                             updrs_1 
                                  0                                   0 
                            updrs_2                             updrs_3 
                                  0                                   0 
                            updrs_4 upd23b_clinical_state_on_medication 
                                  0                                   0 
              peptide_abundance_sum             mean_protein_expression 
                                  0                

In [10]:
#5 Split the data into training and testing sets
# Create a list to store data partitions for each updrs_ column
data_partitions <- lapply(c("updrs_1", "updrs_2", "updrs_3", "updrs_4"), function(updrs_col) {
  set.seed(123)
  createDataPartition(train_data[[updrs_col]], p = 0.8, list = FALSE)
})

In [11]:
# Split the data for each updrs_ column
train_sets <- lapply(data_partitions, function(trainIndex) train_data[trainIndex,])
test_sets <- lapply(data_partitions, function(trainIndex) train_data[-trainIndex,])
test_sets

visit_id,visit_month,patient_id,UniProt,Peptide,PeptideAbundance,NPX,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication,peptide_abundance_sum,mean_protein_expression
<chr>,<int>,<int>,<chr>,<chr>,<dbl>,<dbl>,<int>,<int>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>
55_0,0,55,O00533,IEIPSSVQQVPTIIK,174185.0,732430.0,10,6,15,1.887924,,696531287,20287892
55_0,0,55,O00533,VIAVNEVGR,170878.0,732430.0,10,6,15,1.887924,,696531287,20287892
55_0,0,55,O00533,VMTPAVYAPYDVK,148771.0,732430.0,10,6,15,1.887924,,696531287,20287892
55_0,0,55,O00533,VNGSPVDNHPFAGDVVFPR,55202.1,732430.0,10,6,15,1.887924,,696531287,20287892
55_0,0,55,O14773,LFGGNFAHQASVAR,24884.4,31238.0,10,6,15,1.887924,,696531287,20287892
55_0,0,55,O15394,NIINSDGGPYVC(UniMod_4)R,23209.4,62898.2,10,6,15,1.887924,,696531287,20287892
55_0,0,55,O43505,TALASGGVLDASGDYR,333376.0,333376.0,10,6,15,1.887924,,696531287,20287892
55_0,0,55,P00450,GAYPLSIEPIGVR,308396.0,1181230.0,10,6,15,1.887924,,696531287,20287892
55_0,0,55,P00450,SVPPSASHVAPTETFTYEWTVPK,63714.5,1181230.0,10,6,15,1.887924,,696531287,20287892
55_0,0,55,P00734,ELLESYIDGR,42128.5,688909.0,10,6,15,1.887924,,696531287,20287892

visit_id,visit_month,patient_id,UniProt,Peptide,PeptideAbundance,NPX,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication,peptide_abundance_sum,mean_protein_expression
<chr>,<int>,<int>,<chr>,<chr>,<dbl>,<dbl>,<int>,<int>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>
55_0,0,55,O00533,KPQSAVYSTGSNGILLC(UniMod_4)EAEGEPQPTIK,27278.90,732430.0,10,6,15,1.887924,,696531287,20287892
55_0,0,55,O00533,SMEQNGPGLEYR,30838.70,732430.0,10,6,15,1.887924,,696531287,20287892
55_0,0,55,O14498,ALPGTPVASSQPR,41526.90,41526.9,10,6,15,1.887924,,696531287,20287892
55_0,0,55,O14773,LYQQHGAGLFDVTR,6353.65,31238.0,10,6,15,1.887924,,696531287,20287892
55_0,0,55,O15240,AYQGVAAPFPK,107076.00,177775.0,10,6,15,1.887924,,696531287,20287892
55_0,0,55,O15240,QQETAAAETETR,3095.35,177775.0,10,6,15,1.887924,,696531287,20287892
55_0,0,55,O60888,TQSSLVPALTDFVR,166850.00,166850.0,10,6,15,1.887924,,696531287,20287892
55_0,0,55,O94919,QALNTDYLDSDYQR,23463.50,67567.6,10,6,15,1.887924,,696531287,20287892
55_0,0,55,P00441,TLVVHEKADDLGKGGNEESTK,24728.60,64117.8,10,6,15,1.887924,,696531287,20287892
55_0,0,55,P00450,NNEGTYYSPNYNPQSR,51048.30,1181230.0,10,6,15,1.887924,,696531287,20287892

visit_id,visit_month,patient_id,UniProt,Peptide,PeptideAbundance,NPX,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication,peptide_abundance_sum,mean_protein_expression
<chr>,<int>,<int>,<chr>,<chr>,<dbl>,<dbl>,<int>,<int>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>
55_0,0,55,O00533,GNPEPTFSWTK,102060.00,732430.0,10,6,15,1.887924,,696531287,20287892
55_0,0,55,O00533,VIAVNEVGR,170878.00,732430.0,10,6,15,1.887924,,696531287,20287892
55_0,0,55,O00533,VMTPAVYAPYDVK,148771.00,732430.0,10,6,15,1.887924,,696531287,20287892
55_0,0,55,O14498,ALPGTPVASSQPR,41526.90,41526.9,10,6,15,1.887924,,696531287,20287892
55_0,0,55,P00441,ADDLGKGGNEESTKTGNAGSR,39389.30,64117.8,10,6,15,1.887924,,696531287,20287892
55_0,0,55,P00450,QYTDSTFRVPVER,30828.60,1181230.0,10,6,15,1.887924,,696531287,20287892
55_0,0,55,P00734,LAVTTHGLPC(UniMod_4)LAWASAQAK,199779.00,688909.0,10,6,15,1.887924,,696531287,20287892
55_0,0,55,P00734,SGIEC(UniMod_4)QLWR,90420.10,688909.0,10,6,15,1.887924,,696531287,20287892
55_0,0,55,P00736,MGNFPWQVFTNIHGR,62060.40,109541.0,10,6,15,1.887924,,696531287,20287892
55_0,0,55,P00738,AVGDKLPEC(UniMod_4)EADDGC(UniMod_4)PKPPEIAHGYVEHSVR,44946.40,3956470.0,10,6,15,1.887924,,696531287,20287892

visit_id,visit_month,patient_id,UniProt,Peptide,PeptideAbundance,NPX,updrs_1,updrs_2,updrs_3,updrs_4,upd23b_clinical_state_on_medication,peptide_abundance_sum,mean_protein_expression
<chr>,<int>,<int>,<chr>,<chr>,<dbl>,<dbl>,<int>,<int>,<dbl>,<dbl>,<chr>,<dbl>,<dbl>
55_0,0,55,O00533,SMEQNGPGLEYR,30838.7,732430.0,10,6,15,1.887924,,696531287,20287892
55_0,0,55,O00533,VIAVNEVGR,170878.0,732430.0,10,6,15,1.887924,,696531287,20287892
55_0,0,55,O00533,VMTPAVYAPYDVK,148771.0,732430.0,10,6,15,1.887924,,696531287,20287892
55_0,0,55,O00584,HGTC(UniMod_4)AAQVDALNSQKK,12356.5,39585.8,10,6,15,1.887924,,696531287,20287892
55_0,0,55,O14773,LFGGNFAHQASVAR,24884.4,31238.0,10,6,15,1.887924,,696531287,20287892
55_0,0,55,O60888,TQSSLVPALTDFVR,166850.0,166850.0,10,6,15,1.887924,,696531287,20287892
55_0,0,55,O75144,GLYDVVSVLR,44662.4,98746.3,10,6,15,1.887924,,696531287,20287892
55_0,0,55,O94919,QALNTDYLDSDYQR,23463.5,67567.6,10,6,15,1.887924,,696531287,20287892
55_0,0,55,P00450,MYYSAVDPTKDIFTGLIGPM(UniMod_35)K,18890.3,1181230.0,10,6,15,1.887924,,696531287,20287892
55_0,0,55,P00450,NNEGTYYSPNYNPQSR,51048.3,1181230.0,10,6,15,1.887924,,696531287,20287892


In [12]:

#6 Train the model
predict_updrs <- function(train_set, test_set, updrs_type) {
  
  predictions <- data.frame()
  
  # Add browser()
  #browser()
  
  # Loop through unique patients
  for (patient in unique(test_set$patient_id)) {
    # Filter data for the specific patient
    train_patient <- train_set[train_set$patient_id == patient,]
    test_patient <- test_set[test_set$patient_id == patient,]
    
    # Create a time series object for the specific updrs_type and patient
    updrs_ts <- ts(train_patient[, updrs_type], start = 0, frequency = 12)
    
    # Fit the best ARIMA model
    arima_model <- auto.arima(updrs_ts)
    
    # Forecast the next 24 months
    forecasted_values <- forecast(arima_model, h = 24)
    
    # Calculate the rating vector
    rating_vector <- forecasted_values$mean[seq(1, 24, 6)]
    
    
    # Prepare the submission format
    patient_predictions <- data.frame(
      prediction_id = paste(patient, updrs_type, "plus", seq(6, 24, 6), "months", sep = "_"),
      rating = rating_vector,
      group_key = rep(0, length(rating_vector)),
      stringsAsFactors = FALSE,
      row.names = NULL
    )
    
    # Add patient predictions to the overall predictions using bind_rows
    predictions <- bind_rows(predictions, patient_predictions)
  }
  
  return(predictions)
}



In [13]:


#7 make predictions
# Make predictions for all updrs types and combine them
updrs_1_predictions <- predict_updrs(train_sets[[1]], test_sets[[1]], "updrs_1")
updrs_2_predictions <- predict_updrs(train_sets[[2]], test_sets[[2]], "updrs_2")
updrs_3_predictions <- predict_updrs(train_sets[[3]], test_sets[[3]], "updrs_3")
updrs_4_predictions <- predict_updrs(train_sets[[4]], test_sets[[4]], "updrs_4")

In [14]:
# Combine all predictions
all_predictions <- rbind(updrs_1_predictions, updrs_2_predictions, updrs_3_predictions, updrs_4_predictions)

In [15]:
#8 Save the predictions to a CSV file
write.csv(all_predictions, "submission_v2.7.csv", row.names = FALSE)