## Libs

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(palette="Set2", style='whitegrid', font_scale=1.3)

import scipy.stats as sps
from scipy.optimize import minimize

from functools import partial

import warnings
warnings.filterwarnings('ignore')

In [None]:
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor


## Metric

In [None]:
def smape_plus_1(y_true, y_pred):
    """Return SMAPE (in percent) computed on values shifted by +1.

    Both inputs are shifted by one before the symmetric percentage error is
    taken. Positions where both shifted values are exactly zero contribute 0
    to the average; positions that evaluate to NaN are skipped by the mean.

    Parameters
    ----------
    y_true, y_pred : array-like supporting ``+ 1``, ``len`` and boolean-mask
        indexing (the notebook passes numpy arrays).

    Returns
    -------
    float
        ``100 * nanmean(|t - p| / ((|t| + |p|) / 2))`` over the shifted values.
    """
    shifted_true = y_true + 1
    shifted_pred = y_pred + 1

    abs_error = np.abs(shifted_true - shifted_pred)
    half_scale = (np.abs(shifted_true) + np.abs(shifted_pred)) / 2

    # Only divide where at least one shifted value is non-zero; the remaining
    # positions keep their initial score of 0.
    has_signal = (shifted_true != 0) | (shifted_pred != 0)

    per_sample = np.zeros(len(shifted_true))
    per_sample[has_signal] = abs_error[has_signal] / half_scale[has_signal]

    return 100 * np.nanmean(per_sample)

## Data import

In [None]:
# Read the four competition CSV files from the local ./data directory.
# Clinical records for the training patients.
df_cl = pd.read_csv('./data/train_clinical_data.csv')
# Peptide-level and protein-level measurement tables.
df_pep = pd.read_csv('./data/train_peptides.csv')
df_prot = pd.read_csv('./data/train_proteins.csv')
# Supplemental clinical records; later concatenated with df_cl.
df_sup_cl = pd.read_csv('./data/supplemental_clinical_data.csv')

In [None]:
# df_cl= pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/train_clinical_data.csv')


# df_pep = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/train_peptides.csv')
# df_prot = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/train_proteins.csv')

# df_sup_cl = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/supplemental_clinical_data.csv')


## Main Functionality

### Data preparation

In [None]:
def generate_targetset_cols(data):
    """Build a wide target table with one column group per prediction horizon.

    For every visit row, the UPDRS scores recorded for the same patient 6, 12
    and 24 months later are attached as ``updrs_{i}_plus_{h}`` columns,
    together with the month they were recorded in (``pred_month_plus_{h}``).
    The visit's own scores and month become the ``*_plus_0`` columns.
    Horizons with no matching later visit are left as NaN by the left merge.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``patient_id``, ``visit_month`` and ``updrs_1`` ..
        ``updrs_4``. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the original columns (with the UPDRS columns
        renamed to ``*_plus_0``) and the shifted target columns.
    """
    # Work on a copy so the caller's frame does not gain a 'pred_month' column.
    data = data.copy()
    data['pred_month'] = data['visit_month']

    score_columns = [f'updrs_{i}' for i in range(1, 5)]
    for plus_month in [6, 12, 24]:
        train_shift = data[['patient_id', 'visit_month', 'pred_month'] + score_columns].copy()
        # Moving a visit back by `plus_month` makes it join onto the visit
        # that happened `plus_month` months earlier for the same patient.
        train_shift['visit_month'] -= plus_month
        renames = {col: f'{col}_plus_{plus_month}' for col in score_columns}
        renames['pred_month'] = f'pred_month_plus_{plus_month}'
        train_shift = train_shift.rename(columns=renames)
        data = data.merge(train_shift, how='left', on=['patient_id', 'visit_month'])

    base_renames = {col: f'{col}_plus_0' for col in score_columns}
    base_renames['pred_month'] = 'pred_month_plus_0'
    return data.rename(columns=base_renames)
    
def generate_targetset_rows(data):
    """Build a long target table with one row per (visit, horizon) pair.

    Each input row is kept as a horizon-0 row and additionally re-emitted for
    horizons 6, 12 and 24 with ``visit_month`` moved back by the horizon, so
    the row's scores become the target "``plus_month`` months after" the
    earlier visit. Re-emitted rows whose shifted ``visit_month`` is negative
    are discarded.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``visit_id``, ``patient_id`` and ``visit_month``.
        The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Rows sorted by ``patient_id`` and ``visit_month`` and indexed by
        ``visit_id`` (rebuilt as ``"{patient_id}_{visit_month}"`` for shifted
        rows), with added ``pred_month`` (month the scores were recorded) and
        ``plus_month`` (horizon) columns.
    """
    # Copy first so the caller's frame does not gain helper columns.
    base = data.copy()
    base['pred_month'] = base['visit_month']
    base['plus_month'] = 0

    frames = [base]
    for plus_month in [6, 12, 24]:
        shifted = base.copy()
        shifted['visit_month'] -= plus_month
        shifted['plus_month'] += plus_month
        shifted['visit_id'] = shifted['patient_id'].astype('str') + '_' + shifted['visit_month'].astype('str')
        # Filter with a boolean mask rather than drop(index=...): dropping by
        # label also removes every other row sharing that label, which loses
        # valid rows when the input index contains duplicates (e.g. after a
        # pd.concat without ignore_index).
        shifted = shifted[~(shifted['visit_month'] < 0)]
        frames.append(shifted)

    output = pd.concat(frames)
    output = output.sort_values(by=['patient_id', 'visit_month'])
    return output.set_index(['visit_id'])

In [None]:
def build_features(peptides, proteins, cols):
    """Pivot peptide/protein measurements into one feature row per visit.

    The two tables are joined on visit, patient and ``UniProt``, then pivoted
    so that every (value, Peptide, UniProt) combination becomes a column.
    The result is aligned to ``cols``: columns not listed in ``cols`` are
    dropped and listed columns that are absent are added filled with zeros.
    Missing measurements are also filled with zeros.

    Parameters
    ----------
    peptides : pandas.DataFrame
        Needs ``visit_id``, ``visit_month``, ``patient_id``, ``UniProt``,
        ``Peptide`` and ``PeptideAbundance``.
    proteins : pandas.DataFrame
        Needs ``visit_id``, ``visit_month``, ``patient_id``, ``UniProt`` and
        ``NPX``.
    cols : iterable of column labels
        Feature columns expected by the trained models, given as
        (value, Peptide, UniProt) tuples.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``visit_id`` with columns in sorted order.
    """
    # Imported locally so the function does not rely on a later notebook cell
    # having brought Counter into scope.
    from collections import Counter

    merged = pd.merge(peptides, proteins, on=['visit_id', 'visit_month', 'patient_id', 'UniProt'])
    t_p = merged.pivot(index='visit_id', columns=['Peptide', 'UniProt'], values=['PeptideAbundance', 'NPX'])
    t_p = t_p.fillna(0)

    # Columns present in this batch but not expected by the models.
    unexpected = list((Counter(t_p.columns) - Counter(cols)).elements())
    t_p = t_p.drop(columns=unexpected)

    # Columns the models expect but this batch does not contain.
    missing = list((Counter(cols) - Counter(t_p.columns)).elements())
    t_p[missing] = np.zeros((t_p.shape[0], len(missing)))

    return t_p[sorted(t_p.columns)]

### Trends

In [None]:
def calculate_trend(pred_month, trend, target):
    """Evaluate a fitted polynomial trend at the given months and round it.

    Parameters
    ----------
    pred_month : array-like with a ``.clip`` method (numpy array or Series)
        Months at which to evaluate the trend.
    trend : sequence of float
        Two coefficients give ``c0 + c1 * m``; any other length is treated as
        quadratic, ``c0 + c1 * m + c2 * m**2``.
    target : str
        Target name; for ``'updrs_4'`` months below 54 are raised to 54 before
        evaluation.

    Returns
    -------
    Same container type as ``pred_month`` with values rounded by ``np.round``.
    """
    months = pred_month.clip(54, None) if target == 'updrs_4' else pred_month

    fitted = trend[0] + months * trend[1]
    if len(trend) != 2:
        # More than the two linear coefficients: add the quadratic term.
        fitted = fitted + np.square(months) * trend[2]
    return np.round(fitted)

def function_to_minimize(x, y_true_array, pred_month_array, target):
    """Objective for the trend fit: SMAPE+1 of the trend defined by ``x``.

    Parameters
    ----------
    x : sequence of float
        Candidate trend coefficients, passed to ``calculate_trend`` as ``trend``.
    y_true_array : array-like
        Observed scores.
    pred_month_array : array-like
        Months matching ``y_true_array`` element by element.
    target : str
        Target name forwarded to ``calculate_trend``.

    Returns
    -------
    float
        Value of ``smape_plus_1`` between the observations and the trend.
    """
    trend_prediction = calculate_trend(
        pred_month=pred_month_array,
        trend=x,
        target=target,
    )
    return smape_plus_1(y_true=y_true_array, y_pred=trend_prediction)


def find_trend(data):
    """Fit one trend per UPDRS target by minimising SMAPE+1 with Powell's method.

    Parameters
    ----------
    data : pandas.DataFrame
        Wide target table containing ``updrs_{i}_plus_{h}`` and
        ``pred_month_plus_{h}`` columns for ``i`` in 1..4 and ``h`` in
        0, 6, 12, 24.

    Returns
    -------
    dict
        Maps ``'updrs_1'`` .. ``'updrs_4'`` to the list of fitted coefficients.
    """
    horizons = [0, 6, 12, 24]
    month_columns = [f'pred_month_plus_{h}' for h in horizons]

    target_to_trend = {}
    for target in (f'updrs_{i}' for i in range(1, 5)):
        score_columns = [f'{target}_plus_{h}' for h in horizons]
        objective = partial(
            function_to_minimize,
            y_true_array=data[score_columns].values.ravel(),
            pred_month_array=data[month_columns].values.ravel(),
            target=target,
        )

        # The length of the starting point selects the trend shape:
        # two coefficients -> linear, three coefficients -> quadratic.
        if target != 'updrs_3':
            start = [0, 0.0048]
        else:
            start = [0, 0, 0]

        fit = minimize(fun=objective, x0=start, method='Powell')
        target_to_trend[target] = list(fit.x)
    return target_to_trend

### Models

In [None]:
def train_models(models, features, target):
    """Fit the estimator registered for each target column.

    Parameters
    ----------
    models : dict
        Maps column names of ``target`` to unfitted estimators exposing
        ``fit(X, y)``. Updated in place with whatever ``fit`` returns.
    features : feature matrix passed as ``X`` to every estimator.
    target : pandas.DataFrame
        One column per estimator to fit.

    Returns
    -------
    dict
        The same ``models`` object.

    Note: estimators and hyper-parameters are chosen by the caller; they may
    become arguments here once good hyper-parameters are found.
    """
    for name in target.columns:
        estimator = models[name]
        models[name] = estimator.fit(features, target[name])
    return models

### Prediction 

In [None]:
def calculate_predictions(features, target_df, target, basis_id, pp_id, basis_trend, model):
    """Fill the ``rating`` column of ``target_df`` for one target and return it.

    Rows selected by ``basis_id`` receive the bare trend value from
    ``calculate_trend``. Rows selected by ``pp_id`` receive the trend value
    plus ``model[target].predict(features)``; for targets other than
    ``updrs_1`` and ``updrs_3`` that sum is additionally passed through
    ``np.round``.

    Parameters
    ----------
    features : feature matrix handed to ``model[target].predict``.
    target_df : pandas.DataFrame
        Must contain a ``pred_month`` column; its ``rating`` column is
        written in place.
    target : str
        Key into ``basis_trend`` and ``model``.
    basis_id : index labels of rows predicted from the trend alone.
    pp_id : index labels of rows predicted from trend plus model output.
    basis_trend : dict mapping target name to trend coefficients.
    model : dict mapping target name to a fitted estimator.

    Returns
    -------
    pandas.Series
        ``target_df.rating`` after both groups of rows have been written.

    NOTE(review): the model output is added to the trend by position, so the
    rows of ``features`` presumably have to be in the same order as
    ``target_df.loc[pp_id]`` -- confirm against the caller.
    """
    # Trend-only prediction for the rows listed in basis_id.
    target_df.loc[basis_id, 'rating'] = calculate_trend(pred_month=target_df.loc[basis_id,'pred_month'],
                                                                                    trend=basis_trend[target],
                                                                                    target=target)
    # Trend plus model output for the rows listed in pp_id; only the
    # non-updrs_1/updrs_3 branch rounds the combined value.
    if target in ['updrs_1', 'updrs_3']:
        target_df.loc[pp_id, 'rating'] = calculate_trend(pred_month=target_df.loc[pp_id,'pred_month'],
                                                                                trend=basis_trend[target],
                                                                                target=target) + model[target].predict(features)
    else:
        target_df.loc[pp_id, 'rating'] = np.round(calculate_trend(pred_month=target_df.loc[pp_id,'pred_month'],
                                                                                trend=basis_trend[target],
                                                                                target=target) + model[target].predict(features))
    return target_df.rating

## Training stage

### Data preparation

In [None]:
# Stack the main and supplemental clinical tables. ignore_index=True gives
# every row a unique label, so label-based row operations downstream cannot
# hit rows from both source tables at once.
df_all_cl = pd.concat([df_cl, df_sup_cl], ignore_index=True)
# Exclude visits recorded at months 3, 5 and 9. The explicit copy makes the
# result an independent frame, so adding columns to it later is not a write
# into a slice of the concatenated table.
df_all_cl = df_all_cl[~df_all_cl.visit_month.isin([3, 5, 9])].copy()

### Trends finding

In [None]:
# Wide table: one row per visit with the scores at horizons 0/6/12/24 as columns.
df_basis_trend = generate_targetset_cols(df_all_cl)

# Per-target trend coefficients fitted on that table (dict keyed by 'updrs_1'..'updrs_4').
basis_trend = find_trend(df_basis_trend)

In [None]:
# Join protein and peptide measurements, then pivot them so that each
# (value, Peptide, UniProt) combination becomes one column per visit_id.
merged_pep_prot =  pd.merge(df_prot, df_pep, on = ['visit_id', 'visit_month', 'patient_id', 'UniProt'])
pivoted_pep_prot = merged_pep_prot.pivot(index = 'visit_id', columns = ['Peptide', 'UniProt'], values = ['PeptideAbundance', 'NPX'])
# Measurements missing for a visit are encoded as 0.
pivoted_pep_prot.fillna(0, inplace=True)

# Long target table (one row per visit and horizon), inner-joined with the
# measurement features on visit_id; feature columns are put in sorted order.
target = generate_targetset_rows(df_all_cl)
df_train = pd.merge(target, pivoted_pep_prot[sorted(pivoted_pep_prot.columns)], on = 'visit_id', how = 'inner')

In [None]:
# Features: everything except identifiers, the medication flag and the raw scores.
X = df_train.drop(columns=['patient_id', 'upd23b_clinical_state_on_medication', 'updrs_1', 'updrs_2', 'updrs_3', 'updrs_4'])
# Targets plus the prediction month needed for de-trending. Taken as an
# explicit copy because later cells modify `y` in place; this keeps those
# writes off a selection of df_train.
y = df_train[['pred_month','updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']].copy()

In [None]:
# Subtract the fitted trend from every target so the models learn residuals.
# Re-running this cell subtracts the trend a second time.
for i in range(1, 5):
    t = f'updrs_{i}'
    y[t] -= calculate_trend(y.pred_month, basis_trend[t], t)


In [None]:
# pred_month was only needed for de-trending; remove it from the targets.
# Not idempotent: running this cell twice raises because the column is gone.
y.drop(columns=['pred_month'], inplace=True)

In [None]:
# Display the fitted trend coefficients per target for inspection.
basis_trend

In [None]:
# One regressor per target, keyed by the target column name so that
# train_models can look each one up.
models = {
    'updrs_1' : xgb.XGBRegressor(eta=0.05, max_depth=7, n_estimators=106),
    # random_state pins the forest's bootstrap and feature sampling so that
    # repeated runs of the notebook train the same model.
    'updrs_2' : RandomForestRegressor(max_depth=36, n_estimators=660, random_state=42),
    'updrs_3' : xgb.XGBRegressor(eta=0.05, max_depth=7, n_estimators=106),
    'updrs_4' : xgb.XGBRegressor(eta=0.05, max_depth=7, n_estimators=106)
}

In [None]:
# Fit every regressor on the de-trended targets. Despite the name, the
# returned dict is the `models` dict itself, including the random forest.
xgb_models = train_models(models, X, y)

## Submission

In [None]:
import amp_pd_peptide
from collections import Counter

In [None]:
# amp_pd_peptide.make_env.func_dict['__called__'] = False
env = amp_pd_peptide.make_env()   # initialize the environment
iter_test = env.iter_test()    # an iterator which loops over the test files

# The API will deliver four dataframes in this specific order:
for test_clinical_data, test_peptides, test_proteins, sample_submission in iter_test:
    # Split prediction_id on '_' : fields 0 and 1 give patient and visit month,
    # field 3 the UPDRS part number and field 5 the horizon in months.
    sample_submission['visit_id'] = sample_submission['prediction_id'].map(lambda x: x.split('_')[0] + '_' + x.split('_')[1])
    sample_submission['patient_id'] = sample_submission['prediction_id'].map(lambda x: int(x.split('_')[0]))
    sample_submission['visit_month'] = sample_submission['prediction_id'].map(lambda x: int(x.split('_')[1]))
    sample_submission['target_name'] = sample_submission['prediction_id'].map(lambda x: 'updrs_' + x.split('_')[3])
    sample_submission['plus_month'] = sample_submission['prediction_id'].map(lambda x: int(x.split('_')[5]))
    sample_submission['pred_month'] = sample_submission['visit_month'] + sample_submission['plus_month']
    sample_submission.set_index('visit_id', inplace=True)

    # The first three columns of X are visit_month, pred_month and plus_month;
    # the remaining ones are the measurement features to rebuild for this batch.
    prot_pep_features = build_features(test_peptides, test_proteins, X.columns[3:])

    # Only visits that have measurement data survive the inner join.
    df_test = pd.merge(sample_submission, prot_pep_features, on = 'visit_id', how = 'inner')

    for i in range(1, 5):
        target = f'updrs_{i}'

        mask_target = sample_submission['target_name'] == target
        mask_target_1 = df_test['target_name'] == target

        test = df_test.loc[mask_target_1, :]
        test = test[X.columns]

        # Visits without measurement data: predicted from the trend alone.
        list_c = list((Counter(sample_submission.index.unique()) - Counter(test.index.unique())).elements())

        # calculate_predictions takes exactly seven arguments; the former
        # extra `linear_trends` argument was an undefined name and made this
        # call fail.
        sample_submission.loc[mask_target, 'rating'] = calculate_predictions(
            test,
            sample_submission.loc[mask_target, :],
            target,
            list_c,
            test.index.unique(),
            basis_trend,
            xgb_models,
        )
        display(sample_submission.loc[mask_target,:])

    # call env.predict for every iteration
    env.predict(sample_submission[['prediction_id', 'rating']])
