## Idea:
* Use month trend similar to [Only Trends](https://www.kaggle.com/code/vitalykudelya/only-trends)
* Divide the NPX values of protein P05060 into several groups and, for each group, find the best shift to apply on top of the month trend predictions
* Sum predictions from the month trend and the corresponding NPX group shift

Protein P05060 improved the cross-validation score, the public score and the private score over the plain Trend approach. <br>
I'm not sure whether this is pure luck or whether there is a real signal in the P05060 protein.

Protein P05060 was chosen as the best protein improving the score on the train dataset (for NPX groups top5 quantile and low5 quantile)

In [None]:
import pandas as pd
import numpy as np
from collections import defaultdict
from tqdm.auto import tqdm

import plotly.express as px

import amp_pd_peptide

from scipy.optimize import minimize

In [None]:
def smape_plus_1(y_true, y_pred):
    """Return SMAPE (in percent) computed after adding 1 to both inputs.

    Entries where both shifted values are exactly zero contribute 0 to the
    mean. Entries whose ratio is NaN (for example because the true value is
    missing) are left out of the mean by ``np.nanmean``.
    """
    shifted_true = y_true + 1
    shifted_pred = y_pred + 1

    abs_error = np.abs(shifted_true - shifted_pred)
    mean_magnitude = (np.abs(shifted_true) + np.abs(shifted_pred)) / 2

    # Only divide where at least one side is non-zero; the rest stay at 0.
    has_signal = (shifted_true != 0) | (shifted_pred != 0)
    per_sample = np.zeros(len(shifted_true))
    per_sample[has_signal] = abs_error[has_signal] / mean_magnitude[has_signal]

    return 100 * np.nanmean(per_sample)

## Generate Train Dataset

In [None]:
# Clinical visits and the per-visit protein measurements of the training set.
train_clinical_all = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/train_clinical_data.csv')
proteins = pd.read_csv('/kaggle/input/amp-parkinsons-disease-progression-prediction/train_proteins.csv')

# Wide table: one row per visit_id, one column per UniProt id, NPX summed per cell.
proteins_features = proteins.pivot_table(index='visit_id', columns='UniProt', values='NPX', aggfunc='sum')

# Left join so visits without any protein measurement are kept (their protein columns are NaN).
train_clinical_all = train_clinical_all.merge(proteins_features, how='left', left_on='visit_id', right_index=True)

In [None]:
# Carry each patient's most recent protein values forward to later visits that
# have no measurement. `GroupBy.ffill()` replaces the deprecated
# `GroupBy.fillna(method='ffill')` and returns a frame aligned to the original index.
train_clinical_all[proteins_features.columns] = (
    train_clinical_all.groupby('patient_id')[proteins_features.columns].ffill()
)

In [None]:
# For the unshifted ("plus 0") targets the predicted month is the visit month itself.
train_clinical_all['pred_month'] = train_clinical_all['visit_month']

# For every horizon, attach the same patient's scores recorded `plus_month` months later.
for plus_month in [6, 12, 24]:
    train_shift = train_clinical_all[
        ['patient_id', 'visit_month', 'pred_month'] + [f'updrs_{i}' for i in range(1, 5)]
    ].copy()
    # Moving the later visit back by the horizon lets it join onto the earlier visit's row.
    train_shift['visit_month'] = train_shift['visit_month'] - plus_month
    train_shift = train_shift.rename(columns={
        **{f'updrs_{i}': f'updrs_{i}_plus_{plus_month}' for i in range(1, 5)},
        'pred_month': f'pred_month_plus_{plus_month}',
    })
    train_clinical_all = train_clinical_all.merge(train_shift, on=['patient_id', 'visit_month'], how='left')

# Give the unshifted columns the same naming scheme as the shifted ones.
train_clinical_all = train_clinical_all.rename(columns={
    **{f'updrs_{i}': f'updrs_{i}_plus_0' for i in range(1, 5)},
    'pred_month': 'pred_month_plus_0',
})
train_clinical_all

In [None]:
def calculate_month_trend_predicitons(pred_month, trend, min_month_with_non_zero_median_u4, target=None):
    """Evaluate the month trend of one UPDRS target at the given months.

    Parameters
    ----------
    pred_month : numpy.ndarray or pandas.Series
        Months to predict for. Must support arithmetic and, for 'updrs_4',
        boolean-mask assignment on the result.
    trend : sequence of float
        Polynomial coefficients ``[intercept, slope]`` or
        ``[intercept, slope, quadratic]``.
    min_month_with_non_zero_median_u4 : number
        Only used for 'updrs_4': predictions at months less than or equal to
        this value are set to 0.
    target : str, optional
        Name of the target being predicted. When omitted, the module-level
        variable ``target`` is used, which is what existing callers rely on.

    Returns
    -------
    Same container type as ``pred_month`` with the trend values.

    Raises
    ------
    NameError
        If ``target`` is not passed and no module-level ``target`` exists.
    """
    if target is None:
        # Backward compatibility: older call sites set a module-level `target`
        # before calling instead of passing it explicitly.
        try:
            target = globals()['target']
        except KeyError:
            raise NameError("name 'target' is not defined") from None

    if target == 'updrs_4':
        # updrs_4 always uses the linear form, then is zeroed for early months.
        result = trend[0] + pred_month * trend[1]
        result[pred_month <= min_month_with_non_zero_median_u4] = 0
        return result

    if len(trend) == 3:
        return trend[0] + pred_month * trend[1] + pred_month**2 * trend[2]
    return trend[0] + pred_month * trend[1]

# Month-trend parameters per UPDRS target: polynomial coefficients
# ([intercept, slope] or [intercept, slope, quadratic]) plus the month
# threshold that calculate_month_trend_predicitons applies to 'updrs_4'.
target_to_trend = {
    name: {
        'trend': coefficients,
        'min_month_with_non_zero_median_u4': 72,
    }
    for name, coefficients in (
        ('updrs_1', [5.394793062665313, 0.027091086167821344]),
        ('updrs_2', [5.469498130092747, 0.02824188329658148]),
        ('updrs_3', [21.47671255789872, 0.030885385412370472, 0.00117880267326171]),
        ('updrs_4', [2.2953201375507626, 0.002284573636684357]),
    )
}

In [None]:
def calculate_predicitons_protein(protein, pred_month, protein_shift):
    """Return rounded predictions: month trend plus a constant shift.

    The trend parameters are looked up in ``target_to_trend`` using the
    module-level variable ``target``. ``protein`` is accepted but not used
    in the computation.
    """
    trend_config = target_to_trend[target]
    trend_only = calculate_month_trend_predicitons(
        pred_month=pred_month,
        trend=trend_config['trend'],
        min_month_with_non_zero_median_u4=trend_config['min_month_with_non_zero_median_u4'],
    )
    shifted = trend_only + protein_shift
    return np.round(shifted)

def function_to_minimize(x):
    """Objective for the optimiser: SMAPE+1 of the shifted trend predictions.

    ``x[0]`` is the candidate shift. The data comes from the module-level
    arrays ``y_true_array``, ``pred_month_array`` and ``protein_array``,
    which ``find_best_const`` sets before optimising.
    """
    candidate_shift = x[0]
    predictions = calculate_predicitons_protein(
        protein=protein_array,
        pred_month=pred_month_array,
        protein_shift=candidate_shift,
    )
    return smape_plus_1(y_true=y_true_array, y_pred=predictions)

In [None]:
def find_best_const(train_clinical_all_filtered, target):
    """Find the constant shift that minimises SMAPE+1 on the given rows.

    Parameters
    ----------
    train_clinical_all_filtered : pandas.DataFrame
        Rows to fit on. Must contain ``{target}_plus_{h}`` and
        ``pred_month_plus_{h}`` for h in 0, 6, 12, 24, and the column named
        by the module-level variable ``feature``.
    target : str
        Target prefix used to select the label columns.

    Returns
    -------
    The first component of the optimiser's solution vector.

    Notes
    -----
    Publishes ``y_true_array``, ``pred_month_array`` and ``protein_array`` as
    module-level variables for ``function_to_minimize``.
    NOTE(review): the trend used inside the objective is selected by the
    module-level ``target`` read in ``calculate_predicitons_protein``, not by
    this parameter, so callers must keep the two equal.
    """
    global y_true_array
    global pred_month_array
    global protein_array

    horizons = [0, 6, 12, 24]
    columns_with_target = [f'{target}_plus_{plus_month}' for plus_month in horizons]
    columns_with_pred_month = [f'pred_month_plus_{plus_month}' for plus_month in horizons]

    y_true_array = train_clinical_all_filtered[columns_with_target].values.ravel()
    pred_month_array = train_clinical_all_filtered[columns_with_pred_month].values.ravel()
    # ravel() flattens row by row, so every visit contributes len(horizons)
    # consecutive entries. Repeating each protein value that many times keeps
    # protein_array aligned element-wise with the two arrays above; plain
    # concatenation of whole copies would interleave different visits.
    protein_array = np.repeat(train_clinical_all_filtered[feature].values, len(horizons))

    result = minimize(
        fun=function_to_minimize,
        x0=[0.0],
        method='Powell'
    ).x[0]
    return result

## Plot shifts

In [None]:
# Protein column whose NPX value defines the groups. find_best_const also reads
# this module-level name, so it must be set before the loop below runs.
feature = 'P05060'
# Quantile edges; each pair of consecutive edges forms one NPX group.
quantiles = [0, 0.025, 0.05, 0.15, 0.85, 0.95, 0.975, 1.0]

# Starts as a list of per-group records and is converted to a DataFrame after the loop.
df_plot = []
for quantile_low, quantile_high in tqdm(zip(quantiles[:-1], quantiles[1:])):
    item = {
        'quantile_low': quantile_low,
        'quantile_high': quantile_high,
        'quantile_middle': (quantile_low + quantile_high) / 2
    }
    # NPX values at the two edges of the group.
    quantile_low_value = train_clinical_all[feature].quantile(quantile_low)
    quantile_high_value = train_clinical_all[feature].quantile(quantile_high)
    # Recorded before the adjustment below, so the stored edge is the unmodified quantile.
    item['quantile_low_value'] = quantile_low_value
    item['quantile_high_value'] = quantile_high_value
    
    # The filter's upper bound is exclusive; nudge it for the last group so
    # rows equal to the maximum are included.
    if quantile_high == 1:
        quantile_high_value += 0.00001
        
    train_clinical_all_filtered = train_clinical_all[
        (train_clinical_all[feature] >= quantile_low_value)
        & (train_clinical_all[feature] < quantile_high_value)
    ]
    # The loop variable has to be the module-level name `target`:
    # calculate_predicitons_protein looks it up as a global during optimisation.
    for target in ['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']:
        item[f'{target}_shift'] = find_best_const(train_clinical_all_filtered, target)
    df_plot.append(item)
    
df_plot = pd.DataFrame(df_plot)

In [None]:
# One line chart per target: fitted shift against the middle of each quantile group.
for target in ['updrs_1', 'updrs_2', 'updrs_3', 'updrs_4']:
    fig = px.line(df_plot, x='quantile_middle', y=f'{target}_shift', title=f'{feature} {target}')
    fig.show()

## Find shifts

In [None]:
# Largest absolute shift allowed per target for the two outermost NPX groups.
target_to_clip_max = {
    'updrs_1': 6,
    'updrs_2': 3,
    'updrs_3': 6,
    'updrs_4': 1,
}

# (lower, upper) clip bounds per target. Low-NPX groups may only shift up,
# high-NPX groups may only shift down. The outermost groups use the full
# limit, the next pair a quarter of it, the third pair a fifth of it.
target_to_clips_025_low = {name: (0, limit) for name, limit in target_to_clip_max.items()}
target_to_clips_025_high = {name: (-limit, 0) for name, limit in target_to_clip_max.items()}

target_to_clips_025_05_low = {name: (0, limit / 4) for name, limit in target_to_clip_max.items()}
target_to_clips_025__05_high = {name: (-limit / 4, 0) for name, limit in target_to_clip_max.items()}

target_to_clips_5_15_low = {name: (0, limit / 5) for name, limit in target_to_clip_max.items()}
target_to_clips_5_15_high = {name: (-limit / 5, 0) for name, limit in target_to_clip_max.items()}


# NPX groups as quantile ranges, each paired with the clip table applied to its fitted shift.
npx_groups = [
    dict(quantile_low=0.0, quantile_high=0.025, clip=target_to_clips_025_low),
    dict(quantile_low=0.975, quantile_high=1.0, clip=target_to_clips_025_high),

    dict(quantile_low=0.025, quantile_high=0.05, clip=target_to_clips_025_05_low),
    dict(quantile_low=0.95, quantile_high=0.975, clip=target_to_clips_025__05_high),

    dict(quantile_low=0.05, quantile_high=0.15, clip=target_to_clips_5_15_low),
    dict(quantile_low=0.85, quantile_high=0.95, clip=target_to_clips_5_15_high),
]
# target name -> list of fitted group records (filled by the fitting loop).
target_to_npx_groups_shift = defaultdict(list)

# Fit one clipped shift per (target, NPX group). 'updrs_4' is not in the list,
# so no shift records are produced for it. The loop variable must be the
# module-level name `target`, because the optimisation helpers read it as a global.
for target in ['updrs_1', 'updrs_2', 'updrs_3']:
    for npx_group in npx_groups:
        item = npx_group.copy()
        item['feature'] = feature

        # The outermost groups are open-ended (infinite bound on the outer side).
        item['quantile_low_value'] = (
            -np.inf
            if item['quantile_low'] == 0
            else train_clinical_all[feature].quantile(item['quantile_low'])
        )
        item['quantile_high_value'] = (
            np.inf
            if item['quantile_high'] == 1
            else train_clinical_all[feature].quantile(item['quantile_high'])
        )

        # Rows whose NPX lies in [low, high).
        train_clinical_all_filtered = train_clinical_all.loc[
            train_clinical_all[feature].ge(item['quantile_low_value'])
            & train_clinical_all[feature].lt(item['quantile_high_value'])
        ]

        item['shift'] = find_best_const(train_clinical_all_filtered, target).clip(*item['clip'][target])
        target_to_npx_groups_shift[target].append(item)

target_to_npx_groups_shift

## Predictions

In [None]:
# NOTE(review): presumably clears the API's "already called" guard so this cell
# can be re-run in the same session — confirm against amp_pd_peptide.
amp_pd_peptide.make_env.func_dict['__called__'] = False
env = amp_pd_peptide.make_env()   # initialize the environment
iter_test = env.iter_test()    # an iterator which loops over the test files

# Protein measurements accumulated over every batch delivered so far, indexed by visit_id.
proteins_features_all = pd.DataFrame()
# The API will deliver four dataframes in this specific order:
for test_clinical_data, test_peptides, test_proteins, sample_submission in iter_test:
    # prediction_id is split on '_': field 0 is the patient, 1 the visit month,
    # 3 the updrs index and 5 the prediction horizon in months.
    sample_submission['patient_id'] = sample_submission['prediction_id'].map(lambda x: int(x.split('_')[0]))
    sample_submission['visit_month'] = sample_submission['prediction_id'].map(lambda x: int(x.split('_')[1]))
    sample_submission['target_name'] = sample_submission['prediction_id'].map(lambda x: 'updrs_' + x.split('_')[3])
    sample_submission['plus_month'] = sample_submission['prediction_id'].map(lambda x: int(x.split('_')[5]))
    sample_submission['pred_month'] = sample_submission['visit_month'] + sample_submission['plus_month']
    sample_submission['visit_id'] = sample_submission['patient_id'].astype(str) + '_' + sample_submission['visit_month'].astype(str)

    # Wide protein table for this batch, appended to the running history.
    proteins_features = pd.pivot_table(test_proteins, values='NPX', index='visit_id', columns='UniProt', aggfunc='sum')
    proteins_features['visit_id'] = proteins_features.index
    proteins_features_all = pd.concat([proteins_features_all, proteins_features])
    proteins_features_all['patient_id'] = proteins_features_all.index.map(lambda x: int(x.split('_')[0]))
    # Forward-fill within each patient. GroupBy.ffill() replaces the deprecated
    # GroupBy.fillna(method='ffill') and keeps the original index alignment.
    proteins_features_all[proteins_features.columns] = (
        proteins_features_all.groupby('patient_id')[proteins_features.columns].ffill()
    )
    # One row per patient holding the latest available value of every column.
    proteins_features = proteins_features_all.groupby('patient_id', as_index=False).last()

    sample_submission = sample_submission.merge(
        proteins_features,
        on='patient_id',
        how='left'
    )

    for i in range(1, 5):
        # Must be the module-level name `target`: calculate_month_trend_predicitons
        # reads it as a global to pick the updrs_4 branch.
        target = f'updrs_{i}'
        mask_target = sample_submission['target_name'] == target
        sample_submission.loc[mask_target, 'rating'] = calculate_month_trend_predicitons(
            pred_month=sample_submission.loc[mask_target, 'pred_month'],
            trend=target_to_trend[target]['trend'],
            min_month_with_non_zero_median_u4=target_to_trend[target]['min_month_with_non_zero_median_u4']
        )

        for item in target_to_npx_groups_shift[target]:
            feature = item['feature']
            if feature not in sample_submission.columns:
                # The protein has not appeared in any delivered batch, so there
                # is no column to compare against; keep the plain trend instead
                # of failing with a KeyError.
                continue
            # Rows with a missing NPX value compare False on both sides and get no shift.
            mask_feature_range = mask_target & (
                (sample_submission[feature] >= item['quantile_low_value'])
                & (sample_submission[feature] < item['quantile_high_value'])
            )
            sample_submission.loc[mask_feature_range, 'rating'] += item['shift']

        # Ratings are submitted as non-negative whole numbers.
        sample_submission.loc[mask_target, 'rating'] = np.round(sample_submission.loc[mask_target, 'rating']).clip(0, None)

    # call the env.predict for every iteration
    env.predict(sample_submission[['prediction_id', 'rating']])