In [None]:
import pandas as pd
import amp_pd_peptide
import numpy as np
import sklearn
import collections
import warnings
import polars as pl
from sklearn.model_selection import GroupKFold, StratifiedKFold
from catboost import CatBoostRegressor
from scipy.optimize import minimize
import joblib
warnings.simplefilter('ignore')

### Load trend data

In [None]:
# Trend tables used by the prediction cell when visit_month == 0.
(
    first_linear_trend_df,
    first_cb_trend_huber_df,
    first_cb_trend_mae_df,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/amp-visitmonth-model-first-month/first_linear_trend_df.csv',
        '/kaggle/input/amp-visitmonth-model-first-month/first_cb_trend_huber_df.csv',
        '/kaggle/input/amp-visitmonth-model-first-month/first_cb_trend_mae_df.csv',
    )
)

# Trend tables used for later visits (visit_month > 0) of patients that are
# not treated as healthy.
(
    linear_trend_df,
    cb_trend_huber_df,
    cb_trend_mae_df,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/amp-visitmonth-model/linear_trend_df.csv',
        '/kaggle/input/amp-visitmonth-model/cb_trend_huber_df.csv',
        '/kaggle/input/amp-visitmonth-model/cb_trend_mae_df.csv',
    )
)

# Trend table used for later visits of patients treated as healthy.
healthy_trend_df = pd.read_csv('/kaggle/input/amp-visitmonth-model/healthy_trend_df.csv')

In [None]:
# Show the rows at positions 0, 12, ..., 108 of every trend table.
preview_rows = list(range(0, 109, 12))

for label, frame in (
    ('first_linear_trend_df:', first_linear_trend_df),
    ('first_cb_trend_huber_df:', first_cb_trend_huber_df),
    ('first_cb_trend_mae_df:', first_cb_trend_mae_df),
    ('linear_trend_df:', linear_trend_df),
    ('cb_trend_huber_df:', cb_trend_huber_df),
    ('cb_trend_mae_df:', cb_trend_mae_df),
    ('healthy_trend:', healthy_trend_df),
):
    display(label, frame.iloc[preview_rows])

### Catboost protein model

In [None]:
cb_model_path = '/kaggle/input/amp-catboost-model'
folds = 10

# NOTE(review): joblib.load unpickles the files, which can execute arbitrary
# code; only point cb_model_path at trusted model artifacts.

# Feature-name list per target, keyed 'updrs_1' .. 'updrs_3'.
cb_feature_dict = {
    f'updrs_{t}': joblib.load(f'{cb_model_path}/cb_use_features_updrs_{t}.pkl')
    for t in range(1,4)
}

# One fitted model per (target, fold), keyed 'model_updrs_{t}_{f}'.
cb_model_dict = {
    f'model_updrs_{t}_{f}': joblib.load(f'{cb_model_path}/model_cb_updrs_{t}_{f}.pkl')
    for t in range(1,4)
    for f in range(folds)
}

### Prediction

In [None]:
# Create the competition environment from the `amp_pd_peptide` module and get
# its test iterator. The prediction cell loops over `iter_test` and hands each
# batch of predictions back through `env.predict`.
# NOTE(review): presumably `make_env()` may only be called once per kernel
# session, so re-running this cell would need a kernel restart - confirm.
env = amp_pd_peptide.make_env() 
iter_test = env.iter_test()

In [None]:
%%time

# Per-patient count of visits seen so far at month 6 or month 18. It lives
# outside the batch loop so the count accumulates across test batches. A
# patient whose count is still 0 at a later visit is treated as healthy.
patient_check_dict = {}

# Weight of the protein/peptide CatBoost prediction in the final blend.
# 0 disables the protein model entirely (its inference is then skipped).
use_model_ratio = 0

# Blend weights of the Huber / MAE CatBoost trend tables per target.
# `first_*` weights apply when visit_month == 0, the others to later visits.
first_cb_huber_use_ratio = {'updrs_1':0.8, 'updrs_2':0.8, 'updrs_3':0.3, 'updrs_4':0}
first_cb_mae_use_ratio = {'updrs_1':0.2, 'updrs_2':0.8, 'updrs_3':0.1, 'updrs_4':0}
cb_huber_use_ratio = {'updrs_1':0.4, 'updrs_2':0.5, 'updrs_3':0.6, 'updrs_4':0.5}
cb_mae_use_ratio = {'updrs_1':0.2, 'updrs_2':0.2, 'updrs_3':0.05, 'updrs_4':0.5}


def _lookup_trend(trend_df, month, target):
    """Return column `target` of `trend_df` at row position `month`.

    The trend tables are addressed by row position (row i is read for month
    i). A month past the end of the table is clamped to the last row instead
    of raising IndexError, so one long follow-up cannot abort the whole run.
    """
    last_position = len(trend_df) - 1
    return trend_df[target].iloc[min(int(month), last_position)]


def _blend_trends(linear, huber, mae, huber_ratio, mae_ratio):
    """Blend the linear trend with the Huber table, then with the MAE table.

    The two blends are applied one after the other, so the MAE weight acts on
    the already blended linear/Huber value.
    """
    blended = (linear * (1 - huber_ratio)) + (huber * huber_ratio)
    return (blended * (1 - mae_ratio)) + (mae * mae_ratio)


def _protein_model_predictions(test_proteins, test_peptides):
    """Return fold-averaged CatBoost predictions for one test batch.

    The result maps 'updrs_1' .. 'updrs_3' to a {patient_id: prediction}
    dict covering the patients that have protein rows in this batch. An empty
    dict is returned when the batch has no protein rows at all.
    """
    test_proteins_pl = pl.DataFrame(test_proteins)
    if test_proteins_pl.height == 0:
        return {}

    features_pl = test_proteins_pl.pivot(values = 'NPX', index = 'patient_id', columns = 'UniProt')

    # Peptide columns are optional: without peptide rows the peptide features
    # are simply filled with NaN by the reindex below.
    test_peptides_pl = pl.DataFrame(test_peptides)
    if test_peptides_pl.height > 0:
        peptides_pivot = test_peptides_pl.pivot(values = 'PeptideAbundance', index = 'patient_id', columns = 'Peptide')
        features_pl = features_pl.join(peptides_pivot, on = 'patient_id', how = 'left')

    features_df = features_pl.to_pandas()
    patient_ids = features_df['patient_id'].tolist()

    predictions = {}
    for t in [1, 2, 3]:
        target = f'updrs_{t}'
        # reindex selects the model's features in order and creates every
        # feature missing from this batch as an all-NaN column.
        model_input = features_df.reindex(columns = list(cb_feature_dict[target]))

        pred_model = np.zeros(len(features_df))
        for fold in range(folds):
            model_cb = cb_model_dict[f'model_updrs_{t}_{fold}']
            pred_model += model_cb.predict(model_input) / folds

        predictions[target] = dict(zip(patient_ids, pred_model))

    return predictions


for (test, test_peptides, test_proteins, sample_submission) in iter_test:

    # One row per (patient, visit month); keep the incoming order so the
    # run is deterministic.
    test_pl = pl.DataFrame(test[['patient_id', 'visit_month']]).unique(maintain_order = True)

    #--------------------------------------
    # get protein model prediction
    #--------------------------------------

    # With a zero weight the protein model cannot change any prediction, so
    # its inference (and every failure mode it carries) is skipped.
    protein_preds = {}
    if use_model_ratio != 0:
        print('protein prediction...')
        protein_preds = _protein_model_predictions(test_proteins, test_peptides)

    #--------------------------------------
    # pred loop
    #--------------------------------------

    prediction_id_list = []
    pred_list = []

    for row in test_pl.to_numpy():

        patient_id = row[0]
        visit_month = row[1]

        # Count this visit if it happens at month 6 or month 18.
        patient_check_dict.setdefault(patient_id, 0)
        if visit_month in (6, 18):
            patient_check_dict[patient_id] += 1

        is_first_visit = visit_month == 0
        # Only later visits use the healthy trend; month 0 always uses the
        # first-visit tables.
        treat_as_healthy = (not is_first_visit) and patient_check_dict[patient_id] == 0

        if is_first_visit:
            linear_df, huber_df, mae_df = first_linear_trend_df, first_cb_trend_huber_df, first_cb_trend_mae_df
            huber_ratios, mae_ratios = first_cb_huber_use_ratio, first_cb_mae_use_ratio
        else:
            linear_df, huber_df, mae_df = linear_trend_df, cb_trend_huber_df, cb_trend_mae_df
            huber_ratios, mae_ratios = cb_huber_use_ratio, cb_mae_use_ratio

        # start pred
        for t in [1, 2, 3, 4]:
            target = f'updrs_{t}'
            # Empty for updrs_4 and whenever the protein model is disabled.
            target_protein_preds = protein_preds.get(target, {})

            for p in [0, 6, 12, 24]:
                pred_month = visit_month + p
                prediction_id = f'{patient_id}_{visit_month}_updrs_{t}_plus_{p}_months'

                if treat_as_healthy:
                    # The healthy trend is used as stored (no rounding).
                    pred = _lookup_trend(healthy_trend_df, pred_month, target)
                else:
                    pred = _blend_trends(
                        _lookup_trend(linear_df, pred_month, target),
                        _lookup_trend(huber_df, pred_month, target),
                        _lookup_trend(mae_df, pred_month, target),
                        huber_ratios[target],
                        mae_ratios[target],
                    )

                    pred_model = target_protein_preds.get(patient_id)
                    if pred_model is not None:
                        pred = (pred * (1 - use_model_ratio)) + (pred_model * use_model_ratio)

                    pred = np.round(pred)

                prediction_id_list.append(prediction_id)
                pred_list.append(pred)

    result = pd.DataFrame({'prediction_id': prediction_id_list, 'rating': pred_list})

    env.predict(result)