In [1]:
import numpy as np
import pandas as pd 
import os
import pickle
from random import randrange

from sklearn.decomposition import PCA
from sklearn.preprocessing import scale

import lightgbm as lgb
from lightgbm import LGBMRegressor, early_stopping

from scipy.stats import pearsonr
from scipy.stats import skew, kurtosis
from sklearn.metrics import mean_absolute_error, mean_squared_error

# preprocessing

In [2]:
def compute_rank_percentile(df, features, pca):
    """Append per-``time_id`` rank percentiles of the PCA-projected features.

    For every ``time_id`` group, ``pca.transform`` is applied to the group's
    ``features`` columns and each projected column is replaced by the rank of
    its values inside the group, rescaled to ``[0, 1]``.  The result is stored
    in new columns ``f_quantilized_0 .. f_quantilized_{k-1}``.

    Each group's result is written back to the positions of that group's rows,
    so the output is aligned with ``df`` even when ``df`` is not sorted by
    ``time_id``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``time_id`` column and every column in ``features``.
        It is modified in place.
    features : list of str
        Columns handed to ``pca.transform``.
    pca : object
        Anything exposing ``transform(X)`` that returns a 2-D array with one
        row per input row (presumably the fitted PCA unpickled elsewhere in
        the notebook -- confirm).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the ``f_quantilized_*`` columns added.
        An empty ``df`` is returned unchanged.

    Notes
    -----
    A group with a single row divides 0 by 0 and therefore yields NaN for
    that row.  Rows whose ``time_id`` is NaN belong to no group and are left
    as NaN.
    """

    def lambda_rank_percentile(d):
        # A double argsort converts every column into 0-based ranks; dividing
        # by (rows - 1) maps the smallest value to 0 and the largest to 1.
        d = np.asarray(d)
        n = d.shape[0] - 1
        return np.argsort(np.argsort(d, axis=0), axis=0) / n

    data = None
    # ``indices`` maps each time_id to the integer positions of its rows.
    for positions in df.groupby("time_id").indices.values():
        ranked = lambda_rank_percentile(pca.transform(df.iloc[positions][features]))
        if data is None:
            # The number of output columns is only known after the first transform.
            data = np.full((df.shape[0], ranked.shape[1]), np.nan)
        data[positions] = ranked

    if data is None:
        # No groups at all (empty frame): nothing to add.
        return df

    df[[f"f_quantilized_{i}" for i in range(data.shape[1])]] = data

    return df

In [3]:
def scale_by_time_id(df, features):
    """Standardise ``features`` separately inside every ``time_id`` group.

    Each group's ``features`` block is passed to ``scale`` (imported from
    ``sklearn.preprocessing`` at the top of the notebook) and the result
    overwrites the original columns.

    Each group's result is written back to the positions of that group's rows,
    so the output is aligned with ``df`` even when ``df`` is not sorted by
    ``time_id``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``time_id`` column and every column in ``features``.
        It is modified in place.
    features : list of str
        Columns to standardise.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``features`` replaced by their scaled
        values.  An empty ``df`` is returned unchanged.  Rows whose
        ``time_id`` is NaN belong to no group and become NaN.
    """
    if df.shape[0] == 0:
        return df

    data = np.full((df.shape[0], len(features)), np.nan)

    # ``indices`` maps each time_id to the integer positions of its rows.
    for positions in df.groupby("time_id").indices.values():
        data[positions] = scale(df.iloc[positions][features])

    df[features] = data

    return df

In [4]:
def modify_predictions(std_preds):
    """Stretch the upper part of the prediction distribution, in two passes.

    Each pass picks a cutoff at a quantile of the current values, replaces
    every value at or above the cutoff by ``base ** value`` and then shifts
    the values still below the cutoff by the gap between the smallest
    transformed value and the cutoff.  The passes use (quantile 0.04,
    base 1.5) and then (quantile 0.9, base 1.6).

    ``std_preds`` is modified in place (assumes a NumPy float array -- confirm
    against the caller) and the same array is returned.
    """
    for level, base in ((0.04, 1.5), (0.9, 1.6)):
        cutoff = np.quantile(std_preds, level)

        # Exponentiate everything at or above the cutoff.
        at_or_above = std_preds >= cutoff
        std_preds[at_or_above] = base ** std_preds[at_or_above]

        # Both the mask and the offset are taken from the already-updated array.
        below = std_preds < cutoff
        offset = np.min(std_preds[std_preds >= cutoff]) - cutoff
        std_preds[below] = std_preds[below] + offset

    return std_preds

# target features

In [5]:
def get_aggregated_dataset(df, col_features_org, agg_method):
    """Aggregate every feature column to one value per ``time_id``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``time_id`` column and every column in
        ``col_features_org``.
    col_features_org : list of str
        Feature columns to aggregate.
    agg_method : str or float
        ``"mean"``, ``"std"`` (population standard deviation, ``ddof=0``),
        ``"skew"`` or ``"kurtosis"`` (the ``scipy.stats`` functions with their
        defaults); any other value is forwarded to ``np.quantile`` as the
        quantile to compute.

    Returns
    -------
    pandas.DataFrame
        One row per ``time_id`` (the index, in sorted group order) and one
        column per entry of ``col_features_org``.

    Notes
    -----
    The reductions run on the raw NumPy block with an explicit ``axis=0``.
    Calling ``np.mean`` directly on a DataFrame dispatches to
    ``DataFrame.mean(axis=None)``, which returns a single scalar on
    pandas >= 2.0 and would break the per-column result.  ``nanmean`` and
    ``nanstd`` keep the NaN-skipping behaviour of the pandas reductions.
    """
    named_reducers = {
        "mean": lambda values: np.nanmean(values, axis=0),
        "std": lambda values: np.nanstd(values, axis=0),
        "skew": lambda values: skew(values, axis=0),
        "kurtosis": lambda values: kurtosis(values, axis=0),
    }

    reducer = named_reducers.get(agg_method) if isinstance(agg_method, str) else None
    if reducer is None:
        # Anything that is not a named method is treated as a quantile level.
        def reducer(values):
            return np.quantile(values, agg_method, 0)

    time_ids = []
    rows = []
    for time_id, group in df.groupby("time_id"):
        time_ids.append(time_id)
        rows.append(np.asarray(reducer(group[col_features_org].to_numpy())))

    agg_train = pd.DataFrame(
        rows,
        index=pd.Index(time_ids, name="time_id"),
        columns=col_features_org,
    )

    return agg_train

In [6]:
def add_target_features(df, col_features_org, model_list):
    """Add ensemble predictions made from per-``time_id`` aggregates as features.

    For each aggregation method (``"skew"`` and ``"kurtosis"``) the feature
    columns are aggregated per ``time_id`` with ``get_aggregated_dataset``,
    every model in ``model_list`` predicts from that aggregated frame, the
    predictions are averaged across models, and each row of ``df`` receives
    the averaged prediction of its own ``time_id`` in a new column
    ``f_target_<method>``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``time_id`` column and every column in
        ``col_features_org``.  It is modified in place.
    col_features_org : list of str
        Feature columns to aggregate.
    model_list : iterable
        Objects exposing ``predict(X)`` that return one value per row of the
        aggregated frame.  These are the models actually used; the function
        no longer reads the notebook-level ``models_quantiles`` global.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with ``f_target_skew`` and
        ``f_target_kurtosis`` added, in that order.
    """
    # quantile_list = list(np.arange(0, 1.01, 0.01))
    # list_agg_method = ["mean", "std", "skew", "kurtosis"] + quantile_list

    list_agg_method = ["skew", "kurtosis"]

    for method in list_agg_method:

        agg_df = get_aggregated_dataset(df, col_features_org, method)

        # Average the ensemble: one prediction per aggregated row (time_id).
        predictions = np.mean(np.array([m.predict(agg_df) for m in model_list]), axis=0)

        # The aggregated frame is indexed by time_id, so mapping through it
        # broadcasts each group's prediction to all of that group's rows.
        per_time_id = pd.Series(np.ravel(predictions), index=agg_df.index)
        df[f"f_target_{method}"] = df["time_id"].map(per_time_id).to_numpy()

    return df

# loss function

In [7]:
def mse_minus_cov_objective(y_true, y_pred):
    """Custom gradient/hessian callback: squared error with damped outliers.

    The function reads three notebook-level globals that are not defined in
    this function: ``val`` (truthy to skip the correlation check),
    ``repeat_array_train`` (consecutive group sizes that partition the rows
    of ``y_true`` / ``y_pred``) and ``score_to_match`` (correlation target,
    only read when ``val`` is falsy).

    Behaviour
    ---------
    * When ``val`` is falsy, the Pearson correlation between ``y_true`` and
      ``y_pred`` is computed per group and averaged.  If that mean exceeds
      ``score_to_match`` by more than ``0.0001``, all-zero gradients and
      hessians are returned.
    * Otherwise, per group, the gradient of the squared error
      ``-2 * (y_true - y_pred)`` is computed with a constant hessian of 2.
      For rows whose squared error lies above the group's 0.99 quantile, the
      gradient is divided by ``1 + (squared error - quantile)``; the hessian
      of those rows is intentionally left at 2.

    Parameters
    ----------
    y_true, y_pred : array-like
        1-D arrays of equal length ordered group by group (assumed to be
        NumPy arrays, as slicing and arithmetic are applied directly).

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Gradient and hessian, one float per row.
    """
    if not val:

        corr_list = []
        current_index = 0

        for i in repeat_array_train:
            corr = pearsonr(y_true[current_index:current_index+i], y_pred[current_index:current_index+i])[0]

            corr_list.append(corr)
            current_index += i
        score = np.mean(corr_list)

        if (score - score_to_match) > 0.0001:
            # Target correlation reached: zero gradients and hessians.
            n_rows = int(np.sum(repeat_array_train))
            return np.zeros(n_rows), np.zeros(n_rows)

    grad_list = []
    hess_list = []

    current_index = 0
    for i in repeat_array_train:

        y_true_sub = np.asarray(y_true[current_index:current_index+i], dtype=float)
        y_pred_sub = np.asarray(y_pred[current_index:current_index+i], dtype=float)

        residual = y_true_sub - y_pred_sub
        e = residual**2

        q_e = np.quantile(e, 0.99)

        grad = -2*residual
        hess = np.full(grad.shape[0], 2.0)

        # Damp the gradient of the rows with the largest squared errors.
        diff = e - q_e
        mask = diff > 0
        grad[mask] = grad[mask]/(1+(diff[mask]))

        grad_list.append(grad)
        hess_list.append(hess)

        current_index += i

    # Concatenate the per-group arrays directly: wrapping the list in
    # np.array first builds a ragged array when group sizes differ, which
    # raises on NumPy >= 1.24.
    grad_list = np.concatenate(grad_list)
    hess_list = np.concatenate(hess_list)

    return grad_list, hess_list

# make predictions in the environment

In [8]:
# Load the pickled PCA object and the pickled quantile models.
# NOTE: unpickling executes arbitrary code from the file -- only load
# artifacts from a trusted source.
# Context managers close the file handles once loading is done.
with open('../input/pickle-files/pca_train.p', "rb") as pca_file:
    pca = pickle.load(pca_file)

with open('../input/pickle-files/list_models_quantiles_scaled.p', "rb") as models_file:
    models_quantiles = pickle.load(models_file)



In [9]:
# Join the loaded sequences of models end to end into one flat array
# (presumably a list of per-fold model lists -- confirm against the pickle).
# Not idempotent: run this cell once per kernel session.
models_quantiles = np.concatenate(models_quantiles, axis=0)

In [10]:
# Load every saved LightGBM booster found in the two model directories,
# first all files of models-056, then all files of models-044.
model_list = []

for model_dir in ("../input/models-056/", "../input/models-044/"):
    for file_name in os.listdir(model_dir):
        model_list.append(lgb.Booster(model_file=f"{model_dir}{file_name}"))

In [11]:
import ubiquant

In [12]:
# Create the competition environment and its test iterator. The prediction
# loop below consumes `iter_test`, which yields
# (test_df, sample_prediction_df) pairs, and registers each batch of
# predictions through `env.predict`.
env = ubiquant.make_env()  
iter_test = env.iter_test() 

In [13]:
# Names of the 300 raw feature columns.
col_features_org = [f"f_{i}" for i in range(300)]

for (test_df, sample_prediction_df) in iter_test:

    # The helpers group by `time_id`; when the batch has no such column,
    # treat the whole batch as a single group with id 0.
    if "time_id" not in test_df.columns:
        test_df.insert(0, "time_id", np.repeat(0, test_df.shape[0]))

    # Add the f_target_skew / f_target_kurtosis features predicted from the
    # per-time_id aggregates.
    # group_keys=False keeps the original row index: on pandas >= 2.0 the
    # default would add a `time_id` index level, making the later
    # groupby("time_id") calls ambiguous between the level and the column.
    test_df = test_df.groupby("time_id", group_keys=False).apply(
        lambda x: add_target_features(x, col_features_org, models_quantiles)
    )

    # Rank-percentile features of the PCA projection, computed from the raw
    # features before they are overwritten by the per-time_id scaling.
    test_df = compute_rank_percentile(test_df, col_features_org, pca)
    test_df = scale_by_time_id(test_df, col_features_org)

    # Every column whose name contains "f_": raw, target and quantilized features.
    features = [col for col in test_df.columns if "f_" in col]

    # Average the boosters' predictions, then apply the post-processing.
    preds = np.mean(np.array([m.predict(test_df[features]) for m in model_list]), axis=0)
    preds = modify_predictions(preds)

    sample_prediction_df['target'] = preds  # make your predictions here

    env.predict(sample_prediction_df)   # register your predictions


This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.


  self[col] = igetitem(value, i)
  self[col] = igetitem(value, i)
  self[col] = igetitem(value, i)
  
  self[col] = igetitem(value, i)
