# Directory settings

In [None]:
# ====================================================
# Directory settings
# ====================================================
import os

# Location of the competition input data.
INPUT_DIR = '../input/vesuvius-challenge-ink-detection/'
# Directory that receives the files written by this notebook (e.g. the log file).
OUTPUT_DIR = './'
# exist_ok=True makes the cell idempotent and avoids the check-then-create race
# of a separate os.path.exists() test.
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Library

In [None]:
# ====================================================
# Library
# ====================================================
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import shutil
import string
import pickle
import random
import joblib
import itertools
from pathlib import Path
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
import itertools
import more_itertools

import torch
print(f"torch.__version__: {torch.__version__}")
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
import joblib

# Utils

In [None]:
# ====================================================
# Utils
# ====================================================
def get_score(y_true, y_pred):
    """Return the Pearson correlation coefficient between two sequences.

    Args:
        y_true: Ground-truth values.
        y_pred: Predicted values, same length as ``y_true``.

    Returns:
        The correlation statistic, i.e. the first element of the
        ``scipy.stats.pearsonr`` result (the p-value is discarded).
    """
    # Import the submodule explicitly: `import scipy as sp` alone does not
    # guarantee that `sp.stats` is loaded on every SciPy version.
    from scipy import stats

    return stats.pearsonr(y_true, y_pred)[0]


def get_logger(filename=OUTPUT_DIR+'train'):
    """Build a logger that writes bare messages to the console and to a file.

    Args:
        filename: Path of the log file without extension; ``.log`` is appended.

    Returns:
        The ``logging.Logger`` registered under ``__name__``, set to INFO level
        with exactly one stream handler and one file handler attached.
    """
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    # getLogger() returns the same object on every call, so re-running this
    # cell used to stack additional handlers and duplicate every message.
    # Detach (and close) whatever was attached before adding fresh handlers.
    for stale_handler in list(logger.handlers):
        logger.removeHandler(stale_handler)
        stale_handler.close()
    formatter = Formatter("%(message)s")
    for handler in (StreamHandler(), FileHandler(filename=f"{filename}.log")):
        handler.setFormatter(formatter)
        logger.addHandler(handler)
    return logger

LOGGER = get_logger()

def seed_everything(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy's global generator and PyTorch (default
    and CUDA generators), exports ``seed`` as the ``PYTHONHASHSEED``
    environment variable, and switches cuDNN to deterministic mode.

    Args:
        seed: Integer seed shared by all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed,
                       torch.manual_seed, torch.cuda.manual_seed):
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True


seed_everything(seed=42)

# OOF

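# Usage: add the paths of OOF prediction files (oof `.csv` / `.pkl`) to `OOF_FILE_LIST`. Blend weights are then optimized for every combination of the listed models and the best-scoring combination is reported (the code below only considers combinations of at least 7 models).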


In [None]:
# Paths of the OOF prediction files to blend. oof_concat() reads names ending
# in ".pkl" with pd.read_pickle and everything else with pd.read_csv; the
# position of a file in this list becomes the suffix of its "pred_<idx>" column.
OOF_FILE_LIST = []

In [None]:
def oof_concat(OOF_FILE_LIST):
    """Load every OOF file and merge them into a single DataFrame.

    Each file's ``pred`` column is renamed to ``pred_<idx>``, where ``idx`` is
    the file's position in ``OOF_FILE_LIST``. Frames are combined with
    ``pd.merge`` using its default keys (all columns the two frames share).

    Args:
        OOF_FILE_LIST: Iterable of file paths (``str`` or path-like). Names
            ending in ``.pkl`` are read with ``pd.read_pickle``; anything else
            is read with ``pd.read_csv``.

    Returns:
        The merged DataFrame, or an empty DataFrame when the list is empty.
    """
    merged = None
    for idx, filename in enumerate(OOF_FILE_LIST):
        print(filename)
        if str(filename).endswith(".pkl"):
            # NOTE(security): unpickling can execute arbitrary code; only
            # load pickle files that come from a trusted source.
            frame = pd.read_pickle(filename)
        else:
            frame = pd.read_csv(filename)
            # CSVs saved with their index carry an "Unnamed: 0" column; drop
            # it when present instead of raising KeyError when it is absent,
            # and treat the first file the same way as the following ones.
            frame = frame.drop(columns="Unnamed: 0", errors="ignore")
        frame = frame.rename(columns={'pred': "pred_" + str(idx)})
        # Track "first file" explicitly rather than via len(), which would
        # misfire if the first file happened to contain zero rows.
        merged = frame if merged is None else pd.merge(merged, frame)
    return pd.DataFrame() if merged is None else merged

In [None]:
# Load every file listed in OOF_FILE_LIST and merge them into one frame whose
# prediction columns are named pred_0, pred_1, ... in list order.
oof_df = oof_concat(OOF_FILE_LIST)

In [None]:
def calc_prediction_org(df, weights):
    """Add a ``pred_org`` column: weighted sum of predictions over the model count.

    Args:
        df: DataFrame holding one ``pred_<idx>`` column per weight.
        weights: Sequence of weights; ``weights[idx]`` scales ``pred_<idx>``.

    Returns:
        ``df`` itself, with ``pred_org`` set to
        ``sum(weights[idx] * pred_<idx>) / len(weights)``.
    """
    # Work on the `df` argument; the previous version silently read and wrote
    # the notebook-global `oof_df` regardless of what was passed in.
    blended = pd.Series(0.0, index=df.index)
    for idx, weight in enumerate(weights):
        blended = blended + weight * df["pred_" + str(idx)]
    df["pred_org"] = blended / len(weights)
    return df

In [None]:
def calc_prediction(df, weights):
    """Add a ``pred`` column: weighted average of the ``pred_<idx>`` columns.

    Args:
        df: DataFrame holding one ``pred_<idx>`` column per weight.
        weights: Sequence of weights; ``weights[idx]`` scales ``pred_<idx>``.

    Returns:
        ``df`` itself, with ``pred`` set to
        ``sum(weights[idx] * pred_<idx>) / sum(weights)``.
    """
    # Work on the `df` argument; the previous version silently read and wrote
    # the notebook-global `oof_df` regardless of what was passed in.
    blended = pd.Series(0.0, index=df.index)
    for idx, weight in enumerate(weights):
        blended = blended + weight * df["pred_" + str(idx)]
    df["pred"] = blended / sum(weights)
    return df

In [None]:
# Baseline blend: every OOF file gets the same weight of 1.0.
weights = [1.0 for _ in range(len(OOF_FILE_LIST))]
oof_df = calc_prediction_org(df=oof_df, weights=weights)

In [None]:
# Pearson correlation between the `score` column and the equal-weight blend
# stored in `pred_org` (displayed as the cell output).
get_score(oof_df["score"], oof_df["pred_org"])

In [None]:
import optuna

# Bounds of the weight search range.
# NOTE(review): r_min / r_max are not referenced anywhere else in the visible
# notebook (Objective passes the literals 0 and 1) — confirm the intent.
r_min = 0
r_max = 1
# You can increase iteration number.
# Number of Optuna trials per study; read as `n_trials` inside optim().
iteration = 200

# Turn off Optuna's default log handler so per-trial messages are not printed.
optuna.logging.disable_default_handler() 

class Objective:
    """Optuna objective that scores one candidate ensemble.

    Each trial samples one weight in [0, 1] per model of the ensemble, blends
    the models' predictions with ``calc_prediction_score`` and returns
    ``get_score`` of the blend against the notebook-level ``train_labels``.
    """

    def __init__(self, weights, ens_pair_df, pairs):
        """Store the ensemble definition.

        Args:
            weights: Initial weights, one per model; only their count is used
                to decide how many parameters each trial samples.
            ens_pair_df: DataFrame with the ``pred_<model>`` columns of the
                ensemble.
            pairs: Sequence of model indices making up the ensemble.
        """
        # Keep the weights that were passed in. They used to be replaced by
        # np.random.rand(...) values that were never read, which only
        # advanced NumPy's global RNG state.
        self.weights = list(weights)
        self.ens_pair_df = ens_pair_df
        self.pairs = pairs

    def __call__(self, trial):
        """Sample weights from ``trial`` and return the score of the blend."""
        # suggest_float replaces suggest_uniform, which Optuna deprecated in
        # v3.0. Parameter names are unchanged ('weight<model index>').
        weights = [
            trial.suggest_float('weight' + str(self.pairs[n]), 0, 1)
            for n in range(len(self.weights))
        ]
        pred = calc_prediction_score(self.ens_pair_df, weights, self.pairs)
        return get_score(train_labels, pred)

In [None]:
def optim(weights, ens_pair_df, pairs,objective):
    """Run an Optuna study that maximizes ``objective`` and return its best parameters.

    The study uses a TPE sampler with a fixed seed (1234) and runs for
    ``iteration`` trials (notebook-level setting).

    Args:
        weights: Not used by this function.
        ens_pair_df: Not used by this function.
        pairs: Not used by this function.
        objective: Callable taking an Optuna trial and returning the value to
            maximize.

    Returns:
        ``study.best_params``: dict mapping parameter names to the best values.
    """
    tpe_sampler = optuna.samplers.TPESampler(seed=1234)
    study = optuna.create_study(sampler=tpe_sampler, direction='maximize')
    study.optimize(objective, n_trials=iteration)
    return study.best_params

In [None]:
def calc_prediction_score(df, weights, pairs):
    """Return the weighted average of the prediction columns selected by ``pairs``.

    Args:
        df: DataFrame holding a ``pred_<model>`` column for every entry of
            ``pairs``.
        weights: Sequence of weights; ``weights[i]`` scales ``pred_<pairs[i]>``.
        pairs: Sequence of model indices, aligned with ``weights``.

    Returns:
        A Series named ``pred`` equal to
        ``sum(weights[i] * pred_<pairs[i]>) / sum(weights)``.
    """
    # Compute on the `df` argument and leave it untouched; the previous
    # version accumulated into a "pred" column of the notebook-global
    # `oof_df`, whatever frame was passed in.
    blended = pd.Series(0.0, index=df.index)
    for idx, weight in enumerate(weights):
        blended = blended + weight * df["pred_" + str(pairs[idx])]
    return (blended / sum(weights)).rename("pred")

In [None]:
def calc_prediction_score(df, weights, pairs):
    """Return the weighted average of the prediction columns selected by ``pairs``.

    Args:
        df: DataFrame holding a ``pred_<model>`` column for every entry of
            ``pairs``.
        weights: Sequence of weights; ``weights[i]`` scales ``pred_<pairs[i]>``.
        pairs: Sequence of model indices, aligned with ``weights``.

    Returns:
        A Series named ``pred`` equal to
        ``sum(weights[i] * pred_<pairs[i]>) / sum(weights)``.
    """
    # Compute on the `df` argument and leave it untouched; the previous
    # version accumulated into a "pred" column of the notebook-global
    # `oof_df`, whatever frame was passed in.
    blended = pd.Series(0.0, index=df.index)
    for idx, weight in enumerate(weights):
        blended = blended + weight * df["pred_" + str(pairs[idx])]
    return (blended / sum(weights)).rename("pred")

In [None]:
# Smallest number of models a candidate ensemble may contain.
MIN_ENSEMBLE_SIZE = 7

# One integer id per OOF file, matching the pred_<id> column suffixes.
models_list = list(range(len(OOF_FILE_LIST)))
# Every combination of at least MIN_ENSEMBLE_SIZE models, ordered by size.
# Generating only the wanted sizes yields the same tuples, in the same order,
# as filtering the full powerset, without enumerating the smaller subsets.
pair_list = [
    combo
    for size in range(MIN_ENSEMBLE_SIZE, len(models_list) + 1)
    for combo in itertools.combinations(models_list, size)
]

# Target column the blends are scored against.
train_labels = oof_df["score"]
ens_pair_df = pd.DataFrame()
# Filled by the search loop: one score and one best-weights dict per entry of pair_list.
oof_score = []
oof_ens_weights = []

In [None]:
from concurrent import futures
import time

In [None]:
%%time
# For every candidate ensemble: optimize its blend weights with Optuna, then
# score the resulting blend against train_labels.
# The former per-iteration ThreadPoolExecutor is gone: each submit() was
# followed immediately by result(), so the studies ran one after another anyway.
for pairs in tqdm(pair_list):
    # Select the ensemble's prediction columns in one step instead of growing
    # a frame with pd.concat inside a loop.
    ens_pair_df = oof_df[["pred_" + str(model_no) for model_no in pairs]].copy()

    weights = [1.0]*len(pairs)
    objective = Objective(weights, ens_pair_df, pairs)
    best_params = optim(weights, ens_pair_df, pairs, objective)
    oof_ens_weights.append(best_params)

    # Use this iteration's result directly. Indexing oof_ens_weights by the
    # loop counter picked a stale entry whenever the cell was re-run without
    # resetting the result lists first.
    weight_ens_pred = calc_prediction_score(oof_df, list(best_params.values()), pairs)
    weight_ens_pred_norm = weight_ens_pred/weight_ens_pred.max()

    oof_score.append(get_score(train_labels, weight_ens_pred_norm))

In [None]:
# Best parameters returned by optim() for each candidate ensemble, in the
# order they were evaluated.
oof_ens_weights

In [None]:
# get_score() of each optimized blend, in the same order as oof_ens_weights.
oof_score

In [None]:
# Highest score reached by any candidate ensemble.
best_cv = max(oof_score)
print(f"best CV : {best_cv}")

In [None]:
# Weights of the first ensemble that reached the highest score.
best_idx = oof_score.index(max(oof_score))
print(f"best CVmodel & Weights{oof_ens_weights[best_idx]}")