In [1]:
# %load ../general_settings.py
import glob
import os
import shutil
import subprocess
import warnings
from array import array
from collections import defaultdict, namedtuple
from copy import copy
from functools import partial
from itertools import chain, combinations, product
from pathlib import Path
from time import strftime

# True when the `KAGGLE_KERNEL_RUN_TYPE` environment variable is set.
ON_KAGGLE = "KAGGLE_KERNEL_RUN_TYPE" in os.environ
if ON_KAGGLE:
    # Installed ahead of the third-party imports that follow, so the filter is
    # already active while those packages are being imported.
    warnings.filterwarnings(action="ignore")

import joblib
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
import scipy.stats as stats
import seaborn as sns
import shap
import swifter
from colorama import Fore, Style
from IPython.core.display import HTML, display_html
from plotly.subplots import make_subplots
from scipy.cluster.hierarchy import linkage
from scipy.spatial.distance import squareform

# Colorama settings: escape-sequence prefixes used to colour printed text.
# `CLR` is the default text colour: bright black on Kaggle, bright white elsewhere.
CLR = (Style.BRIGHT + Fore.BLACK) if ON_KAGGLE else (Style.BRIGHT + Fore.WHITE)
RED = Style.BRIGHT + Fore.RED
BLUE = Style.BRIGHT + Fore.BLUE
CYAN = Style.BRIGHT + Fore.CYAN
# Resets every colorama style; append it after coloured text.
RESET = Style.RESET_ALL

# Base HEX colours of the notebook theme (the same values recur in the CSS rules below).
FONT_COLOR = "#010D36"
BACKGROUND_COLOR = "#FFFCFA"

# CSS rules expressed as {"selector": ..., "props": ...} dictionaries.
# NOTE(review): presumably consumed by pandas `Styler.set_table_styles` - confirm at the call sites.
CELL_HOVER = {  # for row hover use <tr> instead of <td>
    "selector": "td:hover",
    "props": "background-color: #FFFCFA",
}
# Bold, pink text for every data cell.
TEXT_HIGHLIGHT = {
    "selector": "td",
    "props": "color: #FF2079; font-weight: bold",
}
# Italic, light-on-dark styling for the index name cells.
INDEX_NAMES = {
    "selector": ".index_name",
    "props": "font-style: italic; background-color: #010D36; color: #F2F2F0;",
}
# Same styling for all remaining header cells.
HEADERS = {
    "selector": "th:not(.index_name)",
    "props": "font-style: italic; background-color: #010D36; color: #F2F2F0;",
}
# Default bundle of table rules. NOTE(review): `CELL_HOVER` is not part of it.
DF_STYLE = (INDEX_NAMES, HEADERS, TEXT_HIGHLIGHT)
# Light sequential colormap derived from a single base colour.
DF_CMAP = sns.light_palette("#D4D0A9", as_cmap=True)

# Utility functions.
def download_from_kaggle(expr: list[str], directory: Path | None = None) -> None:
    if not directory:
        directory = Path("data")
    if not isinstance(directory, Path):
        raise TypeError("The `directory` argument must be `Path` instance!")
    match expr:
        case ["kaggle", _, "download", *args] if args:
            directory.parent.mkdir(parents=True, exist_ok=True)
            filename = args[-1].split("/")[-1] + ".zip"
            if not (directory / filename).is_file():
                subprocess.run(expr)
                shutil.unpack_archive(filename, directory)
                shutil.move(filename, directory)
        case _:
            raise SyntaxError("Invalid expression!")


def interpolate_color(color1, color2, t):
    """Blend two ``#RRGGBB`` colours linearly and return the result as HEX.

    `t` is the blend factor: 0 yields `color1`, 1 yields `color2`. Each channel
    is interpolated separately and truncated to an integer.
    """
    offsets = (1, 3, 5)  # start positions of the R, G and B hex pairs
    start = [int(color1[pos : pos + 2], 16) for pos in offsets]
    stop = [int(color2[pos : pos + 2], 16) for pos in offsets]
    blended = [int(lo + (hi - lo) * t) for lo, hi in zip(start, stop)]
    return "#" + "".join(f"{channel:02X}" for channel in blended)


def get_interpolated_colors(color1, color2, num_colors=2):
    """Return a gradient of HEX colours running from `color1` to `color2`.

    The result holds both end points plus `num_colors` evenly spaced colours
    between them, i.e. ``num_colors + 2`` entries in total.
    Arguments need to be HEX (``#RRGGBB``)."""
    total = num_colors + 2
    last_step = total - 1
    gradient = []
    for step in range(total):
        gradient.append(interpolate_color(color1, color2, step / last_step))
    return gradient


# Html `code` block highlight. Must be included at the end of all imports!
# The injected stylesheet tints the background of `code` elements, rounds their
# corners and recolours links.
# NOTE(review): the `HTML` object is a bare expression; presumably it has to stay
# the last statement of the cell so that the notebook renders it - confirm.
HTML(
    """
<style>
code {
    background: rgba(42, 53, 125, 0.10) !important;
    border-radius: 4px !important;
}
a {
    color: rgba(123, 171, 237, 1.0) !important;
}
</style>
"""
)


In [2]:
# Kaggle dataset holding the public submissions, addressed as "<user>/<dataset>".
user = "yeoyunsianggeremie"
dataset = "s3e20-top-public-notebook-submissions"

# Kaggle CLI command as an argument list, plus the folder the files end up in.
expr = ["kaggle", "datasets", "download", "-d", f"{user}/{dataset}"]
directory = Path("best_public_notebooks")

download_from_kaggle(expr=expr, directory=directory)


In [3]:
# Sorted, because `glob` yields files in arbitrary order and the column order of
# `best_lbs` should be identical on every run and platform.
best_public_lb_paths = sorted(glob.glob(str(directory / "*.csv")))

# One column per submission file, all aligned on the shared row identifier.
best_lbs = pd.concat(
    map(
        partial(pd.read_csv, index_col="ID_LAT_LON_YEAR_WEEK"),
        best_public_lb_paths,
    ),
    axis=1,
)
# Label every column with the bare file name. `Path(...).name` handles the path
# separator of the current OS, whereas splitting on "\\" only works on Windows
# and leaves the folder prefix in the labels everywhere else.
best_lbs.columns = [Path(name).name for name in best_public_lb_paths]
best_lbs.head()


Unnamed: 0_level_0,ansh_21.32_W2.csv,arunklenin_26.10_W1.csv,arvind_21.29_W2.csv,bassem_28.52_W1.csv,bogoconic1_19.70_W3.csv,bogoconic1_22.33_W2.csv,chunfu_18.32.csv,chunfu_21.32_W2.csv,chunfu_23.97_W1.csv,dmitry_20.88_W2.csv,itachi_22.34_W2.csv,jean_27.05_W1.csv,kdmitrie_22.97_W2.csv,lucas_21.88_W2.csv,mateuszk013_23.85.csv,randomdraw_23.02_W2.csv,satya_23.06_W3.csv,satya_28.15_W1.csv,syerramilli_26.50_W1.csv
ID_LAT_LON_YEAR_WEEK,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
ID_-0.510_29.290_2022_00,4.252746,4.036974,4.22808,3.929144,3.753601,3.753601,3.753601,4.252746,4.03724,4.004916,3.753601,3.586367,3.723572,5.097905,4.121728,3.753601,3.774066,2.374829,4.239101
ID_-0.510_29.290_2022_01,4.425709,4.490421,4.40004,4.189758,4.051966,4.051966,4.051966,4.425709,4.203962,4.277571,4.051966,4.268736,4.01955,5.680008,4.121728,4.051966,4.114785,3.799547,4.41416
ID_-0.510_29.290_2022_02,4.484926,4.989249,4.458913,4.320027,4.231381,4.231381,4.231381,4.484926,4.236259,4.375822,4.541696,4.334028,4.19753,5.830926,4.121728,4.231381,4.276951,3.953253,4.448072
ID_-0.510_29.290_2022_03,4.491671,4.394686,4.465619,4.416591,4.305286,4.305286,4.305286,4.491671,4.247674,4.414508,4.730381,4.376069,4.270843,5.916702,4.152686,4.305286,4.352062,4.024823,4.460058
ID_-0.510_29.290_2022_04,4.531024,4.318355,4.504744,4.444601,4.347317,4.347317,4.347317,4.531024,4.240461,4.453966,4.825974,4.45438,4.312538,5.970254,4.216769,4.347317,4.393192,4.405814,4.452485


In [5]:
# File names (= column labels of `best_lbs`) of the submissions kept for blending.
# Candidate currently left out: "bogoconic1_19.70_W3.csv".
submissions_to_include = ["mateuszk013_23.85.csv", "chunfu_18.32.csv"]

# Keep only the chosen columns, in the order listed above.
best_lbs = best_lbs.loc[:, submissions_to_include]
best_lbs.head()


Unnamed: 0_level_0,mateuszk013_23.85.csv,chunfu_18.32.csv
ID_LAT_LON_YEAR_WEEK,Unnamed: 1_level_1,Unnamed: 2_level_1
ID_-0.510_29.290_2022_00,4.121728,3.753601
ID_-0.510_29.290_2022_01,4.121728,4.051966
ID_-0.510_29.290_2022_02,4.121728,4.231381
ID_-0.510_29.290_2022_03,4.152686,4.305286
ID_-0.510_29.290_2022_04,4.216769,4.347317


In [9]:
# Blend weights keyed by submission file name, so that every weight stays
# attached to its column regardless of the column order in `best_lbs`.
blend_weights = {
    "mateuszk013_23.85.csv": 2,
    "chunfu_18.32.csv": 1,
}

# Fail loudly when the selected submissions and the weights drift apart instead
# of silently weighting the wrong column.
if set(best_lbs.columns) != set(blend_weights):
    raise ValueError(
        "Columns of `best_lbs` do not match the keys of `blend_weights`: "
        f"{sorted(set(best_lbs.columns) ^ set(blend_weights))}"
    )

# Row-wise weighted average of the selected submissions.
submission = pd.DataFrame(
    {
        "ID_LAT_LON_YEAR_WEEK": best_lbs.index,
        "Emission": np.average(
            best_lbs[list(blend_weights)],
            axis=1,
            weights=list(blend_weights.values()),
        ),
    }
).set_index("ID_LAT_LON_YEAR_WEEK")

submission.to_csv("submission_blend.csv")
submission.head()


Unnamed: 0_level_0,Emission
ID_LAT_LON_YEAR_WEEK,Unnamed: 1_level_1
ID_-0.510_29.290_2022_00,3.999019
ID_-0.510_29.290_2022_01,4.098474
ID_-0.510_29.290_2022_02,4.158279
ID_-0.510_29.290_2022_03,4.203553
ID_-0.510_29.290_2022_04,4.260285
