In [1]:
# %load ../general_settings.py
import glob
import os
import shutil
import subprocess
import warnings
from collections import defaultdict, namedtuple
from copy import copy
from functools import partial
from itertools import chain, combinations, product
from pathlib import Path
from time import strftime

# The presence of the KAGGLE_KERNEL_RUN_TYPE environment variable is used as
# the marker for "this notebook is running on Kaggle".
ON_KAGGLE = "KAGGLE_KERNEL_RUN_TYPE" in os.environ
if ON_KAGGLE:
    # Silence all warnings for Kaggle runs.
    warnings.filterwarnings("ignore")

import joblib
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
import scipy.stats as stats
import seaborn as sns
import shap
from colorama import Fore, Style
from IPython.core.display import HTML, display_html
from plotly.subplots import make_subplots
from scipy.cluster.hierarchy import linkage
from scipy.spatial.distance import squareform

# Colorama settings: every console colour is the bright variant of a foreground colour.
# The default text colour is black on Kaggle and white everywhere else.
CLR = Style.BRIGHT + (Fore.BLACK if ON_KAGGLE else Fore.WHITE)
RED, BLUE, CYAN = (Style.BRIGHT + shade for shade in (Fore.RED, Fore.BLUE, Fore.CYAN))
RESET = Style.RESET_ALL

# Base palette shared by the notebook's figures and tables.
FONT_COLOR = "#010D36"
BACKGROUND_COLOR = "#F6F5F5"

# Table style rules, each a mapping of a CSS selector to its properties.
# For row hover use <tr> instead of <td> in the CELL_HOVER selector.
CELL_HOVER = dict(
    selector="td:hover",
    props="background-color: #F6F5F5",
)
TEXT_HIGHLIGHT = dict(
    selector="td",
    props="color: #FF2079; font-weight: bold",
)
INDEX_NAMES = dict(
    selector=".index_name",
    props="font-style: italic; background-color: #010D36; color: #F2F2F0;",
)
HEADERS = dict(
    selector="th:not(.index_name)",
    props="font-style: italic; background-color: #010D36; color: #F2F2F0;",
)
# Default bundle of rules; CELL_HOVER is deliberately not part of it.
DF_STYLE = (INDEX_NAMES, HEADERS, TEXT_HIGHLIGHT)
# Colormap built by seaborn's `light_palette` from the base colour "#D4D0A9".
# NOTE(review): presumably meant for DataFrame background gradients - no use is visible here.
DF_CMAP = sns.light_palette("#D4D0A9", as_cmap=True)

# Utility functions.
def download_dataset_from_kaggle(user, dataset, directory):
    """Download a Kaggle dataset archive and unpack it into `directory`.

    Nothing is done when `<directory>/<dataset>.zip` already exists, so the
    function can be called on every notebook run.

    Args:
        user: Kaggle user name that owns the dataset.
        dataset: Dataset slug; the archive is expected as `<dataset>.zip` in
            the current working directory once the CLI has finished.
        directory: Target directory (`str` or `Path`) that receives both the
            extracted files and the archive itself.

    Raises:
        subprocess.CalledProcessError: If the `kaggle` command exits with a
            non-zero status.
    """
    directory = Path(directory)
    archive_name = dataset + ".zip"
    if (directory / archive_name).is_file():
        return

    # An explicit argument list avoids whitespace splitting of the inputs, and
    # `check=True` stops right here on a failed download instead of failing
    # later with a confusing "archive not found" error.
    subprocess.run(["kaggle", "datasets", "download", "-d", f"{user}/{dataset}"], check=True)
    directory.mkdir(parents=True, exist_ok=True)
    shutil.unpack_archive(archive_name, str(directory))
    # Keep the archive next to its contents; it doubles as the "already downloaded" marker.
    shutil.move(archive_name, str(directory))


def download_competition_from_kaggle(competition):
    """Download a Kaggle competition archive and unpack it into `data/`.

    Nothing is done when `data/<competition>.zip` already exists, so the
    function can be called on every notebook run.

    Args:
        competition: Competition slug; the archive is expected as
            `<competition>.zip` in the current working directory once the CLI
            has finished.

    Raises:
        subprocess.CalledProcessError: If the `kaggle` command exits with a
            non-zero status.
    """
    data_dir = Path("data")
    archive_name = competition + ".zip"
    if (data_dir / archive_name).is_file():
        return

    # An explicit argument list avoids whitespace splitting of the input, and
    # `check=True` stops right here on a failed download instead of failing
    # later with a confusing "archive not found" error.
    subprocess.run(["kaggle", "competitions", "download", "-c", competition], check=True)
    data_dir.mkdir(parents=True, exist_ok=True)
    shutil.unpack_archive(archive_name, "data")
    # Keep the archive next to its contents; it doubles as the "already downloaded" marker.
    shutil.move(archive_name, "data")


def interpolate_color(color1, color2, t):
    """Blend two `#RRGGBB` colours channel by channel.

    Each channel is computed as `c1 + (c2 - c1) * t` and truncated to an
    integer, so `t = 0` gives the channels of `color1` and `t = 1` those of
    `color2`. The result is formatted as an upper-case `#RRGGBB` string.
    """

    def channels(hex_color):
        # Characters 1-2, 3-4 and 5-6 hold the red, green and blue bytes.
        return [int(hex_color[start:start + 2], 16) for start in (1, 3, 5)]

    r, g, b = (
        int(first + (second - first) * t)
        for first, second in zip(channels(color1), channels(color2))
    )
    return f"#{r:02X}{g:02X}{b:02X}"


def get_interpolated_colors(color1, color2, num_colors=2):
    """Return a gradient from `color1` to `color2` as a list of HEX strings.

    The list holds both endpoints plus `num_colors` shades in between, i.e.
    `num_colors + 2` entries in total. Arguments need to be HEX.
    """
    total = num_colors + 2
    last_step = total - 1
    return [interpolate_color(color1, color2, step / last_step) for step in range(total)]


# Html `code` block highlight. Must be included at the end of all imports!
# The call stays the last expression so the stylesheet is the rendered output.
_CODE_HIGHLIGHT_CSS = """
<style>
code {
    background: rgba(42, 53, 125, 0.10) !important;
    border-radius: 4px !important;
}
a {
    color: rgba(123, 171, 237, 1.0) !important;
}
</style>
"""
HTML(_CODE_HIGHLIGHT_CSS)


In [2]:
# Kaggle dataset (owner and slug) with the public notebook submissions, and
# the local folder it is downloaded into.
user, dataset = "yeoyunsianggeremie", "s3e19-top-public-notebook-submissions"
directory = Path("best_public_notebooks")

download_dataset_from_kaggle(user=user, dataset=dataset, directory=directory)


In [3]:
# Collect the downloaded submission files. `glob` returns paths in arbitrary
# order, so sort them to get the same column order on every run.
best_public_lb_paths = sorted(glob.glob(str(directory / "*.csv")))
best_lbs = pd.concat(map(partial(pd.read_csv, index_col="id"), best_public_lb_paths), axis=1)
# Label each column with the bare file name. `Path(...).name` uses the path
# separator of the current OS; splitting on "\\" only stripped the directory
# on Windows and left full paths as column names elsewhere.
best_lbs.columns = [Path(path).name for path in best_public_lb_paths]
best_lbs.head()


Unnamed: 0_level_0,bogoconic1_48.86.csv,bogoconic1_6.48243.csv,bogoconic1_7.48391.csv,chingiznurzhanov_6.90364.csv,christph_5.19836.csv,iqbal_6.41209.csv,kdmitrie_6.14867.csv,nivedithavudayagiri_38.87.csv,oscar_6.14720.csv,oscar_no_postprocessing.csv,paddykb_5.18855.csv,paddykb_no_postprocessing.csv,tetsutani_7.21382.csv,weichens_6.66950.csv,zhukovoleksiy_6.56112.csv
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
136950,36.130182,153.0,157.0,177,160,139,129.809581,45.57409,133.0,32,160,47.433048,149.0,176.276589,162.0
136951,37.548315,159.0,161.0,180,160,140,127.939697,44.848445,130.0,32,160,47.59679,150.0,181.585009,163.0
136952,5.449291,23.0,21.0,28,24,20,19.908087,6.544573,20.0,5,24,7.197306,23.0,11.373482,28.0
136953,34.465968,146.0,152.0,168,149,128,121.741967,43.928529,127.0,27,149,44.235366,139.0,168.050173,155.0
136954,28.533556,121.0,118.0,143,122,109,98.894905,32.658667,103.0,24,122,36.077329,116.0,140.796397,133.0


In [4]:
# Exclude four submissions from the blend: the two `*_no_postprocessing`
# files and the two whose file names end in 48.86 / 38.87.
# NOTE(review): the numeric suffix presumably is the public score - confirm.
best_lbs = best_lbs.drop(
    columns=[
        "bogoconic1_48.86.csv",
        "nivedithavudayagiri_38.87.csv",
        "oscar_no_postprocessing.csv",
        "paddykb_no_postprocessing.csv",
    ]
)
best_lbs.head()


Unnamed: 0_level_0,bogoconic1_6.48243.csv,bogoconic1_7.48391.csv,chingiznurzhanov_6.90364.csv,christph_5.19836.csv,iqbal_6.41209.csv,kdmitrie_6.14867.csv,oscar_6.14720.csv,paddykb_5.18855.csv,tetsutani_7.21382.csv,weichens_6.66950.csv,zhukovoleksiy_6.56112.csv
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
136950,153.0,157.0,177,160,139,129.809581,133.0,160,149.0,176.276589,162.0
136951,159.0,161.0,180,160,140,127.939697,130.0,160,150.0,181.585009,163.0
136952,23.0,21.0,28,24,20,19.908087,20.0,24,23.0,11.373482,28.0
136953,146.0,152.0,168,149,128,121.741967,127.0,149,139.0,168.050173,155.0
136954,121.0,118.0,143,122,109,98.894905,103.0,122,116.0,140.796397,133.0


In [5]:
# Blend the remaining submissions with a plain row-wise mean.
# NOTE: the int32 cast drops the fractional part (truncation, not rounding).
submission = (
    best_lbs.mean(axis=1)
    .astype(np.int32)
    .to_frame("num_sold")
    .rename_axis("id")
)

submission.to_csv("submission.csv")
submission.head()


Unnamed: 0_level_0,num_sold
id,Unnamed: 1_level_1
136950,154
136951,155
136952,22
136953,145
136954,120


In [6]:
competition = "playground-series-s3e19"

# On Kaggle the files are read from /kaggle/input; elsewhere the competition
# archive is downloaded first and read from the local `data` folder.
if ON_KAGGLE:
    train_path = f"/kaggle/input/{competition}/train.csv"
    test_path = f"/kaggle/input/{competition}/test.csv"
else:
    download_competition_from_kaggle(competition)
    train_path = "data/train.csv"
    test_path = "data/test.csv"

# Both files are loaded the same way: `id` as index, `date` parsed as
# datetime, and column names converted to Title Case.
train, test = (
    pd.read_csv(csv_path, index_col="id", parse_dates=["date"]).rename(columns=str.title)
    for csv_path in (train_path, test_path)
)


In [7]:
# Attach each test row's country to its prediction. Joining onto the
# `num_sold` column only keeps the cell re-runnable: joining onto a frame that
# already has `Country` raises "columns overlap but no suffix specified".
submission = submission[["num_sold"]].join(test["Country"])

In [8]:
# Mean blended prediction over the rows whose country is Canada.
submission.loc[submission["Country"] == "Canada", "num_sold"].mean()

261.42867579908676