# <p style="padding:15px; background-color:#2D3142; font-family:JetBrains Mono; font-weight:bold; color:#f2f2f0; font-size:100%; letter-spacing: 2px; text-align:center; border-radius: 10px 10px">Playground Series S3E15</p>

In [1]:
# %load ../initial_settings2.py
import os
import shutil
import subprocess
import sys
import warnings
from pathlib import Path

# The presence of KAGGLE_KERNEL_RUN_TYPE is used as the "running on Kaggle" signal.
ON_KAGGLE = "KAGGLE_KERNEL_RUN_TYPE" in os.environ
if ON_KAGGLE:
    # Both settings are applied before the heavy imports below run
    # (presumably to keep the hosted notebook output free of log noise).
    os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
    warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import tensorflow as tf
import tensorflow_datasets as tfds

# Sub-modules and so on.
from colorama import Fore, Style
from IPython.core.display import HTML
from IPython.display import display_html
from keras import layers
from plotly.subplots import make_subplots
from tensorflow import keras

# Short alias for the Keras backend module.
K = keras.backend

# Colorama settings: each style is the bright variant of one foreground color.
# The default text color depends on where the notebook runs.
CLR = Style.BRIGHT + (Fore.BLACK if ON_KAGGLE else Fore.WHITE)
RED, BLUE, CYAN = (Style.BRIGHT + shade for shade in (Fore.RED, Fore.BLUE, Fore.CYAN))
RESET = Style.RESET_ALL

# Colors
NOTEBOOK_PALETTE = dict(Coral="#FF7F51", DarkNavy="#2D3142", SlateBlue="#8C92AC")
DF_CMAP = sns.light_palette(NOTEBOOK_PALETTE["SlateBlue"], as_cmap=True)
FONT_COLOR = "#141B4D"
BACKGROUND_COLOR = "#F6F5F5"

# Matplotlib rc overrides, grouped by the element they style.
_AXES_RC = {
    "axes.labelcolor": FONT_COLOR,
    "axes.labelsize": 10,
    "axes.labelpad": 15,
    "axes.labelweight": "bold",
    "axes.titlesize": 14,
    "axes.titleweight": "bold",
    "axes.titlepad": 15,
}
_TICKS_RC = {
    "xtick.labelsize": 10,
    "xtick.color": FONT_COLOR,
    "ytick.labelsize": 10,
    "ytick.color": FONT_COLOR,
}
_FIGURE_RC = {
    "figure.titlesize": 14,
    "figure.titleweight": "bold",
    "figure.facecolor": BACKGROUND_COLOR,
    "figure.edgecolor": BACKGROUND_COLOR,
    "figure.dpi": 72,  # Locally Seaborn uses 72, meanwhile Kaggle 96.
}
_TEXT_RC = {
    "font.size": 10,
    "font.family": "Serif",
    "text.color": FONT_COLOR,
}
# Merged in this order so the keys keep their original sequence.
MY_RC = {**_AXES_RC, **_TICKS_RC, **_FIGURE_RC, **_TEXT_RC}

sns.set_theme(rc=MY_RC)


# Utility functions.
def download_dataset_from_kaggle(user, dataset, directory):
    """Download and unpack a Kaggle dataset into ``directory`` unless already present.

    The archive ``<dataset>.zip`` stored inside ``directory`` acts as the
    "already downloaded" marker, so repeated calls are cheap no-ops.

    Parameters
    ----------
    user : str
        Kaggle account owning the dataset.
    dataset : str
        Dataset slug. The download is expected to appear as
        ``<dataset>.zip`` in the current working directory.
    directory : str or pathlib.Path
        Folder that receives both the extracted files and the archive.

    Raises
    ------
    subprocess.CalledProcessError
        If the ``kaggle`` command exits with a non-zero status.
    """
    directory = Path(directory)
    archive_name = dataset + ".zip"

    if (directory / archive_name).is_file():
        return

    # An argument list avoids whitespace-splitting surprises; check=True makes
    # a failed download fail here instead of later as a confusing unpack error.
    subprocess.run(
        ["kaggle", "datasets", "download", "-d", user + "/" + dataset],
        check=True,
    )
    directory.mkdir(parents=True, exist_ok=True)
    # Unpack and archive into the requested directory (not a fixed "data"
    # folder), so the existence check above matches where the files end up.
    shutil.unpack_archive(archive_name, str(directory))
    shutil.move(archive_name, str(directory))


def download_competition_from_kaggle(competition, directory="data"):
    """Download and unpack a Kaggle competition bundle unless already present.

    The archive ``<competition>.zip`` stored inside ``directory`` acts as the
    "already downloaded" marker, so repeated calls are cheap no-ops.

    Parameters
    ----------
    competition : str
        Competition slug. The download is expected to appear as
        ``<competition>.zip`` in the current working directory.
    directory : str or pathlib.Path, default "data"
        Folder that receives both the extracted files and the archive.

    Raises
    ------
    subprocess.CalledProcessError
        If the ``kaggle`` command exits with a non-zero status.
    """
    directory = Path(directory)
    archive_name = competition + ".zip"

    if (directory / archive_name).is_file():
        return

    # check=True surfaces a failed download (bad credentials, rules not
    # accepted, ...) at its source rather than as a missing-archive error.
    subprocess.run(
        ["kaggle", "competitions", "download", "-c", competition],
        check=True,
    )
    directory.mkdir(parents=True, exist_ok=True)
    shutil.unpack_archive(archive_name, str(directory))
    shutil.move(archive_name, str(directory))


# Html `code` block highlight.
_CODE_HIGHLIGHT_STYLE = """
<style>
code {
    background: rgba(42, 53, 125, 0.10) !important;
    border-radius: 4px !important;
}
</style>
"""
# Kept as the closing expression of the cell so the stylesheet is displayed.
HTML(_CODE_HIGHLIGHT_STYLE)


<b><span style="font-size:20px; font-family:JetBrains Mono; margin-left: 5px;">
    Competition Description 📜
</span></b>
<p style="font-size:16px;font-family: JetBrains Mono; margin-left: 25px; margin-right: 25px; margin-top: 15px; margin-bottom: 20px">
    The dataset for this competition (both train and test) was generated from a deep learning model trained on the <a href="https://www.kaggle.com/datasets/saurabhshahane/predicting-heat-flux">Predicting Critical Heat Flux</a> dataset. Feature distributions are close to, but not exactly the same as, the original. Feel free to use the original dataset as part of this competition, both to explore differences as well as to see whether incorporating the original in training improves model performance.
</p>
<b><span style="font-size:20px; font-family:JetBrains Mono; margin-left: 5px;">
    Task 🕵
</span></b>
<p style="font-size:16px;font-family: JetBrains Mono; margin-left: 25px; margin-right: 25px; margin-top: 15px; margin-bottom: 20px">
    Impute the missing values of the <code>x_e_out [-]</code> feature.
</p>
<b><span style="font-size:20px;font-family:JetBrains Mono; margin-left: 5px;">
    This Notebook Covers 📔
</span></b>
<ul style="font-size:16px; font-family: JetBrains Mono; margin-left: 10px; margin-right: 15px; margin-top: 15px; margin-bottom: 20px">
    <li></li>
</ul>
<b><span style="font-size:20px;font-family:JetBrains Mono; margin-left: 5px;">
    See More Here 📈
</span></b>
<p style="font-size:16px; font-family: JetBrains Mono; margin-left: 25px; margin-right: 25px; margin-top: 15px; margin-bottom: 20px">
    
</p>

# <p style="padding:15px; background-color:#2D3142; font-family:JetBrains Mono; font-weight:bold; color:#f2f2f0; font-size:100%; letter-spacing: 2px; text-align:center; border-radius: 10px 10px">EDA</p>

<p style="font-size:20px; font-family:JetBrains Mono; border-bottom: 3px solid #FF7F51; margin-left: 5px; margin-right: 5px;"><b>Notes</b> 📜</p>
<ul style="font-size:16px; font-family:JetBrains Mono; margin-right: 10px; margin-top: 2px; margin-bottom: 2px">
    <li></li>
</ul>

In [2]:
competition = "playground-series-s3e15"

if ON_KAGGLE:
    # On Kaggle the competition files are read from the attached input folder.
    data_path = f"/kaggle/input/{competition}/data.csv"
else:
    # Locally the bundle is fetched (if needed) into ./data first.
    download_competition_from_kaggle(competition)
    data_path = "data/data.csv"

# The `id` column becomes the frame's index.
data = pd.read_csv(data_path, index_col="id")


In [3]:
# Preview the first rows to check the column names and spot missing entries.
data.head()


Unnamed: 0_level_0,author,geometry,pressure [MPa],mass_flux [kg/m2-s],x_e_out [-],D_e [mm],D_h [mm],length [mm],chf_exp [MW/m2]
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,Thompson,tube,7.0,3770.0,0.1754,,10.8,432.0,3.6
1,Thompson,tube,,6049.0,-0.0416,10.3,10.3,762.0,6.2
2,Thompson,,13.79,2034.0,0.0335,7.7,7.7,457.0,2.5
3,Beus,annulus,13.79,3679.0,-0.0279,5.6,15.2,2134.0,3.0
4,,tube,13.79,686.0,,11.1,11.1,457.0,2.8


In [4]:
# Print per-column dtypes and non-null counts.
data.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 31644 entries, 0 to 31643
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   author               26620 non-null  object 
 1   geometry             26144 non-null  object 
 2   pressure [MPa]       27192 non-null  float64
 3   mass_flux [kg/m2-s]  26853 non-null  float64
 4   x_e_out [-]          21229 non-null  float64
 5   D_e [mm]             26156 non-null  float64
 6   D_h [mm]             27055 non-null  float64
 7   length [mm]          26885 non-null  float64
 8   chf_exp [MW/m2]      31644 non-null  float64
dtypes: float64(7), object(2)
memory usage: 2.4+ MB


In [5]:
# Summary statistics transposed to one row per described column, with
# title-cased statistic names, shaded with the notebook colormap.
describe_table = data.describe().T.rename(columns=str.title)
describe_table.style.background_gradient(DF_CMAP)


Unnamed: 0,Count,Mean,Std,Min,25%,50%,75%,Max
pressure [MPa],27192.0,10.640747,4.333683,0.1,6.89,11.07,13.79,20.68
mass_flux [kg/m2-s],26853.0,3068.011023,1777.03208,0.0,1519.0,2731.0,4069.0,7975.0
x_e_out [-],21229.0,-0.000453,0.100911,-0.8667,-0.0466,0.0038,0.0648,0.232
D_e [mm],26156.0,8.629255,5.185692,1.0,5.0,7.8,10.8,37.5
D_h [mm],27055.0,14.17433,19.838489,1.0,5.6,10.0,11.5,120.0
length [mm],26885.0,832.987391,672.299239,10.0,318.0,610.0,914.0,3048.0
chf_exp [MW/m2],31644.0,3.796985,1.983991,0.8,2.4,3.4,4.6,19.3


In [6]:
# String labels (rather than booleans) for whether `x_e_out [-]` is missing;
# the plots below use this column for grouping and coloring.
x_e_out_is_missing = data["x_e_out [-]"].isna()
data["x_e_out_missing"] = x_e_out_is_missing.map({True: "True", False: "False"})


In [7]:
# Donut chart of the share of rows where `x_e_out [-]` is missing.
fig = px.pie(
    data,
    names="x_e_out_missing",
    title="Imputation Target Overview - x_e_out [-]",
    hole=0.65,
    width=840,
    height=520,
    color_discrete_sequence=[NOTEBOOK_PALETTE["DarkNavy"], NOTEBOOK_PALETTE["Coral"]],
)
# Slice labels sit outside the ring; the wide border in the background color
# visually separates the slices.
fig.update_traces(
    textinfo="percent+label",
    textposition="outside",
    textfont_size=16,
    hovertemplate=None,
    rotation=20,
    marker_line_color=BACKGROUND_COLOR,
    marker_line_width=15,
)
fig.update_layout(
    title_font_size=18,
    font_color=FONT_COLOR,
    paper_bgcolor=BACKGROUND_COLOR,
    plot_bgcolor=BACKGROUND_COLOR,
    showlegend=False,
)
# Caption placed in the middle of the donut hole (paper coordinates).
fig.add_annotation(
    text="Missing Values",
    x=0.5,
    y=0.5,
    xref="paper",
    yref="paper",
    align="center",
    showarrow=False,
    font_size=22,
)
fig.show()


In [8]:
# Pairwise scatter plots of every numeric column, colored by whether
# `x_e_out [-]` is missing in the row.
numeric_columns = data.select_dtypes("number").columns
fig = px.scatter_matrix(
    data,
    dimensions=numeric_columns,
    color="x_e_out_missing",
    title="Numerical Features - Scatter Pair Plots",
    color_discrete_sequence=[NOTEBOOK_PALETTE["SlateBlue"], NOTEBOOK_PALETTE["Coral"]],
    symbol_sequence=["x-thin", "cross-thin"],
    opacity=0.2,
    height=840,
    width=840,
)
# Only the lower triangle is drawn; the diagonal and upper half are hidden.
fig.update_traces(
    marker_size=3,
    showupperhalf=False,
    diagonal_visible=False,
)
# Horizontal legend anchored to the top-right corner of the plot area.
legend_settings = {
    "orientation": "h",
    "xanchor": "right",
    "yanchor": "bottom",
    "x": 1,
    "y": 1,
    "itemsizing": "constant",
}
fig.update_layout(
    title_font_size=18,
    font_size=9,
    font_color=FONT_COLOR,
    paper_bgcolor=BACKGROUND_COLOR,
    plot_bgcolor=BACKGROUND_COLOR,
    showlegend=True,
    legend=legend_settings,
)
fig.show()


In [192]:
# Correlations between numeric columns, rounded for display.
corr = data.corr(numeric_only=True).round(2)
# Boolean mask selecting the diagonal and everything above it.
mask = np.triu(np.ones(corr.shape, dtype=bool))
# Blank the masked cells, then drop the row and column left entirely empty.
masked_corr = corr.mask(mask)
masked_corr = masked_corr.dropna(axis=0, how="all")
masked_corr = masked_corr.dropna(axis=1, how="all")

heatmap = go.Heatmap(
    x=masked_corr.columns,
    y=masked_corr.index,
    z=masked_corr,
    # Masked cells get an empty label instead of "nan".
    text=masked_corr.fillna(""),
    texttemplate="%{text}",
    colorscale=px.colors.sequential.matter_r,
    showscale=True,
    colorbar_len=1.03,
    xgap=5,
    ygap=5,
    hoverinfo="none",
)
fig = go.Figure(data=heatmap)
fig.update_layout(
    title="Correlation Matrix - Lower Triangular",
    title_font_size=18,
    font_color=FONT_COLOR,
    paper_bgcolor=BACKGROUND_COLOR,
    plot_bgcolor=BACKGROUND_COLOR,
    height=720,
    width=720,
    xaxis=dict(showgrid=False),
    # Reversed y axis puts the first row at the top, like a printed matrix.
    yaxis=dict(showgrid=False, autorange="reversed"),
)
fig.show()
