# <p style="padding: 15px; background-color: #010D36; font-family: 'JetBrains Mono'; font-weight: bold; font-size: 100%; color: #F2F2F0; letter-spacing: 2px; text-align: center; border-radius: 8px;">ICR - Identifying Age-Related Conditions</p>

In [245]:
import os
import shutil
import subprocess
from collections import defaultdict
from copy import copy
from itertools import product
from functools import reduce
from pathlib import Path

# Sub-modules and so on.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
import seaborn as sns
import scipy.stats as stats
from colorama import Fore, Style
from IPython.core.display import HTML
from IPython.display import display_html
from matplotlib.colors import Colormap
from plotly.subplots import make_subplots
from scipy.cluster.hierarchy import fcluster, linkage
from scipy.spatial.distance import squareform
from scipy.stats import gaussian_kde, probplot
from sklearn.preprocessing import PowerTransformer

# True when the KAGGLE_KERNEL_RUN_TYPE environment variable is set, which is
# used throughout the notebook as the "running on Kaggle" switch.
ON_KAGGLE = os.getenv("KAGGLE_KERNEL_RUN_TYPE") is not None

# Colorama settings: ANSI prefixes prepended to printed text.
# The default text colour is black on Kaggle and white elsewhere
# (presumably to match a light vs. dark output background - confirm).
CLR = (Style.BRIGHT + Fore.BLACK) if ON_KAGGLE else (Style.BRIGHT + Fore.WHITE)
RED = Style.BRIGHT + Fore.RED
BLUE = Style.BRIGHT + Fore.BLUE
CYAN = Style.BRIGHT + Fore.CYAN
RESET = Style.RESET_ALL  # Restores the terminal's default styling.

# Colors shared by the DataFrame styling and the figures below.
DF_CMAP: Colormap = sns.light_palette("#8C92AC", as_cmap=True)  # type: ignore
FONT_COLOR = "#010D36"
BACKGROUND_COLOR = "#F6F5F5"

# CSS rules in the selector/props form passed to `Styler.set_table_styles`.
cell_hover = {  # for row hover use <tr> instead of <td>
    "selector": "td:hover",
    "props": "background-color: #F6F5F5",
}
text_highlight = {
    "selector": "td",
    "props": "color: #FF2079; font-weight: bold",
}
index_names = {
    "selector": ".index_name",
    "props": "font-style: italic; background-color: #010D36; color: #F2F2F0;",
}
headers = {
    "selector": "th:not(.index_name)",
    "props": "font-style: italic; background-color: #010D36; color: #F2F2F0;",
}
# Bundle of all table rules, applied as `.style.set_table_styles(DF_STYLE)`.
DF_STYLE = (cell_hover, index_names, headers, text_highlight)

# Matplotlib rcParams overrides handed to seaborn so that every Matplotlib /
# Seaborn figure uses the notebook's font colour and background.
MY_RC = {
    "axes.labelcolor": FONT_COLOR,
    "axes.labelsize": 10,
    "axes.labelpad": 15,
    "axes.labelweight": "bold",
    "axes.titlesize": 14,
    "axes.titleweight": "bold",
    "axes.titlepad": 15,
    "axes.facecolor": BACKGROUND_COLOR,
    "xtick.labelsize": 10,
    "xtick.color": FONT_COLOR,
    "ytick.labelsize": 10,
    "ytick.color": FONT_COLOR,
    "figure.titlesize": 14,
    "figure.titleweight": "bold",
    "figure.facecolor": BACKGROUND_COLOR,
    "figure.edgecolor": BACKGROUND_COLOR,
    "figure.dpi": 72,  # Locally Seaborn uses 72, meanwhile Kaggle 96.
    "font.size": 10,
    "font.family": "Serif",
    "text.color": FONT_COLOR,
}
# Applies the seaborn theme globally with the overrides above.
sns.set_theme(rc=MY_RC)


# Utility functions.
def download_dataset_from_kaggle(user, dataset, directory):
    """Download and unpack a Kaggle dataset unless its archive is already cached.

    The archive ``<directory>/<dataset>.zip`` acts as the cache marker: when it
    exists, nothing is downloaded. Otherwise the ``kaggle`` CLI is invoked, the
    archive is unpacked into ``directory`` and then moved there, so the next
    call finds it.

    Args:
        user: Kaggle user (owner) of the dataset.
        dataset: Dataset slug; also the base name of the downloaded archive.
        directory: Target directory (``pathlib.Path`` or ``str``).

    Raises:
        subprocess.CalledProcessError: If the ``kaggle`` command exits with a
            non-zero status.
        FileNotFoundError: If the ``kaggle`` executable is not available.
    """
    directory = Path(directory)
    archive_name = dataset + ".zip"
    filepath = directory / archive_name

    if filepath.is_file():
        return

    # The code below expects the CLI to leave `<dataset>.zip` in the current
    # working directory. `check=True` surfaces a failed download here instead
    # of as a confusing unpack error further down.
    subprocess.run(
        ["kaggle", "datasets", "download", "-d", user + "/" + dataset],
        check=True,
    )
    directory.mkdir(parents=True, exist_ok=True)
    # Unpack and store the archive in the same directory the cache check
    # looks at, so a second call is a no-op.
    shutil.unpack_archive(archive_name, str(directory))
    shutil.move(archive_name, str(filepath))


def download_competition_from_kaggle(competition):
    """Download and unpack a Kaggle competition's data into ``data/``.

    The archive ``data/<competition>.zip`` acts as the cache marker: when it
    exists, nothing is downloaded. Otherwise the ``kaggle`` CLI is invoked, the
    archive is unpacked into ``data`` and then moved there.

    Args:
        competition: Competition slug; also the base name of the archive.

    Raises:
        subprocess.CalledProcessError: If the ``kaggle`` command exits with a
            non-zero status.
        FileNotFoundError: If the ``kaggle`` executable is not available.
    """
    data_dir = Path("data")
    archive_name = competition + ".zip"

    if (data_dir / archive_name).is_file():
        return

    # The code below expects the CLI to leave `<competition>.zip` in the
    # current working directory. `check=True` makes a failed download raise
    # here rather than failing later while unpacking a missing archive.
    subprocess.run(
        ["kaggle", "competitions", "download", "-c", competition],
        check=True,
    )
    data_dir.mkdir(parents=True, exist_ok=True)
    shutil.unpack_archive(archive_name, "data")
    shutil.move(archive_name, "data")


# Html `code` block highlight: builds an HTML object carrying a <style> rule
# that restyles every `code` element (background, rounded corners, text color).
# NOTE(review): the object is only rendered if this call is the last
# expression of its cell - confirm nothing follows it in the original cell.
HTML(
    """
<style>
code {
    background: rgba(58, 90, 129, 0.5) !important;
    border-radius: 4px !important;
    color: #f2f2f0 !important;
}
</style>
"""
)


<blockquote style="
    margin-right: auto; 
    margin-left: auto; 
    background-color: #010D36; 
    padding: 15px; 
    border-radius: 8px;
    border-left: none;
">
<p style="
    font-size: 20px;
    font-family: 'JetBrains Mono';
    color: #f2f2f0;
">
    <b>Competition Description</b> 📜
</p>

<p style="
    font-size: 16px;
    font-family: 'JetBrains Mono';
    color: #f2f2f0;
    margin-left: 20px;
    margin-right: 20px;
    margin-bottom: 20px;
">
    <i>The goal of this competition is to predict if a person has any of three medical conditions. You are being asked to predict if the person has one or more of any of the three medical conditions (Class $1$), or none of the three medical conditions (Class $0$). You will create a model trained on measurements of health characteristics.<br><br>
    To determine if someone has these medical conditions requires a long and intrusive process to collect information from patients. With predictive models, we can shorten this process and keep patient details private by collecting key characteristics relative to the conditions, then encoding these characteristics.<br><br>
    Your work will help researchers discover the relationship between measurements of certain characteristics and potential patient conditions.</i>
</p>

<p style="
    font-size: 20px;
    font-family: 'JetBrains Mono';
    color:#f2f2f0;
">
    <b>Context and Task</b> 🕵
</p>

<p style="
    font-size: 16px;
    font-family: 'JetBrains Mono';
    color: #f2f2f0;
    margin-left: 20px;
    margin-right: 20px;
    margin-bottom: 20px;
">
    <i>They say age is just a number but a whole host of health issues come with aging. From heart disease and dementia to hearing loss and arthritis, aging is a risk factor for numerous diseases and complications. The growing field of bioinformatics includes research into interventions that can help slow and reverse biological aging and prevent major age-related ailments. Data science could have a role to play in developing new methods to solve problems with diverse data, even if the number of samples is small.<br><br>
    Currently, models like XGBoost and random forest are used to predict medical conditions yet the models' performance is not good enough. Dealing with critical problems where lives are on the line, models need to make correct predictions reliably and consistently between different cases.<br><br>
    Founded in 2015, competition host InVitro Cell Research, LLC (ICR) is a privately funded company focused on regenerative and preventive personalized medicine. Their offices and labs in the greater New York City area offer state-of-the-art research space. InVitro Cell Research's Scientists are what set them apart, helping guide and defining their mission of researching how to repair aging people fast.<br><br>
    <b>In this competition, you’ll work with measurements of health characteristic data to solve critical problems in bioinformatics. Based on minimal training, you’ll create a model to predict if a person has any of three medical conditions, with an aim to improve on existing methods.</b><br><br>
    You could help advance the growing field of bioinformatics and explore new methods to solve complex problems with diverse data.</i>
</p>

<p style="
    font-size: 20px;
    font-family: 'JetBrains Mono';
    color: #f2f2f0;
">
    <b>This Notebook Covers</b> 📔
</p>

<ul style="
    font-size: 16px;
    font-family: 'JetBrains Mono';
    color: #f2f2f0;
    margin-left: 8px;
    margin-right: 8px;
    margin-bottom: 20px;
"> 
    <li>A quick look at the dataset.</li>
    <li>...</li>
</ul>

<p style="
    font-size: 20px;
    font-family: 'JetBrains Mono';
    color: #f2f2f0;
">
    <b>See More Here</b> 📈
</p>

<p style="
    font-size: 16px;
    font-family: 'JetBrains Mono';
    color: #f2f2f0;
    margin-left: 20px;
    margin-right: 20px;
    margin-bottom: 20px;
">
    <a href="https://www.kaggle.com/competitions/icr-identify-age-related-conditions/overview" style="color: #01CBEE;"><b>ICR - Identifying Age-Related Conditions</b></a>
</p>
</blockquote>

# <p style="padding: 15px; background-color: #010D36; font-family: 'JetBrains Mono'; font-weight: bold; font-size: 100%; color: #f2f2f0; letter-spacing: 2px; text-align: center; border-radius: 8px;">Quick Overview</p>

<p style="
    font-size: 20px;
    font-family: 'JetBrains Mono';
    color: #3E3F4C;
    border-bottom: 3px solid #01CBEE;
">
    <b>Notes</b> 📜
</p>

<blockquote style="
    margin-right: auto; 
    margin-left: auto; 
    background-color: #010D36; 
    padding: 15px; 
    border-radius: 8px;
    border-left: none;
">
<ul style="
    font-size: 16px;
    font-family: 'JetBrains Mono';
    color: #f2f2f0;
    margin-left: 8px;
    margin-right: 8px;
    margin-top: 4px; 
    margin-bottom: 4px;
">
    <li>Let's get started with a short dataset overview.</li>
</ul>
</blockquote>

In [2]:
competition = "icr-identify-age-related-conditions"

# On Kaggle the files are read from the competition's input folder; elsewhere
# they are fetched into the local `data` directory first.
if ON_KAGGLE:
    data_dir = f"/kaggle/input/{competition}"
else:
    download_competition_from_kaggle(competition)
    data_dir = "data"

train_path = f"{data_dir}/train.csv"
test_path = f"{data_dir}/test.csv"
greeks_path = f"{data_dir}/greeks.csv"

# Every table is indexed by `Id`; column names are stripped of surrounding
# whitespace on load.
train, test, greeks = (
    pd.read_csv(csv_path, index_col="Id").rename(columns=str.strip)
    for csv_path in (train_path, test_path, greeks_path)
)


<p style="
    font-size: 20px;
    font-family: 'JetBrains Mono';
    color: #3E3F4C;
    border-bottom: 3px solid #01CBEE;
">
    <b>General Remarks</b> 📔
</p>

<blockquote style="
    margin-right: auto; 
    margin-left: auto; 
    background-color: #010D36; 
    padding: 15px; 
    border-radius: 8px;
    border-left: none;
">

<p style="
    font-size: 16px;
    font-family: 'JetBrains Mono';
    color: #f2f2f0;
    margin-left: 20px;
    margin-right: 20px;
    margin-bottom: 20px;
">
    <b>In the original description, we read that:</b><br><br>
    <i>The competition data comprises over fifty anonymized health characteristics linked to three age-related conditions. Your goal is to predict whether a subject has or has not been diagnosed with one of these conditions - a binary classification problem.<br><br>
    Note that this is a Code Competition, in which the actual test set is hidden. In this version, we give some sample data in the correct format to help you author your solutions. When your submission is scored, this example test data will be replaced with the full test set. There are about $400$ rows in the full test set.</i>
</p>

<p style="
    font-size: 16px;
    font-family: 'JetBrains Mono';
    color: #f2f2f0;
    margin-left: 20px;
    margin-right: 20px;
    margin-bottom: 20px;
">
    <b>Moreover, we know that:</b>
</p>

<ul style="
    font-size: 16px;
    font-family: 'JetBrains Mono';
    color: #f2f2f0;
    margin-left: 8px;
    margin-right: 8px;
    margin-top: 4px; 
    margin-bottom: 4px;
">
    <li><b>train.csv</b> - <i>The training set.</i></li>
    <ul style="
        font-size: 16px;
        font-family: 'JetBrains Mono';
        color: #f2f2f0;
        margin-right: 8px;
    ">
        <li><code>Id</code> - <i>Unique identifier for each observation.</i></li>
        <li><code>AB-GL</code> - <i>Fifty-six anonymized health characteristics. All are numeric except for EJ, which is categorical.</i></li>
        <li><code>Class</code> - <i>A binary target: $1$ indicates the subject has been diagnosed with one of the three conditions, $0$ indicates they have not.</i></li>
    </ul>
    <li><b>test.csv</b> - <i>The test set. Your goal is to predict the probability that a subject in this set belongs to each of the two classes.</i></li>
    <li><b>greeks.csv</b> - <i>Supplemental metadata, only available for the training set.</i></li>
    <ul style="
        font-size: 16px;
        font-family: 'JetBrains Mono';
        color: #f2f2f0;
        margin-right: 8px;
    ">
        <li><code>Alpha</code> - <i>Identifies the type of age-related condition, if present.</i></li>
        <ul style="
            font-size: 16px;
            font-family: 'JetBrains Mono';
            color: #f2f2f0;
            margin-right: 8px;
        ">
            <li><code>A</code> - <i>No age-related condition. Corresponds to class $0$.</i></li>
            <li><code>B</code>, <code>D</code>, <code>G</code> - <i>The three age-related conditions. Correspond to class $1$.</i></li>
        </ul>
        <li><code>Beta</code>, <code>Gamma</code>, <code>Delta</code> - <i>Three experimental characteristics.</i></li>
        <li><code>Epsilon</code> - <i>The date the data for this subject was collected. Note that all of the data in the test set was collected after the training set was collected.</i></li>
    </ul>
</ul>
</blockquote>

In [3]:
# Preview the first rows of the training table with the notebook's table
# styling, formatting numbers to 3 decimals.
train.head().style.set_table_styles(DF_STYLE).format(precision=3)


Unnamed: 0_level_0,AB,AF,AH,AM,AR,AX,AY,AZ,BC,BD,BN,BP,BQ,BR,BZ,CB,CC,CD,CF,CH,CL,CR,CS,CU,CW,DA,DE,DF,DH,DI,DL,DN,DU,DV,DY,EB,EE,EG,EH,EJ,EL,EP,EU,FC,FD,FE,FI,FL,FR,FS,GB,GE,GF,GH,GI,GL,Class
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1
000ff2bfdfe9,0.209,3109.033,85.2,22.394,8.139,0.7,0.026,9.812,5.556,4126.587,22.598,175.639,152.708,823.928,257.432,47.223,0.563,23.388,4.852,0.023,1.05,0.069,13.784,1.302,36.206,69.083,295.571,0.239,0.284,89.246,84.317,29.657,5.311,1.743,23.188,7.294,1.987,1433.167,0.949,B,30.879,78.527,3.828,13.395,10.265,9028.292,3.583,7.298,1.739,0.095,11.339,72.611,2003.81,22.136,69.835,0.12,1
007255e47698,0.145,978.764,85.2,36.969,8.139,3.632,0.026,13.518,1.23,5496.928,19.421,155.868,14.755,51.217,257.432,30.284,0.485,50.628,6.085,0.031,1.114,1.118,28.311,1.357,37.477,70.798,178.553,0.239,0.363,110.582,75.745,37.532,0.006,1.743,17.222,4.926,0.859,1111.287,0.003,A,109.125,95.415,52.26,17.176,0.297,6785.003,10.359,0.173,0.497,0.569,9.293,72.611,27981.563,29.135,32.132,21.978,0
013f2bd269f5,0.47,2635.107,85.2,32.361,8.139,6.733,0.026,12.825,1.23,5135.78,26.483,128.989,219.32,482.142,257.432,32.564,0.496,85.955,5.376,0.036,1.05,0.7,39.365,1.01,21.46,70.82,321.427,0.239,0.21,120.056,65.47,28.053,1.29,1.743,36.861,7.814,8.147,1494.076,0.377,B,109.125,78.527,5.391,224.207,8.745,8338.906,11.627,7.71,0.976,1.199,37.078,88.609,13676.958,28.023,35.193,0.197,0
043ac50845d5,0.252,3819.652,120.202,77.112,8.139,3.685,0.026,11.054,1.23,4169.677,23.658,237.282,11.05,661.519,257.432,15.202,0.718,88.159,2.348,0.029,1.4,0.636,41.117,0.723,21.53,47.276,196.608,0.239,0.292,139.825,71.571,24.355,2.655,1.743,52.004,7.386,3.813,15691.552,0.614,B,31.674,78.527,31.323,59.302,7.884,10965.766,14.852,6.122,0.497,0.284,18.53,82.417,2094.262,39.949,90.493,0.156,0
044fb8a146ec,0.38,3733.048,85.2,14.104,8.139,3.942,0.055,3.397,102.152,5728.734,24.011,324.546,149.717,6074.859,257.432,82.213,0.536,72.644,30.538,0.025,1.05,0.693,31.725,0.828,34.415,74.065,200.178,0.239,0.208,97.92,52.839,26.02,1.145,1.743,9.065,7.351,3.491,1403.656,0.164,B,109.125,91.995,51.141,29.103,4.275,16198.05,13.667,8.153,48.501,0.122,16.409,146.11,8524.371,45.381,36.263,0.097,1


In [4]:
# Short summary only (index range, column count, dtypes, memory usage);
# `verbose=False` suppresses the per-column listing.
train.info(verbose=False)


<class 'pandas.core.frame.DataFrame'>
Index: 617 entries, 000ff2bfdfe9 to ffcca4ded3bb
Columns: 57 entries, AB to Class
dtypes: float64(55), int64(1), object(1)
memory usage: 279.6+ KB


In [5]:
# Preview the first rows of the supplemental `greeks` table with the
# notebook's table styling.
greeks.head().style.set_table_styles(DF_STYLE)


Unnamed: 0_level_0,Alpha,Beta,Gamma,Delta,Epsilon
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
000ff2bfdfe9,B,C,G,D,3/19/2019
007255e47698,A,C,M,B,Unknown
013f2bd269f5,A,C,M,B,Unknown
043ac50845d5,A,C,M,B,Unknown
044fb8a146ec,D,B,F,B,3/25/2020


In [6]:
# Short summary only; `verbose=False` suppresses the per-column listing.
greeks.info(verbose=False)


<class 'pandas.core.frame.DataFrame'>
Index: 617 entries, 000ff2bfdfe9 to ffcca4ded3bb
Columns: 5 entries, Alpha to Epsilon
dtypes: object(5)
memory usage: 28.9+ KB


In [7]:
# Count the NaNs per column once and reuse the result for both the column
# selection and the printed report.
na_counts = train.isna().sum()
missing_values_cols = na_counts[na_counts > 0].index.to_list()
n_samples = len(train)

print(CLR + "Training Dataset Missing Values\n")

# One tab-separated line per affected column: name, count, share, dtype.
for feature in missing_values_cols:
    n_missing = na_counts[feature]
    share = f"{n_missing / n_samples:.1%}"
    print(
        CLR + feature + "\t",
        RED + str(n_missing) + "\t",
        RED + share + RESET + "\t",
        RED + f"{train[feature].dtype}",
    )


[1m[37mTraining Dataset Missing Values

[1m[37mBQ	 [1m[31m60	 [1m[31m9.7%[0m	 [1m[31mfloat64
[1m[37mCB	 [1m[31m2	 [1m[31m0.3%[0m	 [1m[31mfloat64
[1m[37mCC	 [1m[31m3	 [1m[31m0.5%[0m	 [1m[31mfloat64
[1m[37mDU	 [1m[31m1	 [1m[31m0.2%[0m	 [1m[31mfloat64
[1m[37mEL	 [1m[31m60	 [1m[31m9.7%[0m	 [1m[31mfloat64
[1m[37mFC	 [1m[31m1	 [1m[31m0.2%[0m	 [1m[31mfloat64
[1m[37mFL	 [1m[31m1	 [1m[31m0.2%[0m	 [1m[31mfloat64
[1m[37mFS	 [1m[31m2	 [1m[31m0.3%[0m	 [1m[31mfloat64
[1m[37mGL	 [1m[31m1	 [1m[31m0.2%[0m	 [1m[31mfloat64


In [8]:
# Human-readable class labels used as the pie's category names.
class_labels = train.Class.map({0: "Class 0", 1: "Class 1"})

# Figure-level appearance.
pie_layout = {
    "font_color": FONT_COLOR,
    "title_font_size": 18,
    "plot_bgcolor": BACKGROUND_COLOR,
    "paper_bgcolor": BACKGROUND_COLOR,
    "showlegend": False,
}
# Text placed in the middle of the donut hole (paper coordinates).
center_annotation = {
    "x": 0.5,
    "y": 0.5,
    "align": "center",
    "xref": "paper",
    "yref": "paper",
    "showarrow": False,
    "font_size": 22,
    "text": "Class<br>Unbalance",
}
# Slice labels go outside the ring; the thick background-coloured outline
# separates the slices visually.
slice_style = {
    "hovertemplate": None,
    "textposition": "outside",
    "texttemplate": "%{label}<br>%{value} - %{percent}",
    "textfont_size": 16,
    "rotation": -20,
    "marker_line_width": 25,
    "marker_line_color": BACKGROUND_COLOR,
}

fig = px.pie(
    train.assign(ClassMap=class_labels),
    names="ClassMap",
    height=540,
    width=840,
    hole=0.65,
    title="Target Overview - Class",
    color_discrete_sequence=["#010D36", "#FF2079"],
)
fig.update_layout(**pie_layout)
fig.add_annotation(**center_annotation)
fig.update_traces(**slice_style)
fig.show()


<p style="
    font-size: 20px;
    font-family: 'JetBrains Mono';
    color: #3E3F4C;
    border-bottom: 3px solid #01CBEE;
">
    <b>Observations</b> 📔
</p>

<blockquote style="
    margin-right: auto; 
    margin-left: auto; 
    background-color: #010D36; 
    padding: 15px; 
    border-radius: 8px;
    border-left: none;
">
<ul style="
    font-size: 16px;
    font-family: 'JetBrains Mono';
    color: #f2f2f0;
    margin-left: 8px;
    margin-right: 8px;
    margin-top: 4px; 
    margin-bottom: 4px;
">
    <li>The training dataset is small, containing $617$ samples. Nevertheless, we have to handle $57$ columns: $56$ medical characteristics (attributes) plus the binary target.</li>
    <li>These features are anonymized; all we know is that they are specific medical characteristics.</li>
    <li>We've got additional data, e.g. greeks.csv, but we will look at this later, especially the <code>Epsilon</code> attribute.</li>
    <li>In our dataset, we have nine numeric features that contain missing values. Typically, only $1$ to $3$ values are missing for each attribute. However, there are two specific features where we observe $60$ missing values each.</li>
    <li>Lastly, the target is quite imbalanced: 83% to 17%.</li>
</ul>
</blockquote>

# <p style="padding: 15px; background-color: #010D36; font-family: 'JetBrains Mono'; font-weight: bold; font-size: 100%; color: #f2f2f0; letter-spacing: 2px; text-align: center; border-radius: 8px;">Basic Relations in Numerical Features</p>

<p style="
    font-size: 20px;
    font-family: 'JetBrains Mono';
    color: #3E3F4C;
    border-bottom: 3px solid #01CBEE;
">
    <b>Notes</b> 📜
</p>

<blockquote style="
    margin-right: auto; 
    margin-left: auto; 
    background-color: #010D36; 
    padding: 15px; 
    border-radius: 8px;
    border-left: none;
">
<ul style="
    font-size: 16px;
    font-family: 'JetBrains Mono';
    color: #f2f2f0;
    margin-left: 8px;
    margin-right: 8px;
    margin-top: 4px; 
    margin-bottom: 4px;
">
    <li>Let's focus on an elementary description of numerical features. Firstly, let's see the numerical summary. Then, we will get to the correlation matrix and finally create hierarchical clustering based on Pearson correlations.</li>
</ul>
</blockquote>

In [9]:
# Percentiles reported alongside mean, std, min and max.
summary_percentiles = [0.01, 0.05, 0.25, 0.50, 0.75, 0.95, 0.99]

# Describe the features only (target excluded); the `count` row is dropped.
feature_summary = train.drop(columns="Class").describe(percentiles=summary_percentiles)
feature_summary = feature_summary.drop(index="count")

# One row per feature, with title-cased statistic names as columns.
numeric_descr = feature_summary.transpose().rename(columns=str.title)

numeric_descr.style.set_table_styles(DF_STYLE).format(precision=3)


Unnamed: 0,Mean,Std,Min,1%,5%,25%,50%,75%,95%,99%,Max
AB,0.477,0.468,0.081,0.12,0.153,0.252,0.355,0.56,1.079,2.165,6.162
AF,3502.013,2300.323,192.593,192.593,1018.985,2197.345,3120.319,4361.637,6957.807,10377.994,28688.188
AH,118.625,127.839,85.2,85.2,85.2,85.2,85.2,113.74,209.993,541.429,1910.123
AM,38.969,69.728,3.178,5.186,7.153,12.27,20.533,39.14,111.939,410.512,630.518
AR,10.128,10.519,8.139,8.139,8.139,8.139,8.139,8.139,17.12,34.467,178.944
AX,5.546,2.552,0.7,1.035,2.87,4.128,5.032,6.432,9.247,13.169,38.271
AY,0.06,0.417,0.026,0.026,0.026,0.026,0.026,0.037,0.124,0.214,10.316
AZ,10.566,4.351,3.397,3.397,3.397,8.13,10.461,12.97,16.862,22.914,38.972
BC,8.053,65.167,1.23,1.23,1.23,1.23,1.23,5.081,11.997,50.66,1463.693
BD,5350.389,3021.327,1693.624,2221.15,3041.643,4155.703,4997.961,6035.886,7955.458,10131.207,53060.599


<p style="
    font-size: 20px;
    font-family: 'JetBrains Mono';
    color: #3E3F4C;
    border-bottom: 3px solid #01CBEE;
">
    <b>Observations</b> 📔
</p>

<blockquote style="
    margin-right: auto; 
    margin-left: auto; 
    background-color: #010D36; 
    padding: 15px; 
    border-radius: 8px;
    border-left: none;
">
<ul style="
    font-size: 16px;
    font-family: 'JetBrains Mono';
    color: #f2f2f0;
    margin-left: 8px;
    margin-right: 8px;
    margin-top: 4px; 
    margin-bottom: 4px;
">
    <li>Well, at first glance, it's hard to focus on specific values here. However, let's look at the Q1-Q3 range and upper percentiles, including the max value. We may conclude that many of these distributions have long tails, which will probably require some transformation, such as a log transformation.</li>
</ul>
</blockquote>

In [10]:
# Three-stop colorscale given as [position, color] pairs at 0.0, 0.5 and 1.0.
color_map = [[0.0, "#01CBEE"], [0.5, "#010D36"], [1.0, "#FF2079"]]

# Pairwise Pearson correlations of the numeric feature columns (target
# excluded), rounded to two decimals. Reused by later cells.
pearson_corr = (
    train.drop("Class", axis=1).corr(numeric_only=True, method="pearson").round(2)
)
# Boolean mask that is True on the diagonal and above it.
mask = np.triu(np.ones_like(pearson_corr, dtype=bool))
# Blank out the masked cells (they become NaN), then drop the rows/columns
# that are left entirely empty. Reused by later cells.
lower_triangular_corr = (
    pearson_corr.mask(mask)
    .dropna(axis="index", how="all")
    .dropna(axis="columns", how="all")
)

heatmap = go.Heatmap(
    z=lower_triangular_corr,
    x=lower_triangular_corr.columns,
    y=lower_triangular_corr.index,
    # Masked cells get an empty label instead of "nan".
    text=lower_triangular_corr.fillna(""),
    texttemplate="%{text}",
    xgap=1,
    ygap=1,
    showscale=True,
    colorscale=color_map,
    colorbar_len=1.02,
    hoverinfo="none",
)
fig = go.Figure(heatmap)
fig.update_layout(
    font_color=FONT_COLOR,
    title="Correlation Matrix (Pearson) - Lower Triangular",
    title_font_size=18,
    plot_bgcolor=BACKGROUND_COLOR,
    paper_bgcolor=BACKGROUND_COLOR,
    width=840,
    height=840,
    xaxis_showgrid=False,
    yaxis_showgrid=False,
    # Reverse the y axis so the first index label is drawn at the top.
    yaxis_autorange="reversed",
)
fig.show()


<p style="
    font-size: 20px;
    font-family: 'JetBrains Mono';
    color: #3E3F4C;
    border-bottom: 3px solid #01CBEE;
">
    <b>Observations</b> 📔
</p>

<blockquote style="
    margin-right: auto; 
    margin-left: auto; 
    background-color: #010D36; 
    padding: 15px; 
    border-radius: 8px;
    border-left: none;
">
<ul style="
    font-size: 16px;
    font-family: 'JetBrains Mono';
    color: #f2f2f0;
    margin-left: 8px;
    margin-right: 8px;
    margin-top: 4px; 
    margin-bottom: 4px;
">
    <li>Here we have several highly correlated features like <code>BZ</code> vs <code>BC</code> ($0.91$) or <code>DV</code> vs <code>CL</code> ($0.95$). Such extreme linear correlation gives hope for rejecting certain features. Remember that you can zoom in on this matrix to explore specific relations; at the default zoom level, the individual values are hard to read.</li>
</ul>
</blockquote>

In [126]:
# Turn correlation (a similarity) into a dissimilarity: strongly correlated
# features, positive or negative, end up close to 0.
dissimilarity = 1 - np.abs(pearson_corr)

fig = ff.create_dendrogram(
    dissimilarity,
    labels=pearson_corr.columns,
    orientation="left",
    colorscale=px.colors.sequential.YlGnBu_r,
    # The callback ignores its argument `x` and always builds the linkage from
    # the precomputed `dissimilarity` matrix. squareform() converts that
    # square, symmetric matrix into the condensed 1-D distance vector that
    # linkage() accepts.
    linkagefun=lambda x: linkage(squareform(dissimilarity), method="complete"),
)
fig.update_layout(
    font_color=FONT_COLOR,
    title="Hierarchical Clustering using Correlation Matrix (Pearson)",
    title_font_size=18,
    plot_bgcolor=BACKGROUND_COLOR,
    paper_bgcolor=BACKGROUND_COLOR,
    height=1340,
    width=840,
    yaxis=dict(
        showline=False,
        title="Feature",
        ticks="",
    ),
    xaxis=dict(
        showline=False,
        title="Distance",
        ticks="",
        # Dissimilarity lies in [0, 1]; a small margin is added on both sides.
        range=[-0.05, 1.05],
    ),
)
fig.update_traces(line_width=1.5)
fig.show()


<p style="
    font-size: 20px;
    font-family: 'JetBrains Mono';
    color: #3E3F4C;
    border-bottom: 3px solid #01CBEE;
">
    <b>Observations</b> 📔
</p>

<blockquote style="
    margin-right: auto; 
    margin-left: auto; 
    background-color: #010D36; 
    padding: 15px; 
    border-radius: 8px;
    border-left: none;
">
<ul style="
    font-size: 16px;
    font-family: 'JetBrains Mono';
    color: #f2f2f0;
    margin-left: 8px;
    margin-right: 8px;
    margin-top: 4px; 
    margin-bottom: 4px;
">
    <li>Okay, here we need to make something clear. Since we already had the correlation matrix, we used it to conduct hierarchical clustering, which is an alternative to the K-Means algorithm. The resulting dendrogram lets us see how the grouping of features changes with the number of clusters we choose.</li>
    <li>However, relying on a correlation matrix to perform hierarchical clustering requires additional steps. Primarily, clustering methods measure the dissimilarity of variables. Meanwhile, correlation measures similarity. We can treat dissimilarity as $\text{dissimilarity} = 1 - |\text{correlation}|$. And basically, that's all. We passed dissimilarity to the <code>linkage()</code> function from the <code>scipy</code> module and got clustering results.</li>
    <li>Moreover, we should remember that we rely on the <b>Pearson</b> correlation. It measures linear dependency, and it's computed on actual values. However, we could have used for example the <b>Spearman</b> correlation, which is based on ranks and measures monotonic relations.</li>
    <li>Additionally, we chose the <code>complete</code> method in the <code>linkage()</code> function, and if you take a different method, you get different results.</li>
    <li>As you can see, here we have minimal distances between <code>BZ</code> - <code>BC</code>, <code>DV</code> - <code>CL</code>, and <code>EH</code> - <code>FD</code>.</li>
</ul>
</blockquote>

# <p style="padding: 15px; background-color: #010D36; font-family: 'JetBrains Mono'; font-weight: bold; font-size: 100%; color: #f2f2f0; letter-spacing: 2px; text-align: center; border-radius: 8px;">Kernel Density Estimation &amp; Pair Plots</p>

<p style="
    font-size: 20px;
    font-family: 'JetBrains Mono';
    color: #3E3F4C;
    border-bottom: 3px solid #01CBEE;
">
    <b>Notes</b> 📜
</p>

<blockquote style="
    margin-right: auto; 
    margin-left: auto; 
    background-color: #010D36; 
    padding: 15px; 
    border-radius: 8px;
    border-left: none;
">
<ul style="
    font-size: 16px;
    font-family: 'JetBrains Mono';
    color: #f2f2f0;
    margin-left: 8px;
    margin-right: 8px;
    margin-top: 4px; 
    margin-bottom: 4px;
">
    <li>In this section, we will focus on exploring distributions in a general manner. Firstly, we will depict some pair plots of highly correlated features, and then we will see the probability density of these variables by target value.</li>
    <li>To begin, let's define two small utility functions. The former is responsible for the KDE calculations, and the latter provides the appropriate arrangement of subplot axes.</li>
</ul>
</blockquote>

In [138]:
def get_kde_estimation(data_series):
    """Evaluate a Gaussian KDE of ``data_series`` on an evenly spaced grid.

    The density is fitted on the non-missing values. The grid spans from
    ``min - 0.1 * max`` to ``max + 0.1 * max`` and has as many points as the
    series has entries (missing values included).

    Args:
        data_series: Numeric series; NaNs are dropped before fitting.

    Returns:
        Tuple ``(grid, density, cumulative)`` where ``density`` is the KDE
        evaluated on ``grid`` and ``cumulative`` is its running sum scaled so
        that its maximum equals 1.
    """
    lowest = data_series.min()
    highest = data_series.max()
    padding = highest * 0.1
    grid = np.linspace(lowest - padding, highest + padding, len(data_series))

    density = gaussian_kde(data_series.dropna()).evaluate(grid)

    cumulative = np.cumsum(density)
    cumulative /= cumulative.max()
    return grid, density, cumulative


def get_n_rows_axes(n_features, n_cols=5, n_rows=None):
    """Return the subplot grid height and the 1-based (row, col) positions.

    Parameters
    ----------
    n_features : int
        Number of subplots that have to fit in the grid.
    n_cols : int, default 5
        Number of columns in the grid.
    n_rows : int, optional
        Number of rows. When omitted, the smallest number of rows that can
        hold ``n_features`` subplots is used.

    Returns
    -------
    n_rows : int
        Number of rows in the grid.
    axes : list[tuple[int, int]]
        Every ``(row, col)`` pair of the grid, ordered row by row.
    """
    # Previously an explicit ``n_rows`` was silently overwritten.
    if n_rows is None:
        n_rows = int(np.ceil(n_features / n_cols))
    grid = product(range(1, n_rows + 1), range(1, n_cols + 1))
    return n_rows, list(grid)


In [139]:
# Minimum absolute Pearson correlation a feature pair needs to be listed.
threshold = 0.7

# Flatten the lower-triangular correlation matrix into one value per feature
# pair, rank the pairs by strength and keep only those above the threshold.
highest_abs_corr = (
    lower_triangular_corr.abs()
    .unstack()
    .sort_values(ascending=False)  # type: ignore
    .rename("Absolute Pearson Correlation")
    .loc[lambda corr: corr > threshold]
    .to_frame()
    .reset_index(names=["Feature 1", "Feature 2"])
)

# Feature-name pairs as a 2-column array, consumed by the pair plots below.
highest_corr_combinations = highest_abs_corr.loc[
    :, ["Feature 1", "Feature 2"]
].to_numpy()
highest_abs_corr.style.set_table_styles(DF_STYLE).format(precision=2)


Unnamed: 0,Feature 1,Feature 2,Absolute Pearson Correlation
0,EH,FD,0.97
1,CL,DV,0.95
2,BC,BZ,0.91
3,DU,EH,0.85
4,AR,DV,0.82
5,DU,FD,0.81
6,CS,EP,0.79
7,BC,BD,0.75
8,AR,CL,0.75
9,AR,EP,0.75


In [140]:
# Pair plots (log-log scatter) for the most correlated feature pairs, with one
# trace per target class in every subplot.
n_cols = 3
n_rows, axes = get_n_rows_axes(len(highest_corr_combinations), n_cols=n_cols)

fig = make_subplots(
    rows=n_rows,
    cols=n_cols,
    horizontal_spacing=0.1,
    vertical_spacing=0.06,
)

# Split the training rows by class once instead of re-running the same query
# for every axis of every subplot.
class_traces = (
    (
        "Class 0",
        train.query("Class == 0"),
        dict(color="#010D36", size=3, symbol="diamond", opacity=0.5),
    ),
    (
        "Class 1",
        train.query("Class == 1"),
        dict(color="#FF2079", size=2, symbol="circle", opacity=0.5),
    ),
)

show_legend = True

for k, ((current_row, current_col), (feature1, feature2)) in enumerate(
    zip(axes, highest_corr_combinations)
):
    # Only the first subplot contributes legend entries; the shared
    # legendgroup lets one legend click toggle a class in every subplot.
    show_legend = k == 0

    for class_name, class_rows, marker_style in class_traces:
        fig.add_scatter(
            x=class_rows[feature1],
            y=class_rows[feature2],
            mode="markers",
            name=class_name,
            marker=dict(marker_style),
            legendgroup=class_name,
            showlegend=show_legend,
            row=current_row,
            col=current_col,
        )

    # ``title_font_*`` replaces the deprecated ``titlefont_*`` axis attributes.
    fig.update_xaxes(
        type="log",
        title_text=feature1,
        title_font_size=9,
        title_font_family="Arial Black",
        tickfont_size=7,
        row=current_row,
        col=current_col,
    )
    fig.update_yaxes(
        type="log",
        title_text=feature2,
        title_font_size=9,
        title_font_family="Arial Black",
        tickfont_size=7,
        row=current_row,
        col=current_col,
    )

fig.update_annotations(font_size=14)
fig.update_layout(
    font_color=FONT_COLOR,
    title="Highest Pearson Correlations - Pair Plots<br>Double Logarithmic Scale",
    title_font_size=18,
    plot_bgcolor=BACKGROUND_COLOR,
    paper_bgcolor=BACKGROUND_COLOR,
    width=840,
    height=1140,
    legend=dict(
        orientation="h",
        yanchor="bottom",
        xanchor="right",
        y=1.01,
        x=1,
        itemsizing="constant",
    ),
)

fig.show()


<p style="
    font-size: 20px;
    font-family: 'JetBrains Mono';
    color: #3E3F4C;
    border-bottom: 3px solid #01CBEE;
">
    <b>Observations</b> 📔
</p>

<blockquote style="
    margin-right: auto; 
    margin-left: auto; 
    background-color: #010D36; 
    padding: 15px; 
    border-radius: 8px;
    border-left: none;
">
<ul style="
    font-size: 16px;
    font-family: 'JetBrains Mono';
    color: #f2f2f0;
    margin-left: 8px;
    margin-right: 8px;
    margin-top: 4px; 
    margin-bottom: 4px;
">
    <li>In the case of this dataset, it's impossible to show all pair-plots, so I chose only those most correlated.</li>
    <li>The highest correlation is between <code>EH</code> - <code>FD</code> ($0.97$), and this is clearly visible there. Moreover, values associated with Class $0$ are shifted towards higher values. You can explore this by turning off and turning on a given group using legend. A similar situation occurs within <code>DU</code> - <code>EH</code> and <code>DU</code> - <code>FD</code>. Unfortunately, we don't know what these abbreviations mean.</li>
    <li>Moreover, we can see that many different values of a given feature correspond to one specific value from the second one. It may account for a little problem for machine learning algorithms. Such a situation appears in each of the above relationships.</li>
</ul>
</blockquote>

In [141]:
# Per-class kernel density estimates (fig1) and their cumulative versions
# (fig2) for every numeric feature, one subplot per feature.
numeric_data = train.select_dtypes("number")
numeric_cols = numeric_data.drop("Class", axis=1).columns.tolist()

n_cols = 5
# Pass n_cols explicitly so the grid positions always match the figure layout
# rather than relying on the helper's default.
n_rows, axes = get_n_rows_axes(len(numeric_cols), n_cols=n_cols)

fig1 = make_subplots(
    rows=n_rows,
    cols=n_cols,
    y_title="Probability Density",
    horizontal_spacing=0.06,
    vertical_spacing=0.04,
)
fig2 = copy(fig1)

class_colors = {0: "#010D36", 1: "#FF2079"}
# Split the rows by class once instead of once per feature.
class_rows = {
    target: numeric_data.query(f"Class == {target}") for target in class_colors
}

show_legend = True

for k, ((current_row, current_col), feature) in enumerate(zip(axes, numeric_cols)):
    # Only the first subplot contributes legend entries; the shared
    # legendgroup makes one legend click toggle a class everywhere.
    show_legend = k == 0

    for target, color in class_colors.items():
        kde_range, estimated_values, estimated_values_cum = get_kde_estimation(
            class_rows[target][feature]
        )

        # The density goes to fig1, the cumulative curve to fig2.
        for fig, kde_values in zip(  # type: ignore
            (fig1, fig2), (estimated_values, estimated_values_cum)
        ):
            fig.add_scatter(
                x=kde_range,
                y=kde_values,
                line=dict(dash="solid", color=color, width=1),
                fill="tozeroy",
                name=f"Class {target}",
                legendgroup=f"Class {target}",
                showlegend=show_legend,
                row=current_row,
                col=current_col,
            )

    # Axis styling is per subplot, so apply it once per figure rather than
    # once per class. ``title_font_*`` replaces the deprecated ``titlefont_*``.
    for fig in (fig1, fig2):
        fig.update_yaxes(
            tickfont_size=7,
            row=current_row,
            col=current_col,
        )
        fig.update_xaxes(
            title_text=feature,
            title_font_size=9,
            title_font_family="Arial Black",
            tickfont_size=7,
            row=current_row,
            col=current_col,
        )

title1 = "Numerical Features - Kernel Density Estimation"
title2 = "Numerical Features - Cumulative Kernel Density Estimation"

for fig, title in zip((fig1, fig2), (title1, title2)):
    fig.update_annotations(font_size=14)
    fig.update_layout(
        font_color=FONT_COLOR,
        title=title,
        title_font_size=18,
        plot_bgcolor=BACKGROUND_COLOR,
        paper_bgcolor=BACKGROUND_COLOR,
        width=840,
        height=1340,
        legend=dict(
            orientation="h",
            yanchor="bottom",
            xanchor="right",
            y=1.01,
            x=1,
        ),
    )

fig1.show()


<p style="
    font-size: 20px;
    font-family: 'JetBrains Mono';
    color: #3E3F4C;
    border-bottom: 3px solid #01CBEE;
">
    <b>Observations</b> 📔
</p>

<blockquote style="
    margin-right: auto; 
    margin-left: auto; 
    background-color: #010D36; 
    padding: 15px; 
    border-radius: 8px;
    border-left: none;
">
<ul style="
    font-size: 16px;
    font-family: 'JetBrains Mono';
    color: #f2f2f0;
    margin-left: 8px;
    margin-right: 8px;
    margin-top: 4px; 
    margin-bottom: 4px;
">
    <li>You can activate and deactivate distributions for a certain class by clicking on the legend.</li>
    <li>Well, here we've got a diversity of variables, i.e. some of them probably relatively good fit a normal distribution (<code>BN</code>, <code>CU</code>, <code>GH</code>), some have long tails (and extremely long tails), like <code>AR</code>, <code>AY</code>, <code>BR</code>, <code>BZ</code>, etc. Moreover, there are even bimodal distributions (<code>CW</code>, <code>EL</code> and <code>GL</code>).</li>
    <li>We will better understand the diversity between classes on the cumulative plots, as below.</li>
</ul>
</blockquote>

In [14]:
# Render the cumulative-KDE figure that was filled in next to fig1 in the KDE cell.
fig2.show()


<p style="
    font-size: 20px;
    font-family: 'JetBrains Mono';
    color: #3E3F4C;
    border-bottom: 3px solid #01CBEE;
">
    <b>Observations</b> 📔
</p>

<blockquote style="
    margin-right: auto; 
    margin-left: auto; 
    background-color: #010D36; 
    padding: 15px; 
    border-radius: 8px;
    border-left: none;
">
<ul style="
    font-size: 16px;
    font-family: 'JetBrains Mono';
    color: #f2f2f0;
    margin-left: 8px;
    margin-right: 8px;
    margin-top: 4px; 
    margin-bottom: 4px;
">
    <li>The cumulative KDE reveals a varied presence of long tails in the given distributions. Depending on the variable, the responsibility for the long tail can be attributed to values associated with Class $0$ in some cases, while in other cases it is associated with values linked to Class $1$. Additionally, there are instances where the distributions overlap.</li>
</ul>
</blockquote>

# <p style="padding: 15px; background-color: #010D36; font-family: 'JetBrains Mono'; font-weight: bold; font-size: 100%; color: #f2f2f0; letter-spacing: 2px; text-align: center; border-radius: 8px;">Probability Plots &amp; Transformations</p>

<p style="
    font-size: 20px;
    font-family: 'JetBrains Mono';
    color: #3E3F4C;
    border-bottom: 3px solid #01CBEE;
">
    <b>Notes</b> 📜
</p>

<blockquote style="
    margin-right: auto; 
    margin-left: auto; 
    background-color: #010D36; 
    padding: 15px; 
    border-radius: 8px;
    border-left: none;
">
<ul style="
    font-size: 16px;
    font-family: 'JetBrains Mono';
    color: #f2f2f0;
    margin-left: 8px;
    margin-right: 8px;
    margin-top: 4px; 
    margin-bottom: 4px;
">
    <li>This section aims to explore so-called probability plots. It's a pleasant graphical technique to assess whether a variable follows a specific distribution. Here, the normal one. On such a plot, samples which follow normal distribution are deployed on a diagonal straight line.</li>
    <li>The mentioned technique helps to decide which transformations should be done within the given variable to improve the fit to the normal distribution.</li>
    <li>Some machine learning models assume that the variable follows a normal distribution. In turn, the mentioned technique helps to decide which transformations should be done within the given variable to improve the fit to that distribution.</li>
    <li>Let's get started with original values and see results.</li>
</ul>
</blockquote>

In [246]:
# Normal probability plot for every numeric feature. The grid layout
# (n_rows, n_cols, axes) and numeric_cols come from an earlier cell.
fig = make_subplots(
    rows=n_rows,
    cols=n_cols,
    y_title="Observed Values",
    x_title="Theoretical Quantiles",
    horizontal_spacing=0.06,
    vertical_spacing=0.04,
)
# Runs before the R² labels are added below, so those keep their font_size=9.
fig.update_annotations(font_size=14)

for (row, col), feature in zip(axes, numeric_cols):
    (osm, osr), (slope, intercept, R) = probplot(train[feature].dropna(), rvalue=True)
    # Two end points are enough to draw the fitted straight line.
    x_theory = np.array([osm[0], osm[-1]])
    y_theory = intercept + slope * x_theory
    R2 = f"R\u00b2 = {R * R:.2f}"
    fig.add_scatter(x=osm, y=osr, mode="markers", row=row, col=col, name=feature)
    fig.add_scatter(x=x_theory, y=y_theory, mode="lines", row=row, col=col)
    fig.add_annotation(
        x=-1.25,
        y=osr[-1] * 0.75,
        text=R2,
        showarrow=False,
        row=row,
        col=col,
        font_size=9,
    )
    fig.update_yaxes(tickfont_size=7, row=row, col=col)
    # ``title_font_*`` replaces the deprecated ``titlefont_*`` axis attributes.
    fig.update_xaxes(
        title_text=feature,
        title_font_size=9,
        title_font_family="Arial Black",
        tickfont_size=7,
        row=row,
        col=col,
    )

fig.update_layout(
    font_color=FONT_COLOR,
    title="Numerical Features - Probability Plots",
    title_font_size=18,
    plot_bgcolor=BACKGROUND_COLOR,
    paper_bgcolor=BACKGROUND_COLOR,
    showlegend=False,
    width=840,
    height=1340,
)
# One call styles both trace kinds: marker settings for the sample points,
# line colour for the fitted lines.
fig.update_traces(
    marker=dict(size=1, symbol="x-thin", line=dict(width=2, color="#010D36")),
    line_color="#FF2079",
)
fig.show()


<p style="
    font-size: 20px;
    font-family: 'JetBrains Mono';
    color: #3E3F4C;
    border-bottom: 3px solid #01CBEE;
">
    <b>Observations</b> 📔
</p>

<blockquote style="
    margin-right: auto; 
    margin-left: auto; 
    background-color: #010D36; 
    padding: 15px; 
    border-radius: 8px;
    border-left: none;
">
<ul style="
    font-size: 16px;
    font-family: 'JetBrains Mono';
    color: #f2f2f0;
    margin-left: 8px;
    margin-right: 8px;
    margin-top: 4px; 
    margin-bottom: 4px;
">
    <li>As you can see, some variables fit a normal distribution well, which manifests by a high coefficient of determination (R-squared) and evenly deployed samples around the straight line. These are for example <code>DN</code> or <code>BN</code>.</li>
    <li>Nevertheless, there are a lot of features which do not fit the normal one. We can improve that by specific transformations:</li>
    <ul style="
        font-size: 16px;
        font-family: 'JetBrains Mono';
        color: #f2f2f0;
        margin-right: 8px;
    ">
        <li><b>Log Transformation</b> - generally works fine with right-skewed data.</li>
        <li><b>Square Root Transformation</b> - similarly to log-level transformation.</li>
        <li><b>Square Transformation</b> - helps to reduce left-skewed data.</li>
        <li><b>Reciprocal Transformation</b> - used sometimes, when data is skewed, or there are obvious outliers.</li>
        <li><b>Box-Cox Transformation</b> - used when data is skewed or has outliers.</li>
        <li><b>Yeo-Johnson Transformation</b> - variation of Box-Cox transformation.</li>
    </ul>
    <li>Let's check all of these transformations for our variables.</li>
</blockquote>

In [297]:
# Candidate transformations, keyed by the column label used in the result
# table. Each callable maps the raw feature values to the transformed ones.
transforms = {
    "Original": lambda values: values,
    "Log": np.log,
    "Sqrt": np.sqrt,
    "Reciprocal": np.reciprocal,
    "BoxCox": lambda values: stats.boxcox(values)[0],
    "YeoJohnson": lambda values: stats.yeojohnson(values)[0],
}

# feature -> R² of the normal probability plot under every transformation.
r2_scores = {}

for feature in numeric_cols:
    orig = train[feature].dropna()
    # With rvalue=True, probplot's second result ends with the correlation R.
    r_values = [
        probplot(transform(orig), rvalue=True)[1][-1]
        for transform in transforms.values()
    ]
    r2_scores[feature] = tuple(r * r for r in r_values)

r2_scores = pd.DataFrame(r2_scores, index=tuple(transforms)).T

# The best transformation per feature is the one with the highest R².
r2_scores["Winner"] = r2_scores.idxmax(axis=1)
r2_scores.style.set_table_styles(DF_STYLE).format(precision=3)


Unnamed: 0,Original,Log,Sqrt,Reciprocal,BoxCox,YeoJohnson,Winner
AB,0.537,0.976,0.82,0.92,0.998,0.991,BoxCox
AF,0.761,0.872,0.945,0.344,0.955,0.955,YeoJohnson
AH,0.238,0.568,0.416,0.678,0.686,0.686,YeoJohnson
AM,0.383,0.959,0.716,0.903,0.997,0.996,BoxCox
AR,0.158,0.422,0.299,0.505,0.515,0.515,YeoJohnson
AX,0.745,0.918,0.912,0.489,0.938,0.95,YeoJohnson
AY,0.039,0.573,0.232,0.642,0.634,0.627,Reciprocal
AZ,0.942,0.903,0.953,0.722,0.957,0.958,YeoJohnson
BC,0.058,0.74,0.308,0.723,0.739,0.745,YeoJohnson
BD,0.412,0.924,0.73,0.918,0.962,0.962,YeoJohnson


In [310]:
# Summary statistics of R² per transformation (one row each); the "count"
# column is dropped and the remaining statistic names are title-cased.
r2_summary = r2_scores.describe().T.drop(columns="count")
r2_summary = r2_summary.rename(columns=str.title)
r2_summary.style.set_table_styles(DF_STYLE).format(precision=3)


Unnamed: 0,Mean,Std,Min,25%,50%,75%,Max
Original,0.527,0.321,0.023,0.209,0.531,0.82,0.982
Log,0.841,0.17,0.177,0.827,0.903,0.949,0.998
Sqrt,0.722,0.249,0.132,0.588,0.789,0.941,0.992
Reciprocal,0.653,0.217,0.11,0.504,0.678,0.832,0.972
BoxCox,0.879,0.155,0.254,0.84,0.941,0.984,0.998
YeoJohnson,0.882,0.157,0.254,0.843,0.95,0.985,0.998


In [306]:
# Group the feature names by the transformation that gave the best R².
_winner_labels = ("Original", "Log", "Sqrt", "Reciprocal", "BoxCox", "YeoJohnson")
(
    orig_cols,
    log_cols,
    sqrt_cols,
    reci_cols,
    box_cox_cols,
    yeo_johnson_cols,
) = (
    r2_scores.loc[r2_scores["Winner"] == label].index.to_list()
    for label in _winner_labels
)


<p style="
    font-size: 20px;
    font-family: 'JetBrains Mono';
    color: #3E3F4C;
    border-bottom: 3px solid #01CBEE;
">
    <b>Observations</b> 📔
</p>

<blockquote style="
    margin-right: auto; 
    margin-left: auto; 
    background-color: #010D36; 
    padding: 15px; 
    border-radius: 8px;
    border-left: none;
">
<ul style="
    font-size: 16px;
    font-family: 'JetBrains Mono';
    color: #f2f2f0;
    margin-left: 8px;
    margin-right: 8px;
    margin-top: 4px; 
    margin-bottom: 4px;
">
    <li>Well, as you can see Yeo-Johnson's transformation wins in most cases. However, some simple transformations, like log one, are also doing well. Moreover, we have one feature where none of the transformations helps - <code>CW</code>.</li>
</blockquote>

In [292]:
# Probability plot and histogram of the untransformed AB feature, serving as
# the baseline for the Box-Cox version plotted in the next cell.
AB_orig = train.AB.dropna()
(osm, osr), (slope, intercept, R) = probplot(AB_orig, rvalue=True)
# Two end points are enough to draw the fitted straight line.
x_theory = np.array([osm[0], osm[-1]])
y_theory = intercept + slope * x_theory

fig = make_subplots(rows=1, cols=2, subplot_titles=["Probability Plot", "Histogram"])

# These are the ORIGINAL values, so the traces and the histogram axis are
# labelled "AB" (they used to carry the "BoxCox(AB)" label by mistake).
fig.add_scatter(x=osm, y=osr, mode="markers", row=1, col=1, name="AB")
fig.add_scatter(x=x_theory, y=y_theory, mode="lines", row=1, col=1)
fig.add_annotation(
    x=-1.25,
    y=osr[-1] * 0.4,
    text=f"R\u00b2 = {R * R:.3f}",
    showarrow=False,
    row=1,
    col=1,
)
fig.update_yaxes(title_text="Observed Values", row=1, col=1)
fig.update_xaxes(title_text="Theoretical Quantiles", row=1, col=1)
# Runs before the histogram is added, so only the two scatter traces are styled.
fig.update_traces(
    marker=dict(size=1, symbol="x-thin", line=dict(width=2, color="#010D36")),
    line_color="#FF2079",
)

fig.add_histogram(
    x=AB_orig,
    marker_color="#010D36",
    opacity=0.75,
    name="AB",
    row=1,
    col=2,
)
fig.update_yaxes(title_text="Count", row=1, col=2)
fig.update_xaxes(title_text="AB", row=1, col=2)

fig.update_layout(
    font_color=FONT_COLOR,
    title="AB Feature - Original",
    title_font_size=18,
    plot_bgcolor=BACKGROUND_COLOR,
    paper_bgcolor=BACKGROUND_COLOR,
    showlegend=False,
    width=840,
    height=440,
    bargap=0.2,
)

fig.update_annotations(font_size=14)
fig.show()


In [293]:
# Probability plot and histogram of the AB feature after a Box-Cox transform.
AB_transformed, _ = stats.boxcox(train.AB.dropna())
(osm, osr), (slope, intercept, R) = probplot(AB_transformed, rvalue=True)
# End points of the fitted straight line.
x_theory = osm[[0, -1]]
y_theory = slope * x_theory + intercept

fig = make_subplots(rows=1, cols=2, subplot_titles=["Probability Plot", "Histogram"])

# Left panel: sample quantiles against theoretical ones, plus the fitted line.
fig.add_trace(
    go.Scatter(x=osm, y=osr, mode="markers", name="BoxCox(AB)"), row=1, col=1
)
fig.add_trace(go.Scatter(x=x_theory, y=y_theory, mode="lines"), row=1, col=1)
fig.add_annotation(
    x=-1.25,
    y=osr[-1] * 0.4,
    text=f"R\u00b2 = {R * R:.3f}",
    showarrow=False,
    row=1,
    col=1,
)
fig.update_xaxes(title_text="Theoretical Quantiles", row=1, col=1)
fig.update_yaxes(title_text="Observed Values", row=1, col=1)
# Applied while only the scatter traces exist; the histogram is added after.
fig.update_traces(
    marker={"size": 1, "symbol": "x-thin", "line": {"width": 2, "color": "#010D36"}},
    line_color="#FF2079",
)

# Right panel: distribution of the transformed values.
fig.add_trace(
    go.Histogram(
        x=AB_transformed,
        marker_color="#010D36",
        opacity=0.75,
        name="BoxCox(AB)",
    ),
    row=1,
    col=2,
)
fig.update_xaxes(title_text="BoxCox(AB)", row=1, col=2)
fig.update_yaxes(title_text="Count", row=1, col=2)

fig.update_layout(
    title="AB Feature - Box-Cox Transformation",
    title_font_size=18,
    font_color=FONT_COLOR,
    plot_bgcolor=BACKGROUND_COLOR,
    paper_bgcolor=BACKGROUND_COLOR,
    width=840,
    height=440,
    bargap=0.2,
    showlegend=False,
)

fig.update_annotations(font_size=14)
fig.show()


<p style="
    font-size: 20px;
    font-family: 'JetBrains Mono';
    color: #3E3F4C;
    border-bottom: 3px solid #01CBEE;
">
    <b>Observations</b> 📔
</p>

<blockquote style="
    margin-right: auto; 
    margin-left: auto; 
    background-color: #010D36; 
    padding: 15px; 
    border-radius: 8px;
    border-left: none;
">
<ul style="
    font-size: 16px;
    font-family: 'JetBrains Mono';
    color: #f2f2f0;
    margin-left: 8px;
    margin-right: 8px;
    margin-top: 4px; 
    margin-bottom: 4px;
">
    <li>As you can see above, the Box-Cox transformation works perfectly for the <code>AB</code> variable.</li>
    <li>Obviously, I suppose we will be working with tree-based models at the end, but sometimes models like <code>SVC</code> handle very well, and appropriate transformations for these algorithms are crucial.</li>
</blockquote>