# Imports

In [1]:
import os

import datetime

from typing import Union

import numpy as np
import pandas as pd

import seaborn as sns

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion, FunctionTransformer
from sklearn.compose import ColumnTransformer, make_column_selector, make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import Binarizer, OneHotEncoder, OrdinalEncoder, KBinsDiscretizer
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier, VotingClassifier

import catboost
import xgboost

In [2]:
import sklearn
from sklearn import set_config

In [3]:
print(f"scikit-learn version is {sklearn.__version__}")

scikit-learn version is 1.4.1.post1


In [4]:
if sklearn.__version__ not in ["1.2.0", "1.2.2", "1.4.1.post1"]:
    os.system("pip install scikit-learn==1.2.0")

# Global Configuration

In [5]:
set_config(transform_output="pandas")

In [6]:
CWD = os.getcwd()
KAGGLE = "kaggle" in CWD

In [7]:
print(CWD)

/Users/nicolaepopescul/Desktop/streams/20240328_sklearn_modelo_benchmark_ml


In [8]:
if not KAGGLE:
    PATH_INPUT = os.path.join(CWD, "input")
    PATH_INPUT_TRAIN = os.path.join(PATH_INPUT, "train.csv")
    PATH_INPUT_TEST = os.path.join(PATH_INPUT, "test.csv")
    
    print(os.listdir(PATH_INPUT))
    
else:
    PATH_INPUT_TRAIN = "/kaggle/input/spaceship-titanic/train.csv"
    PATH_INPUT_TEST = "/kaggle/input/spaceship-titanic/test.csv"

['test.csv', 'spaceship-titanic.zip', 'train.csv', 'sample_submission.csv']


In [9]:
sorted(os.listdir(CWD))

['.DS_Store',
 '.git',
 '.gitignore',
 '.ipynb_checkpoints',
 'EDA_and_FE_20240421.ipynb',
 'Model_20240421.ipynb',
 'catboost_info',
 'input',
 'notebooks',
 'outputs',
 'requirements.txt',
 'src',
 'venvs',
 'videos']

# Helper functions

In [10]:
def load_data():
    
    X_train = pd.read_csv(PATH_INPUT_TRAIN).set_index("PassengerId")
    X_test = pd.read_csv(PATH_INPUT_TEST).set_index("PassengerId")
    
    return X_train, X_test

# Dataset loading & EDA

In [11]:
X, _ = load_data()

In [45]:
X["Spa"].isnull().sum()

183

In [18]:
X["Spa"].describe()

count     8510.000000
mean       311.138778
std       1136.705535
min          0.000000
25%          0.000000
50%          0.000000
75%         59.000000
max      22408.000000
Name: Spa, dtype: float64

In [19]:
X["SpaBoolean"] = X["Spa"] > 0

In [20]:
X.groupby(["SpaBoolean"])["Transported"].mean()

SpaBoolean
False    0.634465
True     0.277464
Name: Transported, dtype: float64

In [None]:
KBinsDiscretizer()

In [38]:
kbins = KBinsDiscretizer(
    n_bins = 20,
    strategy = "uniform",
    encode = "ordinal"
)

In [39]:
X_ = X[X["Spa"] > 0][["Spa", "Transported"]]

In [40]:
X_["SpaBinned"] = kbins.fit_transform(X_[["Spa"]])



In [44]:
(
    X_
    .groupby(["SpaBinned"])
    .agg(
        nr = ("Spa", len),
        mean_target = ("Transported", np.mean)
    )
    .reset_index()
    .pivot_table(
        index = "SpaBinned",
        columns = "nr",
        values = "mean_target",
        aggfunc = lambda val: val,
        fill_value = 0
    )
)

  X_


nr,1,2,4,5,7,8,10,12,21,52,72,107,282,2599
SpaBinned,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.318199
1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.156028,0.0
2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.102804,0.0,0.0
3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.027778,0.0,0.0,0.0
4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
sns.heatmap()

Unnamed: 0_level_0,Spa,Transported,SpaBinned
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0002_01,549.0,True,0.0
0003_01,6715.0,False,5.0
0003_02,3329.0,False,2.0
0004_01,565.0,True,0.0
0005_01,291.0,True,0.0
...,...,...,...
9275_02,50.0,False,0.0
9275_03,2.0,True,0.0
9276_01,1643.0,False,1.0
9279_01,1.0,True,0.0


In [17]:
(X["Spa"] == 0).sum()/X.shape[0]

0.6124467962728632

In [None]:
(X["Spa"] == 0).sum()/X.shape[0]

In [16]:
(X["Spa"] == 0).sum()

5324

In [12]:
X["Spa"]

PassengerId
0001_01       0.0
0002_01     549.0
0003_01    6715.0
0003_02    3329.0
0004_01     565.0
            ...  
9276_01    1643.0
9278_01       0.0
9279_01       1.0
9280_01     353.0
9280_02       0.0
Name: Spa, Length: 8693, dtype: float64

In [None]:
X["Cabin"]

In [None]:
X_ = X["Cabin"].str.split("/", expand = True)

In [None]:
X_.rename(
    columns = {
        0:"FirstLetter",
        1:"MiddleLetter",
        2:"LastLetter"
    },
    inplace = True
)

In [None]:
X = pd.concat(
    [
        X,
        X_
    ],
    axis = 1
)

In [None]:
X.shape

In [None]:
X["MiddleLetter"].nunique()

In [None]:
X["MiddleLetter"].isnull().sum()

In [None]:
X["MiddleLetter"] = X["MiddleLetter"].astype(float)

In [None]:
X["MiddleLetter"] = X["MiddleLetter"].fillna(2_000)

In [None]:
gbdf = (
    X
    .groupby(["MiddleLetter"])
    .agg(
        mean_transported  = ("Transported", np.mean),
        nr_transported  = ("Transported", len)
    )
    .reset_index()
)

In [None]:
gbdf

In [None]:
gbdf[gbdf["MiddleLetter"] >= 1_000]

In [None]:
gbdf[(gbdf["nr_transported"] == 1) & (gbdf["MiddleLetter"] >= 1_000)]

In [None]:
gbdf[(gbdf["nr_transported"] == 1) & (gbdf["MiddleLetter"] >= 1_000)]

In [None]:
gbdf[gbdf["nr_transported"] == 1]

In [None]:
(
    X["MiddleLetter"]
    .value_counts()
    .sort_index()
    .plot(kind = "kde")
)

In [None]:
kbins = KBinsDiscretizer(
    n_bins = 20,
    strategy = "uniform",
    encode = "ordinal"
)

X["MiddleLetterBinned"] = kbins.fit_transform(X[["MiddleLetter"]])

In [None]:
gbdf = (
    X
    .groupby(["MiddleLetterBinned"])
    .agg(
        mean_transported  = ("Transported", np.mean),
        nr_transported  = ("Transported", len)
    )
    .reset_index()
)

In [None]:
gbdf.head()

In [None]:
sns.heatmap(gbdf.set_index("MiddleLetterBinned")[["mean_transported"]])

In [None]:
X["TotalExpenses"] = X.iloc[:, 6:11].sum(axis = 1)

In [None]:
(
    X
    .groupby(["HomePlanet"])
    .agg(
        sum_expenses = ("TotalExpenses", np.sum),
        mean_expenses = ("TotalExpenses", np.mean),
        mean_transported = ("Transported", np.mean)
    )
)

In [None]:
X["TotalExpenses"]

In [None]:
X["RoomService"].plot(kind = "kde")

In [None]:
X["RoomService"].value_counts().sort_index()

In [None]:
nulls_per_row = X.isnull()

In [None]:
sns.heatmap(nulls_per_row);

In [None]:
def bucketize_age(age):
    
    if 0 <= age < 10:
        return "[0-10)"
    
    elif 10 <= age < 20:
        return "[10-20)"
    
    elif 20 <= age < 30:
        return "[20-30)"
    
    elif 30 <= age < 40:
        return "[30-40)"
    
    elif 40 <= age < 50:
        return "[40-50)"
    
    elif 50 <= age < 60:
        return "[50-60)" 
    
    elif 60 <= age < 70:
        return "[60-70)"
    
    elif age >= 70:
        return "[70 - inf"

In [None]:
Xc = X.copy(deep=True).dropna()

In [None]:
Xc["AgeBuketized"] = Xc["Age"].apply(bucketize_age)

Xc["AgeKBinsDiscretized"] = KBinsDiscretizer(
    encode = "ordinal",
    strategy = "uniform",
    n_bins = 8
).fit_transform(Xc[["Age"]].dropna())

In [None]:
Xc[["AgeKBinsDiscretized", "AgeBuketized"]].drop_duplicates().sort_values("AgeKBinsDiscretized")

In [None]:
(
    Xc
    .pivot_table(
        index = "AgeBuketized",
        columns = "AgeKBinsDiscretized",
        values = "Age",
        aggfunc = len,
        margins = True,
        fill_value = 0
    )
)

In [None]:
Xc[["AgeKBinsDiscretized", "AgeBuketized", "Age"]].drop_duplicates().sort_values("Age")

In [None]:
r_ = (
    Xc
    .groupby("AgeBuketized")
    .agg(
        nr = ("CryoSleep", len),
        mean_transported = ("Transported", np.mean)
    )
    .reset_index()
)

In [None]:
r_

In [None]:
fig = plt.figure()
ax = fig.subplots()

ax.bar(x = r_["AgeBuketized"], height = r_["nr"], color = "#A4CE95")

ax.set_ylim(bottom=0, top=3000)

ax2 = ax.twinx()

ax2.plot(r_["mean_transported"], color = "#6196A6");

In [None]:
(X.isnull().sum(axis = 1) == X.shape[1]).sum()

In [None]:
(
    X
    .groupby("HomePlanet")
    ["Transported"]
    .mean()
    .sort_values(ascending = False)
    .plot(kind = "bar")
);

In [None]:
X.head()

In [None]:
_ = (
    X
    .groupby(["HomePlanet", "CryoSleep"])
    .agg(
        nr_rows = ("Cabin", len),
        transported_ratio = ("Transported", np.mean)
    )
    .assign(
        nr_rows_pct = lambda df: round((df["nr_rows"]/df["nr_rows"].sum())*100, 2),
        transported_ratio = lambda df: round(df["transported_ratio"] * 100, 2)
    )
);

In [None]:
_

In [None]:
X["LenCabin"] = X["Cabin"].apply(lambda cabin: len(str(cabin)))

In [None]:
X.groupby(["LenCabin"]).size()

In [None]:
X[X["LenCabin"] == 5].head()

In [None]:
X["Cabin"].fillna("NA", inplace = True)

In [None]:
X["CabinFirstLetter"] = X["Cabin"].apply(lambda cabin: cabin[0] if cabin != "NA" else cabin)
X["CabinLastLetter"] = X["Cabin"].apply(lambda cabin: cabin[-1] if cabin != "NA" else cabin)

In [None]:
X["CabinFirstLetter"].value_counts().sort_index()

In [None]:
X["CabinLastLetter"].value_counts().sort_index()

In [None]:
X["Cabin"].sample(10)

In [None]:
(
    X
    .groupby(["CabinFirstLetter"])
    ["Transported"]
    .mean()
    .sort_index()
)

In [None]:
(
    X
    .groupby(["CabinLastLetter"])
    ["Transported"]
    .mean()
    .sort_index()
)

In [None]:
pd.concat([
    (
        X
        .groupby(["CabinFirstLetter", "CabinLastLetter"])
        ["Transported"]
        .mean()
        .sort_index()
        .unstack()
    ),
    (
        X
        .groupby(["CabinFirstLetter"])
        ["Transported"]
        .mean()
        .sort_index()
    )], axis = 1)

In [None]:
X.describe()

In [None]:
X.isnull().sum()

In [None]:
X.shape

In [None]:
(
    X
    .pivot_table(
        index = "HomePlanet",
        columns = "Destination",
        values = "Transported",
        aggfunc = [len, np.mean],
        margins = True
    )
)

In [None]:
X["Name"].fillna("NA").apply(lambda name: len(name.split(" "))).value_counts()

In [None]:
X.head()

In [None]:
X["Surname"] = X["Name"].fillna("NA").apply(lambda name: name.split(" ")[1] if name != "NA" else name)

In [None]:
X["Surname"]

In [None]:
(
    X
    .groupby(["Surname"])
    ["Transported"]
    .mean()
)

In [None]:
(
    X
    .groupby(["Surname"])
    ["Transported"]
    .mean()
    .value_counts(normalize=True)
    .sort_index()
)

In [None]:
del X, _