# Titanic classification with MOSTLY AI Engine

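This notebook trains the tabular generator on an unbalanced variant of the Titanic dataset (`titanic_unbalanced.csv`), generates synthetic data, and computes class probabilities for `survived`, first conditioned on all other features and then with no features at all. It closes by plotting the distributions of the predicted `proba__RARE_` and `proba_1` probabilities.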


In [6]:
# If running in a fresh environment, ensure the 'uv' kernel is available.
# You can create a kernel bound to the current uv virtual environment with:
#   uv run python -m ipykernel install --user --name=mostlyai-uv --display-name "Python (mostlyai-uv)"
# Then, in Jupyter, select the kernel "Python (mostlyai-uv)".

from pathlib import Path

import pandas as pd

from mostlyai import engine
from mostlyai.engine.domain import ModelType

# All paths are derived from the parent of the current working directory.
ROOT = Path("..").resolve()
DATA_PATH = ROOT.joinpath("data", "titanic_unbalanced.csv")
ws = ROOT.joinpath("ws-titanic-unbalanced-classify")

df = pd.read_csv(DATA_PATH)

# Every column other than the target 'survived' is used as a feature.
features = [column for column in df.columns if column != "survived"]

engine.init_logging()

In [7]:
# Training pipeline, run in order against the same workspace:
# split -> analyze -> encode -> train
engine.split(
    workspace_dir=ws,
    tgt_data=df,
    model_type=ModelType.tabular,
)
engine.analyze(
    workspace_dir=ws,
    value_protection=False,
)
engine.encode(
    workspace_dir=ws,
)
engine.train(
    workspace_dir=ws,
    enable_flexible_generation=True,
)

[2025-10-02 13:56:55,265] INFO   : SPLIT started
[2025-10-02 13:56:55,267] INFO   : create `/home/ivona/git/mostlyai-engine/ws-titanic-unbalanced-classify/OriginalData/tgt-data`
[2025-10-02 13:56:55,268] INFO   : create `/home/ivona/git/mostlyai-engine/ws-titanic-unbalanced-classify/OriginalData/tgt-meta`
[2025-10-02 13:56:55,270] INFO   : model_type='TABULAR'
[2025-10-02 13:56:55,270] INFO   : tgt_encoding_types={'survived': 'TABULAR_NUMERIC_AUTO', 'pclass': 'TABULAR_NUMERIC_AUTO', 'sex': 'TABULAR_CATEGORICAL', 'age': 'TABULAR_NUMERIC_AUTO', 'sibsp': 'TABULAR_NUMERIC_AUTO', 'parch': 'TABULAR_NUMERIC_AUTO', 'fare': 'TABULAR_NUMERIC_AUTO', 'embarked': 'TABULAR_CATEGORICAL'}
[2025-10-02 13:56:55,281] INFO   : SPLIT finished in 0.01s
[2025-10-02 13:56:55,281] INFO   : ANALYZE started
[2025-10-02 13:56:55,283] INFO   : create `/home/ivona/git/mostlyai-engine/ws-titanic-unbalanced-classify/ModelStore/tgt-stats`
[2025-10-02 13:56:55,284] INFO   : analyzing 2 partitions in parallel
[2025-10-0

In [8]:
# Optional step: generate as many synthetic rows as the original data has
engine.generate(
    workspace_dir=ws,
    sample_size=df.shape[0],
)

[2025-10-02 13:57:07,762] INFO   : GENERATE_TABULAR started
[2025-10-02 13:57:07,763] INFO   : create `/home/ivona/git/mostlyai-engine/ws-titanic-unbalanced-classify/SyntheticData`
[2025-10-02 13:57:07,765] INFO   : is_sequential=False
[2025-10-02 13:57:07,765] INFO   : has_context=False
[2025-10-02 13:57:07,766] INFO   : enable_flexible_generation=True
[2025-10-02 13:57:07,767] INFO   : len(tgt_sub_columns)=8
[2025-10-02 13:57:07,768] INFO   : len(ctx_sub_columns)=0
[2025-10-02 13:57:07,768] INFO   : device=device(type='cpu')
[2025-10-02 13:57:07,769] INFO   : tgt_primary_key=None, tgt_context_key=None, ctx_primary_key=None
[2025-10-02 13:57:07,770] INFO   : imputation: None
[2025-10-02 13:57:07,770] INFO   : rebalancing: None
[2025-10-02 13:57:07,771] INFO   : fairness: None
[2025-10-02 13:57:07,771] INFO   : seed_data: None
[2025-10-02 13:57:07,772] INFO   : gen_column_order=['tgt:t0/c0', 'tgt:t1/c1', 'tgt:t2/c2', 'tgt:t3/c3', 'tgt:t4/c4', 'tgt:t5/c5', 'tgt:t6/c6', 'tgt:t7/c7']
[202

In [9]:
# Class probabilities for 'survived', with every other column passed as a feature
proba_df = engine.classify(
    workspace_dir=ws,
    data=df,
    features=features,
    target="survived",
)
proba_df.head()

[2025-10-02 13:57:10,861] INFO   : loaded model weights in 0.01s


Unnamed: 0,pclass,sex,age,sibsp,parch,fare,embarked,proba__RARE_,proba_0,proba_1
0,3,female,18.0,0,0,8.0,S,0.00033,0.998486,0.001184
1,3,male,28.0,0,0,8.0,S,0.001703,0.996027,0.002271
2,3,male,24.0,1,0,16.0,S,0.002197,0.99477,0.003032
3,3,male,23.0,0,0,9.0,S,0.001203,0.996359,0.002438
4,3,male,18.0,1,0,6.0,S,0.001577,0.994884,0.003539


In [17]:
# Classify 'survived' with an empty feature list, i.e. without passing any
# other column as a feature
proba_df2 = engine.classify(
    workspace_dir=ws,
    data=df,
    features=[],
    target="survived",
)
proba_df2.head()

[2025-10-02 14:07:24,324] INFO   : loaded model weights in 0.01s


Unnamed: 0,proba__RARE_,proba_0,proba_1
0,0.180144,0.653679,0.166178
1,0.180144,0.653679,0.166178
2,0.180144,0.653679,0.166178
3,0.180144,0.653679,0.166178
4,0.180144,0.653679,0.166178


In [13]:
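# The plotting cells below need plotly. If it is not installed, uncomment the
# next line to install it:
# !uv pip install plotly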

In [14]:
import plotly.express as px
import plotly.io as pio

# Use plotly's "notebook" renderer by default for figures displayed via fig.show().
pio.renderers.default = "notebook"

In [None]:
def compare_probs(
    df: pd.DataFrame,
    p1: str,
    p2: str,
    title: str = "Histograms of RARE and minority class",
) -> None:
    """Show overlaid histograms of two columns of ``df``.

    Args:
        df: DataFrame containing the columns to compare.
        p1: Name of the first column to plot.
        p2: Name of the second column to plot.
        title: Figure title. Defaults to the title used for the
            ``proba__RARE_`` vs. ``proba_1`` comparison.

    Returns:
        None. The figure is displayed with ``fig.show()``.

    Raises:
        KeyError: Expected from ``DataFrame.melt`` when ``p1`` or ``p2`` is
            not a column of ``df``.
    """
    # Plot exactly the columns the caller asked for.
    cols_to_plot = [p1, p2]

    # Melt the frame passed in (not a notebook global) into long format:
    # "feature" names the source column, "value" holds its entries.
    df_melted = df.melt(value_vars=cols_to_plot, var_name="feature", value_name="value")

    # Create overlapping histograms
    fig = px.histogram(
        df_melted,
        x="value",
        color="feature",
        barmode="overlay",  # overlay histograms
        opacity=0.6,  # slight transparency so both show
        marginal="box",  # optional: adds a boxplot on the side
    )

    fig.update_layout(title=title, xaxis_title="Value", yaxis_title="Count")

    fig.show()

In [16]:
# The two probability columns whose distributions are compared
cols_to_plot = ["proba__RARE_", "proba_1"]

# Reshape to long format: "feature" names the source column,
# "value" holds the corresponding probability
df_melted = proba_df.melt(
    value_vars=cols_to_plot,
    var_name="feature",
    value_name="value",
)

# One semi-transparent histogram per column, drawn on top of each other,
# with a box plot in the margin
fig = px.histogram(
    data_frame=df_melted,
    x="value",
    color="feature",
    barmode="overlay",
    opacity=0.6,
    marginal="box",
)

fig.update_layout(
    title="Histograms of RARE and minority class",
    xaxis_title="Value",
    yaxis_title="Count",
)

fig.show()