## Titanic
### Analysis and modeling of the Titanic dataset
This Jupyter notebook analyzes and models the famous Titanic dataset using Polars and logistic regression.

This is *not* an attempt to find the best predictive model.

#### Import packages

In [140]:
import os
import numpy as np
import polars as pl
#import matplotlib as plt
import random
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, precision_score, recall_score, log_loss

#### List files and folders

In [14]:
# Show what is in the current working directory.
os.listdir(os.curdir)

['.git', '.gitignore', 'README.md', 'titanic.csv', 'titanic.ipynb', 'venv']

#### Load dataset

In [70]:
# Read the comma-separated passenger file into a polars DataFrame.
df = pl.read_csv("titanic.csv", separator=",")

# Record the dimensions of the freshly loaded frame.
nrows = df.height
ncols = df.width

# Peek at the first three passengers.
df.head(3)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S"""


#### Have a look at the columns

In [113]:
# Per-column summary statistics (count, null_count, mean, std, min, quartiles, max).
# NOTE(review): the saved output below lists Sex_*/Pclass_* indicator columns that are
# only created in the "Encode variables" cell further down, so this cell was last run
# out of order. Restart the kernel and run all cells to refresh the output.
df.describe()

statistic,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_female,Sex_male,Pclass_1,Pclass_2,Pclass_3
str,f64,f64,str,str,str,f64,f64,f64,str,f64,str,str,f64,f64,f64,f64,f64
"""count""",891.0,891.0,"""891""","""891""","""891""",714.0,891.0,891.0,"""891""",891.0,"""204""","""889""",891.0,891.0,891.0,891.0,891.0
"""null_count""",0.0,0.0,"""0""","""0""","""0""",177.0,0.0,0.0,"""0""",0.0,"""687""","""2""",0.0,0.0,0.0,0.0,0.0
"""mean""",446.0,0.383838,,,,29.699118,0.523008,0.381594,,32.204208,,,0.352413,0.647587,0.242424,0.20651,0.551066
"""std""",257.353842,0.486592,,,,14.526497,1.102743,0.806057,,49.693429,,,0.47799,0.47799,0.42879,0.405028,0.497665
"""min""",1.0,0.0,,"""Abbing, Mr. Anthony""",,0.42,0.0,0.0,"""110152""",0.0,"""A10""","""C""",0.0,0.0,0.0,0.0,0.0
"""25%""",224.0,0.0,,,,20.0,0.0,0.0,,7.925,,,0.0,0.0,0.0,0.0,0.0
"""50%""",446.0,0.0,,,,28.0,0.0,0.0,,14.4542,,,0.0,1.0,0.0,0.0,1.0
"""75%""",669.0,1.0,,,,38.0,1.0,0.0,,31.0,,,1.0,1.0,0.0,0.0,1.0
"""max""",891.0,1.0,,"""van Melkebeke, Mr. Philemon""",,80.0,8.0,6.0,"""WE/P 5735""",512.3292,"""T""","""S""",1.0,1.0,1.0,1.0,1.0


#### Change variables

In [121]:
# Treat passenger class and sex as categorical rather than numeric/text.
df = df.with_columns(
    pl.col("Pclass").cast(pl.String).cast(pl.Categorical),
    pl.col("Sex").cast(pl.Categorical),
)

# Flag missing ages *before* imputing them. Deriving the flag from `Age == 0`
# after the fill would also mark any passenger whose recorded age really is 0.
# The guard keeps the cell safe to re-run: once Age has been filled the nulls
# are gone, so the flag must not be recomputed from the imputed column.
if "Age_miss" not in df.columns:
    df = df.with_columns(Age_miss=pl.col("Age").is_null().cast(pl.Int64))

# Impute missing ages with 0; Age_miss lets the model tell them apart.
df = df.with_columns(pl.col("Age").fill_null(0))

df.head(3)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_female,Sex_male,Pclass_1,Pclass_2,Pclass_3,Age_miss
i64,i64,cat,str,cat,f64,i64,i64,str,f64,str,str,f64,f64,f64,f64,f64,i64
1,0,"""3""","""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S""",0.0,1.0,0.0,0.0,1.0,0
2,1,"""1""","""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C""",1.0,0.0,1.0,0.0,0.0,0
3,1,"""3""","""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S""",1.0,0.0,0.0,0.0,1.0,0


####

#### Divide dataset into training and test set
You should only fit and evaluate your model _once_ with this split; if you want to tune or compare several models, you need a third (validation) dataset.

In [90]:
# Fixed seed so the same rows are drawn on every run.
random.seed(123)
train_pct = 0.8
train_nrows = np.floor(nrows * train_pct).astype(int)

# Sample the training row positions without replacement; every remaining
# position, in ascending order, becomes a test row.
all_rows = range(nrows)
train_rows = sorted(random.sample(all_rows, train_nrows))
train_set = set(train_rows)
test_rows = [row for row in all_rows if row not in train_set]

print("Total rows:", nrows, "\nTraining rows:", len(train_rows), "\nTest rows:", len(test_rows))

Total rows: 891 
Training rows: 712 
Test rows: 179


#### Encode variables

In [110]:
# One-hot encode the categorical predictors; the encoder is configured to
# return a polars DataFrame instead of a NumPy array.
encoder = OneHotEncoder(sparse_output=False).set_output(transform="polars")

df_enc = encoder.fit_transform(df[['Sex', 'Pclass']])

# Drop indicator columns left over from an earlier run of this cell before
# attaching the fresh ones: a horizontal concat with duplicate column names
# raises, which made the cell fail when it was executed a second time.
df = pl.concat(
    [df.select([col for col in df.columns if col not in df_enc.columns]), df_enc],
    how="horizontal",
)

df.head(3)

PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Sex_female,Sex_male,Pclass_1,Pclass_2,Pclass_3
i64,i64,cat,str,cat,f64,i64,i64,str,f64,str,str,f64,f64,f64,f64,f64
1,0,"""3""","""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S""",0.0,1.0,0.0,0.0,1.0
2,1,"""1""","""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C""",1.0,0.0,1.0,0.0,0.0
3,1,"""3""","""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S""",1.0,0.0,0.0,0.0,1.0


#### Fit a logistic regression
Setup:
* Use the training dataset
* Use the variables: Pclass, Sex, Age (plus the Age_miss indicator for missing ages).
* Don't use a penalty term

In [143]:
# Predictors. One level of each one-hot group is left out as the reference
# category (Pclass_3 and Sex_female): with an intercept and no penalty, the
# full set of indicators is perfectly collinear and the coefficients are not
# uniquely determined.
x_cols = ['Pclass_1', 'Pclass_2',
          'Sex_male',
          'Age', 'Age_miss']

# Slice the training rows once and derive both the design matrix and the
# target vector from that slice.
df_train = df[train_rows, :]
x_train = df_train.select(x_cols).to_numpy()
y_train = df_train.select("Survived").to_numpy().ravel()

# Unpenalized logistic regression, as stated in the setup above.
model = LogisticRegression(penalty=None).fit(x_train, y_train)
y_fitted = model.predict(x_train)        # predicted class labels
y_prob   = model.predict_proba(x_train)  # predicted class probabilities

#### Training evaluation

In [146]:
# Only a cell's last expression is displayed, so the confusion matrix, log loss
# and precision have to be printed explicitly or they are computed and thrown
# away. Recall stays as the final expression and is shown as the cell's value.
print("Confusion matrix:\n", confusion_matrix(y_train, y_fitted))
print("Log loss:", log_loss(y_train, y_prob))
print("Precision:", precision_score(y_train, y_fitted))
recall_score(y_train, y_fitted)

np.float64(0.6818181818181818)