In [60]:
import polars as pl
from collections.abc import Iterable

In [11]:
# Raise the number of columns shown when a DataFrame is displayed to 300.
pl.Config.set_tbl_cols(300)

polars.config.Config

In [68]:
def print_distribution_null_values(df: pl.DataFrame) -> None:
    """Print the share of null values for every column that contains nulls.

    Columns without any null are skipped. The reported figure is the
    fraction of the column's rows that are null (0-1), shown with three
    decimals.

    Args:
        df: Frame whose columns are inspected.
    """
    for col in df.columns:
        series = df[col]
        n_null = series.null_count()

        # Nothing to report for fully populated columns.
        if n_null == 0:
            continue

        null_percentage = n_null / len(series)
        print(f'{col} contains \'Null\' values: {null_percentage:.3f} of total')

In [78]:
def print_min_max_values(df: pl.DataFrame, **kwargs) -> None:
    """Print the minimum and maximum value of each column of ``df``.

    Args:
        df: Frame whose columns are summarized.
        **kwargs: Optional settings.
            cols_to_exclude: column name, or iterable of column names, to
                skip. Omitted or empty means every column is printed.
    """
    cols_to_exclude = kwargs.get('cols_to_exclude', None)

    # Normalize to a tuple once: a bare string is one column name (not a
    # sequence of characters), and a one-shot iterator must survive being
    # tested against every column.
    if not cols_to_exclude:
        excluded = ()
    elif isinstance(cols_to_exclude, str):
        excluded = (cols_to_exclude,)
    else:
        excluded = tuple(cols_to_exclude)

    for col in df.columns:
        if col in excluded:
            continue
        print(f'{col} - Min: {df[col].min()} Max: {df[col].max()}')

In [61]:
def print_column_distribution(df: pl.DataFrame, **kwargs) -> None:
    """Print a frequency table (count and share of rows) for each column.

    For every column that is not excluded, the rows are grouped by the
    column's values and a frame with the value, its ``count`` and its
    ``percentage`` (count divided by the total count, a 0-1 fraction) is
    printed, sorted by ``percentage`` in descending order.

    Args:
        df: Frame whose columns are summarized.
        **kwargs: Optional settings.
            cols_to_exclude: column name, or iterable of column names, to
                skip. Omitted or empty means every column is printed.

    Raises:
        AssertionError: if ``cols_to_exclude`` is truthy but not iterable.
    """
    cols_to_exclude = kwargs.get('cols_to_exclude', None)

    # Validate and normalize once instead of on every loop iteration. A bare
    # string is treated as a single column name (``col in 'abc'`` would be a
    # substring test), and materializing to a tuple keeps a generator from
    # being exhausted by the first membership check.
    if not cols_to_exclude:
        excluded = ()
    elif isinstance(cols_to_exclude, str):
        excluded = (cols_to_exclude,)
    else:
        assert isinstance(cols_to_exclude, Iterable)
        excluded = tuple(cols_to_exclude)

    for col in df.columns:
        if col in excluded:
            continue

        distribution = (
            df
            .group_by(col)
            # `GroupBy.count()` is deprecated; `pl.len()` is its replacement.
            # The alias keeps the output column named 'count'.
            .agg(pl.len().alias('count'))
            .with_columns((pl.col('count') / pl.sum('count')).alias('percentage'))
            .sort(by='percentage', descending=True)
        )
        print(col + ':', distribution)

In [None]:
# 1. Review given files - analyze structure and content

In [12]:
# Load the training split; up to 10,000 rows are scanned to infer column dtypes.
train_df = pl.read_csv('source/train.csv', infer_schema_length=10_000)

print(train_df.height)
train_df.head()

891


PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S"""
4,1,1,"""Futrelle, Mrs. Jacques Heath (…","""female""",35.0,1,0,"""113803""",53.1,"""C123""","""S"""
5,0,3,"""Allen, Mr. William Henry""","""male""",35.0,0,0,"""373450""",8.05,,"""S"""


In [13]:
# Load the test split; up to 10,000 rows are scanned to infer column dtypes.
test_df = pl.read_csv('source/test.csv', infer_schema_length=10_000)

print(test_df.height)
test_df.head()

# Unlike train_df, test_df has no 'Survived' column.

418


PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,str,str,f64,i64,i64,str,f64,str,str
892,3,"""Kelly, Mr. James""","""male""",34.5,0,0,"""330911""",7.8292,,"""Q"""
893,3,"""Wilkes, Mrs. James (Ellen Need…","""female""",47.0,1,0,"""363272""",7.0,,"""S"""
894,2,"""Myles, Mr. Thomas Francis""","""male""",62.0,0,0,"""240276""",9.6875,,"""Q"""
895,3,"""Wirz, Mr. Albert""","""male""",27.0,0,0,"""315154""",8.6625,,"""S"""
896,3,"""Hirvonen, Mrs. Alexander (Helg…","""female""",22.0,1,1,"""3101298""",12.2875,,"""S"""


In [10]:
# Load the sample submission file to see the expected output layout.
submission_sample_df = pl.read_csv('source/gender_submission.csv', infer_schema_length=10_000)

print(submission_sample_df.height)
submission_sample_df.head()

418


PassengerId,Survived
i64,i64
892,0
893,1
894,0
895,0
896,1


In [15]:
# 2.1 Start EDA on Train df

In [17]:
# Row count and first rows of the training frame.
print(train_df.height)
train_df.head()

891


PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
i64,i64,i64,str,str,f64,i64,i64,str,f64,str,str
1,0,3,"""Braund, Mr. Owen Harris""","""male""",22.0,1,0,"""A/5 21171""",7.25,,"""S"""
2,1,1,"""Cumings, Mrs. John Bradley (Fl…","""female""",38.0,1,0,"""PC 17599""",71.2833,"""C85""","""C"""
3,1,3,"""Heikkinen, Miss. Laina""","""female""",26.0,0,0,"""STON/O2. 3101282""",7.925,,"""S"""
4,1,1,"""Futrelle, Mrs. Jacques Heath (…","""female""",35.0,1,0,"""113803""",53.1,"""C123""","""S"""
5,0,3,"""Allen, Mr. William Henry""","""male""",35.0,0,0,"""373450""",8.05,,"""S"""


In [71]:
print_distribution_null_values(train_df)

# Age: ~20% of values are null -> (is field suitable for predictions?)
# Cabin: ~77% of values are null -> (is field suitable for predictions?)
# Embarked: ~0.2% of values are null -> Negligible

Age contains 'Null' values: 0.199 of total
Cabin contains 'Null' values: 0.771 of total
Embarked contains 'Null' values: 0.002 of total


In [24]:
# PassengerId must be a unique key: as many distinct values as there are rows.
assert train_df['PassengerId'].n_unique() == train_df.height

In [79]:
# Min/max of every column. For the string columns (Name, Sex, Ticket, Cabin,
# Embarked) the values are ordered as text, so only the numeric ranges are
# directly informative.
print_min_max_values(train_df)

PassengerId - Min: 1 Max: 891
Survived - Min: 0 Max: 1
Pclass - Min: 1 Max: 3
Name - Min: Abbing, Mr. Anthony Max: van Melkebeke, Mr. Philemon
Sex - Min: female Max: male
Age - Min: 0.42 Max: 80.0
SibSp - Min: 0 Max: 8
Parch - Min: 0 Max: 6
Ticket - Min: 110152 Max: WE/P 5735
Fare - Min: 0.0 Max: 512.3292
Cabin - Min: A10 Max: T
Embarked - Min: C Max: S


In [81]:
# Create buckets for Age, SibSp, Parch, and Fare fields.
#
# Each chain ends with an explicit `.when(...)` instead of `.otherwise(...)`:
# a null value makes every comparison null, and `.otherwise` would then place
# those rows in the top bucket (Age is ~20% null in train_df, which would
# inflate '61+'). Without `.otherwise`, rows that match no condition stay null.
#
# The open-ended SibSp/Parch bucket is labelled '5+' because a value of 4
# already belongs to '2-4'.

train_df = train_df.with_columns(
    pl.when(pl.col('Age') <= 20).then(pl.lit('0-20'))
    .when(pl.col('Age') <= 40).then(pl.lit('21-40'))
    .when(pl.col('Age') <= 60).then(pl.lit('41-60'))
    .when(pl.col('Age') > 60).then(pl.lit('61+'))
    .alias('Age_Bucket'),

    pl.when(pl.col('SibSp') <= 1).then(pl.lit('0-1'))
    .when(pl.col('SibSp') <= 4).then(pl.lit('2-4'))
    .when(pl.col('SibSp') > 4).then(pl.lit('5+'))
    .alias('SibSp_Bucket'),

    pl.when(pl.col('Parch') <= 1).then(pl.lit('0-1'))
    .when(pl.col('Parch') <= 4).then(pl.lit('2-4'))
    .when(pl.col('Parch') > 4).then(pl.lit('5+'))
    .alias('Parch_Bucket'),

    pl.when(pl.col('Fare') <= 100).then(pl.lit('0-100'))
    .when(pl.col('Fare') <= 200).then(pl.lit('101-200'))
    .when(pl.col('Fare') <= 300).then(pl.lit('201-300'))
    .when(pl.col('Fare') <= 400).then(pl.lit('301-400'))
    .when(pl.col('Fare') > 400).then(pl.lit('400+'))
    .alias('Fare_Bucket'),
)

In [82]:
# See the distribution of values for all columns, except high-cardinality ones
# (close to one distinct value per row), where a frequency table is not informative.

print_column_distribution(
    train_df,
    cols_to_exclude=['PassengerId', 'Name', 'Fare', 'Ticket'],
)

Survived: shape: (2, 3)
┌──────────┬───────┬────────────┐
│ Survived ┆ count ┆ percentage │
│ ---      ┆ ---   ┆ ---        │
│ i64      ┆ u32   ┆ f64        │
╞══════════╪═══════╪════════════╡
│ 0        ┆ 549   ┆ 0.616162   │
│ 1        ┆ 342   ┆ 0.383838   │
└──────────┴───────┴────────────┘
Pclass: shape: (3, 3)
┌────────┬───────┬────────────┐
│ Pclass ┆ count ┆ percentage │
│ ---    ┆ ---   ┆ ---        │
│ i64    ┆ u32   ┆ f64        │
╞════════╪═══════╪════════════╡
│ 3      ┆ 491   ┆ 0.551066   │
│ 1      ┆ 216   ┆ 0.242424   │
│ 2      ┆ 184   ┆ 0.20651    │
└────────┴───────┴────────────┘
Sex: shape: (2, 3)
┌────────┬───────┬────────────┐
│ Sex    ┆ count ┆ percentage │
│ ---    ┆ ---   ┆ ---        │
│ str    ┆ u32   ┆ f64        │
╞════════╪═══════╪════════════╡
│ male   ┆ 577   ┆ 0.647587   │
│ female ┆ 314   ┆ 0.352413   │
└────────┴───────┴────────────┘
Age: shape: (89, 3)
┌──────┬───────┬────────────┐
│ Age  ┆ count ┆ percentage │
│ ---  ┆ ---   ┆ ---        │
│ f64  ┆ 

  .group_by(pl.col(col)).count()
