In [1]:
import pathlib
import pandas as pd
from columns_fill_nulls import fill_nulls
import polars as pl

# Set polars' float display format to "full" for the tables rendered below.
# The Config object is the cell's last expression, so its repr is echoed as output.
pl.Config(set_fmt_float="full")


<polars.config.Config at 0x703424705610>

In [2]:
# Root folder of the bronze-layer training files; every input below sits directly in it.
BASE_DATA_DIR = pathlib.Path("../data/datalake/bronze/train")

# Base table, the two parts of the internal static table, and the external ("cb") static table.
BASE_TRAIN_PATH = BASE_DATA_DIR.joinpath("train_base.parquet")
STATIC_INTERNAL_0_PATH = BASE_DATA_DIR.joinpath("train_static_0_0.parquet")
STATIC_INTERNAL_1_PATH = BASE_DATA_DIR.joinpath("train_static_0_1.parquet")
STATIC_EXTERNAL_PATH = BASE_DATA_DIR.joinpath("train_static_cb_0.parquet")

## Functions

## Load data

In [3]:
def get_date_categorized_transforms(dataframe: pl.LazyFrame) -> list[pl.Expr]:
    """Build expressions that bucket every ``*D`` column into an age label.

    Each column whose name ends with ``"D"`` is compared against day-count
    thresholds and replaced, under the same name, by one of:
    ``MORE_THAN_5_YEARS``, ``BETWEEN_1_AND_5_YEARS``,
    ``BETWEEN_6_MONTHS_AND_1_YEAR``, ``LESS_THAN_6_MONTH`` or ``UNKNOWN``.
    Values matching no branch end up as ``UNKNOWN``.

    NOTE(review): the comparisons assume the columns already hold numeric day
    offsets, presumably the output of ``get_date_transforms`` — confirm.

    Args:
        dataframe: Frame whose column names are inspected.

    Returns:
        One aliased expression per matching column, in column order.
    """
    five_years = 365 * 5
    one_year = 365
    six_months = 180

    def bucket(name: str) -> pl.Expr:
        days = pl.col(name)
        # Branches are evaluated top to bottom; the first matching one wins.
        return (
            pl.when(days > five_years)
            .then(pl.lit("MORE_THAN_5_YEARS"))
            .when((days <= five_years) & (days > one_year))
            .then(pl.lit("BETWEEN_1_AND_5_YEARS"))
            .when((days <= one_year) & (days > six_months))
            .then(pl.lit("BETWEEN_6_MONTHS_AND_1_YEAR"))
            .when((days <= six_months) & (days.is_not_null()))
            .then(pl.lit("LESS_THAN_6_MONTH"))
            .otherwise(pl.lit("UNKNOWN"))
            .alias(name)
        )

    return [bucket(name) for name in dataframe.columns if name.endswith("D")]


def get_date_transforms(dataframe: pl.LazyFrame) -> list[pl.Expr]:
    """Build expressions turning every date column into "days before the decision".

    A column is transformed when its name ends with ``"D"`` *and* its dtype is
    still date-like (String, Date or Datetime). Each such column is replaced,
    under the same name, by ``date_decision - column`` expressed in whole days.

    The dtype check matters because this notebook replaces high-null columns
    with Boolean presence flags before calling this function; a ``*D`` column
    that was binarized is no longer a date and must not be cast to ``pl.Date``.

    Args:
        dataframe: Frame containing ``date_decision`` and the ``*D`` columns.

    Returns:
        One aliased expression per date-like ``*D`` column, in column order.
    """
    date_like_dtypes = (pl.String, pl.Date, pl.Datetime)

    date_transforms = []
    for col, col_dtype in zip(dataframe.columns, dataframe.dtypes):
        if not col.endswith("D"):
            continue

        # Leave non-date columns (e.g. Boolean flags produced by binarization) untouched.
        if not any(col_dtype == candidate for candidate in date_like_dtypes):
            continue

        date_transforms.append(
            (pl.col("date_decision").cast(pl.Date) - pl.col(col).cast(pl.Date)).dt.total_days().alias(col)
        )

    return date_transforms


def get_group_by_transforms(dataframe: pl.LazyFrame, group_by_cols: list[str]) -> list[pl.Expr]:
    """Pick one aggregation expression per non-key column, based on its dtype.

    * numeric columns -> mean, then ``shrink_dtype``
    * string columns  -> first non-null mode
    * date columns    -> first value

    Columns listed in ``group_by_cols`` and columns of any other dtype are
    skipped.

    Args:
        dataframe: Frame whose schema drives the choice of aggregation.
        group_by_cols: Grouping keys, excluded from the result.

    Returns:
        Aggregation expressions in column order.
    """

    def pick_aggregation(name: str, kind) -> pl.Expr | None:
        source = pl.col(name)
        if isinstance(kind, tuple(pl.NUMERIC_DTYPES)):
            return source.mean().shrink_dtype()
        if isinstance(kind, pl.String):
            return source.mode().drop_nulls().first()
        if isinstance(kind, pl.Date):
            return source.first()
        return None

    candidates = (
        pick_aggregation(name, kind)
        for name, kind in zip(dataframe.columns, dataframe.dtypes)
        if name not in group_by_cols
    )
    return [aggregation for aggregation in candidates if aggregation is not None]


def get_columns_with_more_85_perc_nulls(dataframe: pl.LazyFrame, threshold: float = 0.85) -> list[str]:
    """Return the names of the columns whose share of nulls exceeds ``threshold``.

    The null fraction of every column is computed in a single lazy aggregation
    (the mean of the ``is_null`` mask). This avoids materialising the whole
    frame and running a full ``describe()`` only to read the null counts.

    Args:
        dataframe: Frame to inspect. An eager ``DataFrame`` is accepted as well.
        threshold: Null fraction (0-1) a column must strictly exceed to be
            reported. Defaults to 0.85.

    Returns:
        Column names, in the frame's column order. Empty when the frame has no
        columns or no rows.
    """
    null_fractions = dataframe.lazy().select(pl.all().is_null().mean()).collect()

    # A frame without columns yields no result row at all.
    if null_fractions.height == 0:
        return []

    return [
        column
        for column, fraction in zip(null_fractions.columns, null_fractions.row(0))
        # The fraction is null for a frame without rows; such columns are not reported.
        if fraction is not None and fraction > threshold
    ]


def binarize_columns_with_more_85_perc_nulls(columns: list[str]) -> list[pl.Expr]:
    """Build expressions replacing each listed column with a presence flag.

    Every column becomes a Boolean under the same name: ``True`` where the
    original value was present, ``False`` where it was null.

    Args:
        columns: Names of the columns to binarize.

    Returns:
        One aliased expression per name, in the given order.
    """
    return [pl.col(name).is_not_null().alias(name) for name in columns]


In [4]:
# Lazily scan the inputs; the internal static table is stored in two parts that are concatenated.
base_train: pl.LazyFrame = pl.scan_parquet(BASE_TRAIN_PATH)
static_internal: pl.LazyFrame = pl.concat(
    [pl.scan_parquet(part_path) for part_path in (STATIC_INTERNAL_0_PATH, STATIC_INTERNAL_1_PATH)]
)
static_external: pl.LazyFrame = pl.scan_parquet(STATIC_EXTERNAL_PATH)

In [5]:
JOIN_COLUMNS = ["case_id"]

# Left-join both static tables onto the base table, one after the other, so every base row is kept.
data: pl.LazyFrame = base_train
for static_table in (static_internal, static_external):
    data = data.join(other=static_table, how="left", on=JOIN_COLUMNS)

#### Remove birth date related columns

In [6]:
# Every column whose name contains "birth" is removed.
birth_columns = [name for name in data.columns if "birth" in name]
data = data.drop(birth_columns)

#### Binarize columns that contain more than 85% null values

In [7]:
# Columns above the default null threshold are reduced to a present/absent flag.
cols_to_binarize = get_columns_with_more_85_perc_nulls(data)

binarize_exprs = binarize_columns_with_more_85_perc_nulls(cols_to_binarize)
data = data.with_columns(binarize_exprs)

#### Fill Nulls

In [8]:
# Inspect the rows where both the case id and the campaign date are present.
(
    data
    .select("case_id", "firstclxcampaign_1125D")
    .drop_nulls()
    .collect()
)

case_id,firstclxcampaign_1125D
i64,str
175,"""2019-01-07"""
1403,"""2019-01-28"""
2302,"""2019-02-06"""
2331,"""2019-02-06"""
2542,"""2019-02-05"""
6496,"""2019-01-19"""
26903,"""2019-08-30"""
34826,"""2019-11-01"""
40231,"""2019-12-23"""
103059,"""2016-01-31"""


In [9]:
# Dry run of fill_nulls: list the columns that would still contain nulls afterwards.
null_counts_after_fill = (
    fill_nulls(data)
    .describe(percentiles=[])
    .filter(pl.col("statistic") == "null_count")
    .drop("statistic")
    .transpose(include_header=True, column_names=["null_count"])
    .with_columns(pl.col("null_count").cast(pl.Float32).cast(pl.Int32))
)
nulls_data: pd.DataFrame = null_counts_after_fill.to_pandas().query("null_count > 0")
nulls_data

Unnamed: 0,column,null_count


In [10]:
# Summary statistics (without percentiles) for a single numeric column.
(
    data
    .select("annuitynextmonth_57A")
    .describe(percentiles=[])
)

statistic,annuitynextmonth_57A
str,f64
"""count""",1526655.0
"""null_count""",4.0
"""mean""",1435.7750149552585
"""std""",2807.021231822099
"""min""",0.0
"""max""",87500.0


In [11]:
# Frequency of each value of the column, most common first.
(
    data
    .select("paytype1st_925L")
    .collect()
    .to_series()
    .value_counts()
    .sort("count", descending=True)
)

paytype1st_925L,count
str,u32
"""OTHER""",1468084
,58575


#### Transform other columns

In [12]:
# Replace the date columns with their distance in days to date_decision.
date_exprs = get_date_transforms(data)
data = data.with_columns(date_exprs)

# Disabled alternatives, kept for reference:
# data = data.with_columns(*get_date_categorized_transforms(data))
# data = data.with_columns(pl.col(pl.Boolean).fill_null(False).cast(pl.Int8))

# data.sort("case_id").head().collect()

In [13]:
# Normalise dtypes column by column:
#   date_decision -> epoch value of the date
#   numeric       -> smallest dtype that fits
#   boolean       -> UInt8, then shrunk
#   string        -> Categorical
# Columns of any other dtype are left as they are.
transforms = []

for name, kind in zip(data.columns, data.dtypes):
    source = pl.col(name)

    if name == "date_decision":
        converted = source.cast(pl.Date).dt.epoch()
    elif isinstance(kind, tuple(pl.NUMERIC_DTYPES)):
        converted = source.shrink_dtype()
    elif isinstance(kind, pl.Boolean):
        converted = source.cast(pl.UInt8).shrink_dtype()
    elif isinstance(kind, pl.String):
        converted = source.cast(pl.Categorical)
    else:
        continue

    transforms.append(converted.alias(name))

data = data.with_columns(*transforms)


In [14]:
## Drop categorical columns with just one category other than null
categorical_cols = [col for col, dtype in zip(data.columns, data.dtypes) if isinstance(dtype, pl.Categorical)]

cols_to_drop = []
if categorical_cols:
    # Count the distinct non-null values of every categorical column in ONE query,
    # rather than re-running the whole lazy plan (scans + joins) once per column.
    distinct_counts = (
        data
        .select(pl.col(categorical_cols).drop_nulls().n_unique())
        .collect()
        .row(0, named=True)
    )
    cols_to_drop = [col for col, n_distinct in distinct_counts.items() if n_distinct == 1]

data = data.drop(cols_to_drop)

## Impute categorical columns test

In [15]:
from sklearn.cluster import SpectralClustering

# Split the frame by kind of column; categorical_data is reused further down the notebook.
categorical_data = data.select(pl.col(pl.Categorical))
numerical_data = data.select(pl.col(pl.NUMERIC_DTYPES))

# Disabled experiment: cluster the numeric features into as many clusters as the
# categorical column has distinct non-null values. Stops after the first column.
if False:
    for column in categorical_data.columns:
        category = categorical_data.select(column)
        n_clusters = category.drop_nulls().unique().collect().shape[0]
        print(column, n_clusters)
        cluster = SpectralClustering(n_jobs=-1, random_state=666, n_clusters=n_clusters)
        predictions = cluster.fit_predict(numerical_data.collect())
        break

In [16]:
# For each categorical column keep both directions of the label <-> integer code mapping.
categories_mapping = {}
inverse_categories_mapping = {}

for column in categorical_data.columns:
    labels = (
        data
        .select(pl.col(column).cat.get_categories())
        .collect()
        .to_series()
        .to_list()
    )
    label_to_code = {label: code for code, label in enumerate(labels)}

    categories_mapping[column] = label_to_code
    inverse_categories_mapping[column] = {code: label for label, code in label_to_code.items()}


In [17]:
# Swap every categorical label for its integer code; anything not in the mapping becomes null.
transforms = [
    pl.col(name).replace(label_to_code, default=None, return_dtype=pl.Int32)
    for name, label_to_code in categories_mapping.items()
]

data = data.with_columns(transforms)

In [18]:
# %%time
# 
# from sklearn.impute import KNNImputer
# import numpy as np
# 
# 
# imputer = KNNImputer(
#     missing_values=np.nan,
#     n_neighbors=5,
#     weights='distance',
#     copy=True,
#     add_indicator=True,
#     keep_empty_features=False
# )
# imputer.set_output(transform="polars")
# data_imputed = imputer.fit_transform(data.collect())

## Fill null

In [19]:
# Apply the project's `fill_nulls` helper (imported from `columns_fill_nulls`).
# Its imputation rules live in that module and are not visible in this notebook.
data = fill_nulls(data)

## Write to file

In [20]:
# Despite the "_DIR" name this is the output file stem; ".parquet" is appended below.
OUTPUT_BASE_DIR: pathlib.Path = pathlib.Path("../data/datalake/silver/depth_0")
OUTPUT_PATH: pathlib.Path = OUTPUT_BASE_DIR.with_suffix(".parquet")

# Make sure the target folder exists so the sink does not fail on a fresh checkout.
OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)

data.sink_parquet(OUTPUT_PATH)