In [160]:
import polars as pl
from enum import Enum
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.cluster import KMeans

In [161]:
class Config(Enum):
    DATA: str = 'files/marketing_campaign.csv'
    DATA_SEP: str = '\t'
    CATEGORICAL: list[str] = [
        'Education',
        'Marital_Status',
    ]


In [162]:
def load_dataset(path: str, sep: str) -> pl.DataFrame | None:
    return pl.read_csv(source=path, separator=sep)

In [163]:
def preprocess(df: pl.DataFrame) -> pl.DataFrame:
    df = df.with_columns(
        pl.col('Income').fill_null(pl.col('Income').mean()),
        pl.col('Dt_Customer').str.to_date(format='%d-%m-%Y'),
    )
    return df

In [164]:
@pl.StringCache()
def encode(df: pl.DataFrame) -> list[pl.DataFrame]:
    df_encoded = df
    for col in Config.CATEGORICAL.value:
        df_encoded = df.with_columns(
            pl.col(col).cast(pl.Categorical).to_physical().alias(col)
        )
        df = df.drop(col)

    return [df_encoded, df]

In [165]:
def scale(df: pl.DataFrame) -> pl.DataFrame:
    scaler = StandardScaler()
    return scaler.fit_transform(df)

In [166]:
df = load_dataset(Config.DATA.value, Config.DATA_SEP.value)

In [167]:
df.describe()

statistic,ID,Year_Birth,Education,Marital_Status,Income,Kidhome,Teenhome,Dt_Customer,Recency,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth,AcceptedCmp3,AcceptedCmp4,AcceptedCmp5,AcceptedCmp1,AcceptedCmp2,Complain,Z_CostContact,Z_Revenue,Response
str,f64,f64,str,str,f64,f64,f64,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""",2240.0,2240.0,"""2240""","""2240""",2216.0,2240.0,2240.0,"""2240""",2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0,2240.0
"""null_count""",0.0,0.0,"""0""","""0""",24.0,0.0,0.0,"""0""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",5592.159821,1968.805804,,,52247.251354,0.444196,0.50625,,49.109375,303.935714,26.302232,166.95,37.525446,27.062946,44.021875,2.325,4.084821,2.662054,5.790179,5.316518,0.072768,0.074554,0.072768,0.064286,0.013393,0.009375,3.0,11.0,0.149107
"""std""",3246.662198,11.984069,,,25173.076661,0.538398,0.544538,,28.962453,336.597393,39.773434,225.715373,54.628979,41.280498,52.167439,1.932238,2.778714,2.923101,3.250958,2.426645,0.259813,0.262728,0.259813,0.245316,0.114976,0.096391,0.0,0.0,0.356274
"""min""",0.0,1893.0,"""2n Cycle""","""Absurd""",1730.0,0.0,0.0,"""01-01-2013""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,11.0,0.0
"""25%""",2829.0,1959.0,,,35322.0,0.0,0.0,,24.0,24.0,1.0,16.0,3.0,1.0,9.0,1.0,2.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,11.0,0.0
"""50%""",5462.0,1970.0,,,51390.0,0.0,0.0,,49.0,174.0,8.0,67.0,12.0,8.0,24.0,2.0,4.0,2.0,5.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,11.0,0.0
"""75%""",8427.0,1977.0,,,68487.0,1.0,1.0,,74.0,504.0,33.0,232.0,50.0,33.0,56.0,3.0,6.0,4.0,8.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,11.0,0.0
"""max""",11191.0,1996.0,"""PhD""","""YOLO""",666666.0,2.0,2.0,"""31-12-2013""",99.0,1493.0,199.0,1725.0,259.0,263.0,362.0,15.0,27.0,28.0,13.0,20.0,1.0,1.0,1.0,1.0,1.0,1.0,3.0,11.0,1.0


In [168]:
df = preprocess(df)
df, df_unencoded = encode(df)
df = scale(df)

In [169]:
df

array([[-0.0209985 , -0.98534473, -1.39422628, ...,  0.        ,
         0.        ,  2.38884634],
       [-1.05305783, -1.23573295, -1.39422628, ...,  0.        ,
         0.        , -0.41861211],
       [-0.44706956, -0.3176428 , -0.46280163, ...,  0.        ,
         0.        , -0.41861211],
       ...,
       [ 0.51690466,  1.01776106,  1.40004769, ...,  0.        ,
         0.        , -0.41861211],
       [ 0.81419936, -1.06880747, -0.46280163, ...,  0.        ,
         0.        , -0.41861211],
       [ 1.17464994, -1.23573295,  0.46862303, ...,  0.        ,
         0.        ,  2.38884634]], shape=(2240, 28))