#### Imports

In [1]:
import pathlib

import matplotlib.pyplot as plt
import pandas as pd
import polars as pl
import seaborn as sns

#### Define `data_filepath`

In [2]:
# Path to the churn CSV, given relative to the current working directory.
data_filepath = pathlib.Path('churn.csv')
# Fail fast with a readable message if the file is missing, rather than
# letting the later `read_csv` calls raise.
assert data_filepath.exists(), f'{data_filepath} does not exist'

#### Data prep with `pandas`

In [3]:
%%time

df_pd = pd.read_csv(data_filepath)

df_pd['TotalCharges'] = pd.to_numeric(
    df_pd['TotalCharges'], 
    errors='coerce',
)

binary_columns = []
for col in df_pd.columns:
    unique_vals = df_pd[col].sort_values().unique().tolist()
    if unique_vals == ['No', 'Yes']:
        binary_columns.append(col)

for col in binary_columns:
    df_pd[col] = df_pd[col].map({'No': 0, 'Yes': 1})

columns_to_exclude = ['customerID']
for col in df_pd.columns:
    if col not in columns_to_exclude:
        if df_pd.dtypes.loc[col] == 'object':
            dummy_columns = pd.get_dummies(df_pd[col], prefix=col, drop_first=True).astype(int)
            df_pd = pd.concat([df_pd, dummy_columns], axis=1)
            df_pd = df_pd.drop(columns=[col])

df_pd.head()

CPU times: user 125 ms, sys: 4.11 ms, total: 130 ms
Wall time: 129 ms


Unnamed: 0,customerID,SeniorCitizen,Partner,Dependents,tenure,PhoneService,PaperlessBilling,MonthlyCharges,TotalCharges,Churn,...,TechSupport_Yes,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check
0,7590-VHVEG,0,1,0,1,0,1,29.85,29.85,0,...,0,0,0,0,0,0,0,0,1,0
1,5575-GNVDE,0,0,0,34,1,0,56.95,1889.5,0,...,0,0,0,0,0,1,0,0,0,1
2,3668-QPYBK,0,0,0,2,1,1,53.85,108.15,1,...,0,0,0,0,0,0,0,0,0,1
3,7795-CFOCW,0,0,0,45,0,0,42.3,1840.75,0,...,1,0,0,0,0,1,0,0,0,0
4,9237-HQITU,0,0,0,2,1,1,70.7,151.65,1,...,0,0,0,0,0,0,0,0,1,0


#### Data prep with `polars`

In [4]:
%%time

df_pl = pl.read_csv(data_filepath)

binary_columns = []
for col in df_pl.columns:
    unique_vals = df_pl.get_column(col).unique().sort().to_list()
    if unique_vals == ['No', 'Yes']:
        binary_columns.append(col)

df_pl = df_pl.with_columns(
    pl.col(binary_columns).map_dict({'No': 0, 'Yes': 1})
)

columns_to_exclude = ['customerID']
string_columns = []
for (col, dtype) in zip(df_pl.columns, df_pl.dtypes):
    if col not in columns_to_exclude:
        if dtype == pl.Utf8:
            string_columns.append(col)

df_pl = df_pl.to_dummies(string_columns, drop_first=True)

CPU times: user 23.4 ms, sys: 44.7 ms, total: 68 ms
Wall time: 27.6 ms
