In [1]:
import os
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from dotenv import load_dotenv

import src.features.basic as ftr_basic
import src.features.transforms as transforms
import src.features.transforms as trf
import src.utils.io as io_utils

In [2]:
load_dotenv()
warnings.filterwarnings("ignore")
%matplotlib inline
%load_ext autoreload
%autoreload 2


ROOT = Path(os.getenv("ROOT"))
CONFIG_DIR = ROOT / Path("src/config/")
DATA_CFG = io_utils.load_yaml(CONFIG_DIR / "data.yaml")

RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

In [4]:
# Path of the raw test split as configured in data.yaml; it is joined onto ROOT below.
DATA_PATH = DATA_CFG["data_raw"]["test_path"]
data = pd.read_csv(filepath_or_buffer=ROOT.joinpath(DATA_PATH))

# Preview the first five rows.
data.head(5)

Unnamed: 0,id,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,750000,32,blue-collar,married,secondary,no,1397,yes,no,unknown,21,may,224,1,-1,0,unknown
1,750001,44,management,married,tertiary,no,23,yes,no,cellular,3,apr,586,2,-1,0,unknown
2,750002,36,self-employed,married,primary,no,46,yes,yes,cellular,13,may,111,2,-1,0,unknown
3,750003,58,blue-collar,married,secondary,no,-1380,yes,yes,unknown,29,may,125,1,-1,0,unknown
4,750004,28,technician,single,secondary,no,1950,yes,no,cellular,22,jul,181,1,-1,0,unknown


In [5]:
# Set the row identifiers aside, then continue with a frame that no longer has the "id" column.
# NOTE: `data` is rebound here, so re-running this cell without reloading raises KeyError on "id".
data_ids = data.loc[:, "id"]
data = data.drop("id", axis="columns")

data.head(5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,32,blue-collar,married,secondary,no,1397,yes,no,unknown,21,may,224,1,-1,0,unknown
1,44,management,married,tertiary,no,23,yes,no,cellular,3,apr,586,2,-1,0,unknown
2,36,self-employed,married,primary,no,46,yes,yes,cellular,13,may,111,2,-1,0,unknown
3,58,blue-collar,married,secondary,no,-1380,yes,yes,unknown,29,may,125,1,-1,0,unknown
4,28,technician,single,secondary,no,1950,yes,no,cellular,22,jul,181,1,-1,0,unknown


In [6]:
# Number of missing values in each column of the loaded frame.
missing_per_column = data.isna().sum()
missing_per_column

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
dtype: int64

In [7]:
def transforms(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` extended with the engineered feature columns.

    Each feature is produced by the matching helper in ``trf``, called with the
    input frame, and the results are appended to a copy of ``df`` in the order
    listed below. This function itself never assigns to ``df``.

    Args:
        df: Source frame handed as-is to every ``trf`` helper.

    Returns:
        A new DataFrame with the original columns plus one column per feature.

    Note:
        Defining this function rebinds the notebook-level name ``transforms``,
        which the import cell also uses as an alias for
        ``src.features.transforms``; the module stays reachable through ``trf``.
    """
    feature_builders = (
        ("was_contact", trf.was_contact),
        ("credit_score", trf.credit_score),
        ("job_marital", trf.job_marital),
        ("job_education", trf.job_education),
        ("education_marital", trf.education_marital),
        ("campaign_cat", trf.campaign_categorical),
        ("pdays_cat", trf.pdays_categorical),
        ("previous_cat", trf.previous_categorical),
        ("log_duration", trf.log_duration),
        ("log_balance", trf.log_balance),
        ("multiply_logs", trf.multiply_logs),
        ("is_overdraft", trf.is_overdraft),
        ("sin_month", trf.sin_month),
        ("cos_month", trf.cos_month),
        ("sin_day", trf.sin_day),
        ("cos_day", trf.cos_day),
    )
    # Run every builder on the input frame first, in listing order, and only
    # then take the copy that receives the new columns.
    computed = [(name, build(df)) for name, build in feature_builders]

    enriched = df.copy()
    for name, values in computed:
        enriched[name] = values
    return enriched

In [8]:
# Build the engineered-feature frame from the id-less input.
data_new = data.pipe(transforms)

data_new.head(5)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,...,pdays_cat,previous_cat,log_duration,log_balance,multiply_logs,is_overdraft,sin_month,cos_month,sin_day,cos_day
0,32,blue-collar,married,secondary,no,1397,yes,no,unknown,21,...,no_contact,0,5.4161,7.242798,39.227721,0,0.8660254,-0.5,-0.790776,-0.612106
1,44,management,married,tertiary,no,23,yes,no,cellular,3,...,no_contact,0,6.375025,3.178054,20.260172,0,1.0,6.123234000000001e-17,0.394356,0.918958
2,36,self-employed,married,primary,no,46,yes,yes,cellular,13,...,no_contact,0,4.718499,3.850148,18.166917,0,0.8660254,-0.5,0.651372,-0.758758
3,58,blue-collar,married,secondary,no,-1380,yes,yes,unknown,29,...,no_contact,0,4.836282,0.0,0.0,1,0.8660254,-0.5,-0.571268,0.820763
4,28,technician,single,secondary,no,1950,yes,no,cellular,22,...,no_contact,0,5.204007,7.576097,39.426061,0,1.224647e-16,-1.0,-0.897805,-0.440394


In [9]:
# Add the "jb_mean" column produced by the project's JobBalanceEnc encoder.
# NOTE(review): `fit_transform` fits the encoder on this frame, which was loaded
# from the configured test split. If the model was trained with an encoder fitted
# on the training data, the values here will differ from what it saw; presumably
# the train-fitted encoder should be reused with `transform` only. TODO confirm.
jbm_enc = trf.JobBalanceEnc()
data_new["jb_mean"] = jbm_enc.fit_transform(data_new)

In [10]:
# Raw columns removed from the feature frame in the next cell, grouped by theme.
drop_columns = [
    # client account / credit flags
    "default", "balance", "housing", "loan",
    # calendar fields (sin/cos counterparts are added by `transforms`)
    "day", "month",
    # contact history (log / categorical counterparts are added by `transforms`)
    "duration", "campaign", "pdays", "previous",
]

In [11]:
# Remove the raw columns listed in `drop_columns`.
# NOTE: `data_new` is rebound, so re-running this cell alone raises KeyError for the already-dropped labels.
data_new = data_new.drop(drop_columns, axis="columns")

data_new.head(5)

Unnamed: 0,age,job,marital,education,contact,poutcome,was_contact,credit_score,job_marital,job_education,...,previous_cat,log_duration,log_balance,multiply_logs,is_overdraft,sin_month,cos_month,sin_day,cos_day,jb_mean
0,32,blue-collar,married,secondary,unknown,unknown,0,1,blue-collar_married,blue-collar_secondary,...,0,5.4161,7.242798,39.227721,0,0.8660254,-0.5,-0.790776,-0.612106,958.58349
1,44,management,married,tertiary,cellular,unknown,0,1,management_married,management_tertiary,...,0,6.375025,3.178054,20.260172,0,1.0,6.123234000000001e-17,0.394356,0.918958,1519.520961
2,36,self-employed,married,primary,cellular,unknown,0,2,self-employed_married,self-employed_primary,...,0,4.718499,3.850148,18.166917,0,0.8660254,-0.5,0.651372,-0.758758,1539.786643
3,58,blue-collar,married,secondary,unknown,unknown,0,2,blue-collar_married,blue-collar_secondary,...,0,4.836282,0.0,0.0,1,0.8660254,-0.5,-0.571268,0.820763,958.58349
4,28,technician,single,secondary,cellular,unknown,0,1,technician_single,technician_secondary,...,0,5.204007,7.576097,39.426061,0,1.224647e-16,-1.0,-0.897805,-0.440394,1075.092114


In [12]:
# Feature names grouped by kind, as reported by the project helper.
ftr_names = ftr_basic.get_features_names(data_new)
cat_features = ftr_names["categorical"]
num_features = ftr_names["numeric"]

# Presumably casts the categorical columns to pandas "category" dtype; confirm in the helper.
data_new = data_new.pipe(ftr_basic.cat_features_to_category)

In [14]:
# Persist the engineered test features.
io_utils.save_df_parquet(data_new, ROOT / "data/processed/test_new_features.parquet")
# `data_ids` is a pandas Series (a single selected column), and Series objects
# have no `to_parquet`; hand the DataFrame-oriented helper a one-column frame
# (column name "id") instead.
io_utils.save_df_parquet(data_ids.to_frame(), ROOT / "data/processed/test_ids.parquet")