In [11]:
import os
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from dotenv import load_dotenv

import src.features.basic as ftr_basic
import src.features.transforms as transforms
import src.features.transforms as trf
import src.utils.io as io_utils

In [2]:
load_dotenv()
warnings.filterwarnings("ignore")
%matplotlib inline
%load_ext autoreload
%autoreload 2


ROOT = Path(os.getenv("ROOT"))
CONFIG_DIR = ROOT / Path("src/config/")
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

In [3]:
# Read the data configuration and keep the section that holds the output
# location used when the result is saved at the end of the notebook.
data_cfg = io_utils.load_yaml(CONFIG_DIR / "data.yaml")
DATA_NEW_FTR = data_cfg["new_features_data_full"]
# NOTE(review): the input path is hard-coded here while the output path comes
# from data.yaml -- consider moving this path into the config as well.
DATA_PATH = ROOT / "data/processed/full_dataset/clean_data_features.parquet"
data = io_utils.load_df_parquet(DATA_PATH)

# Preview the first rows of the loaded frame.
data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown


In [4]:
def transforms(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` extended with the engineered feature columns.

    Each new column is produced by one helper from ``trf`` (the
    ``src.features.transforms`` module). Every helper receives the original,
    unmodified ``df``, so no feature is computed from another feature added
    here. ``df`` itself is left untouched.

    NOTE(review): defining this function rebinds the name ``transforms``,
    which the import cell also uses as an alias for
    ``src.features.transforms``; after this cell runs only ``trf`` still
    refers to the module.

    Args:
        df: Input frame. It must contain whatever columns the ``trf`` helpers
            read (their requirements are not visible from this notebook).

    Returns:
        A new DataFrame with the columns of ``df`` plus one column per entry
        of the mapping below. A column of ``df`` that already carries one of
        these names is overwritten in the returned frame.
    """
    # Output column name -> helper that builds it (insertion order is the
    # order in which the columns are appended).
    builders = {
        "was_contact": trf.was_contact,
        "credit_score": trf.credit_score,
        "job_marital": trf.job_marital,
        "job_education": trf.job_education,
        "education_marital": trf.education_marital,
        "campaign_cat": trf.campaign_categorical,
        "pdays_cat": trf.pdays_categorical,
        "previous_cat": trf.previous_categorical,
        "log_duration": trf.log_duration,
        "log_balance": trf.log_balance,
        "multiply_logs": trf.multiply_logs,
        "is_overdraft": trf.is_overdraft,
        "sin_month": trf.sin_month,
        "cos_month": trf.cos_month,
        "sin_day": trf.sin_day,
        "cos_day": trf.cos_day,
    }
    # Evaluate every helper against the untouched input first, then attach
    # all results to a fresh frame in one step.
    engineered = {name: build(df) for name, build in builders.items()}
    return df.assign(**engineered)

In [5]:
# Build the feature-enriched frame; `data` itself is not modified because
# transforms() adds the new columns to a copy.
data_new = transforms(data)

# Preview the result, including the newly added columns.
data_new.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,...,pdays_cat,previous_cat,log_duration,log_balance,multiply_logs,is_overdraft,sin_month,cos_month,sin_day,cos_day
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,...,no_contact,0,5.568345,7.670429,42.711589,0,0.866025,-0.5,0.724793,0.688967
1,44,technician,single,secondary,no,29,yes,no,unknown,5,...,no_contact,0,5.023881,3.401197,17.087209,0,0.866025,-0.5,0.724793,0.688967
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,...,no_contact,0,4.343805,1.098612,4.772158,0,0.866025,-0.5,0.724793,0.688967
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,...,no_contact,0,4.532599,7.317876,33.169002,0,0.866025,-0.5,0.724793,0.688967
4,33,unknown,single,unknown,no,1,no,no,unknown,5,...,no_contact,0,5.293305,0.693147,3.669039,0,0.866025,-0.5,0.724793,0.688967


In [6]:
# Fit the job/balance encoder on the current frame and store its output as
# the `jb_mean` column.
# NOTE(review): the encoder is fitted on the whole dataset; if a train/test
# split happens downstream, confirm this does not leak statistics across it.
jbm_enc = trf.JobBalanceEnc()
data_new = data_new.assign(jb_mean=jbm_enc.fit_transform(data_new))

In [7]:
# Raw input columns that are removed from the frame before it is saved.
# NOTE(review): presumably each of these is superseded by an engineered
# feature added earlier (e.g. log_balance, sin/cos of day and month) --
# confirm against src.features.transforms.
drop_columns = [
    "default", "balance", "housing", "loan",
    "day", "month",
    "duration", "campaign", "pdays", "previous",
]

In [8]:
# Remove the raw columns listed in `drop_columns`.
# errors="ignore" keeps this cell re-runnable: on a second execution the
# columns are already gone and a plain drop() would raise KeyError.
data_new = data_new.drop(columns=drop_columns, errors="ignore")

# Preview the reduced frame.
data_new.head()

Unnamed: 0,age,job,marital,education,contact,poutcome,was_contact,credit_score,job_marital,job_education,...,previous_cat,log_duration,log_balance,multiply_logs,is_overdraft,sin_month,cos_month,sin_day,cos_day,jb_mean
0,58,management,married,tertiary,unknown,unknown,0,1,management_married,management_tertiary,...,0,5.568345,7.670429,42.711589,0,0.866025,-0.5,0.724793,0.688967,1763.321723
1,44,technician,single,secondary,unknown,unknown,0,1,technician_single,technician_secondary,...,0,5.023881,3.401197,17.087209,0,0.866025,-0.5,0.724793,0.688967,1252.076617
2,33,entrepreneur,married,secondary,unknown,unknown,0,2,entrepreneur_married,entrepreneur_secondary,...,0,4.343805,1.098612,4.772158,0,0.866025,-0.5,0.724793,0.688967,1520.405978
3,47,blue-collar,married,unknown,unknown,unknown,0,1,blue-collar_married,blue-collar_unknown,...,0,4.532599,7.317876,33.169002,0,0.866025,-0.5,0.724793,0.688967,1079.117507
4,33,unknown,single,unknown,unknown,unknown,0,0,unknown_single,unknown_unknown,...,0,5.293305,0.693147,3.669039,0,0.866025,-0.5,0.724793,0.688967,1758.593118


In [9]:
# Ask the project helper for the feature names of the current frame and keep
# the two groups it reports under the "categorical" and "numeric" keys.
ftr_names = ftr_basic.get_features_names(data_new)
cat_features = ftr_names["categorical"]
num_features = ftr_names["numeric"]

# Run the project's categorical conversion on the frame before saving.
# NOTE(review): presumably this casts the categorical columns to pandas'
# `category` dtype -- confirm in src.features.basic.
data_new = ftr_basic.cat_features_to_category(data_new)

In [10]:
# Persist the engineered dataset to the location configured in data.yaml
# (resolved relative to the project root).
output_path = ROOT / DATA_NEW_FTR["data_new_features_path"]
io_utils.save_df_parquet(data_new, output_path)