In [1]:
import os
# Step one directory up so that the relative paths used below ("data/...",
# "notebooks/outputs/...") and the `src` import resolve.
# NOTE(review): presumably the notebook sits one level below the project root — confirm.
# This cell is not idempotent: every re-run moves one more level up.
os.chdir('../')

## Imports

In [2]:
import pandas as pd
import numpy as np
import datetime as dt

from src.data_preprocess import transform_data_type

# Suppress every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so pandas deprecation and
# chained-assignment warnings are hidden as well.
import warnings
warnings.filterwarnings('ignore')

## Funções Auxiliares

In [3]:
class FeatureCreator:
    """Add rule-based feature columns to a DataFrame, one rule group at a time.

    Supported groups:

    * ``"performance"`` and ``"portfolio"``: every rule becomes a 0/1 flag column.
    * ``"finance"``: every rule names an existing column; its values are kept
      where the rule holds and replaced by 0 elsewhere.
    * ``"rating"``: creates ``rating_A`` ... ``rating_E`` from ``faixa_score``
      and ``percent_ratings``.

    The boolean masks built by the latest call for each group are kept in
    ``self.conditions`` as ``{group: {column: mask}}``.
    """

    # tipo_indenizacao values that the rules below treat as an indemnity.
    _INDEMNITY_TYPES = ["A: Primeira", "B: Segunda"]
    # status_inad codes used by the "debelado" rules.
    # NOTE(review): the business meaning of each code is not visible here — confirm.
    _DEBELADO_STATUSES = [6, 31, 36, 20]
    # Groups whose rules produce 0/1 flags.
    _FLAG_GROUPS = ("performance", "portfolio")
    # Groups whose rules zero out an existing column where the rule fails.
    _VALUE_GROUPS = ("finance",)
    # Score bands that get their own rating_<band> column.
    _RATINGS = ("A", "B", "C", "D", "E")

    def __init__(self):
        self.conditions = {}

    def apply_conditions(self, df, features_group):
        """Compute the features of ``features_group`` on ``df``.

        Parameters
        ----------
        df : pandas.DataFrame
            Frame holding the columns the chosen group reads. It is modified
            in place.
        features_group : str
            One of ``"performance"``, ``"portfolio"``, ``"finance"`` or
            ``"rating"``. Any other value leaves ``df`` untouched.

        Returns
        -------
        pandas.DataFrame
            The same ``df`` object, with the new or updated columns.
        """
        self.load_conditions(df, features_group)

        # Only the requested group is relevant; masks stored for other groups
        # may belong to a different DataFrame and must not be applied here.
        rules = self.conditions.get(features_group, {})

        if features_group in self._FLAG_GROUPS:
            for rule, condition in rules.items():
                df.loc[:, rule] = np.where(condition, 1, 0)

        elif features_group in self._VALUE_GROUPS:
            # Exact membership test: the previous `group in ("finance")` was a
            # substring check against the string "finance".
            for rule, condition in rules.items():
                df.loc[:, rule] = np.where(condition, df[rule], 0)

        elif features_group == "rating":
            for rating in self._RATINGS:
                df[f"rating_{rating}"] = np.where(
                    df.faixa_score == rating,
                    df.percent_ratings,
                    0
                )

        return df

    def load_conditions(self, df, group):
        """Build the boolean masks of ``group`` for ``df`` and store them.

        The result replaces ``self.conditions[group]``. Groups other than
        ``"performance"``, ``"finance"`` and ``"portfolio"`` store nothing.

        The ``"performance"`` rules do date arithmetic on ``dt_ativacao``,
        ``dt_communication``, ``dt_cancel_communication``, ``cancelado_em`` and
        ``dt_indemnity``.
        NOTE(review): this assumes those columns are datetime-like — confirm
        against the caller.
        """
        if group == "performance":
            indemnity_paid = df.tipo_indenizacao.isin(self._INDEMNITY_TYPES)
            debelado = (
                df.status_inad.isin(self._DEBELADO_STATUSES)
                & df.dt_cancel_communication.notnull()
                & df.dt_communication.notnull()
            )

            self.conditions[group] = {
                "is_commun_first_90days": (
                    (df.dt_communication - df.dt_ativacao).dt.days <= 90
                ),
                "is_debelado": debelado,
                "is_debelado_first_4months": (
                    debelado
                    & (df.dt_cancel_communication <= (df.dt_ativacao + dt.timedelta(days=120)))
                ),
                "is_indemnified": df.id_blacklist.notnull() & indemnity_paid,
                "is_indemn_first_6months": (
                    indemnity_paid
                    # Contract is at least 180 days old at the time the rule runs,
                    # so the result depends on the execution date.
                    & ((dt.datetime.now() - df.dt_ativacao).dt.days >= 180)
                    & ((df.cancelado_em - df.dt_ativacao).dt.days >= 180)
                    & (df.dt_indemnity <= (df.dt_ativacao + dt.timedelta(days=180)))
                ),
            }

        elif group == "finance":
            # Both value columns are kept only on indemnity rows.
            self.conditions[group] = {
                column: df.tipo_indenizacao.isin(self._INDEMNITY_TYPES)
                for column in ("indemnity_value", "valor")
            }

        elif group == "portfolio":
            started = df.dt_ativacao <= df.dt_calendar
            cancelled_later = df.cancelado_em > df.dt_calendar
            exonerated_later = df.dt_exoneracao > df.dt_calendar
            no_end_date = df.cancelado_em.isnull() & df.dt_exoneracao.isnull()

            self.conditions[group] = {
                "is_active": started & (cancelled_later | exonerated_later | no_end_date)
            }

### Dados

In [4]:
# Raw inputs, read from the data/ folder relative to the current working directory.
scores, info_default, info_contracts, tipo_indenizacao = map(
    pd.read_csv,
    (
        "data/scores.csv",
        "data/info_default.csv",
        "data/info_contracts.csv",
        "data/tipo_indenizacao.csv",
    ),
)

In [5]:
# Single FeatureCreator instance reused by every apply_conditions call below.
creator = FeatureCreator()

## Pré-processamento da ETL

In [6]:
# Monthly calendar from the earliest activation date up to the moment the cell
# runs, stored as "YYYY-MM" labels in the dt_calendar column.
calendar = pd.DataFrame(
    pd.date_range(
        start=info_contracts.dt_ativacao.min(),
        end=dt.datetime.now(),
        freq="M",
        name="dt_calendar",
    ).strftime("%Y-%m")
)

#### Histórico de Contratos Ativos

In [7]:
# Every calendar month paired with every contract (cross join), then flagged
# with is_active by the "portfolio" rules.
all_hist = creator.apply_conditions(
    calendar.merge(
        info_contracts[["dt_ativacao", "id_imobiliaria", "id_contrato", "cancelado_em", "dt_exoneracao"]],
        how="cross",
    ),
    "portfolio",
)

In [9]:
# Number of active contracts per calendar month and agency.
active_historic = (
    all_hist
    .groupby(["dt_calendar", "id_imobiliaria"], as_index=False)
    ["is_active"]
    .sum()
)

# Spot check: the months with the most active contracts for agency 121.
(
    active_historic
    .loc[active_historic.id_imobiliaria == 121]
    .sort_values("is_active", ascending=False)
    .head()
)

Unnamed: 0,dt_calendar,id_imobiliaria,is_active
669864,2022-12,121,220
678237,2023-01,121,217
619626,2022-06,121,213
661491,2022-11,121,212
627999,2022-07,121,209


#### Informações de Default

In [10]:
# Enrich the contracts with default records, score band and indemnity type.
# All joins are left joins, so contracts without a match keep NaN in the new columns.
df_default = info_contracts.merge(
    info_default[["id_contrato", "status_inad", "dt_communication", "dt_indemnity", "valor", "dt_cancel_communication"]],
    how="left",
    on="id_contrato",
)
df_default = df_default.merge(
    scores[["id_contrato", "faixa_score"]],
    how="left",
    on="id_contrato",
)
df_default = df_default.merge(
    tipo_indenizacao[["id_contrato", "dt_communication", "tipo_indenizacao", "indemnity_value", "id_blacklist"]],
    how="left",
    on=["id_contrato", "dt_communication"],
)

df_default.shape

(786450, 19)

In [12]:
# presumably transform_data_type casts the listed columns to datetime — verify in src.data_preprocess.
# NOTE(review): dt_indemnity is not in this list although the "performance" rules
# compare it with dt_ativacao + timedelta — confirm it is already datetime-like.
df_default = transform_data_type(
    df_default,
    {"datetime": ["dt_ativacao", "dt_communication", "cancelado_em" ,"dt_cancel_communication"]},
)
info_contracts = transform_data_type(
    info_contracts,
    {"datetime": ["dt_ativacao"]},
)

In [13]:
# Flag columns first ("performance"), then zero the value columns on
# non-indemnity rows ("finance").
df_default = creator.apply_conditions(df_default, "performance")
df_default = creator.apply_conditions(df_default, "finance")

# From here on dt_ativacao is the "YYYY-MM" cohort label (a string), so the
# "performance" rules above cannot be re-run on this frame.
df_default["dt_ativacao"] = df_default["dt_ativacao"].dt.strftime("%Y-%m")
df_default.head()

Unnamed: 0,id_imobiliaria,lat_imob,long_imob,dt_ativacao,id_contrato,vl_locacao,dt_exoneracao,cancelado_em,is_churn,exonerated_first_6months,...,dt_cancel_communication,faixa_score,tipo_indenizacao,indemnity_value,id_blacklist,is_commun_first_90days,is_debelado,is_debelado_first_4months,is_indemnified,is_indemn_first_6months
0,6,,,2020-04,122064,700.0,,2020-11-25,0,0,...,2020-06-12,E,,0.0,,1,1,1,0,0
1,6,,,2020-04,122064,700.0,,2020-11-25,0,0,...,2020-07-22,E,,0.0,,1,1,1,0,0
2,6,,,2020-04,122064,700.0,,2020-11-25,0,0,...,2020-09-16,E,,0.0,,0,1,0,0,0
3,6,,,2020-04,122064,700.0,,2020-11-25,0,0,...,2020-09-14,E,,0.0,,0,1,0,0,0
4,6,,,2020-05,130499,650.0,,2020-12-02,0,0,...,NaT,C,,0.0,,0,0,0,0,0


In [14]:
# Attach the agency's active-contract count for the month matching the
# contract's activation cohort (dt_ativacao label == dt_calendar label).
df_default = pd.merge(
    df_default,
    active_historic,
    how="left",
    left_on=["dt_ativacao", "id_imobiliaria"],
    right_on=["dt_calendar", "id_imobiliaria"],
)

In [15]:
# Collapse to one row per contract: flags use "max" (1 if any of the contract's
# rows set the flag) and indemnity values are added up.
# Aggregations are given as string names rather than the builtins max/sum:
# pandas >= 2.1 warns that builtin callables will stop being translated to the
# groupby methods, and the string form keeps today's behaviour.
# NOTE(review): groupby drops rows whose key is NaN (default dropna=True), so
# contracts with a missing lat_imob / long_imob / is_active / faixa_score are
# excluded from df_commun — confirm this is intended.
df_commun = (
    df_default
    .groupby(
        ["dt_ativacao", "id_contrato", "id_imobiliaria", "lat_imob", "long_imob", "is_active", "faixa_score"],
        as_index=False,
    )
    .agg({
        "is_commun_first_90days": "max",
        "is_debelado": "max",
        "is_debelado_first_4months": "max",
        "is_indemnified": "max",
        "indemnity_value": "sum",
        # "valor": "sum",
        "is_indemn_first_6months": "max",
        "vl_locacao": "max",
        "is_churn": "max",
        "exonerated_first_6months": "max",
    })
)

print(df_commun.shape)
df_commun.head()

(393960, 16)


Unnamed: 0,dt_ativacao,id_contrato,id_imobiliaria,lat_imob,long_imob,is_active,faixa_score,is_commun_first_90days,is_debelado,is_debelado_first_4months,is_indemnified,indemnity_value,is_indemn_first_6months,vl_locacao,is_churn,exonerated_first_6months
0,2016-04,61,30,-26.2578,-48.8475,0.0,N/I,0,0,0,0,0.0,0,1190.0,1,0
1,2016-04,66,30,-26.2578,-48.8475,0.0,N/I,0,0,0,0,0.0,0,700.0,0,0
2,2016-06,68,30,-26.2578,-48.8475,1.0,N/I,0,0,0,0,0.0,0,870.0,0,0
3,2016-06,70,30,-26.2578,-48.8475,1.0,N/I,0,0,0,0,0.0,0,686.59,0,0
4,2016-07,79,81,-27.5961,-48.6145,0.0,N/I,0,0,0,0,0.0,0,1200.0,0,0


#### Ativados na Safra

In [20]:
# Contracts per activation cohort and agency (row count of df_commun per group).
is_activated = (
    df_commun
    .groupby(["dt_ativacao", "id_imobiliaria"])["id_contrato"]
    .size()
    .reset_index(name="is_activated")
)

In [21]:
# Put the cohort size (is_activated) on every contract row.
df_contracts = pd.merge(
    df_commun,
    is_activated,
    how="left",
    on=["dt_ativacao", "id_imobiliaria"],
)

print(df_contracts.shape)
df_contracts.head()

(393960, 17)


Unnamed: 0,dt_ativacao,id_contrato,id_imobiliaria,lat_imob,long_imob,is_active,faixa_score,is_commun_first_90days,is_debelado,is_debelado_first_4months,is_indemnified,indemnity_value,is_indemn_first_6months,vl_locacao,is_churn,exonerated_first_6months,is_activated
0,2016-04,61,30,-26.2578,-48.8475,0.0,N/I,0,0,0,0,0.0,0,1190.0,1,0,2
1,2016-04,66,30,-26.2578,-48.8475,0.0,N/I,0,0,0,0,0.0,0,700.0,0,0,2
2,2016-06,68,30,-26.2578,-48.8475,1.0,N/I,0,0,0,0,0.0,0,870.0,0,0,2
3,2016-06,70,30,-26.2578,-48.8475,1.0,N/I,0,0,0,0,0.0,0,686.59,0,0,2
4,2016-07,79,81,-27.5961,-48.6145,0.0,N/I,0,0,0,0,0.0,0,1200.0,0,0,1


#### Percentuais de Scores

In [22]:
# Work on an explicit copy: assigning a column on a slice of df_contracts is the
# chained-assignment pattern pandas warns about (SettingWithCopyWarning).
contracts_score = df_contracts[["dt_ativacao", "id_imobiliaria", "is_activated", "faixa_score"]].copy()

# Share of each score band within its cohort/agency:
# contracts in the band divided by contracts activated in the cohort.
contracts_score["percent_ratings"] = (
    contracts_score
    .groupby(["dt_ativacao", "id_imobiliaria", "faixa_score", "is_activated"])["faixa_score"]
    .transform("count") / contracts_score["is_activated"]
)

# One row per cohort / agency / score band.
contracts_score = (
    contracts_score
    .groupby(["dt_ativacao", "id_imobiliaria", "faixa_score", "percent_ratings"], as_index=False)
    .max()
)

In [23]:
# Spread percent_ratings into rating_A ... rating_E columns (0 where the row's
# faixa_score is a different band).
contracts_score = creator.apply_conditions(contracts_score, "rating")

In [24]:
# Collapse the per-band rows into one row per cohort / agency, holding the
# share of each score band rounded to 2 decimals.
contracts_score = (
    contracts_score
    .groupby(["dt_ativacao", "id_imobiliaria"], as_index=False)[
        ["rating_A", "rating_B", "rating_C", "rating_D", "rating_E"]
    ]
    .sum()
    .round(2)
)
contracts_score


Unnamed: 0,dt_ativacao,id_imobiliaria,rating_A,rating_B,rating_C,rating_D,rating_E
0,2016-04,30,0.0,0.0,0.0,0.0,0.0
1,2016-06,30,0.0,0.0,0.0,0.0,0.0
2,2016-07,81,0.0,0.0,0.0,0.0,0.0
3,2016-07,95,0.0,0.0,0.0,0.0,0.0
4,2016-08,95,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...
92651,2023-01,30458,0.0,0.0,0.0,0.0,1.0
92652,2023-01,30484,0.0,0.0,1.0,0.0,0.0
92653,2023-01,30501,0.0,1.0,0.0,0.0,0.0
92654,2023-01,30537,1.0,0.0,0.0,0.0,0.0


#### Base Final

In [25]:
# Final base: one row per activation cohort and agency, with flag counts,
# summed indemnity value, mean rent and the score-band shares.
# Aggregations are given as string names rather than builtin sum / np.mean:
# pandas >= 2.1 warns that such callables will stop being translated to the
# groupby methods, and the string form keeps today's behaviour.
# NOTE(review): .round(2) is applied to the whole frame, so lat_imob / long_imob
# are rounded to 2 decimals as well — confirm that precision is enough.
# NOTE(review): groupby drops rows whose key is NaN (default dropna=True) —
# agencies without coordinates do not reach the output; confirm this is intended.
df_imobs = (
    df_contracts
    .groupby(
        ["dt_ativacao", "id_imobiliaria", "lat_imob", "long_imob", "is_active", "is_activated"],
        as_index=False,
    )
    .agg({
        "is_commun_first_90days": "sum",
        "is_debelado": "sum",
        "is_debelado_first_4months": "sum",
        "is_indemnified": "sum",
        "indemnity_value": "sum",
        # "valor": "sum",
        "is_indemn_first_6months": "sum",
        "vl_locacao": "mean",
        "is_churn": "sum",
        "exonerated_first_6months": "sum",
    })
    .merge(contracts_score, on=["dt_ativacao", "id_imobiliaria"], how="left")
    .round(2)
)

print(df_imobs.shape)
df_imobs.head()

(92656, 20)


Unnamed: 0,dt_ativacao,id_imobiliaria,lat_imob,long_imob,is_active,is_activated,is_commun_first_90days,is_debelado,is_debelado_first_4months,is_indemnified,indemnity_value,is_indemn_first_6months,vl_locacao,is_churn,exonerated_first_6months,rating_A,rating_B,rating_C,rating_D,rating_E
0,2016-04,30,-26.26,-48.85,0.0,2,0,0,0,0,0.0,0,945.0,1,0,0.0,0.0,0.0,0.0,0.0
1,2016-06,30,-26.26,-48.85,1.0,2,0,0,0,0,0.0,0,778.3,0,0,0.0,0.0,0.0,0.0,0.0
2,2016-07,81,-27.6,-48.61,0.0,1,0,0,0,0,0.0,0,1200.0,0,0,0.0,0.0,0.0,0.0,0.0
3,2016-07,95,-26.89,-49.08,0.0,1,0,0,0,0,0.0,0,1000.0,0,0,0.0,0.0,0.0,0.0,0.0
4,2016-08,95,-26.89,-49.08,1.0,3,0,0,0,0,0.0,0,1533.33,0,0,0.0,0.0,0.0,0.0,0.0


In [22]:
## NOTE: is_active already carries a shift of 1. Use it without shifting (it already holds the data of one cohort ahead)
df_imobs.to_csv("notebooks/outputs/df_imobs.csv", index=False)