In [1]:
import os
# Move the working directory one level up. The paths used further down
# (e.g. "notebooks/outputs/df_imobs.csv") are relative, so presumably this
# points the kernel at the project root - TODO confirm the notebook location.
# NOTE(review): this is not idempotent; re-running the cell without restarting
# the kernel moves up one more directory each time.
os.chdir('../')

## Imports

In [2]:
import pandas as pd
from src.data_preprocess import DataProcessor, aggregate_data

## Data Collection

In [3]:
# Load the agency-level dataset, rename the activation date column to
# `dt_calendar`, parse it as datetime and order rows by agency and date.
df_imobs = (
    pd.read_csv("notebooks/outputs/df_imobs.csv")
    .rename(columns={"dt_ativacao": "dt_calendar"})
    .assign(dt_calendar=lambda frame: pd.to_datetime(frame["dt_calendar"]))
    .sort_values(["id_imobiliaria", "dt_calendar"])
)

print(df_imobs.shape)
df_imobs.head()

(92656, 20)


Unnamed: 0,dt_calendar,id_imobiliaria,lat_imob,long_imob,is_active,is_activated,is_commun_first_90days,is_debelado,is_debelado_first_4months,is_indemnified,indemnity_value,is_indemn_first_6months,vl_locacao,is_churn,exonerated_first_6months,rating_A,rating_B,rating_C,rating_D,rating_E
369,2017-10-01,25,-25.45,-49.27,0.0,3,0,0,0,1,4717.81,0,1731.44,0,0,0.0,0.0,0.0,0.0,0.0
465,2017-11-01,25,-25.45,-49.27,3.0,1,0,0,0,0,0.0,0,1444.38,0,0,0.0,0.0,0.0,0.0,0.0
586,2017-12-01,25,-25.45,-49.27,4.0,1,0,1,0,0,0.0,0,4427.33,0,0,0.0,0.0,0.0,0.0,0.0
717,2018-01-01,25,-25.45,-49.27,5.0,1,0,0,0,0,0.0,0,1896.09,0,0,0.0,0.0,0.0,0.0,0.0
32136,2021-03-01,25,-25.45,-49.27,4.0,1,0,0,0,0,0.0,0,2090.16,0,0,0.0,1.0,0.0,0.0,0.0


In [4]:
# New real-estate agencies: count rows whose `is_active` is below 100 (True)
# versus the rest (False).
# NOTE(review): the modelling filter further down keeps `is_active > 100`, so
# rows with exactly 100 are counted as "not new" here but are dropped there -
# confirm which side of the boundary is intended.
(
    df_imobs["is_active"]
    .lt(100)
    .to_frame(name="new_imob")
    .groupby("new_imob")
    .size()
    .to_frame("count")
)

Unnamed: 0_level_0,count
new_imob,Unnamed: 1_level_1
False,10465
True,82191


In [5]:
# Average `is_active` per agency id, shown as a flat two-column frame.
df_imobs.groupby("id_imobiliaria")["is_active"].mean().reset_index()

Unnamed: 0,id_imobiliaria,is_active
0,25,3.571429
1,30,33.181818
2,35,6.875000
3,36,0.500000
4,38,151.065574
...,...,...
8194,30484,0.000000
8195,30501,0.000000
8196,30537,0.000000
8197,30547,0.000000


In [6]:
# Binary target: 1 when `is_indemn_first_6months` is at least 1, otherwise 0.
# A vectorised comparison replaces the per-element Python lambda; missing
# values compare as False and therefore still map to 0.
df_imobs["target"] = df_imobs["is_indemn_first_6months"].ge(1).astype("int64")

## Shift das Features

In [7]:
# Display the column names of df_imobs as a quick schema check.
df_imobs.columns

Index(['dt_calendar', 'id_imobiliaria', 'lat_imob', 'long_imob', 'is_active',
       'is_activated', 'is_commun_first_90days', 'is_debelado',
       'is_debelado_first_4months', 'is_indemnified', 'indemnity_value',
       'is_indemn_first_6months', 'vl_locacao', 'is_churn',
       'exonerated_first_6months', 'rating_A', 'rating_B', 'rating_C',
       'rating_D', 'rating_E', 'target'],
      dtype='object')

In [8]:
# Wrap the prepared frame in the project's DataProcessor (imported from
# src.data_preprocess); its `features_shift` method is called below.
processor = DataProcessor(df_imobs)

In [9]:
# `is_active` is already shifted by 1 (it already carries the data from one
# cohort back), so it is not listed here.
# The mapping is handed to DataProcessor.features_shift unchanged; presumably
# each key is the number of periods by which the listed columns are shifted
# within each agency - TODO confirm against src.data_preprocess.
shift_plan = {
    1: ["is_churn"],
    3: ["is_commun_first_90days"],
    4: ["is_debelado_first_4months"],
    6: ["exonerated_first_6months"],
}
df_shift = processor.features_shift(
    group_by="id_imobiliaria",
    columns_to_shift=shift_plan,
)

In [10]:
# Churn rate = `is_churn` divided by `is_activated`, row by row.
# A zero denominator with a non-zero numerator would yield +/-inf, which then
# contaminates the rolling mean computed from this column. Zero denominators
# are masked to NaN so the rate is simply undefined for those rows.
df_shift["churn_rate"] = df_shift["is_churn"] / df_shift["is_activated"].where(
    df_shift["is_activated"].ne(0)
)

In [13]:
def aggregate_data(df, column, window, group_by="id_imobiliaria"):
    """Rolling mean of ``column`` computed separately inside each group.

    NOTE: this definition shadows the ``aggregate_data`` imported from
    ``src.data_preprocess`` at the top of the notebook.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column`` and the ``group_by`` key. Rows are
        processed in their current order within each group.
    column : str
        Name of the column to average.
    window : int
        Rolling window size in rows. ``min_periods=1`` is used, so the first
        rows of each group are averaged over the rows available so far.
    group_by : str or list of str, default "id_imobiliaria"
        Grouping key(s).

    Returns
    -------
    numpy.ndarray
        One value per row of ``df``, in the same row order as ``df``, so the
        result can be assigned straight back as a new column.
    """
    # groupby(...).rolling(...).mean().values returns the values ordered by
    # group key, which only matches df's row order when df is already sorted
    # by that key. transform() keeps the input row order, so the output is
    # aligned with df regardless of how its rows are ordered.
    rolled = df.groupby(group_by)[column].transform(
        lambda series: series.rolling(window, min_periods=1).mean()
    )
    return rolled.to_numpy()

In [14]:
# Rolling mean of `is_activated` over up to 6 consecutive rows of each agency.
# transform() returns the values in df_shift's own row order, so the column is
# correctly aligned even if df_shift is not sorted by `id_imobiliaria`
# (taking `.values` from groupby().rolling() yields group-sorted order instead).
# NOTE(review): the column name says "last_90days" while the window is 6 rows -
# confirm which one is intended.
df_shift["agg_activated_last_90days"] = (
    df_shift.groupby("id_imobiliaria")["is_activated"]
    .transform(lambda series: series.rolling(6, min_periods=1).mean())
    .to_numpy()
)

In [11]:
# Mapping of source column -> name of the aggregated column to create.
# Despite its name this is a dict; the loop below iterates over its items.
# Commented-out entries are aggregations that are currently disabled.
aggregate_list = {
    # "is_activated": "agg_activated_last_90days",
    "is_commun_first_90days": "agg_comun_last_90days",
    # "is_debelado_first_4months": "agg_deb_last_4months",
    # "exonerated_first_6months": "agg_exon_last_6months",
    "churn_rate": "agg_churn_rate",
    # "rating_A": "agg_rating_A",
    # "rating_B": "agg_rating_B",
    # "rating_C": "agg_rating_C",
    # "rating_D": "agg_rating_D",
    # "rating_E": "agg_rating_E"
}

In [12]:
# Create one aggregated column per entry of `aggregate_list`, using a
# window of 6 rows for every aggregation.
for source_col, target_col in aggregate_list.items():
    df_shift[target_col] = aggregate_data(df_shift, source_col, 6)

In [13]:
# Keep only rows with more than 100 in `is_active` and renumber rows from 0.
df_shift = df_shift[df_shift["is_active"].gt(100)].reset_index(drop=True)

In [14]:
# Class balance of the target after filtering: roughly 3.4% of the rows are
# positive in the recorded output (349 of 10,364).
df_shift.groupby("target").size()

target
0    10015
1      349
dtype: int64

In [15]:
# Modelling set: drop rows where `exonerated_first_6months` is missing.
has_exoneration = df_shift["exonerated_first_6months"].notna()
df_model = df_shift[has_exoneration]

In [16]:
# Summary of the calendar dates covered by the modelling set.
df_model["dt_calendar"].describe()

  df_model.dt_calendar.describe()


count                   10317
unique                     68
top       2023-01-01 00:00:00
freq                      463
first     2017-06-01 00:00:00
last      2023-01-01 00:00:00
Name: dt_calendar, dtype: object

In [17]:
# Persist the modelling set as CSV without the row index. The path is
# relative to the current working directory.
df_model.to_csv("notebooks/outputs/df_model_2.csv", index=False)

In [23]:
# df_new_imobs = (
#     df_shift.loc[df_shift["is_active"] <= 100, :][["dt_calendar", "id_imobiliaria"]]
#     .reset_index(drop=True)
#     .query("dt_calendar >= '2021-12' and dt_calendar <= '2022-05'")
#     .rename(columns={"dt_calendar": "dt_ativacao"})
# )

In [24]:
# df_new_imobs.to_csv("notebooks/outputs/df_new_imobs.csv", index=False)