In [1]:
import pandas as pd
import datetime as dt

from sklearn.ensemble import IsolationForest

from mypackage import dir
from mypackage.rfm import calculate_rfm


# Project configuration used to locate the data folders (plain Python
# variables, not OS environment variables)
modality = 'p'
project = 'australian'
# make_dir_line is a project helper; its result is called with a folder name
# just below, so it presumably returns a path factory — TODO confirm in mypackage.dir
data = dir.make_dir_line(modality, project) 
# Joined with file names through the `/` operator in the load/save helpers,
# so this is presumably a pathlib.Path to the "processed" folder — TODO confirm
processed = data('processed')


# Helper that loads a processed table
def cargar_datos(table_name: str) -> pd.DataFrame:
    """Read ``<processed>/<table_name>.parquet.gzip`` into a DataFrame.

    Parameters
    ----------
    table_name : str
        File stem of the table; ``.parquet.gzip`` is appended to it.

    Returns
    -------
    pd.DataFrame
        The content of the parquet file.

    Prints a confirmation line with the table name after reading.
    """
    parquet_path = processed / f'{table_name}.parquet.gzip'
    table = pd.read_parquet(parquet_path)
    print(f'Loaded table: {table_name}')
    return table

# Helper that persists a DataFrame as a processed table
def cargar_en_db(df: pd.DataFrame, table_name: str) -> None:
    """Write ``df`` to ``<processed>/<table_name>.parquet.gzip``.

    Despite the name, no database is involved: the frame is stored as a
    gzip-compressed parquet file in the ``processed`` folder.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to store.
    table_name : str
        File stem of the table; ``.parquet.gzip`` is appended to it.

    Prints a confirmation line with the table name after writing.
    """
    target_path = processed / f'{table_name}.parquet.gzip'
    df.to_parquet(target_path, compression='gzip')
    print(f'Saved table: {table_name}')

def get_month(x):
    """Return midnight on the first day of the month that ``x`` falls in.

    Parameters
    ----------
    x : datetime-like or missing
        Any object exposing ``year`` and ``month`` (``datetime.datetime``,
        ``pd.Timestamp``, ...), or a missing value (``None`` / ``pd.NaT``).

    Returns
    -------
    datetime.datetime or NaT
        ``datetime(x.year, x.month, 1)`` for a valid date; ``pd.NaT`` when
        ``x`` is missing.
    """
    # Dates parsed with errors='coerce' may be NaT, whose year/month are NaN
    # and would make datetime() raise; propagate the missing value instead.
    if pd.isna(x):
        return pd.NaT
    return dt.datetime(x.year, x.month, 1)

def get_dates(df, col):
    """Split a datetime column into its year, month and day components.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the column.
    col : str
        Name of a datetime-typed column (it must support the ``.dt`` accessor).

    Returns
    -------
    tuple of pd.Series
        ``(year, month, day)``, each aligned with ``df``'s index.
    """
    date_parts = df[col].dt
    return date_parts.year, date_parts.month, date_parts.day

In [2]:
# Outlier detector used later to drop anomalous invoices.
# contamination=0.1 is the proportion of rows the model is told to treat as
# outliers; random_state is fixed so repeated runs give the same labels.
iso_forest = IsolationForest(contamination=0.1, random_state=42)

In [12]:
# Size check: row count before and after keeping only invoices dated before 2024
df = cargar_datos('dataset')
print(df.shape)
# Unparseable dates become NaT (errors='coerce'); NaT fails the comparison
# below, so those rows are dropped together with the 2024+ invoices.
df = df.assign(
    fecha=pd.to_datetime(df['fecha'], format='%Y-%m-%d', errors='coerce')
)
is_before_2024 = df['fecha'] < '2024-1-1'
df = df[is_before_2024]
print(df.shape)

Loaded table: dataset
(12951, 6)
(12529, 6)


In [4]:
# Load the dataset table and parse the invoice date
df = cargar_datos('dataset')
df['fecha'] = pd.to_datetime(df['fecha'], format='%Y-%m-%d', errors='coerce')

# Split for drift: keep invoices dated before 2024 with a positive total,
# then order them chronologically
df = (
    df[(df['fecha'] < '2024-1-1') & (df["total_factura"] > 0.0)]
    .sort_values(by='fecha')
)

# Transformations: label every invoice with the isolation forest
# (1 = inlier, -1 = outlier) and keep only the inliers
df = (
    df.assign(anomaly=iso_forest.fit_predict(df[['quantity', 'valor', 'total_factura']]))
    .loc[lambda labelled: labelled['anomaly'] == 1]
    .drop(columns='anomaly')
)
df.head()

Loaded table: dataset


Unnamed: 0,id_factura,fecha,id_cliente,quantity,valor,total_factura
0,1001,2019-11-25,87,1,65.6,65.6
200,1006,2019-11-25,484,2,33.6,67.2
85,1003,2019-11-25,276,2,59.2,118.4
166,1005,2019-11-25,189,3,24.0,72.0
42,1002,2019-11-25,308,2,28.8,57.6


In [5]:
# Sanity check: most recent invoice date left in df.
# NOTE(review): the preparation cell keeps only fecha < '2024-1-1', so a stored
# output later than that (e.g. 2024-09-30) comes from an older run — re-execute.
df['fecha'].max()

Timestamp('2024-09-30 00:00:00')

In [6]:
# Month of each invoice, truncated to the first day of that month.
# Done with the vectorised .dt accessor rather than a row-wise
# .apply(get_month); the result is the same datetime64 column.
df["InvoiceMonth"] = df["fecha"].dt.to_period("M").dt.to_timestamp()
# Cohort of a customer = month of their earliest invoice in df
df["CohortMonth"] = df.groupby("id_cliente")["InvoiceMonth"].transform("min")

# Only the year and month parts are used below; both columns sit on day 1
invoice_year, invoice_month, invoice_day = get_dates(df, "InvoiceMonth")
cohort_year, cohort_month, cohort_day = get_dates(df, "CohortMonth")
year_diff = invoice_year - cohort_year
month_diff = invoice_month - cohort_month

# Whole months elapsed since the cohort month, 1-based:
# an invoice in the customer's first month gets CohortIndex == 1
df["CohortIndex"] = 12 * year_diff + month_diff + 1
df.head()

Unnamed: 0,id_factura,fecha,id_cliente,quantity,valor,total_factura,InvoiceMonth,CohortMonth,CohortIndex
0,1001,2019-11-25,87,1,65.6,65.6,2019-11-01,2019-11-01,1
125,1004,2019-11-25,560,1,48.0,48.0,2019-11-01,2019-11-01,1
166,1005,2019-11-25,189,3,24.0,72.0,2019-11-01,2019-11-01,1
200,1006,2019-11-25,484,2,33.6,67.2,2019-11-01,2019-11-01,1
42,1002,2019-11-25,308,2,28.8,57.6,2019-11-01,2019-11-01,1


In [7]:
# Build the per-customer RFM table with the project helper
rfm = calculate_rfm(dataframe=df,
                    val_id_customer='id_cliente',
                    val_id_facture='id_factura',
                    val_money='total_factura',
                    val_date='fecha')

# Keep repeat customers only (more than one purchase). The explicit .copy()
# makes rfm an independent frame, so the column assignments below do not act
# on a slice of the original and cannot raise SettingWithCopyWarning.
rfm = rfm[rfm['frequency'] > 1].copy()

# Turn total spend into average spend per purchase
rfm["monetary"] = rfm["monetary"] / rfm["frequency"]

# Divide by 7 to express both durations in weeks
# (presumably calculate_rfm returns them in days — TODO confirm)
rfm["live_purches"] = rfm["live_purches"] / 7
rfm["tenure"] = rfm["tenure"] / 7
rfm.head()

Unnamed: 0,id_cliente,recency,live_purches,tenure,frequency,monetary
0,0,47,258.285714,251.571429,16,86.4
1,1,20,258.571429,255.714286,20,94.88
2,2,183,258.142857,232.0,17,84.705882
3,3,55,259.142857,251.285714,21,95.314286
4,4,1,234.142857,234.0,19,93.305263


In [8]:
# Persist the RFM table as rfm.parquet.gzip in the processed folder
cargar_en_db(df=rfm, table_name='rfm')

Saved table: rfm


In [9]:
# Presumably a completion marker: it only prints once execution reaches this last cell
print('Ok_')

Ok_
