# <CENTER> DATA DE FEATURE STORE

## *PASOS INICIALES*

### *LIBRERÍAS*

In [2]:
import pandas as pd
import numpy as np
import unidecode
import os
import openpyxl
import datetime
from datetime import date, datetime, timedelta
import warnings
import pyarrow as pa
import pyarrow.parquet as pq
import re

pd.set_option('display.max_columns', None)

### *RUTAS*

In [31]:
ruta_matricula_bronze = '../data/fs/matricula_bronze/'
ruta_matricula_gold = '../data/fs/matricula_gold/'
ruta_cobranzas_silver = '../data/fs/cobranzas_silver/'
ruta_cursos_gold = '../data/fs/cursos_gold/'
ruta_inasistencias_gold = '../data/fs/inasistencias_gold/'
ruta_sesion_plataforma_gold = '../data/fs/sesion_plataforma_gold/'
ruta_acceso_cursos_gold = '../data/fs/acceso_cursos_gold/'

ruta_output_pq = '../output/ds_fs.parquet'

### *FUNCIONES*

In [4]:
def LimpiarNumero( numero_str ) :
    try : 
        numero = float( numero_str )
    except :
        try :
            numero = float( str( numero_str ).replace( ',' , '.' ) )
        except :
            numero = np.nan
    return numero

### *DATOS A ESPECIFICAR*

In [21]:
dict_fechas_inicio_periodo = {
    202401 : datetime( 2024 , 3 , 25 ) ,
    202402 : datetime( 2024 , 8 , 12 ) ,
    202501 : datetime( 2025 , 3 , 24 ) ,
    202502 : datetime( 2025 , 8 , 18 ) ,
}

lista_periodos = sorted( list( dict_fechas_inicio_periodo.keys() ) )

# CARGAR DATA

## MATRÍCULA BRONZE

In [13]:
cols = [
    'COD_ALUMNO' , 
    # 'CREDITOS_APROBADOS_ACUMULADOS' , 
    # 'CREDITOS_TOTALES_CARRERA' , 
    # 'CANTIDAD_CURSOS_MATRICULADOS' , 
    'DOCUMENTO_ALUMNO' , 
    # 'NRO_PONDERADO_ACUMULADO' , 
    # 'PONDERADO_CICLO_ANTERIOR' ,
]
conv = {
    'COD_ALUMNO' : str , 
    'DOCUMENTO_ALUMNO' : str , 
    # 'CREDITOS_APROBADOS_ACUMULADOS' : str , 
    # 'CREDITOS_TOTALES_CARRERA' : str , 
    # 'NRO_PONDERADO_ACUMULADO' : str , 
    # 'PONDERADO_CICLO_ANTERIOR' : str ,
}
rename = {
    'COD_ALUMNO' : 'cod_alumno' ,
    # 'CREDITOS_APROBADOS_ACUMULADOS' : 'cant_creds_aprob_acum' , 
    # 'CREDITOS_TOTALES_CARRERA' : 'cant_creds_carrera' , 
    # 'CANTIDAD_CURSOS_MATRICULADOS' : 'cant_cursos_matr' , 
    'DOCUMENTO_ALUMNO' : 'documento' , 
    # 'NRO_PONDERADO_ACUMULADO' : 'ponderado_acum' , 
    # 'PONDERADO_CICLO_ANTERIOR' : 'ponderado_ant' ,
}

lista_dfs = []
for arch in os.listdir( ruta_matricula_bronze ) :
    df_matr_b = pd.read_csv( ruta_matricula_bronze + arch , sep = '|' , usecols = cols , converters = conv )
    df_matr_b.rename( rename , axis = 1 , inplace = True )
    df_matr_b['periodo'] = int( arch[ -10 : -4 ] )
    lista_dfs.append( df_matr_b )
df_matr_b = pd.concat( lista_dfs , axis = 0 ).reset_index( drop = True )

# df_matr_b['cant_creds_aprob_acum'] = df_matr_b['cant_creds_aprob_acum'].apply( LimpiarNumero )
# df_matr_b['cant_creds_carrera'] = df_matr_b['cant_creds_carrera'].apply( LimpiarNumero )
# df_matr_b['ponderado_acum'] = df_matr_b['ponderado_acum'].apply( LimpiarNumero )
# df_matr_b['ponderado_ant'] = df_matr_b['ponderado_ant'].apply( LimpiarNumero )

df_matr_b

Unnamed: 0,cod_alumno,documento,periodo
0,N00240983,77529979,202401
1,N00256833,72707410,202401
2,N00404113,72706434,202401
3,N00205705,72799837,202401
4,N00426135,70706800,202401
...,...,...,...
510961,N00521205,73906045,202502
510962,N00521232,40806322,202502
510963,N00521254,74066598,202502
510964,N00521317,77699069,202502


## MATRÍCULA GOLD

In [14]:
df_matr_g = pd.read_parquet( ruta_matricula_gold )

df_matr_g = df_matr_g.groupby( [ 'periodo' , 'cod_alumno' ] , as_index = False ).agg(
    unidad_negocio = ( 'unidad_negocio' , 'last' ) ,
    producto = ( 'carrera' , 'last' ) ,
    edad = ( 'edad' , 'max' ) ,
    pct_beca = ( 'pct_beca' , 'max' ) ,
)

df_matr_g

Unnamed: 0,periodo,cod_alumno,unidad_negocio,producto,edad,pct_beca
0,202001,N00001048,WA,CONTABILIDAD Y FINANZAS,57,0.65
1,202001,N00001095,WA,DERECHO,49,0.00
2,202001,N00001409,WA,DERECHO,44,0.00
3,202001,N00001588,WA,ING. DE MINAS,42,0.00
4,202001,N00002040,WA,ING. CIVIL,42,0.00
...,...,...,...,...,...,...
1399131,202502,N00521521,WV,ING. INDUSTRIAL,22,0.00
1399132,202502,N00521522,WV,ING. INDUSTRIAL,32,0.00
1399133,202502,N00521523,WV,CONTABILIDAD Y FINANZAS,20,0.00
1399134,202502,N00521524,WV,ING. DE SIST. COMPUTACIONALES,28,0.00


## COBRANZAS SILVER

In [38]:
df_cobr_s = pd.read_parquet( ruta_cobranzas_silver )
# df_cobr_s = df_cobr_s[ df_cobr_s['flag_matricula'] == 1 ].copy()

# df_cobr_s = df_cobr_s.groupby( [ 'periodo' , 'cod_alumno' ] , as_index = False ).agg(
#     fecha_pago_cuota_0 = ( 'fecha_pago' , 'max' ) ,
#     monto_cuota_0 = ( 'monto' , 'sum' )
# )

df_cobr_s

Unnamed: 0,periodo,cod_alumno,tipo_documento,cod_documento,fecha_documento,fecha_vencimiento,fecha_pago,monto,monto_pagado,cod_cobranza,flag_matricula,flag_tuition,cod_documento_asociado,nro_cuota
932585,202401,N00001079,B1,1714294,2024-05-27,2024-05-27,2024-05-27,2100.0,2100.0,000000057809,0,0,,0.0
919126,202401,N00001095,B1,1648265,2024-03-26,2024-03-26,2024-03-26,2100.0,2100.0,000000444964,0,0,,0.0
904887,202401,N00001212,B1,1731848,2024-06-05,2024-06-05,2024-06-05,2100.0,2100.0,000620535635,0,0,,0.0
874480,202401,N00001320,B1,1805339,2024-07-23,2024-07-23,2024-07-23,2180.0,2180.0,000000055754,0,0,,0.0
368693,202401,N00001397,F1,0014226757,2024-05-07,2024-06-07,NaT,3240.0,-0.0,,0,0,,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
227171,202502,N00521523,B1,0018524228,2025-09-08,2025-09-08,2025-09-08,592.5,592.5,000004902982,1,0,,0.0
227228,202502,N00521524,B1,0018524234,2025-08-31,2025-09-08,2025-08-31,95.0,95.0,000002441326,1,0,,0.0
260993,202502,N00521524,B1,0018524235,2025-09-05,2025-09-08,2025-09-05,592.5,592.5,000000276884,1,0,,0.0
50125,202502,N00521525,B1,0018524251,2025-09-01,2025-09-08,2025-09-01,95.0,95.0,000030305685,1,0,,0.0


## CURSOS GOLD

In [10]:
df_cursos_g = pd.read_parquet( ruta_cursos_gold )

df_cursos_g = df_cursos_g.groupby( [ 'periodo' , 'cod_alumno' ] , as_index = False ).agg(
    cursos_matriculados = ( 'n_curmatr' , 'max' ) ,
    creditos_matriculados = ( 'sum_cred_curmatr' , 'max' ) ,
    ciclo_avg = ( 'avg_ciclo' , 'max' ) ,
)

df_cursos_g

Unnamed: 0,periodo,cod_alumno,cursos_matriculados,creditos_matriculados,ciclo_avg
0,202001,N00001048,4,19.0,9.500000
1,202001,N00001095,5,21.0,3.200000
2,202001,N00001409,5,22.0,5.000000
3,202001,N00001588,6,21.0,1.666667
4,202001,N00002040,2,11.0,10.500000
...,...,...,...,...,...
1420714,202502,N00521521,8,20.0,1.000000
1420715,202502,N00521522,8,20.0,1.000000
1420716,202502,N00521523,7,20.0,1.000000
1420717,202502,N00521524,8,20.0,1.000000


## INASISTENCIAS GOLD

In [57]:
df_inasist_g = pd.read_parquet( ruta_inasistencias_gold )

df_inasist_g['periodo'] = df_inasist_g['periodo'].astype( int )

df_inasist_g = df_inasist_g.groupby( [ 'periodo' , 'cod_alumno' , 'fecha_corte' ] , as_index = False ).agg(
    cant_asistencias = ( 'cant_asistencias' , 'max' ) ,
    cant_inasistencias = ( 'cant_inasistencias' , 'max' ) ,
)

df_inasist_g

Unnamed: 0,periodo,cod_alumno,fecha_corte,cant_asistencias,cant_inasistencias
0,202401,N00001409,2024-04-04,0,1
1,202401,N00001409,2024-04-11,3,1
2,202401,N00001409,2024-04-18,3,4
3,202401,N00001409,2024-04-25,3,7
4,202401,N00001409,2024-05-02,3,10
...,...,...,...,...,...
12193285,202502,N00521525,2026-02-05,7,2
12193286,202502,N00521525,2026-02-12,7,2
12193287,202502,N00521525,2026-02-19,7,2
12193288,202502,N00521525,2026-02-26,7,2


## SESIÓN PLATAFORMA GOLD

In [35]:
df_sesplat_g = pd.read_parquet( ruta_sesion_plataforma_gold )

df_sesplat_g = df_sesplat_g.groupby( [ 'periodo' , 'cod_alumno' , 'fecha_corte' ] , as_index = False ).agg(
    cant_logs_bb = ( 'count_logs_plataforma' , 'max' ) ,
)

df_sesplat_g

Unnamed: 0,periodo,cod_alumno,fecha_corte,cant_logs_bb
0,202401,N00001515,2024-04-11,18
1,202401,N00001515,2024-04-18,38
2,202401,N00001515,2024-04-25,62
3,202401,N00001515,2024-05-02,96
4,202401,N00001515,2024-05-09,130
...,...,...,...,...
710986,202502,N00521524,2026-02-05,20
710987,202502,N00521524,2026-02-12,20
710988,202502,N00521524,2026-02-19,20
710989,202502,N00521524,2026-02-26,20


## ACCESO CURSOS GOLD

In [37]:
df_acccur_g = pd.read_parquet( ruta_acceso_cursos_gold )

df_acccur_g = df_acccur_g.groupby( [ 'periodo' , 'cod_alumno' , 'fecha_corte' ] , as_index = False ).agg(
    cant_logs_bb_cursos = ( 'count_logs_curso' , 'max' ) ,
)

df_acccur_g

Unnamed: 0,periodo,cod_alumno,fecha_corte,cant_logs_bb_cursos
0,202401,N00001515,2024-04-11,16
1,202401,N00001515,2024-04-18,36
2,202401,N00001515,2024-04-25,68
3,202401,N00001515,2024-05-02,118
4,202401,N00001515,2024-05-09,160
...,...,...,...,...
688425,202502,N00521524,2026-02-05,33
688426,202502,N00521524,2026-02-12,33
688427,202502,N00521524,2026-02-19,33
688428,202502,N00521524,2026-02-26,33


# TRATAMIENTO DE DATA

## DATA AL INICIO DE CICLO

In [68]:
df_inicio_ciclo = pd.merge( df_matr_g , df_matr_b , how = 'left' , on = [ 'periodo' , 'cod_alumno' ] )
df_inicio_ciclo = df_inicio_ciclo[ df_inicio_ciclo['periodo'].isin( lista_periodos ) ].copy().reset_index( drop = True )
df_inicio_ciclo = pd.merge( df_inicio_ciclo , df_cursos_g , how = 'left' , on = [ 'periodo' , 'cod_alumno' ] )
df_inicio_ciclo['fecha_inicio_periodo'] = df_inicio_ciclo['periodo'].map( dict_fechas_inicio_periodo )

df_cobr_c0 = df_cobr_s.copy()
df_cobr_c0 = df_cobr_c0[ df_cobr_c0['flag_matricula'] == 1 ].copy()
df_cobr_c0 = df_cobr_c0.groupby( [ 'periodo' , 'cod_alumno' ] , as_index = False ).agg(
    fecha_pago_cuota_0 = ( 'fecha_pago' , 'max' ) ,
    monto_cuota_0 = ( 'monto' , 'sum' )
)
df_inicio_ciclo = pd.merge( df_inicio_ciclo , df_cobr_c0 , how = 'left' , on = [ 'periodo' , 'cod_alumno' ] )
df_inicio_ciclo['delta_dias_c0_inicio_clases'] = ( df_inicio_ciclo['fecha_inicio_periodo'] - df_inicio_ciclo['fecha_pago_cuota_0'] ).dt.days

lista_dfs = []
for n_semana in range( 1 , 5 ) :
    df_inicio_ciclo_temp = df_inicio_ciclo.copy()
    df_inicio_ciclo_temp['semana_ciclo'] = n_semana
    cant_dias = 7 * n_semana
    df_inicio_ciclo_temp['fecha_corte'] = df_inicio_ciclo_temp['fecha_inicio_periodo'] + timedelta( days = cant_dias )
    lista_dfs.append( df_inicio_ciclo_temp )
df_inicio_ciclo = pd.concat( lista_dfs , axis = 0 ).reset_index( drop = True )

df_sesplat_inicio = df_sesplat_g.copy()
df_sesplat_inicio['fecha_inicio_periodo'] = df_sesplat_inicio['periodo'].map( dict_fechas_inicio_periodo )
df_sesplat_inicio = df_sesplat_inicio.sort_values( [ 'periodo' , 'cod_alumno' , 'fecha_corte' ] , ascending = [ True , True , True ] ).reset_index( drop = True )
df_sesplat_inicio = df_sesplat_inicio[ df_sesplat_inicio['fecha_inicio_periodo'] > df_sesplat_inicio['fecha_corte'] ].groupby( [ 'periodo' , 'cod_alumno' ] , as_index = False )['cant_logs_bb'].last()
df_sesplat_inicio.columns = [ 'periodo' , 'cod_alumno' , 'ingresos_bb_prev' ]

df_acccur_inicio = df_acccur_g.copy()
df_acccur_inicio['fecha_inicio_periodo'] = df_acccur_inicio['periodo'].map( dict_fechas_inicio_periodo )
df_acccur_inicio = df_acccur_inicio.sort_values( [ 'periodo' , 'cod_alumno' , 'fecha_corte' ] , ascending = [ True , True , True ] ).reset_index( drop = True )
df_acccur_inicio = df_acccur_inicio[ df_acccur_inicio['fecha_inicio_periodo'] > df_acccur_inicio['fecha_corte'] ].groupby( [ 'periodo' , 'cod_alumno' ] , as_index = False )['cant_logs_bb_cursos'].last()
df_acccur_inicio.columns = [ 'periodo' , 'cod_alumno' , 'ingresos_bb_curso_prev' ]

df_inicio_ciclo = pd.merge( df_inicio_ciclo , df_sesplat_inicio , how = 'left' , on = [ 'periodo' , 'cod_alumno' ] )
df_inicio_ciclo = pd.merge( df_inicio_ciclo , df_acccur_inicio , how = 'left' , on = [ 'periodo' , 'cod_alumno' ] )

df_inicio_ciclo = df_inicio_ciclo[ [ 'periodo' , 'fecha_inicio_periodo' , 'cod_alumno' , 'fecha_corte' , 'semana_ciclo' , 'documento' , 'unidad_negocio' , 'producto' , 'edad' , 'pct_beca' , 'cursos_matriculados' , 'creditos_matriculados' , 'ciclo_avg' , 'monto_cuota_0' , 'delta_dias_c0_inicio_clases' , 'ingresos_bb_prev' , 'ingresos_bb_curso_prev' ] ].copy()
df_inicio_ciclo = df_inicio_ciclo.sort_values( [ 'periodo' , 'cod_alumno' , 'fecha_corte' ] , ascending = [ True , True , True ] ).reset_index( drop = True )

df_inicio_ciclo

Unnamed: 0,periodo,fecha_inicio_periodo,cod_alumno,fecha_corte,semana_ciclo,documento,unidad_negocio,producto,edad,pct_beca,cursos_matriculados,creditos_matriculados,ciclo_avg,monto_cuota_0,delta_dias_c0_inicio_clases,ingresos_bb_prev,ingresos_bb_curso_prev
0,202401,2024-03-25,N00001409,2024-04-01,1,18165456,WA,DERECHO,48,0.0,5,19.0,11.2,925.0,5.0,,
1,202401,2024-03-25,N00001409,2024-04-08,2,18165456,WA,DERECHO,48,0.0,5,19.0,11.2,925.0,5.0,,
2,202401,2024-03-25,N00001409,2024-04-15,3,18165456,WA,DERECHO,48,0.0,5,19.0,11.2,925.0,5.0,,
3,202401,2024-03-25,N00001409,2024-04-22,4,18165456,WA,DERECHO,48,0.0,5,19.0,11.2,925.0,5.0,,
4,202401,2024-03-25,N00001515,2024-04-01,1,18207703,WV,DERECHO,47,0.0,5,16.0,2.0,632.5,-25.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2043859,202502,2025-08-18,N00521524,2025-09-15,4,70836962,WV,ING. DE SIST. COMPUTACIONALES,28,0.0,8,20.0,1.0,687.5,-18.0,,
2043860,202502,2025-08-18,N00521525,2025-08-25,1,60041562,UG,ENFERMERIA,19,0.0,6,21.0,1.0,671.0,-21.0,,
2043861,202502,2025-08-18,N00521525,2025-09-01,2,60041562,UG,ENFERMERIA,19,0.0,6,21.0,1.0,671.0,-21.0,,
2043862,202502,2025-08-18,N00521525,2025-09-08,3,60041562,UG,ENFERMERIA,19,0.0,6,21.0,1.0,671.0,-21.0,,


## COBRANZAS A FECHA DE CORTE

In [None]:
df_cobr_date = df_cobr_s.copy()

lista_dfs = []
for periodo in sorted( df_inicio_ciclo['periodo'].unique().tolist() ) :

    df_cobr_date_temp = df_cobr_date[ df_cobr_date['periodo'] == periodo ].copy().reset_index( drop = True )

    for fecha_corte in sorted( df_inicio_ciclo['fecha_corte'][ df_inicio_ciclo['periodo'] == periodo ].unique().tolist() ) :

        df_cobr_monto = df_cobr_date_temp[ df_cobr_date_temp['fecha_documento'] < fecha_corte ].groupby( [ 'periodo' , 'cod_alumno' ] , as_index = False )[ [ 'monto' , 'monto_pagado' ] ].sum()
        df_cobr_monto['fecha_corte'] = fecha_corte
        df_cobr_monto['ratio_monto_pagado'] = df_cobr_monto['monto_pagado'] / df_cobr_monto['monto']

        df_cobr_mora = df_cobr_date_temp[ ( df_cobr_date_temp['fecha_documento'] < fecha_corte ) & ( df_cobr_date_temp['fecha_vencimiento'] < fecha_corte ) & ( ( df_cobr_date_temp['fecha_pago'].isna() ) | ( df_cobr_date_temp['fecha_pago'] >= fecha_corte ) ) ].copy().reset_index( drop = True )
        df_cobr_mora['dias_mora_acum'] = ( fecha_corte - df_cobr_mora['fecha_vencimiento'] ).dt.days
        df_cobr_mora = df_cobr_mora.groupby( [ 'periodo' , 'cod_alumno' ] , as_index = False )['dias_mora_acum'].sum()

        df_cobr_date_temp_2 = pd.merge( df_cobr_monto , df_cobr_mora , how = 'left' , on = [ 'periodo' , 'cod_alumno' ] )
        df_cobr_date_temp_2['dias_mora_acum'] = df_cobr_date_temp_2['dias_mora_acum'].fillna( 0 ).astype( int )

        lista_dfs.append( df_cobr_date_temp_2 )

df_cobr_date = pd.concat( lista_dfs , axis = 0 ).reset_index( drop = True )
df_cobr_date = df_cobr_date[ [ 'periodo' , 'cod_alumno' , 'fecha_corte' , 'ratio_monto_pagado' , 'dias_mora_acum' ] ].copy()

df_cobr_date

Unnamed: 0,periodo,cod_alumno,fecha_corte,ratio_monto_pagado,dias_mora_acum
0,202401,N00001095,2024-04-01,1.000000,0
1,202401,N00001409,2024-04-01,1.000000,0
2,202401,N00001515,2024-04-01,1.000000,0
3,202401,N00002001,2024-04-01,1.000000,32
4,202401,N00002296,2024-04-01,1.000000,0
...,...,...,...,...,...
2084648,202502,N00521521,2025-09-15,0.138182,4
2084649,202502,N00521522,2025-09-15,0.138182,4
2084650,202502,N00521523,2025-09-15,1.000000,0
2084651,202502,N00521524,2025-09-15,1.000000,0


## INASISTENCIAS A FECHA DE CORTE

In [61]:
lista_dfs = []
for periodo in sorted( df_inicio_ciclo['periodo'].unique().tolist() ) :

    df_inasist_date_temp = df_inasist_g[ df_inasist_g['periodo'] == periodo ].sort_values( [ 'fecha_corte' ] , ascending = [ True ] ).reset_index( drop = True )

    for fecha_corte in sorted( df_inicio_ciclo['fecha_corte'][ df_inicio_ciclo['periodo'] == periodo ].unique().tolist() ) :

        df_inasist_date = df_inasist_date_temp[ df_inasist_date_temp['fecha_corte'] < fecha_corte ].groupby( [ 'periodo' , 'cod_alumno' ] , as_index = False )[ [ 'cant_asistencias' , 'cant_inasistencias' ] ].last()
        df_inasist_date['fecha_corte'] = fecha_corte
        df_inasist_date['ratio_asistencias'] = df_inasist_date['cant_asistencias'] / ( df_inasist_date['cant_asistencias'] + df_inasist_date['cant_inasistencias'] )
        
        lista_dfs.append( df_inasist_date )

df_inasist_date = pd.concat( lista_dfs , axis = 0 ).reset_index( drop = True )
df_inasist_date = df_inasist_date[ [ 'periodo' , 'cod_alumno' , 'fecha_corte' , 'cant_asistencias' , 'ratio_asistencias' ] ].copy()
df_inasist_date.rename( { 'cant_asistencias' : 'asistencias' } , axis = 1 , inplace = True )

df_inasist_date

Unnamed: 0,periodo,cod_alumno,fecha_corte,asistencias,ratio_asistencias
0,202401,N00003410,2024-04-01,1,1.000000
1,202401,N00003993,2024-04-01,0,0.000000
2,202401,N00004887,2024-04-01,1,1.000000
3,202401,N00011870,2024-04-01,1,1.000000
4,202401,N00011890,2024-04-01,0,0.000000
...,...,...,...,...,...
1825718,202502,N00521514,2025-09-15,3,0.500000
1825719,202502,N00521520,2025-09-15,1,1.000000
1825720,202502,N00521521,2025-09-15,1,1.000000
1825721,202502,N00521522,2025-09-15,1,1.000000


## SESIÓN PLATAFORMA A FECHA DE CORTE

In [71]:
lista_dfs = []
for periodo in sorted( df_inicio_ciclo['periodo'].unique().tolist() ) :

    df_sesplat_date_temp = df_sesplat_g[ df_sesplat_g['periodo'] == periodo ].sort_values( [ 'fecha_corte' ] , ascending = [ True ] ).reset_index( drop = True )
    fecha_inicio_periodo = dict_fechas_inicio_periodo[periodo]

    for fecha_corte in sorted( df_inicio_ciclo['fecha_corte'][ df_inicio_ciclo['periodo'] == periodo ].unique().tolist() ) :

        df_sesplat_date = df_sesplat_date_temp[ ( df_sesplat_date_temp['fecha_corte'] >= fecha_inicio_periodo ) & ( df_sesplat_date_temp['fecha_corte'] < fecha_corte ) ].groupby( [ 'periodo' , 'cod_alumno' ] , as_index = False )[ [ 'cant_logs_bb' ] ].last()
        df_sesplat_date['fecha_corte'] = fecha_corte
        
        lista_dfs.append( df_sesplat_date )

df_sesplat_date = pd.concat( lista_dfs , axis = 0 ).reset_index( drop = True )
df_sesplat_date = df_sesplat_date[ [ 'periodo' , 'cod_alumno' , 'fecha_corte' , 'cant_logs_bb' ] ].copy()
df_sesplat_date.rename( { 'cant_logs_bb' : 'ingresos_bb_post' } , axis = 1 , inplace = True )

df_sesplat_date

Unnamed: 0,periodo,cod_alumno,fecha_corte,ingresos_bb_post
0,202401,N00012150,2024-04-01,12
1,202401,N00014473,2024-04-01,86
2,202401,N00014621,2024-04-01,20
3,202401,N00016266,2024-04-01,10
4,202401,N00016947,2024-04-01,6
...,...,...,...,...
102627,202502,N00521519,2025-09-15,3
102628,202502,N00521520,2025-09-15,7
102629,202502,N00521521,2025-09-15,5
102630,202502,N00521523,2025-09-15,17


## ACCESO CURSOS A FECHA DE CORTE

In [73]:
lista_dfs = []
for periodo in sorted( df_inicio_ciclo['periodo'].unique().tolist() ) :

    df_acccur_date_temp = df_acccur_g[ df_acccur_g['periodo'] == periodo ].sort_values( [ 'fecha_corte' ] , ascending = [ True ] ).reset_index( drop = True )
    fecha_inicio_periodo = dict_fechas_inicio_periodo[periodo]

    for fecha_corte in sorted( df_inicio_ciclo['fecha_corte'][ df_inicio_ciclo['periodo'] == periodo ].unique().tolist() ) :

        df_acccur_date = df_acccur_date_temp[ ( df_acccur_date_temp['fecha_corte'] >= fecha_inicio_periodo ) & ( df_acccur_date_temp['fecha_corte'] < fecha_corte ) ].groupby( [ 'periodo' , 'cod_alumno' ] , as_index = False )[ [ 'cant_logs_bb_cursos' ] ].last()
        df_acccur_date['fecha_corte'] = fecha_corte
        
        lista_dfs.append( df_acccur_date )

df_acccur_date = pd.concat( lista_dfs , axis = 0 ).reset_index( drop = True )
df_acccur_date = df_acccur_date[ [ 'periodo' , 'cod_alumno' , 'fecha_corte' , 'cant_logs_bb_cursos' ] ].copy()
df_acccur_date.rename( { 'cant_logs_bb_cursos' : 'ingresos_bb_curso_post' } , axis = 1 , inplace = True )

df_acccur_date

Unnamed: 0,periodo,cod_alumno,fecha_corte,ingresos_bb_curso_post
0,202401,N00012150,2024-04-01,2
1,202401,N00014473,2024-04-01,142
2,202401,N00014621,2024-04-01,24
3,202401,N00016266,2024-04-01,6
4,202401,N00016947,2024-04-01,10
...,...,...,...,...
99241,202502,N00521519,2025-09-15,4
99242,202502,N00521520,2025-09-15,7
99243,202502,N00521521,2025-09-15,8
99244,202502,N00521523,2025-09-15,18


## CRUCE DATA INICIO CICLO CON FECHA DE CORTE

In [75]:
df_cruce = df_inicio_ciclo.copy()

df_cruce = pd.merge( df_cruce , df_cobr_date , how = 'left' , on = [ 'periodo' , 'cod_alumno' , 'fecha_corte' ] )
df_cruce = pd.merge( df_cruce , df_inasist_date , how = 'left' , on = [ 'periodo' , 'cod_alumno' , 'fecha_corte' ] )
df_cruce = pd.merge( df_cruce , df_sesplat_date , how = 'left' , on = [ 'periodo' , 'cod_alumno' , 'fecha_corte' ] )
df_cruce = pd.merge( df_cruce , df_acccur_date , how = 'left' , on = [ 'periodo' , 'cod_alumno' , 'fecha_corte' ] )


df_cruce

Unnamed: 0,periodo,fecha_inicio_periodo,cod_alumno,fecha_corte,semana_ciclo,documento,unidad_negocio,producto,edad,pct_beca,cursos_matriculados,creditos_matriculados,ciclo_avg,monto_cuota_0,delta_dias_c0_inicio_clases,ingresos_bb_prev,ingresos_bb_curso_prev,ratio_monto_pagado,dias_mora_acum,asistencias,ratio_asistencias,ingresos_bb_post,ingresos_bb_curso_post
0,202401,2024-03-25,N00001409,2024-04-01,1,18165456,WA,DERECHO,48,0.0,5,19.0,11.2,925.0,5.0,,,1.0,0.0,,,,
1,202401,2024-03-25,N00001409,2024-04-08,2,18165456,WA,DERECHO,48,0.0,5,19.0,11.2,925.0,5.0,,,1.0,0.0,0.0,0.000000,,
2,202401,2024-03-25,N00001409,2024-04-15,3,18165456,WA,DERECHO,48,0.0,5,19.0,11.2,925.0,5.0,,,1.0,0.0,3.0,0.750000,,
3,202401,2024-03-25,N00001409,2024-04-22,4,18165456,WA,DERECHO,48,0.0,5,19.0,11.2,925.0,5.0,,,1.0,0.0,3.0,0.428571,,
4,202401,2024-03-25,N00001515,2024-04-01,1,18207703,WV,DERECHO,47,0.0,5,16.0,2.0,632.5,-25.0,,,1.0,0.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2043859,202502,2025-08-18,N00521524,2025-09-15,4,70836962,WV,ING. DE SIST. COMPUTACIONALES,28,0.0,8,20.0,1.0,687.5,-18.0,,,1.0,0.0,,,17.0,29.0
2043860,202502,2025-08-18,N00521525,2025-08-25,1,60041562,UG,ENFERMERIA,19,0.0,6,21.0,1.0,671.0,-21.0,,,,,,,,
2043861,202502,2025-08-18,N00521525,2025-09-01,2,60041562,UG,ENFERMERIA,19,0.0,6,21.0,1.0,671.0,-21.0,,,,,,,,
2043862,202502,2025-08-18,N00521525,2025-09-08,3,60041562,UG,ENFERMERIA,19,0.0,6,21.0,1.0,671.0,-21.0,,,1.0,0.0,3.0,0.750000,,


## TARGETS

In [81]:
df_targets = df_cruce[ [ 'periodo' , 'cod_alumno' , 'semana_ciclo' , 'asistencias' , 'ingresos_bb_curso_post' ] ].copy()

df_targets = df_targets.fillna( 0 )

df_targets['flag_zombie_presencial_1w'] = ( df_targets['semana_ciclo'] == 1 ) * ( df_targets['asistencias'] == 0 ) * 1
df_targets['flag_zombie_presencial_2w'] = ( df_targets['semana_ciclo'] == 2 ) * ( df_targets['asistencias'] == 0 ) * 1
df_targets['flag_zombie_presencial_3w'] = ( df_targets['semana_ciclo'] == 3 ) * ( df_targets['asistencias'] == 0 ) * 1
df_targets['flag_zombie_presencial_4w'] = ( df_targets['semana_ciclo'] == 4 ) * ( df_targets['asistencias'] == 0 ) * 1

df_targets['flag_zombie_virtual_1w'] = ( df_targets['semana_ciclo'] == 1 ) * ( df_targets['ingresos_bb_curso_post'] == 0 ) * 1
df_targets['flag_zombie_virtual_2w'] = ( df_targets['semana_ciclo'] == 2 ) * ( df_targets['ingresos_bb_curso_post'] == 0 ) * 1
df_targets['flag_zombie_virtual_3w'] = ( df_targets['semana_ciclo'] == 3 ) * ( df_targets['ingresos_bb_curso_post'] == 0 ) * 1
df_targets['flag_zombie_virtual_4w'] = ( df_targets['semana_ciclo'] == 4 ) * ( df_targets['ingresos_bb_curso_post'] == 0 ) * 1

df_targets = df_targets.drop( [ 'semana_ciclo' , 'asistencias' , 'ingresos_bb_curso_post' ] , axis = 1 ).groupby( [ 'periodo' , 'cod_alumno' ] , as_index = False ).max()

df_targets

Unnamed: 0,periodo,cod_alumno,flag_zombie_presencial_1w,flag_zombie_presencial_2w,flag_zombie_presencial_3w,flag_zombie_presencial_4w,flag_zombie_virtual_1w,flag_zombie_virtual_2w,flag_zombie_virtual_3w,flag_zombie_virtual_4w
0,202401,N00001409,1,1,0,0,1,1,1,1
1,202401,N00001515,1,1,1,0,1,1,0,0
2,202401,N00002001,1,1,0,0,1,1,1,1
3,202401,N00002296,1,0,0,0,1,1,1,1
4,202401,N00002355,1,0,0,0,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...
510930,202502,N00521521,1,1,1,0,1,1,0,0
510931,202502,N00521522,1,1,1,0,1,1,1,1
510932,202502,N00521523,1,1,1,1,1,1,0,0
510933,202502,N00521524,1,1,1,1,1,1,0,0


## CRUCE BD CON TARGETS

In [88]:
df_cruce_final = pd.merge( df_cruce , df_targets , how = 'left' , on = [ 'periodo' , 'cod_alumno' ] )

df_cruce_final

Unnamed: 0,periodo,fecha_inicio_periodo,cod_alumno,fecha_corte,semana_ciclo,documento,unidad_negocio,producto,edad,pct_beca,cursos_matriculados,creditos_matriculados,ciclo_avg,monto_cuota_0,delta_dias_c0_inicio_clases,ingresos_bb_prev,ingresos_bb_curso_prev,ratio_monto_pagado,dias_mora_acum,asistencias,ratio_asistencias,ingresos_bb_post,ingresos_bb_curso_post,flag_zombie_presencial_1w,flag_zombie_presencial_2w,flag_zombie_presencial_3w,flag_zombie_presencial_4w,flag_zombie_virtual_1w,flag_zombie_virtual_2w,flag_zombie_virtual_3w,flag_zombie_virtual_4w
0,202401,2024-03-25,N00001409,2024-04-01,1,18165456,WA,DERECHO,48,0.0,5,19.0,11.2,925.0,5.0,,,1.0,0.0,,,,,1,1,0,0,1,1,1,1
1,202401,2024-03-25,N00001409,2024-04-08,2,18165456,WA,DERECHO,48,0.0,5,19.0,11.2,925.0,5.0,,,1.0,0.0,0.0,0.000000,,,1,1,0,0,1,1,1,1
2,202401,2024-03-25,N00001409,2024-04-15,3,18165456,WA,DERECHO,48,0.0,5,19.0,11.2,925.0,5.0,,,1.0,0.0,3.0,0.750000,,,1,1,0,0,1,1,1,1
3,202401,2024-03-25,N00001409,2024-04-22,4,18165456,WA,DERECHO,48,0.0,5,19.0,11.2,925.0,5.0,,,1.0,0.0,3.0,0.428571,,,1,1,0,0,1,1,1,1
4,202401,2024-03-25,N00001515,2024-04-01,1,18207703,WV,DERECHO,47,0.0,5,16.0,2.0,632.5,-25.0,,,1.0,0.0,,,,,1,1,1,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2043859,202502,2025-08-18,N00521524,2025-09-15,4,70836962,WV,ING. DE SIST. COMPUTACIONALES,28,0.0,8,20.0,1.0,687.5,-18.0,,,1.0,0.0,,,17.0,29.0,1,1,1,1,1,1,0,0
2043860,202502,2025-08-18,N00521525,2025-08-25,1,60041562,UG,ENFERMERIA,19,0.0,6,21.0,1.0,671.0,-21.0,,,,,,,,,1,1,0,0,1,1,1,1
2043861,202502,2025-08-18,N00521525,2025-09-01,2,60041562,UG,ENFERMERIA,19,0.0,6,21.0,1.0,671.0,-21.0,,,,,,,,,1,1,0,0,1,1,1,1
2043862,202502,2025-08-18,N00521525,2025-09-08,3,60041562,UG,ENFERMERIA,19,0.0,6,21.0,1.0,671.0,-21.0,,,1.0,0.0,3.0,0.750000,,,1,1,0,0,1,1,1,1


# DATASET FINAL

In [90]:
df_final = df_cruce_final.copy()

cols_final = [
    'periodo',
    'fecha_inicio_periodo',
    'fecha_corte',
    'semana_ciclo',
    'cod_alumno',
    'documento',
    'unidad_negocio',
    'producto',
    'edad',
    'pct_beca',
    'cursos_matriculados',
    'creditos_matriculados',
    'ciclo_avg',
    'monto_cuota_0',
    'delta_dias_c0_inicio_clases',
    'ratio_monto_pagado',
    'dias_mora_acum',
    'asistencias',
    'ratio_asistencias',
    'ingresos_bb_prev',
    'ingresos_bb_post',
    'ingresos_bb_curso_prev',
    'ingresos_bb_curso_post',
    'flag_zombie_presencial_1w',
    'flag_zombie_presencial_2w',
    'flag_zombie_presencial_3w',
    'flag_zombie_presencial_4w',
    'flag_zombie_virtual_1w',
    'flag_zombie_virtual_2w',
    'flag_zombie_virtual_3w',
    'flag_zombie_virtual_4w',
]
df_final = df_final[ cols_final ].sort_values( [ 'periodo' , 'cod_alumno' , 'fecha_corte' ] , ascending = [ True , True , True ] ).reset_index( drop = True )

df_final

Unnamed: 0,periodo,fecha_inicio_periodo,fecha_corte,semana_ciclo,cod_alumno,documento,unidad_negocio,producto,edad,pct_beca,cursos_matriculados,creditos_matriculados,ciclo_avg,monto_cuota_0,delta_dias_c0_inicio_clases,ratio_monto_pagado,dias_mora_acum,asistencias,ratio_asistencias,ingresos_bb_prev,ingresos_bb_post,ingresos_bb_curso_prev,ingresos_bb_curso_post,flag_zombie_presencial_1w,flag_zombie_presencial_2w,flag_zombie_presencial_3w,flag_zombie_presencial_4w,flag_zombie_virtual_1w,flag_zombie_virtual_2w,flag_zombie_virtual_3w,flag_zombie_virtual_4w
0,202401,2024-03-25,2024-04-01,1,N00001409,18165456,WA,DERECHO,48,0.0,5,19.0,11.2,925.0,5.0,1.0,0.0,,,,,,,1,1,0,0,1,1,1,1
1,202401,2024-03-25,2024-04-08,2,N00001409,18165456,WA,DERECHO,48,0.0,5,19.0,11.2,925.0,5.0,1.0,0.0,0.0,0.000000,,,,,1,1,0,0,1,1,1,1
2,202401,2024-03-25,2024-04-15,3,N00001409,18165456,WA,DERECHO,48,0.0,5,19.0,11.2,925.0,5.0,1.0,0.0,3.0,0.750000,,,,,1,1,0,0,1,1,1,1
3,202401,2024-03-25,2024-04-22,4,N00001409,18165456,WA,DERECHO,48,0.0,5,19.0,11.2,925.0,5.0,1.0,0.0,3.0,0.428571,,,,,1,1,0,0,1,1,1,1
4,202401,2024-03-25,2024-04-01,1,N00001515,18207703,WV,DERECHO,47,0.0,5,16.0,2.0,632.5,-25.0,1.0,0.0,,,,,,,1,1,1,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2043859,202502,2025-08-18,2025-09-15,4,N00521524,70836962,WV,ING. DE SIST. COMPUTACIONALES,28,0.0,8,20.0,1.0,687.5,-18.0,1.0,0.0,,,,17.0,,29.0,1,1,1,1,1,1,0,0
2043860,202502,2025-08-18,2025-08-25,1,N00521525,60041562,UG,ENFERMERIA,19,0.0,6,21.0,1.0,671.0,-21.0,,,,,,,,,1,1,0,0,1,1,1,1
2043861,202502,2025-08-18,2025-09-01,2,N00521525,60041562,UG,ENFERMERIA,19,0.0,6,21.0,1.0,671.0,-21.0,,,,,,,,,1,1,0,0,1,1,1,1
2043862,202502,2025-08-18,2025-09-08,3,N00521525,60041562,UG,ENFERMERIA,19,0.0,6,21.0,1.0,671.0,-21.0,1.0,0.0,3.0,0.750000,,,,,1,1,0,0,1,1,1,1


In [91]:
pq.write_table( pa.Table.from_pandas( df_final ) , ruta_output_pq )