In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from pyspark.sql import SparkSession
from pyspark.sql.functions import concat, col, when, count, lit

In [138]:
# Base folder for the input CSV files read in this cell.
# NOTE(review): absolute, machine-specific path — the notebook only runs where this folder exists.
ruta = 'C:/Users/omarc/OneDrive/Escritorio/Proyectos VSC/OmarCano/Track Analítico UNI/05_Datos/02_Datos_Procesados'

# One frame per source file, each loaded with pandas' default CSV settings.
cepre_df = pd.read_csv(ruta + '/hm_cepre_uni_limpio.csv')
admisiones_df = pd.read_csv(ruta + '/hm_admisiones_uni_limpio.csv')
matriculados_df = pd.read_csv(ruta + '/hm_matriculados_uni_limpio.csv')

In [139]:
# Encode each enrollment period as one sortable integer (ANIO * 10 + PERIODO),
# then order the records by student and by that period code.
matriculados_df = (
    matriculados_df
    .assign(CodYear=lambda registros: registros['ANIO'] * 10 + registros['PERIODO'])
    .sort_values(by=['IDHASH', 'CodYear'])
)

In [140]:
def calculate_metrics(df):
    """Summarise the enrollment records into one row per student.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'IDHASH', 'CodYear' and 'CICLO_RELATIVO'.

    Returns
    -------
    pandas.DataFrame
        Columns 'IDHASH', 'min_CodYear', 'max_CodYear', 'max_CicloRelativo'
        and 'total_records' (count of non-null 'CodYear' values), with a
        fresh integer index.
    """
    # Output column name -> (source column, aggregation) for the named aggregation.
    summary_spec = {
        'min_CodYear': pd.NamedAgg(column='CodYear', aggfunc='min'),
        'max_CodYear': pd.NamedAgg(column='CodYear', aggfunc='max'),
        'max_CicloRelativo': pd.NamedAgg(column='CICLO_RELATIVO', aggfunc='max'),
        'total_records': pd.NamedAgg(column='CodYear', aggfunc='count'),
    }
    per_student = df.groupby('IDHASH')
    summary = per_student.agg(**summary_spec)
    # Turn the IDHASH group key back into a regular column.
    return summary.reset_index()

In [172]:
def determine_desercion(row, student_records, max_global_codyear, valid_codyears=None):
    """Classify one student's enrollment history.

    Parameters
    ----------
    row : mapping
        Per-student summary; 'min_CodYear' and 'max_CicloRelativo' are read.
    student_records : mapping / pandas.DataFrame
        The student's enrollment records; only the 'CodYear' column is read.
    max_global_codyear : int
        Reference period code that marks "currently enrolled".
    valid_codyears : iterable of int, optional
        Period codes that actually exist in the academic calendar. When given,
        only the codes between the student's first period and
        ``max_global_codyear`` (inclusive) are expected to be present.
        When omitted, every integer in that interval is expected, which is
        only correct if consecutive periods have consecutive codes. With codes
        built as ``ANIO * 10 + PERIODO`` that is not the case across a year
        boundary (e.g. 20192 -> 20201), so pass the calendar for such codes.

    Returns
    -------
    str
        "Finalizó carrera"  - the highest relative cycle is 8 or more;
        "Sigue estudiando"  - every expected period has a record;
        "Retomó estudios"   - some expected period is missing but the latest
                              record is in ``max_global_codyear``;
        "Desertó"           - otherwise (including a student with no records).
    """
    if row['max_CicloRelativo'] >= 8:
        return "Finalizó carrera"

    first_period = row['min_CodYear']
    if valid_codyears is None:
        # Legacy behaviour: treat period codes as a dense integer sequence.
        expected_periods = set(range(first_period, max_global_codyear + 1))
    else:
        expected_periods = {
            code for code in valid_codyears
            if first_period <= code <= max_global_codyear
        }

    enrolled_periods = set(student_records['CodYear'])

    if expected_periods.issubset(enrolled_periods):
        return "Sigue estudiando"

    # Past this point at least one expected period is missing, so only the
    # "is the latest record in the reference period?" question remains.
    # The emptiness guard avoids max() raising ValueError on a student with no records.
    if enrolled_periods and max(enrolled_periods) == max_global_codyear:
        return "Retomó estudios"
    return "Desertó"

In [143]:
def apply_desercion_logic(grouped, matriculados_df, max_global_codyear):
    """Attach a dropout flag and a status label to every student in ``grouped``.

    Parameters
    ----------
    grouped : pandas.DataFrame
        Frame with an 'IDHASH' column; the labels are left-joined onto it.
        Any 'Deserción' / 'Detalle' columns it already carries are replaced,
        so calling this function again on its own output is safe.
    matriculados_df : pandas.DataFrame
        Enrollment records with 'IDHASH', 'CodYear' and 'CICLO_RELATIVO'.
    max_global_codyear : int
        Reference period code that marks "currently enrolled".

    Returns
    -------
    pandas.DataFrame
        ``grouped`` plus 'Deserción' (1 when 'Detalle' is "Desertó", else 0)
        and 'Detalle' (the label returned by ``determine_desercion``).

    Notes
    -----
    Period codes are translated to their position in the observed academic
    calendar before classification. The notebook builds 'CodYear' as
    ``ANIO * 10 + PERIODO``, so consecutive periods are not consecutive
    integers (20192 is followed by 20201); on raw codes the range-based gap
    check in ``determine_desercion`` would see gaps that do not exist.
    Assumes every period that was offered appears in at least one enrollment
    record — TODO confirm against the academic calendar.
    """
    # Per-student summary (first/last period, highest relative cycle, ...).
    aux_df = calculate_metrics(matriculados_df)

    # Observed calendar (plus the reference period) -> dense positions 0..K-1.
    calendar = sorted(set(matriculados_df['CodYear'].dropna()) | {max_global_codyear})
    ordinal = {code: position for position, code in enumerate(calendar)}
    max_global_position = ordinal[max_global_codyear]

    dense_metrics = aux_df.assign(
        min_CodYear=aux_df['min_CodYear'].map(ordinal),
        max_CodYear=aux_df['max_CodYear'].map(ordinal),
    )
    dense_records = matriculados_df.assign(
        CodYear=matriculados_df['CodYear'].map(ordinal)
    )

    # Split the records by student once instead of re-filtering the whole
    # frame for every student (that was O(students x rows)).
    records_by_student = {
        student_id: records
        for student_id, records in dense_records.groupby('IDHASH')
    }

    # Row-wise classification (a Python-level apply, not a vectorised operation).
    def classify(row):
        return determine_desercion(
            row, records_by_student[row['IDHASH']], max_global_position
        )

    aux_df['Detalle'] = dense_metrics.apply(classify, axis=1)
    aux_df['Deserción'] = (aux_df['Detalle'] == "Desertó").astype('int64')

    # Drop labels from a previous run so the merge cannot create *_x / *_y columns.
    base = grouped.drop(columns=['Deserción', 'Detalle'], errors='ignore')
    result_df = base.merge(
        aux_df[['IDHASH', 'Deserción', 'Detalle']], on='IDHASH', how='left'
    )

    return result_df

In [161]:
# One summary row per student (IDHASH); the labelling cell merges its results onto this frame.
grouped = calculate_metrics(df=matriculados_df)

In [None]:
# Latest period code present in the enrollment data; used as the reference period.
max_global_codyear = matriculados_df['CodYear'].max()

# This cell overwrites `grouped` with its own output. Strip labels left by a
# previous run first, otherwise the merge inside apply_desercion_logic yields
# 'Deserción_x' / 'Deserción_y' columns and the next cell fails with KeyError.
grouped = apply_desercion_logic(
    grouped.drop(columns=['Deserción', 'Detalle'], errors='ignore'),
    matriculados_df,
    max_global_codyear,
)

In [None]:
# Distinct student identifiers, in order of first appearance in the enrollment data.
unique_ids = pd.unique(matriculados_df['IDHASH'])
# Single-column frame with one row per student.
personas_df = pd.DataFrame(unique_ids, columns=['IDHASH'])

In [None]:
# Left-join the dropout labels onto the one-row-per-student frame, keeping only the label columns.
result_df = pd.merge(
    personas_df,
    grouped.loc[:, ['IDHASH', 'Deserción', 'Detalle']],
    how='left',
    on='IDHASH',
)
print(result_df)

                                                  IDHASH  Deserción  \
0      0002DF41C79D5E34153DB578014A6EADFF766488667DC3...          0   
1      0003F425197CD03C452E4A0EFB8ED4652F6156127C910D...          0   
2      0005769B3CA7B05EB2305248FE46234C4EB3ECC0934686...          0   
3      000952F42F529EBAB64CED94AB833CBDFEBA505EBF9112...          0   
4      0009AA652BEE77D36595E9D0CCE52094EE0C8D94202EEF...          0   
...                                                  ...        ...   
26915  FFE70F512709FB36AB7ACAF22E47FC72D45B2A06839154...          0   
26916  FFEB63BEB596A5EC3C1FC6418AB245E5A1EC6FC8A820AD...          0   
26917  FFF7BE3CEFD03FDC483D10DB6EC7D001D8996501D73290...          1   
26918  FFFBF883BD33AADF02A659578A6FDBDE0C0C51E1641FE8...          0   
26919  FFFF03197AE62EF69D51D7736AF55619467B7FFF91382C...          0   

                   Detalle  
0      Registros completos  
1         Finalizó carrera  
2         Finalizó carrera  
3               Tomó pausa  
4 

In [165]:
# Export the three result tables into the same folder the inputs were read from.
# `ruta` (defined in the data-loading cell) is reused so the directory is written
# in one place only; the resulting paths are identical to the former literals.

# Per-student metrics plus dropout labels.
output_path_grouped = f'{ruta}/um_desercion.csv'
grouped.to_csv(output_path_grouped, index=False)

# One row per distinct student identifier.
output_path_personas = f'{ruta}/um_personas.csv'
personas_df.to_csv(output_path_personas, index=False)

# Every student with 'Deserción' and 'Detalle' (all students, not only those flagged as dropouts).
output_path_personas_desertadas = f'{ruta}/um_personas_desertadas.csv'
result_df.to_csv(output_path_personas_desertadas, index=False)

