In [1]:
import pandas as pd
from pathlib import Path

In [2]:
# Path to the experiments directory in the bucket (replace with your own bucket path)
bucket_dir = Path('/home/mili_irusta/buckets/b1/expw227/')


def cargar_predicciones(directorio, foto_mes_esperado=202109):
    """Merge the prediction tables of every ``SC*`` experiment folder.

    For each sub-folder of ``directorio`` whose name starts with ``SC``:
    the folder is skipped unless it contains ``z-Rend.txt`` and
    ``tb_future_prediccion.txt``; the prediction file is read as a
    tab-separated table; it is skipped unless every row has
    ``foto_mes == foto_mes_esperado``; columns starting with ``m_`` plus
    ``foto_mes`` and ``clase_ternaria`` are dropped; the remainder is
    outer-merged on ``numero_de_cliente`` into the accumulated table.

    Parameters
    ----------
    directorio : str or Path
        Directory that holds the ``SC*`` experiment folders.
    foto_mes_esperado : int, default 202109
        The only ``foto_mes`` value a prediction file may contain.

    Returns
    -------
    pandas.DataFrame
        The merged table, or an empty DataFrame when no folder qualified.

    Raises
    ------
    KeyError
        If a prediction file lacks ``foto_mes``, ``clase_ternaria`` or
        ``numero_de_cliente``.
    """
    combinado = None  # explicit "nothing merged yet" sentinel

    # sorted(): glob order depends on the filesystem, and the visiting order
    # decides which columns keep their name and which receive a suffix.
    for carpeta in sorted(Path(directorio).glob('SC*')):
        print(f"Revisando carpeta: {carpeta}")

        # Only folders that contain 'z-Rend.txt' are considered
        if (carpeta / 'z-Rend.txt').exists():
            print(f"Archivo 'z-Rend.txt' encontrado en {carpeta}")
        else:
            print(f"Archivo 'z-Rend.txt' no encontrado en {carpeta}")
            continue

        archivo = carpeta / 'tb_future_prediccion.txt'
        if not archivo.exists():
            print(f"El archivo de predicción no existe en {carpeta}.")
            continue

        print(f"Archivo de predicción encontrado: {archivo}")
        df = pd.read_csv(archivo, sep='\t')

        # Element-wise check reduced with .any(): comparing the array returned
        # by .unique() against a list raises ValueError as soon as the file
        # holds more than one distinct foto_mes.
        if (df['foto_mes'] != foto_mes_esperado).any():
            print(f"foto_mes no es {foto_mes_esperado} en {archivo}. Se omite.")
            continue

        # Drop the 'm_*' columns, then the bookkeeping columns
        df = df.loc[:, ~df.columns.str.startswith('m_')]
        df = df.drop(columns=['foto_mes', 'clase_ternaria'])

        if combinado is None:
            combinado = df
        else:
            # NOTE: the suffix is applied only to column names that collide
            # with ones already merged; a new, non-colliding name is kept as is.
            combinado = combinado.merge(
                df,
                on='numero_de_cliente',
                how='outer',
                suffixes=('', f'_{carpeta.name}'),
            )

    return pd.DataFrame() if combinado is None else combinado


# Check that the directory exists before scanning it
if not bucket_dir.exists():
    print(f"Error: El directorio {bucket_dir} no existe")
    datos = pd.DataFrame()
else:
    print(f"Directorio {bucket_dir} encontrado.")
    datos = cargar_predicciones(bucket_dir)

# Summary of the merged result
print(f"Datos combinados: {datos.shape[0]} filas y {datos.shape[1]} columnas")


Directorio /home/mili_irusta/buckets/b1/expw227 encontrado.
Revisando carpeta: /home/mili_irusta/buckets/b1/expw227/SC-0001
Archivo 'z-Rend.txt' encontrado en /home/mili_irusta/buckets/b1/expw227/SC-0001
Archivo de predicción encontrado: /home/mili_irusta/buckets/b1/expw227/SC-0001/tb_future_prediccion.txt
Revisando carpeta: /home/mili_irusta/buckets/b1/expw227/SC-0002
Archivo 'z-Rend.txt' encontrado en /home/mili_irusta/buckets/b1/expw227/SC-0002
Archivo de predicción encontrado: /home/mili_irusta/buckets/b1/expw227/SC-0002/tb_future_prediccion.txt
Revisando carpeta: /home/mili_irusta/buckets/b1/expw227/SC-0003
Archivo 'z-Rend.txt' encontrado en /home/mili_irusta/buckets/b1/expw227/SC-0003
Archivo de predicción encontrado: /home/mili_irusta/buckets/b1/expw227/SC-0003/tb_future_prediccion.txt
Revisando carpeta: /home/mili_irusta/buckets/b1/expw227/SC-0004
Archivo 'z-Rend.txt' encontrado en /home/mili_irusta/buckets/b1/expw227/SC-0004
Archivo de predicción encontrado: /home/mili_irusta/

In [3]:
# Display the merged predictions table built in the previous cell
datos

Unnamed: 0,numero_de_cliente,sem_1_1,sem_1_1_SC-0002,sem_1_1_SC-0003,sem_1_1_SC-0004,sem_1_1_SC-0005,sem_1_1_SC-0006,sem_1_1_SC-0007,sem_1_2,sem_1_1_SC-0008,sem_1_1_SC-0009,sem_1_1_SC-0010
0,249221109,0.029977,0.017487,0.003745,0.101317,0.119401,0.038811,0.045168,0.043686,0.024779,0.046791,0.014228
1,249221468,0.056352,0.043201,0.004990,0.104043,0.142221,0.064153,0.036906,0.045174,0.036236,0.027195,0.038718
2,249223005,0.001995,0.000953,0.000437,0.011491,0.006226,0.009228,0.003144,0.003193,0.001611,0.004006,0.001302
3,249232117,0.053960,0.031329,0.002938,0.085318,0.080635,0.082797,0.014575,0.018270,0.016386,0.024895,0.015859
4,249236712,0.008164,0.001641,0.000755,0.018912,0.008893,0.012029,0.004751,0.005801,0.002976,0.005962,0.002976
...,...,...,...,...,...,...,...,...,...,...,...,...
165639,1606470344,0.148696,0.105579,0.018126,0.305712,0.216855,0.206269,0.122536,0.115833,0.194347,0.103667,0.199798
165640,1606478730,0.437271,0.405282,0.094961,0.573687,0.508942,0.476556,0.232569,0.253240,0.390488,0.217234,0.383805
165641,1606600592,0.137866,0.111789,0.010479,0.193891,0.190968,0.123742,0.062816,0.070381,0.095733,0.057025,0.098635
165642,1606610942,0.360481,0.360087,0.061799,0.559556,0.631464,0.410828,0.266973,0.261791,0.356454,0.221946,0.374865


In [4]:
# Model columns taking part in the ensemble.
# NOTE: only the KEYS of this dict are used below; the numeric values are
# placeholders that never reach the final weights.
pesos = {
    'sem_1_1': 0.083,
    'sem_1_2': 0.083,
    'sem_1_1_SC-0008': 0.15,
    'sem_1_1_SC-0002': 0.083,
    'sem_1_1_SC-0003': 0.083,
    'sem_1_1_SC-0004': 0.083,
    'sem_1_1_SC-0005': 0.083,
    'sem_1_1_SC-0006': 0.083,
    'sem_1_1_SC-0007': 0.083,
    'sem_1_1_SC-0009': 0.083,
    'sem_1_1_SC-0010': 0.083,
}

# Column of the highlighted model, named once instead of repeating the literal
MODELO_DESTACADO = 'sem_1_1_SC-0008'

# Fixed weight for the highlighted model
peso_fijo = 0.4
if not 0 <= peso_fijo <= 1:
    raise ValueError(f"peso_fijo debe estar entre 0 y 1, se recibió {peso_fijo}")

# Remaining columns (everything except the highlighted model)
columnas_restantes = [col for col in pesos if col != MODELO_DESTACADO]

# Weight left over for the remaining columns, shared equally among them
peso_restante = 1 - peso_fijo
pesos_actualizados = {col: peso_restante / len(columnas_restantes) for col in columnas_restantes}

# The highlighted model receives the fixed weight
pesos_actualizados[MODELO_DESTACADO] = peso_fijo

# Fail early, listing every missing column at once
columnas_faltantes = [col for col in pesos_actualizados if col not in datos.columns]
if columnas_faltantes:
    raise KeyError(f"Columnas de modelos ausentes en datos: {columnas_faltantes}")

# The table comes from an outer merge, so a client absent from one model has
# NaN there and would silently get a NaN final probability: report it.
filas_incompletas = int(datos[list(pesos_actualizados)].isna().any(axis=1).sum())
if filas_incompletas:
    print(f"Advertencia: {filas_incompletas} filas tienen NaN en algún modelo; su probabilidad_final será NaN")

# Weighted final probability
datos['probabilidad_final'] = sum(
    datos[col] * peso for col, peso in pesos_actualizados.items()
)

print("Pesos utilizados:", pesos_actualizados)



Pesos utilizados: {'sem_1_1': 0.06, 'sem_1_2': 0.06, 'sem_1_1_SC-0002': 0.06, 'sem_1_1_SC-0003': 0.06, 'sem_1_1_SC-0004': 0.06, 'sem_1_1_SC-0005': 0.06, 'sem_1_1_SC-0006': 0.06, 'sem_1_1_SC-0007': 0.06, 'sem_1_1_SC-0009': 0.06, 'sem_1_1_SC-0010': 0.06, 'sem_1_1_SC-0008': 0.4}


In [5]:
# Disabled alternative: plain (unweighted) mean of the 'sem_' columns
#datos['probabilidad_final'] = datos.filter(like='sem_').mean(axis=1)

# Rank the rows from highest to lowest final probability
datos = datos.sort_values(by='probabilidad_final', ascending=False)

# Show the ranked table
datos


Unnamed: 0,numero_de_cliente,sem_1_1,sem_1_1_SC-0002,sem_1_1_SC-0003,sem_1_1_SC-0004,sem_1_1_SC-0005,sem_1_1_SC-0006,sem_1_1_SC-0007,sem_1_2,sem_1_1_SC-0008,sem_1_1_SC-0009,sem_1_1_SC-0010,probabilidad_final
29130,467414436,0.997537,0.999394,0.986818,0.997858,0.999344,0.991356,0.995687,0.994701,0.998577,0.993587,0.999137,0.996756
4630,271059994,0.998618,0.998569,0.977347,0.998229,0.998131,0.996661,0.995231,0.995849,0.998391,0.991114,0.998698,0.996263
116029,954734591,0.996559,0.999483,0.989158,0.998891,0.999312,0.993923,0.993763,0.989395,0.997320,0.993505,0.997756,0.996033
73175,686309464,0.995649,0.997791,0.984179,0.995696,0.999889,0.988035,0.993924,0.992128,0.998907,0.987855,0.999213,0.995624
119985,981514942,0.996047,0.997969,0.983692,0.997583,0.998719,0.988865,0.992955,0.993974,0.998508,0.986120,0.998225,0.995452
...,...,...,...,...,...,...,...,...,...,...,...,...,...
97797,844423582,0.000238,0.000023,0.000036,0.001871,0.000224,0.003661,0.000627,0.000510,0.000126,0.001206,0.000089,0.000559
103887,876424417,0.000287,0.000046,0.000022,0.001085,0.000257,0.003074,0.000640,0.000683,0.000160,0.001799,0.000121,0.000545
37830,507589735,0.000227,0.000053,0.000084,0.001795,0.000184,0.003269,0.000400,0.000486,0.000181,0.001141,0.000113,0.000538
47242,572955064,0.000443,0.000163,0.000037,0.001908,0.000374,0.003528,0.000377,0.000308,0.000045,0.001127,0.000042,0.000517


In [6]:
# Number of top-ranked rows flagged as positive
N_ENVIOS = 10500

# Flag the first N_ENVIOS rows with 1 and every other row with 0.
# This relies on `datos` already being in the desired ranking order.
datos['Predicted'] = 0
# Positional assignment: label-based selection through datos.index[:n] would
# flag extra rows if the index ever contained duplicated labels.
datos.iloc[:N_ENVIOS, datos.columns.get_loc('Predicted')] = 1

# Keep only 'numero_de_cliente' and 'Predicted'; .copy() detaches the result
# from `datos` so later edits to one do not affect the other.
resultado = datos[['numero_de_cliente', 'Predicted']].copy()

In [7]:
# Display the two-column table ('numero_de_cliente', 'Predicted')
resultado

Unnamed: 0,numero_de_cliente,Predicted
29130,467414436,1
4630,271059994,1
116029,954734591,1
73175,686309464,1
119985,981514942,1
...,...,...
97797,844423582,0
103887,876424417,0
37830,507589735,0
47242,572955064,0


In [20]:
# Count how many rows carry each Predicted value.
# NOTE(review): this cell's execution count is out of sequence with its
# neighbours; re-run the notebook top to bottom before sharing.
resultado['Predicted'].value_counts()

Predicted
0    155144
1     10500
Name: count, dtype: int64

In [8]:
# Destination folder for the ensemble output
outputh_dir = Path('/home/mili_irusta/buckets/b1/ensamble/')

# Create the folder if it does not exist; exist_ok=True makes this a no-op
# when it is already there, and without it to_csv fails on a missing folder.
outputh_dir.mkdir(parents=True, exist_ok=True)

# Build the output path once so the write and the message cannot diverge
archivo_salida = outputh_dir / 'KA_promedio_modelos_10500_prompond.csv'

# Write the result as a comma-separated file without the index column
resultado.to_csv(archivo_salida, sep=',', index=False)

print("Archivo guardado correctamente en:", archivo_salida)

Archivo guardado correctamente en: /home/mili_irusta/buckets/b1/ensamble/KA_promedio_modelos_10500_prompond.csv
