# Installs and imports

In [None]:
!pip install shap catboost

In [None]:
import os
import pickle
import random
from io import StringIO

import boto3
import catboost
import numpy as np
import pandas as pd
import s3fs
import shap
from scipy.spatial.distance import euclidean

# Functions

## General

In [None]:
def get_path_to_read_and_date(
    read_last_date: bool,
    bucket: str,
    key: str,
    partition_date: str,
    n_partition: int = 1,
):
    """Get path to read (given or last) and the chosen date.

    :param read_last_date: Boolean to read last valid date (True) or given date (False).
    :param bucket: S3 bucket.
    :param key: S3 key.
    :param partition_date: String with the execution date (could be separated by '=' sign),
        e.g. ``"20240115"``, ``"2024-01-15"`` or ``"insert_date_ci=2024-01-15"``.
    :param n_partition: 1 means select the last available partition, 2 the following one, and so on.
    :return: Tuple[
            Path with data,
            year of the read data,
            month of the read data,
            day of the read data
        ]
    """
    if read_last_date:
        # Numeric YYYYMMDD form of the requested date, used as the upper bound of the search.
        exec_date = int(partition_date.split("=")[-1].replace("-", ""))
        # Everything before the date (e.g. 'insert_date_ci=') is the partition prefix.
        # NOTE(review): `regex_split` is not defined or imported in the visible part of
        # this file; it presumably aliases `re.split` -- confirm it is in scope.
        date_preffix = (
            regex_split(r"[0-9]{4}-?[0-9]{2}-?[0-9]{2}", partition_date)[0]
            if "=" in partition_date
            else None
        )

        path = get_last_s3_partition(
            s3_dir=f"{bucket}/{key}/",
            execution_date=exec_date,
            preffix=date_preffix,
            n_partition=n_partition,
        )
        # The date is read back from the partition that was actually found.
        date = path.split("/")[-1].split("=")[-1].replace("-", "")
        year, month, day = date[:4], date[4:6], date[6:]
        path = f"s3://{path}"
    else:
        path = f"s3://{bucket}/{key}/{partition_date}"
        date = partition_date.split("=")[-1]
        # Look for hyphens in the date itself, not in the whole partition string:
        # a hyphenated prefix such as 'insert-date=20240115' carries a compact date.
        if "-" in date:
            date_parts = date.split("-")
            year, month, day = date_parts[0], date_parts[1], date_parts[2]
        else:
            year, month, day = date[:4], date[4:6], date[6:]
    return path, year, month, day

def get_last_s3_partition(
    s3_dir: str,
    execution_date: int,
    preffix: str,
    n_partition: int = 1,
) -> str:
    """Get the last partition of a given path, up to a specified execution date.

    Lists the immediate sub-prefixes ("folders") under ``s3_dir``, keeps those whose
    date is less than or equal to ``execution_date`` and returns the n-th most recent.

    :param s3_dir: S3 path data ending with '/' (``"<bucket>/<key>/"``, without ``s3://``).
    :param execution_date: Execution date (as a YYYYMMDD integer) to limit the search perimeter.
    :param preffix: Preffix of the s3 key for the date partition. (Could be 'insert_date_ci=').
        ``None`` means the partition folders are named with the bare date.
    :param n_partition: 1 means select the last available partition, 2 the following one, and so on.
    :return: Complete path of the partition to read. When no partition qualifies, the
        path ends in ``_notfoundpreviousdate`` instead of a date.
    """
    # A single space stands in for "no prefix": it never occurs in the partition names,
    # so split() leaves them untouched, and the final strip() removes it again.
    preffix = " " if preffix is None else preffix
    print(preffix)

    s3_client = boto3.client('s3')
    s3_bucket = s3_dir.split("/", 1)[0]
    s3_prefix = s3_dir.split("/", 1)[-1]
    print(f's3_bucket: {s3_bucket}')
    print(f's3_prefix: {s3_prefix}')

    # Paginate: a single list_objects_v2 call returns at most 1000 entries, and the
    # "CommonPrefixes" key is absent from a page that has no sub-prefixes.
    s3_contents = []
    paginator = s3_client.get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=s3_bucket, Prefix=s3_prefix, Delimiter="/"):
        s3_contents.extend(page.get("CommonPrefixes", []))
    print(f's3_contents: {s3_contents}')

    # (numeric date, date label as written in S3) for every partition up to execution_date.
    candidates = []
    for content in s3_contents:
        partition_name = content["Prefix"].strip("/").split("/")[-1]
        date_label = partition_name.split(preffix)[-1]
        # Strip hyphens only after removing the prefix, so a prefix that itself
        # contains '-' is still recognised.
        date_number = int(date_label.replace("-", ""))
        if date_number <= execution_date:
            candidates.append((date_number, date_label))
    candidates.sort(key=lambda pair: pair[0])

    try:
        chosen_label = candidates[-n_partition][-1]
    except IndexError:
        # Fewer than n_partition partitions qualify.
        chosen_label = "_notfoundpreviousdate"
    return os.path.join(s3_dir, f"{preffix}{chosen_label}".strip())

## Aggregation

In [None]:
def calculate_nps(promoters, detractors, total_responses):
    """Compute the Net Promoter Score (NPS) as a percentage.

    The score is ``(promoters - detractors) / total_responses * 100``.
    Returns NaN when ``total_responses`` is zero.
    """
    if total_responses != 0:
        net_share = (promoters - detractors) / total_responses
        return net_share * 100
    return np.nan

def calculate_weighted_nps(group_df):
    """Compute the weighted NPS (as a percentage) for a group of rows.

    Rows with ``nps_100 > 8`` count as promoters and rows with ``nps_100 <= 6``
    as detractors; each row contributes its ``monthly_weight``.
    Returns NaN when the total weight is zero.
    """
    scores = group_df['nps_100']
    weights = group_df['monthly_weight']

    weight_total = weights.sum()
    weight_promoters = weights[scores > 8].sum()
    weight_detractors = weights[scores <= 6].sum()

    if total_is_zero := (weight_total == 0):
        return np.nan
    return (weight_promoters - weight_detractors) / weight_total * 100

def calculate_satisfaction(df, variable):
    """Compute the satisfaction rate (percentage of scores >= 8) for ``variable``.

    When ``df`` has a ``monthly_weight`` column that is not entirely null on the
    rows where ``variable`` is answered, the rate is weight-based; otherwise it
    is a plain count ratio. Returns NaN when the denominator is zero.
    """
    answered = df[variable].notnull()
    satisfied = df[variable] >= 8

    # Only use weights if the column exists and has at least one value on answered rows.
    use_weights = (
        'monthly_weight' in df.columns
        and not df.loc[answered, 'monthly_weight'].isnull().all()
    )

    if use_weights:
        # Weight of satisfied respondents over the weight of all respondents who answered.
        numerator = df.loc[satisfied, 'monthly_weight'].sum()
        denominator = df.loc[answered, 'monthly_weight'].sum()
    else:
        # Number of satisfied answers over the number of valid answers.
        numerator = int(satisfied.sum())
        denominator = answered.sum()

    if denominator == 0:
        return np.nan
    return (numerator / denominator) * 100




def calculate_otp(df, n):
    """Compute On-Time Performance (OTP) from column ``otp{n}_takeoff``.

    Returns the percentage of non-null values that are equal to 0, or 0 when
    the column has no non-null values.

    NOTE(review): the previous docstring described this as the share of values
    equal to 1, but the code counts values equal to 0 -- confirm which value
    marks an on-time flight.
    """
    takeoff_flags = df[f'otp{n}_takeoff']
    valid_count = takeoff_flags.notnull().sum()
    if valid_count > 0:
        zero_count = (takeoff_flags == 0).sum()
        return (zero_count / valid_count) * 100
    return 0


def calculate_load_factor(df, pax_column, capacity_column):
    """Compute the load factor (percentage) from a passenger and a capacity column.

    The result is ``sum(pax) / sum(capacity) * 100``; 0 is returned when the
    summed capacity is not positive, which avoids dividing by zero.
    """
    capacity_sum = df[capacity_column].sum()
    pax_sum = df[pax_column].sum()
    if not capacity_sum > 0:
        return 0
    return (pax_sum / capacity_sum) * 100


def calculate_mean(df, variable):
    """Return the mean of column ``variable`` of ``df``."""
    column = df[variable]
    return column.mean()
    
def calculate_metrics_summary(df, start_date, end_date, touchpoints):
    """Summarise metrics per (cabin, haul) group for flights within a date range.

    Args:
        df (pd.DataFrame): Input data. Must contain 'date_flight_local',
            'cabin_in_surveyed_flight', 'haul', 'ticket_price', 'load_factor',
            'out_prob_nps', 'uncertainty_nps' and, for every touchpoint ``tp``,
            the columns ``tp`` and ``f'{tp}_nps'``.
            NOTE: 'date_flight_local' is converted to datetime in place, so the
            caller's DataFrame is modified.
        start_date: First date to include (inclusive); anything ``pd.to_datetime`` accepts.
        end_date: Last date to include (inclusive).
        touchpoints (list): Touchpoint column names to summarise.

    Returns:
        pd.DataFrame: One row per (cabin, haul) group with the mean ticket price,
        the means of load factor / predicted NPS / NPS uncertainty multiplied by
        100, and per touchpoint the satisfaction rate and the mean of its
        ``_nps`` column multiplied by 100.
    """
    df['date_flight_local'] = pd.to_datetime(df['date_flight_local'])

    # Keep only the flights inside [start_date, end_date].
    in_range = (
        (df['date_flight_local'] >= pd.to_datetime(start_date))
        & (df['date_flight_local'] <= pd.to_datetime(end_date))
    )
    df_filtered = df[in_range]

    results_list = []

    for (cabin, haul), group_df in df_filtered.groupby(['cabin_in_surveyed_flight', 'haul']):

        print(f'CABIN/HAUL: {cabin}/{haul}')
        result = {
            'start_date': start_date,
            'end_date': end_date,
            'cabin_in_surveyed_flight': cabin,
            'haul': haul,
            'ticket_price': group_df['ticket_price'].mean(),
            'load_factor': group_df['load_factor'].mean()*100,
            'out_prob_nps': group_df['out_prob_nps'].mean()*100,
            'uncertainty_nps': group_df['uncertainty_nps'].mean()*100,
        }

        # Satisfaction for each touchpoint.
        for tp in touchpoints:
            result[f'{tp}_satisfaction'] = calculate_satisfaction(group_df, tp)
            result[f'{tp}_satisfaction_nps'] = group_df[f'{tp}_nps'].mean()*100

        results_list.append(result)

    return pd.DataFrame(results_list)



## Predict and explain

In [None]:
def inv_logit(x):
    """Apply the logistic (inverse-logit) function ``1 / (1 + exp(-x))`` to ``x``."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

def calculate_SHAP_and_probability_binary(model_promoter, model_detractor, df):
    """Predict with the promoter and detractor models and attach their SHAP values.

    Args:
        model_promoter: Fitted model exposing ``predict`` and ``predict_proba``;
            it is also passed to ``shap.TreeExplainer``.
        model_detractor: Same, for the detractor target.
        df (pd.DataFrame): Input rows. Must contain every pass-through column
            listed below (selecting them raises KeyError otherwise); all
            remaining columns are fed to the models as features.

    Returns:
        pd.DataFrame: Indexed like ``df``, containing the pass-through columns,
        the feature columns, ``prediction_prom`` / ``out_prob_prom`` /
        ``prediction_det`` / ``out_prob_det``, one ``<feature>_prom`` and
        ``<feature>_det`` SHAP column per feature, ``base_value_*`` (the
        explainer's expected value) and ``out_value_*`` (row sum of the SHAP
        columns plus the base value).
    """
    # Extract ID, dates and other non-feature columns, keeping the index
    varibales_to_pass_through = ['respondent_id', 'cabin_in_surveyed_flight', 'haul', 'date_flight_local','otp15_takeoff','promoter_binary', 'detractor_binary', 'monthly_weight','cluster']
    id_df = df[varibales_to_pass_through]
    
    # Build the feature set used for prediction, excluding the pass-through columns
    test_set = df.drop(varibales_to_pass_through, axis=1, errors='ignore')
    
    # Predictions and probabilities for promoters
    # (the label columns are already among the pass-through columns, so this drop is a no-op)
    promoter_test_set = test_set.drop(['promoter_binary'], axis=1, errors='ignore')
    predictions_promoter = pd.DataFrame(model_promoter.predict(promoter_test_set), index=promoter_test_set.index, columns=["prediction_prom"])
    proba_promoter = pd.DataFrame(model_promoter.predict_proba(promoter_test_set)[:, 1], index=promoter_test_set.index, columns=["out_prob_prom"])
    
    # Predictions and probabilities for detractors
    detractor_test_set = test_set.drop(['detractor_binary'], axis=1, errors='ignore')
    predictions_detractor = pd.DataFrame(model_detractor.predict(detractor_test_set), index=detractor_test_set.index, columns=["prediction_det"])
    proba_detractor = pd.DataFrame(model_detractor.predict_proba(detractor_test_set)[:, 1], index=detractor_test_set.index, columns=["out_prob_det"])
    
    # Combine the prediction results, keeping the original index
    prediction = pd.concat([id_df, test_set, predictions_promoter, proba_promoter, predictions_detractor, proba_detractor], axis=1)
    
    # SHAP values and explainer for the promoter model
    # NOTE(review): assumes shap_values() returns a 2-D (rows x features) array -- confirm for the model type used
    shap_Explainer_promoter = shap.TreeExplainer(model_promoter)
    shap_values_promoter = shap_Explainer_promoter.shap_values(promoter_test_set)
    feature_names = [i for i in promoter_test_set.columns]
    shap_values_prom = pd.DataFrame(shap_values_promoter, index=promoter_test_set.index, columns=[f"{i}_prom" for i in feature_names])
    shap_values_prom["base_value_prom"] = shap_Explainer_promoter.expected_value
    # The sum runs after base_value_prom was added, so it includes the base value
    shap_values_prom["out_value_prom"] = shap_values_prom.sum(axis=1)
    
    # SHAP values and explainer for the detractor model
    # (column names reuse feature_names taken from the promoter feature set)
    shap_Explainer_detractor = shap.TreeExplainer(model_detractor)
    shap_values_detractor = shap_Explainer_detractor.shap_values(detractor_test_set)
    shap_values_det = pd.DataFrame(shap_values_detractor, index=detractor_test_set.index, columns=[f"{i}_det" for i in feature_names])
    shap_values_det["base_value_det"] = shap_Explainer_detractor.expected_value
    # As above: the row sum includes base_value_det
    shap_values_det["out_value_det"] = shap_values_det.sum(axis=1)
    
    # Combine SHAP values with the predictions, keeping the original index
    output_df = pd.concat([prediction, shap_values_prom, shap_values_det], axis=1)
    
    # Return the output dataframe
    return output_df


def from_shap_to_probability_binary(df, features_dummy, label_binary):
    """Map SHAP columns of one model through the logistic function.

    NOTE(review): a second function with this exact name is defined further
    down in this file; when the file is run top to bottom that later definition
    replaces this one, so this version is effectively dead code.

    Every column of ``df`` whose name ends with the class suffix ('_prom' when
    ``label_binary == 'promoter_binary'``, '_det' otherwise) is passed through
    ``inv_logit``, and a ``base_prob<suffix>`` column is derived from
    ``base_value<suffix>``. The result is restricted to the ID/context
    columns, those suffix columns, the label columns, ``base_prob<suffix>`` and
    ``features_dummy``.
    """
    output_df = df.copy()
    
    # Pick the suffix according to the model type (promoter or detractor)
    class_suffix = '_prom' if label_binary == 'promoter_binary' else '_det'
    
    # Identify the SHAP columns of the class of interest, assuming they already carry the right suffix
    # (this matches ANY column ending with the suffix, not only per-feature SHAP columns)
    shap_columns = [col for col in df.columns if col.endswith(class_suffix)]
    base_value_col = f'base_value{class_suffix}'
    
    # Convert the base value to a probability under a new column name
    output_df[f'base_prob{class_suffix}'] = inv_logit(output_df[base_value_col])
    
    # Convert SHAP values to probabilities without renaming the columns
    for col in shap_columns:
        output_df[col] = inv_logit(output_df[col])
    
    # Keep only the relevant columns in the final DataFrame
    relevant_columns = ['respondent_id', 'cabin_in_surveyed_flight', 'haul', 'date_flight_local','otp15_takeoff'] + shap_columns + ['promoter_binary', 'detractor_binary', 'monthly_weight','cluster'] + [f'base_prob{class_suffix}'] + features_dummy
    output_df = output_df[relevant_columns]
    print(output_df)
    return output_df

def adjust_shap_values_binary(shap_values, base_prob, out_prob):
    """Rescale SHAP values so that they sum to ``out_prob - base_prob``.

    Every value is multiplied by ``(out_prob - base_prob) / sum(shap_values)``.
    When the SHAP values sum to zero the factor is 0, so all values become 0.
    """
    shap_sum = np.sum(shap_values)
    if shap_sum != 0:
        # Gap to cover between base and output probability, spread proportionally.
        scale = (out_prob - base_prob) / shap_sum
    else:
        scale = 0
    return shap_values * scale

def from_shap_to_probability_binary(df, features_dummy, label_binary):
    """Rescale one model's SHAP values into probability space.

    For each row, the SHAP columns of the selected model are multiplied by a
    common factor so that they sum to ``out_prob - base_prob``, where
    ``base_prob`` is ``inv_logit`` of the SHAP base value. Rows whose SHAP
    values sum to zero get a factor of 0. This is the vectorised equivalent of
    applying ``adjust_shap_values_binary`` row by row; unlike a label-based
    per-row assignment it also works when the index has duplicate labels.

    Args:
        df (pd.DataFrame): Output of the SHAP/prediction step. Must contain
            ``base_value<suffix>``, ``out_prob<suffix>``, the ID/context
            columns listed in ``relevant_columns`` and ``features_dummy``.
        features_dummy (list): Feature names; ``<feature><suffix>`` columns
            present in ``df`` are treated as the SHAP columns.
        label_binary (str): 'promoter_binary' selects the '_prom' suffix,
            anything else selects '_det'.

    Returns:
        pd.DataFrame: ID/context columns, adjusted SHAP columns, label columns,
        ``base_prob<suffix>``, ``out_prob<suffix>`` and the feature columns.
    """
    output_df = df.copy()

    # Pick the suffix according to the model type (promoter or detractor).
    class_suffix = '_prom' if label_binary == 'promoter_binary' else '_det'

    # SHAP columns of the class of interest, assuming they already carry the right suffix.
    shap_columns = [f'{feature}{class_suffix}' for feature in features_dummy if f'{feature}{class_suffix}' in df.columns]
    base_value_col = f'base_value{class_suffix}'
    base_prob_col = f'base_prob{class_suffix}'
    out_prob_col = f'out_prob{class_suffix}'

    # Base probability: logistic transform of the SHAP base value.
    output_df[base_prob_col] = inv_logit(output_df[base_value_col])

    if shap_columns:
        shap_matrix = output_df[shap_columns].to_numpy(dtype=float)
        total_shap = shap_matrix.sum(axis=1)
        total_distance = (
            output_df[out_prob_col].to_numpy(dtype=float)
            - output_df[base_prob_col].to_numpy(dtype=float)
        )
        # factor = distance / total where total != 0, and 0 elsewhere.
        adjustment_factor = np.zeros_like(total_shap)
        np.divide(total_distance, total_shap, out=adjustment_factor, where=total_shap != 0)
        output_df[shap_columns] = shap_matrix * adjustment_factor[:, None]

    # Keep only the relevant columns in the final DataFrame.
    relevant_columns = ['respondent_id', 'cabin_in_surveyed_flight', 'haul', 'date_flight_local','otp15_takeoff'] + shap_columns + ['promoter_binary', 'detractor_binary', 'monthly_weight','cluster'] + [base_prob_col, out_prob_col] + features_dummy
    print(output_df)
    output_df = output_df[relevant_columns]

    return output_df

# Function to create virtual ensembles
def create_virtual_ensemble(model, total_trees, K):
    """Build a virtual ensemble of truncated copies of ``model``.

    Snapshots start at half of ``total_trees`` and are taken every ``K`` trees;
    each one is a copy of ``model`` shrunk to trees ``[0, i)``.

    Returns:
        list: The truncated model copies, in increasing tree count.
    """
    def _truncated_copy(tree_end):
        snapshot = model.copy()
        snapshot.shrink(ntree_start=0, ntree_end=tree_end)
        return snapshot

    first_cut = total_trees // 2
    return [_truncated_copy(cut) for cut in range(first_cut, total_trees, K)]


# Function to predict with uncertainty
def predict_with_uncertainty(ensemble_models, X):
    """Predict with every ensemble member and summarise the spread.

    Returns:
        tuple: (mean, standard deviation) across members of the positive-class
        probability (column 1 of ``predict_proba``), one value per row of ``X``.
    """
    positive_probas = []
    for member in ensemble_models:
        positive_probas.append(member.predict_proba(X)[:, 1])
    stacked = np.array(positive_probas)

    return np.mean(stacked, axis=0), np.std(stacked, axis=0)


def predict_and_explain(model_prom, model_det, df, features_dummy, K_uncertainty):
    """
    Run predictions, SHAP explanations and uncertainty estimates for the
    promoter and detractor models over the whole dataframe.

    Args:
    - model_prom: Trained model predicting promoters (must expose ``tree_count_``).
    - model_det: Trained model predicting detractors (must expose ``tree_count_``).
    - df: DataFrame with the data.
    - features_dummy: List of features used for the predictions.
    - K_uncertainty: Step, in trees, between virtual-ensemble snapshots.

    Returns:
    - DataFrame (index reset) with the promoter and detractor columns side by
      side plus a ``<name>_nps`` column for every ``<name>_prom`` / ``<name>_det``
      pair: their sum for 'uncertainty', their difference (prom - det) otherwise.
    """
    # 1. SHAP values and probabilities for both models.
    contributions = calculate_SHAP_and_probability_binary(model_prom, model_det, df)

    # 2. SHAP values rescaled to probability space, one frame per model.
    prom_frame = from_shap_to_probability_binary(contributions, features_dummy, 'promoter_binary')
    det_frame = from_shap_to_probability_binary(contributions, features_dummy, 'detractor_binary')

    # 3. Uncertainty for both models through virtual ensembles.
    ensemble_prom = create_virtual_ensemble(model_prom, model_prom.tree_count_, K_uncertainty)
    ensemble_det = create_virtual_ensemble(model_det, model_det.tree_count_, K_uncertainty)

    prom_mean, prom_std = predict_with_uncertainty(ensemble_prom, prom_frame[features_dummy])
    det_mean, det_std = predict_with_uncertainty(ensemble_det, det_frame[features_dummy])

    prom_frame['mean_proba_prom'] = prom_mean
    prom_frame['uncertainty_prom'] = prom_std
    det_frame['mean_proba_det'] = det_mean
    det_frame['uncertainty_det'] = det_std

    # 4. Put both models side by side, keeping shared columns only once.
    prom_frame = prom_frame.reset_index(drop=True)
    det_frame = det_frame.reset_index(drop=True)
    det_only_columns = [col for col in det_frame.columns if col not in prom_frame.columns]
    combined = pd.concat([prom_frame, det_frame[det_only_columns]], axis=1)

    # 5. NPS columns from every matching _prom / _det pair.
    for column in list(combined.columns):
        if '_prom' not in column:
            continue
        base_name = column.split('_prom')[0]
        det_column = f'{base_name}_det'
        if det_column not in combined.columns:
            continue
        if base_name == 'uncertainty':
            # Uncertainties of both models are added.
            nps_values = combined[column] + combined[det_column]
        else:
            nps_values = combined[column] - combined[det_column]
        combined[f'{base_name}_nps'] = nps_values

    return combined

## Clustering

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

def elbow_study(df, variables, max_k=15):
    """
    Run an elbow-method study to help choose the number of clusters.

    The selected columns are median-imputed and standardised, KMeans is fitted
    for every k from 1 to max_k, and the inertia curve is plotted.

    Args:
        df (pd.DataFrame): DataFrame with the input data.
        variables (list): Columns to use for clustering.
        max_k (int): Largest number of clusters to evaluate. Default is 15.

    Returns:
        list: Inertia for each number of clusters from 1 to max_k.
    """
    # Preprocessing: median imputation followed by standardisation.
    features_imputed = SimpleImputer(strategy="median").fit_transform(df[variables])
    features_scaled = StandardScaler().fit_transform(features_imputed)

    # One KMeans fit per candidate k, collecting the inertia of each.
    candidate_ks = range(1, max_k + 1)
    inertia = [
        KMeans(n_clusters=k, random_state=42).fit(features_scaled).inertia_
        for k in candidate_ks
    ]

    # Plot the elbow curve.
    plt.figure(figsize=(8, 5))
    plt.plot(candidate_ks, inertia, marker='o')
    plt.title('Elbow Method for Optimal k')
    plt.xlabel('Number of Clusters (k)')
    plt.ylabel('Inertia')
    plt.show()

    return inertia

def train_kmeans(df, variables, n_clusters=3):
    """
    Fit KMeans on the given DataFrame and label every row with its cluster.

    The selected columns are median-imputed and standardised before fitting.
    The 'cluster' column is written into ``df`` itself (the input is modified),
    and the fitted imputer, scaler and model are not returned.

    Args:
        df (pd.DataFrame): DataFrame with the input data.
        variables (list): Columns to use for clustering.
        n_clusters (int): Number of clusters for KMeans.

    Returns:
        pd.DataFrame: The same DataFrame with an extra 'cluster' column.
    """
    # Preprocessing: median imputation followed by standardisation.
    features_imputed = SimpleImputer(strategy="median").fit_transform(df[variables])
    features_scaled = StandardScaler().fit_transform(features_imputed)

    # Fit KMeans with the requested number of clusters and store the labels.
    model = KMeans(n_clusters=n_clusters, random_state=42)
    labels = model.fit_predict(features_scaled)
    df['cluster'] = labels

    return df

def classify_into_clusters(df, touchpoints, scaler, kmeans):
    """
    Apply an already-trained K-means model to a dataframe and assign cluster labels.

    Args:
    df (pd.DataFrame): DataFrame with the data to segment.
    touchpoints (list): List of touchpoint columns fed to the model.
    scaler (StandardScaler): Already-fitted scaler (only ``transform`` is called).
    kmeans (KMeans): Already-fitted KMeans (only ``predict`` is called).

    Returns:
    pd.DataFrame: Copy of ``df`` with a 'cluster' column; ``df`` itself is left untouched.
    """
    # Work on a copy so the caller's dataframe is never modified.
    df_copy = df.copy()

    # Replace NaNs with -1 on a fresh frame. A non-inplace fillna avoids the
    # SettingWithCopyWarning that an inplace fill on a column subset triggers.
    # NOTE(review): train_kmeans in this file imputes with the median instead of -1;
    # confirm that the scaler/kmeans passed here were fitted on -1-filled data.
    filtered_df = df_copy[touchpoints].fillna(-1)

    # Scale with the same scaler that was fitted at training time.
    X_new_scaled = scaler.transform(filtered_df)

    # Predict the cluster of every row and attach it to the copy.
    df_copy['cluster'] = kmeans.predict(X_new_scaled)

    return df_copy

## Simulation

In [None]:
def calculate_total_distance(df, targets, touchpoints):
    """
    Sum, over touchpoints, the distance between current and target satisfaction.

    For each touchpoint the current satisfaction rate of ``df`` is compared with
    the mean of the ``<touchpoint>_satisfaction`` column of ``targets``;
    touchpoints whose current satisfaction is NaN are skipped. No weighting is
    applied.

    Args:
    df (pd.DataFrame): DataFrame with the customer scores.
    targets (pd.DataFrame): DataFrame holding the satisfaction targets.
    touchpoints (list): Touchpoints to consider.

    Returns:
    float: Total distance.
    """
    # Current satisfaction of every touchpoint, computed up front.
    current_values = [calculate_satisfaction(df, tp) for tp in touchpoints]

    distance_sum = 0.0
    for tp, current_value in zip(touchpoints, current_values):
        target_value = targets[f'{tp}_satisfaction'].mean()
        if np.isnan(current_value):
            continue
        distance_sum += euclidean([target_value], [current_value])

    return distance_sum

import time

def soft_manual_sim_causal(df, targets, touchpoints, df_original, threshold=0.05):
    """
    Adjusts customer scores in the DataFrame to meet satisfaction targets.
    Ensures final satisfactions are within desired intervals based on the necessary change.

    For ``<touchpoint>_satisfaction`` targets, randomly chosen rows are moved one
    at a time to a score of 8 (to raise satisfaction, taking scores 7 down to 1
    in that order) or to 7 (to lower it, taking scores 8, 9, 10 in that order).
    For any other target column, the whole column is shifted in steps of at most
    ``threshold`` until its mean enters the desired interval. Rows are picked
    with ``DataFrame.sample`` without a fixed seed, so results are not
    reproducible run to run. The whole procedure stops after 10 minutes.

    Args:
        df (pd.DataFrame): Adjusted DataFrame after previous modifications.
        targets (pd.DataFrame): DataFrame with satisfaction targets (first row is used).
        touchpoints (list): List of touchpoints to adjust.
        df_original (pd.DataFrame): Original DataFrame before any adjustments.
        threshold (float): Threshold to consider when adjustments have reached the target.
    Returns:
        pd.DataFrame: Adjusted DataFrame.
    """
    # Single time budget shared by every check below. The outer check previously
    # used 300 s while its comment and message (and all inner checks) said 10 minutes.
    time_limit_seconds = 600

    df_adjusted = df.copy()
    adjusted_clients_count = 0
    sum_differences = 0  # To store the sum of differences

    # Track the start time
    start_time = time.time()

    # Calculate original satisfactions from the original DataFrame
    original_satisfactions = {}
    for variable in targets.columns:
        target = targets[variable].values[0]
        if variable.endswith('_satisfaction'):
            touchpoint = variable.replace('_satisfaction', '')
            if touchpoint in touchpoints:
                original_satisfaction = calculate_satisfaction(df_original, touchpoint)
                original_satisfactions[variable] = original_satisfaction
                sum_differences += original_satisfaction - target

    for variable in targets.columns:
        # Break if the execution time exceeds 10 minutes
        if time.time() - start_time > time_limit_seconds:
            print("Execution stopped: exceeded time limit of 10 minutes.")
            break

        target = targets[variable].values[0]
        print(variable, target)
        if variable.endswith('_satisfaction'):
            touchpoint = variable.replace('_satisfaction', '')
            if touchpoint in touchpoints:
                current_satisfaction = calculate_satisfaction(df_adjusted, touchpoint)
                original_satisfaction = original_satisfactions[variable]

                # Determine desired interval based on whether satisfaction needed to increase or decrease
                if original_satisfaction < target:
                    # Satisfaction needed to increase
                    lower_bound = target
                    upper_bound = target + threshold
                elif original_satisfaction > target:
                    # Satisfaction needed to decrease
                    lower_bound = target - threshold
                    upper_bound = target
                else:
                    # Satisfaction was equal to target: the sign of the overall
                    # difference decides on which side the interval is placed
                    if sum_differences > 0:
                        # Adjust upwards
                        lower_bound = original_satisfaction
                        upper_bound = original_satisfaction + threshold
                    else:
                        # Adjust downwards
                        lower_bound = original_satisfaction - threshold
                        upper_bound = original_satisfaction

                # Adjust satisfaction upwards
                if current_satisfaction < lower_bound:
                    # Promote the scores closest to the satisfied threshold first
                    for value in [7, 6, 5, 4, 3, 2, 1]:
                        while current_satisfaction < lower_bound:
                            # Break if the execution time exceeds 10 minutes
                            if time.time() - start_time > time_limit_seconds:
                                print("Execution stopped: exceeded time limit of 10 minutes.")
                                break

                            to_adjust = df_adjusted[df_adjusted[touchpoint] == value]
                            if to_adjust.empty:
                                break
                            to_adjust_sample = to_adjust.sample(n=1)
                            df_adjusted.loc[to_adjust_sample.index, touchpoint] = 8
                            adjusted_clients_count += 1
                            current_satisfaction = calculate_satisfaction(df_adjusted, touchpoint)
                            if current_satisfaction >= lower_bound:
                                break
                        if current_satisfaction >= lower_bound:
                            break

                # Adjust satisfaction downwards
                elif current_satisfaction > upper_bound:
                    # Demote the scores closest to the satisfied threshold first
                    for value in [8, 9, 10]:
                        while current_satisfaction > upper_bound:
                            # Break if the execution time exceeds 10 minutes
                            if time.time() - start_time > time_limit_seconds:
                                print("Execution stopped: exceeded time limit of 10 minutes.")
                                break

                            to_adjust = df_adjusted[df_adjusted[touchpoint] == value]
                            if to_adjust.empty:
                                break
                            to_adjust_sample = to_adjust.sample(n=1)
                            df_adjusted.loc[to_adjust_sample.index, touchpoint] = 7
                            adjusted_clients_count += 1
                            current_satisfaction = calculate_satisfaction(df_adjusted, touchpoint)
                            if current_satisfaction <= upper_bound:
                                break
                        if current_satisfaction <= upper_bound:
                            break

                else:
                    print(f"Current satisfaction for {touchpoint} is within the desired interval.")

            else:
                print(f"Touchpoint {touchpoint} not in touchpoints list.")

        else:
            # For non-satisfaction variables (e.g., mean scores)
            current_mean = calculate_mean(df_adjusted, variable)
            original_mean = calculate_mean(df_original, variable)
            if original_mean < target:
                desired_range = (target, target + threshold)
            elif original_mean > target:
                desired_range = (target - threshold, target)
            else:
                if sum_differences > 0:
                    desired_range = (original_mean, original_mean + threshold)
                else:
                    desired_range = (original_mean - threshold, original_mean)

            if current_mean < desired_range[0]:
                while current_mean < desired_range[0]:
                    # Break if the execution time exceeds 10 minutes
                    if time.time() - start_time > time_limit_seconds:
                        print("Execution stopped: exceeded time limit of 10 minutes.")
                        break

                    # Shift the whole column by the remaining gap, capped at threshold per step
                    n = desired_range[0] - current_mean
                    adjustment_needed = min(n, threshold)
                    df_adjusted[variable] += adjustment_needed
                    adjusted_clients_count += 1
                    current_mean = calculate_mean(df_adjusted, variable)
            elif current_mean > desired_range[1]:
                while current_mean > desired_range[1]:
                    # Break if the execution time exceeds 10 minutes
                    if time.time() - start_time > time_limit_seconds:
                        print("Execution stopped: exceeded time limit of 10 minutes.")
                        break

                    n = current_mean - desired_range[1]
                    adjustment_needed = min(n, threshold)
                    df_adjusted[variable] -= adjustment_needed
                    adjusted_clients_count += 1
                    current_mean = calculate_mean(df_adjusted, variable)
            else:
                print(f"Current mean for {variable} is within the desired interval.")

        # Running total, printed once per target variable
        print(f"Total clients adjusted: {adjusted_clients_count}")

    return df_adjusted



def _adjust_sampled_clients(df_adjusted, pending, selected_clients, num_clients_per_iter,
                            score_levels, neighbour_offset, new_score_range, increase):
    """Rewrite scores of random, not-yet-used clients until every pending target is reached.

    Args:
        df_adjusted (pd.DataFrame): Frame modified in place.
        pending (list): ``(touchpoint, target)`` pairs still to be reached; emptied in place.
        selected_clients (set): Index labels already sampled; updated in place.
        num_clients_per_iter (int): Number of clients sampled per pass.
        score_levels (list): Score values scanned in order; the first level with a match is used.
        neighbour_offset (int): Offset of the second score matched together with each level.
        new_score_range (tuple): Inclusive ``(low, high)`` bounds passed to ``random.randint``.
        increase (bool): True when satisfaction must rise to the target, False when it must fall.
    Returns:
        int: Number of distinct clients whose scores were rewritten.
    """
    if pending and num_clients_per_iter < 1:
        raise ValueError("num_clients_per_iter must be at least 1")

    adjusted_count = 0
    while pending:
        available_indices = df_adjusted.index.difference(selected_clients)
        if available_indices.empty:
            # Every client has been used once; looping again could never change anything.
            print(f"No unselected clients left; stopping with unmet targets for: {[tp for tp, _ in pending]}")
            break

        # Pick a group of clients that has not been selected before
        to_adjust_samples = df_adjusted.loc[available_indices].sample(
            n=min(num_clients_per_iter, len(available_indices))
        )
        selected_clients.update(to_adjust_samples.index)

        for value in score_levels:
            touched = set()
            for touchpoint, target in pending.copy():
                current_satisfaction = calculate_satisfaction(df_adjusted, touchpoint)
                reached = current_satisfaction >= target if increase else current_satisfaction <= target
                if reached:
                    print(f"Variable {touchpoint} has reached the target satisfaction.")
                    pending.remove((touchpoint, target))
                    continue

                mask = (to_adjust_samples[touchpoint] == value) | (
                    to_adjust_samples[touchpoint] == value + neighbour_offset
                )
                if mask.any():
                    rows = to_adjust_samples[mask].index
                    df_adjusted.loc[rows, touchpoint] = random.randint(*new_score_range)
                    touched.update(rows)
            if touched:
                # Count the clients actually rewritten at this level, across all touchpoints.
                adjusted_count += len(touched)
                break
    return adjusted_count


def hard_manual_sim_rand_cluster_causal(df, targets, touchpoints, df_original, threshold=0.05, num_clients_per_iter=1):
    """
    Adjusts customer scores in the DataFrame to meet satisfaction targets
    by changing multiple touchpoints for a group of clients in each iteration.
    Each client is sampled at most once; if all clients are used before a target
    is reached, the adjustment stops instead of looping forever.
    Args:
        df (pd.DataFrame): Adjusted DataFrame after previous modifications.
        targets (pd.DataFrame): DataFrame with satisfaction targets (first row is used).
        touchpoints (list): List of touchpoints to adjust.
        df_original (pd.DataFrame): Original DataFrame before any adjustments.
        threshold (float): Threshold to consider when adjustments have reached the target.
        num_clients_per_iter (int): Number of clients to select in each iteration.
    Returns:
        pd.DataFrame: Adjusted copy of ``df`` (with an extra ``flag_reached`` column set to 0).
    Raises:
        ValueError: If adjustments are needed and ``num_clients_per_iter`` is below 1.
    """
    df_adjusted = df.copy()
    df_adjusted['flag_reached'] = 0
    selected_clients = set()

    variables_to_increase = []
    variables_to_decrease = []

    sum_differences = 0  # Running sum of (original satisfaction - target)

    # Classify each touchpoint by comparing its original satisfaction with the target
    for variable in targets.columns:
        target = targets[variable].values[0]

        if variable.endswith('_satisfaction'):
            touchpoint = variable.replace('_satisfaction', '')
            if touchpoint in touchpoints:
                original_satisfaction = calculate_satisfaction(df_original, touchpoint)
                sum_differences += original_satisfaction - target

                if original_satisfaction < target:
                    variables_to_increase.append((touchpoint, target))
                elif original_satisfaction > target:
                    variables_to_decrease.append((touchpoint, target))
                else:
                    # Satisfaction was equal to target
                    if sum_differences > 0:
                        # Need to increase within [original, original + threshold]
                        variables_to_increase.append((touchpoint, original_satisfaction + threshold))
                    else:
                        # Need to decrease within [original - threshold, original]
                        variables_to_decrease.append((touchpoint, original_satisfaction - threshold))

    # Sort the variables according to their order in touchpoints
    variables_to_increase.sort(key=lambda x: touchpoints.index(x[0]))
    variables_to_decrease.sort(key=lambda x: touchpoints.index(x[0]))

    # Increase scores: scores 7 and below (scanned from 7 down) are replaced by a random 8-10
    adjusted_clients_count = _adjust_sampled_clients(
        df_adjusted, variables_to_increase, selected_clients, num_clients_per_iter,
        score_levels=[7, 6, 5, 4, 3, 2, 1, 0], neighbour_offset=-1,
        new_score_range=(8, 10), increase=True,
    )

    # Decrease scores: scores 8 and above (scanned from 8 up) are replaced by a random 0-7
    adjusted_clients_count += _adjust_sampled_clients(
        df_adjusted, variables_to_decrease, selected_clients, num_clients_per_iter,
        score_levels=[8, 9, 10], neighbour_offset=1,
        new_score_range=(0, 7), increase=False,
    )

    print(f"Total clients adjusted: {adjusted_clients_count}")

    # Final verification
    verification_results = {}
    for variable in targets.columns:
        if variable.endswith('_satisfaction'):
            touchpoint = variable.replace('_satisfaction', '')
            if touchpoint in touchpoints:
                final_satisfaction = calculate_satisfaction(df_adjusted, touchpoint)
                verification_results[variable] = (final_satisfaction, targets[variable].values[0])
        else:
            final_mean = calculate_mean(df_adjusted, variable)
            verification_results[variable] = (final_mean, targets[variable].values[0])

    for var, (final, target) in verification_results.items():
        print(f"Variable: {var}, Final: {final:.2f}, Target: {target:.2f}, Met: {abs(final - target) <= threshold}")

    return df_adjusted

# Logarithmic weight helper (original note: used to build a dictionary of
# weights from the order of importance).
def log_weight_function(index, max_index):
    """Return a weight that decays logarithmically from 1 (index 0) to 0 (index == max_index)."""
    relative_log_rank = np.log(1 + index) / np.log(1 + max_index)
    return 1 - relative_log_rank

def causal_swapping(df, targets, touchpoints, df_original, max_iterations=100000, threshold=0.05, patience=500):
    """
    Adjusts the customer population by swapping clients between clusters until targets are met or no further improvement.
    Ensures final satisfactions are within desired intervals based on the necessary change.
    Args:
        df (pd.DataFrame): DataFrame with customer scores.
        targets (pd.DataFrame): DataFrame with satisfaction targets; column
            ``f'{touchpoint}_satisfaction'`` is read (first row) for every touchpoint.
        touchpoints (list): List of touchpoints to adjust.
        df_original (pd.DataFrame): Original DataFrame before any adjustments.
        max_iterations (int): Maximum number of iterations.
        threshold (float): Threshold to consider when adjustments have reached the target.
        patience (int): Number of additional iterations after the distance stops improving.
    Returns:
        pd.DataFrame: Adjusted DataFrame.
    """
    df_adjusted = df.copy()

    # Per-touchpoint reference values, computed once
    original_satisfactions = {tp: calculate_satisfaction(df_original, tp) for tp in touchpoints}
    overall_satisfaction = {tp: calculate_satisfaction(df_adjusted, tp) for tp in touchpoints}
    target_values = {tp: targets[f'{tp}_satisfaction'].values[0] for tp in touchpoints}

    # Sum of (target - original) over all touchpoints, used by within_interval for the
    # "original == target" case.
    # NOTE(review): hard_manual_sim_rand_cluster_causal accumulates (original - target);
    # confirm which sign convention is intended.
    sum_differences = sum(target_values[tp] - original_satisfactions[tp] for tp in touchpoints)

    best_distance = calculate_total_distance(df_adjusted, targets, touchpoints)
    current_distance = best_distance  # always defined, even if no swap ever succeeds
    iteration = 0
    patience_counter = 0

    while iteration < max_iterations:
        adjustments_made = False
        local_adjustments_made = False

        # Loop through touchpoints and try a criteria-based swap on the first one out of range
        for tp in touchpoints:
            target_value = target_values[tp]
            current_satisfaction = overall_satisfaction[tp]
            original_satisfaction = original_satisfactions[tp]

            # Define desired interval
            if original_satisfaction < target_value:
                lower_bound, upper_bound = target_value, target_value + threshold
            elif original_satisfaction > target_value:
                lower_bound, upper_bound = target_value - threshold, target_value
            else:
                lower_bound, upper_bound = original_satisfaction - threshold, original_satisfaction + threshold

            candidate = df_adjusted
            if current_satisfaction < lower_bound:
                # Need to increase satisfaction by swapping from cluster 2 to cluster 0 with specific logic
                candidate, local_adjustments_made = swap_client_with_criteria(df_adjusted, from_cluster=2, to_cluster=0, touchpoint=tp, increase=True)
            elif current_satisfaction > upper_bound:
                # Need to decrease satisfaction by swapping from cluster 0 to cluster 2 with opposite specific logic
                candidate, local_adjustments_made = swap_client_with_criteria(df_adjusted, from_cluster=0, to_cluster=2, touchpoint=tp, increase=False)

            if local_adjustments_made:
                # Only adopt the returned frame when the swap reports success
                df_adjusted = candidate

                # Recalculate distances and satisfaction after adjustments
                current_distance = calculate_total_distance(df_adjusted, targets, touchpoints)
                overall_satisfaction = {t: calculate_satisfaction(df_adjusted, t) for t in touchpoints}

                # If new distance is better, reset patience counter
                if current_distance < best_distance:
                    best_distance = current_distance
                    patience_counter = 0
                    adjustments_made = True
                # One criteria swap per iteration, whether or not it improved the distance
                break

        # If no improving adjustment was made, apply simple swap
        if not adjustments_made:
            # The helper returns only a success flag; it is expected to modify df_adjusted in place.
            simple_adjustment_made = simple_swap_clients_between_clusters(df_adjusted, from_cluster=2, to_cluster=0)
            if simple_adjustment_made:
                # Recalculate distances and satisfaction after simple swap
                current_distance = calculate_total_distance(df_adjusted, targets, touchpoints)
                overall_satisfaction = {t: calculate_satisfaction(df_adjusted, t) for t in touchpoints}

                # If new distance is better, reset patience counter
                if current_distance < best_distance:
                    best_distance = current_distance
                    patience_counter = 0
                    adjustments_made = True
                else:
                    patience_counter += 1
            else:
                patience_counter += 1

            if patience_counter >= patience:
                print(f"Local minimum reached at iteration {iteration} with distance {best_distance:.4f} and patience count {patience_counter}")
                break

        # Check if all touchpoints are within desired intervals, each against its own target
        if all(
            within_interval(
                overall_satisfaction[tp],
                original_satisfactions[tp],
                target_values[tp],
                threshold,
                sum_differences=sum_differences,
            )
            for tp in touchpoints
        ):
            print(f"Targets achieved within desired intervals at iteration {iteration} with distance {current_distance:.4f}")
            break

        iteration += 1

    return df_adjusted

def _filter_extreme_clients(clients, touchpoint, high):
    """Keep clients strictly beyond the 90th (high) or 10th (low) percentile of both
    ``out_prob_nps`` and ``f'{touchpoint}_nps'``."""
    prob = clients['out_prob_nps']
    contribution = clients[f'{touchpoint}_nps']
    if high:
        keep = (prob > prob.quantile(0.90)) & (contribution > contribution.quantile(0.90))
    else:
        keep = (prob < prob.quantile(0.10)) & (contribution < contribution.quantile(0.10))
    return clients[keep]


def swap_client_with_criteria(df_adjusted, from_cluster, to_cluster, touchpoint, increase=True):
    """
    Helper function to swap clients between clusters with specific criteria.

    One client of ``from_cluster`` is duplicated and one client of ``to_cluster`` is
    removed, so the population size is unchanged. Both candidates are chosen before
    anything is modified: if either side has no eligible client the input frame is
    returned untouched together with ``False``.
    Args:
        df_adjusted (pd.DataFrame): Adjusted DataFrame (not modified).
        from_cluster (int): Cluster whose client is duplicated.
        to_cluster (int): Cluster whose client is removed.
        touchpoint (str): Touchpoint whose ``_nps`` column drives the selection.
        increase (bool): Whether we are increasing or decreasing satisfaction. When True the
            duplicated client comes from the high tail and the removed one from the low
            tail; when False the tails are reversed.
    Returns:
        (pd.DataFrame, bool): Tuple containing the adjusted DataFrame and a boolean indicating if the swap was successful.
            On success the frame is a new object with a fresh 0..n-1 index.
    """
    clients_from = df_adjusted[df_adjusted['cluster'] == from_cluster]
    clients_to = df_adjusted[df_adjusted['cluster'] == to_cluster]
    if clients_from.empty or clients_to.empty:
        return df_adjusted, False

    duplicate_candidates = _filter_extreme_clients(clients_from, touchpoint, high=increase)
    removal_candidates = _filter_extreme_clients(clients_to, touchpoint, high=not increase)
    if duplicate_candidates.empty or removal_candidates.empty:
        return df_adjusted, False

    client_to_duplicate = duplicate_candidates.sample(n=1)
    client_to_remove = removal_candidates.sample(n=1)

    # Drop by label first, while the labels still refer to the original index, and only
    # then renumber. pd.concat replaces DataFrame.append, which pandas 2.0 removed.
    swapped = pd.concat(
        [df_adjusted.drop(client_to_remove.index), client_to_duplicate],
        ignore_index=True,
    )
    return swapped, True

def simple_swap_clients_between_clusters(df_adjusted, from_cluster, to_cluster):
    """
    Helper function to simply swap clients between clusters.

    A random client of ``from_cluster`` is removed and a copy of a random client of
    ``to_cluster`` is added. Because only a bool is returned, the change is applied
    to ``df_adjusted`` in place: the removed client's row is overwritten with the
    values of the duplicated client, so the row count, index and column dtypes of
    the caller's frame are preserved.
    Args:
        df_adjusted (pd.DataFrame): Adjusted DataFrame (modified in place).
        from_cluster (int): Cluster to remove clients from.
        to_cluster (int): Cluster to add clients to.
    Returns:
        bool: True if swap was successful, False otherwise.
    """
    from_mask = df_adjusted['cluster'] == from_cluster
    to_mask = df_adjusted['cluster'] == to_cluster
    if not from_mask.any() or not to_mask.any():
        return False

    # Sample row positions rather than labels so that duplicated or non-contiguous
    # index labels can never address the wrong row.
    positions = pd.Series(range(len(df_adjusted)), index=df_adjusted.index)
    remove_pos = int(positions[from_mask].sample(n=1).iloc[0])
    add_pos = int(positions[to_mask].sample(n=1).iloc[0])

    # Copy cell by cell: each value stays within its own column, so no dtype is upcast.
    for col_pos in range(df_adjusted.shape[1]):
        df_adjusted.iat[remove_pos, col_pos] = df_adjusted.iat[add_pos, col_pos]

    return True

def within_interval(current, original, target, threshold, sum_differences):
    """Tell whether ``current`` lies in the accepted band around the target.

    The band is ``[target, target + threshold]`` when the original value was below
    the target and ``[target - threshold, target]`` when it was above. When original
    and target coincide, the band sits above the original if ``sum_differences`` is
    positive and below it otherwise.
    """
    if original < target:
        low, high = target, target + threshold
    elif original > target:
        low, high = target - threshold, target
    elif sum_differences > 0:
        low, high = original, original + threshold
    else:
        low, high = original - threshold, original
    return low <= current <= high


# List of variables

In [None]:
# Operational (non-survey) variables.
op_vars = ["ticket_price", "load_factor"]

# Survey touchpoint score columns; the order is relied on elsewhere in the notebook.
touchpoints = [
    "pun_100_punctuality",
    "bkg_200_journey_preparation",
    "pfl_100_checkin",
    "pfl_200_security",
    "pfl_300_lounge",
    "pfl_500_boarding",
    "ifl_300_cabin",
    "ifl_200_flight_crew_annoucements",
    "ifl_600_wifi",
    "ifl_500_ife",
    "ifl_400_food_drink",
    "ifl_100_cabin_crew",
    "arr_100_arrivals",
    "con_100_connections",
    "loy_200_loyalty_programme",
    "img_310_ease_contact_phone",
]

# Binary outcome labels.
labels = ["promoter_binary", "detractor_binary"]

# One "<touchpoint>_nps" column name per touchpoint, in the same order
# (presumably the per-touchpoint SHAP contributions, given the list's name).
shaps = [f"{touchpoint}_nps" for touchpoint in touchpoints]

# Model output columns.
preds = ["out_prob_prom", "out_prob_det", "out_prob_nps"]

nps = ['nps_100']

# Touchpoints followed by the binary labels.
variables = [*touchpoints, *labels]


# Read historic data

In [None]:
# Initialize the S3 client
s3 = boto3.client('s3')

# Define the S3 bucket and prefix
S3_BUCKET = "iberia-data-lake"
s3_resource = boto3.resource("s3")
# Partition date; used both in the survey export prefix below and when locating
# the classifier models.
insert_date_ci='2024-10-28'
prefix = f'customer/nps_surveys/export_historic/insert_date_ci={insert_date_ci}/'

# Two flight-date windows: "_o" bounds the rows that become df_2023 and "_f" the
# rows that become df_2024 (compared against date_flight_local further down).
STR_START_DATE_o = '2023-06-01'
STR_END_DATE_o = '2023-08-30'
STR_START_DATE_f = '2024-06-01'
STR_END_DATE_f = '2024-08-30'
# Cabin and haul values used as equality filters on the merged frame.
STR_CABIN = 'Business'
STR_HAUL = 'LH'

# Calendar month used to select the rows that df_sampled is drawn from.
month = 6

In [None]:
# Assume a cross-account role through STS to obtain temporary credentials.
sts_client = boto3.client('sts')

assumed_role = sts_client.assume_role(
    RoleArn="arn:aws:iam::320714865578:role/ibdata-prod-role-assume-customer-services-from-ibdata-aip-prod",
    RoleSessionName="test"
)

credentials = assumed_role['Credentials']

# Configure s3fs to access S3 with the temporary credentials
fs = s3fs.S3FileSystem(key=credentials['AccessKeyId'], secret=credentials['SecretAccessKey'], token=credentials['SessionToken'])

# Folder that holds the load-factor files
bucket_name = 'ibdata-prod-ew1-s3-customer'
folder_path = 'customer/load_factor_to_s3_nps_model/'

# List every file in the folder
files = fs.ls(f'{bucket_name}/{folder_path}')

# Read every listed file and collect the frames.
# NOTE(review): the original comment called these Parquet files, but they are
# parsed with pd.read_csv - confirm the actual file format.
dataframes = []
for file in files:
    with fs.open(f's3://{file}') as f:
        df = pd.read_csv(f)
        dataframes.append(df)

# Concat the dfs
df_lf_historic = pd.concat(dataframes, ignore_index=True)

In [None]:
# Every object under the survey export prefix is read as CSV and stacked into one frame.
s3_keys = [item.key for item in s3_resource.Bucket(S3_BUCKET).objects.filter(Prefix=prefix)]
preprocess_paths = [f"s3://{S3_BUCKET}/{key}" for key in s3_keys]

# Collect the frames and concatenate once: concatenating inside the loop re-copies
# all previously read rows on every iteration.
nps_frames = []
for file in preprocess_paths:
    df = pd.read_csv(file)
    nps_frames.append(df)

# pd.concat rejects an empty list, so keep the empty-frame result when nothing is found.
df_nps_historic = pd.concat(nps_frames, axis=0) if nps_frames else pd.DataFrame()

# Load models

## Kmeans

In [None]:
# s3 object
s3_resource = boto3.resource("s3")

# Object keys (inside S3_BUCKET) of the pickled clustering model and its scaler
model_path = f"customer/simulations/sbx/01_clusterize_step/model/3means.pkl"
scaler_path = f"customer/simulations/sbx/01_clusterize_step/model/3means_scaler.pkl"

# Load the trained model from S3
# NOTE(review): pickle.loads executes code embedded in the payload; only safe
# while these S3 objects are fully trusted.
model = (
    s3_resource.Bucket(S3_BUCKET)
    .Object(f"{model_path}")
    .get()
)
kmeans_model = pickle.loads(model["Body"].read())

# Load the scaler stored next to the model
scaler = (
    s3_resource.Bucket(S3_BUCKET)
    .Object(f"{scaler_path}")
    .get()
)
kmeans_scaler = pickle.loads(scaler["Body"].read())


## Classifiers (Client Model)

In [None]:
# One classifier is loaded per name; the name is also the last folder of the S3 key.
model_names = ['Promoters', 'Detractors']

# Define the paths for reading data and the trained model
clf_model={}
for name in model_names:
    path_read_train = f"customer/nps_explainability_model/prod/02_train_step/{name}"

    # Determine the path to read the model from (read_last_date=True asks the helper
    # for the last valid partition rather than the exact insert_date_ci one)
    model_path, model_year, model_month, model_day = get_path_to_read_and_date(
        read_last_date=True,
        bucket=S3_BUCKET,
        key=path_read_train,
        partition_date=insert_date_ci,
    )

    # Extract the bucket and object key from the model_path
    # (strips the "s3://" scheme and the bucket name, leaving the object key prefix)
    if 's3://' in model_path:
        model_path = model_path.split('//')[1].replace(f"{S3_BUCKET}/", '')

    # Load the trained model from S3
    # NOTE(review): pickle.loads executes code embedded in the payload; only safe
    # while these S3 objects are fully trusted.
    fitted_clf_model = (
        s3_resource.Bucket(S3_BUCKET)
        .Object(f"{model_path}/model/CatBoostClassifier_cv.pkl")
        .get()
    )
    clf_model[name] = pickle.loads(fitted_clf_model["Body"].read())

# Clean and filter df

In [None]:
 # ETL code: cleans the survey and load-factor frames and merges them into df_historic.

# 1. Filter dataframes by carrier code.
# Haul code 'MH' is folded into 'SH' on the survey side only.
df_nps_historic['haul'] = df_nps_historic['haul'].replace('MH', 'SH')

# NPS HISTORIC
# Keep airlines IB/YW, drop invitegroup_ib == 3 (nulls are kept) and require invitegroup == 2.
condition_1 = (df_nps_historic['operating_airline_code'].isin(['IB', 'YW']))
# condition_1 = (df_nps_historic['operating_airline_code'].isin(['IB'])) # Separate exercise for Iberia only
condition_2 = ((df_nps_historic['invitegroup_ib'] != 3) | (df_nps_historic['invitegroup_ib'].isnull()))
condition_3 = (df_nps_historic['invitegroup'] == 2)

df_nps_historic = df_nps_historic.loc[condition_1 & (condition_2 & condition_3)]

# 2. Transform date column to datetime format
# Values that do not match the format become NaT (errors='coerce').
delay_features = ['real_departure_time_local', 'scheduled_departure_time_local']
for feat in delay_features:
    df_nps_historic[feat] = pd.to_datetime(df_nps_historic[feat], format="%Y%m%d %H:%M:%S", errors = 'coerce')

# Departure delay in minutes (real minus scheduled)
df_nps_historic['delay_departure'] = (df_nps_historic['real_departure_time_local'] - df_nps_historic['scheduled_departure_time_local']).dt.total_seconds()/60

# NPS
df_nps_historic['date_flight_local'] = pd.to_datetime(df_nps_historic['date_flight_local'])

# Load Factor
df_lf_historic['flight_date_local'] = pd.to_datetime(df_lf_historic['flight_date_local'])

# 3. Filter out covid years
# Keeps 2019 and 2022 onwards: years before 2019 and the years 2020-2021 are dropped.
# NPS (historic)
df_nps_historic = df_nps_historic[df_nps_historic['date_flight_local'].dt.year >= 2019]
df_nps_historic = df_nps_historic[~df_nps_historic['date_flight_local'].dt.year.isin([2020, 2021])]

# Load factor (historic)
df_lf_historic = df_lf_historic[df_lf_historic['flight_date_local'].dt.year >= 2019]
df_lf_historic = df_lf_historic[~df_lf_historic['flight_date_local'].dt.year.isin([2020, 2021])]

# 4. Create otp, promoter, detractor and load factor columns.
# OTP: flag is 1 when `delay` exceeds 15.
# NOTE(review): this reads an existing 'delay' column, not the 'delay_departure'
# computed in step 2 - confirm which one is intended.
df_nps_historic['otp15_takeoff'] = (df_nps_historic['delay'] > 15).astype(int)

# Promoter and Detractor columns
df_nps_historic["promoter_binary"] = df_nps_historic["nps_category"].apply(lambda x: 1 if x == "Promoter" else 0)
df_nps_historic["detractor_binary"] = df_nps_historic["nps_category"].apply(lambda x: 1 if x == "Detractor" else 0)

# Load Factor: passengers divided by capacity, per cabin
df_lf_historic['load_factor_business'] = df_lf_historic['pax_business'] / df_lf_historic['capacity_business']
df_lf_historic['load_factor_premium_ec'] = df_lf_historic['pax_premium_ec'] / df_lf_historic['capacity_premium_ec']
df_lf_historic['load_factor_economy'] = df_lf_historic['pax_economy'] / df_lf_historic['capacity_economy']


# 5. Merge dataframes.
# NOTE(review): this mapping is not referenced again in the visible part of the notebook.
cabin_to_load_factor_column = {
    'Economy': 'load_factor_economy',
    'Business': 'load_factor_business',
    'Premium Economy': 'load_factor_premium_ec'
}

# HISTORIC
# Rename the load-factor columns to the survey frame's names; the carrier column
# arrives under one of two names depending on the export.
if 'operating_carrier' in df_lf_historic.columns:
    df_lf_historic.columns = ['date_flight_local' if x=='flight_date_local' else 
                                'operating_airline_code' if x=='operating_carrier' else
                                'surveyed_flight_number' if x=='op_flight_num' else
                                x for x in df_lf_historic.columns]
elif 'op_carrier_group_ib' in df_lf_historic.columns:
    df_lf_historic.columns = ['date_flight_local' if x=='flight_date_local' else 
                                'operating_airline_code' if x=='op_carrier_group_ib' else
                                'surveyed_flight_number' if x=='op_flight_num' else
                                x for x in df_lf_historic.columns]

# Align key dtypes with the survey frame before merging
# (the date column was already converted in step 2, under its previous name).
df_lf_historic['date_flight_local']=pd.to_datetime(df_lf_historic['date_flight_local'])
df_lf_historic['surveyed_flight_number'] = df_lf_historic['surveyed_flight_number'].astype('float64')

# List of columns to transform
load_factor_columns = ['load_factor_business', 'load_factor_premium_ec', 'load_factor_economy']

# Automatically determine id_vars by excluding load_factor_columns from all columns
id_vars = [col for col in df_lf_historic.columns if col not in load_factor_columns]

# Reshaping the DataFrame while dynamically keeping all other columns
# (wide -> long: one row per flight and cabin, with a single 'load_factor' column)
df_lf_historic = pd.melt(df_lf_historic, id_vars=id_vars, 
                    value_vars=load_factor_columns,
                    var_name='cabin_in_surveyed_flight', value_name='load_factor')

# Replacing the column names in 'cabin_in_surveyed_flight' with the desired cabin types
df_lf_historic['cabin_in_surveyed_flight'] = df_lf_historic['cabin_in_surveyed_flight'].map({
    'load_factor_business': 'Business',
    'load_factor_premium_ec': 'Premium Economy',
    'load_factor_economy': 'Economy'
})


# Left join: every survey row is kept; load-factor columns are NaN when no flight matches.
# NOTE(review): 'operating_airline_code' is not a merge key although the rename above
# can produce it on the load-factor side; any column present in both frames and not
# listed in `on` comes back with _x/_y suffixes. 'haul' is a key, and only the survey
# side had 'MH' folded into 'SH' - confirm the load-factor side uses the same codes.
df_historic = pd.merge(df_nps_historic, df_lf_historic, 
                    how='left', 
                    on=['date_flight_local', 'surveyed_flight_number', 'cabin_in_surveyed_flight', 'haul'])
    

In [None]:
 # Date-range filter conditions
condition_start_o = (df_historic['date_flight_local'] >= STR_START_DATE_o)
condition_end_o = (df_historic['date_flight_local'] <= STR_END_DATE_o)

condition_start_f = (df_historic['date_flight_local'] >= STR_START_DATE_f)
condition_end_f = (df_historic['date_flight_local'] <= STR_END_DATE_f)

# Cabin filter.
# NOTE(review): the original comment said the filter is skipped when STR_CABIN is
# "All" and that isin() is used for lists; the code only does a plain equality test.
condition_cabin = df_historic['cabin_in_surveyed_flight'] == STR_CABIN

# Haul filter (same remark as for the cabin filter: plain equality only).
condition_haul = df_historic['haul'] == STR_HAUL

# Filter the DataFrame using all the conditions: one frame per date window
df_2023 = df_historic[condition_start_o & condition_end_o & condition_cabin & condition_haul]
df_2024 = df_historic[condition_start_f & condition_end_f & condition_cabin & condition_haul]

# Sample size for df_sampled: the number of rows in the first window
n = len(df_2023)

df_historic['date_flight_local'] = pd.to_datetime(df_historic['date_flight_local'])
condition_month = (df_historic['date_flight_local'].dt.month == month)

# Random sample (without replacement) of n rows from the same cabin/haul and calendar
# month, across all years. DataFrame.sample raises ValueError if fewer than n rows qualify.
df_sampled = df_historic[condition_cabin & condition_haul & condition_month].sample(n=n)


# Clusterize

## Train model

In [None]:
# Estudio del codo
# inertia_values = elbow_study(df, variables, max_k=15)

In [None]:
# Entrenamiento con el número óptimo de clusters
# df_clustered = train_kmeans(df, variables, n_clusters=3)

## Inference with 3Means

In [None]:
# Notebook inspection cell: display the column index of the sampled frame.
df_sampled.columns

In [None]:
# Assign clusters with the loaded scaler and model, using touchpoints + labels as features:
# df_s <- sampled rows, df_f <- first window (df_2023), df_r <- second window (df_2024).
df_s = classify_into_clusters(df_sampled, touchpoints+labels, kmeans_scaler, kmeans_model)
df_f = classify_into_clusters(df_2023, touchpoints+labels, kmeans_scaler, kmeans_model)
df_r = classify_into_clusters(df_2024, touchpoints+labels, kmeans_scaler, kmeans_model)

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def plot_variable_against_list(df, main_var, var_list, cluster_col='cluster'):
    """
    Plots a main variable against each variable in a list with added noise for better cluster visualization.

    The jitter is kept in local variables, so ``df`` is not modified (no ``*_noisy``
    columns are added to the caller's frame).

    Parameters:
    - df (pd.DataFrame): DataFrame containing the variables and cluster labels.
    - main_var (str): Name of the main variable/column to plot on the x-axis.
    - var_list (list of str): List of variable names to plot on the y-axis.
    - cluster_col (str): Name of the column with cluster labels. Default is 'cluster'.
    """
    n_rows = df.shape[0]

    # Uniform jitter in [-0.5, 0.5) so that identical scores do not overlap;
    # the x-axis jitter is drawn once and reused for every figure.
    main_noisy = df[main_var] + np.random.uniform(-0.5, 0.5, n_rows)

    for var in var_list:
        var_noisy = df[var] + np.random.uniform(-0.5, 0.5, n_rows)

        # Plotting: one figure per variable, points coloured by cluster label
        plt.figure(figsize=(10, 6))
        plt.scatter(main_noisy, var_noisy, c=df[cluster_col], cmap='viridis', alpha=0.7)
        plt.colorbar(label='Cluster')
        plt.xlabel(f'{main_var} (noisy)')
        plt.ylabel(f'{var} (noisy)')
        plt.title(f'{main_var} vs. {var} with Cluster Labels (with Noise)')
        plt.show()
        
        
        
# Variable plotted on the x-axis of every scatter plot
main_var = 'pun_100_punctuality'

# Plot it against every other touchpoint (main_var itself is excluded from the list)
plot_variable_against_list(df_f, main_var, [var for var in touchpoints if var != main_var], cluster_col='cluster')


In [None]:
# One sub-frame of df_f per cluster label (0, 1 and 2).
df_f_0, df_f_1, df_f_2 = (
    df_f[df_f['cluster'] == cluster_id] for cluster_id in (0, 1, 2)
)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# NOTE(review): the column list is taken from `df`, while the plots read from
# `cluster_df` (a copy of df_f_0) — confirm both frames share the '_nps' columns.
numeric_columns = [name for name in df.columns if name.endswith('_nps')]

cluster_df = df_f_0.copy()

# Optional: plot styling (any other available matplotlib style works too).
plt.style.use('ggplot')

# One figure per '_nps' column: histogram on the left, horizontal boxplot on the right.
for col in numeric_columns:
    fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(12, 4))

    # Missing values are dropped before plotting.
    col_values = cluster_df[col].dropna()

    hist_ax.hist(col_values, bins=50, edgecolor='k', alpha=0.7)
    hist_ax.set_title(f"Histograma de {col}")
    hist_ax.set_xlabel(col)
    hist_ax.set_ylabel("Frecuencia")

    box_ax.boxplot(col_values, vert=False)
    box_ax.set_title(f"Boxplot de {col}")
    box_ax.set_xlabel(col)

    plt.tight_layout()
    plt.show()


# Predict explain original

In [None]:
# Model inputs: the variables in `op_vars` followed by the touchpoints.
features = op_vars + touchpoints

# Columns kept for prediction/explanation, in this exact order.
columns = (
    # respondent and flight context
    ["respondent_id", "cabin_in_surveyed_flight", "haul", "date_flight_local"]
    + ["otp15_takeoff", "ticket_price", "load_factor"]
    # touchpoint columns
    + [
        "pun_100_punctuality",
        "bkg_200_journey_preparation",
        "pfl_100_checkin",
        "pfl_200_security",
        "pfl_300_lounge",
        "pfl_500_boarding",
        "ifl_300_cabin",
        "ifl_200_flight_crew_annoucements",
        "ifl_600_wifi",
        "ifl_500_ife",
        "ifl_400_food_drink",
        "ifl_100_cabin_crew",
        "arr_100_arrivals",
        "con_100_connections",
        "loy_200_loyalty_programme",
        "img_310_ease_contact_phone",
    ]
    # promoter/detractor flags, weight and cluster label
    + ["promoter_binary", "detractor_binary", "monthly_weight", "cluster"]
)

In [None]:
# Restrict both frames to the columns listed in `columns`, in that order.
df_f = df_f.loc[:, columns]
df_r = df_r.loc[:, columns]

In [None]:
def _score_frame(frame):
    """Run ``predict_and_explain`` on ``frame`` with the two models named in ``model_names``."""
    return predict_and_explain(
        clf_model[model_names[0]],
        clf_model[model_names[1]],
        frame,
        features,
        K_uncertainty=5,
    )


df_probabilities_2023 = _score_frame(df_f)
df_probabilities_2024 = _score_frame(df_r)

# Simulations

In [None]:
# Aggregated metrics for the 2023 predictions over the `_o` date window.
agg_df_2023 = calculate_metrics_summary(
    df_probabilities_2023, STR_START_DATE_o, STR_END_DATE_o, touchpoints
)

# Aggregated metrics for the following year (2024 predictions, `_f` date window).
agg_df_2024 = calculate_metrics_summary(
    df_probabilities_2024, STR_START_DATE_f, STR_END_DATE_f, touchpoints
)

# Columns of the aggregated frame that are used as targets by the swapping routine.
HEADERS_TARGETS = [
    "ticket_price", "load_factor",
    "pun_100_punctuality_satisfaction",
    "bkg_200_journey_preparation_satisfaction",
    "pfl_100_checkin_satisfaction",
    "pfl_200_security_satisfaction",
    "pfl_300_lounge_satisfaction",
    "pfl_500_boarding_satisfaction",
    "ifl_300_cabin_satisfaction",
    "ifl_200_flight_crew_annoucements_satisfaction",
    "ifl_600_wifi_satisfaction",
    "ifl_500_ife_satisfaction",
    "ifl_400_food_drink_satisfaction",
    "ifl_100_cabin_crew_satisfaction",
    "arr_100_arrivals_satisfaction",
    "con_100_connections_satisfaction",
    "loy_200_loyalty_programme_satisfaction",
    "img_310_ease_contact_phone_satisfaction",
]

# The targets are the 2024 aggregated values of those columns.
targets = agg_df_2024.loc[:, HEADERS_TARGETS]

print(targets)

### Naive client-based optimization

In [None]:
import pandas as pd
import numpy as np
import random


def causal_swapping(df, targets, touchpoints, df_original, max_iterations=100000, threshold=0.05, patience=500, k=5):
    """
    Iteratively swap clients between clusters 0 and 2 to move satisfactions towards the targets.

    On every iteration the first ``k`` touchpoints are compared against a desired
    interval: ``[target, target + threshold]`` when the original satisfaction is
    below the target, ``[target - threshold, target]`` when it is above, and
    ``original ± threshold`` when they are equal. The summed shortfall and excess
    decide whether satisfaction should go up or down, and one swap is attempted via
    ``swap_client_based_on_out_prob``. The loop ends after ``max_iterations``
    iterations or after ``patience`` consecutive iterations without a new best
    total distance.

    NOTE(review): a swap that does not improve the distance is still kept in the
    working frame; only the patience counter reacts to it.

    Args:
        df (pd.DataFrame): DataFrame with customer scores (copied, not modified).
        targets (pd.DataFrame): DataFrame with ``<touchpoint>_satisfaction`` target columns.
        touchpoints (list): List of touchpoints to adjust.
        df_original (pd.DataFrame): Original DataFrame used to compute the baseline satisfactions.
        max_iterations (int): Maximum number of iterations.
        threshold (float): Width of the desired interval around each target.
        patience (int): Consecutive non-improving iterations tolerated before stopping.
        k (int): Number of leading touchpoints used to decide the direction.
    Returns:
        pd.DataFrame: Adjusted DataFrame.
    """
    df_adjusted = df.copy()

    baseline = {tp: calculate_satisfaction(df_original, tp) for tp in touchpoints}
    best_distance = calculate_total_distance(df_adjusted, targets, touchpoints)
    iteration = 0
    stalled_iterations = 0

    while iteration < max_iterations:
        shortfall = 0  # summed distance below the lower bounds
        excess = 0     # summed distance above the upper bounds

        for tp in touchpoints[:k]:
            current = calculate_satisfaction(df_adjusted, tp)
            target_value = targets[f'{tp}_satisfaction'].values[0]
            original = baseline[tp]

            # Desired interval sits on the side of the target the baseline has to move towards.
            if original < target_value:
                lower_bound, upper_bound = target_value, target_value + threshold
            elif original > target_value:
                lower_bound, upper_bound = target_value - threshold, target_value
            else:
                lower_bound, upper_bound = original - threshold, original + threshold

            if current < lower_bound:
                shortfall += lower_bound - current
            elif current > upper_bound:
                excess += current - upper_bound

        # Increase only when the shortfall strictly dominates; ties mean "decrease".
        if shortfall > excess:
            df_adjusted, swapped = swap_client_based_on_out_prob(df_adjusted, from_cluster=2, to_cluster=0, increase=True)
        else:
            df_adjusted, swapped = swap_client_based_on_out_prob(df_adjusted, from_cluster=0, to_cluster=2, increase=False)

        improved = False
        if swapped:
            new_distance = calculate_total_distance(df_adjusted, targets, touchpoints)
            improved = new_distance < best_distance

        if improved:
            best_distance = new_distance
            stalled_iterations = 0
        else:
            stalled_iterations += 1

        if stalled_iterations >= patience:
            print(f"Local minimum reached at iteration {iteration} with distance {best_distance:.4f} and patience count {stalled_iterations}")
            break

        iteration += 1

    return df_adjusted

def swap_client_based_on_out_prob(df_adjusted, from_cluster, to_cluster, increase=True):
    """
    Swap one client between clusters based on extreme ``out_prob_nps`` values.

    One client of ``to_cluster`` is duplicated and one client of ``from_cluster`` is
    removed, so the number of rows stays the same:

    * ``increase=True``: duplicate a client above the 0.90 quantile of ``to_cluster``
      and remove a client below the 0.10 quantile of ``from_cluster``.
    * ``increase=False``: the opposite tails are used.

    Both clients are picked at random inside their tail. If either cluster or either
    tail is empty (e.g. all values are tied, since the comparisons are strict),
    nothing is swapped. The input frame is never modified.

    Args:
        df_adjusted (pd.DataFrame): Adjusted DataFrame; index labels are assumed unique.
        from_cluster (int): Cluster to remove clients from.
        to_cluster (int): Cluster to add clients to.
        increase (bool): Whether we are increasing or decreasing satisfaction.
    Returns:
        (pd.DataFrame, bool): The resulting DataFrame (re-indexed 0..n-1 after a swap,
        the untouched input otherwise) and whether a swap was performed.
    """
    clients_from = df_adjusted[df_adjusted['cluster'] == from_cluster]
    clients_to = df_adjusted[df_adjusted['cluster'] == to_cluster]
    if clients_from.empty or clients_to.empty:
        return df_adjusted, False

    to_scores = clients_to['out_prob_nps']
    from_scores = clients_from['out_prob_nps']
    if increase:
        add_pool = clients_to[to_scores > to_scores.quantile(0.90)]
        remove_pool = clients_from[from_scores < from_scores.quantile(0.10)]
    else:
        add_pool = clients_to[to_scores < to_scores.quantile(0.10)]
        remove_pool = clients_from[from_scores > from_scores.quantile(0.90)]

    # Check emptiness BEFORE sampling: DataFrame.sample(n=1) raises on an empty frame.
    if add_pool.empty or remove_pool.empty:
        return df_adjusted, False

    client_to_add = add_pool.sample(n=1)
    client_to_remove = remove_pool.sample(n=1)

    # Drop by label first, while the labels still refer to this frame; only then
    # append and renumber. Renumbering before dropping removes the wrong row as soon
    # as the index has gaps (which happens after the first swap).
    remaining = df_adjusted.drop(index=client_to_remove.index)
    swapped = pd.concat([remaining, client_to_add], ignore_index=True)
    return swapped, True

def simple_swap_clients_between_clusters(df_adjusted, from_cluster, to_cluster):
    """
    Swap one random client between clusters, modifying ``df_adjusted`` in place.

    A random row of ``from_cluster`` is overwritten with a copy of a random row of
    ``to_cluster``. The frame therefore keeps its size while ``from_cluster`` loses
    one client and ``to_cluster`` gains one. Because only a bool is returned, the
    change has to be applied in place to be visible to the caller (rebinding a
    concatenated frame to the local name would be lost).

    Args:
        df_adjusted (pd.DataFrame): Adjusted DataFrame; index labels are assumed unique.
        from_cluster (int): Cluster to remove clients from.
        to_cluster (int): Cluster to add clients to.
    Returns:
        bool: True if swap was successful, False otherwise.
    """
    clients_from = df_adjusted[df_adjusted['cluster'] == from_cluster]
    clients_to = df_adjusted[df_adjusted['cluster'] == to_cluster]
    if clients_from.empty or clients_to.empty:
        return False

    # Same sampling order as before: the client to remove first, then the one to add.
    client_to_remove = clients_from.sample(n=1)
    client_to_add = clients_to.sample(n=1)

    # Replace the removed client's row with the values of the duplicated client.
    df_adjusted.loc[client_to_remove.index[0]] = client_to_add.iloc[0]
    return True

def within_interval(current, original, target, threshold, sum_differences):
    """
    Tell whether ``current`` lies inside the desired interval.

    The interval is one ``threshold`` wide and anchored at the target, on the side
    the original value has to move towards. When original and target coincide, the
    sign of ``sum_differences`` picks the side (above the original if positive,
    below otherwise).
    """
    if original < target:
        low, high = target, target + threshold
    elif original > target:
        low, high = target - threshold, target
    elif sum_differences > 0:
        low, high = original, original + threshold
    else:
        low, high = original - threshold, original
    return low <= current <= high


In [None]:
import pandas as pd
import numpy as np
import random


def causal_swapping(df, targets, touchpoints, df_original, max_iterations=100000, threshold=0.05, patience=500, k=10):
    """
    Adjusts the customer population by swapping clients between clusters 0 and 2.

    On every iteration the first ``k`` touchpoints are compared against a desired
    interval (``[target, target + threshold]`` when the original satisfaction is
    below the target, ``[target - threshold, target]`` when above,
    ``original ± threshold`` when equal). The summed shortfall/excess decides the
    direction, and a single swap is attempted via ``swap_client_based_on_out_prob``,
    restricted to the first touchpoint that needs an adjustment in that direction.

    The loop stops when:
      * every considered touchpoint is already inside its interval,
      * ``patience`` consecutive iterations brought no new best total distance, or
      * ``max_iterations`` is reached.

    NOTE(review): a swap that does not improve the distance is still kept in the
    working frame; only the patience counter reacts to it.

    Args:
        df (pd.DataFrame): DataFrame with customer scores (copied, not modified).
        targets (pd.DataFrame): DataFrame with ``<touchpoint>_satisfaction`` target columns.
        touchpoints (list): List of touchpoints to adjust.
        df_original (pd.DataFrame): Original DataFrame used to compute the baseline satisfactions.
        max_iterations (int): Maximum number of iterations.
        threshold (float): Width of the desired interval around each target.
        patience (int): Consecutive non-improving iterations tolerated before stopping.
        k (int): Number of top touchpoints to consider when determining direction.
    Returns:
        pd.DataFrame: Adjusted DataFrame.
    """
    df_adjusted = df.copy()

    # Baseline satisfactions and starting distance
    original_satisfactions = {tp: calculate_satisfaction(df_original, tp) for tp in touchpoints}
    best_distance = calculate_total_distance(df_adjusted, targets, touchpoints)
    iteration = 0
    patience_counter = 0

    while iteration < max_iterations:
        total_increase_needed = 0
        total_decrease_needed = 0
        # First touchpoint that is out of bounds in each direction.
        first_needing_increase = None
        first_needing_decrease = None

        for tp in touchpoints[:k]:
            current_satisfaction = calculate_satisfaction(df_adjusted, tp)
            target_value = targets[f'{tp}_satisfaction'].values[0]
            original_satisfaction = original_satisfactions[tp]

            # Determine desired interval
            if original_satisfaction < target_value:
                lower_bound, upper_bound = target_value, target_value + threshold
            elif original_satisfaction > target_value:
                lower_bound, upper_bound = target_value - threshold, target_value
            else:
                lower_bound, upper_bound = original_satisfaction - threshold, original_satisfaction + threshold

            if current_satisfaction < lower_bound:
                total_increase_needed += lower_bound - current_satisfaction
                if first_needing_increase is None:
                    first_needing_increase = tp
            elif current_satisfaction > upper_bound:
                total_decrease_needed += current_satisfaction - upper_bound
                if first_needing_decrease is None:
                    first_needing_decrease = tp

        # Nothing out of bounds: the targets are met. Without this guard a swap
        # would be requested for touchpoint ``None`` and fail on a missing column.
        if first_needing_increase is None and first_needing_decrease is None:
            print(f"All considered touchpoints are within their target intervals at iteration {iteration}. Stopping.")
            break

        # Overall direction: increase only when the shortfall strictly dominates.
        increase_satisfaction = total_increase_needed > total_decrease_needed

        if increase_satisfaction:
            selected_touchpoint = first_needing_increase
            if selected_touchpoint is None:
                selected_touchpoint = first_needing_decrease
            # Increase satisfaction: remove a client from cluster 0, duplicate one from cluster 2
            df_adjusted, local_adjustments_made = swap_client_based_on_out_prob(df_adjusted, from_cluster=0, to_cluster=2, increase=True, touchpoints=[selected_touchpoint])
        else:
            selected_touchpoint = first_needing_decrease
            if selected_touchpoint is None:
                selected_touchpoint = first_needing_increase
            # Decrease satisfaction: remove a client from cluster 2, duplicate one from cluster 0
            df_adjusted, local_adjustments_made = swap_client_based_on_out_prob(df_adjusted, from_cluster=2, to_cluster=0, increase=False, touchpoints=[selected_touchpoint])

        # Recompute the distance only when a swap actually happened
        new_distance = None
        if local_adjustments_made:
            new_distance = calculate_total_distance(df_adjusted, targets, touchpoints)

        if new_distance is not None and new_distance < best_distance:
            best_distance = new_distance
            patience_counter = 0
        else:
            patience_counter += 1

        if patience_counter >= patience:
            print(f"Local minimum reached at iteration {iteration} with distance {best_distance:.4f} and patience count {patience_counter}")
            break

        # Progress report; ``new_distance`` only exists when a swap was made.
        if new_distance is not None:
            print(f'Iteration {iteration} with distance {new_distance:.4f}')
        else:
            print(f'Iteration {iteration}: no swap could be made')

        iteration += 1

    return df_adjusted

def _first_compatible_pair(add_pool, remove_pool, columns, increase):
    """
    Find the first (add, remove) pair of candidates compatible on every column.

    Candidates are scanned in frame order: add candidates in the outer loop, remove
    candidates in the inner one. A pair is rejected when, on any of ``columns``, the
    add candidate is ``<=`` the remove candidate (``increase=True``) or ``>=`` it
    (``increase=False``). NaN comparisons are False, so NaN never rejects a pair.

    Returns:
        tuple[int, int] | None: positions inside ``add_pool`` / ``remove_pool``, or
        None when no pair qualifies.
    """
    add_values = add_pool[columns].to_numpy()
    remove_values = remove_pool[columns].to_numpy()
    for add_pos, add_row in enumerate(add_values):
        # Compare one add candidate against all remove candidates at once.
        if increase:
            rejected = (add_row <= remove_values).any(axis=1)
        else:
            rejected = (add_row >= remove_values).any(axis=1)
        accepted = np.flatnonzero(~rejected)
        if accepted.size:
            return add_pos, int(accepted[0])
    return None


def swap_client_based_on_out_prob(df_adjusted, from_cluster, to_cluster, increase=True, touchpoints=None):
    """
    Swap one client between clusters using ``out_prob_nps`` and touchpoint criteria.

    One client of ``to_cluster`` is duplicated and one client of ``from_cluster`` is
    removed. Candidates are the ``to_cluster`` clients with ``out_prob_nps > 0`` and
    the ``from_cluster`` clients with ``out_prob_nps < 0`` when ``increase`` is True
    (signs reversed otherwise). The pair is chosen with progressively relaxed rules:

    1. (only if ``touchpoints`` is given) the added client must be strictly better
       (worse when decreasing) on both ``<tp>_nps`` and ``<tp>`` for every touchpoint;
    2. the same comparison on the touchpoint value ``<tp>`` only;
    3. a random pair of candidates.

    The input frame is never modified and column dtypes are preserved.

    Args:
        df_adjusted (pd.DataFrame): Adjusted DataFrame; index labels are assumed unique.
        from_cluster (int): Cluster to remove clients from.
        to_cluster (int): Cluster to add clients to.
        increase (bool): Whether we are increasing or decreasing satisfaction.
        touchpoints (list): List of touchpoints to consider for criteria.
    Returns:
        (pd.DataFrame, bool): The resulting DataFrame (re-indexed 0..n-1 after a swap,
        the untouched input otherwise) and whether a swap was performed.
    """
    clients_from = df_adjusted[df_adjusted['cluster'] == from_cluster]
    clients_to = df_adjusted[df_adjusted['cluster'] == to_cluster]
    if clients_from.empty or clients_to.empty:
        return df_adjusted, False

    if increase:
        add_pool = clients_to[clients_to['out_prob_nps'] > 0]
        remove_pool = clients_from[clients_from['out_prob_nps'] < 0]
    else:
        add_pool = clients_to[clients_to['out_prob_nps'] < 0]
        remove_pool = clients_from[clients_from['out_prob_nps'] > 0]
    if add_pool.empty or remove_pool.empty:
        return df_adjusted, False

    value_columns = list(touchpoints) if touchpoints else []

    pair = None
    if value_columns:
        # Stage 1: SHAP-style '<tp>_nps' column and raw touchpoint value must both agree.
        strict_columns = [col for tp in value_columns for col in (f'{tp}_nps', tp)]
        pair = _first_compatible_pair(add_pool, remove_pool, strict_columns, increase)
    if pair is None:
        # Stage 2: only the raw touchpoint value must agree.
        pair = _first_compatible_pair(add_pool, remove_pool, value_columns, increase)

    if pair is not None:
        add_pos, remove_pos = pair
        client_to_add = add_pool.iloc[[add_pos]]
        remove_labels = remove_pool.index[[remove_pos]]
    else:
        # Stage 3: no compatible pair, fall back to a random pair of candidates.
        client_to_add = add_pool.sample(n=1)
        remove_labels = remove_pool.sample(n=1).index

    # Drop by label while the labels still refer to this frame, then append the
    # duplicated client (kept as a one-row frame so dtypes survive) and renumber.
    remaining = df_adjusted.drop(index=remove_labels)
    swapped = pd.concat([remaining, client_to_add], ignore_index=True)
    return swapped, True

def simple_swap_clients_between_clusters(df_adjusted, from_cluster, to_cluster):
    """
    Helper function to swap one random client between clusters, in place.

    The row of a randomly chosen ``from_cluster`` client is overwritten with the
    values of a randomly chosen ``to_cluster`` client: the frame keeps its length,
    ``from_cluster`` shrinks by one and ``to_cluster`` grows by one. The update is
    done in place because the function only returns a bool; assigning a new
    concatenated frame to the local name would never reach the caller.

    Args:
        df_adjusted (pd.DataFrame): Adjusted DataFrame; index labels are assumed unique.
        from_cluster (int): Cluster to remove clients from.
        to_cluster (int): Cluster to add clients to.
    Returns:
        bool: True if swap was successful, False otherwise.
    """
    from_mask = df_adjusted['cluster'] == from_cluster
    to_mask = df_adjusted['cluster'] == to_cluster
    if not from_mask.any() or not to_mask.any():
        return False

    # Sampling order is kept: first the client to remove, then the client to add.
    removed_label = df_adjusted[from_mask].sample(n=1).index[0]
    added_row = df_adjusted[to_mask].sample(n=1).iloc[0]

    # Overwrite the removed client with a duplicate of the added one.
    df_adjusted.loc[removed_label] = added_row
    return True

def within_interval(current, original, target, threshold, sum_differences):
    """
    Check whether ``current`` falls inside the desired satisfaction interval.

    * original below target: ``[target, target + threshold]``
    * original above target: ``[target - threshold, target]``
    * original equal to target: ``[original, original + threshold]`` when
      ``sum_differences`` is positive, ``[original - threshold, original]`` otherwise.
    """
    if original < target:
        return target <= current <= target + threshold
    if original > target:
        return target - threshold <= current <= target
    if sum_differences > 0:
        return original <= current <= original + threshold
    return original - threshold <= current <= original


In [None]:
def calculate_total_distance(df, targets, touchpoints):
    """
    Euclidean distance between the current touchpoint satisfactions and their targets.

    Args:
    df (pd.DataFrame): DataFrame with the customer scores.
    targets (pd.DataFrame): DataFrame holding the ``<touchpoint>_satisfaction`` target columns.
    touchpoints (list): Touchpoints to take into account.

    Returns:
    float: L2 norm of (current satisfactions - targets); no weighting is applied.
    """
    # Current satisfaction per touchpoint, in the order of `touchpoints`.
    observed = np.array([calculate_satisfaction(df, tp) for tp in touchpoints])

    # Matching target values, flattened to a 1-D vector.
    target_columns = [f'{tp}_satisfaction' for tp in touchpoints]
    expected = targets[target_columns].values.flatten()

    gap = observed - expected
    return np.linalg.norm(gap)

def causal_swapping(df, targets, touchpoints, max_iterations=100000, threshold=1, patience=5000):
    """
    Adjust the customer population by swapping clients between clusters 0 and 2.

    Each iteration works on a copy of the current population, asks
    ``swap_client_with_criteria`` for one swap and keeps the result only when it
    lowers the total distance to the targets. The swap direction follows the sign of
    the mean of (current satisfactions - targets): negative -> from cluster 0 to
    cluster 2, positive -> from cluster 2 to cluster 0; a mean of exactly zero makes
    no swap and ends the loop.

    The loop stops when any of these holds:
      * no swap could be made,
      * ``patience`` rejected candidates have accumulated since the last accepted one,
      * the latest candidate distance is below 1 (hard-coded),
      * every satisfaction lies inside ``target ± threshold``,
      * ``max_iterations`` is reached.

    The initial and final satisfaction tables are shown with ``display``, which is
    not defined in this block (presumably the IPython/Jupyter built-in — confirm
    before running outside a notebook).

    Args:
        df (pd.DataFrame): DataFrame with customer scores (copied, not modified here).
        targets (pd.DataFrame): DataFrame with ``<touchpoint>_satisfaction`` target columns.
        touchpoints (list): List of touchpoints to adjust.
        max_iterations (int): Maximum number of iterations.
        threshold (float): Half-width of the interval around each target.
        patience (int): Number of rejected candidates tolerated before stopping.
    Returns:
        pd.DataFrame: Adjusted DataFrame.
    """
    iteration = 0
    patience_counter = 0
    df_original = df.copy()
    df_current = df.copy()

    original_satisfactions = {tp: calculate_satisfaction(df_original, tp) for tp in touchpoints}
    # From here on `initial_distance` holds the best (lowest) distance accepted so far.
    initial_distance = calculate_total_distance(df_original, targets, touchpoints)
    print("Initial distance between dataframe satisfactions and targets:", initial_distance)

    # Table of starting satisfactions with their bounds: one column per touchpoint after `.T`.
    initial_conditions_df = pd.DataFrame({
        'Original Satisfaction': [original_satisfactions[tp] for tp in touchpoints],
        'Lower Bound': [targets[f'{tp}_satisfaction'].values[0] - threshold for tp in touchpoints],
        'Target': [targets[f'{tp}_satisfaction'].values[0] for tp in touchpoints],
        'Upper Bound': [targets[f'{tp}_satisfaction'].values[0] + threshold for tp in touchpoints],
    }, index=touchpoints).T

    display(initial_conditions_df)

    # {touchpoint: [lower bound, upper bound]}, the shape indexed by all_within_limits.
    target_intervals = initial_conditions_df.loc[['Lower Bound', 'Upper Bound']].to_dict('list')
    target_values = targets[[f'{tp}_satisfaction' for tp in touchpoints]].values.flatten()
    current_satisfactions = np.array([calculate_satisfaction(df_current, tp) for tp in touchpoints])

    while iteration < max_iterations:


        adjustments_made = False
        # Sign of the mean gap decides whether satisfaction has to go up or down.
        mean_difference = np.mean(current_satisfactions - target_values)

        # Progress is reported every 250 iterations only.
        if iteration % 250 == 0:
            print("Iteration number: ", iteration)
            print("Mean difference: ", mean_difference)

        # Work on a copy so that a rejected swap leaves `df_current` untouched.
        df_test = df_current.copy()

        if mean_difference < 0:
            # Negative mean difference: increase satisfaction (swap a client from cluster 0 to cluster 2).
            adjustments_made, df_test = swap_client_with_criteria(df_test, from_cluster=0, to_cluster=2, touchpoints=touchpoints)
        elif mean_difference > 0:
            # Positive mean difference: decrease satisfaction (swap a client from cluster 2 to cluster 0).
            adjustments_made, df_test = swap_client_with_criteria(df_test, from_cluster=2, to_cluster=0, touchpoints=touchpoints)

        # If a swap was made, evaluate the candidate population.
        if adjustments_made:

            new_distance = calculate_total_distance(df_test, targets, touchpoints)

            if iteration % 250 == 0 :
                print(f"After adjustment, distance: {new_distance}")


            if new_distance < initial_distance:
                # Improvement: accept the candidate and refresh the cached satisfactions.
                df_current = df_test.copy()
                current_satisfactions = np.array([calculate_satisfaction(df_current, tp) for tp in touchpoints])

                initial_distance = new_distance
                patience_counter = 0
            else:
                # Candidate rejected: `df_current` is kept as it was.
                patience_counter += 1
                if iteration % 250 == 0:
                    print(f"Unnefficient iteration ({iteration}). Patience counter is: {patience_counter}")

            if patience_counter >= patience:
                print(f"Local minimum reached at iteration {iteration} with distance {initial_distance:.4f}")
                break
        else:
            # Reached when a cluster is empty or when the mean difference is exactly 0.
            print(f"No more adjustments can be made at iteration {iteration}.")
            break

        # `new_distance` is always bound here: the no-swap branch above breaks out first.
        if new_distance < 1:
            print("New_distance < 1. Stopping.")
            break        
        elif all_within_limits(current_satisfactions, target_intervals, touchpoints):
            print(f"All satisfactions are within the target intervals at iteration {iteration}. Stopping.")
            break

        iteration += 1

    # Table of final satisfactions against the same bounds.
    final_satisfactions = {tp: calculate_satisfaction(df_current, tp) for tp in touchpoints}
    final_conditions_df = pd.DataFrame({
        'Final Satisfactions': [final_satisfactions[tp] for tp in touchpoints],
        'Lower Bound': [targets[f'{tp}_satisfaction'].values[0] - threshold for tp in touchpoints],
        'Target': [targets[f'{tp}_satisfaction'].values[0] for tp in touchpoints],
        'Upper Bound': [targets[f'{tp}_satisfaction'].values[0] + threshold for tp in touchpoints],
    }, index=touchpoints).T
    display(final_conditions_df)


    return df_current


def swap_client_with_criteria(df_adjusted, from_cluster, to_cluster, touchpoints, k=None, increase=True):
    """
    Swap one randomly chosen client between two clusters.

    One random row belonging to ``from_cluster`` is removed and one random row
    belonging to ``to_cluster`` is duplicated, so the number of rows stays the
    same while the cluster mix shifts towards ``to_cluster``.

    The input frame is never modified: the swap is performed on a re-indexed
    copy. Re-indexing also guarantees that exactly one row is removed even
    when the incoming index contains duplicated labels.

    Args:
        df_adjusted (pd.DataFrame): Frame with a 'cluster' column.
        from_cluster (int): Cluster a client is removed from.
        to_cluster (int): Cluster a client is duplicated in.
        touchpoints (list): Currently unused; kept for interface compatibility.
        k (int): Currently unused; kept for interface compatibility.
        increase (bool): Currently unused; kept for interface compatibility.
    Returns:
        tuple[bool, pd.DataFrame]: ``(True, new_frame)`` when the swap was
        performed (``new_frame`` has a fresh 0..n-1 index), or
        ``(False, df_adjusted)`` with the untouched input when either cluster
        has no rows.
    """
    # Unique positional labels make the drop below remove exactly one row.
    working = df_adjusted.reset_index(drop=True)
    clients_from = working[working['cluster'] == from_cluster]
    clients_to = working[working['cluster'] == to_cluster]

    if clients_from.empty or clients_to.empty:
        return False, df_adjusted

    # Sample in the same order as before (removal first, then duplication) so
    # a seeded random state yields the same choices.
    client_to_remove = clients_from.sample(n=1)
    client_to_add = clients_to.sample(n=1)

    remaining = working.drop(client_to_remove.index)
    swapped = pd.concat([remaining, client_to_add], ignore_index=True)
    return True, swapped

def all_within_limits(satisfactions, intervals, touchpoints):
    """Return True when every satisfaction lies inside its touchpoint interval.

    ``satisfactions[i]`` is compared (bounds inclusive) against
    ``intervals[touchpoints[i]]``, read as ``(lower, upper)``.
    """
    for position, name in enumerate(touchpoints):
        if not (intervals[name][0] <= satisfactions[position] <= intervals[name][1]):
            return False
    return True


In [None]:
# Build the swapped population from the 2023 predictions: causal_swapping is
# run against the targets for the given touchpoints with threshold=0.25.
# swaped_df = causal_swapping(df_probabilities_2023, targets, touchpoints, df_probabilities_2023)
swaped_df = causal_swapping(df_probabilities_2023, targets, touchpoints, threshold=0.25)

# Soft simulation on the same 2023 predictions; the hard-simulation variants
# below are currently disabled.
soft_sim_df = soft_manual_sim_causal(df_probabilities_2023, targets, touchpoints, df_probabilities_2023)
# hard_sim_df_aux = hard_manual_sim_rand_cluster_causal(swaped_df, targets, touchpoints, df_f)
# hard_sim_df = soft_manual_sim_causal(hard_sim_df_aux, targets, touchpoints, df_f)

### Monte Carlo

In [None]:
import numpy as np
import pandas as pd
from scipy.stats import norm, beta

def simulate_client_satisfaction_with_beta(
    historical_df,
    target_df,
    num_simulations=1000,
    beta_params=(2, 5)  # alpha and beta shape parameters, tunable
):
    """
    Simulate client responses with beta-distributed touchpoints and normally
    distributed 'ticket_price' and 'load_factor'.

    Touchpoint scores are generated through a Gaussian copula: correlated
    standard normals (zero mean, historical correlation matrix) are mapped to
    uniforms with the normal CDF and then to Beta(alpha, beta) marginals
    multiplied by 10. The latent normals must be standard for the CDF step to
    produce uniforms; drawing them around the historical means pushed every
    simulated score towards the upper bound.

    The satisfaction targets do not drive the simulation; they are only
    printed next to the simulated share of scores >= 8 for comparison.

    Parameters:
    - historical_df (DataFrame): Historical client responses. Must contain
      'ticket_price', 'load_factor' and the touchpoint columns (any column
      whose name contains a touchpoint name).
    - target_df (DataFrame): Only the first row is read. Holds one
      '<touchpoint>_satisfaction' column per touchpoint plus the mean values
      of 'ticket_price' and 'load_factor'.
    - num_simulations (int): Number of simulated rows to generate.
    - beta_params (tuple): Alpha and beta parameters of the touchpoint
      beta distributions.

    Returns:
    - simulated_df (DataFrame): One row per simulation, with the touchpoint
      columns (in historical column order) followed by 'ticket_price' and
      'load_factor'.
    """
    # Targets and touchpoint names come from the first row / columns of target_df
    target_values = target_df.iloc[0]
    touchpoints = [col.replace("_satisfaction", "") for col in target_df.columns if '_satisfaction' in col]
    ticket_price_mean = target_values['ticket_price']
    load_factor_mean = target_values['load_factor']

    # Pair every historical touchpoint column with the touchpoint it belongs
    # to (longest matching name, so an exact match always wins). Keeping the
    # pair avoids reporting a column against another touchpoint's target when
    # the two frames order their columns differently.
    matched_columns = []
    for col in historical_df.columns:
        candidates = [tp for tp in touchpoints if tp in col]
        if candidates:
            matched_columns.append((col, max(candidates, key=len)))
    touchpoint_columns = [col for col, _ in matched_columns]
    touchpoint_df = historical_df[touchpoint_columns]

    # Dependence structure between touchpoints
    correlation_matrix = touchpoint_df.corr()

    # Spread of 'ticket_price' and 'load_factor' is taken from history
    ticket_price_std = historical_df['ticket_price'].std()
    load_factor_std = historical_df['load_factor'].std()

    # Latent correlated standard normals (zero mean, unit variance)
    simulated_touchpoints = np.random.multivariate_normal(
        mean=np.zeros(len(touchpoint_columns)),
        cov=correlation_matrix.to_numpy(),
        size=num_simulations,
    )

    # Step 1: standard normal -> uniform through the normal CDF
    uniform_touchpoints = norm.cdf(simulated_touchpoints)

    # Step 2: uniform -> beta, scaled to the 0-10 range
    simulated_touchpoints_beta = beta.ppf(uniform_touchpoints, *beta_params) * 10

    # 'ticket_price' and 'load_factor' are centred on the target means
    simulated_ticket_price = np.random.normal(ticket_price_mean, ticket_price_std, num_simulations)
    simulated_load_factor = np.random.normal(load_factor_mean, load_factor_std, num_simulations)

    # Assemble the simulated frame
    simulated_df = pd.DataFrame(simulated_touchpoints_beta, columns=touchpoint_columns)
    simulated_df['ticket_price'] = simulated_ticket_price
    simulated_df['load_factor'] = simulated_load_factor

    # Report each touchpoint against its own target
    for tp_col, tp_name in matched_columns:
        tp_target = target_values[f"{tp_name}_satisfaction"]
        # Share of scores >= 8, to compare with the target (assumed to be on a 0-100 scale)
        satisfaction_percentage = (simulated_df[tp_col] >= 8).mean() * 100
        print(f"{tp_col} - Objetivo: {tp_target}%, Simulado: {satisfaction_percentage:.2f}%")

    print(f"ticket_price - Objetivo: {target_values['ticket_price']}, Simulado: {simulated_df['ticket_price'].mean():.2f}")
    print(f"load_factor - Objetivo: {target_values['load_factor']}, Simulado: {simulated_df['load_factor'].mean():.2f}")

    return simulated_df



In [None]:
# Run the Monte Carlo simulation on the historical columns listed in op_vars
# plus the touchpoints, with the default number of simulations and beta
# parameters.
mc_columns = op_vars + touchpoints
mc_sim = simulate_client_satisfaction_with_beta(df_historic[mc_columns], targets)

# Comprobación 

In [None]:
# Load the debug export of the swapped simulation and set the same cabin and
# haul values on every row.
debug_swaped_pipe = pd.read_csv('swaped_simulated_debug_pipe.csv')
debug_swaped_pipe['cabin_in_surveyed_flight'] = 'Economy'
debug_swaped_pipe['haul'] = 'SH'

In [None]:
import numpy as np
# Score the debug frame (restricted to `columns`) with the two models stored
# in clf_model under model_names[0] and model_names[1].
df_probabilities_swaped = predict_and_explain(clf_model[model_names[0]], clf_model[model_names[1]], debug_swaped_pipe[columns], features, K_uncertainty=5)

In [None]:
# Summary metrics of the debug predictions between 2023-03-01 and 2023-03-31
start_date_f = '2023-03-01'
end_date_f = '2023-03-31'

annual_df_sim = calculate_metrics_summary(df_probabilities_swaped, start_date_f, end_date_f, touchpoints)

In [None]:
# Display the scored debug frame
df_probabilities_swaped

In [None]:
# Display the summary metrics computed for the debug frame
annual_df_sim

In [None]:
# Score the swapped and the soft-simulated populations with the same pair of models
df_probabilities_swaped = predict_and_explain(clf_model[model_names[0]], clf_model[model_names[1]], swaped_df[columns], features, K_uncertainty=5)
df_probabilities_soft_sim = predict_and_explain(clf_model[model_names[0]], clf_model[model_names[1]], soft_sim_df[columns], features, K_uncertainty=5)
# df_probabilities_hard_sim = predict_and_explain(clf_model[model_names[0]], clf_model[model_names[1]], hard_sim_df, features, K_uncertainty=5)

# Tag each frame with the population it represents in 'simulation_client_type'
df_probabilities_2023['simulation_client_type'] = 'original_2023'
df_probabilities_2024['simulation_client_type'] = 'original_2024'
df_probabilities_swaped['simulation_client_type'] = 'swaped_simulated'
df_probabilities_soft_sim['simulation_client_type'] = 'soft_simulated'
# df_probabilities_hard_sim['simulation_client_type'] = 'hard_simulated'

# Stack all populations into a single frame with a fresh index
client_df = pd.concat(
    [df_probabilities_2023, df_probabilities_2024, df_probabilities_swaped, df_probabilities_soft_sim,],
    ignore_index=True
)


In [None]:
# Plot main_var against every other touchpoint for the 2024 predictions,
# passing 'cluster' as the cluster column.
plot_variable_against_list(df_probabilities_2024, main_var, [var for var in touchpoints if var != main_var], cluster_col='cluster')

In [None]:
import matplotlib.pyplot as plt

# Comparison windows: June 1 to August 30, for 2023 (suffix _o) and 2024 (suffix _f).
# NOTE(review): August has 31 days - confirm whether leaving out the 31st is intended.
start_date_o = '2023-06-01'
end_date_o = '2023-08-30'

start_date_f = '2024-06-01'
end_date_f = '2024-08-30'

# Split the combined frame by population.
# NOTE: despite the "_sim" suffix, filtered_client_df_sim holds the 'original_2024' rows here.
filtered_client_df_or = client_df[client_df['simulation_client_type'] == 'original_2023']
filtered_client_df_sim = client_df[client_df['simulation_client_type'] == 'original_2024']

# Summary metrics of each population over its own window
annual_df_or = calculate_metrics_summary(filtered_client_df_or, start_date_o, end_date_o, touchpoints)
annual_df_sim = calculate_metrics_summary(filtered_client_df_sim, start_date_f, end_date_f, touchpoints)

# Numeric columns are taken from the 2023 summary
numerical_cols = annual_df_or.select_dtypes(include='number').columns

# 2024 summary minus 2023 summary over the numeric columns
diff_df = annual_df_sim[numerical_cols] - annual_df_or[numerical_cols]

diff_df



In [None]:
# Display the 2024-minus-2023 difference frame
diff_df

In [None]:
# Work on a copy of the difference frame
df = diff_df.copy()

# Columns ending in '_nps', leaving out the three columns listed explicitly
nps_columns = [col for col in df.columns if col.endswith('_nps') and col not in ['uncertainty_nps', 'base_prob_nps', 'out_prob_nps']]

# Value of 'out_prob_nps' in the first row
out_prob_nps = df['out_prob_nps'].values[0]

# First-row values of the selected '_nps' columns, one per bar
waterfall_data = df[nps_columns].iloc[0].tolist()

# Bar labels are the column names themselves
labels = nps_columns


# Values plotted in the waterfall (same list object as waterfall_data)
waterfall_values = waterfall_data

# Column names with '_nps' removed, used to look up the annotation values
corresponding_columns = [col.replace('_nps', '') for col in nps_columns]

# First-row values of those columns
corresponding_values = df[corresponding_columns].iloc[0].tolist()

# Build the waterfall chart, writing the companion value inside each bar
def waterfall_plot(labels, values, corresponding_values, title="Original 2023 to Original 2024"):
    """Draw a waterfall chart of stacked contributions.

    Each entry of ``values`` is drawn as a bar that starts where the previous
    one ended (green when positive, red otherwise). The matching entry of
    ``corresponding_values`` is written, with two decimals, at the middle of
    the bar. The figure is shown with ``plt.show()``; nothing is returned.
    """
    _, ax = plt.subplots(figsize=(10, 6))

    running_total = 0
    for position, contribution in enumerate(values):
        if contribution > 0:
            bar_color = 'green'
        else:
            bar_color = 'red'
        ax.bar(labels[position], contribution, bottom=running_total, color=bar_color)

        # Vertical midpoint of the bar just drawn
        text_height = running_total + contribution / 2
        annotation = f'{corresponding_values[position]:.2f}'
        ax.text(position, text_height, annotation, ha='center', va='center', color='black', fontsize=10)

        running_total += contribution

    # Axis label and title
    ax.set_ylabel('Contributions to out_prob_nps')
    ax.set_title(title)

    # Tilt the x tick labels so long column names stay readable
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    plt.show()

# Draw the chart for the 2023-to-2024 differences, annotated with the companion values
waterfall_plot(labels, waterfall_values, corresponding_values)
