In [2]:
!pip install plotly
!pip install boto3==1.19.12
!pip install s3fs
!pip install lightgbm
!pip install shap
!pip install catboost

[33mDEPRECATION: pyodbc 4.0.0-unsupported has a non-standard version number. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pyodbc or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[33mDEPRECATION: pyodbc 4.0.0-unsupported has a non-standard version number. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pyodbc or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m
[1m[[0m[34;49mn

In [3]:
# General
import pandas as pd
from pandas.tseries.offsets import MonthEnd
from datetime import datetime, timedelta
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
import os
import numpy as np
import xlsxwriter
import datetime
import boto3
import s3fs
from itertools import combinations
import pickle

# Sklearn
from sklearn.model_selection import train_test_split, KFold
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, log_loss
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer


# Models
from catboost import CatBoostClassifier, cv, Pool
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor


# Plots
import matplotlib.pyplot as plt
import seaborn as sns

# SHAP
import shap

# Random
import random

#Warnings
import warnings
warnings.filterwarnings("ignore")

In [4]:
df=pd.read_csv('pipeline_output/incremental.csv')

In [5]:
df['load_factor']

0        NaN
1        NaN
2        NaN
3        NaN
4      0.810
        ... 
7543     NaN
7544     NaN
7545     NaN
7546     NaN
7547     NaN
Name: load_factor, Length: 7548, dtype: float64

# Utils

In [6]:
def inv_logit(x):
    return 1 / (1 + np.exp(-x))

def calculate_SHAP_and_probability_binary(model_promoter, model_detractor, df):
    # Extraer ID y fechas, manteniendo el índice
    id_df = df[['respondent_id', 'date_flight_local']]
    
    # Preparar el conjunto de datos para predicciones, excluyendo ID y fechas
    test_set = df.drop(['respondent_id', 'date_flight_local'], axis=1, errors='ignore')
    
    # Predicciones y probabilidades para promotores
    promoter_test_set = test_set.drop(['promoter_binary'], axis=1, errors='ignore')
    predictions_promoter = pd.DataFrame(model_promoter.predict(promoter_test_set), index=promoter_test_set.index, columns=["prediction_prom"])
    proba_promoter = pd.DataFrame(model_promoter.predict_proba(promoter_test_set)[:, 1], index=promoter_test_set.index, columns=["out_prob_prom"])
    
    # Predicciones y probabilidades para detractores
    detractor_test_set = test_set.drop(['detractor_binary'], axis=1, errors='ignore')
    predictions_detractor = pd.DataFrame(model_detractor.predict(detractor_test_set), index=detractor_test_set.index, columns=["prediction_det"])
    proba_detractor = pd.DataFrame(model_detractor.predict_proba(detractor_test_set)[:, 1], index=detractor_test_set.index, columns=["out_prob_det"])
    
    # Combinar resultados de predicción, manteniendo el índice original
    prediction = pd.concat([id_df, test_set, predictions_promoter, proba_promoter, predictions_detractor, proba_detractor], axis=1)
    
    # SHAP values y explicadores para el modelo promotor
    shap_Explainer_promoter = shap.TreeExplainer(model_promoter)
    shap_values_promoter = shap_Explainer_promoter.shap_values(promoter_test_set)
    feature_names = [i for i in promoter_test_set.columns]
    shap_values_prom = pd.DataFrame(shap_values_promoter, index=promoter_test_set.index, columns=[f"{i}_prom" for i in feature_names])
    shap_values_prom["base_value_prom"] = shap_Explainer_promoter.expected_value
    shap_values_prom["out_value_prom"] = shap_values_prom.sum(axis=1)
    
    # SHAP values y explicadores para el modelo detractor
    shap_Explainer_detractor = shap.TreeExplainer(model_detractor)
    shap_values_detractor = shap_Explainer_detractor.shap_values(detractor_test_set)
    shap_values_det = pd.DataFrame(shap_values_detractor, index=detractor_test_set.index, columns=[f"{i}_det" for i in feature_names])
    shap_values_det["base_value_det"] = shap_Explainer_detractor.expected_value
    shap_values_det["out_value_det"] = shap_values_det.sum(axis=1)
    
    # Combinar SHAP values con predicciones, manteniendo el índice original
    output_df = pd.concat([prediction, shap_values_prom, shap_values_det], axis=1)
    
    # Devolver el dataframe de salida
    return output_df


def from_shap_to_probability_binary(df, features_dummy, label_binary):
    output_df = df.copy()
    
    # Determinar el sufijo basado en el tipo de modelo (promoter o detractor)
    class_suffix = '_prom' if label_binary == 'promoter_binary' else '_det'
    
    # Identificar columnas de SHAP para la clase de interés, asumiendo que ya tienen el sufijo correcto
    shap_columns = [col for col in df.columns if col.endswith(class_suffix)]
    base_value_col = f'base_value{class_suffix}'
    
    # Convertir el valor base a probabilidades y actualizar el nombre de la columna
    output_df[f'base_prob{class_suffix}'] = inv_logit(output_df[base_value_col])
    
    # Convertir valores SHAP a probabilidades sin cambiar los nombres de las columnas
    for col in shap_columns:
        output_df[col] = inv_logit(output_df[col])
    
    # Asegurarse de incluir solo las columnas relevantes en el DataFrame final
    relevant_columns = ['respondent_id', 'date_flight_local'] + shap_columns + [f'base_prob{class_suffix}'] + features_dummy
    output_df = output_df[relevant_columns]
    return output_df

def adjust_shap_values_binary(shap_values, base_prob, out_prob):
    """Ajustar los valores SHAP para un modelo binario basado en la distancia."""
    # Calcular la distancia total deseada entre la probabilidad base y la de salida
    total_distance = out_prob - base_prob
    # Calcular la suma total de los valores SHAP
    total_shap = np.sum(shap_values)
    # Calcular el factor de ajuste si la suma total de SHAP no es cero
    adjustment_factor = total_distance / total_shap if total_shap != 0 else 0
    # Ajustar los valores SHAP
    return shap_values * adjustment_factor

def from_shap_to_probability_binary(df, features_dummy, label_binary):
    output_df = df.copy()
    
    # Determinar el sufijo basado en el tipo de modelo (promoter o detractor)
    class_suffix = '_prom' if label_binary == 'promoter_binary' else '_det'
    
    # Identificar columnas de SHAP para la clase de interés, asumiendo que ya tienen el sufijo correcto
    shap_columns = [f'{feature}{class_suffix}' for feature in features_dummy if f'{feature}{class_suffix}' in df.columns]
    base_value_col = f'base_value{class_suffix}'
    out_prob_col = f'out_prob{class_suffix}'

    # Calcular la probabilidad base usando softmax o inv_logit según sea apropiado
    output_df[f'base_prob{class_suffix}'] = inv_logit(output_df[base_value_col])

    for index, row in output_df.iterrows():
        # Extraer los valores SHAP para ajustar
        shap_values = row[shap_columns].values
        # Calcular los valores SHAP ajustados
        adjusted_shap_values = adjust_shap_values_binary(shap_values, row[f'base_prob{class_suffix}'], row[out_prob_col])
        # Actualizar el DataFrame con los valores SHAP ajustados
        output_df.loc[index, shap_columns] = adjusted_shap_values

    # Incluir solo las columnas relevantes en el DataFrame final
    relevant_columns = ['respondent_id', 'date_flight_local'] + shap_columns + [f'base_prob{class_suffix}', out_prob_col] + features_dummy
    output_df = output_df[relevant_columns]
    
    return output_df

def predict_and_explain(model_prom, model_det, df, features_dummy):
    """
    Realiza predicciones y genera explicaciones para modelos de promotores y detractores
    para todo el dataframe.

    Args:
    - model_prom: Modelo entrenado para predecir promotores.
    - model_det: Modelo entrenado para predecir detractores.
    - df: DataFrame con los datos.
    - features_dummy: Lista de características utilizadas para las predicciones.

    Returns:
    - Df final con .data, .values, .base_value, y predicciones.
    """
    # 1. Asumiendo que las funciones de cálculo de SHAP y probabilidad ya están implementadas y ajustadas para usar df
    df_contrib = calculate_SHAP_and_probability_binary(model_prom, model_det, df)

    # 3. Convertir valores SHAP a probabilidad
    df_probability_prom = from_shap_to_probability_binary(df_contrib, features_dummy, 'promoter_binary')
    df_probability_det = from_shap_to_probability_binary(df_contrib, features_dummy, 'detractor_binary')

    # 4. Concatenar DataFrames para ambos modelos
    df_probability_prom = df_probability_prom.reset_index(drop=True)
    df_probability_det = df_probability_det.reset_index(drop=True)
    unique_columns_det = [col for col in df_probability_det.columns if col not in df_probability_prom.columns]
    df_probability_binary = pd.concat([df_probability_prom, df_probability_det[unique_columns_det]], axis=1)

    # 5. Calcular columnas NPS con la diferencia entre _prom y _det
    for column in df_probability_binary.columns:
        if '_prom' in column:
            base_name = column.split('_prom')[0]
            det_column = f'{base_name}_det'
            if det_column in df_probability_binary.columns:
                nps_column = f'{base_name}_nps'
                df_probability_binary[nps_column] = df_probability_binary[column] - df_probability_binary[det_column]

    return df_probability_binary


In [7]:
def inv_logit(x):
    return 1 / (1 + np.exp(-x))

def calculate_SHAP_and_probability_binary(model_promoter, model_detractor, test_set):
    # Predicciones para el modelo de promotores
    promoter_test_set = test_set.drop(['promoter_binary'], axis=1, errors='ignore')
    predictions_promoter = pd.DataFrame(model_promoter.predict(promoter_test_set), columns=["prediction_prom"])
    proba_promoter = pd.DataFrame(model_promoter.predict_proba(promoter_test_set))[[1]].rename(columns={1: "out_prob_prom"})
    
    # Predicciones para el modelo de detractores
    detractor_test_set = test_set.drop(['detractor_binary'], axis=1, errors='ignore')
    predictions_detractor = pd.DataFrame(model_detractor.predict(detractor_test_set), columns=["prediction_det"])
    proba_detractor = pd.DataFrame(model_detractor.predict_proba(detractor_test_set))[[1]].rename(columns={1: "out_prob_det"})
    
    # Combinar resultados de predicción
    prediction = pd.concat([predictions_promoter, proba_promoter, predictions_detractor, proba_detractor, test_set.reset_index(drop=True)], axis=1)
    
    # SHAP values y explicadores para el modelo promotor
    shap_Explainer_promoter = shap.TreeExplainer(model_promoter)
    shap_values_promoter = shap_Explainer_promoter.shap_values(promoter_test_set)
    feature_names = [i for i in promoter_test_set.columns]
    shap_values_prom = pd.DataFrame(shap_values_promoter, columns=[f"{i}_prom" for i in feature_names])
    shap_values_prom["base_value_prom"] = shap_Explainer_promoter.expected_value
    shap_values_prom["out_value_prom"] = shap_values_prom.sum(axis=1)
    
    # SHAP values y explicadores para el modelo detractor
    shap_Explainer_detractor = shap.TreeExplainer(model_detractor)
    shap_values_detractor = shap_Explainer_detractor.shap_values(detractor_test_set)
    shap_values_det = pd.DataFrame(shap_values_detractor, columns=[f"{i}_det" for i in feature_names])
    shap_values_det["base_value_det"] = shap_Explainer_detractor.expected_value
    shap_values_det["out_value_det"] = shap_values_det.sum(axis=1)
    
    # Combinar SHAP values con predicciones
    output_df = pd.concat([prediction, shap_values_prom, shap_values_det], axis=1)
    
    # Devolver el dataframe de salida y los explicadores SHAP
    return output_df, shap_Explainer_promoter, shap_Explainer_detractor

def from_shap_to_probability_binary(df, features_dummy, label_binary):
    output_df = df.copy()
    
    # Determinar el sufijo basado en el tipo de modelo (promoter o detractor)
    class_suffix = '_prom' if label_binary == 'promoter_binary' else '_det'
    
    # Identificar columnas de SHAP para la clase de interés, asumiendo que ya tienen el sufijo correcto
    shap_columns = [col for col in df.columns if col.endswith(class_suffix)]
    base_value_col = f'base_value{class_suffix}'
    
    # Convertir el valor base a probabilidades y actualizar el nombre de la columna
    output_df[f'base_prob{class_suffix}'] = inv_logit(output_df[base_value_col])
    
    # Convertir valores SHAP a probabilidades sin cambiar los nombres de las columnas
    for col in shap_columns:
        output_df[col] = inv_logit(output_df[col])
    
    # Asegurarse de incluir solo las columnas relevantes en el DataFrame final
    relevant_columns = shap_columns + [f'base_prob{class_suffix}'] + features_dummy
    output_df = output_df[relevant_columns]
    
    return output_df

def adjust_shap_values_binary(shap_values, base_prob, out_prob):
    """Ajustar los valores SHAP para un modelo binario basado en la distancia."""
    # Calcular la distancia total deseada entre la probabilidad base y la de salida
    total_distance = out_prob - base_prob
    # Calcular la suma total de los valores SHAP
    total_shap = np.sum(shap_values)
    # Calcular el factor de ajuste si la suma total de SHAP no es cero
    adjustment_factor = total_distance / total_shap if total_shap != 0 else 0
    # Ajustar los valores SHAP
    return shap_values * adjustment_factor

def from_shap_to_probability_binary(df, features_dummy, label_binary):
    output_df = df.copy()
    
    # Determinar el sufijo basado en el tipo de modelo (promoter o detractor)
    class_suffix = '_prom' if label_binary == 'promoter_binary' else '_det'
    
    # Identificar columnas de SHAP para la clase de interés, asumiendo que ya tienen el sufijo correcto
    shap_columns = [f'{feature}{class_suffix}' for feature in features_dummy if f'{feature}{class_suffix}' in df.columns]
    base_value_col = f'base_value{class_suffix}'
    out_prob_col = f'out_prob{class_suffix}'

    # Calcular la probabilidad base usando softmax o inv_logit según sea apropiado
    output_df[f'base_prob{class_suffix}'] = inv_logit(output_df[base_value_col])

    for index, row in output_df.iterrows():
        # Extraer los valores SHAP para ajustar
        shap_values = row[shap_columns].values
        # Calcular los valores SHAP ajustados
        adjusted_shap_values = adjust_shap_values_binary(shap_values, row[f'base_prob{class_suffix}'], row[out_prob_col])
        # Actualizar el DataFrame con los valores SHAP ajustados
        output_df.loc[index, shap_columns] = adjusted_shap_values

    # Incluir solo las columnas relevantes en el DataFrame final
    relevant_columns = shap_columns + [f'base_prob{class_suffix}', out_prob_col] + features_dummy
    output_df = output_df[relevant_columns]
    
    return output_df

def predict_and_explain(model_prom, model_det, df, features_dummy, start_date, end_date):
    """
    Realiza predicciones y genera explicaciones para modelos de promotores y detractores
    dentro de un rango de fechas específico.

    Args:
    - model_prom: Modelo entrenado para predecir promotores.
    - model_det: Modelo entrenado para predecir detractores.
    - df: DataFrame con los datos.
    - features_dummy: Lista de características utilizadas para las predicciones.
    - start_date: Fecha de inicio para los datos de prueba (formato 'YYYY-MM-DD').
    - end_date: Fecha de fin para los datos de prueba (formato 'YYYY-MM-DD').

    Returns:
    - Nada, pero guarda los objetos de explicación SHAP con nombres que reflejan el mes y año.
    """
    # 1. Filtrar el DataFrame por el rango de fechas
    df_filtered = df[(df['date_flight_local'] >= start_date) & (df['date_flight_local'] < end_date)]

    # 2. Asumiendo que las funciones de cálculo de SHAP y probabilidad ya están implementadas y ajustadas para usar df_filtered
    df_contrib, explainer_prom, explainer_det = calculate_SHAP_and_probability_binary(model_prom, model_det, df_filtered[features_dummy])

    # 3. Convertir valores SHAP a probabilidad
    df_probability_prom = from_shap_to_probability_binary(df_contrib, features_dummy, 'promoter_binary')
    df_probability_det = from_shap_to_probability_binary(df_contrib, features_dummy, 'detractor_binary')

    # 4. Concatenar DataFrames para ambos modelos
    df_probability_prom = df_probability_prom.reset_index(drop=True)
    df_probability_det = df_probability_det.reset_index(drop=True)
    unique_columns_det = [col for col in df_probability_det.columns if col not in df_probability_prom.columns]
    df_probability_binary = pd.concat([df_probability_prom, df_probability_det[unique_columns_det]], axis=1)

    # 5. Calcular columnas NPS con la diferencia entre _prom y _det
    for column in df_probability_binary.columns:
        if '_prom' in column:
            base_name = column.split('_prom')[0]
            det_column = f'{base_name}_det'
            if det_column in df_probability_binary.columns:
                nps_column = f'{base_name}_nps'
                df_probability_binary[nps_column] = df_probability_binary[column] - df_probability_binary[det_column]

    # 6. Agregar variables y valores SHAP para crear una explicación general
    # num_vars = ['delay_departure','ticket_price']
    num_vars = ['ticket_price', 'load_factor']
    bin_vars = ['otp15_takeoff']
    # bin_vars = ['otp15_takeoff'] + [col for col in df_nps_tkt.columns if 'country_agg' in col]
    # cat_vars=['segment']
    # bin_vars=[]
    cat_vars=[]
    touchpoints = [feat for feat in features_dummy if feat not in bin_vars + num_vars]
    values_nps_sum = [df_probability_binary[f'{feat}_nps'].mean()*100 for feat in features_dummy]
    num_var_scores = [df_probability_binary[num_var].mean() for num_var in num_vars]
    bin_vars_scores = []
    for var in bin_vars:
        if var == 'otp15_takeoff':
            # Para 'otp15_takeoff', calcula el porcentaje de 0s.
            score = (df_probability_binary[df_probability_binary[var] == 0][var].count() / 
                     df_probability_binary[var].count()) * 100
        else:
            # Para cualquier otra variable, calcula el porcentaje de 1s.
            score = (df_probability_binary[df_probability_binary[var] == 1][var].count() / 
                     df_probability_binary[var].count()) * 100
        bin_vars_scores.append(score)
    # Continuación después de calcular bin_vars_scores
    cat_vars_scores = []
    for cat_var in cat_vars:
        cat_vars_scores.append(0)

    satisfaction_scores = [df_probability_binary[df_probability_binary[tp] >= 8][tp].count() / df_probability_binary[tp].count() * 100 for tp in touchpoints]
    shap_data = np.array(num_var_scores + bin_vars_scores + satisfaction_scores)
    base_value_nps_sum = df_probability_binary['base_prob_prom'].mean() * 100 - df_probability_binary['base_prob_det'].mean() * 100
    shap_values = np.array(values_nps_sum)  # Convertimos la lista en un array 2D
    features_names = np.array(features_dummy)
    explainer = shap.Explanation(values=shap_values, 
                                 base_values=base_value_nps_sum, 
                                 data=shap_data, 
                                 feature_names=features_names)
        
    return explainer, df_probability_binary

# Predict

In [8]:
features = ['ticket_price', 'load_factor', 'otp15_takeoff', 'bkg_200_journey_preparation', 'pfl_100_checkin', 
                  'pfl_200_security', 'pfl_300_lounge', 'pfl_500_boarding', 'ifl_300_cabin', 
                  'ifl_200_flight_crew_annoucements', 'ifl_600_wifi', 'ifl_500_ife', 'ifl_400_food_drink', 
                  'ifl_100_cabin_crew', 'arr_100_arrivals', 'con_100_connections', 
                  'loy_200_loyalty_programme', 'img_310_ease_contact_phone']

In [9]:
model_names=['PROM','DET']
clf_model={}
for name in model_names:
    path_model=f'pipeline_output/CatBoostClassifier_cv_{name}.pkl'
    # Cargar el modelo desde el archivo .pkl
    with open(path_model, 'rb') as file:
        clf_model[name] = pickle.load(file)

In [10]:
# Load the data to predict
df_predict = pd.read_csv(f"pipeline_output/data_for_historic_prediction.csv")
    
# Asegurarse de que 'date_flight_local' esté en formato datetime
df_predict['date_flight_local'] = pd.to_datetime(df_predict['date_flight_local'])
df_predict = df_predict[df_predict['date_flight_local'].dt.year == 2023]
    
df_predict = df_predict[df_predict['date_flight_local'].dt.month == 1]

def filter_data_by_quarter(df, quarter):
    # Definir los rangos de fechas para cada trimestre
    quarters = {
        "q1": (1, 3),
        "q2": (4, 6),
        "q3": (7, 9),
        "q4": (10, 12)
    }

    # Obtener el rango de meses para el trimestre especificado
    start_month, end_month = quarters[quarter]

    # Filtrar el DataFrame por el rango de fechas del trimestre
    df_filtered = df[df['date_flight_local'].dt.month.between(start_month, end_month)]

    return df_filtered

quarters = ['q1']

for quarter in quarters:
    df_predict = filter_data_by_quarter(df_predict, quarter)
    # Perform prediction and add the probabilities to the dataframe
    test_set = df_predict.drop(['respondent_id'], axis=1, errors='ignore')
    # df_probabilities = predict_and_explain(clf_model[model_names[0]], clf_model[model_names[1]], df_predict, features)
    explainer, df_probabilities = predict_and_explain(clf_model[model_names[0]], clf_model[model_names[1]], test_set, features, '2023-01-01','2023-01-31')


In [11]:
df_probabilities['out_prob_nps'].mean()

0.30753317961568943

In [12]:
df_probabilities['out_prob_nps'].mean()

0.30753317961568943

In [13]:

# # Rename columns, add insert date and select columns to save
# df_probabilities['insert_date_ci'] = STR_EXECUTION_DATE
# df_probabilities['model_version']=f'{model_year}-{model_month}-{model_day}'
# df_probabilities = df_probabilities[config['PREDICT']['COLUMNS_SAVE']]

# Save the prediction results to S3
df_probabilities.to_csv(save_path, index=False)

NameError: name 'save_path' is not defined

# CHECK historic prediction

In [None]:
df_hist=pd.read_csv('pipeline_output/historic_predictions (8).csv')

In [None]:
df_hist['date_flight_local']=pd.to_datetime(df_hist['date_flight_local'])

In [None]:
def aggregate_shaps(df,features_dummy, start_date, end_date):

    # 1. Filtrar el DataFrame por el rango de fechas
    df_probability_binary = df[(df['date_flight_local'] >= start_date) & (df['date_flight_local'] < end_date)]
    
    # df_probability_binary.drop(columns=['respondent_id','date_flight_local','model_version','insert_date_ci'])

    # 2. Asumiendo que las funciones de cálculo de SHAP y probabilidad ya están implementadas y ajustadas para usar df_filtered
    # 6. Agregar variables y valores SHAP para crear una explicación general
    # num_vars = ['delay_departure','ticket_price']
    num_vars = ['ticket_price', 'load_factor']
    bin_vars = ['otp15_takeoff']
    # bin_vars = ['otp15_takeoff'] + [col for col in df_nps_tkt.columns if 'country_agg' in col]
    # cat_vars=['segment']
    # bin_vars=[]
    cat_vars=[]
    touchpoints = [feat for feat in features_dummy if feat not in bin_vars + num_vars]
    
    # values_nps_sum = [pd.to_numeric(df_probability_binary[f'{feat}_nps'], errors='coerce').mean()*100 for feat in features_dummy]
    values_nps_sum = [df_probability_binary[f'{feat}_nps'].mean()*100 for feat in features_dummy]
    num_var_scores = [df_probability_binary[num_var].mean() for num_var in num_vars]
    bin_vars_scores = []
    for var in bin_vars:
        if var == 'otp15_takeoff':
            # Para 'otp15_takeoff', calcula el porcentaje de 0s.
            score = (df_probability_binary[df_probability_binary[var] == 0][var].count() / 
                     df_probability_binary[var].count()) * 100
        else:
            # Para cualquier otra variable, calcula el porcentaje de 1s.
            score = (df_probability_binary[df_probability_binary[var] == 1][var].count() / 
                     df_probability_binary[var].count()) * 100
        bin_vars_scores.append(score)
    # Continuación después de calcular bin_vars_scores
    cat_vars_scores = []
    for cat_var in cat_vars:
        cat_vars_scores.append(0)

    satisfaction_scores = [df_probability_binary[df_probability_binary[tp] >= 8][tp].count() / df_probability_binary[tp].count() * 100 for tp in touchpoints]
    shap_data = np.array(num_var_scores + bin_vars_scores + satisfaction_scores)
    base_value_nps_sum = df_probability_binary['base_prob_nps'].mean() * 100
    shap_values = np.array(values_nps_sum)  # Convertimos la lista en un array 2D
    features_names = np.array(features_dummy)
    explainer = shap.Explanation(values=shap_values, 
                                 base_values=base_value_nps_sum, 
                                 data=shap_data, 
                                 feature_names=features_names)
    pred_nps = df_probability_binary['out_prob_nps'].mean()
        
    return explainer, df_probability_binary, pred_nps

# model_prom=train_results['models']['promoter_binary']
# model_det=train_results['models']['detractor_binary']

# Lista de años de interés
years_of_interest = [2019, 2022, 2023, 2024]
explanations=[]
features_dummy = ['ticket_price', 'load_factor', 'otp15_takeoff', 'bkg_200_journey_preparation', 'pfl_100_checkin', 
                  'pfl_200_security', 'pfl_300_lounge', 'pfl_500_boarding', 'ifl_300_cabin', 
                  'ifl_200_flight_crew_annoucements', 'ifl_600_wifi', 'ifl_500_ife', 'ifl_400_food_drink', 
                  'ifl_100_cabin_crew', 'arr_100_arrivals', 'con_100_connections', 
                  'loy_200_loyalty_programme', 'img_310_ease_contact_phone']
for year in years_of_interest:
    for month in range(1, 13):
        if year == 2024 and month > 4:
            break
        


        start_date = f"{year}-{month:02d}-01"
        end_date = (pd.to_datetime(start_date) + MonthEnd(1)).strftime('%Y-%m-%d')

#         # Filtrar el DataFrame por las fechas de inicio y fin
#         df_filtered = df_nps_tkt[(df_nps_tkt['date_flight_local'] >= start_date) & (df_nps_tkt['date_flight_local'] < end_date)]

#         # Verificar si df_filtered está vacío
#         if df_filtered.empty:
#             print(f"No hay datos para el rango de fechas desde {start_date} hasta {end_date}.")
#             continue

        explanation, df_probability_binary, pred_nps = aggregate_shaps(df_hist,features_dummy, start_date, end_date)
    
       
    
        # No romper el bucle; solo continuar si 'explanation' es None
        if explanation is not None:
            print(explanation)
            explanations.append(explanation)
            # Guardar cada explicación con un nombre de archivo que refleje el mes y año
            file_name = f'pipeline_output/raw_explanations/explanation_{month}_{year}.pkl'
            with open(file_name, 'wb') as file:
                pickle.dump(explanation, file)
            print(file_name)
            print(pred_nps)
            shap.plots.waterfall(explanation, max_display=20)

In [None]:
def create_uplifting_explanation(explanation2, explanation1):
    """
    Create a new Explanation object representing the uplifting between two Explanation objects.

    Parameters:
        - explanation1: The first shap.Explanation object.
        - explanation2: The second shap.Explanation object.

    Returns:
        - A new shap.Explanation object representing the uplifting.
    """
    # Calculate the difference in values, base_values, and data
    diff_values = explanation2.values - explanation1.values
    
    diff_base_values = explanation1.base_values + sum(explanation1.values)
    print(sum(explanation1.values))
    diff_data = explanation2.data - explanation1.data

    # Create a new Explanation object with the difference values
    diff_explanation = shap.Explanation(values=diff_values, base_values=diff_base_values, data=diff_data,
                                        feature_names=explanation1.feature_names)

    return diff_explanation

In [None]:
def load_explanation(year, month):
    """Cargar un objeto de explicación desde un archivo."""
    file_name = f'pipeline_output/raw_explanations/explanation_{month}_{year}.pkl'
    try:
        with open(file_name, 'rb') as file:
            explanation = pickle.load(file)
        return explanation
    except FileNotFoundError:
        print(f"No explanation file found for {month}/{year}.")
        return None

# Ejemplo de uso: Comparar febrero de 2023 con febrero de 2022
explanation_2024_01 = load_explanation(2023, 2)
explanation_2024_02 = load_explanation(2024, 1)

if explanation_2024_01 is not None and explanation_2024_02 is not None:
    print('2024: Januery vs Februery')
    uplifting_explanation = create_uplifting_explanation(explanation_2024_02, explanation_2024_01)
    # Procesar o visualizar el uplifting_explanation según sea necesario
    shap.plots.waterfall(uplifting_explanation, max_display=30)


In [None]:
def compare_monthly_explanations(start_year, start_month, end_year, end_month):
    """
    Compara automáticamente cada mes con el mismo mes del año anterior o con 2019 si el año es 2022,
    desde una fecha de inicio dada, y devuelve un diccionario con los objetos de explicación.

    Args:
    - start_year: Año de inicio para las comparaciones.
    - start_month: Mes de inicio para las comparaciones.
    - end_year: Año final para las comparaciones.
    - end_month: Mes final para las comparaciones.

    Returns:
    - Un diccionario con las comparaciones de objetos de explicación. Las claves son las fechas de comparación.
    """
    uplifting_explanations_dict = {}

    for year in range(start_year, end_year + 1):
        for month in range(1, 13):
            if year == start_year and month < start_month:
                continue
            if year == end_year and month > end_month:
                break

            # Ajuste para el año 2022: comparar con 2019
            if year == 2022:
                previous_year_explanation = load_explanation(2019, month)
            else:
                previous_year_explanation = load_explanation(year - 1, month)

            current_explanation = load_explanation(year, month)

            if current_explanation is not None and previous_year_explanation is not None:
                uplifting_explanation = create_uplifting_explanation(previous_year_explanation,current_explanation)
                # Ajuste en la clave para reflejar la comparación especial del año 2022 con 2019
                if year == 2022:
                    date_key = f"2019-{month:02d} to {year}-{month:02d}"
                else:
                    date_key = f"{year-1}-{month:02d} to {year}-{month:02d}"
                uplifting_explanations_dict[date_key] = uplifting_explanation
            print(date_key)  
            shap.plots.waterfall(uplifting_explanations_dict[date_key], max_display=20)

    return uplifting_explanations_dict

# Ejemplo de uso
uplifting_explanations_dict = compare_monthly_explanations(2023, 3, 2024, 3)

# Check incremental

In [37]:
s3_resource = boto3.resource("s3")
S3_BUCKET_NPS = 'iberia-data-lake'
insert_date_ci='2024-04-12'
today_nps_surveys_prefix = f'customer/nps_surveys/export_historic/insert_date_ci={insert_date_ci}/'
dir_dict = 's3://iberia-data-lake/customer/nps_surveys/nps_dictionaries'

lf_dir = 's3://ibdata-prod-ew1-s3-customer/customer/load_factor_to_s3_nps_model/'

In [38]:
    from datetime import datetime, timedelta
    # Convert to datetime object
    execution_date = datetime.strptime(insert_date_ci, "%Y-%m-%d")

    # Calculate yesterday's date
    yesterday_date = execution_date - timedelta(days=1)
    # Format dates as strings for S3 prefixes
    today_date_str = execution_date.strftime("%Y-%m-%d")
    yesterday_date_str = yesterday_date.strftime("%Y-%m-%d")
    
    yesterday_nps_surveys_prefix= f'customer/nps_surveys/export_historic/insert_date_ci={yesterday_date_str}/'

In [39]:
    # READ TODAY DATA (HISTORIC NPS)
    s3_keys = [item.key for item in s3_resource.Bucket(S3_BUCKET_NPS).objects.filter(Prefix=today_nps_surveys_prefix)]
    preprocess_paths = [f"s3://{S3_BUCKET_NPS}/{key}" for key in s3_keys]

    df_nps_historic = pd.DataFrame()
    for file in preprocess_paths:
        df = pd.read_csv(file)
        df_nps_historic = pd.concat([df_nps_historic, df], axis=0)
    df_nps_historic = df_nps_historic.reset_index(drop=True)

    # READ PREVIOUS NPS DATA (FOR INCREMENTAL)
    yesterday_s3_keys = [item.key for item in s3_resource.Bucket(S3_BUCKET_NPS).objects.filter(Prefix=yesterday_nps_surveys_prefix)]
    yesterday_preprocess_paths = [f"s3://{S3_BUCKET_NPS}/{key}" for key in yesterday_s3_keys]

    df_nps_yesterday = pd.DataFrame()
    for file in yesterday_preprocess_paths:
        df = pd.read_csv(file)
        df_nps_yesterday = pd.concat([df_nps_yesterday, df], axis=0)
    df_nps_yesterday = df_nps_yesterday.reset_index(drop=True)

    # INCREMENTAL NPS  
    df_nps_incremental = pd.merge(df_nps_historic, df_nps_yesterday, how='left', indicator=True, on=df_nps_historic.columns.tolist())
    df_nps_incremental = df_nps_incremental[df_nps_incremental['_merge'] == 'left_only']
    df_nps_incremental = df_nps_incremental.drop(columns=['_merge'])
    df_nps_incremental = df_nps_incremental.reset_index(drop=True)

In [89]:
# READ LF DATA SOURCE
    # lf_dir = 's3://ibdata-prod-ew1-s3-customer/customer/load_factor_to_s3_nps_model/'    
load_factor_prefix = 's3://ibdata-prod-ew1-s3-customer/customer/load_factor_to_s3_nps_model/' 

    # Assume rol for prod
sts_client = boto3.client('sts')
assumed_role = sts_client.assume_role(
    RoleArn="arn:aws:iam::320714865578:role/ibdata-prod-role-assume-customer-services-from-ibdata-aip-prod",
    RoleSessionName="test"
)
credentials = assumed_role['Credentials']
fs = s3fs.S3FileSystem(key=credentials['AccessKeyId'], secret=credentials['SecretAccessKey'], token=credentials['SessionToken'])

    # Listall the files
load_factor_list = fs.ls(load_factor_prefix)
print(load_factor_list)
    
dataframes = []
for file_path in load_factor_list:
    try:
        file_info = fs.info(file_path)
        if file_info['Size'] == 0:
            continue

        with fs.open(f's3://{file_path}') as f:
            if today_date_str in file_path:
                df_lf_incremental = pd.read_csv(f)
            df = pd.read_csv(f)
            dataframes.append(df)
    except pd.errors.EmptyDataError:
        print(f"Caught EmptyDataError for file: {file_path}, skipping...")
    except Exception as e:
        print(f"Error reading file {file_path}: {e}")

if dataframes:
    df_lf_historic = pd.concat(dataframes, ignore_index=True)
else:
    df_lf_historic = pd.DataFrame()

['ibdata-prod-ew1-s3-customer/customer/load_factor_to_s3_nps_model/nps_operation_data_2024-03-02000', 'ibdata-prod-ew1-s3-customer/customer/load_factor_to_s3_nps_model/nps_operation_data_2024-03-03000', 'ibdata-prod-ew1-s3-customer/customer/load_factor_to_s3_nps_model/nps_operation_data_2024-03-04000', 'ibdata-prod-ew1-s3-customer/customer/load_factor_to_s3_nps_model/nps_operation_data_2024-03-05000', 'ibdata-prod-ew1-s3-customer/customer/load_factor_to_s3_nps_model/nps_operation_data_2024-03-06000', 'ibdata-prod-ew1-s3-customer/customer/load_factor_to_s3_nps_model/nps_operation_data_2024-03-07000', 'ibdata-prod-ew1-s3-customer/customer/load_factor_to_s3_nps_model/nps_operation_data_2024-03-08000', 'ibdata-prod-ew1-s3-customer/customer/load_factor_to_s3_nps_model/nps_operation_data_2024-03-09000', 'ibdata-prod-ew1-s3-customer/customer/load_factor_to_s3_nps_model/nps_operation_data_2024-03-10000', 'ibdata-prod-ew1-s3-customer/customer/load_factor_to_s3_nps_model/nps_operation_data_2024-

In [90]:
    # 1. Filter dataframes by carrier code.
    print("userlog: ETL 1.0 Filter dataframes by carrier code.")
    # NPS HISTORIC
    condition_1 = (df_nps_historic['operating_airline_code'].isin(['IB', 'YW']))
    condition_2 = ((df_nps_historic['invitegroup_ib'] != 3) | (df_nps_historic['invitegroup_ib'].isnull()))
    condition_3 = (df_nps_historic['invitegroup'] == 2)
    
    df_nps_historic = df_nps_historic.loc[condition_1 & (condition_2 & condition_3)]

    # NPS INCREMENTAL
    condition_1 = (df_nps_incremental['operating_airline_code'].isin(['IB', 'YW']))
    condition_2 = ((df_nps_incremental['invitegroup_ib'] != 3) | (df_nps_incremental['invitegroup_ib'].isnull()))
    condition_3 = (df_nps_incremental['invitegroup'] == 2)

    df_nps_incremental = df_nps_incremental.loc[condition_1 & (condition_2 & condition_3)]

    # LOAD FACTOR HISTORIC
    df_lf_historic = df_lf_historic.loc[(df_lf_historic['operating_carrier'].isin(['IB', 'YW']))]

    # LOAD FACTOR INCREMENTAL
    df_lf_incremental = df_lf_incremental.loc[(df_lf_incremental['operating_carrier'].isin(['IB', 'YW']))]


    # 2. Transform date column to datetime format
    print("userlog: ETL 2.0 Transform date column to datetime format.")
    delay_features = ['real_departure_time_local', 'scheduled_departure_time_local']
    for feat in delay_features:
        df_nps_historic[feat] = pd.to_datetime(df_nps_historic[feat], format="%Y%m%d %H:%M:%S", errors = 'coerce')
        df_nps_incremental[feat] = pd.to_datetime(df_nps_incremental[feat], format="%Y%m%d %H:%M:%S", errors = 'coerce')
            
    df_nps_historic['delay_departure'] = (df_nps_historic['real_departure_time_local'] - df_nps_historic['scheduled_departure_time_local']).dt.total_seconds()/60
    df_nps_incremental['delay_departure'] = (df_nps_incremental['real_departure_time_local'] - df_nps_incremental['scheduled_departure_time_local']).dt.total_seconds()/60
    
    # NPS
    df_nps_historic['date_flight_local'] = pd.to_datetime(df_nps_historic['date_flight_local'])
    df_nps_incremental['date_flight_local'] = pd.to_datetime(df_nps_incremental['date_flight_local'])

    # Load Factor
    df_lf_historic['flight_date_local'] = pd.to_datetime(df_lf_historic['flight_date_local'])
    df_lf_incremental['flight_date_local'] = pd.to_datetime(df_lf_incremental['flight_date_local'])

    # 3. Filter out covid years
    print("userlog: ETL 3.0 Filter out covid years.")
    # NPS (historic)
    df_nps_historic = df_nps_historic[df_nps_historic['date_flight_local'].dt.year >= 2019]
    df_nps_historic = df_nps_historic[~df_nps_historic['date_flight_local'].dt.year.isin([2020, 2021])]
    # Load factor (historic)
    df_lf_historic = df_lf_historic[df_lf_historic['flight_date_local'].dt.year >= 2019]
    df_lf_historic = df_lf_historic[~df_lf_historic['flight_date_local'].dt.year.isin([2020, 2021])]

    # 4. Create otp, promoter, detractor and load factor columns.
    print("userlog: ETL 4.0 Create otp, promoter, detractor and load factor columns.")
    # OTP
    df_nps_historic['otp15_takeoff'] = (df_nps_historic['delay_departure'] > 15).astype(int)
    df_nps_incremental['otp15_takeoff'] = (df_nps_incremental['delay_departure'] > 15).astype(int)

    # Promoter and Detractor columns
    df_nps_historic["promoter_binary"] = df_nps_historic["nps_category"].apply(lambda x: 1 if x == "Promoter" else 0)
    df_nps_historic["detractor_binary"] = df_nps_historic["nps_category"].apply(lambda x: 1 if x == "Detractor" else 0)
    df_nps_incremental["promoter_binary"] = df_nps_incremental["nps_category"].apply(lambda x: 1 if x == "Promoter" else 0)
    df_nps_incremental["detractor_binary"] = df_nps_incremental["nps_category"].apply(lambda x: 1 if x == "Detractor" else 0)

    # Load Factor
    df_lf_historic['load_factor_business'] = df_lf_historic['pax_business'] / df_lf_historic['capacity_business']
    df_lf_historic['load_factor_premium_ec'] = df_lf_historic['pax_premium_ec'] / df_lf_historic['capacity_premium_ec']
    df_lf_historic['load_factor_economy'] = df_lf_historic['pax_economy'] / df_lf_historic['capacity_economy']

    df_lf_incremental['load_factor_business'] = df_lf_incremental['pax_business'] / df_lf_incremental['capacity_business']
    df_lf_incremental['load_factor_premium_ec'] = df_lf_incremental['pax_premium_ec'] / df_lf_incremental['capacity_premium_ec']
    df_lf_incremental['load_factor_economy'] = df_lf_incremental['pax_economy'] / df_lf_incremental['capacity_economy']

    

userlog: ETL 1.0 Filter dataframes by carrier code.
userlog: ETL 2.0 Transform date column to datetime format.
userlog: ETL 3.0 Filter out covid years.
userlog: ETL 4.0 Create otp, promoter, detractor and load factor columns.


In [91]:
    # 5. Merge dataframes.
    print("userlog: ETL 5.0 Merge dataframes.")
    cabin_to_load_factor_column = {
        'Economy': 'load_factor_economy',
        'Business': 'load_factor_business',
        'Premium Economy': 'load_factor_premium_ec'
    }

    # HISTORIC
    df_lf_historic.columns = ['date_flight_local' if x=='flight_date_local' else 
                                    'operating_airline_code' if x=='operating_carrier' else
                                    'surveyed_flight_number' if x=='op_flight_num' else
                                    x for x in df_lf_historic.columns]
    
    print(df_lf_historic.dtypes)


    

userlog: ETL 5.0 Merge dataframes.
date_flight_local             datetime64[ns]
flight_date_utc                       object
operating_airline_code                object
surveyed_flight_number                 int64
boardpoint_stn_code_actual            object
offpoint_stn_code_actual              object
haul                                  object
calc_dep_diff                          int64
punctuality                           object
capacity_business                      int64
pax_business                           int64
capacity_premium_ec                    int64
pax_premium_ec                         int64
capacity_economy                       int64
pax_economy                            int64
load_factor_business                 float64
load_factor_premium_ec               float64
load_factor_economy                  float64
dtype: object


In [43]:
print(df_nps_historic.dtypes)

respondent_id                              int64
sample_id                                  int64
surveyed_flight_number                   float64
date_flight_local                 datetime64[ns]
scheduled_departure_time_local    datetime64[ns]
                                       ...      
codeshare                                 object
delay_departure                          float64
otp15_takeoff                              int64
promoter_binary                            int64
detractor_binary                           int64
Length: 580, dtype: object


In [65]:
print(df_long.dtypes)

date_flight_local             datetime64[ns]
flight_date_utc                       object
operating_airline_code                object
surveyed_flight_number               float64
boardpoint_stn_code_actual            object
offpoint_stn_code_actual              object
haul                                  object
calc_dep_diff                          int64
punctuality                           object
capacity_business                      int64
pax_business                           int64
capacity_premium_ec                    int64
pax_premium_ec                         int64
capacity_economy                       int64
pax_economy                            int64
cabin_in_surveyed_flight              object
load_factor                          float64
dtype: object


In [92]:
df=df_lf_historic.copy()

# List of columns to transform
load_factor_columns = ['load_factor_business', 'load_factor_premium_ec', 'load_factor_economy']

# Automatically determine id_vars by excluding load_factor_columns from all columns
id_vars = [col for col in df.columns if col not in load_factor_columns]

# Reshaping the DataFrame while dynamically keeping all other columns
df = pd.melt(df, id_vars=id_vars, 
                  value_vars=load_factor_columns,
                  var_name='cabin_in_surveyed_flight', value_name='load_factor')



In [95]:
# Replacing the column names in 'cabin_in_surveyed_flight' with the desired cabin types
df['cabin_in_surveyed_flight'] = df['cabin_in_surveyed_flight'].map({
    'load_factor_business': 'Business',
    'load_factor_premium_ec': 'Premium Economy',
    'load_factor_economy': 'Economy'
})

df['operating_airline_code'].unique()

Unnamed: 0,date_flight_local,flight_date_utc,operating_airline_code,surveyed_flight_number,boardpoint_stn_code_actual,offpoint_stn_code_actual,haul,calc_dep_diff,punctuality,capacity_business,pax_business,capacity_premium_ec,pax_premium_ec,capacity_economy,pax_economy,cabin_in_surveyed_flight,load_factor
0,2023-09-20,2023-09-20,IB,3237,FCO,MAD,SH,72,OTP15,12,12,0,0,199,192,Business,1.000
1,2023-10-15,2023-10-15,IB,6301,MAD,SJU,LH,3,,19,18,0,0,269,266,Business,0.947
2,2023-10-16,2023-10-16,IB,3148,MAD,PRG,SH,-2,,16,16,0,0,156,150,Business,1.000
3,2023-10-16,2023-10-16,IB,6461,MAD,GYE,LH,-1,,19,19,0,0,269,268,Business,1.000
4,2023-10-16,2023-10-16,IB,3149,PRG,MAD,SH,-12,,12,12,0,0,162,162,Business,1.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1548517,2024-04-13,2024-04-14,IB,6342,SAL,MAD,LH,-11,,19,19,0,0,269,238,Economy,0.885
1548518,2024-04-13,2024-04-14,IB,6118,MIA,MAD,LH,46,OTP15,29,29,21,21,242,239,Economy,0.988
1548519,2024-04-13,2024-04-14,IB,6832,SCL,MAD,LH,0,,31,31,24,24,293,292,Economy,0.997
1548520,2024-04-13,2024-04-14,IB,6588,BOG,MAD,LH,9,,31,31,24,24,293,283,Economy,0.966


In [96]:
df['operating_airline_code'].unique()

array(['IB', 'YW'], dtype=object)

In [78]:
df_nps_incremental[['date_flight_local', 'operating_airline_code', 'surveyed_flight_number', 'cabin_in_surveyed_flight', 'haul']]
df_long[['date_flight_local', 'operating_airline_code', 'surveyed_flight_number', 'cabin_in_surveyed_flight', 'haul']]


Unnamed: 0,date_flight_local,operating_airline_code,surveyed_flight_number,cabin_in_surveyed_flight,haul
0,2024-03-29,IB,3444.000,Economy,SH
1,2024-03-29,IB,577.000,Economy,SH
2,2024-04-04,IB,6253.000,Economy,LH
3,2024-04-03,IB,3398.000,Economy,SH
4,2024-04-04,IB,6132.000,Economy,LH
...,...,...,...,...,...
7530,2024-04-02,IB,6588.000,Economy,LH
7531,2024-03-29,YW,8966.000,Economy,SH
7532,2024-04-05,YW,8391.000,Economy,SH
7533,2024-04-07,IB,6149.000,Economy,LH


In [79]:
df_long[['date_flight_local', 'operating_airline_code', 'surveyed_flight_number', 'cabin_in_surveyed_flight', 'haul']]

Unnamed: 0,date_flight_local,operating_airline_code,surveyed_flight_number,cabin_in_surveyed_flight,haul
0,2023-09-20,IB,3237.000,Business,SH
1,2023-10-15,IB,6301.000,Business,LH
2,2023-10-16,IB,3148.000,Business,SH
3,2023-10-16,IB,6461.000,Business,LH
4,2023-10-16,IB,3149.000,Business,SH
...,...,...,...,...,...
1548517,2024-04-13,IB,6342.000,Economy,LH
1548518,2024-04-13,IB,6118.000,Economy,LH
1548519,2024-04-13,IB,6832.000,Economy,LH
1548520,2024-04-13,IB,6588.000,Economy,LH


In [82]:
intersection_df = pd.merge(
    df_nps_incremental,
    df,
    how='left',
    on=['date_flight_local', 'operating_airline_code', 'surveyed_flight_number', 'cabin_in_surveyed_flight', 'haul']
)


In [86]:
intersection_df['load_factor'].notna().sum()

1060

In [69]:
df_nps_incremental['haul'] = df_nps_incremental['haul'].replace('MH', 'SH')

In [None]:
df_lf_historic['date_flight_local']=pd.to_datetime(df_lf_historic['date_flight_local'])
df_lf_historic['surveyed_flight_number'] = df_lf_historic['surveyed_flight_number'].astype('float64')

In [62]:
df_long['date_flight_local']=pd.to_datetime(df_long['date_flight_local'])
df_long['surveyed_flight_number'] = df_long['surveyed_flight_number'].astype('float64')

In [87]:
    df_historic = pd.merge(df_nps_historic, df_long, 
                        how='left', 
                        on=['date_flight_local', 'operating_airline_code', 'surveyed_flight_number', 'cabin_in_surveyed_flight', 'haul'])

In [88]:
df_historic['load_factor'].notna().sum()

308777

In [None]:
    # 6. Filter out final columns for the model
    print("userlog: ETL 6.0 Filter out final columns for the model")
    features_dummy = ['ticket_price', 'load_factor', 'otp15_takeoff'] + ['bkg_200_journey_preparation', 'pfl_100_checkin', 
                  'pfl_200_security', 'pfl_300_lounge', 'pfl_500_boarding', 'ifl_300_cabin', 
                  'ifl_200_flight_crew_annoucements', 'ifl_600_wifi', 'ifl_500_ife', 'ifl_400_food_drink', 
                  'ifl_100_cabin_crew', 'arr_100_arrivals', 'con_100_connections', 
                  'loy_200_loyalty_programme', 'img_310_ease_contact_phone']

    labels = ['promoter_binary', 'detractor_binary']

    df_historic = df_historic[['respondent_id' , 'date_flight_local'] + features_dummy + labels]
    df_incremental = df_incremental[['respondent_id' , 'date_flight_local'] + features_dummy + labels]

    df_historic = df_historic.drop_duplicates()
    df_incremental = df_incremental.drop_duplicates()
    
    print("userlog: Size of resulting df_historic:", df_historic.shape)
    print("userlog: Size of resulting df_incremental:", df_incremental.shape)