In [2]:
!pip install plotly
!pip install boto3==1.19.12
!pip install s3fs
!pip install lightgbm
!pip install shap
!pip install catboost

[33mDEPRECATION: pyodbc 4.0.0-unsupported has a non-standard version number. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pyodbc or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[33mDEPRECATION: pyodbc 4.0.0-unsupported has a non-standard version number. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pyodbc or contact the author to suggest that they release a version with a conforming version number. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m
[1m[[0m[34;49mn

In [3]:
# General
import pandas as pd
from pandas.tseries.offsets import MonthEnd
from datetime import datetime, timedelta
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
import os
import numpy as np
import xlsxwriter
import datetime
import boto3
import s3fs
from itertools import combinations
import pickle

# Sklearn
from sklearn.model_selection import train_test_split, KFold
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, log_loss
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer


# Models
from catboost import CatBoostClassifier, cv, Pool
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor


# Plots
import matplotlib.pyplot as plt
import seaborn as sns

# SHAP
import shap

# Random
import random

#Warnings
import warnings
warnings.filterwarnings("ignore")

In [4]:
df=pd.read_csv('pipeline_output/incremental.csv')

In [5]:
df['load_factor']

0        NaN
1        NaN
2        NaN
3        NaN
4      0.810
        ... 
7543     NaN
7544     NaN
7545     NaN
7546     NaN
7547     NaN
Name: load_factor, Length: 7548, dtype: float64

# Utils

In [6]:
def inv_logit(x):
    return 1 / (1 + np.exp(-x))

def calculate_SHAP_and_probability_binary(model_promoter, model_detractor, df):
    # Extraer ID y fechas, manteniendo el índice
    id_df = df[['respondent_id', 'date_flight_local']]
    
    # Preparar el conjunto de datos para predicciones, excluyendo ID y fechas
    test_set = df.drop(['respondent_id', 'date_flight_local'], axis=1, errors='ignore')
    
    # Predicciones y probabilidades para promotores
    promoter_test_set = test_set.drop(['promoter_binary'], axis=1, errors='ignore')
    predictions_promoter = pd.DataFrame(model_promoter.predict(promoter_test_set), index=promoter_test_set.index, columns=["prediction_prom"])
    proba_promoter = pd.DataFrame(model_promoter.predict_proba(promoter_test_set)[:, 1], index=promoter_test_set.index, columns=["out_prob_prom"])
    
    # Predicciones y probabilidades para detractores
    detractor_test_set = test_set.drop(['detractor_binary'], axis=1, errors='ignore')
    predictions_detractor = pd.DataFrame(model_detractor.predict(detractor_test_set), index=detractor_test_set.index, columns=["prediction_det"])
    proba_detractor = pd.DataFrame(model_detractor.predict_proba(detractor_test_set)[:, 1], index=detractor_test_set.index, columns=["out_prob_det"])
    
    # Combinar resultados de predicción, manteniendo el índice original
    prediction = pd.concat([id_df, test_set, predictions_promoter, proba_promoter, predictions_detractor, proba_detractor], axis=1)
    
    # SHAP values y explicadores para el modelo promotor
    shap_Explainer_promoter = shap.TreeExplainer(model_promoter)
    shap_values_promoter = shap_Explainer_promoter.shap_values(promoter_test_set)
    feature_names = [i for i in promoter_test_set.columns]
    shap_values_prom = pd.DataFrame(shap_values_promoter, index=promoter_test_set.index, columns=[f"{i}_prom" for i in feature_names])
    shap_values_prom["base_value_prom"] = shap_Explainer_promoter.expected_value
    shap_values_prom["out_value_prom"] = shap_values_prom.sum(axis=1)
    
    # SHAP values y explicadores para el modelo detractor
    shap_Explainer_detractor = shap.TreeExplainer(model_detractor)
    shap_values_detractor = shap_Explainer_detractor.shap_values(detractor_test_set)
    shap_values_det = pd.DataFrame(shap_values_detractor, index=detractor_test_set.index, columns=[f"{i}_det" for i in feature_names])
    shap_values_det["base_value_det"] = shap_Explainer_detractor.expected_value
    shap_values_det["out_value_det"] = shap_values_det.sum(axis=1)
    
    # Combinar SHAP values con predicciones, manteniendo el índice original
    output_df = pd.concat([prediction, shap_values_prom, shap_values_det], axis=1)
    
    # Devolver el dataframe de salida
    return output_df


def from_shap_to_probability_binary(df, features_dummy, label_binary):
    output_df = df.copy()
    
    # Determinar el sufijo basado en el tipo de modelo (promoter o detractor)
    class_suffix = '_prom' if label_binary == 'promoter_binary' else '_det'
    
    # Identificar columnas de SHAP para la clase de interés, asumiendo que ya tienen el sufijo correcto
    shap_columns = [col for col in df.columns if col.endswith(class_suffix)]
    base_value_col = f'base_value{class_suffix}'
    
    # Convertir el valor base a probabilidades y actualizar el nombre de la columna
    output_df[f'base_prob{class_suffix}'] = inv_logit(output_df[base_value_col])
    
    # Convertir valores SHAP a probabilidades sin cambiar los nombres de las columnas
    for col in shap_columns:
        output_df[col] = inv_logit(output_df[col])
    
    # Asegurarse de incluir solo las columnas relevantes en el DataFrame final
    relevant_columns = ['respondent_id', 'date_flight_local'] + shap_columns + [f'base_prob{class_suffix}'] + features_dummy
    output_df = output_df[relevant_columns]
    return output_df

def adjust_shap_values_binary(shap_values, base_prob, out_prob):
    """Ajustar los valores SHAP para un modelo binario basado en la distancia."""
    # Calcular la distancia total deseada entre la probabilidad base y la de salida
    total_distance = out_prob - base_prob
    # Calcular la suma total de los valores SHAP
    total_shap = np.sum(shap_values)
    # Calcular el factor de ajuste si la suma total de SHAP no es cero
    adjustment_factor = total_distance / total_shap if total_shap != 0 else 0
    # Ajustar los valores SHAP
    return shap_values * adjustment_factor

def from_shap_to_probability_binary(df, features_dummy, label_binary):
    output_df = df.copy()
    
    # Determinar el sufijo basado en el tipo de modelo (promoter o detractor)
    class_suffix = '_prom' if label_binary == 'promoter_binary' else '_det'
    
    # Identificar columnas de SHAP para la clase de interés, asumiendo que ya tienen el sufijo correcto
    shap_columns = [f'{feature}{class_suffix}' for feature in features_dummy if f'{feature}{class_suffix}' in df.columns]
    base_value_col = f'base_value{class_suffix}'
    out_prob_col = f'out_prob{class_suffix}'

    # Calcular la probabilidad base usando softmax o inv_logit según sea apropiado
    output_df[f'base_prob{class_suffix}'] = inv_logit(output_df[base_value_col])

    for index, row in output_df.iterrows():
        # Extraer los valores SHAP para ajustar
        shap_values = row[shap_columns].values
        # Calcular los valores SHAP ajustados
        adjusted_shap_values = adjust_shap_values_binary(shap_values, row[f'base_prob{class_suffix}'], row[out_prob_col])
        # Actualizar el DataFrame con los valores SHAP ajustados
        output_df.loc[index, shap_columns] = adjusted_shap_values

    # Incluir solo las columnas relevantes en el DataFrame final
    relevant_columns = ['respondent_id', 'date_flight_local'] + shap_columns + [f'base_prob{class_suffix}', out_prob_col] + features_dummy
    output_df = output_df[relevant_columns]
    
    return output_df

def predict_and_explain(model_prom, model_det, df, features_dummy):
    """
    Realiza predicciones y genera explicaciones para modelos de promotores y detractores
    para todo el dataframe.

    Args:
    - model_prom: Modelo entrenado para predecir promotores.
    - model_det: Modelo entrenado para predecir detractores.
    - df: DataFrame con los datos.
    - features_dummy: Lista de características utilizadas para las predicciones.

    Returns:
    - Df final con .data, .values, .base_value, y predicciones.
    """
    # 1. Asumiendo que las funciones de cálculo de SHAP y probabilidad ya están implementadas y ajustadas para usar df
    df_contrib = calculate_SHAP_and_probability_binary(model_prom, model_det, df)

    # 3. Convertir valores SHAP a probabilidad
    df_probability_prom = from_shap_to_probability_binary(df_contrib, features_dummy, 'promoter_binary')
    df_probability_det = from_shap_to_probability_binary(df_contrib, features_dummy, 'detractor_binary')

    # 4. Concatenar DataFrames para ambos modelos
    df_probability_prom = df_probability_prom.reset_index(drop=True)
    df_probability_det = df_probability_det.reset_index(drop=True)
    unique_columns_det = [col for col in df_probability_det.columns if col not in df_probability_prom.columns]
    df_probability_binary = pd.concat([df_probability_prom, df_probability_det[unique_columns_det]], axis=1)

    # 5. Calcular columnas NPS con la diferencia entre _prom y _det
    for column in df_probability_binary.columns:
        if '_prom' in column:
            base_name = column.split('_prom')[0]
            det_column = f'{base_name}_det'
            if det_column in df_probability_binary.columns:
                nps_column = f'{base_name}_nps'
                df_probability_binary[nps_column] = df_probability_binary[column] - df_probability_binary[det_column]

    return df_probability_binary


In [7]:
def inv_logit(x):
    return 1 / (1 + np.exp(-x))

def calculate_SHAP_and_probability_binary(model_promoter, model_detractor, test_set):
    # Predicciones para el modelo de promotores
    promoter_test_set = test_set.drop(['promoter_binary'], axis=1, errors='ignore')
    predictions_promoter = pd.DataFrame(model_promoter.predict(promoter_test_set), columns=["prediction_prom"])
    proba_promoter = pd.DataFrame(model_promoter.predict_proba(promoter_test_set))[[1]].rename(columns={1: "out_prob_prom"})
    
    # Predicciones para el modelo de detractores
    detractor_test_set = test_set.drop(['detractor_binary'], axis=1, errors='ignore')
    predictions_detractor = pd.DataFrame(model_detractor.predict(detractor_test_set), columns=["prediction_det"])
    proba_detractor = pd.DataFrame(model_detractor.predict_proba(detractor_test_set))[[1]].rename(columns={1: "out_prob_det"})
    
    # Combinar resultados de predicción
    prediction = pd.concat([predictions_promoter, proba_promoter, predictions_detractor, proba_detractor, test_set.reset_index(drop=True)], axis=1)
    
    # SHAP values y explicadores para el modelo promotor
    shap_Explainer_promoter = shap.TreeExplainer(model_promoter)
    shap_values_promoter = shap_Explainer_promoter.shap_values(promoter_test_set)
    feature_names = [i for i in promoter_test_set.columns]
    shap_values_prom = pd.DataFrame(shap_values_promoter, columns=[f"{i}_prom" for i in feature_names])
    shap_values_prom["base_value_prom"] = shap_Explainer_promoter.expected_value
    shap_values_prom["out_value_prom"] = shap_values_prom.sum(axis=1)
    
    # SHAP values y explicadores para el modelo detractor
    shap_Explainer_detractor = shap.TreeExplainer(model_detractor)
    shap_values_detractor = shap_Explainer_detractor.shap_values(detractor_test_set)
    shap_values_det = pd.DataFrame(shap_values_detractor, columns=[f"{i}_det" for i in feature_names])
    shap_values_det["base_value_det"] = shap_Explainer_detractor.expected_value
    shap_values_det["out_value_det"] = shap_values_det.sum(axis=1)
    
    # Combinar SHAP values con predicciones
    output_df = pd.concat([prediction, shap_values_prom, shap_values_det], axis=1)
    
    # Devolver el dataframe de salida y los explicadores SHAP
    return output_df, shap_Explainer_promoter, shap_Explainer_detractor

def from_shap_to_probability_binary(df, features_dummy, label_binary):
    output_df = df.copy()
    
    # Determinar el sufijo basado en el tipo de modelo (promoter o detractor)
    class_suffix = '_prom' if label_binary == 'promoter_binary' else '_det'
    
    # Identificar columnas de SHAP para la clase de interés, asumiendo que ya tienen el sufijo correcto
    shap_columns = [col for col in df.columns if col.endswith(class_suffix)]
    base_value_col = f'base_value{class_suffix}'
    
    # Convertir el valor base a probabilidades y actualizar el nombre de la columna
    output_df[f'base_prob{class_suffix}'] = inv_logit(output_df[base_value_col])
    
    # Convertir valores SHAP a probabilidades sin cambiar los nombres de las columnas
    for col in shap_columns:
        output_df[col] = inv_logit(output_df[col])
    
    # Asegurarse de incluir solo las columnas relevantes en el DataFrame final
    relevant_columns = shap_columns + [f'base_prob{class_suffix}'] + features_dummy
    output_df = output_df[relevant_columns]
    
    return output_df

def adjust_shap_values_binary(shap_values, base_prob, out_prob):
    """Ajustar los valores SHAP para un modelo binario basado en la distancia."""
    # Calcular la distancia total deseada entre la probabilidad base y la de salida
    total_distance = out_prob - base_prob
    # Calcular la suma total de los valores SHAP
    total_shap = np.sum(shap_values)
    # Calcular el factor de ajuste si la suma total de SHAP no es cero
    adjustment_factor = total_distance / total_shap if total_shap != 0 else 0
    # Ajustar los valores SHAP
    return shap_values * adjustment_factor

def from_shap_to_probability_binary(df, features_dummy, label_binary):
    output_df = df.copy()
    
    # Determinar el sufijo basado en el tipo de modelo (promoter o detractor)
    class_suffix = '_prom' if label_binary == 'promoter_binary' else '_det'
    
    # Identificar columnas de SHAP para la clase de interés, asumiendo que ya tienen el sufijo correcto
    shap_columns = [f'{feature}{class_suffix}' for feature in features_dummy if f'{feature}{class_suffix}' in df.columns]
    base_value_col = f'base_value{class_suffix}'
    out_prob_col = f'out_prob{class_suffix}'

    # Calcular la probabilidad base usando softmax o inv_logit según sea apropiado
    output_df[f'base_prob{class_suffix}'] = inv_logit(output_df[base_value_col])

    for index, row in output_df.iterrows():
        # Extraer los valores SHAP para ajustar
        shap_values = row[shap_columns].values
        # Calcular los valores SHAP ajustados
        adjusted_shap_values = adjust_shap_values_binary(shap_values, row[f'base_prob{class_suffix}'], row[out_prob_col])
        # Actualizar el DataFrame con los valores SHAP ajustados
        output_df.loc[index, shap_columns] = adjusted_shap_values

    # Incluir solo las columnas relevantes en el DataFrame final
    relevant_columns = shap_columns + [f'base_prob{class_suffix}', out_prob_col] + features_dummy
    output_df = output_df[relevant_columns]
    
    return output_df

def predict_and_explain(model_prom, model_det, df, features_dummy, start_date, end_date):
    """
    Realiza predicciones y genera explicaciones para modelos de promotores y detractores
    dentro de un rango de fechas específico.

    Args:
    - model_prom: Modelo entrenado para predecir promotores.
    - model_det: Modelo entrenado para predecir detractores.
    - df: DataFrame con los datos.
    - features_dummy: Lista de características utilizadas para las predicciones.
    - start_date: Fecha de inicio para los datos de prueba (formato 'YYYY-MM-DD').
    - end_date: Fecha de fin para los datos de prueba (formato 'YYYY-MM-DD').

    Returns:
    - Nada, pero guarda los objetos de explicación SHAP con nombres que reflejan el mes y año.
    """
    # 1. Filtrar el DataFrame por el rango de fechas
    df_filtered = df[(df['date_flight_local'] >= start_date) & (df['date_flight_local'] < end_date)]

    # 2. Asumiendo que las funciones de cálculo de SHAP y probabilidad ya están implementadas y ajustadas para usar df_filtered
    df_contrib, explainer_prom, explainer_det = calculate_SHAP_and_probability_binary(model_prom, model_det, df_filtered[features_dummy])

    # 3. Convertir valores SHAP a probabilidad
    df_probability_prom = from_shap_to_probability_binary(df_contrib, features_dummy, 'promoter_binary')
    df_probability_det = from_shap_to_probability_binary(df_contrib, features_dummy, 'detractor_binary')

    # 4. Concatenar DataFrames para ambos modelos
    df_probability_prom = df_probability_prom.reset_index(drop=True)
    df_probability_det = df_probability_det.reset_index(drop=True)
    unique_columns_det = [col for col in df_probability_det.columns if col not in df_probability_prom.columns]
    df_probability_binary = pd.concat([df_probability_prom, df_probability_det[unique_columns_det]], axis=1)

    # 5. Calcular columnas NPS con la diferencia entre _prom y _det
    for column in df_probability_binary.columns:
        if '_prom' in column:
            base_name = column.split('_prom')[0]
            det_column = f'{base_name}_det'
            if det_column in df_probability_binary.columns:
                nps_column = f'{base_name}_nps'
                df_probability_binary[nps_column] = df_probability_binary[column] - df_probability_binary[det_column]

    # 6. Agregar variables y valores SHAP para crear una explicación general
    # num_vars = ['delay_departure','ticket_price']
    num_vars = ['ticket_price', 'load_factor']
    bin_vars = ['otp15_takeoff']
    # bin_vars = ['otp15_takeoff'] + [col for col in df_nps_tkt.columns if 'country_agg' in col]
    # cat_vars=['segment']
    # bin_vars=[]
    cat_vars=[]
    touchpoints = [feat for feat in features_dummy if feat not in bin_vars + num_vars]
    values_nps_sum = [df_probability_binary[f'{feat}_nps'].mean()*100 for feat in features_dummy]
    num_var_scores = [df_probability_binary[num_var].mean() for num_var in num_vars]
    bin_vars_scores = []
    for var in bin_vars:
        if var == 'otp15_takeoff':
            # Para 'otp15_takeoff', calcula el porcentaje de 0s.
            score = (df_probability_binary[df_probability_binary[var] == 0][var].count() / 
                     df_probability_binary[var].count()) * 100
        else:
            # Para cualquier otra variable, calcula el porcentaje de 1s.
            score = (df_probability_binary[df_probability_binary[var] == 1][var].count() / 
                     df_probability_binary[var].count()) * 100
        bin_vars_scores.append(score)
    # Continuación después de calcular bin_vars_scores
    cat_vars_scores = []
    for cat_var in cat_vars:
        cat_vars_scores.append(0)

    satisfaction_scores = [df_probability_binary[df_probability_binary[tp] >= 8][tp].count() / df_probability_binary[tp].count() * 100 for tp in touchpoints]
    shap_data = np.array(num_var_scores + bin_vars_scores + satisfaction_scores)
    base_value_nps_sum = df_probability_binary['base_prob_prom'].mean() * 100 - df_probability_binary['base_prob_det'].mean() * 100
    shap_values = np.array(values_nps_sum)  # Convertimos la lista en un array 2D
    features_names = np.array(features_dummy)
    explainer = shap.Explanation(values=shap_values, 
                                 base_values=base_value_nps_sum, 
                                 data=shap_data, 
                                 feature_names=features_names)
        
    return explainer, df_probability_binary

# Predict

In [8]:
features = ['ticket_price', 'load_factor', 'otp15_takeoff', 'bkg_200_journey_preparation', 'pfl_100_checkin', 
                  'pfl_200_security', 'pfl_300_lounge', 'pfl_500_boarding', 'ifl_300_cabin', 
                  'ifl_200_flight_crew_annoucements', 'ifl_600_wifi', 'ifl_500_ife', 'ifl_400_food_drink', 
                  'ifl_100_cabin_crew', 'arr_100_arrivals', 'con_100_connections', 
                  'loy_200_loyalty_programme', 'img_310_ease_contact_phone']

In [9]:
model_names=['PROM','DET']
clf_model={}
for name in model_names:
    path_model=f'pipeline_output/CatBoostClassifier_cv_{name}.pkl'
    # Cargar el modelo desde el archivo .pkl
    with open(path_model, 'rb') as file:
        clf_model[name] = pickle.load(file)

In [10]:
# Load the data to predict
df_predict = pd.read_csv(f"pipeline_output/data_for_historic_prediction.csv")
    
# Asegurarse de que 'date_flight_local' esté en formato datetime
df_predict['date_flight_local'] = pd.to_datetime(df_predict['date_flight_local'])
df_predict = df_predict[df_predict['date_flight_local'].dt.year == 2023]
    
df_predict = df_predict[df_predict['date_flight_local'].dt.month == 1]

def filter_data_by_quarter(df, quarter):
    # Definir los rangos de fechas para cada trimestre
    quarters = {
        "q1": (1, 3),
        "q2": (4, 6),
        "q3": (7, 9),
        "q4": (10, 12)
    }

    # Obtener el rango de meses para el trimestre especificado
    start_month, end_month = quarters[quarter]

    # Filtrar el DataFrame por el rango de fechas del trimestre
    df_filtered = df[df['date_flight_local'].dt.month.between(start_month, end_month)]

    return df_filtered

quarters = ['q1']

for quarter in quarters:
    df_predict = filter_data_by_quarter(df_predict, quarter)
    # Perform prediction and add the probabilities to the dataframe
    test_set = df_predict.drop(['respondent_id'], axis=1, errors='ignore')
    # df_probabilities = predict_and_explain(clf_model[model_names[0]], clf_model[model_names[1]], df_predict, features)
    explainer, df_probabilities = predict_and_explain(clf_model[model_names[0]], clf_model[model_names[1]], test_set, features, '2023-01-01','2023-01-31')


In [11]:
df_probabilities['out_prob_nps'].mean()

0.30753317961568943

In [12]:
df_probabilities['out_prob_nps'].mean()

0.30753317961568943

In [13]:

# # Rename columns, add insert date and select columns to save
# df_probabilities['insert_date_ci'] = STR_EXECUTION_DATE
# df_probabilities['model_version']=f'{model_year}-{model_month}-{model_day}'
# df_probabilities = df_probabilities[config['PREDICT']['COLUMNS_SAVE']]

# Save the prediction results to S3
df_probabilities.to_csv(save_path, index=False)

NameError: name 'save_path' is not defined

# CHECK historic prediction

In [None]:
df_hist=pd.read_csv('pipeline_output/historic_predictions (8).csv')

In [None]:
df_hist['date_flight_local']=pd.to_datetime(df_hist['date_flight_local'])

In [None]:
def aggregate_shaps(df,features_dummy, start_date, end_date):

    # 1. Filtrar el DataFrame por el rango de fechas
    df_probability_binary = df[(df['date_flight_local'] >= start_date) & (df['date_flight_local'] < end_date)]
    
    # df_probability_binary.drop(columns=['respondent_id','date_flight_local','model_version','insert_date_ci'])

    # 2. Asumiendo que las funciones de cálculo de SHAP y probabilidad ya están implementadas y ajustadas para usar df_filtered
    # 6. Agregar variables y valores SHAP para crear una explicación general
    # num_vars = ['delay_departure','ticket_price']
    num_vars = ['ticket_price', 'load_factor']
    bin_vars = ['otp15_takeoff']
    # bin_vars = ['otp15_takeoff'] + [col for col in df_nps_tkt.columns if 'country_agg' in col]
    # cat_vars=['segment']
    # bin_vars=[]
    cat_vars=[]
    touchpoints = [feat for feat in features_dummy if feat not in bin_vars + num_vars]
    
    # values_nps_sum = [pd.to_numeric(df_probability_binary[f'{feat}_nps'], errors='coerce').mean()*100 for feat in features_dummy]
    values_nps_sum = [df_probability_binary[f'{feat}_nps'].mean()*100 for feat in features_dummy]
    num_var_scores = [df_probability_binary[num_var].mean() for num_var in num_vars]
    bin_vars_scores = []
    for var in bin_vars:
        if var == 'otp15_takeoff':
            # Para 'otp15_takeoff', calcula el porcentaje de 0s.
            score = (df_probability_binary[df_probability_binary[var] == 0][var].count() / 
                     df_probability_binary[var].count()) * 100
        else:
            # Para cualquier otra variable, calcula el porcentaje de 1s.
            score = (df_probability_binary[df_probability_binary[var] == 1][var].count() / 
                     df_probability_binary[var].count()) * 100
        bin_vars_scores.append(score)
    # Continuación después de calcular bin_vars_scores
    cat_vars_scores = []
    for cat_var in cat_vars:
        cat_vars_scores.append(0)

    satisfaction_scores = [df_probability_binary[df_probability_binary[tp] >= 8][tp].count() / df_probability_binary[tp].count() * 100 for tp in touchpoints]
    shap_data = np.array(num_var_scores + bin_vars_scores + satisfaction_scores)
    base_value_nps_sum = df_probability_binary['base_prob_nps'].mean() * 100
    shap_values = np.array(values_nps_sum)  # Convertimos la lista en un array 2D
    features_names = np.array(features_dummy)
    explainer = shap.Explanation(values=shap_values, 
                                 base_values=base_value_nps_sum, 
                                 data=shap_data, 
                                 feature_names=features_names)
    pred_nps = df_probability_binary['out_prob_nps'].mean()
        
    return explainer, df_probability_binary, pred_nps

# model_prom=train_results['models']['promoter_binary']
# model_det=train_results['models']['detractor_binary']

# Lista de años de interés
years_of_interest = [2019, 2022, 2023, 2024]
explanations=[]
features_dummy = ['ticket_price', 'load_factor', 'otp15_takeoff', 'bkg_200_journey_preparation', 'pfl_100_checkin', 
                  'pfl_200_security', 'pfl_300_lounge', 'pfl_500_boarding', 'ifl_300_cabin', 
                  'ifl_200_flight_crew_annoucements', 'ifl_600_wifi', 'ifl_500_ife', 'ifl_400_food_drink', 
                  'ifl_100_cabin_crew', 'arr_100_arrivals', 'con_100_connections', 
                  'loy_200_loyalty_programme', 'img_310_ease_contact_phone']
for year in years_of_interest:
    for month in range(1, 13):
        if year == 2024 and month > 4:
            break
        


        start_date = f"{year}-{month:02d}-01"
        end_date = (pd.to_datetime(start_date) + MonthEnd(1)).strftime('%Y-%m-%d')

#         # Filtrar el DataFrame por las fechas de inicio y fin
#         df_filtered = df_nps_tkt[(df_nps_tkt['date_flight_local'] >= start_date) & (df_nps_tkt['date_flight_local'] < end_date)]

#         # Verificar si df_filtered está vacío
#         if df_filtered.empty:
#             print(f"No hay datos para el rango de fechas desde {start_date} hasta {end_date}.")
#             continue

        explanation, df_probability_binary, pred_nps = aggregate_shaps(df_hist,features_dummy, start_date, end_date)
    
       
    
        # No romper el bucle; solo continuar si 'explanation' es None
        if explanation is not None:
            print(explanation)
            explanations.append(explanation)
            # Guardar cada explicación con un nombre de archivo que refleje el mes y año
            file_name = f'pipeline_output/raw_explanations/explanation_{month}_{year}.pkl'
            with open(file_name, 'wb') as file:
                pickle.dump(explanation, file)
            print(file_name)
            print(pred_nps)
            shap.plots.waterfall(explanation, max_display=20)

In [None]:
def create_uplifting_explanation(explanation2, explanation1):
    """
    Create a new Explanation object representing the uplifting between two Explanation objects.

    Parameters:
        - explanation1: The first shap.Explanation object.
        - explanation2: The second shap.Explanation object.

    Returns:
        - A new shap.Explanation object representing the uplifting.
    """
    # Calculate the difference in values, base_values, and data
    diff_values = explanation2.values - explanation1.values
    
    diff_base_values = explanation1.base_values + sum(explanation1.values)
    print(sum(explanation1.values))
    diff_data = explanation2.data - explanation1.data

    # Create a new Explanation object with the difference values
    diff_explanation = shap.Explanation(values=diff_values, base_values=diff_base_values, data=diff_data,
                                        feature_names=explanation1.feature_names)

    return diff_explanation

In [None]:
def load_explanation(year, month):
    """Cargar un objeto de explicación desde un archivo."""
    file_name = f'pipeline_output/raw_explanations/explanation_{month}_{year}.pkl'
    try:
        with open(file_name, 'rb') as file:
            explanation = pickle.load(file)
        return explanation
    except FileNotFoundError:
        print(f"No explanation file found for {month}/{year}.")
        return None

# Ejemplo de uso: Comparar febrero de 2023 con febrero de 2022
explanation_2024_01 = load_explanation(2023, 2)
explanation_2024_02 = load_explanation(2024, 1)

if explanation_2024_01 is not None and explanation_2024_02 is not None:
    print('2024: Januery vs Februery')
    uplifting_explanation = create_uplifting_explanation(explanation_2024_02, explanation_2024_01)
    # Procesar o visualizar el uplifting_explanation según sea necesario
    shap.plots.waterfall(uplifting_explanation, max_display=30)


In [None]:
def compare_monthly_explanations(start_year, start_month, end_year, end_month):
    """
    Compara automáticamente cada mes con el mismo mes del año anterior o con 2019 si el año es 2022,
    desde una fecha de inicio dada, y devuelve un diccionario con los objetos de explicación.

    Args:
    - start_year: Año de inicio para las comparaciones.
    - start_month: Mes de inicio para las comparaciones.
    - end_year: Año final para las comparaciones.
    - end_month: Mes final para las comparaciones.

    Returns:
    - Un diccionario con las comparaciones de objetos de explicación. Las claves son las fechas de comparación.
    """
    uplifting_explanations_dict = {}

    for year in range(start_year, end_year + 1):
        for month in range(1, 13):
            if year == start_year and month < start_month:
                continue
            if year == end_year and month > end_month:
                break

            # Ajuste para el año 2022: comparar con 2019
            if year == 2022:
                previous_year_explanation = load_explanation(2019, month)
            else:
                previous_year_explanation = load_explanation(year - 1, month)

            current_explanation = load_explanation(year, month)

            if current_explanation is not None and previous_year_explanation is not None:
                uplifting_explanation = create_uplifting_explanation(previous_year_explanation,current_explanation)
                # Ajuste en la clave para reflejar la comparación especial del año 2022 con 2019
                if year == 2022:
                    date_key = f"2019-{month:02d} to {year}-{month:02d}"
                else:
                    date_key = f"{year-1}-{month:02d} to {year}-{month:02d}"
                uplifting_explanations_dict[date_key] = uplifting_explanation
            print(date_key)  
            shap.plots.waterfall(uplifting_explanations_dict[date_key], max_display=20)

    return uplifting_explanations_dict

# Ejemplo de uso
uplifting_explanations_dict = compare_monthly_explanations(2023, 3, 2024, 3)

# Check incremental

In [33]:
s3_resource = boto3.resource("s3")
S3_BUCKET_NPS = 'iberia-data-lake'
insert_date_ci='2024-04-12'
today_nps_surveys_prefix = f'customer/nps_surveys/export_historic/insert_date_ci={insert_date_ci}/'
dir_dict = 's3://iberia-data-lake/customer/nps_surveys/nps_dictionaries'

lf_dir = 's3://ibdata-prod-ew1-s3-customer/customer/load_factor_to_s3_nps_model/'

In [34]:
    from datetime import datetime, timedelta
    # Convert to datetime object
    execution_date = datetime.strptime(insert_date_ci, "%Y-%m-%d")

    # Calculate yesterday's date
    yesterday_date = execution_date - timedelta(days=1)
    # Format dates as strings for S3 prefixes
    today_date_str = execution_date.strftime("%Y-%m-%d")
    yesterday_date_str = yesterday_date.strftime("%Y-%m-%d")
    
    yesterday_nps_surveys_prefix= f'customer/nps_surveys/export_historic/insert_date_ci={yesterday_date_str}/'

In [35]:
    # READ TODAY DATA (HISTORIC NPS)
    s3_keys = [item.key for item in s3_resource.Bucket(S3_BUCKET_NPS).objects.filter(Prefix=today_nps_surveys_prefix)]
    preprocess_paths = [f"s3://{S3_BUCKET_NPS}/{key}" for key in s3_keys]

    df_nps_historic = pd.DataFrame()
    for file in preprocess_paths:
        df = pd.read_csv(file)
        df_nps_historic = pd.concat([df_nps_historic, df], axis=0)
    df_nps_historic = df_nps_historic.reset_index(drop=True)

In [36]:
df_nps_historic

Unnamed: 0,respondent_id,sample_id,surveyed_flight_number,date_flight_local,scheduled_departure_time_local,scheduled_arrival_time_local,tier_level,language_code,aircraft_registration_number,seat_no,volume_of_bags,number_of_child_in_the_booking,number_of_infant_in_the_booking,number_of_people_in_the_booking,infinita_customers_identifer,flag_of_ib_singular_customers,country_code,list_of_options_for_booking_channel,list_of_options_for_checkin_channel,lounge_used_at_origin_airport,customer_journey_origin,customer_journey_destination,number_of_flights_in_journey,order_of_flight_in_journey,fleet_in_surveyed_flight,marketing_airline_code,date_of_flight_gmt,scheduled_departure_time_gmt,real_departure_time_local,real_departure_time_gmt,scheduled_arrival_time_gmt,real_arrival_time_local,real_arrival_time_gmt,segment,route,overall_haul,purser,invitegroup_ib,weight_category,weekly_weight,monthly_weight,pnr_show,ff_number,id_golden_record,ticket_num,started,time_spent_hrminsec,customer_email_show,origin_of_surveyed_flight,destination_of_surveyed_flight,operating_airline_code,cabin_in_surveyed_flight,haul,first_name_show,last_name_show,second_last_name_show,nps_category,nps_100,survey_type,invitegroup,group_age_survey,pun_100_punctuality,inm_200_issues_prior_checkin,inm_200_issues_prior_ticket_change,inm_200_issues_prior_schedule_change,inm_200_issues_prior_contact_center,inm_200_issues_prior_special_serv,inm_200_issues_prior_special_req,inm_200_issues_prior_avios,inm_200_issues_prior_voucher,inm_206_issues_checkin_long_queues,inm_206_issues_checkin_wrong_info,inm_206_issues_checkin_additional_fees,inm_206_issues_checkin_overbooking,inm_206_issues_checkin_downgrade,inm_206_issues_checkin_staff,inm_206_issues_checkin_social_distance,inm_206_issues_checkin_face_masks,inm_206_issues_checkin_documentation,inm_206_issues_checkin_other,inm_207_issues_lounge_denied,inm_207_issues_lounge_overcrowded,inm_207_issues_lounge_cleanliness,inm_207_issues_lounge_wifi,inm_207_issues_lounge_staff,inm_207_issues_lounge_food_drink,inm_207_issues_lounge_face_masks,inm_207_issues_lounge_other,inm_208_issues_security_leave_sth,inm_208_issues_security_long_queues,inm_208_issues_security_staff,inm_208_issues_security_social_distance,inm_208_issues_security_face_masks,inm_208_issues_security_other,inm_209_issues_boarding_unclear,inm_209_issues_boarding_gate_changed,inm_209_issues_boarding_lack_space,inm_209_issues_boarding_disorganised,inm_209_issues_boarding_staff,inm_209_issues_boarding_social_distance,inm_209_issues_boarding_face_masks,inm_209_issues_boarding_documentation,inm_209_issues_boarding_other,inm_220_issues_timing_cancelled,inm_220_issues_timing_dep_delay,inm_220_issues_timing_arr_delay,inm_220_issues_timing_missed,inm_230_issues_onboard_staff,inm_230_issues_onboard_ife,inm_230_issues_onboard_overcrowding,inm_230_issues_onboard_face_masks,inm_235_issues_onboard_comfort_damaged,inm_235_issues_onboard_comfort_space,inm_235_issues_onboard_comfort_temperature,inm_235_issues_onboard_comfort_cleanliness,inm_235_issues_onboard_comfort_washrooms,inm_235_issues_onboard_comfort_other,inm_236_issues_meal_availability,inm_236_issues_meal_portions,inm_236_issues_meal_quality,inm_236_issues_meal_special,inm_236_issues_meal_other,inm_240_issues_baggage_lost,inm_240_issues_baggage_delayed,inm_240_issues_baggage_demaged,inm_240_issues_baggage_staff,inm_240_issues_baggage_hand,inm_240_issues_baggage_other,inm_250_issues_arrival_slow,inm_250_issues_arrival_unclear,inm_250_issues_arrival_aditional_request,inm_250_issues_arrival_staff,inm_250_issues_arrival_immigration_queues,inm_250_issues_arrival_immigration_passport,inm_250_issues_arrival_immigration_other,inm_255_issues_connecting_missed,inm_255_issues_connecting_staff,inm_255_issues_connecting_baggage,inm_255_issues_connecting_other,bkg_100_booking,bkg_200_journey_preparation,inm_400_issues_response,pfl_100_checkin,pfl_200_security,pfl_300_lounge,pfl_500_boarding,ifl_100_cabin_crew,ifl_200_flight_crew_annoucements,ifl_300_cabin,ifl_400_food_drink,ifl_600_wifi,arr_100_arrivals,con_100_connections,img_310_ease_contact_phone,img_320_ease_contact_ibplus_mail,ifl_500_ife,loy_200_loyalty_programme,inm_050_issues_t_f,dig_400_mobile_app,cov_300_appropiate_changes_to_reassure,hot_topic_verbatim,iag_ht_oe_t_scrubbed,cla_800_did_you_use_wifi_on_board,status,nps_all_t,inm_220_issues_timing_staff,inm_220_issues_timing_other,cla_200_check_in_methodiberias_mobile_app,cla_200_check_in_methodiberias_website,cla_200_check_in_methodiberias_checkin_desk_at_the_airport,cla_200_check_in_methodselfservice_kiosk_at_the_airport,cla_200_check_in_methodother,inm_100_journey_issuean_issue_prior_to_travelling,inm_100_journey_issuean_issue_at_your_departure_airport,inm_100_journey_issuea_disruption_to_your_flight_timing,inm_100_journey_issuean_issue_onboard,inm_100_journey_issuean_issue_with_your_baggage,inm_100_journey_issuean_issue_disembarking_or_at_your_arrival_airport,inm_100_journey_issuean_issue_connecting_tofrom_another_flight,inm_100_issues_other,inm_100_oth_t,inm_200_issues_prior_other,inm_200_oth_t,inm_205_issues_dep_airport_checkin,inm_205_issues_dep_airport_lounge,inm_205_issues_dep_airport_security,inm_205_issues_dep_airport_boarding,inm_205_issues_dep_airport_assistance,inm_205_issues_dep_airport_other,inm_205_oth_t,inm_206_oth_t,inm_207_oth_t,inm_208_oth_t,inm_209_oth_t,inm_220_oth_t,inm_230_issues_onboard_seat_selected,inm_230_issues_onboard_not_together,inm_230_issues_onboard_comfort,inm_230_issues_onboard_meal,inm_230_issues_onboard_left_sth,inm_230_issues_onboard_other,inm_230_oth_t,inm_235_oth_t,inm_236_oth_t,inm_240_issues_baggage_stolen,inm_240_oth_t,inm_250_oth_t,inm_255_oth_t,inm_301_how_managed_issue_representative,inm_301_how_managed_issue_call_centre,inm_301_how_managed_issue_website,inm_301_how_managed_issue_could_not,inm_301_how_managed_issue_email,inm_301_how_managed_issue_decided_not,inm_301_how_managed_issue_other,inm_301_oth_t,inm_305_issues_resolved_t_f,inm_500_issues_verbatim_translated,cla_100_booking_channel_survey,cla_400_lounge_t_f,cla_500_ife_t_f,cla_610_wifi_aware,cla_600_wifi_t_f,cla_600_wifi_other_verbatim_translated,cla_300_connection_from,cla_300_connection_to,cla_300_connection_no,img_430_solved_1st_time_social_net,cov_500_covid_verbatim,cov_500_covid_verbatim_translated,gender,res100_country_code_survey,bnd_011_looked_lowest_cost_1_5,tvl_journey_reason,tvloth_journey_reason_other_verbatim_tranlated,rea_choosing_reason,anom_anonimity,cla_450_fast_track_t_f,cla_900_boarding_how,cla_700_food_drink_provision,cla_120_food_drink_preordered,cla_711_meal_prepurchase,cla_711_meal_prepurchase_other_verbatim_translated,cla_710_food_drink_purchased,cla_710_food_drink_purchased_other,cla_712_get_preferred_food_y_n,cla_550_digital_press,cla_550_digital_press_other_verbatim_translated,usb_100_usb_use,usb_100_usb_use_other_verbatim_translated,cla_950_disembark_how,arr_400_arrival_luggage_collection,insert_date_ci,date_survey_completed,scheduled_arrival_date_local,iag_mod_702_logic,inm_200_issue_helpi_contacted_iberia_on_social_media,loy_100_ways_of_contactby_phone,loy_100_ways_of_contactvia_the_iberia_plus_email,loy_100_ways_of_contactby_whatsapp,loy_100_ways_of_contacton_social_networks_facebook_twitter_etc,loy_100_ways_of_contacti_contacted_iberia_using_other_channels_please_specify,loy_100_ways_of_contacti_did_not_contact_iberia,iag_loy_100_96_oth_t_scrubbed,loy_500_by_whatsapp,dem_700_travelling_withi_was_travelling_on_my_own,dem_700_travelling_withwith_babies_aged_under_2,dem_700_travelling_withwith_toddlers_aged_2_to_5,dem_700_travelling_withwith_children_aged_6_to_12,dem_700_travelling_withwith_teens_aged_13_to_17,dem_700_travelling_withwith_my_spouse_or_partner,dem_700_travelling_withother_adult_party_6_people_or_less,dem_700_travelling_withadult_partygroup_more_than_6_people,dem_700_travelling_withprefer_not_to_say,iag_dem_800_96_oth_t_scrubbed,perm_200_future_contact,iag_sel_000_continuesubmit,interaction_point,sel_100_module_selectionbooking_and_journey_preparation,sel_100_module_selectioncheckin,sel_100_module_selectionlounge_experience,sel_100_module_selectionboarding,sel_100_module_selectioncabin_crew,sel_100_module_selectioncabin_environment,sel_100_module_selectionfood_and_drink_on_board,sel_100_module_selectionin_flight_entertainment_and_wifi,sel_100_module_selectionin_flight_entertainment,sel_100_module_selectionwifi_service,sel_100_module_selectionarrival_experience,sel_100_module_selectionconnections_with_another_flight,sel_100_module_selectioniberia_plus_loyalty_program,iag_inm_101_96_oth_t_scrubbed,mod_102_pre_journeyease_of_booking_process_on_iberia_websitemobile_app,mod_102_pre_journeyclarity_of_information_and_conditions_during_booking_process_on_iberia_websitemobile_app,mod_102_pre_journeyease_of_managing_your_booking_on_iberia_websitemobile_app,mod_102_pre_journeyease_of_contact_with_iberia_by_phone,mod_102_pre_journeyhelpfulness_of_iberia_staff_by_phone,mod_102_pre_journeyease_of_contact_with_iberia_by_mail,mod_102_pre_journeyother_please_specify,iag_mod_102_96_oth_t_scrubbed,iag_mod_201_t_scrubbed,mod_203_checkinwaiting_time_at_airport_checkin_area,mod_203_checkinhelpfulness_of_staff_at_airport_checkin_area,mod_203_checkinsocial_distancing_during_checkin_at_the_airport,mod_203_checkinease_of_online_checkin_process,mod_203_checkinwebsiteapp_reliability_and_performance,mod_203_checkinease_of_use_of_kiosks_at_the_airport,mod_203_checkinother_please_specify,iag_mod_203_96_oth_t_scrubbed,iag_mod_301_t_scrubbed,mod_302_loungehelpfulness_of_staff_at_reception,mod_302_loungehelpfulness_of_staff_inside_the_lounge,mod_302_loungequality_of_the_food,mod_302_loungethe_variety_of_the_food_on_offer,mod_302_loungethe_variety_of_drinks_on_offer,mod_302_loungeseat_availability,mod_302_loungewifi,mod_302_loungeother_please_specify,iag_mod_302_96_oth_t_scrubbed,iag_mod_401_t_scrubbed,mod_403_boardingthe_organisation_of_the_boarding_process,mod_403_boardingspeed_of_boarding,mod_403_boardinghelpfulness_of_staff_at_the_departure_gate,mod_403_boardingannouncements_made_at_the_departure_gate,mod_403_boardingpriority_boarding,mod_403_boardingsocial_distancing_during_boarding,mod_403_boardingbiometric_boarding_capabilities,mod_403_boardingavailability_of_space_for_your_hand_luggagepersonal_belongings_in_the_aircraft,mod_403_boardingother_please_specify,iag_mod_403_96_oth_t_scrubbed,iag_mod_501_t_scrubbed,mod_502_crewhelpfulness_of_cabin_crew,mod_502_crewcrew_availability_during_the_flight,mod_502_crewempowerment_of_cabin_crew_to_resolve_problems,mod_502_crewcabin_crew_managing_other_passengers,mod_502_crewcabin_crew_managing_boarding_disembarking,mod_502_crewgrooming_and_appearance,mod_502_crewflight_information_provided_by_pilots,mod_502_crewannouncements_provided_by_cabin_crew,mod_502_crewother_please_specify,iag_mod_502_96_oth_t_scrubbed,iag_mod_601_t_scrubbed,mod_602_cabincleanliness_of_the_cabin,mod_602_cabincleanliness_of_toilets,mod_602_cabinphysical_condition_of_the_cabin,mod_602_cabinphysical_condition_of_the_toilets,mod_602_cabinamount_of_legroom,mod_602_cabinseat_comfort,mod_602_cabinsocial_distancing,mod_602_cabintemperature_onboard,mod_602_cabinother_please_specify,iag_mod_602_96_oth_t_scrubbed,iag_mod_701_t_scrubbed,mod_707_inflight_fndquality_of_food,mod_707_inflight_fndquality_of_wines,mod_707_inflight_fndselection_of_food,mod_707_inflight_fndselection_of_drinks,mod_707_inflight_fndvisual_appeal_of_food,mod_707_inflight_fndquantity_portion_size_of_food_available,mod_707_inflight_fndtimings_when_food_and_drinks_are_served,mod_707_inflight_fndvalue_for_money_of_food_and_drink_available,mod_707_inflight_fndsustainable_packaging_materials_for_food_and_drinks,mod_707_inflight_fndother_please_specify,iag_mod_707_96_oth_t_scrubbed,iag_mod_803_a_t_scrubbed,iag_mod_803_b_t_scrubbed,iag_mod_803_c_t_scrubbed,mod_806_ife_and_wifiinflight_entertainment_contents,mod_806_ife_and_wifiinflight_entertainment_ease_of_use,mod_806_ife_and_wifichoice_of_movies,mod_806_ife_and_wifiscreen_quality,mod_806_ife_and_wifiease_of_use_of_the_wifi,mod_806_ife_and_wifireliability_of_the_wifi_connection,mod_806_ife_and_wifispeed_of_the_wifi_connection,mod_806_ife_and_wifiwifi_value_for_money,mod_806_ife_and_wifiother_please_specify,iag_mod_806_96_oth_t_scrubbed,mod_807_ifeinflight_entertainment_contents,mod_807_ifeease_of_access,mod_807_ifeother_please_specify,iag_mod_807_96_oth_t_scrubbed,mod_808_wifiease_of_use_of_the_wifi,mod_808_wifireliability_of_the_wifi_connection,mod_808_wifispeed_of_the_wifi_connection,mod_808_wifiwifi_value_for_money,mod_808_wifiother_please_specify,iag_mod_808_96_oth_t_scrubbed,iag_mod_901_t_scrubbed,mod_904_arrivalsorganizationspeed_for_getting_of_the_plane,mod_904_arrivalssocial_distancing_getting_off_the_plane,mod_904_arrivalssignage_to_immigrationconnections,mod_904_arrivalsspeed_of_getting_through_immigration,mod_904_arrivalstime_to_collect_checked_baggage,mod_904_arrivalsease_of_claiming_for_lost_or_damaged_baggage,mod_904_arrivalsother_please_specify,iag_mod_904_96_oth_t_scrubbed,iag_mod_1001_t_scrubbed,mod_1002_needed_for_connectionchange_terminals,mod_1002_needed_for_connectionchange_airports,mod_1002_needed_for_connectionchange_airlines,mod_1002_needed_for_connectioncollect_and_recheck_your_luggage,mod_1002_needed_for_connectionpass_through_immigration,mod_1002_needed_for_connectionpass_through_security,mod_1002_needed_for_connectioncheckin_and_collect_boarding_pass,mod_1002_needed_for_connectioncollect_boarding_pass_only,mod_1002_needed_for_connectionseek_help_or_advice_from_a_customer_service_representative,mod_1002_needed_for_connectionnone_of_the_above,mod_1003_connectiononboard_announcements_regarding_connections,mod_1003_connectionairport_signage_to_locate_connecting_flight,mod_1003_connectiontime_available_for_you_connection,mod_1003_connectionaccessibility_of_staff_to_support_questionsqueries,mod_1003_connectionalerts_sent_to_my_mobile_with_live_updates_on_my_connection,mod_1003_connectionsimplify_document_checks_between_flights,mod_1003_connectionother_please_specify,iag_mod_1003_96_oth_t_scrubbed,iag_mod_1101_t_scrubbed,mod_1102_loyaltychances_to_use_my_avios,mod_1102_loyaltyvolume_of_accrued_avios_with_my_flight,mod_1102_loyaltytier_benefits,mod_1102_loyaltyrecognition_while_travelling_with_iberia,mod_1102_loyaltyhelpfulness_of_the_personalized_contact_centre_service,mod_1102_loyaltyother_please_specify,iag_mod_1102_96_oth_t_scrubbed,alert,img_410_phone,img_420_ibplus_email,group_age,localdeparturedateuk,date_campaign,date_sample_file,date_survey_completion,completiondateuk,device_type,browser_show,ipaddress_show,ovp_100_overall_experience,inm_100_issues_other_verbatim,inm_200_issues_prior_other_verbatim,inm_205_issues_dep_airport_other_verbatim,inm_206_issues_checkin_other_verbatim,inm_207_issues_lounge_other_verbatim,inm_208_issues_security_other_verbatim,inm_209_issues_boarding_other_verbatim,inm_220_issues_timing_other_verbatim,inm_270_flight_delay_length,inm_280_issues_delay_inf_clarity,inm_282_issues_delay_length,inm_284_issues_delay_looked_after,inm_230_issues_onboard_other_verbatim,inm_235_issues_onboard_comfort_other_verbatim,inm_236_issues_meal_other_verbatim,inm_240_issues_baggage_other_verbatim,inm_250_issues_arrival_lounge,inm_250_issues_arrival_immigration_other_verbatim,inm_255_issues_connecting_other_verbatim,inm_300_issues_spoken_t_f,inm_301_how_managed_issue_mobile_app,inm_301_how_managed_issue_other_verbatim,inm_310_issues_apology_t_f,inm_320_issues_empathy_t_f,inm_500_issues_verbatim,vfm_100_value_for_money,loy_100_likelihoodd_fly_again,tvloth_journey_reason_other_verbatim,bnd_021_willing_to_pay_more_1_5,com_200_num_trips_ly_ib,com_210_num_trips_ly_other_airlines,com_101_num_trips_ly_ib,com_201_num_trips_ly_other_airlines,cla_105_booking_agency,cla_110_booking_device,bkg_110_booking_time,bkg_120_booking_staff_service,cla_202_boarding_pass_channel,cla_250_checked_baggage_t_f,pfl_100_checkin_speed,pfl_120_checkin_staff_service,pfl_350_lounge_service,bdg_100_boarding_announcements,bdg_110_boarding_clarity,bdg_111_boarding_signage,bdg_200_boarding_speed,bdg_300_boarding_staff_service,bdg_400_boarding_hand_lugg_space,cbn_500_cabin_mood_music,crw_100_cabin_crew_helpfulness,crw_110_cabin_crew_feel_special,crw_120_cabin_crew_announcements,cbn_101_cabin_cleanliness,cbn_102_cabin_physical_condition,cbn_201_washrooms_cleanliness,cbn_202_washrooms_physical_condition,cbn_300_cabin_seat_comfort,cbn_650_ronda,usb_100_usb_use_other_verbatim,cla_171_special_meal,cla_711_meal_prepurchase_other_verbatim,cla_713_get_preferred_drink_y_n,cla_800_duty_free_t_f,fnd_110_food_quality,fnd_725_wines_quality,fnd_200_food_drink_portion,fnd_310_food_selection,fnd_320_drinks_selection,fnd_401_food_visual_appeal,fnd_600_food_drink_time_to_clear,fnd_900_food_drink_value_for_money,fnd_800_meal_prepurchase,ifl_700_duty_free_range,ife_100_ife_usability,ife_200_ife_content,cla_600_wifi_other_verbatim,wfi_200_wifi_value_for_money,wfi_300_wifi_ease_of_use,wfi_400_wifi_speed,cla_550_digital_press_other_verbatim,ife_300_digital_press,arr_200_arrivals_ease,arr_300_arrivals_immigration_speed,con_200_connection_information,con_300_connection_clarity,con_400_connection_time,cla_130_additional_needs_t_f,dig_100_web_mobile_t_f,dig_200_web_pc,dig_200_web_tablet,dig_200_web_smartphone,dig_300_web,dig_250_mobile_phone_system,img_330_ease_contact_social_net,rea_100_frequent_flyer_prog_reason,lvl_loyalty_program_survey,end_oe_suggestion_verbatim,end_oe_suggestion_verbatim_translated,sze_num_people,nat_100_nationality,nat_100_nationality_other,cbn_100_cabin,cbn_200_cabin_washrooms_clean,cbn_400_cabin_temperature,cbn_600_newspappers,cbn_450_cabin_temp_description,fnd_100_food_drink_quality,fnd_300_food_drink_choice,fnd_400_food_drink_presentation,fnd_500_food_drink_first_choice,bnd_010_looked_lowest_cost,bnd_020_willing_to_pay_more,bnd_030_iberia_cares_its_customers,prize_100_prize_draw_participate_t_f,prize_300_prize_draw_name,prize_300_prize_draw_email,prize_300_prize_draw_phone,flag_misconnection_misc,delay,flag_mishandling_ahl,flag_mishandling_dpr,num_bags_mishandling_ahl,num_bags_mishandling_dpr,issue_operative,issue_non_operative,issue_category,issue_category_calculated,customer_journey_ctry_origin,customer_journey_ctry_destination,ctry_origin_of_surveyed_flight,ctry_destination_of_surveyed_flight,gr_region,region,ticketing_carrier_orig,ticket_num_orig,coupon_num_orig,ticket_price,ctry_route,issue_category_calculated_d15,codeshare
0,46831534,10004364989,8327.000,2022-10-16,2022-10-16 18:50:00,2022-10-16 20:30:00,IB CLASSIC,ES,ECLSQ,04A,FALSE,0.000,0,1,,FALSE,ES,,,,LEU,MAD,1.000,1.000,ATX,IB,2022-10-16,2022-10-16 16:50:00,2022-10-16 19:05:00,2022-10-16 17:05:00,2022-10-16 18:30:00,2022-10-16 20:44:00,2022-10-16 18:44:00,LEU-MAD,LEU-MAD,SH,,3.000,ECONOMY_SH+MH,1.124,1.729,,,,,2022-10-19 23:54:27,00:11:23,,,MAD,YW,Economy,SH,,,,Passive,7,SURVEYS PROCESSED,7,45-54,6.000,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,7.000,,,6.000,8.000,,6.000,8.000,7.000,7.000,,,7.000,,,,,6.000,No,,8.000,"Estivo correction, todos con mascarillas.","Estivo correction, todos con mascarillas.",,Completed,"En este fuels no hubo problemas, pero pienso q...",,,Iberia's mobile App,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Iberia website,,"No, I did not want to use it",,,,,,No,,"Estivo correction, todos con mascarillas.","Estivo correction, todos con mascarillas.",Female,,4,Visiting friends or family,,The only airline operating this route,"Provide Iberia with my survey responses, but n...",No,,,,,,,,,,,,,,,2023-02-15,2022-10-20 00:05:50,2022-10-16,Cabin+OtherHaul,,,,,,,I did not contact Iberia,,,I was travelling on my own,,,,,,,,,,,Continue,IAG_IB,,Check-in,,,,,,,,,,,,,,,,,,,,,No se xq pero el check in en la app o en la we...,,,,Ease of online check-in process,Website/app reliability and performance,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,15.000,0,0,0.000,0.000,0,0,No issue,Operative,ES,ES,,,,,,,,,,No issue,
1,52990189,10009487200,8319.000,2023-01-04,2023-01-04 13:35:00,2023-01-04 13:35:00,,EN,ECNHU,07F,TRUE,0.000,0,3,,FALSE,,,,,CDT,MAD,1.000,1.000,CR2,IB,2023-01-04,2023-01-04 12:35:00,2023-01-04 13:55:00,2023-01-04 12:55:00,2023-01-04 13:35:00,2023-01-04 14:00:00,2023-01-04 14:00:00,CDT-MAD,CDT-MAD,SH,,2.000,ECONOMY_SH+MH,1.100,0.519,,,,,2023-01-07 15:30:09,00:06:47,,,MAD,YW,Economy,SH,,,,Passive,8,SURVEYS PROCESSED,2,45-54,7.000,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,10.000,8.000,,9.000,4.000,,9.000,9.000,9.000,9.000,7.000,,9.000,,,,,,No,,8.000,I didn’t need reassurance.,No necesitaba tranquilidad.,,Completed,El vuelo estuvo bien pero creo que hubiera sid...,,,,Iberia's website,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Iberia website,,"No, I did not want to use it",,,,,,No,,I didn’t need reassurance.,No necesitaba tranquilidad.,Female,Philippines,1,Holiday (more than 5 nights),,Best schedule for my needs,"Provide Iberia with my survey responses, but n...",,,,,,,,,,,,,,,,2023-04-13,2023-01-07 15:36:56,2023-01-04,Cabin+OtherHaul,,,,,,,I did not contact Iberia,,,,,,,With teens aged 13 to 17,With my spouse or partner,,,,,,Submit feedback,IAG_IB,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,20.000,0,0,0.000,0.000,0,0,No issue,Operative,ES,ES,,,,,,,,,,Operative,
2,54805040,10010991371,8327.000,2023-01-29,2023-01-29 16:50:00,2023-01-29 18:30:00,IB SILVER (OW RUBY),ES,ECLSQ,01A,FALSE,0.000,0,1,,FALSE,,,,,LEU,MAD,1.000,1.000,ATX,IB,2023-01-29,2023-01-29 15:50:00,2023-01-29 16:45:00,2023-01-29 15:45:00,2023-01-29 17:30:00,2023-01-29 18:07:00,2023-01-29 17:07:00,LEU-MAD,LEU-MAD,SH,,4.000,BUSINESS_SH+MH,0.627,0.414,,,,,2023-02-01 15:33:18,00:04:41,,,MAD,YW,Business,SH,,,,Promoter,10,SURVEYS PROCESSED,2,35-44,10.000,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,9.000,9.000,,10.000,9.000,,9.000,10.000,,8.000,9.000,,10.000,,,,,6.000,No,,1.000,No se realmente que mas podría hacer,No se realmente que mas podría hacer,,Completed,Muy buena iniciativa la línea Seu-Madrid.,,,Iberia's mobile App,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Iberia website,"No, I tried but the lounge was not available","No, I did not want to use it",,,,,,No,,No se realmente que mas podría hacer,No se realmente que mas podría hacer,Male,Spain,2,Other (please specify),vacaciones 2 días,The only airline operating this route,"Provide Iberia with my survey responses, but n...",,,Complimentary complete meals,,,,,,,,,,,,,2023-04-28,2023-02-01 15:37:59,2023-01-29,OtherCabin+OtherHaul,,,,,,,I did not contact Iberia,,,I was travelling on my own,,,,,,,,,,,Submit feedback,IAG_IB,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,-5.000,0,0,0.000,0.000,0,0,No issue,No issue,ES,ES,,,,,,,,,,No issue,
3,9386758,9206003320,6841.000,2019-07-23,2019-07-23 23:55:00,2019-07-24 00:00:00,IB GOLD (OW ZAPHIRE),ES,ECJBA,,,,0,1,,N,ES,,,,MAD,BUE,1.000,1.000,346,IB,2019-07-23,2019-07-23 21:55:00,2019-07-23 00:16:00,2019-07-23 22:16:00,2019-07-24 00:00:00,2019-07-23 07:54:00,2019-07-24 00:00:00,MAD-EZE,EZE-MAD,LH,,,ECONOMY_LH,0.731,1.593,J4541,13691753,,,2019-07-26 07:08:24,00:00:00,ENRIQUE.ENPI@GMAIL.COM,MAD,EZE,IB,Economy,LH,ENRIQUE,MADRID,IZQUIERDO,Promoter,9,SURVEYS MARITZ,2,45-54,8.000,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,8.000,8.000,,8.000,8.000,8.000,8.000,9.000,9.000,9.000,7.000,6.000,8.000,,8.000,,8.000,9.000,No,,,,,Yes,,,,,,,Airline's check-in desk,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Telephone directly with airline,Yes,Yes,Yes,Yes,,,,No,,,,M,ES,,Business/work,,Iberia is a brand I trust,Responses linked to customer details,Yes,Direct from terminal via passenger boarding br...,Complimentary complete meals,No,,,,,,,,,,Directly to the terminal - passenger boarding ...,8.000,2023-02-15,2019-07-26 07:08:24,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,9.000,,16-21,2019-07-23,2019-07-24,2019-07-24,2019-07-25,2019-07-25,D,Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:6...,190.210.131.81,9.000,,,,,,,,,,,,,,,,,,,,,,,,,,9.000,10.000,,,,,,,,,8.000,8.000,Check-in desk,Yes,8.000,9.000,8.000,,,,,,,,,,,,,,,8.000,,,,,,No,6.000,,,,,6.000,,,,,8.000,8.000,,2.000,,,,,8.000,8.000,,,,2.000,Yes - website,PC/Laptop,,,8.000,,,,Oro,,,1.000,Spanish,,9.000,8.000,8.000,,,,,,,9.000,5.000,8.000,,,,,,21.000,0,0,0.000,0.000,0,0,No issue,Operative,MAD,BUE,ES,AR,AMERICA SUR,AMERICA SUR,,,,,AR,Operative,
4,9974098,9252098247,6845.000,2019-09-03,2019-09-03 12:10:00,2019-09-03 19:50:00,,ES,ECMXV,,,,0,2,,N,,,,,BCN,BUE,1.000,1.000,359,IB,2019-09-03,2019-09-03 10:10:00,2019-09-03 12:20:00,2019-09-03 10:20:00,2019-09-03 22:50:00,2019-09-03 19:41:00,2019-09-03 22:41:00,MAD-EZE,EZE-MAD,LH,,5.000,ECONOMY_LH,0.890,0.729,J47P7,,,,2019-09-10 07:25:17,00:00:00,BELTRAMINOJORGE@GMAIL.COM,MAD,EZE,IB,Economy,LH,DORAEMILIA,REDONDOZANETTA,,Detractor,6,SURVEYS MARITZ,2,65+,8.000,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,8.000,8.000,,7.000,8.000,,7.000,8.000,8.000,8.000,8.000,,8.000,,6.000,,,,No,,,,,,,,,,,,Airline's check-in desk,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Airline website,No,,,,,,,No,,,,,,,Holiday,,Value for money offer on this flight,Responses linked to customer details,No,Direct from terminal via passenger boarding br...,Complimentary complete meals,No,,,,,Yes,,,,,Directly to the terminal - passenger boarding ...,8.000,2023-02-15,2019-09-10 07:25:17,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,7.000,,2019-09-03,2019-09-06,2019-09-06,2019-09-09,2019-09-09,D,Mozilla/5.0 (Windows NT 6.1; Win64; x64) Apple...,190.183.202.133,6.000,,,,,,,,,,,,,,,,,,,,,,,,,,7.000,7.000,,,,,,,,PC/Laptop,6.000,,Check-in desk,Yes,8.000,8.000,,,,,,,,,,7.000,8.000,,,,,0.000,,,,,Yes,No,7.000,8.000,8.000,8.000,8.000,7.000,7.000,,,,,,,,,,,,,,,,,2.000,No,,,,,,,,Not a member,un poquito mas de espacio en los asientos,un poquito mas de espacio en los asientos,2.000,,,8.000,8.000,8.000,,,,,,,,7.000,,,,,,,10.000,0,0,0.000,0.000,0,0,No issue,Operative,BCN,BUE,ES,AR,AMERICA SUR,AMERICA SUR,,,,,AR,No issue,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1023333,65748044,10042829410,3264.000,2024-03-28,2024-03-28 16:10:00,2024-03-28 18:25:00,IB CLASSIC,ES,ECJLI,27F,False,1.000,0,3,,False,,DIR-ONLINE,WEB,,MAD,MXP,2.000,1.000,321,IB,2024-03-28,2024-03-28 15:10:00,2024-03-28 16:11:00,2024-03-28 15:11:00,2024-03-28 17:25:00,2024-03-28 18:14:00,2024-03-28 17:14:00,MAD-MXP,MAD-MXP,MH,,4.000,ECONOMY_SH+MH,1.154,1.434,M9TRP,51465508,325519285.000,751426422701.000,2024-03-31 17:20:35,00:05:43,NEMRIOSS-IBERIA@YAHOO.ES,MAD,MXP,IB,Economy,MH,JUANDAVID,REQUEJORODRIGUEZ,,Promoter,9,SURVEYS PROCESSED,2,35-44,10.000,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,9.000,7.000,,8.000,9.000,,9.000,10.000,9.000,8.000,8.000,,9.000,,,,,6.000,No,,,,,"No, I didn't want to pay for it / poor value f...",Completed,Puntualidad y trato. Lo único que no me gusta ...,,,,Iberia's website,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Iberia website,,,Yes,"No, I didn't want to pay for it / poor value f...",,,Connected to another flight from [DestinationA...,,,,,Male,Spain,4,Visiting friends or family,,Value for money offer on this flight,Provide Iberia with my survey responses linked...,,,Food and drink items to purchase,,,,,,,,,,,,,2024-04-11,2024-03-31 17:26:18,2024-03-28,Cabin+OtherHaul,,,,,,,I did not contact Iberia,,,,,,With children aged 6 to 12,,With my spouse or partner,,,,,Yes,Submit feedback,IAG_IB,,Check-in,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.000,1.000,0,0,0.000,0.000,0,0,No issue,Operative,ES,IT,ES,IT,EUROPA,EUROPA CEE,75.000,1426422701.000,1.000,44.150,IT,No issue,IB
1023334,65792152,10042830815,3166.000,2024-03-28,2024-03-28 15:50:00,2024-03-28 17:15:00,IB CLASSIC,ES,ECNVR,32A,False,0.000,0,1,,False,,DIR-ONLINE,MOB,,MAD,LHR,2.000,1.000,350,IB,2024-03-28,2024-03-28 14:50:00,2024-03-28 15:50:00,2024-03-28 14:50:00,2024-03-28 17:15:00,2024-03-28 17:09:00,2024-03-28 17:09:00,MAD-LHR,LHR-MAD,MH,,4.000,ECONOMY_SH+MH,1.154,1.434,,,,,2024-04-02 08:41:54,00:05:27,,MAD,LHR,IB,Economy,MH,,,,Promoter,10,SURVEYS PROCESSED,2,35-44,10.000,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,9.000,,10.000,10.000,,10.000,10.000,10.000,10.000,7.000,10.000,10.000,,,,,10.000,No,,,,,"No, I didn't want to use it",Completed,Me gustan sus aviones y lo segura que me sient...,,,Iberia's mobile App,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Iberia website,,,Yes,"No, I didn't want to use it",,,Connected to another flight from [DestinationA...,,,,,Male,Spain,2,Short break (up to 5 nights),,Previous good experience with Iberia,"Provide Iberia with my survey responses, but n...",,,Food and drink items to purchase,,,,,,,,,,,,,2024-04-11,2024-04-02 08:47:21,2024-03-28,Cabin+OtherHaul,,,,,,,I did not contact Iberia,,,,,,,,With my spouse or partner,,,,,,Submit feedback,IAG_IB,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.000,0,0,0.000,0.000,0,0,No issue,No issue,ES,GB,ES,GB,EUROPA,EUROPA CEE,75.000,1425965772.000,1.000,128.250,GB,No issue,IB
1023335,65793541,10042985930,6588.000,2024-03-28,2024-03-28 22:30:00,2024-03-29 14:20:00,,ES,ECOAY,43K,True,0.000,0,2,,False,,DIR-ONLINE,COUNTER,,CLO,MAD,4.000,2.000,350,IB,2024-03-29,2024-03-29 03:30:00,2024-03-28 22:30:00,2024-03-29 03:30:00,2024-03-29 13:20:00,2024-03-29 13:45:00,2024-03-29 12:45:00,BOG-MAD,BOG-MAD,LH,,4.000,ECONOMY_LH,0.796,1.072,MJFXZ,,308529810.000,751422198016.000,2024-04-02 16:01:02,00:19:21,DORAELENA985@GMAIL.COM,BOG,MAD,IB,Economy,LH,MARTAMILENA,ARTUROOROZCO,,Detractor,6,SURVEYS PROCESSED,2,55-64,9.000,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Disembarking the aircraft was slow,,,,,,,,,,,10.000,9.000,,10.000,10.000,,10.000,7.000,9.000,10.000,9.000,5.000,8.000,10.000,,,,,Yes,,,,,"No, I didn't want to use it",Completed,Mi madre es una mujer de 86 años y viajó como ...,,,Iberia's mobile App,,,,,,,,,,An issue disembarking or at your arrival airport,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,"No, I decided not to take any action",,,,,Via a travel agent,,"No, I did not want to use it",Yes,"No, I didn't want to use it",,Connected from another flight into [OriginAirp...,Connected to another flight from [DestinationA...,,,,,Female,Colombia,4,Visiting friends or family,,Iberia is a brand I trust,Provide Iberia with my survey responses linked...,,,Complimentary complete meals,,,,,,,,,,,,,2024-04-11,2024-04-02 16:20:23,2024-03-29,Cabin+Haul,,,,,,I contacted Iberia using other channels (pleas...,,Ya he viajado con Iberia,,,,,,,,Other adult party (6 people or less),,,,Yes,Submit feedback,IAG_IB,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.000,0.000,0,0,0.000,0.000,1,0,Operative,Operative,CO,ES,CO,ES,AMERICA SUR,AMERICA SUR,75.000,1422198016.000,2.000,397.430,CO,Operative,IB
1023336,65748943,10042836715,8756.000,2024-03-28,2024-03-28 09:10:00,2024-03-28 11:30:00,IB CLASSIC,ES,ECMPA,03F,True,0.000,0,2,,False,,DIR-ONLINE,WEB,,MAD,BLQ,1.000,1.000,CRK,IB,2024-03-28,2024-03-28 08:10:00,2024-03-28 09:10:00,2024-03-28 08:10:00,2024-03-28 10:30:00,2024-03-28 11:15:00,2024-03-28 10:15:00,MAD-BLQ,BLQ-MAD,MH,,4.000,BUSINESS_SH+MH,0.806,1.030,H6SVS,32051724,329642606.000,751427369112.000,2024-03-31 21:39:21,00:03:12,DANIELMOLERA@YAHOO.ES,MAD,BLQ,YW,Business,MH,DANIELA,MOLERASEVILLANO,,Promoter,10,SURVEYS PROCESSED,2,45-54,10.000,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,10.000,10.000,,10.000,10.000,10.000,10.000,6.000,6.000,8.000,5.000,,10.000,,10.000,,,10.000,No,,,,,,Completed,,,,Iberia's mobile App,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Iberia website,Yes,"No, I did not want to use it",,,,,,No,,,,Female,Spain,3,Short break (up to 5 nights),,Value for money offer on this flight,Provide Iberia with my survey responses linked...,,,Complimentary complete meals,,,,,,,,,,,,,2024-04-11,2024-03-31 21:42:33,2024-03-28,OtherCabin+OtherHaul,,By Phone,,,,,,,,,,,With children aged 6 to 12,,,,,,,Yes,Continue,IAG_IB,,,,,,,,,,,,,Iberia Plus loyalty program,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Volume of accrued Avios with my flight,,,,,,,10.000,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.000,0.000,0,0,0.000,0.000,0,0,No issue,No issue,ES,IT,ES,IT,EUROPA,EUROPA CEE,75.000,1427369112.000,1.000,98.800,IT,No issue,IB


In [37]:


    # READ PREVIOUS NPS DATA (FOR INCREMENTAL)
    yesterday_s3_keys = [item.key for item in s3_resource.Bucket(S3_BUCKET_NPS).objects.filter(Prefix=yesterday_nps_surveys_prefix)]
    yesterday_preprocess_paths = [f"s3://{S3_BUCKET_NPS}/{key}" for key in yesterday_s3_keys]

    df_nps_yesterday = pd.DataFrame()
    for file in yesterday_preprocess_paths:
        df = pd.read_csv(file)
        df_nps_yesterday = pd.concat([df_nps_yesterday, df], axis=0)
    df_nps_yesterday = df_nps_yesterday.reset_index(drop=True)



In [38]:
    # INCREMENTAL NPS  
    df_nps_incremental = pd.merge(df_nps_historic, df_nps_yesterday, how='left', indicator=True, on=df_nps_historic.columns.tolist())
    df_nps_incremental = df_nps_incremental[df_nps_incremental['_merge'] == 'left_only']
    df_nps_incremental = df_nps_incremental.drop(columns=['_merge'])
    df_nps_incremental = df_nps_incremental.reset_index(drop=True)

In [39]:
# READ LF DATA SOURCE
    # lf_dir = 's3://ibdata-prod-ew1-s3-customer/customer/load_factor_to_s3_nps_model/'    
load_factor_prefix = 's3://ibdata-prod-ew1-s3-customer/customer/load_factor_to_s3_nps_model/' 

    # Assume rol for prod
sts_client = boto3.client('sts')
assumed_role = sts_client.assume_role(
    RoleArn="arn:aws:iam::320714865578:role/ibdata-prod-role-assume-customer-services-from-ibdata-aip-prod",
    RoleSessionName="test"
)
credentials = assumed_role['Credentials']
fs = s3fs.S3FileSystem(key=credentials['AccessKeyId'], secret=credentials['SecretAccessKey'], token=credentials['SessionToken'])

    # Listall the files
load_factor_list = fs.ls(load_factor_prefix)
print(load_factor_list)
    
dataframes = []
for file_path in load_factor_list:
    try:
        file_info = fs.info(file_path)
        if file_info['Size'] == 0:
            continue

        with fs.open(f's3://{file_path}') as f:
            if today_date_str in file_path:
                df_lf_incremental = pd.read_csv(f)
            df = pd.read_csv(f)
            dataframes.append(df)
    except pd.errors.EmptyDataError:
        print(f"Caught EmptyDataError for file: {file_path}, skipping...")
    except Exception as e:
        print(f"Error reading file {file_path}: {e}")

if dataframes:
    df_lf_historic = pd.concat(dataframes, ignore_index=True)
else:
    df_lf_historic = pd.DataFrame()

['ibdata-prod-ew1-s3-customer/customer/load_factor_to_s3_nps_model/nps_operation_data_2024-03-02000', 'ibdata-prod-ew1-s3-customer/customer/load_factor_to_s3_nps_model/nps_operation_data_2024-03-03000', 'ibdata-prod-ew1-s3-customer/customer/load_factor_to_s3_nps_model/nps_operation_data_2024-03-04000', 'ibdata-prod-ew1-s3-customer/customer/load_factor_to_s3_nps_model/nps_operation_data_2024-03-05000', 'ibdata-prod-ew1-s3-customer/customer/load_factor_to_s3_nps_model/nps_operation_data_2024-03-06000', 'ibdata-prod-ew1-s3-customer/customer/load_factor_to_s3_nps_model/nps_operation_data_2024-03-07000', 'ibdata-prod-ew1-s3-customer/customer/load_factor_to_s3_nps_model/nps_operation_data_2024-03-08000', 'ibdata-prod-ew1-s3-customer/customer/load_factor_to_s3_nps_model/nps_operation_data_2024-03-09000', 'ibdata-prod-ew1-s3-customer/customer/load_factor_to_s3_nps_model/nps_operation_data_2024-03-10000', 'ibdata-prod-ew1-s3-customer/customer/load_factor_to_s3_nps_model/nps_operation_data_2024-

In [40]:
df_lf_historic

Unnamed: 0,flight_date_local,flight_date_utc,operating_carrier,op_flight_num,boardpoint_stn_code_actual,offpoint_stn_code_actual,haul,calc_dep_diff,punctuality,capacity_business,pax_business,capacity_premium_ec,pax_premium_ec,capacity_economy,pax_economy
0,2023-09-20,2023-09-20,IB,3237,FCO,MAD,SH,72,OTP15,12,12,0,0,199,192
1,2023-10-15,2023-10-15,IB,6301,MAD,SJU,LH,3,,19,18,0,0,269,266
2,2023-10-16,2023-10-16,IB,3148,MAD,PRG,SH,-2,,16,16,0,0,156,150
3,2023-10-16,2023-10-16,IB,6461,MAD,GYE,LH,-1,,19,19,0,0,269,268
4,2023-10-16,2023-10-16,IB,3149,PRG,MAD,SH,-12,,12,12,0,0,162,162
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
681819,2024-04-14,2024-04-15,IB,6402,MEX,MAD,LH,-1,,31,31,28,28,293,286
681820,2024-04-14,2024-04-15,IB,6342,SAL,MAD,LH,3,,19,19,0,0,269,243
681821,2024-04-14,2024-04-15,IB,6118,MIA,MAD,LH,13,,29,26,21,20,242,216
681822,2024-04-14,2024-04-15,IB,6148,DFW,MAD,LH,16,OTP15,29,18,21,14,242,192


In [41]:
    # 1. Filter dataframes by carrier code.
    print("userlog: ETL 1.0 Filter dataframes by carrier code.")
    df_nps_historic['haul'] = df_nps_historic['haul'].replace('MH', 'SH')
    df_nps_incremental['haul'] = df_nps_incremental['haul'].replace('MH', 'SH')
    # NPS HISTORIC
    condition_1 = (df_nps_historic['operating_airline_code'].isin(['IB', 'YW']))
    condition_2 = ((df_nps_historic['invitegroup_ib'] != 3) | (df_nps_historic['invitegroup_ib'].isnull()))
    condition_3 = (df_nps_historic['invitegroup'] == 2)
    
    df_nps_historic = df_nps_historic.loc[condition_1 & (condition_2 & condition_3)]

    # NPS INCREMENTAL
    condition_1 = (df_nps_incremental['operating_airline_code'].isin(['IB', 'YW']))
    condition_2 = ((df_nps_incremental['invitegroup_ib'] != 3) | (df_nps_incremental['invitegroup_ib'].isnull()))
    condition_3 = (df_nps_incremental['invitegroup'] == 2)

    df_nps_incremental = df_nps_incremental.loc[condition_1 & (condition_2 & condition_3)]

    # LOAD FACTOR HISTORIC
    df_lf_historic = df_lf_historic.loc[(df_lf_historic['operating_carrier'].isin(['IB', 'YW']))]

    # LOAD FACTOR INCREMENTAL
    df_lf_incremental = df_lf_incremental.loc[(df_lf_incremental['operating_carrier'].isin(['IB', 'YW']))]


    # 2. Transform date column to datetime format
    print("userlog: ETL 2.0 Transform date column to datetime format.")
    delay_features = ['real_departure_time_local', 'scheduled_departure_time_local']
    for feat in delay_features:
        df_nps_historic[feat] = pd.to_datetime(df_nps_historic[feat], format="%Y%m%d %H:%M:%S", errors = 'coerce')
        df_nps_incremental[feat] = pd.to_datetime(df_nps_incremental[feat], format="%Y%m%d %H:%M:%S", errors = 'coerce')
            
    df_nps_historic['delay_departure'] = (df_nps_historic['real_departure_time_local'] - df_nps_historic['scheduled_departure_time_local']).dt.total_seconds()/60
    df_nps_incremental['delay_departure'] = (df_nps_incremental['real_departure_time_local'] - df_nps_incremental['scheduled_departure_time_local']).dt.total_seconds()/60
    
    # NPS
    df_nps_historic['date_flight_local'] = pd.to_datetime(df_nps_historic['date_flight_local'])
    df_nps_incremental['date_flight_local'] = pd.to_datetime(df_nps_incremental['date_flight_local'])

    # Load Factor
    df_lf_historic['flight_date_local'] = pd.to_datetime(df_lf_historic['flight_date_local'])
    df_lf_incremental['flight_date_local'] = pd.to_datetime(df_lf_incremental['flight_date_local'])

    # 3. Filter out covid years
    print("userlog: ETL 3.0 Filter out covid years.")
    # NPS (historic)
    df_nps_historic = df_nps_historic[df_nps_historic['date_flight_local'].dt.year >= 2019]
    df_nps_historic = df_nps_historic[~df_nps_historic['date_flight_local'].dt.year.isin([2020, 2021])]
    df_nps_incremental = df_nps_incremental[df_nps_incremental['date_flight_local'].dt.year >= 2019]
    df_nps_incremental = df_nps_incremental[~df_nps_incremental['date_flight_local'].dt.year.isin([2020, 2021])]
    # Load factor (historic)
    df_lf_historic = df_lf_historic[df_lf_historic['flight_date_local'].dt.year >= 2019]
    df_lf_historic = df_lf_historic[~df_lf_historic['flight_date_local'].dt.year.isin([2020, 2021])]

    # 4. Create otp, promoter, detractor and load factor columns.
    print("userlog: ETL 4.0 Create otp, promoter, detractor and load factor columns.")
    # OTP
    df_nps_historic['otp15_takeoff'] = (df_nps_historic['delay_departure'] > 15).astype(int)
    df_nps_incremental['otp15_takeoff'] = (df_nps_incremental['delay_departure'] > 15).astype(int)

    # Promoter and Detractor columns
    df_nps_historic["promoter_binary"] = df_nps_historic["nps_category"].apply(lambda x: 1 if x == "Promoter" else 0)
    df_nps_historic["detractor_binary"] = df_nps_historic["nps_category"].apply(lambda x: 1 if x == "Detractor" else 0)
    df_nps_incremental["promoter_binary"] = df_nps_incremental["nps_category"].apply(lambda x: 1 if x == "Promoter" else 0)
    df_nps_incremental["detractor_binary"] = df_nps_incremental["nps_category"].apply(lambda x: 1 if x == "Detractor" else 0)

    # Load Factor
    df_lf_historic['load_factor_business'] = df_lf_historic['pax_business'] / df_lf_historic['capacity_business']
    df_lf_historic['load_factor_premium_ec'] = df_lf_historic['pax_premium_ec'] / df_lf_historic['capacity_premium_ec']
    df_lf_historic['load_factor_economy'] = df_lf_historic['pax_economy'] / df_lf_historic['capacity_economy']

    df_lf_incremental['load_factor_business'] = df_lf_incremental['pax_business'] / df_lf_incremental['capacity_business']
    df_lf_incremental['load_factor_premium_ec'] = df_lf_incremental['pax_premium_ec'] / df_lf_incremental['capacity_premium_ec']
    df_lf_incremental['load_factor_economy'] = df_lf_incremental['pax_economy'] / df_lf_incremental['capacity_economy']

    

userlog: ETL 1.0 Filter dataframes by carrier code.
userlog: ETL 2.0 Transform date column to datetime format.
userlog: ETL 3.0 Filter out covid years.
userlog: ETL 4.0 Create otp, promoter, detractor and load factor columns.


In [42]:
    # 5. Merge dataframes.
    print("userlog: ETL 5.0 Merge dataframes.")
    cabin_to_load_factor_column = {
        'Economy': 'load_factor_economy',
        'Business': 'load_factor_business',
        'Premium Economy': 'load_factor_premium_ec'
    }

    # HISTORIC
    df_lf_historic.columns = ['date_flight_local' if x=='flight_date_local' else 
                                    'operating_airline_code' if x=='operating_carrier' else
                                    'surveyed_flight_number' if x=='op_flight_num' else
                                    x for x in df_lf_historic.columns]
    
    print(df_lf_historic.dtypes)


    

userlog: ETL 5.0 Merge dataframes.
date_flight_local             datetime64[ns]
flight_date_utc                       object
operating_airline_code                object
surveyed_flight_number                 int64
boardpoint_stn_code_actual            object
offpoint_stn_code_actual              object
haul                                  object
calc_dep_diff                          int64
punctuality                           object
capacity_business                      int64
pax_business                           int64
capacity_premium_ec                    int64
pax_premium_ec                         int64
capacity_economy                       int64
pax_economy                            int64
load_factor_business                 float64
load_factor_premium_ec               float64
load_factor_economy                  float64
dtype: object


In [43]:
df_lf_historic

Unnamed: 0,date_flight_local,flight_date_utc,operating_airline_code,surveyed_flight_number,boardpoint_stn_code_actual,offpoint_stn_code_actual,haul,calc_dep_diff,punctuality,capacity_business,pax_business,capacity_premium_ec,pax_premium_ec,capacity_economy,pax_economy,load_factor_business,load_factor_premium_ec,load_factor_economy
0,2023-09-20,2023-09-20,IB,3237,FCO,MAD,SH,72,OTP15,12,12,0,0,199,192,1.000,,0.965
1,2023-10-15,2023-10-15,IB,6301,MAD,SJU,LH,3,,19,18,0,0,269,266,0.947,,0.989
2,2023-10-16,2023-10-16,IB,3148,MAD,PRG,SH,-2,,16,16,0,0,156,150,1.000,,0.962
3,2023-10-16,2023-10-16,IB,6461,MAD,GYE,LH,-1,,19,19,0,0,269,268,1.000,,0.996
4,2023-10-16,2023-10-16,IB,3149,PRG,MAD,SH,-12,,12,12,0,0,162,162,1.000,,1.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
681819,2024-04-14,2024-04-15,IB,6402,MEX,MAD,LH,-1,,31,31,28,28,293,286,1.000,1.000,0.976
681820,2024-04-14,2024-04-15,IB,6342,SAL,MAD,LH,3,,19,19,0,0,269,243,1.000,,0.903
681821,2024-04-14,2024-04-15,IB,6118,MIA,MAD,LH,13,,29,26,21,20,242,216,0.897,0.952,0.893
681822,2024-04-14,2024-04-15,IB,6148,DFW,MAD,LH,16,OTP15,29,18,21,14,242,192,0.621,0.667,0.793


In [44]:
df_lf_historic['date_flight_local']=pd.to_datetime(df_lf_historic['date_flight_local'])
df_lf_historic['surveyed_flight_number'] = df_lf_historic['surveyed_flight_number'].astype('float64')

df=df_lf_historic.copy()

# List of columns to transform
load_factor_columns = ['load_factor_business', 'load_factor_premium_ec', 'load_factor_economy']

# Automatically determine id_vars by excluding load_factor_columns from all columns
id_vars = [col for col in df.columns if col not in load_factor_columns]

# Reshaping the DataFrame while dynamically keeping all other columns
df = pd.melt(df, id_vars=id_vars, 
                  value_vars=load_factor_columns,
                  var_name='cabin_in_surveyed_flight', value_name='load_factor')



In [45]:
df

Unnamed: 0,date_flight_local,flight_date_utc,operating_airline_code,surveyed_flight_number,boardpoint_stn_code_actual,offpoint_stn_code_actual,haul,calc_dep_diff,punctuality,capacity_business,pax_business,capacity_premium_ec,pax_premium_ec,capacity_economy,pax_economy,cabin_in_surveyed_flight,load_factor
0,2023-09-20,2023-09-20,IB,3237.000,FCO,MAD,SH,72,OTP15,12,12,0,0,199,192,load_factor_business,1.000
1,2023-10-15,2023-10-15,IB,6301.000,MAD,SJU,LH,3,,19,18,0,0,269,266,load_factor_business,0.947
2,2023-10-16,2023-10-16,IB,3148.000,MAD,PRG,SH,-2,,16,16,0,0,156,150,load_factor_business,1.000
3,2023-10-16,2023-10-16,IB,6461.000,MAD,GYE,LH,-1,,19,19,0,0,269,268,load_factor_business,1.000
4,2023-10-16,2023-10-16,IB,3149.000,PRG,MAD,SH,-12,,12,12,0,0,162,162,load_factor_business,1.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1548550,2024-04-14,2024-04-15,IB,6402.000,MEX,MAD,LH,-1,,31,31,28,28,293,286,load_factor_economy,0.976
1548551,2024-04-14,2024-04-15,IB,6342.000,SAL,MAD,LH,3,,19,19,0,0,269,243,load_factor_economy,0.903
1548552,2024-04-14,2024-04-15,IB,6118.000,MIA,MAD,LH,13,,29,26,21,20,242,216,load_factor_economy,0.893
1548553,2024-04-14,2024-04-15,IB,6148.000,DFW,MAD,LH,16,OTP15,29,18,21,14,242,192,load_factor_economy,0.793


In [46]:
# Replacing the column names in 'cabin_in_surveyed_flight' with the desired cabin types
df['cabin_in_surveyed_flight'] = df['cabin_in_surveyed_flight'].map({
    'load_factor_business': 'Business',
    'load_factor_premium_ec': 'Premium Economy',
    'load_factor_economy': 'Economy'
})



In [47]:
df

Unnamed: 0,date_flight_local,flight_date_utc,operating_airline_code,surveyed_flight_number,boardpoint_stn_code_actual,offpoint_stn_code_actual,haul,calc_dep_diff,punctuality,capacity_business,pax_business,capacity_premium_ec,pax_premium_ec,capacity_economy,pax_economy,cabin_in_surveyed_flight,load_factor
0,2023-09-20,2023-09-20,IB,3237.000,FCO,MAD,SH,72,OTP15,12,12,0,0,199,192,Business,1.000
1,2023-10-15,2023-10-15,IB,6301.000,MAD,SJU,LH,3,,19,18,0,0,269,266,Business,0.947
2,2023-10-16,2023-10-16,IB,3148.000,MAD,PRG,SH,-2,,16,16,0,0,156,150,Business,1.000
3,2023-10-16,2023-10-16,IB,6461.000,MAD,GYE,LH,-1,,19,19,0,0,269,268,Business,1.000
4,2023-10-16,2023-10-16,IB,3149.000,PRG,MAD,SH,-12,,12,12,0,0,162,162,Business,1.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1548550,2024-04-14,2024-04-15,IB,6402.000,MEX,MAD,LH,-1,,31,31,28,28,293,286,Economy,0.976
1548551,2024-04-14,2024-04-15,IB,6342.000,SAL,MAD,LH,3,,19,19,0,0,269,243,Economy,0.903
1548552,2024-04-14,2024-04-15,IB,6118.000,MIA,MAD,LH,13,,29,26,21,20,242,216,Economy,0.893
1548553,2024-04-14,2024-04-15,IB,6148.000,DFW,MAD,LH,16,OTP15,29,18,21,14,242,192,Economy,0.793


In [48]:
df_nps_historic[['date_flight_local', 'operating_airline_code', 'surveyed_flight_number', 'cabin_in_surveyed_flight', 'haul']]



Unnamed: 0,date_flight_local,operating_airline_code,surveyed_flight_number,cabin_in_surveyed_flight,haul
1,2023-01-04,YW,8319.000,Economy,SH
2,2023-01-29,YW,8327.000,Business,SH
3,2019-07-23,IB,6841.000,Economy,LH
4,2019-09-03,IB,6845.000,Economy,LH
6,2019-02-16,IB,440.000,Economy,SH
...,...,...,...,...,...
1023333,2024-03-28,IB,3264.000,Economy,SH
1023334,2024-03-28,IB,3166.000,Economy,SH
1023335,2024-03-28,IB,6588.000,Economy,LH
1023336,2024-03-28,YW,8756.000,Business,SH


In [49]:
df[['date_flight_local', 'operating_airline_code', 'surveyed_flight_number', 'cabin_in_surveyed_flight', 'haul']]

Unnamed: 0,date_flight_local,operating_airline_code,surveyed_flight_number,cabin_in_surveyed_flight,haul
0,2023-09-20,IB,3237.000,Business,SH
1,2023-10-15,IB,6301.000,Business,LH
2,2023-10-16,IB,3148.000,Business,SH
3,2023-10-16,IB,6461.000,Business,LH
4,2023-10-16,IB,3149.000,Business,SH
...,...,...,...,...,...
1548550,2024-04-14,IB,6402.000,Economy,LH
1548551,2024-04-14,IB,6342.000,Economy,LH
1548552,2024-04-14,IB,6118.000,Economy,LH
1548553,2024-04-14,IB,6148.000,Economy,LH


In [55]:
df_nps_incremental.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7535 entries, 0 to 7534
Columns: 580 entries, respondent_id to detractor_binary
dtypes: datetime64[ns](3), float64(113), int64(13), object(451)
memory usage: 33.4+ MB


In [56]:
intersection_df = pd.merge(
    df_nps_incremental,
    df,
    how='inner',
    on=['date_flight_local', 'surveyed_flight_number', 'cabin_in_surveyed_flight']
)


In [57]:
intersection_df['load_factor'].notna().sum()

1060

In [53]:
df_nps_incremental['haul'] = df_nps_incremental['haul'].replace('MH', 'SH')

In [54]:
df_long['date_flight_local']=pd.to_datetime(df_long['date_flight_local'])
df_long['surveyed_flight_number'] = df_long['surveyed_flight_number'].astype('float64')

NameError: name 'df_long' is not defined

In [None]:
    df_historic = pd.merge(df_nps_historic, df_long, 
                        how='left', 
                        on=['date_flight_local', 'operating_airline_code', 'surveyed_flight_number', 'cabin_in_surveyed_flight', 'haul'])

In [None]:
df_historic['load_factor'].notna().sum()

In [None]:
    # 6. Filter out final columns for the model
    print("userlog: ETL 6.0 Filter out final columns for the model")
    features_dummy = ['ticket_price', 'load_factor', 'otp15_takeoff'] + ['bkg_200_journey_preparation', 'pfl_100_checkin', 
                  'pfl_200_security', 'pfl_300_lounge', 'pfl_500_boarding', 'ifl_300_cabin', 
                  'ifl_200_flight_crew_annoucements', 'ifl_600_wifi', 'ifl_500_ife', 'ifl_400_food_drink', 
                  'ifl_100_cabin_crew', 'arr_100_arrivals', 'con_100_connections', 
                  'loy_200_loyalty_programme', 'img_310_ease_contact_phone']

    labels = ['promoter_binary', 'detractor_binary']

    df_historic = df_historic[['respondent_id' , 'date_flight_local'] + features_dummy + labels]
    df_incremental = df_incremental[['respondent_id' , 'date_flight_local'] + features_dummy + labels]

    df_historic = df_historic.drop_duplicates()
    df_incremental = df_incremental.drop_duplicates()
    
    print("userlog: Size of resulting df_historic:", df_historic.shape)
    print("userlog: Size of resulting df_incremental:", df_incremental.shape)