# Introduction

In this notebook I create a way of using the aggregated model in the explainability dashboard against targets. Basically, for each day through the year, it creates different aggregations for that day against the previous ones, and then it creates a prediction with the targets model.

### Installations and imports

In [2]:
!pip install plotly s3fs darts shap lightgbm minepy dcor deap

Collecting darts
  Using cached darts-0.30.0-py3-none-any.whl.metadata (52 kB)
Collecting shap
  Using cached shap-0.44.1-cp38-cp38-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (24 kB)
Collecting lightgbm
  Using cached lightgbm-4.4.0-py3-none-manylinux_2_28_x86_64.whl.metadata (19 kB)
Collecting minepy
  Using cached minepy-1.2.6-cp38-cp38-linux_x86_64.whl
Collecting dcor
  Using cached dcor-0.6-py3-none-any.whl.metadata (6.2 kB)
Collecting deap
  Using cached deap-1.4.1-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting holidays>=0.11.1 (from darts)
  Using cached holidays-0.53-py3-none-any.whl.metadata (23 kB)
Collecting nfoursid>=1.0.0 (from darts)
  Using cached nfoursid-1.0.1-py3-none-any.whl.metadata (1.9 kB)
Collecting pmdarima>=1.8.0 (from darts)
  Using cached pmdarima-2.0.4-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_28_x86_64.wh

In [3]:
# General
import pandas as pd
pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
import os
import numpy as np
# import xlsxwriter
import datetime
import boto3
import s3fs

# Sklearn
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
from sklearn.impute import SimpleImputer
from sklearn.model_selection import GridSearchCV

# Plots
import matplotlib.pyplot as plt
import seaborn as sns

#Warnings
import warnings
warnings.filterwarnings("ignore")


import darts
from darts import TimeSeries
from darts.utils.timeseries_generation import (
    gaussian_timeseries,
    linear_timeseries,
    sine_timeseries,
)

from darts.metrics import mape, smape, mae
from darts.dataprocessing.transformers import Scaler
from darts.utils.timeseries_generation import datetime_attribute_timeseries

from sklearn.linear_model import BayesianRidge
from sklearn.ensemble import RandomForestRegressor

import lightgbm

from darts.models import LightGBMModel

from darts.models import LightGBMModel, RandomForest, LinearRegressionModel
from darts.utils.statistics import check_seasonality, plot_acf, plot_residuals_analysis

from darts.explainability.shap_explainer import ShapExplainer
import pickle
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from darts.models import LinearRegressionModel, LightGBMModel, RandomForest
from calendar import month_name as mn
import os

import shap


KeyboardInterrupt: 

# 1. Aggregation logic

In [None]:
# Use own bucket and prefix
S3_BUCKET_NPS = 'iberia-data-lake' # In this case: iberia-data-lake
S3_BUCKET_NPS_PREFIX = 'customer/nps_explainability_model' # In this case: sagemaker/sagemaker-template

S3_BUCKET_LF = 'ibdata-prod-ew1-s3-customer'
S3_BUCKET_LF_PREFIX = 'customer/load_factor_to_s3_nps_model'

S3_PATH_READ_NPS = 'customer/nps_surveys/export_historic'
S3_PATH_READ_LF = "customer/load_factor_to_s3_nps_model"

insert_date_ci='2024-06-25'
today_date_str='2024-06-25'

## Read data

In [None]:
# READ NPS DATA SOURCE
# Read df_nps_surveys
s3_resource = boto3.resource("s3")

# READ TODAY DATA (HISTORIC NPS)
# List every object stored under today's partition of the NPS export.
today_nps_surveys_prefix = f'{S3_PATH_READ_NPS}/insert_date_ci={today_date_str}/'
s3_keys = [item.key for item in s3_resource.Bucket(S3_BUCKET_NPS).objects.filter(Prefix=today_nps_surveys_prefix)]
preprocess_paths = [f"s3://{S3_BUCKET_NPS}/{key}" for key in s3_keys]

# Read each file once and concatenate in a single call: growing a DataFrame with
# pd.concat inside the loop re-copies all previously read rows on every iteration.
nps_frames = [pd.read_csv(path) for path in preprocess_paths]
if nps_frames:
    df_nps_historic = pd.concat(nps_frames, axis=0).reset_index(drop=True)
else:
    # No files under the prefix: keep the original behaviour of an empty frame.
    df_nps_historic = pd.DataFrame()

In [None]:
# READ LF DATA SOURCE
# lf_dir = 's3://ibdata-prod-ew1-s3-customer/customer/load_factor_to_s3_nps_model/'    
load_factor_prefix = f's3://{S3_BUCKET_LF}/{S3_PATH_READ_LF}/'

# Assume the production role so the load-factor bucket can be read.
sts_client = boto3.client('sts')
assumed_role = sts_client.assume_role(
    RoleArn="arn:aws:iam::320714865578:role/ibdata-prod-role-assume-customer-services-from-ibdata-aip-prod",
    RoleSessionName="test"
)
credentials = assumed_role['Credentials']
fs = s3fs.S3FileSystem(key=credentials['AccessKeyId'], secret=credentials['SecretAccessKey'], token=credentials['SessionToken'])

# List all the files
load_factor_list = fs.ls(load_factor_prefix)

# print() does not apply %-formatting to its arguments, so interpolate explicitly.
print(f"userlog: Read historic load_factor data path {load_factor_prefix}.")
dataframes = []
for file_path in load_factor_list:
    try:
        # Zero-byte objects would make read_csv raise; skip them up front.
        file_info = fs.info(file_path)
        if file_info['Size'] == 0:
            print(f"Skipping empty file: {file_path}")
            continue

        with fs.open(f's3://{file_path}') as f:
            df = pd.read_csv(f)
            dataframes.append(df)
    except pd.errors.EmptyDataError:
        print(f"Caught EmptyDataError for file: {file_path}, skipping...")
    except Exception as e:
        # Best effort: report the unreadable file and keep loading the rest.
        print(f"Error reading file {file_path}: {e}")

if dataframes:
    df_lf_historic = pd.concat(dataframes, ignore_index=True)
else:
    df_lf_historic = pd.DataFrame()

## A little preprocess

In [None]:
condition_1 = (df_nps_historic['operating_airline_code'].isin(['IB', 'YW']))
condition_2 = ((df_nps_historic['invitegroup_ib'] != 3) | (df_nps_historic['invitegroup_ib'].isnull()))
condition_3 = (df_nps_historic['invitegroup'] == 2)

df_nps_historic = df_nps_historic.loc[condition_1 & (condition_2 & condition_3)]

df_lf_historic = df_lf_historic.loc[(df_lf_historic['operating_carrier'].isin(['IB', 'YW']))]

In [None]:
datetime_features = ['date_flight_local', 'scheduled_departure_time_local', 'scheduled_arrival_time_local', 'real_departure_time_local',
                     'real_arrival_time_local', 'started']
columns_to_cross_kpis=['cabin_in_surveyed_flight','haul']
columns_ext = ['tier_level', 'language_code', 'seat_no', 'volume_of_bags', 'number_of_child_in_the_booking', 'number_of_infant_in_the_booking',
              'number_of_people_in_the_booking', 'country_code', 'customer_journey_origin', 'customer_journey_destination', 'number_of_flights_in_journey',
              'order_of_flight_in_journey', 'marketing_airline_code', 'overall_haul', 'weight_category', 'ff_number', 'ticket_num', 'operating_airline_code',
               'nps_category', 'nps_100', 'group_age_survey', 'gender'] # invite_group

#'bkg_100_booking', 
touchpoints = ['bkg_200_journey_preparation', 'pfl_100_checkin', 'pfl_200_security', 'pfl_300_lounge',
               'pfl_500_boarding', 'ifl_300_cabin', 'ifl_200_flight_crew_annoucements', 'ifl_600_wifi', 'ifl_500_ife',
               'ifl_400_food_drink', 'ifl_100_cabin_crew', 'arr_100_arrivals', 'con_100_connections', 'pun_100_punctuality',
               'loy_200_loyalty_programme', 'inm_400_issues_response', 'img_310_ease_contact_phone']

# ,'img_320_ease_contact_ibplus_mail'
survey_fields = ['cla_600_wifi_t_f', 'tvl_journey_reason']

df_nps_historic['date_flight_local'] = pd.to_datetime(df_nps_historic['date_flight_local'])
df_lf_historic['flight_date_local'] = pd.to_datetime(df_lf_historic['flight_date_local'])

In [None]:
df_nps_historic = df_nps_historic[df_nps_historic['date_flight_local'].dt.year >= 2019]
df_nps_historic = df_nps_historic[~df_nps_historic['date_flight_local'].dt.year.isin([2020, 2021])]

df_lf_historic = df_lf_historic[~df_lf_historic['flight_date_local'].dt.year.isin([2020, 2021])]

In [None]:
delay_features = ['real_departure_time_local', 'scheduled_departure_time_local']
for feat in delay_features:
    df_nps_historic[feat] = pd.to_datetime(df_nps_historic[feat], format="%Y-%m-%d %H:%M:%S", errors = 'coerce')
            
df_nps_historic['delay_departure'] = (df_nps_historic['real_departure_time_local'] - df_nps_historic['scheduled_departure_time_local']).dt.total_seconds()/60

In [None]:
df_nps_historic['haul'] = df_nps_historic['haul'].replace('MH', 'SH')
#df_nps_historic['cabin_in_surveyed_flight'] = df_nps_historic['cabin_in_surveyed_flight'].replace('Premium Economy', 'Economy')# Load Factor
df_lf_historic['load_factor_business'] = df_lf_historic['pax_business'] / df_lf_historic['capacity_business']
df_lf_historic['load_factor_premium_ec'] = df_lf_historic['pax_premium_ec'] / df_lf_historic['capacity_premium_ec']
df_lf_historic['load_factor_economy'] = df_lf_historic['pax_economy'] / df_lf_historic['capacity_economy']

In [None]:
 # OTP
# OTP flags: 1 = departure delayed by more than N minutes, 0 = on time.
# A missing delay (unparseable/absent departure time) must stay missing: a plain
# `(delay > n).astype(int)` turns NaN into 0 and silently counts it as "on time".
delay_known = df_nps_historic['delay_departure'].notna()
for threshold in (15, 30, 60):
    is_late = (df_nps_historic['delay_departure'] > threshold).astype(float)
    df_nps_historic[f'otp{threshold}_takeoff'] = is_late.where(delay_known)

# Promoter and Detractor columns (1 when the category matches, 0 otherwise, including missing)
df_nps_historic["promoter_binary"] = df_nps_historic["nps_category"].eq("Promoter").astype(int)
df_nps_historic["detractor_binary"] = df_nps_historic["nps_category"].eq("Detractor").astype(int)

In [None]:
# Load Factor
df_lf_historic['load_factor_business'] = df_lf_historic['pax_business'] / df_lf_historic['capacity_business']
df_lf_historic['load_factor_premium_ec'] = df_lf_historic['pax_premium_ec'] / df_lf_historic['capacity_premium_ec']
df_lf_historic['load_factor_economy'] = df_lf_historic['pax_economy'] / df_lf_historic['capacity_economy']

In [None]:
# Which load-factor column applies to each surveyed cabin.
cabin_to_load_factor_column = {
    'Economy': 'load_factor_economy',
    'Business': 'load_factor_business',
    'Premium Economy': 'load_factor_premium_ec'
}

# HISTORIC
# Align the load-factor column names with the NPS survey names used as merge keys.
df_lf_historic = df_lf_historic.rename(columns={
    'flight_date_local': 'date_flight_local',
    'operating_carrier': 'operating_airline_code',
    'op_flight_num': 'surveyed_flight_number',
})

# NOTE(review): 'haul' is a merge key and 'MH' was recoded to 'SH' on the NPS side only;
# if the load-factor data still contains 'MH', those flights will not match - confirm.
df_historic = pd.merge(df_nps_historic, df_lf_historic,
                    how='left',
                    on=['date_flight_local', 'operating_airline_code', 'surveyed_flight_number', 'haul'])

# Pick the load factor of the cabin each respondent flew in. Assigning per cabin is
# vectorised and leaves NaN for cabins outside the mapping (or missing), where the
# previous row-wise dictionary lookup raised KeyError.
df_historic['load_factor'] = np.nan
for cabin_name, load_factor_column in cabin_to_load_factor_column.items():
    in_cabin = df_historic['cabin_in_surveyed_flight'] == cabin_name
    df_historic.loc[in_cabin, 'load_factor'] = df_historic.loc[in_cabin, load_factor_column]

In [None]:
df_historic['cabin_in_surveyed_flight']

In [None]:
df_nps_historic['delay_departure']

In [None]:
df_historic['cabin_in_surveyed_flight'].unique()

In [None]:
df_nps_historic['real_departure_time_local']

In [None]:
df_historic[df_historic['delay_departure']<0]['delay_departure'].min()

In [None]:
# Condition for dropping rows
condition = (df_historic['cabin_in_surveyed_flight'] == 'Premium Economy') & (df_historic['haul'] == 'SH')

# Keeping rows that do not meet the condition
df_historic = df_historic[~condition]

In [None]:
df_historic['respondent_id'].nunique()

In [None]:
df_historic = df_historic.drop_duplicates(subset='respondent_id', keep='first')

In [None]:
filtered_df = df_historic[df_historic['delay'] != df_historic['delay_departure']]

In [None]:
filtered_df['date_flight_local']=pd.to_datetime(filtered_df['date_flight_local'])

In [None]:
df=filtered_df[filtered_df['date_flight_local'].dt.year>=2023]

In [None]:
df_historic

In [None]:
import pandas as pd

# Assuming df_historic is your DataFrame and it has been properly imported
respondent_ids = [64986539, 64987164, 65097632, 64890118, 64642526]
filtered_df = df_historic[df_historic['respondent_id'].isin(respondent_ids)]
filtered_df[['respondent_id', 'otp15_takeoff', 'delay_departure']]

In [None]:
check = pd.read_csv('predictions (9).csv')
filtered_df = check[check['respondent_id'].isin(respondent_ids)]
filtered_df[['respondent_id', 'otp15_takeoff']]

In [None]:
filtered_df[['respondent_id', 'otp15_takeoff', 'delay_departure', 'date_flight_local']]

In [None]:
[col for col in df_historic.columns if 'issues' in col]

In [None]:
def plot_variable_correlations(corr_data, target_var):
    """Plot, as a horizontal bar chart, the correlation of every column with ``target_var``.

    Args:
        corr_data: DataFrame holding the variables to correlate.
        target_var: Name of the column the others are correlated against; it must be
            a numeric (or boolean) column of ``corr_data``.

    Only numeric/boolean columns take part in the correlation. Callers in this notebook
    pass frames that also contain a datetime column, which ``DataFrame.corr`` does not
    silently drop on recent pandas versions.
    """
    # methods = ['pearson', 'spearman', 'kendall']
    methods = ['pearson']
    numeric_data = corr_data.select_dtypes(include=['number', 'bool'])
    for method in methods:
        corr = numeric_data.corr(method=method)
        # One-column frame sorted from the strongest positive to the strongest negative.
        target_corr = corr[[target_var]].sort_values(by=target_var, ascending=False)

        plt.figure(figsize=(12, 8))
        sns.barplot(y=target_corr.index, x=target_corr[target_var], palette='coolwarm')
        plt.title(f'{method.capitalize()} Correlation with {target_var}')
        plt.xlabel(f'{method.capitalize()} Correlation Coefficient')
        plt.ylabel('Variables')
        plt.tight_layout()
        plt.show()
def scatter_plot(df, variable, target):
    """Draw a scatter plot of ``df[target]`` (y axis) against ``df[variable]`` (x axis)."""
    _, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(df[variable], df[target], alpha=0.6, edgecolors='w', linewidths=0.5)
    ax.set_title(f'{variable} vs {target}')
    ax.set_xlabel(f'{variable}')
    ax.set_ylabel(f'{target}')
    ax.grid(True)
    plt.show()

In [None]:
corr_data = df_historic[['pun_100_punctuality','delay_departure', 'nps_100', 'date_flight_local']]

In [None]:
cabin = "Economy"
haul = "SH"
# 'delay_departure' is included because the following cells plot and clip it; without
# it they fail with KeyError. `.copy()` makes the selection an independent frame so the
# later column assignments do not write into a slice of df_historic.
segment_mask = (df_historic['cabin_in_surveyed_flight'] == cabin) & (df_historic['haul'] == haul)
corr_data = df_historic.loc[
    segment_mask,
    ['pun_100_punctuality', 'otp15_takeoff', 'delay_departure', 'date_flight_local', 'nps_100']
].copy()

In [None]:
corr_data.corr()['otp15_takeoff']

In [None]:
target= 'delay_departure'
variable = 'date_flight_local'

corr_data['date_flight_local']=pd.to_datetime(corr_data['date_flight_local'])
# plot_variable_correlations(corr_data, 'nps_100')
scatter_plot(corr_data, variable, target)

In [None]:
target= 'pun_100_punctuality'
variable = 'delay_departure'
corr_data['delay_departure'] = corr_data['delay_departure'].clip(lower=0)
plot_variable_correlations(corr_data, 'nps_100')
scatter_plot(corr_data, variable, target)

## Correlation analysis

In [None]:
df_issues = df_historic[df_historic['inm_050_issues_t_f']=='Yes'].copy()

In [None]:
df_no_issues = df_historic[df_historic['inm_050_issues_t_f']=='No'].copy()

In [None]:
df_issues['respondent_id'].nunique()

In [None]:
df_no_issues['respondent_id'].nunique()

In [None]:
def calculate_nps(promoters, detractors, total_responses):
    """Return the Net Promoter Score in percent, or NaN when there are no responses."""
    if total_responses != 0:
        net_share = (promoters - detractors) / total_responses
        return net_share * 100
    return np.nan

def calculate_weighted_nps(group_df):
    """Return the weighted Net Promoter Score (in percent) of a group of surveys.

    Each row contributes its ``monthly_weight``. Promoters are rows with
    ``nps_100`` > 8 and detractors rows with ``nps_100`` <= 6.

    Args:
        group_df: DataFrame with ``nps_100`` and ``monthly_weight`` columns.

    Returns:
        (promoter weight - detractor weight) / answered weight * 100, or NaN when
        the answered weight is zero.
    """
    scores = group_df['nps_100']
    weights = group_df['monthly_weight']

    # Only rows that actually answered the NPS question belong in the denominator;
    # otherwise unanswered rows dilute the score (the unweighted NPS and the
    # satisfaction rates already exclude them).
    total_weight = weights[scores.notnull()].sum()
    if total_weight == 0:
        return np.nan

    promoters_weight = weights[scores > 8].sum()
    detractors_weight = weights[scores <= 6].sum()
    return (promoters_weight - detractors_weight) / total_weight * 100

def calculate_satisfaction(df, variable):
    """Return the share (in percent) of answers to ``variable`` that are >= 8.

    When a ``monthly_weight`` column exists and is not entirely null among the rows
    that answered ``variable``, the share is weight-based; otherwise it is a plain
    count ratio. NaN is returned when the denominator is zero.
    """
    answered = df[variable].notnull()
    satisfied = df[variable] >= 8

    # Use weights only if at least one answering row carries a weight.
    use_weights = 'monthly_weight' in df.columns and not df.loc[answered, 'monthly_weight'].isnull().all()

    if use_weights:
        total_weight = df.loc[answered, 'monthly_weight'].sum()
        if total_weight == 0:
            return np.nan
        satisfied_weight = df.loc[satisfied, 'monthly_weight'].sum()
        return (satisfied_weight / total_weight) * 100

    # Fallback: count satisfied answers over valid answers.
    total_count = answered.sum()
    if total_count == 0:
        return np.nan
    return (satisfied.sum() / total_count) * 100




def calculate_otp(df, n):
    """Return the On-Time Performance (OTP) in percent.

    The ``otp{n}_takeoff`` flags are 1 for a delayed departure and 0 for an on-time
    one, so OTP is the share of non-null flags equal to 0.

    Args:
        df: DataFrame containing the OTP flag column.
        n: Delay threshold in minutes (e.g. ``15`` -> column ``otp15_takeoff``). The
            name of an existing column (e.g. ``'otp15_takeoff'``) is also accepted, so
            the function stays usable with the column-name call style that a later
            cell of this notebook uses for its own ``calculate_otp``.

    Returns:
        Percentage of on-time rows, or NaN when the column has no non-null value
        (consistent with the other aggregation helpers).
    """
    column = n if (isinstance(n, str) and n in df.columns) else f'otp{n}_takeoff'
    on_time_count = (df[column] == 0).sum()
    total_count = df[column].notnull().sum()
    if total_count == 0:
        return np.nan
    return (on_time_count / total_count) * 100


def calculate_load_factor(df, pax_column, capacity_column):
    """Return the load factor (in percent) of one cabin over the rows of ``df``.

    Args:
        df: DataFrame holding passenger and capacity columns.
        pax_column: Name of the passengers column for the cabin.
        capacity_column: Name of the capacity column for the cabin.

    Returns:
        sum(pax) / sum(capacity) * 100, or NaN when the summed capacity is not
        positive. NaN (rather than 0) marks "no load-factor data", e.g. survey rows
        that found no match in the left merge, so they are not read as empty flights.
    """
    total_pax = df[pax_column].sum()
    total_capacity = df[capacity_column].sum()
    # Avoid dividing by zero and do not report a fake 0% load factor.
    if total_capacity > 0:
        return (total_pax / total_capacity) * 100
    return np.nan

    
def calculate_metrics_summary(df, start_date, end_date, touchpoints):
    """Aggregate the survey metrics per cabin/haul for flights in [start_date, end_date].

    For every (cabin_in_surveyed_flight, haul) group found in the date window it
    computes OTP at 15/30/60 minutes, the mean of the positive departure delays, the
    plain and weighted NPS, one satisfaction rate per touchpoint and, for the cabins
    listed in the mapping below, the load factor.

    Returns:
        DataFrame with one row per cabin/haul group present in the window.
    """
    # Keep only the flights inside the (inclusive) date window.
    window_start = pd.to_datetime(start_date)
    window_end = pd.to_datetime(end_date)
    in_window = (df['date_flight_local'] >= window_start) & (df['date_flight_local'] <= window_end)
    df_window = df[in_window]

    # Cabin -> (passengers column, capacity column)
    cabin_mapping = {
        'Economy': ('pax_economy', 'capacity_economy'),
        'Business': ('pax_business', 'capacity_business'),
        'Premium Economy': ('pax_premium_ec', 'capacity_premium_ec')
    }

    rows = []
    for (cabin, haul), segment in df_window.groupby(['cabin_in_surveyed_flight', 'haul']):
        print(f'CABIN/HAUL: {cabin}/{haul}')

        positive_delays = segment.loc[segment['delay_departure'] > 0, 'delay_departure']
        row = {
            'start_date': start_date,
            'end_date': end_date,
            'cabin_in_surveyed_flight': cabin,
            'haul': haul,
            'otp15_takeoff': calculate_otp(segment, 15),
            'otp30_takeoff': calculate_otp(segment, 30),
            'otp60_takeoff': calculate_otp(segment, 60),
            'mean_delay': positive_delays.mean()
        }

        # Plain NPS of the group (None when nobody answered).
        nps_scores = segment['nps_100']
        answered = nps_scores.notnull().sum()
        if answered:
            row['NPS'] = calculate_nps((nps_scores >= 9).sum(), (nps_scores <= 6).sum(), answered)
        else:
            row['NPS'] = None

        # Weighted NPS of the group.
        row['NPS_weighted'] = calculate_weighted_nps(segment)

        # Satisfaction rate of every touchpoint.
        row.update({f'{tp}_satisfaction': calculate_satisfaction(segment, tp) for tp in touchpoints})

        # Load factor, only for cabins that have pax/capacity columns.
        cabin_columns = cabin_mapping.get(cabin)
        if cabin_columns is not None:
            row['load_factor'] = calculate_load_factor(segment, *cabin_columns)

        rows.append(row)

    return pd.DataFrame(rows)

def generate_date_intervals(start_date, end_date, freq=1):
    """Build consecutive (interval_start, interval_end) windows of ``freq`` days.

    Windows start at ``start_date`` and are generated while the window start is
    strictly before ``end_date`` (a window starting on ``end_date`` itself is not
    produced). Each window spans ``freq`` days with both bounds inclusive, so with
    the default ``freq=1`` every window is a single day ``(d, d)``, while ``freq=7``
    yields full weeks instead of one sampled day per week. The last window is not
    clipped and may extend past ``end_date``.

    Args:
        start_date: First day, anything ``pd.to_datetime`` accepts.
        end_date: Exclusive upper bound for window starts.
        freq: Window length and step in days; must be positive.

    Returns:
        List of ``(pd.Timestamp, pd.Timestamp)`` tuples.

    Raises:
        ValueError: If ``freq`` is not positive (the loop would never advance).
    """
    if freq <= 0:
        raise ValueError(f"freq must be a positive number of days, got {freq!r}")

    current = pd.to_datetime(start_date)
    end_date = pd.to_datetime(end_date)
    step = pd.Timedelta(days=freq)
    # Inclusive bounds: a window of `freq` days ends `freq - 1` days after it starts.
    span = pd.Timedelta(days=max(freq - 1, 0))

    intervals = []
    while current < end_date:
        intervals.append((current, current + span))
        current = current + step
    return intervals

def calculate_metrics_for_intervals(df, touchpoints, start_date, end_date, freq):
    """Compute the cabin/haul metrics summary for every interval up to ``end_date``.

    Args:
        df: Survey-level DataFrame passed through to ``calculate_metrics_summary``.
        touchpoints: Touchpoint columns for which satisfaction is computed.
        start_date: Start of the first interval.
        end_date: Bound passed to ``generate_date_intervals``.
        freq: Step in days between interval starts.

    Returns:
        A single DataFrame with the summaries of all intervals stacked, or an empty
        DataFrame when no interval is generated (``pd.concat`` raises on an empty list).
    """
    intervals = generate_date_intervals(start_date, end_date, freq)
    all_metrics = []

    for interval_start, interval_end in intervals:
        interval_metrics = calculate_metrics_summary(df, interval_start, interval_end, touchpoints)
        print(f"Interval: {interval_start} to {interval_end}, Data points: {len(interval_metrics)}")
        all_metrics.append(interval_metrics)

    if not all_metrics:
        return pd.DataFrame()

    # Stack all per-interval result frames into one.
    return pd.concat(all_metrics, ignore_index=True)


In [None]:
# all_weekly = calculate_metrics_for_intervals(df_historic, touchpoints, '01-01-2022', '04-06-2024', 7)
# issues_weekly = calculate_metrics_for_intervals(df_issues, touchpoints, '01-01-2022', '04-06-2024', 7)
# no_issues_weekly = calculate_metrics_for_intervals(df_no_issues, touchpoints, '01-01-2022', '04-06-2024', 7)

In [None]:
# Negative delays (early departures) are floored at 0 before aggregating.
# This overwrites df_historic['delay_departure'] in place.
df_historic['delay_departure'] = df_historic['delay_departure'].clip(lower=0)
# Daily aggregation (freq=1).
# NOTE(review): the date strings are parsed with pd.to_datetime, which reads
# '04-06-2024' month-first (6 April 2024). If 4 June 2024 was intended, use an
# ISO date ('2024-06-04') - confirm.
all_daily = calculate_metrics_for_intervals(df_historic, touchpoints, '01-01-2022', '04-06-2024', 1)
# issues_daily = calculate_metrics_for_intervals(df_issues, touchpoints, '01-01-2022', '04-06-2024', 1)
# no_issues_daily = calculate_metrics_for_intervals(df_no_issues, touchpoints, '01-01-2022', '04-06-2024', 1)

In [None]:
import pandas as pd

# Assuming all_weekly, issues_weekly, no_issues_weekly are already defined
dataframes_daily = {
    'all_daily': all_daily,
    # 'issues_daily': issues_daily,
    # 'no_issues_daily': no_issues_daily
}

# dataframes_weekly = {
#     'all_weekly': all_weekly,
#     'issues_weekly': issues_weekly,
#     'no_issues_weekly': no_issues_weekly
# }

# Define the function to filter and select columns
def filter_and_select(df, cabin, haul):
    """Return the rows of one cabin/haul segment, restricted to the analysis columns.

    The analysis columns are every column whose name contains '_satisfaction',
    followed by the OTP, mean delay, load factor and weighted NPS columns.
    """
    segment_mask = (df['cabin_in_surveyed_flight'] == cabin) & (df['haul'] == haul)
    segment = df[segment_mask]
    satisfaction_columns = [name for name in segment.columns if '_satisfaction' in name]
    metric_columns = ['otp15_takeoff', 'otp30_takeoff', 'otp60_takeoff', 'mean_delay', 'load_factor', 'NPS_weighted']
    return segment[satisfaction_columns + metric_columns]

# Dictionary to hold the results
daily_result_dict = {key: {} for key in dataframes_daily.keys()}

# Loop through each DataFrame and unique combinations
for name, df in dataframes_daily.items():
    unique_combinations = df[['cabin_in_surveyed_flight', 'haul']].drop_duplicates()
    for _, row in unique_combinations.iterrows():
        result_df = filter_and_select(df, row['cabin_in_surveyed_flight'], row['haul'])
        cabin_haul_key = f"{row['cabin_in_surveyed_flight']}_{row['haul']}"
        daily_result_dict[name][cabin_haul_key] = result_df
        
# Dictionary to hold the results
# weekly_result_dict = {key: {} for key in dataframes_weekly.keys()}

# # Loop through each DataFrame and unique combinations
# for name, df in dataframes_weekly.items():
#     unique_combinations = df[['cabin_in_surveyed_flight', 'haul']].drop_duplicates()
#     for _, row in unique_combinations.iterrows():
#         result_df = filter_and_select(df, row['cabin_in_surveyed_flight'], row['haul'])
#         cabin_haul_key = f"{row['cabin_in_surveyed_flight']}_{row['haul']}"
#         weekly_result_dict[name][cabin_haul_key] = result_df

In [None]:
daily_result_dict

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import mutual_info_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
import shap

def plot_variable_correlations(corr_data, target_var):
    """Plot, as a horizontal bar chart, the correlation of every column with ``target_var``.

    Args:
        corr_data: DataFrame holding the variables to correlate.
        target_var: Name of the column the others are correlated against; it must be
            a numeric (or boolean) column of ``corr_data``.

    Only numeric/boolean columns take part in the correlation, so frames that also
    carry datetime or text columns do not make ``DataFrame.corr`` fail on recent
    pandas versions.
    """
    # methods = ['pearson', 'spearman', 'kendall']
    methods = ['pearson']
    numeric_data = corr_data.select_dtypes(include=['number', 'bool'])
    for method in methods:
        corr = numeric_data.corr(method=method)
        # One-column frame sorted from the strongest positive to the strongest negative.
        target_corr = corr[[target_var]].sort_values(by=target_var, ascending=False)

        plt.figure(figsize=(12, 8))
        sns.barplot(y=target_corr.index, x=target_corr[target_var], palette='coolwarm')
        plt.title(f'{method.capitalize()} Correlation with {target_var}')
        plt.xlabel(f'{method.capitalize()} Correlation Coefficient')
        plt.ylabel('Variables')
        plt.tight_layout()
        plt.show()

def analyze_correlations(corr_data, name):
    """Run the correlation / mutual-information analysis against ``NPS_weighted``.

    Steps:
      1. Pearson, Spearman and Kendall correlation matrices of ``corr_data``.
      2. Bar chart of the correlations with ``NPS_weighted``.
      3. Mutual information between every other column and ``NPS_weighted``,
         with missing feature values median-imputed.
      4. All four tables written to ``correlation_analysis_results_<name>.xlsx``.

    Args:
        corr_data: DataFrame with the features and the ``NPS_weighted`` column.
        name: Label used in the plot title and in the Excel file name.

    Raises:
        ValueError: If no row has a non-null ``NPS_weighted``.
    """
    target = 'NPS_weighted'

    # Correlation matrices are computed on the data as received.
    pearson_corr = corr_data.corr(method='pearson')
    spearman_corr = corr_data.corr(method='spearman')
    kendall_corr = corr_data.corr(method='kendall')

    # Bar chart of the correlations with the target.
    plot_variable_correlations(corr_data, target)

    # Rows without an observed target are dropped instead of imputed: filling the
    # target with its median would fabricate observations and bias the scores.
    model_data = corr_data.dropna(subset=[target])
    if model_data.empty:
        raise ValueError(f"No rows with a non-null '{target}' for {name}")

    # Columns that are entirely NaN cannot be imputed; drop them (after the row
    # filter, since a column may only become empty once those rows are gone).
    model_data = model_data.dropna(axis=1, how='all')

    # Feature and target separation; only the features are imputed.
    features_raw = model_data.drop(columns=[target])
    imputer = SimpleImputer(strategy='median')
    X = pd.DataFrame(imputer.fit_transform(features_raw), columns=features_raw.columns)
    y = model_data[target].reset_index(drop=True)

    # Calculate Mutual Information
    mi = mutual_info_regression(X, y, random_state=42)
    mi_df = pd.DataFrame(mi, index=X.columns, columns=['Mutual Information']).sort_values(by='Mutual Information', ascending=False)

    # Mutual Information visualization
    plt.figure(figsize=(12, 8))
    sns.barplot(x=mi_df['Mutual Information'], y=mi_df.index, palette='viridis')
    plt.title('Mutual Information between NPS_weighted and Other Variables for ' + name)
    plt.xlabel('Mutual Information')
    plt.ylabel('Features')
    plt.tight_layout()
    plt.show()

    # Save results to an Excel file
    with pd.ExcelWriter('correlation_analysis_results_' + name + '.xlsx') as writer:
        pearson_corr.to_excel(writer, sheet_name='Pearson Correlation')
        spearman_corr.to_excel(writer, sheet_name='Spearman Correlation')
        kendall_corr.to_excel(writer, sheet_name='Kendall Correlation')
        mi_df.to_excel(writer, sheet_name='Mutual Information')

import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import numpy as np
import pandas as pd

def scatter_plot_with_regression(df, variable, target):
    """Scatter ``target`` against ``variable`` and overlay an ordinary least-squares line.

    Rows where either column is missing are ignored. The slope and the R^2 of the
    fit are written on the figure.

    Returns:
        Tuple ``(slope, intercept, r_squared)`` of the fitted line.
    """
    # Keep only the rows where both columns are present.
    complete_rows = df.dropna(subset=[variable, target])
    x_values = complete_rows[variable]
    y_values = complete_rows[target].values

    # Fit the linear regression model on a single feature.
    features = x_values.values.reshape(-1, 1)
    model = LinearRegression()
    model.fit(features, y_values)
    r_squared = model.score(features, y_values)

    # Square figure with the points and the fitted line.
    _, ax = plt.subplots(figsize=(10, 10))
    ax.scatter(x_values, complete_rows[target], alpha=0.6, edgecolors='w', linewidths=0.5, label='Data points')
    ax.plot(x_values, model.predict(features), color='red', label='Fitted line')

    ax.set_title(f'{variable} vs {target}')
    ax.set_xlabel(f'{variable}')
    ax.set_ylabel(f'{target}')
    ax.legend()
    ax.grid(True)

    # Annotate slope and R^2 in the lower-left corner (axes coordinates).
    ax.text(0.05, 0.10, f'Slope: {model.coef_[0]:.4f}', transform=ax.transAxes, fontsize=12, verticalalignment='top')
    ax.text(0.05, 0.05, f'R^2: {r_squared:.4f}', transform=ax.transAxes, fontsize=12, verticalalignment='top')

    plt.show()

    return model.coef_[0], model.intercept_, r_squared

In [None]:
def scatter_plot(df, variable, target):
    """Draw a scatter plot of ``df[target]`` (y axis) against ``df[variable]`` (x axis)."""
    _, ax = plt.subplots(figsize=(10, 6))
    ax.scatter(df[variable], df[target], alpha=0.6, edgecolors='w', linewidths=0.5)
    ax.set_title(f'{variable} vs {target}')
    ax.set_xlabel(f'{variable}')
    ax.set_ylabel(f'{target}')
    ax.grid(True)
    plt.show()

In [None]:
# Apply the analysis function to each DataFrame in the result_dict
for name, dfs in daily_result_dict.items():
    print(name)
    for cabin_haul_key, df in dfs.items():
        print(cabin_haul_key)
        if not df.empty:
            analyze_correlations(df, name + '_' + cabin_haul_key)
            scatter_plot(df, 'ifl_100_cabin_crew_satisfaction', 'otp15_takeoff')
            scatter_plot(df, 'con_100_connections_satisfaction', 'otp15_takeoff')
            

In [None]:
# Apply the analysis function to each DataFrame in the result_dict
# `weekly_result_dict` is only built by code that is currently commented out, so a
# direct reference raises NameError on a fresh "Run All". Fall back to an empty
# mapping: the weekly analysis runs when the dict exists and is skipped otherwise.
for name, dfs in globals().get('weekly_result_dict', {}).items():
    print(name)
    for cabin_haul_key, df in dfs.items():
        print(cabin_haul_key)
        if not df.empty:
            analyze_correlations(df, name + '_' + cabin_haul_key)

## Aggregation logic

Given a date, it takes it as an "end_date" and computes every interval with previous dates. Then it performs the satisfaction, NPS, load factor and OTP aggregations for that particular interval.

In [None]:
# Funciones auxiliares
import numpy as np

def calculate_nps(promoters, detractors, total_responses):
    """Return the Net Promoter Score in percent, or NaN when there are no responses."""
    if total_responses != 0:
        net_share = (promoters - detractors) / total_responses
        return net_share * 100
    return np.nan

def calculate_weighted_nps(group_df):
    """Return the weighted Net Promoter Score (in percent) of a group of surveys.

    Each row contributes its ``monthly_weight``. Promoters are rows with
    ``nps_100`` > 8 and detractors rows with ``nps_100`` <= 6.

    Args:
        group_df: DataFrame with ``nps_100`` and ``monthly_weight`` columns.

    Returns:
        (promoter weight - detractor weight) / answered weight * 100, or NaN when
        the answered weight is zero.
    """
    scores = group_df['nps_100']
    weights = group_df['monthly_weight']

    # Only rows that actually answered the NPS question belong in the denominator;
    # otherwise unanswered rows dilute the score (the unweighted NPS and the
    # satisfaction rates already exclude them).
    total_weight = weights[scores.notnull()].sum()
    if total_weight == 0:
        return np.nan

    promoters_weight = weights[scores > 8].sum()
    detractors_weight = weights[scores <= 6].sum()
    return (promoters_weight - detractors_weight) / total_weight * 100

def calculate_satisfaction(df, variable):
    """Return the share (in percent) of answers to ``variable`` that are >= 8.

    When a ``monthly_weight`` column exists and is not entirely null among the rows
    that answered ``variable``, the share is weight-based; otherwise it is a plain
    count ratio. NaN is returned when the denominator is zero.
    """
    answered = df[variable].notnull()
    satisfied = df[variable] >= 8

    # Use weights only if at least one answering row carries a weight.
    use_weights = 'monthly_weight' in df.columns and not df.loc[answered, 'monthly_weight'].isnull().all()

    if use_weights:
        total_weight = df.loc[answered, 'monthly_weight'].sum()
        if total_weight == 0:
            return np.nan
        satisfied_weight = df.loc[satisfied, 'monthly_weight'].sum()
        return (satisfied_weight / total_weight) * 100

    # Fallback: count satisfied answers over valid answers.
    total_count = answered.sum()
    if total_count == 0:
        return np.nan
    return (satisfied.sum() / total_count) * 100




def calculate_otp(df, variable='otp15_takeoff'):
    """Return the On-Time Performance (OTP) in percent.

    The OTP flag is 1 for a delayed departure and 0 for an on-time one, so OTP is
    the share of non-null flags equal to 0.

    Args:
        df: DataFrame containing the OTP flag column.
        variable: Name of the flag column. A delay threshold in minutes (e.g. ``15``)
            is also accepted and mapped to ``otp{variable}_takeoff``, so cells
            written for the earlier ``calculate_otp(df, n)`` definition keep working
            after this cell has run.

    Returns:
        Percentage of on-time rows, or NaN when the column has no non-null value
        (consistent with the other aggregation helpers).
    """
    if isinstance(variable, str) or variable in df.columns:
        column = variable
    else:
        column = f'otp{variable}_takeoff'
    on_time_count = (df[column] == 0).sum()
    total_count = df[column].notnull().sum()
    if total_count == 0:
        return np.nan
    return (on_time_count / total_count) * 100


def calculate_load_factor(df, pax_column, capacity_column):
    """Return the load factor (passengers over capacity, in percent) for one cabin.

    Args:
        df: rows to aggregate.
        pax_column: column with the passenger counts.
        capacity_column: column with the seat capacities.

    Returns:
        ``sum(pax) / sum(capacity) * 100``, or 0 when the summed capacity is
        not positive (avoids dividing by zero).
    """
    passengers = df[pax_column].sum()
    seats = df[capacity_column].sum()
    return (passengers / seats) * 100 if seats > 0 else 0

    
def calculate_metrics_summary(df, start_date, end_date, touchpoints):
    """Summarise survey and operational metrics per cabin/haul for one date window.

    Rows whose ``date_flight_local`` lies in ``[start_date, end_date]`` (both
    inclusive) are grouped by ``cabin_in_surveyed_flight`` and ``haul``. For
    each group the result holds OTP, NPS, weighted NPS, one
    ``<touchpoint>_satisfaction`` value per touchpoint and, for the cabins
    listed in the mapping below, the load factor.

    Args:
        df: survey rows.
        start_date: first day of the window (anything ``pd.to_datetime`` accepts).
        end_date: last day of the window (anything ``pd.to_datetime`` accepts).
        touchpoints: names of the touchpoint score columns.

    Returns:
        DataFrame with one row per cabin/haul group found in the window.
    """
    # Cabin name -> (passenger column, capacity column) used for the load factor.
    load_factor_columns = {
        'Economy': ('pax_economy', 'capacity_economy'),
        'Business': ('pax_business', 'capacity_business'),
        'Premium Economy': ('pax_premium_ec', 'capacity_premium_ec'),
    }

    flight_dates = df['date_flight_local']
    in_window = (flight_dates >= pd.to_datetime(start_date)) & (flight_dates <= pd.to_datetime(end_date))
    window_df = df[in_window]

    summary_rows = []
    for (cabin, haul), group_df in window_df.groupby(['cabin_in_surveyed_flight', 'haul']):
        print(f'CABIN/HAUL: {cabin}/{haul}')

        summary = {
            'start_date': start_date,
            'end_date': end_date,
            'cabin_in_surveyed_flight': cabin,
            'haul': haul,
            'otp15_takeoff': calculate_otp(group_df),
        }

        # Unweighted NPS: promoters score 9+, detractors 6 or less.
        nps_scores = group_df['nps_100']
        promoters = (nps_scores >= 9).sum()
        detractors = (nps_scores <= 6).sum()
        total_responses = nps_scores.notnull().sum()
        if total_responses:
            summary['NPS'] = calculate_nps(promoters, detractors, total_responses)
        else:
            summary['NPS'] = None

        summary['NPS_weighted'] = calculate_weighted_nps(group_df)

        for tp in touchpoints:
            summary[f'{tp}_satisfaction'] = calculate_satisfaction(group_df, tp)

        # Load factor only exists for cabins present in the mapping.
        cabin_columns = load_factor_columns.get(cabin)
        if cabin_columns is not None:
            pax_column, capacity_column = cabin_columns
            summary['load_factor'] = calculate_load_factor(group_df, pax_column, capacity_column)

        summary_rows.append(summary)

    return pd.DataFrame(summary_rows)

def generate_date_intervals(start_date, end_date):
    """Build (interval_start, end_date) pairs, one per day from start_date to end_date.

    Every interval shares the same end; the start advances one day at a time,
    so the first pair spans the whole range and the last one is the single day
    ``end_date``. An empty list is returned when ``end_date`` precedes
    ``start_date``.
    """
    first_day = pd.to_datetime(start_date)
    last_day = pd.to_datetime(end_date)
    day_count = (last_day - first_day).days + 1

    intervals = []
    for offset in range(day_count):
        intervals.append((first_day + pd.Timedelta(days=offset), last_day))
    return intervals

def calculate_metrics_for_intervals(df, touchpoints, start_date, end_date):
    """Compute the metrics summary for every interval ending at ``end_date``.

    One summary is produced per interval returned by
    ``generate_date_intervals(start_date, end_date)`` and all of them are
    stacked into a single frame.

    Args:
        df: survey rows passed through to ``calculate_metrics_summary``.
        touchpoints: names of the touchpoint score columns.
        start_date: earliest interval start.
        end_date: common interval end.

    Returns:
        DataFrame with the concatenated summaries; an empty DataFrame when no
        interval exists (``end_date`` before ``start_date``).
    """
    all_metrics = []

    for interval_start, interval_end in generate_date_intervals(start_date, end_date):
        interval_metrics = calculate_metrics_summary(df, interval_start, interval_end, touchpoints)
        print(f"Interval: {interval_start} to {interval_end}, Data points: {len(interval_metrics)}")
        all_metrics.append(interval_metrics)

    # pd.concat raises on an empty list, so return an empty frame instead.
    if not all_metrics:
        return pd.DataFrame()

    return pd.concat(all_metrics, ignore_index=True)

# # Ejemplo de uso:
# # touchpoints = ['tp1', 'tp2', 'tp3']  # Asegúrate de reemplazar estos con los nombres reales de tus touchpoints
# df_result = calculate_metrics_summary(df_historic, '2023-01-01', '2023-01-31', touchpoints)
# # print(df_result)

# # Definir la fecha de inicio del año y la fecha de fin específica
# start_date = '2023-03-01'
# end_date = '2023-05-01'



# results_intervals_df = calculate_metrics_for_intervals(df_historic, touchpoints, start_date, end_date)




In [None]:
# Run the metrics summary for every single day from 2023-01-01 up to the
# latest flight date found in df_historic (each window is one day long).
start_date = '2023-01-01'
end_date = df_historic['date_flight_local'].max().strftime('%Y-%m-%d')
date_range = pd.date_range(start=start_date, end=end_date, freq='D')
# Touchpoint score columns; each one yields a '<name>_satisfaction' column in the summary.
touchpoints = ['bkg_200_journey_preparation', 'pfl_100_checkin', 'pfl_200_security', 'pfl_300_lounge',
               'pfl_500_boarding', 'ifl_300_cabin', 'ifl_200_flight_crew_annoucements', 'ifl_600_wifi', 'ifl_500_ife',
               'ifl_400_food_drink', 'ifl_100_cabin_crew', 'arr_100_arrivals', 'con_100_connections', 'pun_100_punctuality',
               'loy_200_loyalty_programme', 'inm_400_issues_response', 'img_310_ease_contact_phone']

all_results = []

for current_date in date_range:
    # Same date as start and end: the window is inclusive on both sides.
    daily_results = calculate_metrics_summary(df_historic, current_date, current_date, touchpoints)
    all_results.append(daily_results)

# Stack all the per-day frames into a single one
results_df = pd.concat(all_results, ignore_index=True)
print(results_df)

In [None]:
results_df.to_csv('daily_aggregation.csv')

In [None]:
cabin='Economy'
haul = 'LH'
corr_data = pd.read_csv('daily_aggregation.csv')
corr_data = corr_data[corr_data['start_date']>='2023-01-01']
variables = [
    'pun_100_punctuality_satisfaction',
    "bkg_200_journey_preparation_satisfaction",
    "pfl_100_checkin_satisfaction",
    "pfl_200_security_satisfaction",
    "pfl_300_lounge_satisfaction",
    "pfl_500_boarding_satisfaction",
    "ifl_300_cabin_satisfaction",
    "ifl_200_flight_crew_annoucements_satisfaction",
    "ifl_600_wifi_satisfaction",
    "ifl_500_ife_satisfaction",
    "ifl_400_food_drink_satisfaction",
    "ifl_100_cabin_crew_satisfaction",
    "arr_100_arrivals_satisfaction",
    "con_100_connections_satisfaction",
    "loy_200_loyalty_programme_satisfaction",
    "img_310_ease_contact_phone_satisfaction",
    "load_factor",
    "otp15_takeoff",
    "NPS_weighted"
]

# Keep only the selected cabin/haul segment and the variables listed above.
corr_data = corr_data[(corr_data['cabin_in_surveyed_flight']==cabin) & (corr_data['haul']==haul)][variables]

# Run the correlation analysis for this single cabin/haul segment
analyze_correlations(corr_data, cabin + '_' + haul)

# NOTE(review): a definition of scatter_plot_with_regression appears in a later
# cell of this notebook; confirm one is also available before this cell on a
# fresh top-to-bottom run.
scatter_plot_with_regression(corr_data, 'otp15_takeoff', 'arr_100_arrivals_satisfaction')

# Pairwise correlations between OTP, the two punctuality-related scores and weighted NPS.
corr_data[['otp15_takeoff','pun_100_punctuality_satisfaction','arr_100_arrivals_satisfaction','NPS_weighted']].corr()

In [None]:
all_intervals_results

In [None]:
all_intervals_results.to_csv('intervals.csv')

# 2. Prediction with Darts model

In [None]:
all_weekly

In [None]:
df_all = df_historic[(df_historic['cabin_in_surveyed_flight']=='Economy') & (df_historic['haul']=='SH')]

In [None]:
df_issues_BLH = df_issues[(df_issues['cabin_in_surveyed_flight']=='Economy') & (df_issues['haul']=='SH')]

In [None]:
df_no_issues_BLH = df_no_issues[(df_no_issues['cabin_in_surveyed_flight']=='Economy') & (df_no_issues['haul']=='SH')]

In [None]:
df_all_agg = calculate_metrics_summary(df_all, '2024-01-01', '2024-05-24', touchpoints)
issues_BLH = calculate_metrics_summary(df_issues_BLH, '2024-01-01', '2024-05-24', touchpoints)
no_issues_BLH = calculate_metrics_summary(df_no_issues_BLH, '2024-01-01', '2024-05-24', touchpoints)

In [None]:
issues_BLH

In [None]:

# Concatenate the DataFrames
concatenated_df = pd.concat([issues_BLH, no_issues_BLH], ignore_index=True)
concatenated_df['insert_date_ci']='2024-06-01'

# Display the concatenated DataFrame
print(concatenated_df)

In [None]:
# importlib.metadata is the standard-library replacement for the deprecated
# pkg_resources API when looking up installed distribution versions.
from importlib import metadata

# Distributions whose installed versions should be reported
libraries = [
    "s3fs", "boto", "boto3", "botocore", "numpy", "scikit-image",
    "scikit-learn", "scipy", "PyYAML", "pandas", "darts",
    "optuna", "shap", "lightgbm"
]

# Print the installed version of each distribution, or say that it is missing
for library in libraries:
    try:
        version = metadata.version(library)
        print(f"{library}: {version}")
    except metadata.PackageNotFoundError:
        print(f"{library} is not installed.")


In [None]:
import darts
from darts import TimeSeries
from darts.utils.timeseries_generation import (
    gaussian_timeseries,
    linear_timeseries,
    sine_timeseries,
)

from darts.metrics import mape, smape, mae
from darts.dataprocessing.transformers import Scaler
from darts.utils.timeseries_generation import datetime_attribute_timeseries

from sklearn.linear_model import BayesianRidge
from sklearn.ensemble import RandomForestRegressor

import lightgbm

from darts.models import LightGBMModel

from darts.models import LightGBMModel, RandomForest, LinearRegressionModel
from darts.utils.statistics import check_seasonality, plot_acf, plot_residuals_analysis

from darts.explainability.shap_explainer import ShapExplainer
import pickle
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from darts.models import LinearRegressionModel, LightGBMModel, RandomForest
from calendar import month_name as mn
import os

import shap
import pandas as pd

In [None]:
year_targets_df = pd.read_csv('operative_performance_corrected_annual_targets_2023-11-21.csv')


In [None]:
year_targets_df.rename(columns={
    'cabin' : 'cabin_in_surveyed_flight',
    'date_flight_local': 'start_date'  # date_flight_local becomes the start_date of each target row
}, inplace=True)

# Parse start_date into datetime objects
year_targets_df['start_date'] = pd.to_datetime(year_targets_df['start_date'])

# Compute 'end_date' as the last day of start_date's year: YearEnd(0) rolls a
# date forward to 31 December and leaves dates already on 31 December unchanged.
year_targets_df['end_date'] = year_targets_df['start_date'] + pd.offsets.YearEnd(0)

In [None]:
satisfaction_cols = [col for col in year_targets_df.columns if col.endswith('_satisfaction')]
otp_cols = ['otp15_takeoff']
features_cols = satisfaction_cols + ['load_factor'] + otp_cols

In [None]:
year_targets_df['insert_date_ci']='2023-11-21'

In [None]:
year_targets_df=year_targets_df[['start_date', 'end_date','cabin_in_surveyed_flight','haul','otp15_takeoff', 'NPS', 'NPS_weighted']+ satisfaction_cols + ['load_factor','insert_date_ci']]

In [None]:
year_targets_df= year_targets_df[(year_targets_df['start_date']=='2023-12-31')]

In [None]:
year_targets_df

In [None]:
# Copy the short-haul ('SH') rows of the annual targets
copy_df = year_targets_df[(year_targets_df['haul']=='SH')].copy()

# Add 0.5 to the 'otp15_takeoff' column in the copied DataFrame
copy_df['otp15_takeoff'] += 0.5

# Tag the modified copy with its own insert date
copy_df['insert_date_ci'] = '2024-05-08'


In [None]:
corr_data=pd.read_csv('data_for_historic_prediction (5).csv')

In [None]:
corr_data['start_date'].min()

In [None]:
satisfaction_cols = [col for col in corr_data.columns if col.endswith('_satisfaction')]
otp_cols = ['otp15_takeoff']
features_cols = satisfaction_cols + ['load_factor'] + otp_cols

In [None]:
buss_corr = corr_data[(corr_data['cabin_in_surveyed_flight'] == 'Business') & (corr_data['haul'] == 'SH')][features_cols].corr()
eco_corr = corr_data[(corr_data['cabin_in_surveyed_flight'] == 'Economy') & (corr_data['haul'] == 'SH')][features_cols].corr()


# Splitting the DataFrame
df_business = year_targets_df[(year_targets_df['cabin_in_surveyed_flight'] == 'Business') & (year_targets_df['haul']=='SH')].copy()
df_economy = year_targets_df[(year_targets_df['cabin_in_surveyed_flight'] == 'Economy') & (year_targets_df['haul']=='SH')].copy()

# Function to adjust values based on the top 5 correlated features and correlation matrix
def adjust_top_5_values(df, correlation_matrix, delta_change):
    """Shift the five features most correlated with 'otp15_takeoff'.

    Each selected column of ``df`` is increased by
    ``correlation(column, 'otp15_takeoff') * delta_change``. 'otp15_takeoff'
    itself is removed from the ranking first: its self-correlation of 1 would
    otherwise always take one of the five slots and leave only four features
    adjusted.

    Args:
        df: frame whose columns are adjusted in place.
        correlation_matrix: square correlation frame containing an
            'otp15_takeoff' column.
        delta_change: change applied to 'otp15_takeoff' by the caller.

    Returns:
        The same ``df`` object, after the in-place adjustment.
    """
    otp_correlations = correlation_matrix['otp15_takeoff'].drop(labels='otp15_takeoff', errors='ignore')
    top_features = otp_correlations.abs().nlargest(5).index

    for column in top_features:
        adjustment_factor = correlation_matrix.at[column, 'otp15_takeoff'] * delta_change
        print(f"Adjustment_factor for column {column}: {adjustment_factor}")
        df[column] += adjustment_factor

    return df

# Raise OTP by a fixed delta in both short-haul target frames
delta_otp15_takeoff = 0.5
df_business['otp15_takeoff'] += delta_otp15_takeoff
df_economy['otp15_takeoff'] += delta_otp15_takeoff

# Propagate the OTP change to the correlated features, using the Business and
# Economy correlation matrices ('buss_corr', 'eco_corr') computed above
print('Business')
df_business = adjust_top_5_values(df_business, buss_corr, delta_otp15_takeoff)
print('Economy')
df_economy = adjust_top_5_values(df_economy, eco_corr, delta_otp15_takeoff)

# Recombine both cabins and stamp the adjusted scenario with its insert date
updated_df = pd.concat([df_business, df_economy]).sort_index()
updated_df['insert_date_ci'] = '2024-05-08'  # Update insert date for all




In [None]:
updated_df

In [None]:
concatenated_df=pd.concat([year_targets_df,updated_df], ignore_index=True)

In [None]:
day_predict_df = pd.read_csv('intervals.csv')

In [None]:
concatenated_df

In [None]:
import pandas as pd

In [None]:
targets_df = pd.read_csv('targets.csv')

# Drop the 'Global' aggregate; .copy() makes the result an independent frame so
# the column assignments below do not write to a slice of the original.
targets_df = targets_df[targets_df['cabin'] != 'Global'].copy()

targets_df['insert_date_ci'] = '2023-11-21'

targets_df = targets_df.rename(columns={
    'cabin' : 'cabin_in_surveyed_flight',
    'date_flight_local': 'end_date'  # date_flight_local is treated as the end of each target period
})
# day_predict_df.rename(columns={
#     'interval_end_date': 'insert_date_ci'  # Assuming you want to consider end_date as the equivalent of date_flight_local
# }, inplace=True)

# Parse end_date into datetime objects
targets_df['end_date'] = pd.to_datetime(targets_df['end_date'])

# Keep only the 2024 targets (copied for the same reason as above)
targets_df = targets_df[targets_df['end_date'].dt.year == 2024].copy()

# Compute 'start_date' as the first day of the corresponding month
targets_df['start_date'] = targets_df['end_date'].dt.to_period('M').dt.to_timestamp()

# Remove the index column written by a previous to_csv; tolerate its absence
targets_df = targets_df.drop(columns=['Unnamed: 0'], errors='ignore')




In [None]:
# OTP improvement applied to each calendar month (1 = January ... 12 = December)
month_improvements = {
    1: 0.7, 2: 0.2, 3: 0.5, 4: 0.3, 5: 0.2, 6: 0.3,
    7: 0.6, 8: 0.6, 9: 0.8, 10: 0.7, 11: 0.5, 12: 0.2
}

def adjust_otp_and_top_5_correlations(row, eco_corr, buss_corr):
    """Apply the monthly OTP improvement to a target row and propagate it.

    'otp15_takeoff' is raised by the improvement configured for the month of
    ``row['end_date']`` (0 when the month has no entry). Five fixed
    satisfaction features, when present in the row, are then shifted by their
    correlation with 'otp15_takeoff' times that improvement. The Business
    matrix is used for 'Business' rows and the Economy matrix for every other
    cabin.

    Args:
        row: target row; it is modified in place.
        eco_corr: correlation matrix used for non-Business cabins.
        buss_corr: correlation matrix used for the Business cabin.

    Returns:
        The same row, with 'insert_date_ci' set to '2024-05-08'.
    """
    # Fixed list of features that follow the OTP change.
    linked_features = (
        'pfl_100_checkin_satisfaction',
        'pfl_500_boarding_satisfaction',
        'arr_100_arrivals_satisfaction',
        'con_100_connections_satisfaction',
        'ifl_100_cabin_crew_satisfaction',
    )

    if row['cabin_in_surveyed_flight'] == 'Business':
        corr_matrix = buss_corr
    else:
        corr_matrix = eco_corr

    delta_otp15_takeoff = month_improvements.get(row['end_date'].month, 0)
    row['otp15_takeoff'] += delta_otp15_takeoff

    for column in linked_features:
        if column not in row:
            continue
        row[column] += corr_matrix.at[column, 'otp15_takeoff'] * delta_otp15_takeoff

    row['insert_date_ci'] = '2024-05-08'
    return row

# Adjust the DataFrame
adjusted_df = targets_df.apply(lambda row: adjust_otp_and_top_5_correlations(row.copy(), eco_corr, buss_corr), axis=1)
concatenated_df = pd.concat([targets_df, adjusted_df]).sort_index(kind='merge')





In [None]:
targets_df

In [None]:
concatenated_df =targets_df

In [None]:
concatenated_df=all_weekly.copy()
concatenated_df['insert_date_ci']='2024-06-01'

In [None]:
# Correlate numeric columns only: corr_data also carries string columns
# (cabin, haul, dates), which make DataFrame.corr() raise on recent pandas.
corr_data.select_dtypes(include='number').corr()

## Genetic algorithms simulator

In [None]:
# Segment whose model and scaler are loaded below
cabin = 'Economy'
haul = 'SH'

# NOTE(security): pickle.load executes arbitrary code from the file; only load
# model/scaler pickles produced by a trusted pipeline.
model_file_path = os.path.join('targets_model', f"best_tuned_mae_model_{cabin}_{haul}_df_LightGBMModel.pkl")
with open(model_file_path, 'rb') as model_file:
    model = pickle.load(model_file)
    
scaler_file_path = os.path.join('targets_model', f"future_scaler_{cabin}_{haul}_df.pkl")
with open(scaler_file_path, 'rb') as scaler_file:
    scaler = pickle.load(scaler_file)
    
# Access the trained LightGBM model
lgb_model = model.model
feature_names = model.lagged_feature_names

# Display the feature names stored on the underlying regressor (nothing is set here)
lgb_model.feature_name_ 

In [None]:
import pandas as pd
import numpy as np

# Supongamos que `corr_data` es tu DataFrame de datos
corr_data = pd.read_csv('daily_NPS_2024-06-01.csv')
variables = [
    "bkg_200_journey_preparation_satisfaction",
    "pfl_100_checkin_satisfaction",
    "pfl_200_security_satisfaction",
    "pfl_300_lounge_satisfaction",
    "pfl_500_boarding_satisfaction",
    "ifl_300_cabin_satisfaction",
    "ifl_200_flight_crew_annoucements_satisfaction",
    "ifl_600_wifi_satisfaction",
    "ifl_500_ife_satisfaction",
    "ifl_400_food_drink_satisfaction",
    "ifl_100_cabin_crew_satisfaction",
    "arr_100_arrivals_satisfaction",
    "con_100_connections_satisfaction",
    "loy_200_loyalty_programme_satisfaction",
    "img_310_ease_contact_phone_satisfaction",
    "load_factor",
    "otp15_takeoff"
]

# Crear un diccionario para almacenar los límites
bounds = {
    'Economy_SH': {'desired_output': 40.8, 'ytd': np.array([
        71.53872340864697, 78.75654554839483, 80.11452916215028, 74.04299984967855,
        75.60965424745486, 72.8084702569169, 78.22524650876508, 43.00880046374199,
        44.554300671622585, 53.439005754954906, 80.91934994215856, 78.1213030736794,
        68.9443135427536, 65.82854958118871, 52.8143644885526, 87.98863696320771,
        87.94872744785685
    ])},
    'Business_SH': {'desired_output': 49.2, 'ytd': np.array([
        75.21281355971165, 82.01684831634871, 81.09703133030277, 72.74481985846293,
        77.5257587435053, 76.5637379530902, 80.50212103977712, 50.18471338223737,
        43.42476471205928, 75.21962008760555, 88.11284159064535, 80.01284943411335,
        72.73886095505785, 70.24804759939938, 53.61973641048212, 82.88961363106925,
        88.09584501656896
    ])},
    'Economy_LH': {'desired_output': 35.2, 'ytd': np.array([
        69.45899014981862, 75.97682039298141, 83.21558066375458, 75.77040405189314,
        76.13534099727835, 69.0042942723428, 81.203111247732, 45.392686630612594,
        76.5714728414748, 61.18137866367399, 76.57786809158281, 78.80601620406625,
        69.23713704858694, 65.081074046137, 53.6517910918804, 89.21558918425164,
        80.51594150240712
    ])},
    'Business_LH': {'desired_output': 46.1, 'ytd': np.array([
        73.74847921792322, 81.32798786888064, 83.08217829338287, 76.49730874868177,
        77.26554399669993, 76.63356742362893, 82.55472749375869, 46.23855519118868,
        71.9934182917934, 70.16725739263926, 82.41271972809379, 80.97557771556721,
        70.99143320747852, 72.15848777646852, 59.78423355546649, 92.15834718930202,
        80.58326937835764
    ])},
    'Premium Economy_LH': {'desired_output': 37.6, 'ytd': np.array([
        70.47667460599726, 77.76567880595374, 83.49901673416883, 82.68836463802664,
        77.8440324645451, 70.8471777826006, 80.15689019950972, 45.68048301381103,
        74.83582622448148, 57.2564051306007, 74.43244367790237, 79.72244225578565,
        70.86792808007696, 71.24113107741451, 51.64779408101883, 89.66540911621003,
        83.0166270783848
    ])},
}

# Lower bounds start at the year-to-date values
for cabin_haul in bounds.values():
    cabin_haul['lower_bounds'] = cabin_haul['ytd'].copy()

# Upper bounds: year-to-date values plus a margin that depends on each
# variable's correlation with NPS_weighted
for cabin_haul_key, cabin_haul in bounds.items():
    # Keys look like '<cabin>_<haul>'; split on the last underscore only
    cabin_name, haul_name = cabin_haul_key.rsplit('_', 1)
    filtered_data = corr_data[(corr_data['cabin'] == cabin_name) &
                              (corr_data['haul'] == haul_name)]
    if filtered_data.empty:
        # No 'upper_bounds' entry is created for this segment.
        print(f"No data for {cabin_haul_key}")
        continue

    # Correlate only the columns that are needed: the full frame also holds
    # string columns (cabin, haul), which make DataFrame.corr() raise on recent pandas.
    corr_matrix = filtered_data[variables + ['NPS_weighted']].corr()

    # Start from the year-to-date values
    upper_bounds = cabin_haul['ytd'].copy()

    # Widen each bound according to the correlation with NPS_weighted;
    # load_factor and otp15_takeoff always get the smallest margin.
    for i, var in enumerate(variables):
        correlation = corr_matrix.loc[var, 'NPS_weighted']
        if var in ['load_factor', 'otp15_takeoff']:
            upper_bounds[i] += 0.2
        elif correlation > 0.5:
            upper_bounds[i] += 3
        elif correlation > 0.4:
            upper_bounds[i] += 1
        else:
            upper_bounds[i] += 0.2

    cabin_haul['upper_bounds'] = upper_bounds

bounds


In [None]:
cabin='Premium Economy'
haul = 'LH'
corr_data = pd.read_csv('daily_aggregation.csv')
corr_data = corr_data[corr_data['start_date']>='2023-01-01']
variables = [
    'pun_100_punctuality_satisfaction',
    "bkg_200_journey_preparation_satisfaction",
    "pfl_100_checkin_satisfaction",
    "pfl_200_security_satisfaction",
    "pfl_300_lounge_satisfaction",
    "pfl_500_boarding_satisfaction",
    "ifl_300_cabin_satisfaction",
    "ifl_200_flight_crew_annoucements_satisfaction",
    "ifl_600_wifi_satisfaction",
    "ifl_500_ife_satisfaction",
    "ifl_400_food_drink_satisfaction",
    "ifl_100_cabin_crew_satisfaction",
    "arr_100_arrivals_satisfaction",
    "con_100_connections_satisfaction",
    "loy_200_loyalty_programme_satisfaction",
    "img_310_ease_contact_phone_satisfaction",
    "load_factor",
    "otp15_takeoff"
]
# corr_matrix = corr_data[(corr_data['cabin']==cabin) & (corr_data['haul']==haul)][variables+['NPS_weighted']].corr()
corr_matrix = corr_data[(corr_data['cabin_in_surveyed_flight']==cabin) & (corr_data['haul']==haul)][['pun_100_punctuality_satisfaction', 'otp15_takeoff', 'NPS_weighted']].corr()
corr_matrix


In [None]:
corr_data['start_date'] = pd.to_datetime(corr_data['start_date'])

# .copy() so the helper columns below are added to an independent frame
# instead of a slice of corr_data (avoids SettingWithCopyWarning).
filtered_df = corr_data[(corr_data['cabin_in_surveyed_flight'] == cabin) & (corr_data['haul'] == haul)].copy()

# ISO week and ISO year must be taken together: around New Year the ISO week
# can belong to the previous or next calendar year (e.g. 2023-01-01 is ISO
# week 52 of 2022), so pairing the ISO week with the calendar year would merge
# days from opposite ends of the year into one group.
iso_calendar = filtered_df['start_date'].dt.isocalendar()
filtered_df['week'] = iso_calendar['week']
filtered_df['year'] = iso_calendar['year']

# Average the daily values within each ISO week
weekly_df = filtered_df.groupby(['year', 'week']).agg({
    'otp15_takeoff': 'mean',
    'pun_100_punctuality_satisfaction': 'mean'
}).reset_index()

# Representative date: the Monday of each ISO week, parsed with the ISO 8601
# directives (%G ISO year, %V ISO week, %u ISO weekday) that match isocalendar()
weekly_df['date'] = pd.to_datetime(
    weekly_df['year'].astype(str) + '-W' + weekly_df['week'].astype(str).str.zfill(2) + '-1',
    format='%G-W%V-%u'
)

# Create the figure
plt.figure(figsize=(14, 7))

# Plot 'otp15_takeoff'
plt.plot(weekly_df['date'], weekly_df['otp15_takeoff'], label='OTP 15 Takeoff', color='blue', marker='o')

# Plot 'pun_100_punctuality_satisfaction'
plt.plot(weekly_df['date'], weekly_df['pun_100_punctuality_satisfaction'], label='Punctuality satisfaction', color='green', marker='o')

# Labels, title and legend
plt.xlabel('Date')
plt.ylabel('Values')
plt.title('OTP 15 Takeoff and Punctuality satisfaction over Time (Weekly)')
plt.legend()

# Show the chart
plt.show()

In [None]:
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
import numpy as np
import pandas as pd

def scatter_plot_with_regression(df, variable, target):
    """Scatter ``target`` against ``variable`` and overlay a fitted straight line.

    Rows with a missing value in either column are dropped first. The slope
    and R^2 of the fit are written on the chart, which is then shown.

    Args:
        df: source frame.
        variable: column plotted on the x axis (the single regressor).
        target: column plotted on the y axis (the regression target).

    Returns:
        Tuple ``(slope, intercept, r_squared)`` of the fitted line.
    """
    complete_rows = df.dropna(subset=[variable, target])
    x_values = complete_rows[variable]
    y_values = complete_rows[target]

    # Square canvas with the raw points
    plt.figure(figsize=(10, 10))
    plt.scatter(x_values, y_values, alpha=0.6, edgecolors='w', linewidths=0.5, label='Data points')

    # Ordinary least squares with one feature
    features = x_values.values.reshape(-1, 1)
    regression = LinearRegression()
    regression.fit(features, y_values.values)

    plt.plot(x_values, regression.predict(features), color='red', label='Fitted line')

    plt.title(f'{variable} vs {target}')
    plt.xlabel(f'{variable}')
    plt.ylabel(f'{target}')
    plt.legend()
    plt.grid(True)

    slope = regression.coef_[0]
    r_squared = regression.score(features, y_values.values)

    # Annotate slope and R^2 in axes coordinates (bottom-left corner)
    annotations = ((0.10, f'Slope: {slope:.4f}'), (0.05, f'R^2: {r_squared:.4f}'))
    for y_position, caption in annotations:
        plt.text(0.05, y_position, caption, transform=plt.gca().transAxes, fontsize=12, verticalalignment='top')

    plt.show()

    return slope, regression.intercept_, r_squared
scatter_df = corr_data[(corr_data['cabin_in_surveyed_flight']==cabin) & (corr_data['haul']==haul)]
scatter_plot_with_regression(scatter_df, 'otp15_takeoff', 'NPS_weighted' )

In [None]:
import pandas as pd
import numpy as np
from deap import base, creator, tools, algorithms
from darts.dataprocessing.transformers import Scaler
from darts import TimeSeries
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

# Reuse the model and scaler loaded in an earlier cell.
# NOTE(review): that cell loads the pickles for cabin='Economy', haul='SH',
# while a later cell rebinds cabin/haul to 'Premium Economy'/'LH'. On a
# top-to-bottom run the bounds below and the model would then describe
# different segments — confirm the model cell is re-run for the current cabin/haul.
regressor = lgb_model
darts_scaler = scaler

# Target NPS, accepted prediction error and search box for the current segment
desired_output = bounds[f'{cabin}_{haul}']['desired_output']
threshold = 0.05
lower_bounds = bounds[f'{cabin}_{haul}']['lower_bounds']
upper_bounds = bounds[f'{cabin}_{haul}']['upper_bounds']
# Load correlation data
corr_data = pd.read_csv('daily_NPS_2024-06-01.csv')
corr_data = corr_data[(corr_data['cabin'] == cabin) & (corr_data['haul'] == haul)][[
    "bkg_200_journey_preparation_satisfaction",
    "pfl_100_checkin_satisfaction",
    "pfl_200_security_satisfaction",
    "pfl_300_lounge_satisfaction",
    "pfl_500_boarding_satisfaction",
    "ifl_300_cabin_satisfaction",
    "ifl_200_flight_crew_annoucements_satisfaction",
    "ifl_600_wifi_satisfaction",
    "ifl_500_ife_satisfaction",
    "ifl_400_food_drink_satisfaction",
    "ifl_100_cabin_crew_satisfaction",
    "arr_100_arrivals_satisfaction",
    "con_100_connections_satisfaction",
    "loy_200_loyalty_programme_satisfaction",
    "img_310_ease_contact_phone_satisfaction",
    "load_factor",
    "otp15_takeoff"
]]

# Calculate correlation matrix
correlation_matrix = corr_data.corr()

# Fitness: (distance of the predicted NPS to the target, distance of the inputs to YTD)
def fitness_function(individual):
    """Score a candidate vector of feature values.

    The candidate is wrapped in a one-step TimeSeries, scaled with
    ``darts_scaler`` and fed to ``regressor``.

    Returns:
        Tuple ``(prediction_error, cost)``: the absolute gap between the
        prediction and ``desired_output``, and the mean squared error between
        the candidate and the year-to-date values of the current cabin/haul.
        Both are returned separately, as two objectives.
    """
    candidate = np.array(individual)

    scaled_series = darts_scaler.transform(TimeSeries.from_values(candidate.reshape(1, -1)))
    prediction = regressor.predict(scaled_series.values())[0]

    ytd_values = bounds[f'{cabin}_{haul}']['ytd']
    cost = mean_squared_error(candidate, ytd_values)

    return abs(prediction - desired_output), cost

# Clamp every gene into its [lower, upper] interval
def check_bounds(individual, lower_bounds, upper_bounds):
    """Clip each value of ``individual`` to its bounds, in place.

    Values below the lower bound are replaced by that bound, values above the
    upper bound by the upper bound; everything else is left untouched.

    Returns:
        The same ``individual`` object.
    """
    for position, gene in enumerate(individual):
        if gene < lower_bounds[position]:
            individual[position] = lower_bounds[position]
        elif gene > upper_bounds[position]:
            individual[position] = upper_bounds[position]
    return individual

# Check correlations function
def check_correlations(individual, correlation_matrix, tolerance=0.1):
    """Return False as soon as a pair (i, j) deviates from the expected correlation.

    For every pair of positions the value ``correlation_matrix.iloc[i, j]`` is
    compared with ``np.corrcoef(individual[i], individual[j])[0, 1]``; a gap
    larger than ``tolerance`` rejects the individual.

    NOTE(review): when ``individual`` is a flat list of numbers (as built by
    ``create_individual``), each ``individual[i]`` is a single value, and the
    correlation of two single observations looks undefined — presumably NaN,
    which would never exceed the tolerance and make this always return True.
    Confirm the intended input before enabling the commented-out call in ``main``.
    """
    num_vars = len(individual)
    for i in range(num_vars):
        for j in range(i + 1, num_vars):
            expected_corr = correlation_matrix.iloc[i, j]
            actual_corr = np.corrcoef(individual[i], individual[j])[0, 1]
            if abs(actual_corr - expected_corr) > tolerance:
                return False
    return True

# DEAP configuration for multi-objective optimization.
# Both fitness components (prediction error, cost) carry weight -1.0, i.e. both are minimized.
creator.create("FitnessMin", base.Fitness, weights=(-1.0, -1.0))  # Minimizing both objectives
creator.create("Individual", list, fitness=creator.FitnessMin)

def create_individual():
    """Draw one value per variable, uniformly between its lower and upper bound.

    Uses NumPy's global random state; no seed is set in this cell, so results
    differ between runs unless a seed is set elsewhere.
    """
    return [np.random.uniform(low, high) for low, high in zip(lower_bounds, upper_bounds)]

toolbox = base.Toolbox()
# An individual is a creator.Individual filled from create_individual();
# a population is a plain list of such individuals.
toolbox.register("individual", tools.initIterate, creator.Individual, create_individual)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

toolbox.register("evaluate", fitness_function)

# Custom mutation function
def custom_mutate(individual, eta=1.0, low=lower_bounds, up=upper_bounds, indpb=0.2):
    """Mutate ``individual`` in place, keeping every gene inside its bounds.

    Each gene is perturbed with probability ``indpb``. The perturbation size
    depends on the gene's relative distance to its bounds and on ``eta``; the
    formula appears to follow polynomial bounded mutation (cf. DEAP's
    ``tools.mutPolynomialBounded``) — confirm if exact equivalence matters.

    Note: the defaults ``low``/``up`` are bound to the ``lower_bounds`` /
    ``upper_bounds`` objects that exist when this ``def`` runs; rebinding those
    globals later does not change the defaults unless this cell is re-run.

    Args:
        individual: list of gene values, modified in place.
        eta: exponent parameter of the perturbation.
        low: per-gene lower bounds.
        up: per-gene upper bounds.
        indpb: independent probability of mutating each gene.

    Returns:
        One-element tuple containing the mutated individual (DEAP operators
        return tuples).
    """
    size = len(individual)
    for i in range(size):
        if np.random.random() <= indpb:
            x = individual[i]
            xl = low[i]
            xu = up[i]
            # Relative distances to the lower and upper bound; xu == xl would divide by zero.
            delta_1 = (x - xl) / (xu - xl)
            delta_2 = (xu - x) / (xu - xl)
            mut_pow = 1.0 / (eta + 1.0)
            rand = np.random.random()
            if rand < 0.5:
                # Move towards the lower bound (delta_q <= 0)
                xy = 1.0 - delta_1
                val = 2.0 * rand + (1.0 - 2.0 * rand) * (xy ** (eta + 1))
                delta_q = val ** mut_pow - 1.0
            else:
                # Move towards the upper bound (delta_q >= 0)
                xy = 1.0 - delta_2
                val = 2.0 * (1.0 - rand) + 2.0 * (rand - 0.5) * (xy ** (eta + 1))
                delta_q = 1.0 - val ** mut_pow
            x = x + delta_q * (xu - xl)
            # Final clamp into [xl, xu]
            x = min(max(x, xl), xu)
            individual[i] = x
    return individual,

# Blend crossover followed by clamping both children into the search box
def custom_cxBlend(ind1, ind2, alpha=0.5):
    """Apply ``tools.cxBlend`` to two parents and clip the results to the bounds.

    Returns:
        Tuple with the two individuals after crossover and clamping.
    """
    child_a, child_b = tools.cxBlend(ind1, ind2, alpha)
    return (
        check_bounds(child_a, lower_bounds, upper_bounds),
        check_bounds(child_b, lower_bounds, upper_bounds),
    )

toolbox.register("mutate", custom_mutate)
toolbox.register("mate", custom_cxBlend, alpha=0.5)
toolbox.register("select", tools.selNSGA2)  # Using NSGA-II for multi-objective optimization

# Main function
def main():
    """Run the NSGA-II search and collect the candidates that hit the target.

    Every generation creates offspring with ``algorithms.varAnd``, clamps them
    into the bounds, evaluates them, and then selects the next population from
    parents and offspring together. Selecting from the union is what gives
    NSGA-II its selection pressure and elitism: selecting ``len(population)``
    individuals out of an equally sized offspring list would simply return
    every offspring.

    Returns:
        Tuple ``(population, valid_individuals, best_individual)`` where
        ``valid_individuals`` is a list of ``(individual, prediction_error,
        cost)`` with ``prediction_error < threshold`` and ``best_individual``
        is the individual with the lexicographically smallest
        ``(prediction_error, cost)`` seen, or None if nothing was evaluated.
    """
    population = toolbox.population(n=500)
    ngen = 50
    cxpb = 0.5
    mutpb = 0.2

    valid_individuals = []
    best_individual = None
    best_fitness = (float('inf'), float('inf'))

    def _evaluate_and_track(ind):
        """Evaluate one individual, store its fitness and update the bookkeeping."""
        nonlocal best_individual, best_fitness
        fit = toolbox.evaluate(ind)
        ind.fitness.values = fit

        if fit[0] < threshold:
            # and check_correlations(ind, correlation_matrix)
            if all(lower_bounds[i] <= ind[i] <= upper_bounds[i] for i in range(len(ind))):
                valid_individuals.append((ind, fit[0], fit[1]))  # Append cost as well

        # Tuple comparison: prediction error first, cost as tie-breaker
        if fit < best_fitness:
            best_individual = ind
            best_fitness = fit

    # Parents need fitness values before they can compete with their offspring.
    for ind in population:
        _evaluate_and_track(ind)

    for gen in range(ngen):
        # varAnd works on clones, so the parents are left untouched.
        offspring = algorithms.varAnd(population, toolbox, cxpb, mutpb)

        for ind in offspring:
            # Clamp before evaluating so the stored fitness matches the stored genes.
            check_bounds(ind, lower_bounds, upper_bounds)
            _evaluate_and_track(ind)

        population = toolbox.select(population + offspring, k=len(population))

    return population, valid_individuals, best_individual

# Run the search and report the candidates whose prediction error is below the threshold.
if __name__ == "__main__":
    final_population, valid_individuals, best_individual = main()
    
    if valid_individuals:
        print(f"Found {len(valid_individuals)} valid individuals within the threshold.")
        for ind, pred_error, cost in valid_individuals:
            print(f"Individual: {ind}, Prediction Error: {pred_error}, Cost: {cost}")
    else:
        print("No valid individuals found within the threshold.")
    
    # best_individual is None (falsy) when main() never recorded one.
    if best_individual:
        # Re-evaluate to report both objectives of the best candidate.
        best_individual_fitness = fitness_function(best_individual)
        print(f"Best individual: {best_individual}, Prediction Error: {best_individual_fitness[0]}, Cost: {best_individual_fitness[1]}")


In [None]:
# Supongamos que estos son los nombres de las características de tus datos
feature_names =  [
    "start_date",
    "end_date",
    "cabin_in_surveyed_flight",
    "haul",
    "bkg_200_journey_preparation_satisfaction",
    "pfl_100_checkin_satisfaction",
    "pfl_200_security_satisfaction",
    "pfl_300_lounge_satisfaction",
    "pfl_500_boarding_satisfaction",
    "ifl_300_cabin_satisfaction",
    "ifl_200_flight_crew_annoucements_satisfaction",
    "ifl_600_wifi_satisfaction",
    "ifl_500_ife_satisfaction",
    "ifl_400_food_drink_satisfaction",
    "ifl_100_cabin_crew_satisfaction",
    "arr_100_arrivals_satisfaction",
    "con_100_connections_satisfaction",
    "loy_200_loyalty_programme_satisfaction",
    "img_310_ease_contact_phone_satisfaction",
    "load_factor",
    "otp15_takeoff",
    "NPS_weighted",
    "insert_date_ci"
]

# Datos del YTD (Year-To-Date)
# ytd_2024 = [
#     "2024-01-01",
#     "2024-06-01",
#     "Economy",
#     "SH",
#     71.84856075258804,
#     79.06015584914005,
#     80.29968621466156,
#     74.394075935135,
#     76.02849620274152,
#     72.96724176680158,
#     78.49867970533091,
#     43.23740683347413,
#     44.49390640706396,
#     53.73307758411367,
#     81.1828997942445,
#     78.59234695868854,
#     69.37947451916642,
#     65.98255786642703,
#     53.1402991538944,
#     87.94953821163229,
#     88.62830908016969,
#     40.4,
#     '2024-06-01'
# ]
# Datos del YTD (Year-To-Date)
# ytd_2024 = [
#     "2024-01-01",
#     "2024-06-01",
#     "Business",
#     "SH",
#     75.23031137101951, 82.38024124297641, 81.05735897610812, 73.4348802421541,
#     78.16699370088098, 76.69004349622989, 80.57402608565235, 50.02440166461712,
#     43.7342726867898, 75.49615314832714, 88.3768021643377, 80.67670254574661,
#     73.27391902671584, 70.68348691307116, 54.63516058451109, 82.64093457772394,
#     88.91542568137118,
#     47.3,
#     '2024-06-01'
# ]
print(lower_bounds)
# Year-to-date row: identifiers, the YTD feature values (the lower bounds),
# the target NPS and the insert date, in the order of feature_names.
ytd_2024 = ['2024-01-01', '2024-06-01', cabin, haul] + lower_bounds.tolist() + [bounds[f'{cabin}_{haul}']['desired_output'], '2024-06-01']

# Turn the YTD row into a one-row DataFrame
ytd_data = {feature: value for feature, value in zip(feature_names, ytd_2024)}
ytd_df = pd.DataFrame([ytd_data])

# valid_individuals comes from the genetic algorithm: (individual, prediction_error, cost) tuples

# One row per valid individual. Each row uses that individual's own feature
# values followed by the target NPS and the insert date, so the values line up
# with feature_names. No global is modified, so the cell can be re-run safely.
valid_dfs = []
for ind, fit, cost in valid_individuals:
    individual = list(ind) + [desired_output, '2024-06-19']
    values = ['2024-01-01', '2024-06-01', cabin, haul] + individual
    data = {feature: value for feature, value in zip(feature_names, values)}
    valid_dfs.append(pd.DataFrame([data]))

# Stack the YTD row and every valid individual
concatenated_df = pd.concat([ytd_df] + valid_dfs).sort_index(kind='merge')

# Show the combined frame
concatenated_df


In [None]:
def process_dataframe(df):
    """
    Split `df` by (cabin, haul) and keep only identifier, feature and target columns.

    Feature columns are every column whose name ends in '_satisfaction',
    followed by 'load_factor' and 'otp15_takeoff'.

    Returns:
    - the per-segment frames concatenated back together with a fresh 0..n-1 index,
    - a dict '<cabin>_<haul>_df' -> segment DataFrame,
    - a dict '<cabin>_<haul>_df' -> list of feature column names.
    """
    feature_columns = (
        [name for name in df.columns if name.endswith('_satisfaction')]
        + ['load_factor']
        + ['otp15_takeoff']
    )
    id_columns = ['insert_date_ci', 'start_date', 'end_date', 'cabin_in_surveyed_flight', 'haul']
    selected_columns = id_columns + feature_columns + ['NPS_weighted']

    grouped_dfs = {}
    features = {}
    for (cabin_value, haul_value), segment in df.groupby(['cabin_in_surveyed_flight', 'haul']):
        segment_key = f'{cabin_value}_{haul_value}_df'
        # Each segment gets its own feature list object, as callers may mutate it.
        features[segment_key] = list(feature_columns)
        grouped_dfs[segment_key] = segment.copy()[selected_columns]

    # Rebuild a single frame from the filtered segments
    combined = pd.concat(grouped_dfs.values()).reset_index(drop=True)

    return combined, grouped_dfs, features

# Apply the function to the concatenated frame and keep its three results.
# Note: at this point `features_cols` is the per-segment dict returned as the
# third value of process_dataframe.
day_predict_df, day_predict_df_grouped_dfs, features_cols = process_dataframe(concatenated_df)

In [None]:
day_predict_df_grouped_dfs

In [None]:
os.getcwd()

In [None]:
# Build the flat feature list from the column names: every `*_satisfaction`
# column, then `load_factor`, then `otp15_takeoff`.
# Note: this rebinds `features_cols` to a plain list.
satisfaction_cols = [col for col in day_predict_df.columns if col.endswith('_satisfaction')]
otp_cols = ['otp15_takeoff']
features_cols = satisfaction_cols + ['load_factor'] + otp_cols



In [None]:
features_cols

In [None]:
import pandas as pd
import numpy as np
import pickle
import os
import pandas as pd
from darts.timeseries import TimeSeries
import os
import pickle


def compute_shap_and_prediction(row, key, features_cols):
    """
    Compute SHAP values and the predicted NPS for a single-row DataFrame.

    Parameters:
    - row: One-row DataFrame holding (at least) the columns in `features_cols`.
    - key: Suffix identifying which pickled model/scaler files under
      'targets_model' to load.
    - features_cols: List of column names used as future covariates.

    Returns:
    - A 4-tuple `(shap_values_dict, shap_explanation, shap_explain, shap_explained)`:
        * shap_values_dict: '<feature>_nps' -> SHAP value, plus scalar entries
          'out_prob_base' (base value) and 'out_prob_nps' (base + sum of SHAP values).
        * shap_explanation: `shap.Explanation` built from the SHAP values, the base
          value and the unscaled feature values of `row`.
        * shap_explain: the `ShapExplainer` instance.
        * shap_explained: the object returned by `ShapExplainer.explain`.

    Raises:
    - IndexError if a SHAP feature name contains none of `features_cols`.
    """
    # Dummy one-point target series and an all-zero row placed before `row`;
    # only the last covariate row (the real one) is kept below via [-1:].
    aux_nps_ts = TimeSeries.from_series(pd.Series([0]))
    aux_row = pd.DataFrame(0, index=[0], columns=row.columns)
    row_df = pd.concat([aux_row, row]).reset_index(drop=True)

    # NOTE: pickle.load can execute arbitrary code; only load artifacts
    # produced by this project.
    best_tuned_model_dataframe_path = os.path.join('targets_model', f"best_tuned_dataframe_{key}.pkl")
    with open(best_tuned_model_dataframe_path, 'rb') as dataframe_file:
        best_tuned_model = pickle.load(dataframe_file)

    future_scaler_path = os.path.join('targets_model', f"future_scaler_{key}.pkl")
    with open(future_scaler_path, 'rb') as scaler_file:
        future_scaler = pickle.load(scaler_file)

    future_covariates_ts = TimeSeries.from_dataframe(row_df[features_cols])[-1:]
    future_covariates_ts_scaled = future_scaler.transform(future_covariates_ts)

    model_file_path = os.path.join('targets_model', f"best_tuned_mae_model_{key}_{best_tuned_model['model_name']}.pkl")
    with open(model_file_path, 'rb') as model_file:
        model = pickle.load(model_file)

    # Compute SHAP values and derive the prediction as base value + contributions
    shap_explain = ShapExplainer(model=model)
    shap_explained = shap_explain.explain(aux_nps_ts, foreground_future_covariates=future_covariates_ts_scaled)
    shap_explanation = shap_explained.get_shap_explanation_object(horizon=1)

    shap_values = shap_explanation[0].values
    base_value = shap_explanation[0].base_values
    pred_value = base_value + shap_values.sum()

    # Map each SHAP feature name back to the first original column name it contains
    feature_names = []
    for feat in shap_explanation.feature_names:
        name = [f for f in features_cols if f in feat]
        feature_names.append(name[0])

    shap_values_dict = {f"{feature}_nps": value for feature, value in zip(feature_names, shap_values)}
    # Fix: store plain values. The previous trailing commas wrapped both
    # entries in 1-tuples.
    shap_values_dict["out_prob_base"] = base_value
    shap_values_dict["out_prob_nps"] = pred_value

    # Rebuild the explanation so that `data` holds the unscaled feature values of `row`
    shap_explanation = shap.Explanation(values=shap_values,
                                 base_values=base_value,
                                 data=np.array(row_df.loc[1,features_cols].values.flatten().tolist()),
                                 feature_names=shap_explanation.feature_names)

    return shap_values_dict, shap_explanation, shap_explain, shap_explained


# One augmented DataFrame and one dict of explanations per segment key
augmented_dfs = {}
explanations = {}

for key in day_predict_df_grouped_dfs.keys():
    # Collect the augmented one-row frames for this segment
    augmented_rows = []
    explanations[key]={}

    n = len(day_predict_df_grouped_dfs[key])
    for index in range(n):
        # Take the row as a one-row DataFrame. `.copy()` makes it independent of
        # the segment frame, so the column assignments below neither trigger
        # SettingWithCopyWarning nor risk writing into the source frame.
        row_df = day_predict_df_grouped_dfs[key].iloc[[index]].copy()

        # SHAP values (as a dict) plus the explanation objects for this row
        shap_values, explanations[key][index], shap_explain, shap_explained = compute_shap_and_prediction(row_df, key, features_cols)

        # Add every entry of the SHAP dict (including 'out_prob_base' and
        # 'out_prob_nps') as a column of the row
        for feature_name, shap_value in shap_values.items():
            row_df[f'{feature_name}'] = shap_value

        augmented_rows.append(row_df)

    # Concatenate all augmented rows to form the complete augmented DataFrame
    augmented_dfs[key] = pd.concat(augmented_rows).reset_index(drop=True)

# `augmented_dfs` now contains the augmented DataFrames with SHAP values and predictions
augmented_dfs

In [None]:
def create_uplifting_explanation(explanation2, explanation1):
    """
    Build a shap.Explanation describing how `explanation2` differs from `explanation1`.

    Parameters:
        - explanation2: The shap.Explanation being compared (minuend).
        - explanation1: The reference shap.Explanation (subtrahend).

    Returns:
        - A shap.Explanation whose values and data are `explanation2 - explanation1`,
          whose base value is `explanation1.base_values` plus the sum of
          `explanation1.values`, and whose feature names come from `explanation1`.
    """
    # The reference's base value plus its contributions becomes the new baseline
    reference_total = explanation1.base_values + sum(explanation1.values)

    return shap.Explanation(
        values=explanation2.values - explanation1.values,
        base_values=reference_total,
        data=explanation2.data - explanation1.data,
        feature_names=explanation1.feature_names,
    )

In [None]:
import random
# Compare every row of the selected segment against its first row (index 0).
# Fix: the row count is taken from this segment's own explanations instead of
# the global `n` left over from whichever segment the earlier loop processed last.
segment_explanations = explanations[f'{cabin}_{haul}_df']
for i in range(1, len(segment_explanations)):
    diff_explanation = create_uplifting_explanation(segment_explanations[i], segment_explanations[0])
    shap.plots.waterfall(diff_explanation, max_display=30)

In [None]:
scaler

In [None]:
import numpy as np
from scipy.optimize import minimize

# Target overall NPS
nps_global_objetivo = 40

# Share of each category in the overall NPS
porcentajes = np.array([0.23, 0.688, 0.046, 0.023, 0.013])

# Starting NPS per category (e.g. the current values)
nps_inicial = np.array([25.2, 24.8, 36.7, 40.5, 38.3])


def objetivo(nps):
    """Absolute gap between the target and the share-weighted NPS."""
    return np.abs(nps_global_objetivo - np.sum(porcentajes * nps))


def _restriccion_nps_global(nps):
    """Equality constraint: the share-weighted NPS must equal the target."""
    return np.sum(porcentajes * nps) - nps_global_objetivo


constraints = [{'type': 'eq', 'fun': _restriccion_nps_global}]

# Run the optimisation from the starting values
resultado = minimize(objetivo, nps_inicial, constraints=constraints)

nps_optimizado = resultado.x
nps_optimizado


In [None]:
nps_ch = [31.2967217 , 43.03715398, 37.91934434, 41.10967217, 38.64459759]

In [None]:
porcentajes * nps_ch

In [None]:
from deap import base, creator, tools, algorithms
import numpy as np
from darts.dataprocessing.transformers import Scaler
from darts import TimeSeries
import lightgbm as lgb

# `regressor` is taken from `lgb_model` and `darts_scaler` from `scaler`; both
# are defined outside this cell (presumably a trained LightGBM model and a
# fitted darts Scaler — confirm).
regressor = lgb_model
darts_scaler = scaler  # Make sure this scaler has been fitted

# Target prediction the search tries to reach
desired_output = 40.8  # Target value
threshold = 0.5  # An individual counts as valid when |prediction - target| is below this

# Per-feature lower bounds; every feature shares the same upper bound
lower_bounds = np.array([
    71.84856075258804, 79.06015584914005, 80.29968621466156, 74.394075935135,
    76.02849620274152, 72.96724176680158, 78.49867970533091, 43.23740683347413,
    44.49390640706396, 53.73307758411367, 81.1828997942445, 78.59234695868854,
    69.37947451916642, 65.98255786642703, 53.1402991538944, 87.94953821163229,
    88.62830908016969
])
upper_bound = 100

# Definir la función de fitness
def fitness_function(individual):
    """
    Score an individual by how far the model prediction is from `desired_output`.

    The individual is reshaped to a single row, scaled with `darts_scaler`
    and passed to `regressor.predict`. Returns a 1-tuple holding the absolute
    error, as DEAP expects fitness values to be tuples.
    """
    single_row = np.array(individual).reshape(1, -1)
    scaled_row = darts_scaler.transform(TimeSeries.from_values(single_row))
    predicted = regressor.predict(scaled_row.values())[0]
    return (abs(predicted - desired_output),)

# Configure DEAP
creator.create("FitnessMin", base.Fitness, weights=(-1.0,))  # Negative weight: the error is minimised
creator.create("Individual", list, fitness=creator.FitnessMin)

def create_individual():
    """Draw one value per feature uniformly between its lower bound and `upper_bound`."""
    return [np.random.uniform(low, upper_bound) for low in lower_bounds]

# Register how individuals and populations are built
toolbox = base.Toolbox()
toolbox.register("individual", tools.initIterate, creator.Individual, create_individual)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

# Register the evaluation and crossover operators
toolbox.register("evaluate", fitness_function)
toolbox.register("mate", tools.cxBlend, alpha=0.5)

# Función de mutación personalizada
def custom_mutate(individual, eta=1.0, low=lower_bounds, up=upper_bound, indpb=0.2):
    """
    Bounded polynomial mutation applied in place to `individual`.

    Parameters:
    - individual: Mutable sequence of gene values.
    - eta: Exponent of the polynomial perturbation used below.
    - low: Per-gene lower bounds (indexed like `individual`).
    - up: Upper bound shared by every gene.
    - indpb: Independent probability of mutating each gene.

    Returns:
    - A 1-tuple containing the (mutated) individual, as DEAP operators expect.
    """
    mut_pow = 1.0 / (eta + 1.0)
    for i in range(len(individual)):
        if np.random.random() <= indpb:
            x = individual[i]
            xl = low[i]
            xu = up
            span = xu - xl
            if span == 0:
                # Degenerate range (lower bound equals upper bound): the only
                # admissible value is the bound itself. Skipping the formula
                # avoids the division by zero it would otherwise hit.
                individual[i] = xl
                continue
            # Normalised distances of x to the lower and upper bound
            delta_1 = (x - xl) / span
            delta_2 = (xu - x) / span
            rand = np.random.random()
            if rand < 0.5:
                xy = 1.0 - delta_1
                val = 2.0 * rand + (1.0 - 2.0 * rand) * (xy ** (eta + 1))
                delta_q = val ** mut_pow - 1.0
            else:
                xy = 1.0 - delta_2
                val = 2.0 * (1.0 - rand) + 2.0 * (rand - 0.5) * (xy ** (eta + 1))
                delta_q = 1.0 - val ** mut_pow
            x = x + delta_q * span
            # Clamp back into [xl, xu]
            individual[i] = min(max(x, xl), xu)
    return individual,

# Función para verificar y corregir límites
def check_bounds(individual, lower_bounds, upper_bound):
    """
    Clamp every gene of `individual` in place to [lower_bounds[i], upper_bound].

    Returns the same `individual` object.
    """
    for position, gene in enumerate(individual):
        floor = lower_bounds[position]
        if gene < floor:
            individual[position] = floor
            continue
        if gene > upper_bound:
            individual[position] = upper_bound
    return individual

# Register the custom mutation and size-3 tournament selection
toolbox.register("mutate", custom_mutate)
toolbox.register("select", tools.selTournament, tournsize=3)

# Algoritmo Genético
def main():
    """
    Run the genetic search and collect every individual close enough to the target.

    Each of the 40 generations varies the population with `algorithms.varAnd`,
    clamps the offspring to the bounds, evaluates them and selects the next
    population by tournament. An offspring is recorded when its fitness is
    below `threshold` and all its genes lie within the bounds.

    Returns:
    - (population, valid_individuals), where `valid_individuals` is a list of
      (individual, fitness) pairs.
    """
    population = toolbox.population(n=300)
    ngen, cxpb, mutpb = 40, 0.5, 0.2

    valid_individuals = []

    for _ in range(ngen):
        offspring = algorithms.varAnd(population, toolbox, cxpb, mutpb)

        # Crossover/mutation may leave the bounds; clamp before evaluating
        for child in offspring:
            check_bounds(child, lower_bounds, upper_bound)

        for child in offspring:
            score = toolbox.evaluate(child)
            child.fitness.values = score
            if score[0] < threshold and all(
                lower_bounds[i] <= child[i] <= upper_bound for i in range(len(child))
            ):
                valid_individuals.append((child, score[0]))

        population = toolbox.select(offspring, k=len(population))

    return population, valid_individuals

# Run the search and report every (individual, fitness) pair that met the threshold
if __name__ == "__main__":
    final_population, valid_individuals = main()
    
    if valid_individuals:
        print(f"Found {len(valid_individuals)} valid individuals within the threshold.")
        for ind, fit in valid_individuals:
            print(f"Individual: {ind}, Fitness: {fit}")
    else:
        print("No valid individuals found within the threshold.")

        

### With correlations

In [None]:
# One hand-picked feature vector (17 values), scaled and scored the same way
# as in the genetic algorithm's fitness function.
x= [97.42375995221329, 92.99244758064692, 94.66614028951585, 84.31716953093942, 64.58471808817177, 65.56549984921064, 84.65530936151131, 78.97155970760377, 41.10966437383675, 74.20977029479558, 93.49575711119695, 91.1191902578862, 75.83371697767345, 79.74900801910867, 61.43827504156183, 94.85133966251193, 97.41642359172391]


series = TimeSeries.from_values(np.array(x).reshape(1, -1))
x_scaled = darts_scaler.transform(series)
prediction = regressor.predict(x_scaled.values())[0]

prediction

In [None]:
# Column layout of one row: 4 identifier columns, 17 model features,
# then the NPS value and the insert date (23 names in total).
feature_names =  [
    "start_date",
    "end_date",
    "cabin_in_surveyed_flight",
    "haul","bkg_200_journey_preparation_satisfaction",
    "pfl_100_checkin_satisfaction",
    "pfl_200_security_satisfaction",
    "pfl_300_lounge_satisfaction",
    "pfl_500_boarding_satisfaction",
    "ifl_300_cabin_satisfaction",
    "ifl_200_flight_crew_annoucements_satisfaction",
    "ifl_600_wifi_satisfaction",
    "ifl_500_ife_satisfaction",
    "ifl_400_food_drink_satisfaction",
    "ifl_100_cabin_crew_satisfaction",
    "arr_100_arrivals_satisfaction",
    "con_100_connections_satisfaction",
    "loy_200_loyalty_programme_satisfaction",
    "img_310_ease_contact_phone_satisfaction",
    "load_factor",
    "otp15_takeoff",
    "NPS_weighted",
    "insert_date_ci"
]

# One candidate individual (17 feature values)
individual = [71.84856075258804, 95.33316761178381, 80.53743268440473, 100, 76.02849620274152, 72.96724176680158, 
        97.63242229989943, 90.21300041214877, 44.49390640706396, 99.38310607594644, 85.62715830119255, 81.48347972667862, 
        69.37947451916642, 92.85334359929657, 53.1402991538944, 97.85631448138926, 88.91503365519448]
# Full row for the candidate. Note the f-string stores `desired_output` as text in NPS_weighted.
values = ['2024-01-01', '2024-06-01', 'Economy', 'SH'] + individual + [f'{desired_output}', '2024-06-19']

# Year-to-date row for Economy / SH, laid out in `feature_names` order:
# 4 identifiers, 17 feature values, NPS_weighted, insert_date_ci.
ytd_2024 = [
    "2024-01-01",
    "2024-06-01",
    "Economy",
    "SH",
    71.84856075258804,
    79.06015584914005,
    80.29968621466156,
    74.394075935135,
    76.02849620274152,
    72.96724176680158,
    78.49867970533091,
    43.23740683347413,
    44.49390640706396,
    53.73307758411367,
    81.1828997942445,
    78.59234695868854,
    69.37947451916642,
    65.98255786642703,
    53.1402991538944,
    87.94953821163229,
    88.62830908016969,
    # Fix: the NPS_weighted value was missing, so zip() put the date below into
    # NPS_weighted and left insert_date_ci unset. 40.4 is the value that precedes
    # the date in the commented-out YTD list with these same feature values
    # earlier in the notebook — TODO confirm it is still the right YTD NPS.
    40.4,
    '2024-06-01'
]



# Pair each feature name with the candidate's value (zip stops at the shorter sequence)
data = {feature: value for feature, value in zip(feature_names, values)}

# Convert the dictionary into a one-row DataFrame
df = pd.DataFrame([data])

# Same pairing for the year-to-date row
ytd_data ={feature: value for feature, value in zip(feature_names, ytd_2024)}

# Convert the dictionary into a one-row DataFrame
ytd_df = pd.DataFrame([ytd_data])

# Stack the YTD row above the candidate row; both rows keep index label 0
concatenated_df = pd.concat([ytd_df, df]).sort_index(kind='merge')

In [None]:

# Gain-based feature importance of the LightGBM model.
# NOTE(review): the genetic-algorithm cell imports the package as `lgb`
# (`import lightgbm as lgb`); confirm the bare name `lightgbm` is also imported.
lightgbm.plot_importance(lgb_model, importance_type="gain", figsize=(7,6), title="LightGBM Feature Importance (Gain)")
plt.show()

In [None]:
# SHAP summary plot for whatever `model` is bound to when this cell runs.
# NOTE(review): `model` is rebound by several other cells (LinearRegression,
# RandomForestRegressor); confirm it holds the darts model here.
shap_explain = ShapExplainer(model=model)
shap_explain.summary_plot()

In [None]:
augmented_dfs

In [None]:
bsh_xlsx = augmented_dfs['Business_SH_df'][['cabin_in_surveyed_flight', 'haul', 'insert_date_ci', 'start_date', 'end_date', 'otp15_takeoff', 'out_prob_nps']]

In [None]:
bsh_xlsx.to_excel('business_sh_inc_otp_08052024.xlsx')

In [None]:
eco_xlsx = augmented_dfs['Economy_SH_df'][['cabin_in_surveyed_flight', 'haul', 'insert_date_ci', 'start_date', 'end_date', 'otp15_takeoff', 'out_prob_nps']]

In [None]:
eco_xlsx.to_excel('economy_sh_inc_otp_08052024.xlsx')

In [None]:
chinese_LH

In [None]:
augmented_dfs['Business_LH_df']

In [None]:
augmented_dfs['Business_SH_df']

In [None]:
augmented_dfs['Economy_LH_df']

In [None]:
augmented_dfs['Business_LH_df']

In [None]:
overall_LH_explanation=explanations['Economy_LH_df'][0]

In [None]:
explanations['Business_LH_df']

In [None]:
chinese_LH_explanation=explanations['Economy_LH_df'][1]

In [None]:
def create_uplifting_explanation(explanation2, explanation1):
    """
    Build a shap.Explanation describing how `explanation2` differs from `explanation1`.

    Note: this cell redefines a function of the same name defined earlier in
    the notebook.

    Parameters:
        - explanation2: The shap.Explanation being compared (minuend).
        - explanation1: The reference shap.Explanation (subtrahend).

    Returns:
        - A shap.Explanation whose values and data are `explanation2 - explanation1`,
          whose base value is `explanation1.base_values` plus the sum of
          `explanation1.values`, and whose feature names come from `explanation1`.
    """
    value_delta = explanation2.values - explanation1.values
    # The reference's base value plus its contributions becomes the new baseline
    baseline = explanation1.base_values + sum(explanation1.values)
    data_delta = explanation2.data - explanation1.data

    return shap.Explanation(values=value_delta, base_values=baseline,
                            data=data_delta, feature_names=explanation1.feature_names)

In [None]:
march_diff_explanation = create_uplifting_explanation(chinese_LH_explanation, overall_LH_explanation)

In [None]:
shap.plots.waterfall(overall_LH_explanation, max_display=30)

In [None]:
augmented_dfs['Business_LH_df'].to_excel('shaps_for_march_and_april_Business_LH_comparison.xlsx')

In [None]:
    # Reconstruir el DataFrame original
df = pd.concat(augmented_dfs.values())
df.reset_index(drop=True, inplace=True)

In [None]:
df.to_csv('weekly_predictions.csv')

## Importance representations

In [None]:
df = pd.read_csv('daily_predictions.csv')

In [None]:
# Drop the index column that `to_csv` wrote into the file. `DataFrame.drop`
# returns a new frame, so the result is assigned back (previously it was only
# displayed and `df` kept the column). `errors='ignore'` keeps the cell re-runnable.
df = df.drop(columns=['Unnamed: 0'], errors='ignore')
df

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

# `df` is expected to be loaded already; the settings below pick the segment,
# the explanatory variable and the real/predicted target columns.
haul = 'SH'
cabin = 'Economy'
variable = 'otp15_takeoff'
real_target = 'NPS_weighted'
predicted_target = 'out_prob_nps'

# Define la función para remover outliers
def remove_outliers(data, threshold=1.5):
    """
    Return a boolean mask that is True for values inside the IQR fence.

    The fence spans from `Q1 - threshold * IQR` to `Q3 + threshold * IQR`
    (inclusive), with Q1/Q3 the 25th/75th percentiles of `data`.
    """
    first_quartile = np.percentile(data, 25)
    third_quartile = np.percentile(data, 75)
    margin = threshold * (third_quartile - first_quartile)
    inside_low = data >= first_quartile - margin
    inside_high = data <= third_quartile + margin
    return inside_low & inside_high

# Filter the data to the selected haul/cabin segment
filtered_data = df[(df['haul'] == haul) & (df['cabin_in_surveyed_flight'] == cabin)]
x = filtered_data[variable].to_numpy().reshape(-1, 1)  # Features
y_real = filtered_data[real_target].to_numpy()  # Actual values
y_pred = filtered_data[predicted_target].to_numpy()  # Predicted values

# Boolean masks of the non-outlier points for x and y_real
valid_x = remove_outliers(x.flatten())
valid_y_real = remove_outliers(y_real)

# Keep only the points that are non-outliers in both x and y_real
valid_indices = valid_x & valid_y_real
x_clean = x[valid_indices].reshape(-1, 1)
y_real_clean = y_real[valid_indices]
y_pred_clean = y_pred[valid_indices]

# Fit the linear regression on the cleaned data
model = LinearRegression()
model.fit(x_clean, y_real_clean)

# Evaluate the fitted line on 100 evenly spaced x values
x_fit = np.linspace(x_clean.min(), x_clean.max(), 100).reshape(-1, 1)
y_fit = model.predict(x_fit)

# Scatter plot of the cleaned real values (the fitted line is not drawn in this figure)
plt.figure(figsize=(10,6))
plt.scatter(x_clean, y_real_clean, color='black', alpha=0.6, edgecolors='w', linewidths=0.5, label='Real NPS')
plt.title(f'Impact of {variable} on NPS - Real vs. Predicted with Regression')
plt.xlabel(f'{variable} Values')
plt.ylabel('NPS Values')
plt.grid(True)
plt.legend()



In [None]:
# Scatter plot of the real values with the fitted regression line
plt.figure(figsize=(10,6))
plt.scatter(x_clean, y_real_clean, color='blue', alpha=0.6, edgecolors='w', linewidths=0.5, label='Real NPS')
plt.plot(x_fit, y_fit, color='green', linewidth=2, label='Regression Line')  # Add the regression line
plt.title(f'Impact of {variable} on NPS - Real vs. Predicted with Regression')
plt.xlabel(f'{variable} Values')
plt.ylabel('NPS Values')
plt.grid(True)
plt.legend()

# Place the slope / intercept / R² annotation in the lower-left corner
plt.annotate(f'Slope: {model.coef_[0]:.2f}\nIntercept: {model.intercept_:.2f}\nR² Score: {model.score(x_clean, y_real_clean):.2f}', 
             xy=(0.05, 0.05), xycoords='axes fraction', verticalalignment='bottom', 
             bbox=dict(boxstyle="round,pad=0.3", edgecolor='green', facecolor='white'))
plt.show()

In [None]:
# Scatter plot of real and predicted values with the regression line fitted on the real values
plt.figure(figsize=(10,6))
plt.scatter(x_clean, y_real_clean, color='blue', alpha=0.6, edgecolors='w', linewidths=0.5, label='Real NPS')
plt.scatter(x_clean, y_pred_clean, color='red', alpha=0.6, edgecolors='w', linewidths=0.5, label='Predicted NPS')
plt.plot(x_fit, y_fit, color='green', linewidth=2, label='Regression Line')  # Add the regression line
plt.title(f'Impact of {variable} on NPS - Real vs. Predicted with Regression')
plt.xlabel(f'{variable} Values')
plt.ylabel('NPS Values')
plt.grid(True)
plt.legend()

# Place the slope / intercept / R² annotation in the lower-left corner
plt.annotate(f'Slope: {model.coef_[0]:.2f}\nIntercept: {model.intercept_:.2f}\nR² Score: {model.score(x_clean, y_real_clean):.2f}', 
             xy=(0.05, 0.05), xycoords='axes fraction', verticalalignment='bottom', 
             bbox=dict(boxstyle="round,pad=0.3", edgecolor='green', facecolor='white'))
plt.show()

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LinearRegression
import numpy as np

# Explanatory variable and its SHAP-contribution column ('<variable>_nps').
# `haul` and `cabin` are reused from the earlier regression cell.
# variable = 'ifl_100_cabin_crew_satisfaction'
variable = 'otp15_takeoff'
target = f'{variable}_nps'
# target = 'out_prob_nps'
# target= 'NPS_weighted'


# Filter the data (the same filter is applied again further down in this cell)
filtered_data = df[(df['haul'] == haul) & (df['cabin_in_surveyed_flight'] == cabin)]
x = filtered_data[variable].to_numpy().reshape(-1, 1)  # Features
y = filtered_data[target].to_numpy()  # Target

# Define a function to remove outliers
def remove_outliers(data, threshold=1.5):
    """
    Return a boolean mask that is True for values inside a percentile-based fence.

    This variant uses the 10th and 90th percentiles (not the quartiles): the
    fence spans from `P10 - threshold * (P90 - P10)` to
    `P90 + threshold * (P90 - P10)`, inclusive.
    """
    p10 = np.percentile(data, 10)
    p90 = np.percentile(data, 90)
    margin = threshold * (p90 - p10)
    fence_low = p10 - margin
    fence_high = p90 + margin
    return (data >= fence_low) & (data <= fence_high)


# Assuming 'df' is your DataFrame and x, y are already defined as numpy arrays.
import numpy as np
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt

# Filter data to get the relevant subset (repeats the filter at the top of this cell)
filtered_data = df[(df['haul'] == haul) & (df['cabin_in_surveyed_flight'] == cabin)]
x = filtered_data[variable].values.reshape(-1, 1)  # Features as numpy array
y = filtered_data[target].values            # Target as numpy array

# Identify non-outlier indices for both x and y
valid_x = remove_outliers(x.flatten())  # flatten x to 1D for consistency with y
valid_y = remove_outliers(y)

# Get common indices where both x and y are non-outliers
valid_indices = valid_x & valid_y

# Filter both x and y using the valid_indices
x_clean = x[valid_indices].reshape(-1, 1)
y_clean = y[valid_indices]

# Fit the linear regression model with cleaned data
model_clean = LinearRegression()
model_clean.fit(x_clean, y_clean)

# Get model parameters
slope_clean = model_clean.coef_[0]
intercept_clean = model_clean.intercept_
r2_score_clean = model_clean.score(x_clean, y_clean)

# Generate values for the regression line
x_fit = np.linspace(x_clean.min(), x_clean.max(), 100).reshape(-1, 1)
y_fit = model_clean.predict(x_fit)

# Create the scatter plot with regression line
plt.figure(figsize=(10, 6))
plt.scatter(x_clean, y_clean, alpha=0.6, edgecolors='w', linewidths=0.5)
plt.plot(x_fit, y_fit, color='red', linewidth=2)  # Add the regression line
plt.title(f'{variable} vs {target}')
plt.xlabel(f'{variable} Actual Values')
plt.ylabel(f'{target}')
plt.grid(True)
# Slope / intercept / R² annotation in the upper-left corner
plt.annotate(f'Slope: {slope_clean:.2f}\nIntercept: {intercept_clean:.2f}\nR² Score: {r2_score_clean:.2f}', xy=(0.05, 0.95), xycoords='axes fraction', 
             verticalalignment='top', bbox=dict(boxstyle="round,pad=0.3", edgecolor='red', facecolor='white'))
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor

# Generate synthetic data with a non-linear relationship
np.random.seed(0)
X1 = np.random.uniform(-3, 3, 100)
X2 = np.random.uniform(-3, 3, 100)
Y = np.sin(X1) + np.cos(X2) + np.random.normal(0, 0.1, 100)  # Non-linear relationship

# Fit a non-linear model
# NOTE: this rebinds `model`, which the regression plot cells above read
# (`model.coef_`, `model.intercept_`); re-running those cells after this one
# would use the random forest instead of the linear regression.
model = RandomForestRegressor()
model.fit(np.column_stack((X1, X2)), Y)

# Predictions for visualization on a 100 x 100 grid
x1_range = np.linspace(-3, 3, 100)
x2_range = np.linspace(-3, 3, 100)
X1_grid, X2_grid = np.meshgrid(x1_range, x2_range)
Y_pred = model.predict(np.c_[X1_grid.ravel(), X2_grid.ravel()]).reshape(X1_grid.shape)

# Plot
plt.figure(figsize=(10, 7))
plt.contourf(X1_grid, X2_grid, Y_pred, levels=30, cmap='viridis')
plt.colorbar()
plt.xlabel('X1')
plt.ylabel('X2')
plt.title('Predicted Y from Non-Linear Model')
plt.show()





In [None]:
targets_pred = df[(df['insert_date_ci']=='2023-11-21')&(pd.to_datetime(df['end_date']).dt.year>2023)]

In [None]:
# NOTE(review): a cell only displays its last expression, so the column subset
# on the next line is computed and discarded; only the full `targets_pred`
# frame is shown. Drop one of the two lines depending on which view is wanted.
targets_pred[['start_date', 'end_date','cabin_in_surveyed_flight', 'haul', 'NPS_weighted', 'out_prob_nps', 'out_prob_base']]
targets_pred

In [None]:
# Work on a copy of the target predictions (this rebinds the notebook-wide `df`)
df= targets_pred.copy()
# Gap between the real NPS and the model prediction
df['difference'] = df['NPS_weighted'] - df['out_prob_nps']

# SHAP contribution columns: every column ending in "_nps" except the prediction itself
shap_columns = [col for col in df.columns if col.endswith('_nps') and col != 'out_prob_nps']

# Función para ajustar los valores de SHAP
def adjust_shap_values(row):
    """
    Spread the row's 'difference' evenly over the SHAP columns of matching sign.

    A positive difference is shared among the columns in `shap_columns` whose
    value is positive; a negative difference among those whose value is
    negative. The row is modified in place and returned. Rows with a zero (or
    NaN) difference, or with no column of the matching sign, are unchanged.
    """
    gap = row['difference']
    if gap > 0:
        receivers = [col for col in shap_columns if row[col] > 0]
    elif gap < 0:
        receivers = [col for col in shap_columns if row[col] < 0]
    else:
        receivers = []

    if receivers:
        share = gap / len(receivers)
        for col in receivers:
            row[col] += share
    return row

# Apply the adjustment to every row of the DataFrame
df = df.apply(adjust_shap_values, axis=1)

# Recompute out_prob_nps as the sum of the adjusted SHAP values plus the base value
df['out_prob_nps'] = df[shap_columns].sum(axis=1) + df['out_prob_base']

# Drop the helper 'difference' column, which is no longer needed
df.drop(columns=['difference'], inplace=True)
df

In [None]:
import numpy as np

def prepare_and_adjust_shap(df, shap_columns, desired_nps, base_output_prob, clamp_min, clamp_max):
    """
    Rescale the SHAP columns of `df` towards `desired_nps` and flag the rows that match.

    Adds 'target_shap_sum' (`desired_nps - base_output_prob`), lets
    `normalize_clamp_shap` rewrite the SHAP columns, recomputes 'out_prob_nps'
    as 'out_prob_base' plus the SHAP sum, and adds 'is_correct_nps', which is
    True where 'out_prob_nps' is within 1e-5 of 'NPS_weighted'.

    `df` is modified in place and returned.
    """
    # Total contribution the SHAP values must add up to
    df['target_shap_sum'] = desired_nps - base_output_prob

    adjusted = normalize_clamp_shap(df, shap_columns, df['target_shap_sum'], clamp_min, clamp_max)

    shap_total = adjusted[shap_columns].sum(axis=1)
    adjusted['out_prob_nps'] = adjusted['out_prob_base'] + shap_total

    matches_target = np.isclose(adjusted['out_prob_nps'], adjusted['NPS_weighted'], atol=1e-5)
    adjusted['is_correct_nps'] = matches_target

    return adjusted

def normalize_clamp_shap(df, shap_columns, target_shap_sum, clamp_min, clamp_max):
    """
    Rescale each row's SHAP values to sum to `target_shap_sum`, then shrink them into bounds.

    Each row is first scaled so its SHAP values add up to the target. If any
    value then exceeds `clamp_max` or `clamp_min` in magnitude, the whole row
    is divided by the largest such ratio, and the result is finally clipped to
    [clamp_min, clamp_max]. The SHAP columns of `df` are overwritten in place
    and `df` is returned.
    """
    row_totals = df[shap_columns].sum(axis=1)
    rescaled = df[shap_columns].div(row_totals, axis=0).mul(target_shap_sum, axis=0)

    # Largest |value / bound| per row; rows already inside the bounds use factor 1
    ratio_to_max = np.abs(rescaled / clamp_max)
    ratio_to_min = np.abs(rescaled / clamp_min)
    shrink = np.maximum(ratio_to_max, ratio_to_min).max(axis=1)
    shrink = shrink.mask(shrink < 1, 1)

    bounded = rescaled.div(shrink, axis=0).clip(lower=clamp_min, upper=clamp_max)
    df.loc[:, shap_columns] = bounded

    return df

# Example usage (`shap_columns` comes from the earlier adjustment cell):
clamp_min, clamp_max = -5, 5  # Bounds applied to every SHAP value
desired_nps = targets_pred['NPS_weighted']  # The real NPS is used as the desired NPS
base_output_prob = targets_pred['out_prob_base']  # Base value of the model output

# Work on a copy so `targets_pred` itself is left untouched
adjusted_df = prepare_and_adjust_shap(targets_pred.copy(), shap_columns, desired_nps, base_output_prob, clamp_min, clamp_max)
print(adjusted_df[['out_prob_nps', 'NPS_weighted', 'is_correct_nps']])




In [None]:
adjusted_df

In [None]:
import numpy as np

def prepare_and_adjust_shap(df, shap_columns, desired_nps, base_output_prob, clamp_min, clamp_max):
    """
    Rescale the SHAP columns of `df` so that base + SHAP sum reaches `desired_nps`.

    Steps: add 'target_shap_sum' (`desired_nps - base_output_prob`), normalise
    and clip the SHAP columns with `normalize_clamp_shap`, recompute
    'out_prob_nps', renormalise with `final_normalization`, and finally add
    'is_correct_nps' (True where 'out_prob_nps' is within 1e-5 of 'NPS_weighted').

    `df` is modified in place and returned.
    """
    # Total contribution the SHAP values must add up to
    df['target_shap_sum'] = desired_nps - base_output_prob

    clamped = normalize_clamp_shap(df, shap_columns, df['target_shap_sum'], clamp_min, clamp_max)

    shap_total = clamped[shap_columns].sum(axis=1)
    clamped['out_prob_nps'] = clamped['out_prob_base'] + shap_total

    # Clipping can break the sum, so rescale once more to hit the desired NPS
    renormalized = final_normalization(clamped, shap_columns, desired_nps)

    matches_target = np.isclose(renormalized['out_prob_nps'], renormalized['NPS_weighted'], atol=1e-5)
    renormalized['is_correct_nps'] = matches_target

    return renormalized

def normalize_clamp_shap(df, shap_columns, target_shap_sum, clamp_min, clamp_max):
    """
    Scale each row's SHAP values to sum to `target_shap_sum`, then clip to the bounds.

    The SHAP columns of `df` are overwritten in place and `df` is returned.
    """
    row_totals = df[shap_columns].sum(axis=1)
    shares = df[shap_columns].div(row_totals, axis=0)
    df[shap_columns] = shares.mul(target_shap_sum, axis=0).clip(lower=clamp_min, upper=clamp_max)
    return df

def final_normalization(df, shap_columns, desired_nps):
    """Rescale SHAP values so ``out_prob_base`` plus their row sum equals ``desired_nps``.

    Each row's SHAP values are multiplied by
    ``(desired_nps - out_prob_base) / current SHAP sum`` and
    ``df['out_prob_nps']`` is recomputed from the rescaled values.

    Args:
        df: DataFrame containing ``shap_columns`` and ``'out_prob_base'``.
            Columns are assigned onto this object (no copy is made).
        shap_columns: Names of the SHAP value columns.
        desired_nps: Target NPS per row.

    Returns:
        The same ``df`` object with rescaled SHAP columns and an updated
        ``'out_prob_nps'`` column.

    Notes:
        Rows whose SHAP values sum to exactly zero cannot be rescaled; they get
        a factor of 1 (values unchanged) instead of the inf/NaN a division by
        zero would produce. For those rows ``out_prob_nps`` equals
        ``out_prob_base`` and will generally not match ``desired_nps``.
        No clipping is applied here.
    """
    # Total contribution the SHAP values must add up to.
    total_needed_shap = desired_nps - df['out_prob_base']
    current_shap_sum = df[shap_columns].sum(axis=1)

    # Per-row multiplier; neutral (1.0) where the current sum is zero.
    normalization_factor = (total_needed_shap / current_shap_sum).where(current_shap_sum != 0, 1.0)

    df[shap_columns] = df[shap_columns].mul(normalization_factor, axis=0)
    df['out_prob_nps'] = df['out_prob_base'] + df[shap_columns].sum(axis=1)
    return df

# Example usage
# Lower / upper bound for individual SHAP values in the clamping step
clamp_min = -5
clamp_max = 5
# Assumed to be the NPS value each row should end up with
desired_nps = targets_pred['NPS_weighted']
# Assumed to be the base output probability of the model
base_output_prob = targets_pred['out_prob_base']

# A copy is passed in, so targets_pred itself is not the frame being adjusted
adjusted_df = prepare_and_adjust_shap(
    targets_pred.copy(),
    shap_columns,
    desired_nps,
    base_output_prob,
    clamp_min,
    clamp_max,
)
print(adjusted_df[['out_prob_nps', 'NPS_weighted', 'is_correct_nps']])


In [None]:
# Manual SHAP overrides for specific (cabin, haul, start_date) segments.
#
# Each entry pairs a segment key with the SHAP columns to overwrite and the
# value to write. Entries and columns are applied in the order listed, which
# is the order the individual override cells used to run in.
#
# NOTE(review): the values are hard-coded and their origin is not documented
# in this section -- confirm them before reuse. In particular, 6 for
# 'ifl_100_cabin_crew_satisfaction_nps' lies outside the (-5, 5) clamp bounds
# used for the automatic adjustment.
MANUAL_SHAP_OVERRIDES = [
    (('Business', 'LH', '2024-01-01'), {
        'loy_200_loyalty_programme_satisfaction_nps': -3.6,
        'img_310_ease_contact_phone_satisfaction_nps': -1.2,
    }),
    (('Premium Economy', 'LH', '2024-08-01'), {
        'ifl_100_cabin_crew_satisfaction_nps': 6,
        'ifl_400_food_drink_satisfaction_nps': 4.332,
        'pfl_100_checkin_satisfaction_nps': -1.5,
    }),
    (('Premium Economy', 'LH', '2024-10-01'), {
        'otp15_takeoff_nps': 4.014,
        'load_factor_nps': 1.863,
        'img_310_ease_contact_phone_satisfaction_nps': 2.085,
        'ifl_400_food_drink_satisfaction_nps': -4.5,
        'pfl_100_checkin_satisfaction_nps': 4.092,
        'ifl_100_cabin_crew_satisfaction_nps': -2.592,
        'ifl_200_flight_crew_annoucements_satisfaction_nps': -0.254,
    }),
]

for (_override_cabin, _override_haul, _override_start), _override_values in MANUAL_SHAP_OVERRIDES:
    # Rows belonging to this segment
    condition = (
        (adjusted_df['cabin_in_surveyed_flight'] == _override_cabin) &
        (adjusted_df['haul'] == _override_haul) &
        (adjusted_df['start_date'] == _override_start)
    )
    # Overwrite each listed SHAP column on the matching rows
    for _override_column, _override_value in _override_values.items():
        adjusted_df.loc[condition, _override_column] = _override_value

In [None]:
def check_final(df, shap_columns, desired_nps, base_output_prob):
    """Recompute the SHAP-implied prediction and flag rows that hit the target.

    Args:
        df: DataFrame containing ``shap_columns``, ``'out_prob_base'`` and
            ``'NPS_weighted'``. Two columns are assigned onto this object.
        shap_columns: Names of the SHAP value columns to sum per row.
        desired_nps: Accepted but not read; the comparison uses
            ``df['NPS_weighted']``.
        base_output_prob: Accepted but not read; the sum uses
            ``df['out_prob_base']``.

    Returns:
        The same ``df`` with ``'out_prob_nps'`` (base plus SHAP row sum) and
        ``'is_correct_nps'`` (``np.isclose`` against ``NPS_weighted`` with
        ``atol=1e-5``) set.
    """
    # Base value plus the row-wise SHAP total.
    reconstructed = df['out_prob_base'] + df[shap_columns].sum(axis=1)
    df['out_prob_nps'] = reconstructed

    # True where the reconstruction is close to the target NPS.
    matches_target = np.isclose(df['out_prob_nps'], df['NPS_weighted'], atol=1e-5)
    df['is_correct_nps'] = matches_target

    return df
# Recompute out_prob_nps / is_correct_nps on a copy of adjusted_df after the manual edits above.
# desired_nps and base_output_prob are passed through, but check_final does not read them.
adjusted_df = check_final(adjusted_df.copy(), shap_columns, desired_nps, base_output_prob)

In [None]:
# Compare the recomputed prediction with the target NPS and show the per-row match flag.
print(adjusted_df[['out_prob_nps', 'NPS_weighted', 'is_correct_nps']])

In [None]:
# Rescale the SHAP columns per row by (desired_nps - out_prob_base) / current SHAP sum.
# final_normalization assigns into the frame it receives and returns that frame,
# so `df` and `adjusted_df` refer to the same object after this call.
df = final_normalization(adjusted_df, shap_columns, desired_nps)

In [None]:
# Element-wise check that the renormalized prediction matches NPS_weighted
# (absolute tolerance 1e-5, on top of np.isclose's default relative tolerance).
np.isclose(df['out_prob_nps'], df['NPS_weighted'], atol=1e-5)

In [None]:
# Last rows of adjusted_df, the frame that was passed to final_normalization above.
adjusted_df.tail()

In [None]:
def add_shap_sum_column(df, shap_columns):
    """Add ``'sum_adjusted_shaps'``: the row-wise SHAP total plus ``out_prob_base``.

    Despite the column name, the stored value includes the base value, not
    only the SHAP contributions.

    Args:
        df: DataFrame containing ``shap_columns`` and ``'out_prob_base'``.
            The new column is assigned onto this object.
        shap_columns: Names of the SHAP value columns to sum per row.

    Returns:
        The same ``df`` with ``'sum_adjusted_shaps'`` set.
    """
    row_shap_total = df[shap_columns].sum(axis=1)
    df['sum_adjusted_shaps'] = row_shap_total + df['out_prob_base']
    return df

# Add 'sum_adjusted_shaps' (SHAP row sum plus out_prob_base) to adjusted_df.
# add_shap_sum_column writes the column onto the frame it is given and returns that same frame.
adjusted_df = add_shap_sum_column(adjusted_df, shap_columns)

In [None]:
# Per-row view of the segment keys next to the target NPS, the recomputed prediction,
# the SHAP-plus-base sum and the base value.
adjusted_df[['start_date', 'end_date','cabin_in_surveyed_flight', 'haul', 'NPS_weighted', 'out_prob_nps','sum_adjusted_shaps', 'out_prob_base']]

In [None]:
# Rescale the SHAP columns of targets_pred so that, per row,
# out_prob_base + sum(SHAP) equals NPS_weighted. targets_pred is modified directly.

# Step 1: Identify all Shapley columns, excluding 'out_prob_nps'
# NOTE(review): any other column ending in '_nps' (for example an
# 'is_correct_nps' flag, if targets_pred carries one) is also picked up --
# check the printed list.
# NOTE(review): shap_columns is already used by earlier cells in this section;
# confirm it is defined before them on a fresh kernel.
shap_columns = [col for col in targets_pred.columns if col.endswith('_nps') and col != 'out_prob_nps']
print(shap_columns)

# Step 2: Calculate the current total Shapley values per row
current_shap_total = targets_pred[shap_columns].sum(axis=1)

# Step 3: Calculate the target Shapley sum (NPS_weighted - out_prob_base)
target_shap_sum = targets_pred['NPS_weighted'] - targets_pred['out_prob_base']

# Step 4: Calculate the adjustment ratio.
# Rows whose SHAP values sum to exactly zero cannot be rescaled; they get a
# neutral ratio of 1 instead of the inf/NaN a division by zero would produce.
adjustment_ratio = (target_shap_sum / current_shap_total).where(current_shap_total != 0, 1.0)

# Step 5: Adjust Shapley values using the adjustment ratio (all columns at once)
targets_pred[shap_columns] = targets_pred[shap_columns].mul(adjustment_ratio, axis=0)



In [None]:
# NOTE(review): in the visible cells `df` was last bound to the result of
# final_normalization(adjusted_df, ...); the preceding cell rescales targets_pred,
# not this frame -- confirm this is the frame meant to be inspected.
df

In [None]:
# Write `df` to an Excel file in the current working directory (relative path).
# NOTE(review): `df` is the frame from the final_normalization call, not the
# targets_pred frame rescaled two cells above -- confirm which one should be exported.
df.to_excel('final_shaps_for_targets.xlsx')

# Debug concatenated targets

In [None]:
# Load the corrected NPS targets from a CSV given by relative path.
targets_df= pd.read_csv('corrected_nps_data.csv')

In [None]:
# Load the historic predictions from a CSV given by relative path.
# NOTE(review): the ' (1)' suffix in the filename looks like a re-downloaded copy -- confirm it is the intended file.
df_historic = pd.read_csv('historic_predictions_q1 (1).csv')

In [None]:
# Stack the historic predictions on top of the targets (with the targets'
# 'NPS_weighted' column dropped) and renumber the index from 0.
concatenated_df = pd.concat([df_historic,targets_df.drop(columns='NPS_weighted')], ignore_index=True)

In [None]:
# Inspect the concatenated frame built in the previous cell.
concatenated_df