In [None]:
from typing import Union, List

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm
from xai.data.reader import read_data
from xai.models import LightGBMModel, LogisticRegressionModel
from xai.validation import HoldOutValidation


# Let pandas render up to 500 columns so wide feature frames are not truncated.
pd.options.display.max_columns = 500

# 1) Train and evaluate the LightGBM model

In [None]:
# Read the bookings file; `read_data` returns the feature frame and the target.
DATA_PATH = 'data/hotel_bookings.csv'
features, target = read_data(DATA_PATH)

In [None]:
# Preview the first five rows of the raw feature frame.
features.head(5)

In [None]:
# Single hold-out split with a fixed seed (test_size=0.1 is presumably a 10% test
# fraction -- confirm against HoldOutValidation).
validation = HoldOutValidation(random_state=42, test_size=0.1)

# `split` yields ((X_train, y_train), (X_test, y_test)) pairs; only the first is used.
train_split, test_split = next(validation.split(features, target))
X_train, y_train = train_split
X_test, y_test = test_split

In [None]:
# Hyper-parameters are collected in one place so they are easy to inspect and tune.
LIGHTGBM_PARAMS = {
    'n_estimators': 1000,
    'learning_rate': 0.007,
    'max_depth': -1,
    'num_leaves': 64,
    'random_state': 42,
}
model = LightGBMModel(**LIGHTGBM_PARAMS)

In [None]:
# Fit the wrapper on the training split. The return value of `fit` is rebound to
# `model`; presumably `fit` returns the fitted wrapper, since later cells call
# `model.predict` -- confirm against the wrapper implementation.
model = model.fit(X_train, y_train)

In [None]:
# Fraction of training rows where the output of `model.predict` equals `y_train`.
y_pred_train = model.predict(X_train)
train_matches = (y_pred_train == y_train)
accuracy = np.mean(train_matches)
print(f'Train accuracy = {accuracy}')

In [None]:
# Fraction of hold-out rows where the output of `model.predict` equals `y_test`.
y_pred_test = model.predict(X_test)
test_matches = (y_pred_test == y_test)
accuracy = np.mean(test_matches)
print(f'Test accuracy = {accuracy}')

# 2) Model prediction for a selected observation

In [None]:
# Build the design matrix that the wrapped estimator (`model.model`) is refit on
# and that the profile helpers below perturb.
X_transformed = model._feature_engineering(features, target, train=False)

# Positional indices of the columns that have pandas `category` dtype.
feature_names = X_transformed.columns
categorical_features = [feature_id for feature_id, feature in enumerate(feature_names)
                        if X_transformed[feature].dtype.name == 'category']

# column name -> {integer code: original label}, used for readable tick labels.
# The mapping is read from X_transformed itself (before its columns are replaced by
# codes): the positions in `categorical_features` refer to X_transformed, so indexing
# `features` with them is only correct when both frames share column order and
# categories.
categorical_mapping = {
    feature_names[feature]: dict(zip(X_transformed.iloc[:, feature].cat.codes,
                                     X_transformed.iloc[:, feature].values))
    for feature in categorical_features}

# Replace every categorical column with its integer codes. Assigning by column name
# swaps the whole column (and its dtype); `.iloc[:, i] = ...` writes into the existing
# categorical column in place on pandas >= 2.0.
for categoric_feature in categorical_features:
    column_name = feature_names[categoric_feature]
    X_transformed[column_name] = X_transformed[column_name].cat.codes

feature_names = list(X_transformed.columns)

# NOTE(review): this refits the wrapped estimator on all rows, including the hold-out
# rows scored earlier; the accuracies printed above describe the wrapper fit, not
# this refit.
model.model.fit(X_transformed, target)

# NOTE(review): this expression only renders when it is the last statement of its cell.
X_transformed.head()

def predict_fn(x):
    """Return column 1 of ``model.model.predict_proba(x)``, one value per row of ``x``.

    Reads the global ``model`` at call time. Column 1 is presumably the
    positive-class probability -- confirm against the wrapped estimator.
    """
    probabilities = model.model.predict_proba(x)
    return probabilities[:, 1]

def cp_profile(data: pd.DataFrame, 
               observations: Union[int, List[int]],
               variable_name: str,
              ) -> None:
    """Plot ceteris-paribus (what-if) profiles of one variable for selected rows.

    For every requested row, the row is cloned once per grid value of
    ``variable_name`` (all other columns unchanged) and scored with the global
    ``predict_fn``. The first requested row is drawn as a thick orange line with a
    dot at its actual value and prediction; the remaining rows are thin blue lines.

    Args:
        data: Frame accepted by ``predict_fn``; rows are addressed by position.
        observations: One positional row index or a list of them.
        variable_name: Column of ``data`` to vary.

    Also reads the global ``categorical_mapping`` to label the x-axis when
    ``variable_name`` is one of its keys.
    """
    observations = observations if isinstance(observations, list) else [observations]

    # The grid depends only on the column, so it is built once rather than per row:
    # every distinct value when there are fewer than 1000, otherwise 1000 evenly
    # spaced points between the column minimum and maximum.
    column = data[variable_name]
    if column.nunique() < 1000:
        linspace = np.sort(np.unique(column.dropna()))
    else:
        linspace = np.linspace(start=np.nanmin(column), stop=np.nanmax(column), num=1000)

    predictions = []
    highlighted_point = None  # (actual value, prediction) of the first requested row
    for observation in tqdm(observations):
        row = data.iloc[observation:(observation + 1), :]
        if highlighted_point is None:
            # Take the dot from the same row as the highlighted curve, not from a
            # hard-coded row 0.
            highlighted_point = (row[variable_name].iloc[0], predict_fn(row)[0])
        clones = pd.concat([row] * len(linspace), ignore_index=True)
        clones[variable_name] = linspace
        predictions.append(predict_fn(clones))

    # matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*'.
    style_name = 'seaborn-whitegrid'
    if style_name not in plt.style.available:
        style_name = 'seaborn-v0_8-whitegrid'
    plt.style.use(style_name)
    plt.figure(figsize=(24, 8), facecolor='w')
    for pred_id, pred in enumerate(predictions):
        if pred_id == 0:
            plt.scatter(*highlighted_point, color='#FF8C00', s=200)
            plt.plot(linspace, pred, color='#FF8C00', lw=5, zorder=10)
        else:
            plt.plot(linspace, pred, color='#00B1EB', alpha=0.3, zorder=-1)

    # Tick labels do not depend on the individual curves, so set them once.
    code_labels = categorical_mapping.get(variable_name)
    if code_labels is not None:
        plt.xticks(
            linspace,
            [code_labels.get(code, code) for code in linspace],
            fontsize=18)
    else:
        plt.xticks(fontsize=18)
    plt.xlabel(variable_name, fontsize=24)
    plt.ylabel('Predykcja modelu', fontsize=24)
    plt.yticks(fontsize=18)
    plt.show()

In [None]:
# Score the first row and print the thresholded class (cut-off 0.5) next to the raw score.
first_row = X_transformed.iloc[:1]
prediction = predict_fn(first_row)[0]
print(f'Predykcja modelu:\t{int(prediction>0.5)} ({prediction})')

# 3) Ceteris-paribus profiles for selected variables

In [None]:
# Columns whose profiles are plotted.
INTERESTING_FEATURES = [
    'lead_time',
    'booking_changes',
    'previous_cancellations_percent',
    'previous_bookings_not_canceled_percent',
    'required_car_parking_spaces',
]

# Profile the first 50 rows; `cp_profile` highlights the first of them.
profiled_rows = list(range(50))

for feature in INTERESTING_FEATURES:
    cp_profile(X_transformed, profiled_rows, feature)

# 4) Observations where `is_repeated_guest` moves the prediction in opposite directions

In [None]:
variable_name = 'is_repeated_guest'

# found_asc:  a row was shown whose score is more than 0.2 lower with the flag set to 1.
# found_desc: a row was shown whose score is more than 0.4 higher with the flag set to 1.
found_asc, found_desc = False, False

for row_position in range(len(X_transformed)):
    row = X_transformed.iloc[row_position:row_position + 1, :]
    # Score two what-if copies of the row that differ only in the flag value (0, then 1).
    preds = [predict_fn(row.assign(**{variable_name: flag}))[0] for flag in (0, 1)]
    pred_flag_off, pred_flag_on = preds
    if not found_asc and pred_flag_off - pred_flag_on > 0.2:
        found_asc = True
        display(row)
        print(preds)
    if not found_desc and pred_flag_on - pred_flag_off > 0.4:
        found_desc = True
        display(row)
        print(preds)
    # Stop scanning once one example of each direction has been shown.
    if found_asc and found_desc:
        break

In [None]:
# The label announces a percentage, so format the mean of the flag column as a
# percentage instead of printing the raw fraction.
repeated_guest_share = np.mean(X_transformed[variable_name])
print(f"Procent powracających gości: {repeated_guest_share:.2%}")

# 5) The same analysis with a logistic regression model

In [None]:
# Rebinds the global `model` to a logistic-regression wrapper. `predict_fn` looks
# `model` up at call time, so from this cell on it scores with this object;
# re-running earlier cells after this one will therefore use it as well.
model = LogisticRegressionModel()

In [None]:
# Fit the wrapper on the training split. The return value of `fit` is rebound to
# `model`; presumably `fit` returns the fitted wrapper, since later cells call
# `model.predict` -- confirm against the wrapper implementation.
model = model.fit(X_train, y_train)

In [None]:
# Fraction of training rows where the output of `model.predict` equals `y_train`.
y_pred_train = model.predict(X_train)
train_matches = (y_pred_train == y_train)
accuracy = np.mean(train_matches)
print(f'Train accuracy = {accuracy}')

In [None]:
# Fraction of hold-out rows where the output of `model.predict` equals `y_test`.
y_pred_test = model.predict(X_test)
test_matches = (y_pred_test == y_test)
accuracy = np.mean(test_matches)
print(f'Test accuracy = {accuracy}')

In [None]:
# Build the design matrix that the wrapped estimator (`model.model`) is refit on
# and that the profile helpers below perturb.
X_transformed = model._feature_engineering(features, target, train=False)

# Positional indices of the columns that have pandas `category` dtype.
feature_names = X_transformed.columns
categorical_features = [feature_id for feature_id, feature in enumerate(feature_names)
                        if X_transformed[feature].dtype.name == 'category']

# column name -> {integer code: original label}.
# The mapping is read from X_transformed itself (before its columns are replaced by
# codes): the positions in `categorical_features` refer to X_transformed, so indexing
# `features` with them is only correct when both frames share column order and
# categories.
categorical_mapping = {
    feature_names[feature]: dict(zip(X_transformed.iloc[:, feature].cat.codes,
                                     X_transformed.iloc[:, feature].values))
    for feature in categorical_features}

# Replace every categorical column with its integer codes. Assigning by column name
# swaps the whole column (and its dtype); `.iloc[:, i] = ...` writes into the existing
# categorical column in place on pandas >= 2.0.
for categoric_feature in categorical_features:
    column_name = feature_names[categoric_feature]
    X_transformed[column_name] = X_transformed[column_name].cat.codes

feature_names = list(X_transformed.columns)

# NOTE(review): this refits the wrapped estimator on all rows, including the hold-out
# rows scored earlier; the accuracies printed above describe the wrapper fit, not
# this refit.
model.model.fit(X_transformed, target)

# NOTE(review): this expression only renders when it is the last statement of its cell.
X_transformed.head()

def predict_fn(x):
    """Score ``x`` with the wrapped estimator and keep column 1 of ``predict_proba``.

    Looks up the global ``model`` on every call, so it follows whatever ``model``
    is currently bound to.
    """
    scores = model.model.predict_proba(x)
    return scores[:, 1]

def cp_profile(data: pd.DataFrame, 
               observations: Union[int, List[int]],
               variable_name: str,
              ) -> None:
    """Plot ceteris-paribus (what-if) profiles of one variable for selected rows.

    For every requested row, the row is cloned once per grid value of
    ``variable_name`` (all other columns unchanged) and scored with the global
    ``predict_fn``. The first requested row is drawn as a thick orange line with a
    dot at its actual value and prediction; the remaining rows are thin blue lines.

    Args:
        data: Frame accepted by ``predict_fn``; rows are addressed by position.
        observations: One positional row index or a list of them.
        variable_name: Column of ``data`` to vary.

    Also reads the global ``categorical_mapping`` to label the x-axis when
    ``variable_name`` is one of its keys.
    """
    observations = observations if isinstance(observations, list) else [observations]

    # Grid of values to substitute: every distinct value when there are fewer than
    # 1000, otherwise 1000 evenly spaced points between the column minimum and maximum.
    column = data[variable_name]
    if column.nunique() < 1000:
        linspace = np.sort(np.unique(column.dropna()))
    else:
        linspace = np.linspace(start=np.nanmin(column), stop=np.nanmax(column), num=1000)

    predictions = []
    highlighted_point = None  # (actual value, prediction) of the first requested row
    for observation in tqdm(observations):
        row = data.iloc[observation:(observation + 1), :]
        if highlighted_point is None:
            # Take the dot from the same row as the highlighted curve, not from a
            # hard-coded row 0.
            highlighted_point = (row[variable_name].iloc[0], predict_fn(row)[0])
        clones = pd.concat([row] * len(linspace), ignore_index=True)
        clones[variable_name] = linspace
        predictions.append(predict_fn(clones))

    # matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*'.
    style_name = 'seaborn-whitegrid'
    if style_name not in plt.style.available:
        style_name = 'seaborn-v0_8-whitegrid'
    plt.style.use(style_name)
    plt.figure(figsize=(24, 8), facecolor='w')
    for pred_id, pred in enumerate(predictions):
        if pred_id == 0:
            plt.scatter(*highlighted_point, color='#FF8C00', s=200)
            plt.plot(linspace, pred, color='#FF8C00', lw=5, zorder=10)
        else:
            plt.plot(linspace, pred, color='#00B1EB', alpha=0.3, zorder=-1)

    # Show original category labels on the x-axis when the variable is an encoded
    # categorical column; otherwise only adjust the tick font size.
    code_labels = categorical_mapping.get(variable_name)
    if code_labels is not None:
        plt.xticks(
            linspace,
            [code_labels.get(code, code) for code in linspace],
            fontsize=18)
    else:
        plt.xticks(fontsize=18)
    plt.xlabel(variable_name, fontsize=24)
    plt.ylabel('Predykcja modelu', fontsize=24)
    plt.yticks(fontsize=18)
    plt.show()

In [None]:
# Score the first row and print the thresholded class (cut-off 0.5) next to the raw score.
first_row = X_transformed.iloc[:1]
prediction = predict_fn(first_row)[0]
print(f'Predykcja modelu:\t{int(prediction>0.5)} ({prediction})')

In [None]:
# Columns whose profiles are plotted.
INTERESTING_FEATURES = [
    'lead_time',
    'booking_changes',
    'previous_cancellations_percent',
    'previous_bookings_not_canceled_percent',
    'required_car_parking_spaces',
]

# Profile the first 50 rows; `cp_profile` highlights the first of them.
profiled_rows = list(range(50))

for feature in INTERESTING_FEATURES:
    cp_profile(X_transformed, profiled_rows, feature)