In [3]:
import shap
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer, r2_score
import matplotlib.pyplot as plt
import joblib

# Load the dataset
df_raw = pd.read_csv('../../Train.csv')
encoder = joblib.load('encoder.joblib')
# Convert the 'date_time' column to datetime and sort the dataset
df_raw['date_time'] = pd.to_datetime(df_raw['date_time'])
df_raw.sort_values('date_time', inplace=True)

# Extracting non-numeric columns
non_numeric_cols = ['is_holiday', 'weather_type', 'weather_description']

# Group by 'date_time' and aggregate: mean for numeric columns, mode for non-numeric columns
agg_funcs = {col: 'mean' for col in df_raw.columns if col not in non_numeric_cols}
agg_funcs.update({col: lambda x: x.mode()[0] if not x.mode().empty else np.nan for col in non_numeric_cols})

df_aggregated = df_raw.groupby('date_time').agg(agg_funcs)

# Extract unique values for categorical columns from df_raw
unique_values = {col: df_raw[col].unique() for col in non_numeric_cols}

# One-hot encode categorical features using unique values
encoded_data = encoder.transform(df_aggregated[non_numeric_cols])

# Create a DataFrame with encoded data and columns
df_encode = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out())

# Ensure all unique values in df_raw are included in df_encode
for col in non_numeric_cols:
    for value in unique_values[col]:
        column_name = f"{col}_{value}"
        if column_name not in df_encode.columns:
            df_encode[column_name] = 0  # Add missing column with zeros

# Reset index of df_encode
df_encode.index = df_aggregated.index

# Concatenate with df_aggregated
df = pd.concat([df_aggregated, df_encode], axis=1)

# Add hour from the 'date_time' column
df['hour'] = df.index.hour

# Feature engineering: create lagged and rolling features
target = 'traffic_volume'
for i in range(1, 4):
    df[f'traffic_volume_lag_{i}'] = df[target].shift(i)
df['traffic_volume_rolling_mean'] = df[target].rolling(window=3).mean().shift(1)
df['traffic_volume_rolling_std'] = df[target].rolling(window=3).std().shift(1)

# Remove rows with NaN values resulting from lagged features
df.dropna(inplace=True)

# Split the dataset into features and the target
X = df.drop(target, axis=1)
y = df[target]

# Save 'date_time' for later use
date_time = df['date_time']

# Drop 'date_time' column before scaling
df = df.drop(columns=['date_time'])

# Scale the numerical features
scaler = joblib.load('scaler2.joblib')
df_scaled = scaler.fit_transform(df)

# Convert scaled data back to DataFrame and add 'date_time' column back
df_scaled = pd.DataFrame(df_scaled, columns=[col for col in df.columns if col != 'date_time'])
df_scaled['date_time'] = date_time.values

# Split the data into train and test sets
total_samples = df_scaled.shape[0]
split_index = int(total_samples * 0.9)

X_train = df_scaled.iloc[:split_index].drop(columns=['date_time'])
y_train = y.iloc[:split_index]
X_test = df_scaled.iloc[split_index:].drop(columns=['date_time'])
y_test = y.iloc[split_index:]

X_train = X_train.drop(target, axis=1)
X_test = X_test.drop(target, axis=1)

best_model_grid = 'best_xgboost_model_gridsearch.joblib'

# Create a SHAP Tree Explainer for the XGBoost model
explainer = shap.TreeExplainer(joblib.load(best_model_grid))

# Calculate SHAP values - this might take some time for larger datasets
shap_values = explainer.shap_values(X_test)

shap.initjs()

# Summary plot
shap.summary_plot(shap_values, X_test, plot_type="bar")

# Mean absolute SHAP values
shap.summary_plot(shap_values, X_test)

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- weather_description_light rain and snow
- weather_description_proximity thunderstorm with drizzle
- weather_description_shower drizzle
- weather_description_shower snow
- weather_description_sleet
- ...


In [None]:
# Force plot for a single prediction
shap.force_plot(explainer.expected_value, shap_values[0, :], X_test.iloc[0, :])

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import joblib
import xgboost as xgb
import pandas as pd
from datetime import timedelta
import shap
import pandas as pd
import numpy as np

# Load the encoder, scaler, and model
encoder = joblib.load('encoder.joblib')
scaler = joblib.load('scaler2.joblib')
model = joblib.load('best_xgboost_model_gridsearch.joblib')

# Load the dataset
df_raw = pd.read_csv('../../Train.csv')

# Convert the 'date_time' column to datetime and sort the dataset
df_raw['date_time'] = pd.to_datetime(df_raw['date_time'])
df_raw.sort_values('date_time', inplace=True)

# Extracting non-numeric columns
non_numeric_cols = ['is_holiday', 'weather_type', 'weather_description']

# for col in non_numeric_cols:
#     print(f"Unique values for {col} in df_raw: {df_raw[col].unique()}")


# Group by 'date_time' and aggregate: mean for numeric columns, mode for non-numeric columns
agg_funcs = {col: 'mean' for col in df_raw.columns if col not in non_numeric_cols}
agg_funcs.update({col: lambda x: x.mode()[0] if not x.mode().empty else np.nan for col in non_numeric_cols})

df_aggregated = df_raw.groupby('date_time').agg(agg_funcs)


# Extract unique values for categorical columns from df_raw
unique_values = {col: df_raw[col].unique() for col in non_numeric_cols}

# One-hot encode categorical features using unique values
encoded_data = encoder.transform(df_aggregated[non_numeric_cols])

# Create a DataFrame with encoded data and columns
df_encode = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out())

# Ensure all unique values in df_raw are included in df_encode
for col in non_numeric_cols:
    for value in unique_values[col]:
        column_name = f"{col}_{value}"
        if column_name not in df_encode.columns:
            df_encode[column_name] = 0  # Add missing column with zeros

# Reset index of df_encode
df_encode.index = df_aggregated.index

# Concatenate with df_aggregated
df = pd.concat([df_aggregated, df_encode], axis=1)

# Add hour from the 'date_time' column
df['hour'] = df['date_time'].dt.hour
df = df.drop(columns=non_numeric_cols)

# Feature engineering: create lagged and rolling features
target = 'traffic_volume'
for i in range(1, 4):
    df[f'traffic_volume_lag_{i}'] = df[target].shift(i)
df['traffic_volume_rolling_mean'] = df[target].rolling(window=3).mean().shift(1)
df['traffic_volume_rolling_std'] = df[target].rolling(window=3).std().shift(1)

# Remove rows with NaN values resulting from lagged features
df.dropna(inplace=True)

# Split the dataset into features and the target
X = df.drop(target, axis=1)
y = df[target]

# Save 'date_time' for later use
date_time = df['date_time']

# Drop 'date_time' column before scaling
df = df.drop(columns=['date_time'])

scaler = joblib.load('scaler2.joblib')
# Check categories in encoder
# print(scaler.get_feature_names_out())

# Scale the numerical features
df_scaled = scaler.transform(df)  # Use the previously loaded scaler

# Convert scaled data back to DataFrame
df_scaled = pd.DataFrame(df_scaled, columns=[col for col in df.columns if col != 'date_time'])
df_scaled['date_time'] = date_time.values

X = df_scaled.drop(columns=['date_time'])

X = X.drop(target, axis=1)
best_model_grid = 'best_xgboost_model_gridsearch.joblib'

# Create a SHAP Tree Explainer for the XGBoost model
explainer = shap.TreeExplainer(joblib.load(best_model_grid))

# Calculate SHAP values - this might take some time for larger datasets
shap_values = explainer.shap_values(X)

shap.initjs()

# Force plot for a single prediction
shap.force_plot(explainer.expected_value, shap_values[0, :], X.iloc[0, :])

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- weather_description_light rain and snow
- weather_description_proximity thunderstorm with drizzle
- weather_description_shower drizzle
- weather_description_shower snow
- weather_description_sleet
- ...


In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer, r2_score
import matplotlib.pyplot as plt
import joblib

encoder = joblib.load('encoderbaru.joblib')
scaler = joblib.load('scalerbaru.joblib')
model = joblib.load('best_xgboost_model.joblib')

# Define custom scorer for MAPE
def mape_scorer(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    non_zero_mask = y_true != 0
    return np.mean(np.abs((y_true[non_zero_mask] - y_pred[non_zero_mask]) / y_true[non_zero_mask])) * 100


# Make scorers from custom scoring functions
mape = make_scorer(mape_scorer, greater_is_better=False)
# Load the dataset
df_raw = pd.read_csv('../../Train.csv')

# Convert the 'date_time' column to datetime and sort the dataset
df_raw['date_time'] = pd.to_datetime(df_raw['date_time'])
df_raw.sort_values('date_time', inplace=True)

# Extracting non-numeric columns
non_numeric_cols = ['is_holiday', 'weather_type', 'weather_description']

# Group by 'date_time' and aggregate: mean for numeric columns, mode for non-numeric columns
agg_funcs = {col: 'mean' for col in df_raw.columns if col not in non_numeric_cols}
agg_funcs.update({col: lambda x: x.mode()[0] if not x.mode().empty else np.nan for col in non_numeric_cols})

df_aggregated = df_raw.groupby('date_time').agg(agg_funcs)

# One-hot encode categorical features
encoded_data = encoder.fit_transform(df_aggregated[non_numeric_cols])

df_encode = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out())

# Reset index of df_encode
df_encode.index = df_aggregated.index

# Concatenate with df_aggregated
df = pd.concat([df_aggregated, df_encode], axis=1)

# Add hour from the 'date_time' column
df['hour'] = df['date_time'].dt.hour
df = df.drop(columns=non_numeric_cols)
# Feature engineering: create lagged and rolling features
target = 'traffic_volume'
for i in range(1, 4):
    df[f'traffic_volume_lag_{i}'] = df[target].shift(i)
df['traffic_volume_rolling_mean'] = df[target].rolling(window=3).mean().shift(1)
df['traffic_volume_rolling_std'] = df[target].rolling(window=3).std().shift(1)

# Remove rows with NaN values resulting from lagged features
df.dropna(inplace=True)

# Split the dataset into features and the target
X = df.drop(target, axis=1)
y = df[target]

# Save 'date_time' for later use
date_time = df['date_time']

# Drop 'date_time' column before scaling
df = df.drop(columns=['date_time'])

# Scale the numerical features
df_scaled = scaler.fit_transform(df)

# Convert scaled data back to DataFrame and add 'date_time' column back
df_scaled = pd.DataFrame(df_scaled, columns=[col for col in df.columns if col != 'date_time'])
df_scaled['date_time'] = date_time.values



In [3]:
import shap
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, make_scorer, r2_score
import matplotlib.pyplot as plt
import joblib
# Create a SHAP Tree Explainer for the XGBoost model
explainer = shap.TreeExplainer(model)

    # Calculate SHAP values - this might take some time for larger datasets
shap_values = explainer.shap_values(X)



ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, The experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:date_time: datetime64[ns]