# Moov AI - Question #2

Les insights pouvant permettre aux gestionnaires de mieux comprendre ce qui fait varier leurs ventes sont nombreux. 

*L'analyse se fera en partie par région, en supposant qu'une région correspond à peu près à un secteur auquel un cadre est affecté.

In [1]:
# Import des données et des librairies
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
from IPython.display import display
import plotly.express as px
import plotly.graph_objects as go
from scipy.stats import zscore

from lightgbm import early_stopping, log_evaluation, record_evaluation
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import lightgbm as lgb

In [2]:
# Import csv
df = pd.read_csv(r'/Users/philippebeliveau/Desktop/Notebook/Moov AI/stores_sales_forecasting.csv', encoding='ISO-8859-1')

# Transform 'Order Date' and 'Ship Date' to datetime
df['Order Date'] = pd.to_datetime(df['Order Date'])
df['Ship Date'] = pd.to_datetime(df['Ship Date']) 

# Transform the postal code in a categorical variable
df['Postal Code'] = df['Postal Code'].astype('str')
df['Row ID'] = df['Row ID'].astype('str')

# Create a feature to compute the difference between the order date and the ship date
df['Order Ship Delta'] = (df['Ship Date'] - df['Order Date']).dt.days

# Create a feature regarding the profit margin 
df['Profit Margin'] = df['Profit'] / df['Sales']

print(f"Shape of the dataset: {df.shape}")
# display(df.head(5).style.set_sticky().set_properties(**{'overflow-x': 'auto'}))

Shape of the dataset: (2121, 23)


In [3]:
# Assure that the data has a rows for every single day
def adjust_dataset_for_daily_entries(df, date_col):
    """Ensures that the dataset has a row for every single day, filling missing days with NaN values."""
    df[date_col] = pd.to_datetime(df[date_col])
    all_dates = pd.date_range(start=df[date_col].min(), end=df[date_col].max(), freq='D')
    
    # Ensure all columns are retained, filling missing values with NaN
    full_df = pd.DataFrame(all_dates, columns=[date_col])
    df = full_df.merge(df, on=date_col, how='left')
    
    return df

df = adjust_dataset_for_daily_entries(df, 'Order Date')

# L’impact des “holiday”

Tout d'abord, j'étudierai les variables temporelles, telles que la saisonnalité hebdomadaire et annuelle et l'effet des “holiday” sur les ventes.

In [4]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from scipy.stats import zscore

def adjust_dataset_for_daily_entries(df, date_col):
    """Ensures that the dataset has a row for every single day, filling missing days with NaN values."""
    # Convert the date column to datetime
    df[date_col] = pd.to_datetime(df[date_col])

    # Aggregate sales data by day
    daily_data = df.groupby(date_col).sum(numeric_only=True).reset_index()

    # Create a full date range
    all_dates = pd.date_range(start=daily_data[date_col].min(), end=daily_data[date_col].max(), freq='D')

    # Create a DataFrame with the full date range
    full_dates_df = pd.DataFrame(all_dates, columns=[date_col])

    # Merge the full date range with the aggregated daily data
    df = full_dates_df.merge(daily_data, on=date_col, how='left')
    
    return df

def identify_outliers(df, sales_col):
    """Identifies outliers in the sales data using Z-score method."""
    df['Z_Score'] = zscore(df[sales_col].fillna(0))  # Compute Z-scores for sales
    outliers = df[df['Z_Score'].abs() > 3]  # Z-score > 3 indicates an outlier
    return outliers

def assess_holiday_effect(outliers, date_col, holidays):
    """Checks if outliers coincide with known holidays."""
    outliers[date_col] = pd.to_datetime(outliers[date_col])
    holidays = pd.to_datetime(holidays)
    outliers['Is_Holiday'] = outliers[date_col].isin(holidays)
    return outliers

def plot_outliers_with_holidays(df, date_col, sales_col, outliers):
    """Plots sales data and highlights outliers and holidays."""
    fig = px.line(df, x=date_col, y=sales_col, title='Sales with Outliers and Holidays')
    
    # Highlight outliers
    fig.add_trace(go.Scatter(x=outliers[date_col], y=outliers[sales_col], 
                             mode='markers', 
                             marker=dict(color='red', size=8), 
                             name='Outliers'))
    
    # Highlight holidays
    holidays = outliers[outliers['Is_Holiday']]
    fig.add_trace(go.Scatter(x=holidays[date_col], y=holidays[sales_col], 
                             mode='markers', 
                             marker=dict(color='green', size=14), 
                             name='Holidays'))
    
    fig.update_xaxes(title_text='Date')
    fig.update_yaxes(title_text='Sales')
    fig.show()


In [5]:
holidays = [
    '2014-01-01',  # New Year's Day
    '2014-01-20',  # Martin Luther King Jr. Day
    '2014-02-14',  # Valentine's Day (Sales on gifts, chocolates, flowers)
    '2014-02-17',  # Presidents' Day (Major sales on furniture, appliances, cars)
    '2014-03-17',  # St. Patrick's Day (Sales on alcohol, party supplies)
    '2014-04-20',  # Easter Sunday (Sales on candy, clothing, decorations)
    '2014-05-11',  # Mother's Day (Sales on gifts, jewelry, beauty products)
    '2014-05-26',  # Memorial Day (Major sales on home goods, cars, mattresses)
    '2014-06-15',  # Father's Day (Sales on tools, electronics, clothing)
    '2014-07-04',  # Independence Day (Sales on grills, outdoor furniture, appliances)
    '2014-09-01',  # Labor Day (Major sales on furniture, appliances, clothing)
    '2014-10-13',  # Columbus Day (Retail sales, especially clothing and outdoor gear)
    '2014-10-31',  # Halloween (Sales on costumes, candy, decorations)
    '2014-11-11',  # Veterans Day (Military discounts, retail sales)
    '2014-11-27',  # Thanksgiving
    '2014-11-28',  # Black Friday (Biggest shopping day of the year)
    '2014-12-01',  # Cyber Monday (Major online retail discounts)
    '2014-12-25',  # Christmas Day (Post-Christmas sales)
    '2014-12-26',  # Boxing Day (Retail clearance sales)
    
    '2015-01-01',  # New Year's Day
    '2015-01-19',  # Martin Luther King Jr. Day
    '2015-02-14',  # Valentine's Day
    '2015-02-16',  # Presidents' Day
    '2015-03-17',  # St. Patrick's Day
    '2015-04-05',  # Easter Sunday
    '2015-05-10',  # Mother's Day
    '2015-05-25',  # Memorial Day
    '2015-06-21',  # Father's Day
    '2015-07-04',  # Independence Day
    '2015-09-07',  # Labor Day
    '2015-10-12',  # Columbus Day
    '2015-10-31',  # Halloween
    '2015-11-11',  # Veterans Day
    '2015-11-26',  # Thanksgiving
    '2015-11-27',  # Black Friday
    '2015-11-30',  # Cyber Monday
    '2015-12-25',  # Christmas Day
    '2015-12-26',  # Boxing Day

    '2016-01-01',  # New Year's Day
    '2016-01-18',  # Martin Luther King Jr. Day
    '2016-02-14',  # Valentine's Day
    '2016-02-15',  # Presidents' Day
    '2016-03-17',  # St. Patrick's Day
    '2016-03-27',  # Easter Sunday
    '2016-05-08',  # Mother's Day
    '2016-05-30',  # Memorial Day
    '2016-06-19',  # Father's Day
    '2016-07-04',  # Independence Day
    '2016-09-05',  # Labor Day
    '2016-10-10',  # Columbus Day
    '2016-10-31',  # Halloween
    '2016-11-11',  # Veterans Day
    '2016-11-24',  # Thanksgiving
    '2016-11-25',  # Black Friday
    '2016-11-28',  # Cyber Monday
    '2016-12-25',  # Christmas Day
    '2016-12-26',  # Boxing Day

    '2017-01-01',  # New Year's Day
    '2017-01-16',  # Martin Luther King Jr. Day
    '2017-02-14',  # Valentine's Day
    '2017-02-20',  # Presidents' Day
    '2017-03-17',  # St. Patrick's Day
    '2017-04-16',  # Easter Sunday
    '2017-05-14',  # Mother's Day
    '2017-05-29',  # Memorial Day
    '2017-06-18',  # Father's Day
    '2017-07-04',  # Independence Day
    '2017-09-04',  # Labor Day
    '2017-10-09',  # Columbus Day
    '2017-10-31',  # Halloween
    '2017-11-11',  # Veterans Day
    '2017-11-23',  # Thanksgiving
    '2017-11-24',  # Black Friday
    '2017-11-27',  # Cyber Monday
    '2017-12-25',  # Christmas Day
    '2017-12-26',  # Boxing Day
]


In [6]:
# Convert the holiday dates to datetime if needed
holidays = pd.to_datetime(holidays)

df_z = adjust_dataset_for_daily_entries(df, 'Order Date')
outliers = identify_outliers(df_z, 'Sales')
outliers_with_holidays = assess_holiday_effect(outliers, 'Order Date', holidays)
plot_outliers_with_holidays(df_z, 'Order Date', 'Sales', outliers_with_holidays)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  outliers[date_col] = pd.to_datetime(outliers[date_col])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  outliers['Is_Holiday'] = outliers[date_col].isin(holidays)


In [7]:
def identify_outliers(df, sales_col):
    """Creates a new column indicating if a row is an outlier based on Z-scores."""
    df['Z_Score'] = zscore(df[sales_col].fillna(0))  # Compute Z-scores for sales
    df['Is_Outlier'] = df['Z_Score'].abs() > 3  # Outliers have Z-score > 3
    return df

def outlier_distribution_by_weekday(df, date_col, outlier_col):
    """Analyzes the distribution of outliers across days of the week."""
    df[date_col] = pd.to_datetime(df[date_col])
    df['Day_of_Week'] = df[date_col].dt.day_name()  # Add day of week column
    outlier_distribution = df[df[outlier_col]].groupby('Day_of_Week').size().reset_index(name='Count')
    
    # Sort by day of the week order
    days_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    outlier_distribution['Day_of_Week'] = pd.Categorical(outlier_distribution['Day_of_Week'], categories=days_order, ordered=True)
    outlier_distribution = outlier_distribution.sort_values('Day_of_Week')
    
    # Plot the distribution
    fig = px.bar(outlier_distribution, x='Day_of_Week', y='Count', title='Outlier Distribution by Day of the Week', 
                 labels={'Count': 'Number of Outliers', 'Day_of_Week': 'Day of the Week'})
    fig.show()
    
    return outlier_distribution

# Example usage
df_z = adjust_dataset_for_daily_entries(df, 'Order Date')
df_z = identify_outliers(df, 'Sales')
outlier_distribution = outlier_distribution_by_weekday(df_z, 'Order Date', 'Is_Outlier')


Nous constatons que de nombreuses valeurs aberrantes sont liées à des jours fériés et que ces valeurs aberrantes se produisent principalement les lundis et vendredis.

Le client peut donc s'attendre à avoir des stocks plus importants pendant ces jours fériés.

## L’impact de la saisonnalité

In [8]:
# Ajouter une colonne pour le jour de la semaine
df['Day of Week'] = df['Order Date'].dt.dayofweek

# Créer un boxplot des ventes par jour de la semaine et par région
fig = px.box(df, x='Day of Week', y='Sales', color='Day of Week', title='Sales Distribution by Day of Week and Region')
fig.update_xaxes(title_text='Day of Week', tickmode='array', tickvals=list(range(7)), 
                 ticktext=['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'])
fig.update_yaxes(title_text='Sales')
fig.show()

La saisonnalité au niveau de la semaine montre que dans la région du sud, de plus forte ventes se produisent en moyenne. Les mercredis semblent plus tranquille à travers toute les régions.

Toutefois, il existe une tendance saisonnière assez forte tout au long de l'année, par région.

In [10]:
def plot_seasonality_per_group(df, date_col, sales_col, group_col):
    """Plots seasonality for each group over time using Plotly with normalized sales."""
    df[date_col] = pd.to_datetime(df[date_col])
    df['Month'] = df[date_col].dt.month
    df['Year'] = df[date_col].dt.year
    
    # Calculer la tendance de la saisonnalité
    seasonality_trend = df.groupby([group_col, 'Month'])[sales_col].mean().reset_index()
    
    # Normaliser les ventes par groupe
    seasonality_trend[sales_col] = seasonality_trend.groupby(group_col)[sales_col].transform(lambda x: (x - x.min()) / (x.max() - x.min()))
    
    # Tracer le graphique
    fig = px.line(seasonality_trend, x='Month', y=sales_col, color=group_col, title='Seasonality by Group (Normalized)')
    fig.update_xaxes(title_text='Month', tickmode='array', tickvals=list(range(1, 13)), 
                     ticktext=['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])
    fig.update_yaxes(title_text='Normalized Average Sales')
    fig.show()
    
plot_seasonality_per_group(df, 'Order Date', 'Sales', 'Region')

En moyenne, les mois de février, juillet et octobre enregistrent très peu de ventes, tandis que les mois de mars, juin et novembre en enregistrent davantage. Toutefois, nous constatons également que toutes les régions n'ont pas la même saisonnalité et qu'un gestionnaire doit tenir compte du fait que chaque secteur a sa propre dynamique.

En ce qui concerne ma conclusion sur les perspectives temporelles, je pense qu'un gestionnaire peut être conscient que la dynamique des ventes entre ces régions change, et qu'au cours de l'année, ces ventes ne sont pas constantes, ce qui peut lui permettre de stocker davantage en mars, juin et novembre, et moins en février, juillet et octobre. Il en va de même pour les jours fériés.

### L’impact des rabais

Deuxièmement, je pense qu'il est important d'étudier les bénéfices négatifs, car les ventes ne reflètent pas entièrement ce que les gestionnaires recherchent, c'est-à-dire, en fin de compte, le bénéfice. Ces bénéfices négatifs semblent être dus aux rabais.

In [11]:
import plotly.express as px
import pandas as pd

def visualize_discount_effect(df):
    """
    Visualizes the effect of Discount on Sales and Profit per region using Plotly.
    
    Parameters:
    df (pd.DataFrame): The dataset with 'Discount', 'Region', 'Sales', and 'Profit'.
    """
    # Group data by discount and region
    discount_grouped = df.groupby(['Discount', 'Region'])[['Sales', 'Profit']].mean().reset_index()
    
    # Plot Sales effect
    fig_sales = px.line(discount_grouped, x='Discount', y='Sales', color='Region',
                         title='Effect of Discount on Sales per Region',
                         markers=True, labels={'Sales': 'Average Sales', 'Discount': 'Discount (%)'})
    
    # Plot Profit effect
    fig_profit = px.line(discount_grouped, x='Discount', y='Profit', color='Region',
                          title='Effect of Discount on Profit per Region',
                          markers=True, labels={'Profit': 'Average Profit', 'Discount': 'Discount (%)'})
    
    # Show plots
    fig_sales.show()
    fig_profit.show()

# Example usage
visualize_discount_effect(df)


Ce que nous remarquons, c'est que des bénéfices négatifs significatifs apparaissent pour des rabais d'environ 0,3 à 0,7, mais la question est la suivante : dans les jours/mois qui suivent ces rabais, les bénéfices augmentent-ils considérablement ? 

Analysis of if a discount is useful

In [12]:
df['Is_Discount'] = df['Discount'] > 0

# Aggregate the sales before and after the discount

df['Sales_Before_7_Days'] = df['Sales'].shift(1).rolling(window=7).sum()
df['Sales_After_7_Days'] = df['Sales'].shift(-1).rolling(window=7).sum()

# Compute the impact of discount
df['Sales_Change_After_Discount'] = (df['Sales_After_7_Days'] - df['Sales_Before_7_Days']) / df['Sales_Before_7_Days']

fig = px.line(df, x='Order Date', y='Sales_Change_After_Discount', color='Region',
              title='Sales Before, During, and After Discounts', 
              markers=True)
fig.show()


Ce graphique montre l'augmentation ou la diminution des ventes dans les 7 jours suivant une remise. Ce graphique pourrait être étudié plus en détail avec le client afin d'établir la viabilité de ces rabais et le moment opportun pour les mettre en œuvre.

## Délais de livraison

Troisièmement, les retards de livraison peuvent entraîner une baisse des ventes, voire même de profit négatifs.

In [13]:
import plotly.express as px
import pandas as pd

def visualize_shipping_delay_effect(df):
    """
    Visualizes the effect of shipping delay on Sales and Profit per region using Plotly.
    
    Parameters:
    df (pd.DataFrame): The dataset with 'Order Date', 'Ship Date', 'Region', 'Sales', and 'Profit'.
    """
    # Ensure dates are in datetime format
    df['Order Date'] = pd.to_datetime(df['Order Date'])
    df['Ship Date'] = pd.to_datetime(df['Ship Date'])
    
    # Calculate shipping delay
    df['Shipping Delay (Days)'] = (df['Ship Date'] - df['Order Date']).dt.days
    df = df[df['Shipping Delay (Days)'] >= 0]  # Remove negative values
    
    # Group data by shipping delay and region
    delay_grouped = df.groupby(['Shipping Delay (Days)', 'Region'])[['Sales', 'Profit']].mean().reset_index()
    
    # Plot Sales effect
    fig_sales = px.line(delay_grouped, x='Shipping Delay (Days)', y='Sales', color='Region',
                         title='Effect of Shipping Delay on Sales per Region',
                         markers=True, labels={'Sales': 'Average Sales'})
    
    # Plot Profit effect
    fig_profit = px.line(delay_grouped, x='Shipping Delay (Days)', y='Profit', color='Region',
                          title='Effect of Shipping Delay on Profit per Region',
                          markers=True, labels={'Profit': 'Average Profit'})
    
    # Show plots
    fig_sales.show()
    fig_profit.show()

# Example usage
visualize_shipping_delay_effect(df)


On constate une augmentation significative des ventes avec un délai de livraison de deux jours dans toutes les régions. À ce niveau de granularité, il n'y a pas de schéma clair dans l'impact des délais de livraison sur les ventes ou les bénéfices. 

## Solution ML

Grâce a un modèle en arbre, nous pouvons évaluer l'importance de certaines caractéristiques, ce qui permet une analyse de ces variables sur les ventes par région au niveau de la semaine.

In [14]:
def prepare_data_for_modeling(df, holidays):
    """
    Prepares the data for modeling by performing various transformations and feature engineering.
    """
    # Transform 'Order Date' and 'Ship Date' to datetime
    df['Order Date'] = pd.to_datetime(df['Order Date'])
    df['Ship Date'] = pd.to_datetime(df['Ship Date'])

    # Transform 'Postal Code' and 'Row ID' to categorical variables
    df['Postal Code'] = df['Postal Code'].astype('str')
    df['Row ID'] = df['Row ID'].astype('str')

    # df['Order Ship Delta'] = (df['Ship Date'] - df['Order Date']).dt.days

    # Define US holidays (example list, replace with actual holidays)
    holidays = pd.to_datetime(holidays)

    # Aggregate data by week, calculating weekly metrics
    df['Week'] = df['Order Date'].dt.to_period('W').dt.start_time
    weekly_data = df.groupby(['Week', 'Region']).agg(
        Sales=('Sales', 'sum'),
        Discount=('Discount', 'mean'),
        Quantity=('Quantity', 'sum'),
        Order_Ship_Delta=('Order Ship Delta', 'mean'),
        Profit=('Profit', 'sum')
    ).reset_index()

    # Flag weeks with holidays
    weekly_data['Is_Holiday'] = weekly_data['Week'].isin(holidays).astype(int)

    # Create temporal features for weekly data
    weekly_data['Week_Num'] = weekly_data['Week'].dt.isocalendar().week
    weekly_data['Month'] = weekly_data['Week'].dt.month
    weekly_data['Year'] = weekly_data['Week'].dt.year

    # Number of lags
    n_lags = 6

    # Create lag features for weekly sales
    for lag in range(1, n_lags + 1):
        weekly_data[f'Sales_Lag_{lag}'] = weekly_data.groupby('Region')['Sales'].shift(lag)

    # Create rolling averages for weekly sales
    weekly_data['Sales_MA_2'] = weekly_data.groupby('Region')['Sales'].shift(1).rolling(window=2, min_periods=1).mean().reset_index(0, drop=True)
    weekly_data['Sales_MA_4'] = weekly_data.groupby('Region')['Sales'].shift(1).rolling(window=4, min_periods=1).mean().reset_index(0, drop=True)

    # Create lag and rolling averages for discounts
    weekly_data['Discount_Lag_1'] = weekly_data.groupby('Region')['Discount'].shift(1)
    weekly_data['Discount_MA_2'] = weekly_data.groupby('Region')['Discount'].rolling(window=2, min_periods=1).mean().reset_index(0, drop=True)
    weekly_data['Discount_MA_4'] = weekly_data.groupby('Region')['Discount'].rolling(window=4, min_periods=1).mean().reset_index(0, drop=True)

    # MA of the delay of shipment
    weekly_data['ShipDelays_MA_2'] = weekly_data.groupby('Region')['Order_Ship_Delta'].shift(1).rolling(window=2, min_periods=1).mean().reset_index(0, drop=True)
    weekly_data['ShipDelays_MA_4'] = weekly_data.groupby('Region')['Order_Ship_Delta'].shift(1).rolling(window=4, min_periods=1).mean().reset_index(0, drop=True)

    # Drop first `n_lags` rows for each region to handle NaNs created by lag features
    weekly_data = weekly_data.groupby('Region').apply(lambda x: x.iloc[n_lags:]).reset_index(drop=True)

    # Create Average Demand Interval (ADI) and Coefficient of Variation (CV)
    region_stats = weekly_data.groupby('Region').agg(
        ADI=('Sales', lambda x: len(x) / (x > 0).sum() if (x > 0).sum() > 0 else 0),
        CV=('Sales', lambda x: x.std() / x.mean() if x.mean() > 0 else 0)
    ).reset_index()
    weekly_data = weekly_data.merge(region_stats, on='Region', how='left')

    # Train-test split based on the weekly aggregated data
    cutoff_date = '2017-01-01'
    train_data = weekly_data[weekly_data['Week'] < cutoff_date]
    test_data = weekly_data[weekly_data['Week'] >= cutoff_date]

    # Plot sales for training and testing sets by region using Plotly
    fig = go.Figure()
    for region in weekly_data['Region'].unique():
        train_region = train_data[train_data['Region'] == region]
        test_region = test_data[test_data['Region'] == region]
        fig.add_trace(go.Scatter(x=train_region['Week'], y=train_region['Sales'], mode='lines', name=f'{region} - Train'))
        fig.add_trace(go.Scatter(x=test_region['Week'], y=test_region['Sales'], mode='lines', name=f'{region} - Test'))

    fig.update_layout(title='Weekly Sales by Region - Train and Test Sets',
                      xaxis_title='Week',
                      yaxis_title='Sales')
    fig.show()

    return train_data, test_data, weekly_data

# Usage
train_data, test_data, weekly_data = prepare_data_for_modeling(df, holidays)





In [15]:
# Step 1: Choose relevant columns for features (X) and target (Y)
columns = [
    'Week', 'Region', 'Sales', 'Discount', 'Quantity', 'Profit',
    'Is_Holiday', 'Week_Num', 'Month', 'Year', 'Sales_Lag_1', 'Sales_Lag_2',
    'Sales_Lag_3', 'Sales_Lag_4', 'Sales_Lag_5', 'Sales_Lag_6',
    'Sales_MA_2', 'Sales_MA_4', 'Discount_Lag_1', 'Discount_MA_2',
    'Discount_MA_4', 'ADI', 'CV', 'ShipDelays_MA_2', 'ShipDelays_MA_4',
]

# Define features (X) and target (Y)
X_columns = [
    'Discount', 'Is_Holiday', 'Week_Num', 'Month', 'Year',
    'Sales_Lag_1', 'Sales_Lag_2', 'Sales_Lag_3', 'Sales_Lag_4', 'Sales_Lag_5', 'Sales_Lag_6',
    'Sales_MA_2', 'Sales_MA_4', 'Discount_Lag_1', 'Discount_MA_2', 'Discount_MA_4','ShipDelays_MA_2', 'ShipDelays_MA_4',
    'ADI', 'CV'
]
Y_column = 'Sales'

# Step 2: Extract features and target variable
X = weekly_data[X_columns]
Y = weekly_data[Y_column]

# Step 3: Encode categorical variables properly
# Convert 'Region' to categorical codes and include it in features
weekly_data['Region'] = weekly_data['Region'].astype('category')
X['Region'] = weekly_data['Region'].cat.codes

# Step 4: Train-Test Spblit
cutoff_date = '2017-01-01'
train_data = weekly_data[weekly_data['Week'] < cutoff_date]
test_data = weekly_data[weekly_data['Week'] >= cutoff_date]

X_train = train_data[X_columns + ['Region']]
Y_train = train_data[Y_column]
X_test = test_data[X_columns + ['Region']]
Y_test = test_data[Y_column]

# Step 5: LightGBM Dataset
lgb_train = lgb.Dataset(X_train, label=Y_train, categorical_feature=['Region'],  free_raw_data=False)  # Prevent raw data from being freed)
lgb_test = lgb.Dataset(X_test, label=Y_test, reference=lgb_train,  free_raw_data=False)  # Prevent raw data from being freed)




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [23]:
# Take whatever params
params = {'objective': 'regression', 'metric': 'rmse', 'boosting_type': 'gbdt', 'num_leaves': 15, 'learning_rate': 0.05, 'feature_fraction': 0.7, 'bagging_fraction': 0.5, 'bagging_freq':5,'min_data_in_leaf': 20, 'lambda_l1': 0.1, 'lambda_l2': 0.7, 'verbose': -1}

# Train the final model using the best parameters
evals_result_best = {}
final_model = lgb.train(
    params=params,
    train_set=lgb_train,
    num_boost_round=1000,
    valid_sets=[lgb_train],
    valid_names=['train'],
    callbacks=[
        lgb.early_stopping(stopping_rounds=50),
        lgb.log_evaluation(period=10), 
        lgb.record_evaluation(evals_result_best)     # Record evaluation results
    ]
)


Only training set found, disabling early stopping.



[10]	train's rmse: 1205.66
[20]	train's rmse: 1136.85
[30]	train's rmse: 1071.99
[40]	train's rmse: 1038.85
[50]	train's rmse: 1001.38
[60]	train's rmse: 970.795
[70]	train's rmse: 944.044
[80]	train's rmse: 915.61
[90]	train's rmse: 892.957
[100]	train's rmse: 867.135
[110]	train's rmse: 848.043
[120]	train's rmse: 821.772
[130]	train's rmse: 802.914
[140]	train's rmse: 784.386
[150]	train's rmse: 757.907
[160]	train's rmse: 745.822
[170]	train's rmse: 726.849
[180]	train's rmse: 716.238
[190]	train's rmse: 703.101
[200]	train's rmse: 687.058
[210]	train's rmse: 674.66
[220]	train's rmse: 660.06
[230]	train's rmse: 647.303
[240]	train's rmse: 636.771
[250]	train's rmse: 623.894
[260]	train's rmse: 611.492
[270]	train's rmse: 597.314
[280]	train's rmse: 588.186
[290]	train's rmse: 573.058
[300]	train's rmse: 564.075
[310]	train's rmse: 554.145
[320]	train's rmse: 545.847
[330]	train's rmse: 536.722
[340]	train's rmse: 525.58
[350]	train's rmse: 520.065
[360]	train's rmse: 509.018
[370]

In [17]:
# Extraire l'importance des features
feature_importance = final_model.feature_importance(importance_type='gain')
feature_names = final_model.feature_name()

# Créer un DataFrame pour organiser les features et leur importance
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importance
})

# Trier les features par importance décroissante
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Créer un graphique en barres des importances des features
fig = px.bar(importance_df, x='Importance', y='Feature', orientation='h', title='Feature Importance', labels={'Importance': 'Importance', 'Feature': 'Feature'})
fig.update_layout(yaxis={'categoryorder':'total ascending'})
fig.show()