![](https://www.extensiv.com/hubfs/Skubana/Blog%20Pages/Imported_Blog_Media/big%20data%20analytics%2C%20business%20team%20working%20on%20computer-Oct-12-2022-05-51-21-63-PM.jpg)

<div style="padding: 40px; background: linear-gradient(135deg, #f5f7fa, #cdd2d8); border: 3px groove #d1d8e0; border-radius: 30px; box-shadow: 0 10px 25px rgba(0,0,0,0.1); font-size: 120%; line-height: 1.9; color: #333; font-family: 'Georgia', serif; text-align: justify; position: relative;">
    <h2 style="color: #2c3e50; font-size: 150%; border-bottom: 3px solid #3498db; display: inline-block; padding-bottom: 10px; margin-bottom: 20px;">
        Business Problem
    </h2>
    <p style="font-size: 140%; color: #34495e; letter-spacing: 1px;">An avant-garde financial technology platform is revolutionizing the digital shopping experience.</p>
    <p>Creating a bridge between buyers and sellers, it provides a payment infrastructure tailored for e-commerce businesses, marketplaces, and individual users. As the year draws to a close, our main challenge is to forecast the daily transaction volume for each <span style="color: #c0392b; text-decoration: underline dotted #c0392b; text-decoration-thickness: 2px;">merchant_id</span> over the final quarter (October–December) of 2020.</p>
</div>

<div style="border: 10px solid #3498db; border-radius: 30px; padding: 40px; box-shadow: 10px 10px 25px #2980b9; background: linear-gradient(to bottom right, #D6EAF8, #EBF5FB);">
    <h2 style="font-size: 34px; font-weight: bold; color: #0E6655; text-align: center; border-bottom: 5px solid #F1C40F; font-family: 'Arial'; padding: 20px; margin-top: 0; box-shadow: 5px 5px 15px #7D8A99; background-color: #FEF5E7; border-radius: 20px;">📊 Dataset Story</h2>
    <ul style="font-size: 18px; font-family: 'Calibri'; padding: 30px; line-height: 1.7; list-style-type: none;">
        <li>📌 <strong style="color: #0E6655;">Dataset Origin:</strong> This dataset contains the records of 7 merchant businesses from 2018 to 2020.</li>
        <li>📄 <strong style="color: #0E6655;">Attributes:</strong>
            <ul>
                <li>transaction_date: Date of the transactions</li>
                <li>Total_Transaction: Number of transactions</li>
                <li>merchant_id: IDs of the merchant businesses</li>
                <li>Total_Paid: Payment amount</li>
            </ul>
        </li>
    </ul>
</div>

# <div style="padding: 30px; color:white; margin:10; font-size:75%; text-align:left; display:fill; border-radius:10px; background-color:#3b3745"><b><span style='color:#F1A424'>1 |</span></b> <b>Importing Libraries</b></div>

In [None]:
import itertools
import warnings
import numpy as np
import seaborn as sns
import pandas as pd
import lightgbm as lgb
from matplotlib import pyplot as plt
import statsmodels.api as sm
import statsmodels.tsa.api as smt
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.holtwinters import SimpleExpSmoothing
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_absolute_error
from IPython.display import HTML as html_print
from termcolor import colored
from IPython.display import display
warnings.filterwarnings('ignore')

# <div style="padding: 30px; color:white; margin:10; font-size:75%; text-align:left; display:fill; border-radius:10px; background-color:#3b3745"><b><span style='color:#F1A424'>2 |</span></b> <b>Adjusting Row & Column Settings</b></div>

In [None]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 500)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
warnings.filterwarnings('ignore')

# <div style="padding: 30px; color:white; margin:10; font-size:75%; text-align:left; display:fill; border-radius:10px; background-color:#3b3745"><b><span style='color:#F1A424'>3 |</span></b> <b>Loading The Data Set</b></div>

In [None]:
df = pd.read_csv("/kaggle/input/transaction01/iyzico_data.csv")

In [None]:
df.head()

# <div style="padding: 30px; color:white; margin:10; font-size:75%; text-align:left; display:fill; border-radius:10px; background-color:#3b3745"><b><span style='color:#F1A424'>4 |</span></b> <b>Exploratory Data Analysis</b></div>

In [None]:
def print_section_title(title):
    """Print ``title`` to stdout as bold, underlined blue text."""
    styled_title = colored(title, 'blue', attrs=['bold', 'underline'])
    print(styled_title)
    
def display_head_and_tail(dataframe, head=5):
    """Render the first and last ``head`` rows of ``dataframe`` as captioned tables."""
    previews = (
        ("Head", dataframe.head(head)),
        ("Tail", dataframe.tail(head)),
    )
    for caption, rows in previews:
        display(rows.style.set_caption(caption))

def display_na(dataframe):
    """Render a captioned table listing the number of missing values per column."""
    na_counts = dataframe.isnull().sum()
    na_table = na_counts.reset_index()
    # reset_index() yields two columns (original column label, count); name them.
    na_table.columns = ['Column', 'Number of NA']
    styled = na_table.style.set_caption("Number of NA Values")
    display(styled)

def display_quantiles(dataframe):
    """Render ``describe()`` statistics at selected percentiles, one row per described column."""
    percentiles = [0, 0.05, 0.50, 0.95, 0.99, 1]
    summary = dataframe.describe(percentiles).transpose()
    # Every cell is formatted with two decimals before being shown.
    styled = summary.style.format("{:.2f}").set_caption("Quantiles")
    display(styled)

def check_df(dataframe, head=5):
    """Print a quick overview of ``dataframe``.

    Sections shown, in order: shape, column dtypes, ``info()``, head & tail,
    missing-value counts and quantiles.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to inspect.
    head : int, default 5
        Number of rows shown in both the head and the tail preview.
    """
    print_section_title('Shape')
    print(dataframe.shape)
    print_section_title('Types')
    # A Styler must be rendered with display(); print() would only show its object repr.
    display(dataframe.dtypes.to_frame('Data Type').style.set_caption("Data Types"))
    print_section_title('Info')
    # info() writes its report itself and returns None, so it is not wrapped in print().
    dataframe.info()
    print_section_title('Head & Tail')
    display_head_and_tail(dataframe, head)
    print_section_title('NA Values')
    display_na(dataframe)
    print_section_title('Quantiles')
    display_quantiles(dataframe)

In [None]:
check_df(df)

In [None]:
# Drop the irrelevant "Unnamed: 0" column from the dataset.
# errors="ignore" keeps this cell safe to re-run once the column is already gone.
df = df.drop(columns="Unnamed: 0", errors="ignore")

In [None]:
df.head()

In [None]:
df.tail()

In [None]:
# We converted the transaction_date column, which had 7667 non-null objects, to datetime.

df["transaction_date"].dtypes

In [None]:
df.dtypes

In [None]:
df["transaction_date"] = pd.to_datetime(df["transaction_date"])

In [None]:
df.dtypes

In [None]:
# We identified the start and end dates of the dataset.

df["transaction_date"].min()

In [None]:
df["transaction_date"].max()

In [None]:
# What were the ID numbers of the merchant businesses?

df["merchant_id"].unique()

In [None]:
# What is the total payment amount for each merchant business?

df.groupby("merchant_id").agg({"Total_Paid":"sum"})

In [None]:
# Plot the transaction count of each merchant business, one panel per year.
# Each window is (inclusive start, exclusive end, label used in the panel title).
yearly_windows = [("2018-01-01", "2019-01-01", "2018-2019"),
                  ("2019-01-01", "2020-01-01", "2019-2020")]

# `merchant` rather than `id`, so the builtin id() is not shadowed in the notebook namespace.
for merchant in df.merchant_id.unique():
    merchant_rows = df[df.merchant_id == merchant]
    # Exactly one row of axes per window, so no empty panel is left in the figure.
    fig, axes = plt.subplots(len(yearly_windows), 1, figsize=(15, 10))
    for ax, (start, end, label) in zip(axes, yearly_windows):
        in_window = (merchant_rows.transaction_date >= start) & (merchant_rows.transaction_date < end)
        window_rows = merchant_rows[in_window]
        # Plot against the date so the x-axis is a calendar axis instead of the raw row index.
        ax.plot(window_rows["transaction_date"], window_rows["Total_Transaction"])
        ax.set_title("-- Shop " + str(merchant) + " -- " + label + " -- Transaction Count")
        ax.set_xlabel('')
    plt.show()
    # Release the figure; one is created per merchant.
    plt.close(fig)

# <div style="padding: 30px; color:white; margin:10; font-size:75%; text-align:left; display:fill; border-radius:10px; background-color:#3b3745"><b><span style='color:#F1A424'>5 |</span></b> <b>Feature Engineering</b></div>

<h2 style="background-color: #f2f2f2; padding: 10px; color: #0c5674;">Date Features</h2>

In [None]:
def create_date_features(df, date_column):
    """Add calendar-derived feature columns to ``df`` and return it.

    The frame is modified in place; the same object is returned so the call can
    be used as ``df = create_date_features(df, ...)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``date_column``.
    date_column : str
        Name of a datetime-typed column (it is accessed through ``.dt``).

    Returns
    -------
    pandas.DataFrame
        ``df`` with these columns added: month, day_of_month, day_of_year,
        week_of_year, day_of_week, year, is_wknd, is_month_start, is_month_end,
        quarter, is_quarter_start, is_quarter_end, is_year_start, is_year_end.
    """
    df['month'] = df[date_column].dt.month
    df['day_of_month'] = df[date_column].dt.day
    df['day_of_year'] = df[date_column].dt.dayofyear
    # ISO week number taken from isocalendar(); the result keeps the dtype isocalendar() returns.
    df['week_of_year'] = df[date_column].dt.isocalendar().week
    df['day_of_week'] = df[date_column].dt.dayofweek
    df['year'] = df[date_column].dt.year
    # dayofweek is Monday=0 ... Sunday=6, so the weekend is 5 (Saturday) and 6 (Sunday).
    # The previous `weekday // 4` also flagged Friday (4) as weekend.
    df["is_wknd"] = (df[date_column].dt.dayofweek >= 5).astype(int)
    df['is_month_start'] = df[date_column].dt.is_month_start.astype(int)
    df['is_month_end'] = df[date_column].dt.is_month_end.astype(int)
    df['quarter'] = df[date_column].dt.quarter
    df['is_quarter_start'] = df[date_column].dt.is_quarter_start.astype(int)
    df['is_quarter_end'] = df[date_column].dt.is_quarter_end.astype(int)
    df['is_year_start'] = df[date_column].dt.is_year_start.astype(int)
    df['is_year_end'] = df[date_column].dt.is_year_end.astype(int)
    return df

In [None]:
df = create_date_features(df, "transaction_date")

In [None]:
df.info()

In [None]:
# We corrected a type error in the 'week_of_year' variable and converted it to int64 because it caused an error when building the model.

df['week_of_year'] = df['week_of_year'].astype('int64')

In [None]:
df.info()

In [None]:
df.head()

In [None]:
# Transaction statistics per merchant business at daily granularity: grouping by year, month and day of month gives one group per merchant per calendar day.

df.groupby(["merchant_id","year","month","day_of_month"]).agg({"Total_Transaction": ["sum", "mean", "median"]}).head()

In [None]:
# We determined the total payment amounts for each merchant business on a monthly and yearly basis.

df.groupby(["merchant_id","year","month"]).agg({"Total_Paid": ["sum", "mean", "median"]}).head(12)

<h2 style="background-color: #f2f2f2; padding: 10px; color: #0c5674;">Lag/Shifted Features</h2>

<div style="background-color: #f7f7f9; padding: 10px 15px; border-radius: 5px; border: 1px solid #e0e0e0; margin: 20px 0;">
    <h3 style="color: #333;">Lagged Features and Regularization</h3>
    <p style="color: #555;">
        In this section, we implemented "lagged" features, a commonly used technique when working with time series data. These features are derived from the previous values of the transaction count (<code>Total_Transaction</code>) of each merchant (<code>merchant_id</code>). The primary goal of creating lagged features is to predict future values of a time series more accurately using its past values.
    </p>
    <p style="color: #555;">
        However, to prevent the model from overfitting, we introduced random noise to these lagged features. This makes it challenging for the model to achieve a perfect fit on the training data, aiming to enhance its generalization capability.
    </p>
    <p style="color: #555;">
        In conclusion, in this section, we not only obtained lagged features crucial for time series analysis but also performed regularization by adding random noise to enhance the model's ability to generalize.
    </p>
</div>

In [None]:
def random_noise(dataframe):
    """Return one Gaussian noise value (mean 0, std 1.6) per row of ``dataframe``.

    Draws from NumPy's global random state, so results vary between runs
    unless that state is seeded by the caller.
    """
    n_rows = len(dataframe)
    return np.random.normal(loc=0.0, scale=1.6, size=n_rows)

In [None]:
def lag_features(dataframe, lags):
    """Add one noisy lagged copy of ``Total_Transaction`` per entry in ``lags``.

    For every ``lag`` a column ``sales_lag_<lag>`` is written to ``dataframe``
    (in place). Its values are the merchant's ``Total_Transaction`` shifted by
    ``lag`` rows within that merchant's group, plus ``random_noise``. The shift
    is row-based, so it follows the frame's existing row order.

    Returns the same ``dataframe`` object.
    """
    for lag in lags:
        def shift_by_lag(series):
            # Shift inside a single merchant's series only.
            return series.shift(lag)

        shifted = dataframe.groupby(["merchant_id"])['Total_Transaction'].transform(shift_by_lag)
        column_name = f'sales_lag_{lag}'
        dataframe[column_name] = shifted + random_noise(dataframe)
    return dataframe

In [None]:
# Lag windows are built from ranges so that no value is duplicated or skipped
# (the hand-written list contained 352 twice and was missing 353).
df = lag_features(df, [91, 92,
                       *range(170, 191),   # 170..190
                       *range(350, 371),   # 350..370
                       *range(538, 543),   # 538..542
                       *range(718, 723)])  # 718..722

In [None]:
df.head()

<h2 style="background-color: #f2f2f2; padding: 10px; color: #0c5674;">Rolling Mean Features</h2>

<div style="background-color: #f7f7f9; padding: 10px 15px; border-radius: 5px; border: 1px solid #e0e0e0; margin: 20px 0;">
    <h3 style="color: #333;">Rolling Mean Features</h3>
    <p style="color: #555;">
        In this section, we implemented the "rolling mean" features, commonly used in time series analysis to smooth out short-term fluctuations and clarify the overall trend. Additionally, we introduced random noise to these features with the aim of enhancing the model's generalization capability.
    </p>
</div>

In [None]:
def roll_mean_features(dataframe, windows):
    """Add one noisy rolling-mean column of ``Total_Transaction`` per window size.

    For every ``window`` a column ``sales_roll_mean_<window>`` is written to
    ``dataframe`` (in place). Within each merchant the series is shifted by one
    row, so the current row's own value is not part of its window. It is then
    averaged over ``window`` rows with a triangular window (``win_type="triang"``),
    requiring at least 10 observations, and ``random_noise`` is added.

    Returns the same ``dataframe`` object.
    """
    for window in windows:
        def shifted_triangular_mean(series):
            previous_values = series.shift(1)
            rolling = previous_values.rolling(window=window, min_periods=10, win_type="triang")
            return rolling.mean()

        smoothed = dataframe.groupby("merchant_id")['Total_Transaction'].transform(shifted_triangular_mean)
        dataframe['sales_roll_mean_' + str(window)] = smoothed + random_noise(dataframe)
    return dataframe

In [None]:
df = roll_mean_features(df, [91,92,178,179,180,181,182,359,360,361,449,450,451,539,540,541,629,630,631,720])

In [None]:
df.head()

<h2 style="background-color: #f2f2f2; padding: 10px; color: #0c5674;">Exponentially Weighted Mean Features</h2>

<div style="background-color: #f7f7f9; padding: 10px 15px; border-radius: 5px; border: 1px solid #e0e0e0; margin: 20px 0;">
    <h3 style="color: #333;">Exponentially Weighted Mean Features</h3>
    <p style="color: #555;">
        In this section, we constructed features based on the "exponentially weighted mean" (EWM). Unlike traditional moving averages, EWM assigns exponentially decreasing weights to older data points, making it more responsive to recent changes in the dataset. By leveraging various alphas (smoothing factors) and lags, we generated a comprehensive set of EWM features for our time series analysis.
    </p>
</div>

In [None]:
def ewm_features(dataframe, alphas, lags):
    """Add exponentially weighted mean columns for every (alpha, lag) pair.

    For each combination a column named
    ``sales_ewm_alpha_<alpha without the dot>_lag_<lag>`` is written to
    ``dataframe`` (in place). Its values are the merchant's ``Total_Transaction``
    shifted by ``lag`` rows within the merchant group and then smoothed with
    ``ewm(alpha=alpha).mean()``.

    Returns the same ``dataframe`` object.
    """
    for alpha in alphas:
        # e.g. 0.95 -> "095", used only to build the column name
        alpha_tag = str(alpha).replace(".", "")
        for lag in lags:
            def lagged_ewm(series):
                return series.shift(lag).ewm(alpha=alpha).mean()

            column_name = f"sales_ewm_alpha_{alpha_tag}_lag_{lag}"
            dataframe[column_name] = dataframe.groupby("merchant_id")['Total_Transaction'].transform(lagged_ewm)
    return dataframe

In [None]:
alphas = [0.95, 0.9, 0.8, 0.7, 0.5]

In [None]:
lags = [91,92,178,179,180,181,182,359,360,361,449,450,451,539,540,541,629,630,631,720]

In [None]:
df = ewm_features(df, alphas, lags)

In [None]:
df.tail()

<h2 style="background-color: #f2f2f2; padding: 10px; color: #0c5674;">Black Friday - Summer Solstice</h2>

<div style="background-color: #f7f7f9; padding: 10px 15px; border-radius: 5px; border: 1px solid #e0e0e0; margin: 20px 0;">
    <h3 style="color: #333;">Event-based Features: Black Friday and Summer Solstice</h3>
    <p style="color: #555;">
        In this section, we introduced two binary features to identify specific events in the dataset. The <code>is_black_friday</code> feature indicates whether a given transaction date corresponds to Black Friday, a significant shopping event. Similarly, the <code>is_summer_solstice</code> feature highlights the dates around the Summer Solstice, which could have its own unique influence on transactions. By integrating these event-based indicators, we aim to capture potential anomalies or patterns associated with these dates.
    </p>
</div>

In [None]:
df["is_black_friday"] = 0

In [None]:
# Flag Black Friday and the following Saturday for every year in the data.
# 2020 is included so the flag can fire inside the Q4-2020 validation window.
# Dates are converted to datetimes explicitly: matching a datetime column
# against plain strings with isin() is deprecated in recent pandas releases.
black_friday_dates = pd.to_datetime(["2018-11-23", "2018-11-24",
                                     "2019-11-29", "2019-11-30",
                                     "2020-11-27", "2020-11-28"])
df.loc[df["transaction_date"].isin(black_friday_dates), "is_black_friday"] = 1

In [None]:
df["is_summer_solstice"] = 0

In [None]:
# Flag June 19-22 (the days around the summer solstice) in every year of the data;
# 2020 is included so the feature is defined consistently across the training period.
# Dates are converted to datetimes explicitly: matching a datetime column
# against plain strings with isin() is deprecated in recent pandas releases.
summer_solstice_dates = pd.to_datetime([f"{year}-06-{day}"
                                        for year in (2018, 2019, 2020)
                                        for day in (19, 20, 21, 22)])
df.loc[df["transaction_date"].isin(summer_solstice_dates), "is_summer_solstice"] = 1

# <div style="padding: 30px; color:white; margin:10; font-size:75%; text-align:left; display:fill; border-radius:10px; background-color:#3b3745"><b><span style='color:#F1A424'>5 |</span></b> <b>One-Hot Encoding</b></div>

In [None]:
df.head()

In [None]:
df = pd.get_dummies(df, columns=['merchant_id','day_of_week', 'month'])

In [None]:
df.head()

In [None]:
df['Total_Transaction'] = np.log1p(df["Total_Transaction"].values)

In [None]:
df.head()

# <div style="padding: 30px; color:white; margin:10; font-size:75%; text-align:left; display:fill; border-radius:10px; background-color:#3b3745"><b><span style='color:#F1A424'>6 |</span></b> <b>Custom Cost Function</b></div>

<div style="background-color: #f7f7f9; padding: 10px 15px; border-radius: 5px; border: 1px solid #e0e0e0; margin: 20px 0;">
    <h3 style="color: #333;">Evaluation Metrics: SMAPE</h3>
    <p style="color: #555;">
        In this section, we focus on the Symmetric Mean Absolute Percentage Error (SMAPE) as our evaluation metric. SMAPE is an adjusted version of the standard MAPE, designed to address some of its shortcomings and provide a symmetrical measure, treating both over-predictions and under-predictions equally. We have implemented a general <code>smape</code> function and adapted it for the LightGBM framework with the <code>lgbm_smape</code> function. Utilizing SMAPE helps ensure a balanced evaluation of model predictions against actual values.
    </p>
</div>

In [None]:
# MAE: mean absolute error

# MAPE: mean absolute percentage error

# SMAPE: Symmetric mean absolute percentage error (adjusted MAPE)

In [None]:
def smape(preds, target):
    """Symmetric mean absolute percentage error, in percent (0-200).

    Pairs where both the prediction and the target are exactly 0 are left out
    of the sum (they would produce 0/0), but the total is still divided by the
    original number of predictions.

    Parameters
    ----------
    preds, target : array-like of equal length supporting boolean-mask indexing
        Predicted and actual values.
    """
    total_count = len(preds)
    both_zero = (preds == 0) & (target == 0)
    keep = ~both_zero
    kept_preds = preds[keep]
    kept_target = target[keep]
    abs_error = np.abs(kept_preds - kept_target)
    scale = np.abs(kept_preds) + np.abs(kept_target)
    return (200 * np.sum(abs_error / scale)) / total_count

In [None]:
def lgbm_smape(preds, train_data):
    """SMAPE wrapped as a LightGBM custom evaluation function (``feval``).

    Predictions and labels are passed through ``expm1`` before scoring, which
    reverses the ``log1p`` applied to ``Total_Transaction``.

    Returns
    -------
    tuple
        ``('SMAPE', value, False)``; the final ``False`` is the
        "higher is better" flag.
    """
    actual = np.expm1(train_data.get_label())
    predicted = np.expm1(preds)
    score = smape(predicted, actual)
    return 'SMAPE', score, False

# <div style="padding: 30px; color:white; margin:10; font-size:75%; text-align:left; display:fill; border-radius:10px; background-color:#3b3745"><b><span style='color:#F1A424'>7 |</span></b> <b>Time-Based Validation Sets</b></div>

In [None]:
import re

In [None]:
df = df.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

In [None]:
df.head()

In [None]:
# Training set: every row dated before October 2020.

train = df[df["transaction_date"] < "2020-10-01"]

In [None]:
# Validation set: the last three months of 2020 (rows dated 2020-10-01 or later).

val = df[df["transaction_date"] >= "2020-10-01"]

In [None]:
cols = [col for col in train.columns if col not in ['transaction_date', 'id', "Total_Transaction","Total_Paid", "year" ]]

In [None]:
Y_train = train['Total_Transaction']

In [None]:
X_train = train[cols]

In [None]:
Y_val = val['Total_Transaction']

In [None]:
X_val = val[cols]

In [None]:
# Sanity check: shapes of the target and feature sets for the train and validation splits.
Y_train.shape, X_train.shape, Y_val.shape, X_val.shape

# <div style="padding: 30px; color:white; margin:10; font-size:75%; text-align:left; display:fill; border-radius:10px; background-color:#3b3745"><b><span style='color:#F1A424'>8 |</span></b> <b>LightGBM Model</b></div>

In [None]:
# LightGBM parameters

lgb_params = {'metric': {'mae'},
              'num_leaves': 10,
              'learning_rate': 0.02,
              'feature_fraction': 0.8,
              'max_depth': 5,
              'verbose': 0,
              'num_boost_round': 1000,
              'early_stopping_rounds': 200,
              'nthread': -1}

In [None]:
lgbtrain = lgb.Dataset(data=X_train, label=Y_train, feature_name=cols)

In [None]:
lgbval = lgb.Dataset(data=X_val, label=Y_val, reference=lgbtrain, feature_name=cols)

In [None]:
df.head()

In [None]:
# The `early_stopping_rounds` and `verbose_eval` keyword arguments of lgb.train()
# were removed in LightGBM 4.0; early stopping and periodic logging are
# configured through callbacks instead.
# The two training-control entries are kept out of the booster params so they
# are supplied exactly once (as the explicit argument / callback below).
train_params = {key: value for key, value in lgb_params.items()
                if key not in ('num_boost_round', 'early_stopping_rounds')}

model = lgb.train(train_params, lgbtrain,
                  valid_sets=[lgbtrain, lgbval],
                  num_boost_round=lgb_params['num_boost_round'],
                  feval=lgbm_smape,
                  callbacks=[lgb.early_stopping(stopping_rounds=lgb_params['early_stopping_rounds']),
                             lgb.log_evaluation(period=100)])

In [None]:
y_pred_val = model.predict(X_val, num_iteration=model.best_iteration)

In [None]:
smape(np.expm1(y_pred_val), np.expm1(Y_val))

# <div style="padding: 30px; color:white; margin:10; font-size:75%; text-align:left; display:fill; border-radius:10px; background-color:#3b3745"><b><span style='color:#F1A424'>9 |</span></b> <b>Plot Importance</b></div>

In [None]:
def plot_lgb_importances(model, plot=False, num=10):
    """Show the ``num`` most important features of a trained LightGBM model.

    Features are ranked by gain, expressed as a percentage of the total gain;
    the split count is reported alongside.

    Parameters
    ----------
    model : object exposing ``feature_importance(kind)`` and ``feature_name()``
        The trained booster.
    plot : bool, default False
        If True draw a horizontal bar chart, otherwise print the table.
    num : int, default 10
        Number of top features to show. It applies to both the printed table
        and the chart (the chart previously always showed 25 rows).
    """
    gain = model.feature_importance('gain')
    feat_imp = pd.DataFrame({'feature': model.feature_name(),
                             'split': model.feature_importance('split'),
                             'gain': 100 * gain / gain.sum()}).sort_values('gain', ascending=False)
    top_features = feat_imp.head(num)
    if plot:
        plt.figure(figsize=(10, 10))
        sns.set(font_scale=1)
        sns.barplot(x="gain", y="feature", data=top_features)
        plt.title('feature')
        plt.tight_layout()
        plt.show()
    else:
        print(top_features)

In [None]:
plot_lgb_importances(model, num=30, plot=True)

In [None]:
lgb.plot_importance(model, max_num_features=20, figsize=(10, 10), importance_type="gain")
plt.show()