In [None]:
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import MaxAbsScaler
import xgboost as xgb
from sklearn.model_selection import TimeSeriesSplit
from sklearn.linear_model import BayesianRidge, RidgeCV, Lasso
from sklearn.linear_model import RANSACRegressor, Ridge, LinearRegression
from lazypredict.Supervised import LazyRegressor

In [None]:
# Load the aggregated ATM transactions into a pandas DataFrame.
# The path is relative, so the CSV is looked up in the notebook's working directory.
data = pd.read_csv('AggregatedData.csv')

In [None]:
# Parse 'Transaction Date' into datetimes.
# Slashes are first rewritten as dashes so a single day-month-year format string
# covers both spellings; anything that still fails to parse becomes NaT
# (errors='coerce') instead of raising.
dates_with_dashes = data['Transaction Date'].str.replace('/', '-', regex=False)
data['Transaction Date'] = pd.to_datetime(dates_with_dashes, format='%d-%m-%Y', errors='coerce')

data

In [None]:
# Count missing values per column. For 'Transaction Date' this also counts
# dates that could not be parsed and were coerced to NaT in the previous cell.
data.isna().sum()

In [None]:
# Show the rows whose transaction date is missing (if any).
# isna() already yields a boolean mask, so it is used directly as the row selector.
data.loc[data['Transaction Date'].isna()]

In [None]:
# Numeric day of the week derived from the parsed date (Monday=0 ... Sunday=6)
data['DayOfWeek'] = data['Transaction Date'].dt.weekday

In [None]:
# One row per ATM and transaction date, obtained by summing the remaining columns
atm_data = data.groupby(['ATM Name', 'Transaction Date']).sum().reset_index()

# Integer-encode the holiday descriptors. A single encoder is refitted for each
# column, so after this cell it only remembers the classes of the last column it saw.
label_encoder = LabelEncoder()
for categorical_column in ('Festival Religion', 'Holiday Sequence'):
    atm_data[categorical_column] = label_encoder.fit_transform(atm_data[categorical_column])

# Bring weekday names to one consistent capitalisation before one-hot encoding
atm_data['Weekday'] = atm_data['Weekday'].str.lower().str.capitalize()

# One-hot encode 'Weekday' and 'Working Day', stored as 0/1 integers rather than booleans
one_hot_source = ['Weekday', 'Working Day']
dummy_columns = pd.get_dummies(atm_data[one_hot_source], columns=one_hot_source).astype(int)

# Append the one-hot columns next to the original ones
atm_data = pd.concat([atm_data, dummy_columns], axis=1)

# 'Working Day' itself is label-encoded only now, after its dummies were derived
# from the original values
atm_data['Working Day'] = label_encoder.fit_transform(atm_data['Working Day'])

# Display the data
atm_data

In [None]:
# Amount columns to rescale; every normalized copy is stored as 'Norm <original name>'
columns_to_normalize = ['Total amount Withdrawn', 'Amount withdrawn XYZ Card', 'Amount withdrawn Other Card']
columns_after_normalize = [f'Norm {source_column}' for source_column in columns_to_normalize]

# Min-max scale the selected columns (fitted on all rows) and keep the results
# alongside the raw values instead of overwriting them
scaler = MinMaxScaler()
normalized_values = scaler.fit_transform(atm_data[columns_to_normalize])
atm_data[columns_after_normalize] = normalized_values
atm_data

In [None]:
# Build a date-indexed, chronologically sorted view of the data for plotting.
# set_index() already returns a new DataFrame, so atm_data itself is left
# untouched and no explicit copy() is needed beforehand.
atm_data_day = atm_data.set_index(['Transaction Date']).sort_index()
atm_data_day

In [None]:
# Number of XYZ Card and Other Card withdrawals over time, both on one pair of axes
fig, ax = plt.subplots(figsize=(20, 6))
for count_column in ('No Of XYZ Card Withdrawals', 'No Of Other Card Withdrawals'):
    # The column name doubles as the legend entry
    ax.plot(atm_data_day[count_column], label=count_column)
ax.set_xlabel('Transaction Date')
ax.set_ylabel('Count')
ax.set_title('Number of XYZ Card Withdrawals and Number of Other Card Withdrawals Over Time')
ax.legend()
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

"""
        The line plot provides a visualization of the trends in the number of withdrawals for both XYZ Card 
        and Other Card transactions over time. One notable observation is the big decline in the number of 
        XYZ Card withdrawals and the increase of the Other Card withdrawals compared to XYZ Card withdrawals starting in 2015. 
        This change in transaction volume for both card types suggests a potential shift in customer behavior
        or external factors affecting ATM usage.
        
        From a predictive perspective, those fluctuations in transaction volume can impact the total amount
        withdrawn for a specific ATM, so those features can be crucial for accurately predicting the future
        cash flow of ATMs.
        
        Incorporating these insights into predictive models can help improve the accuracy of predictions and
        enable proactive measures to address any challenges posed by changing transaction patterns. Analyzing
        historical trends and identifying potential drivers behind them is essential for building robust 
        predictive models that capture the complexities of ATM cash flow dynamics.
"""

In [None]:
# Withdrawn amounts for XYZ Card and Other Card transactions over time
fig, ax = plt.subplots(figsize=(20, 6))
for amount_column in ('Amount withdrawn XYZ Card', 'Amount withdrawn Other Card'):
    # The column name doubles as the legend entry
    ax.plot(atm_data_day[amount_column], label=amount_column)
ax.set_xlabel('Transaction Date')
ax.set_ylabel('Amount')
ax.set_title('Amount Withdrawn for XYZ Card and Other Card Transactions Over Time')
ax.legend()
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

"""
        The line plot provides a clear visualization of the trends in the withdrawn amount for 
        both XYZ Card and Other Card transactions over time. Throughout the depicted time frame,
        the withdrawn amount for XYZ Card transactions appears to have a more significant 
        presence compared to Other Card transactions. This trend holds except for the start of 
        2017, where the withdrawn amount for Other Card transactions begins to rise, eventually 
        surpassing the amount for XYZ Card transactions.

        This divergence around the beginning of 2017 further underscores the dynamic nature of 
        customer behavior and external factors affecting ATM usage. While the trend may seem 
        aligned for the majority of the period, this shift in the latter part of the timeline 
        highlights the necessity of analyzing comprehensive data, including both transaction 
        volume and withdrawn amounts, to build accurate predictive models for ATM cash flow. 
        The insights from both transaction volume and withdrawn amounts serve as crucial features 
        for predictive modeling, enabling the development of accurate projections and strategies 
        that respond to changing patterns in ATM usage.
"""


In [None]:
# Total amount withdrawn over time, plotted against the date index
fig, ax = plt.subplots(figsize=(20, 7))
ax.plot(atm_data_day['Total amount Withdrawn'])
ax.set_xlabel('Transaction Date')
ax.set_ylabel('Total amount Withdrawn')
ax.set_title('Cash Flow Over Time for ATMs')
plt.show()

"""
        The line plot visually illustrates the cash flow trends over time for ATMs. 
        The plot showcases that the total amount withdrawn from ATMs exhibits a 
        relatively consistent pattern, with some noticeable deviations. Notably, 
        there is a distinct peak in cash flow observed from around 2013 to 2015, 
        indicating a period of increased ATM usage and withdrawals. However, 
        in the period just before the start of 2017, there is a visible drop in the 
        cash flow, suggesting a potential shift in customer behavior or external 
        factors affecting cash withdrawal patterns. The overall stability of cash 
        flow, punctuated by these significant variations, highlights the importance 
        of analyzing historical trends and identifying anomalous periods for more 
        accurate prediction and proactive management of ATM cash flow dynamics.
"""

In [None]:
# Monthly totals per ATM, plus max-abs normalized copies of the amount columns.

# Columns that only make sense per day (calendar flags, one-hot dummies, the
# daily 'Norm' columns); they are not wanted in the monthly table.
columns_to_drop = ['Weekday', 'Festival Religion', 'Working Day', 'Holiday Sequence', 'DayOfWeek', 'Norm Total amount Withdrawn', 'Norm Amount withdrawn XYZ Card', 'Norm Amount withdrawn Other Card', 'Weekday_Friday', 'Weekday_Monday', 'Weekday_Saturday', 'Weekday_Sunday', 'Weekday_Thursday', 'Weekday_Tuesday', 'Weekday_Wednesday', 'Working Day_H', 'Working Day_W']

# Drop them BEFORE aggregating: the kept columns get exactly the same sums, but
# the text 'Weekday' column is never passed to sum() (which, depending on the
# pandas version, would concatenate the strings or fail) and no throw-away
# totals are computed. drop() returns a new frame, so atm_data is not modified.
atm_data_monthly = atm_data.drop(columns=columns_to_drop)

# Replace every transaction date by the calendar month it falls in
atm_data_monthly['Transaction Date'] = pd.to_datetime(atm_data_monthly['Transaction Date']).dt.to_period('M')

# One row per ATM and month, summing the remaining columns
atm_data_monthly = atm_data_monthly.groupby(['ATM Name', 'Transaction Date']).sum().reset_index()

# Set 'Transaction Date' (now a monthly period) as the index
atm_data_monthly = atm_data_monthly.set_index('Transaction Date')

# Normalize each amount column on its own and store the result next to the raw
# values. The scaler is refitted per column, so afterwards it holds the fit of
# the last column only.
scaler = MaxAbsScaler()
columns_to_normalize = ['Total amount Withdrawn', 'Amount withdrawn XYZ Card', 'Amount withdrawn Other Card']
for column in columns_to_normalize:
    atm_data_monthly[column + '_normalized'] = scaler.fit_transform(atm_data_monthly[[column]])

# Display the resulting dataset with monthly aggregated data and normalized columns
atm_data_monthly


In [None]:
# Monthly cash flow as a bar chart; the period index is converted to timestamps
# so the bars can be placed on a date axis
fig, ax = plt.subplots(figsize=(15, 7))
month_starts = atm_data_monthly.index.to_timestamp()
ax.bar(month_starts, atm_data_monthly['Total amount Withdrawn'], width=20)
ax.set_xlabel('Transaction Month')
ax.set_ylabel('Total amount Withdrawn')
ax.set_title('Cash Flow Over Time for ATMs')
plt.setp(ax.get_xticklabels(), rotation=45)
ax.grid(axis='y')
plt.show()

In [None]:
# Yearly totals per ATM, plus max-abs normalized copies of the amount columns.

# Columns that only make sense per day (calendar flags, one-hot dummies, the
# daily 'Norm' columns); they are not wanted in the yearly table.
columns_to_drop = ['Weekday', 'Festival Religion', 'Working Day', 'Holiday Sequence', 'DayOfWeek', 'Norm Total amount Withdrawn', 'Norm Amount withdrawn XYZ Card', 'Norm Amount withdrawn Other Card', 'Weekday_Friday', 'Weekday_Monday', 'Weekday_Saturday', 'Weekday_Sunday', 'Weekday_Thursday', 'Weekday_Tuesday', 'Weekday_Wednesday', 'Working Day_H', 'Working Day_W']

# Drop them BEFORE aggregating: the kept columns get exactly the same sums, but
# the text 'Weekday' column is never passed to sum() (which, depending on the
# pandas version, would concatenate the strings or fail) and no throw-away
# totals are computed. drop() returns a new frame, so atm_data is not modified.
atm_data_yearly = atm_data.drop(columns=columns_to_drop)

# Replace every transaction date by the calendar year it falls in
atm_data_yearly['Transaction Date'] = pd.to_datetime(atm_data_yearly['Transaction Date']).dt.to_period('Y')

# One row per ATM and year, summing the remaining columns
atm_data_yearly = atm_data_yearly.groupby(['ATM Name', 'Transaction Date']).sum().reset_index()

# Set 'Transaction Date' (now a yearly period) as the index
atm_data_yearly = atm_data_yearly.set_index('Transaction Date')

# Normalize each amount column on its own and store the result next to the raw
# values. The scaler is refitted per column, so afterwards it holds the fit of
# the last column only.
scaler = MaxAbsScaler()
columns_to_normalize = ['Total amount Withdrawn', 'Amount withdrawn XYZ Card', 'Amount withdrawn Other Card']
for column in columns_to_normalize:
    atm_data_yearly[column + '_normalized'] = scaler.fit_transform(atm_data_yearly[[column]])

# Display the resulting dataset with yearly aggregated data and normalized columns
atm_data_yearly

In [None]:
# Yearly cash flow as a bar chart; the period index is converted to timestamps
# so the bars can be placed on a date axis
fig, ax = plt.subplots(figsize=(15, 7))
year_starts = atm_data_yearly.index.to_timestamp()
ax.bar(year_starts, atm_data_yearly['Total amount Withdrawn'], width=100)
ax.set_xlabel('Year')
ax.set_ylabel('Total amount Withdrawn')
ax.set_title('Yearly Cash Flow for ATMs')
plt.setp(ax.get_xticklabels(), rotation=45)
ax.grid(axis='y')
plt.show()

In [None]:
# Start over from a fresh copy of atm_data: this discards the date-indexed,
# sorted version of atm_data_day that was built for the plots above, and keeps
# atm_data itself safe from the in-place feature engineering that follows.
atm_data_day = atm_data.copy()

In [None]:
# Previous-row ("lag") features, computed separately inside every ATM group
lag_shift = 1  # the number of rows that will be shifted is 1
lag_columns = ['Total amount Withdrawn', 'Amount withdrawn XYZ Card', 'Amount withdrawn Other Card', 'No Of Withdrawals', 'No Of XYZ Card Withdrawals', 'No Of Other Card Withdrawals']

# Names of the lagged columns, in the same order as lag_columns
lagged_columns_to_drop = [f'{column}_lag_{lag_shift}' for column in lag_columns]

# Shift every source column by lag_shift rows within its own ATM, so a value
# never leaks from one ATM into the next
for column, lag_column_name in zip(lag_columns, lagged_columns_to_drop):
    atm_data_day[lag_column_name] = atm_data_day.groupby('ATM Name')[column].shift(lag_shift)

# The first row of each ATM group has nothing to look back to; remove those rows
atm_data_day.dropna(subset=lagged_columns_to_drop, inplace=True)

# Display the updated DataFrame with the lagged columns to ensure that it has been applied correctly
atm_data_day.head(1382)


In [None]:
# Replace the index (which has gaps after the dropna above) with a fresh
# 0..n-1 integer index. drop=True discards the old labels directly instead of
# first materialising them as an 'index' column and then deleting that column.
atm_data_day = atm_data_day.reset_index(drop=True)

# Display the DataFrame with the new index
atm_data_day.head(1382)

In [None]:
# Simple Moving Average (SMA) features, computed separately for every ATM.
#
# IMPORTANT: the averages are taken over the previous-row values
# ('<column>_lag_1', created in the lag cell above), not over the current row.
# A rolling window ends at the current row, so smoothing the raw columns would
# put the current day's 'Total amount Withdrawn' (the prediction target) and the
# current day's withdrawal counts inside the features, i.e. target leakage.
# Smoothing the lagged columns keeps every feature strictly in the past. The
# lagged columns contain no NULLs at this point, so min_periods=1 still yields
# a value for every row.

# Define the columns for which I want to compute the SMA (Simple Moving Average)
columns_to_smooth = ['No Of Withdrawals', 'No Of XYZ Card Withdrawals', 'No Of Other Card Withdrawals', 
                     'Total amount Withdrawn', 'Amount withdrawn XYZ Card', 'Amount withdrawn Other Card']

# Calculate the Simple Moving Average (SMA) with windows of 3, 7, and 10 rows
window_size_1 = 3
window_size_2 = 7
window_size_3 = 10

for column in columns_to_smooth:
    for window_size in (window_size_1, window_size_2, window_size_3):
        # transform() evaluates the rolling mean inside each ATM group and
        # returns it aligned to atm_data_day's own index, so no manual
        # re-indexing of the grouped result is needed
        atm_data_day[f'{column}_SMA_{window_size}'] = (
            atm_data_day
            .groupby('ATM Name')[f'{column}_lag_1']
            .transform(lambda history: history.rolling(window=window_size, min_periods=1).mean())
        )


# Display the updated DataFrame with SMA columns to ensure that it has been applied correctly
atm_data_day.head(1382)


In [None]:
# Exponential Moving Average (EMA) features, computed separately for every ATM.
#
# IMPORTANT: the averages are taken over the previous-row values
# ('<column>_lag_1', created in the lag cell above), not over the current row.
# An EMA evaluated at a row includes that row's own value, so smoothing the raw
# columns would put the current day's 'Total amount Withdrawn' (the prediction
# target) and the current day's withdrawal counts inside the features, i.e.
# target leakage. Smoothing the lagged columns keeps every feature strictly in
# the past. The lagged columns contain no NULLs at this point, so min_periods=1
# still yields a value for every row.

# Define the columns for which you want to compute the EMA (Exponential Moving Average)
columns_to_smooth = ['No Of Withdrawals', 'No Of XYZ Card Withdrawals', 'No Of Other Card Withdrawals', 
                     'Total amount Withdrawn', 'Amount withdrawn XYZ Card', 'Amount withdrawn Other Card']

# Calculate the Exponential Moving Average (EMA) with spans of 10, 30, and 90 rows
span_1 = 10
span_2 = 30
span_3 = 90
for column in columns_to_smooth:
    for span in (span_1, span_2, span_3):
        # transform() evaluates the EMA inside each ATM group and returns it
        # aligned to atm_data_day's own index, so no manual re-indexing of the
        # grouped result is needed
        atm_data_day[f'{column}_EMA_{span}'] = (
            atm_data_day
            .groupby('ATM Name')[f'{column}_lag_1']
            .transform(lambda history: history.ewm(span=span, min_periods=1).mean())
        )


# Display the updated DataFrame with EMA columns to ensure that it has been applied correctly
atm_data_day.head(1382)

In [None]:
# Re-index the monthly table by ('Transaction Date', 'ATM Name'), ordered by
# month and, within a month, by ATM name.
#
# Both keys go into ONE sort_values call. Sorting by 'ATM Name' and then,
# separately, by 'Transaction Date' does not give this order: a single-column
# sort uses an unstable algorithm by default, so the second sort may shuffle
# rows that share a date and the first sort is lost.
atm_data_monthly = (
    atm_data_monthly
    .reset_index()                                      # back to a plain integer index
    .sort_values(by=['Transaction Date', 'ATM Name'])   # month first, ATM name as tie-breaker
    .set_index(['Transaction Date', 'ATM Name'])
)

atm_data_monthly

In [None]:
# Re-index the yearly table by ('Transaction Date', 'ATM Name'), ordered by
# year and, within a year, by ATM name.
#
# Both keys go into ONE sort_values call. Sorting by 'ATM Name' and then,
# separately, by 'Transaction Date' does not give this order: a single-column
# sort uses an unstable algorithm by default, so the second sort may shuffle
# rows that share a date and the first sort is lost.
atm_data_yearly = (
    atm_data_yearly
    .reset_index()                                      # back to a plain integer index
    .sort_values(by=['Transaction Date', 'ATM Name'])   # year first, ATM name as tie-breaker
    .set_index(['Transaction Date', 'ATM Name'])
)

atm_data_yearly

In [None]:
# Index the daily feature table by transaction date.
# Note: the column is consumed by this step, so the cell cannot be run twice in a row.
atm_data_day = atm_data_day.set_index('Transaction Date')

In [None]:
# Preview the first 2255 rows of the date-indexed daily table
atm_data_day.head(2255)

In [None]:
# Columns that take part in the correlation analysis, assembled group by group.
# The resulting order is: calendar columns, weekday dummies, working-day
# dummies, lag features, SMA features, EMA features.
calendar_columns = ['DayOfWeek', 'Festival Religion', 'Working Day', 'Holiday Sequence']
weekday_dummy_columns = ['Weekday_Friday', 'Weekday_Monday', 'Weekday_Saturday', 'Weekday_Sunday',
                         'Weekday_Thursday', 'Weekday_Tuesday', 'Weekday_Wednesday']
working_day_dummy_columns = ['Working Day_H', 'Working Day_W']
amount_metrics = ['Total amount Withdrawn', 'Amount withdrawn XYZ Card', 'Amount withdrawn Other Card']
count_metrics = ['No Of Withdrawals', 'No Of XYZ Card Withdrawals', 'No Of Other Card Withdrawals']

# Lag columns list the amounts first; the smoothed columns list the counts first
lag_feature_columns = [f'{metric}_lag_1' for metric in amount_metrics + count_metrics]
sma_feature_columns = [f'{metric}_SMA_{window}' for metric in count_metrics + amount_metrics for window in (3, 7, 10)]
ema_feature_columns = [f'{metric}_EMA_{span}' for metric in count_metrics + amount_metrics for span in (10, 30, 90)]

selected_columns = (calendar_columns + weekday_dummy_columns + working_day_dummy_columns
                    + lag_feature_columns + sma_feature_columns + ema_feature_columns)

# Pairwise correlation between all selected columns
correlation_matrix = atm_data_day[selected_columns].corr()

# Heatmap of the correlation matrix, one labelled tick per column on both axes
fig, ax = plt.subplots(figsize=(60, 25))
heatmap = ax.imshow(correlation_matrix, cmap='coolwarm', interpolation='none')
ax.set_title('Correlation Plot')
fig.colorbar(heatmap, ax=ax)
tick_positions = range(len(selected_columns))
ax.set_xticks(tick_positions)
ax.set_xticklabels(selected_columns, rotation=90)
ax.set_yticks(tick_positions)
ax.set_yticklabels(selected_columns)
plt.show()

In [None]:
# Correlation of every selected column with 'Total amount Withdrawn', sorted ascending
correlation_values = (
    atm_data_day[selected_columns]
    .corrwith(atm_data_day['Total amount Withdrawn'])
    .sort_values()
)

fig, ax = plt.subplots(figsize=(60, 25))
correlation_values.plot.bar(color='blue', ax=ax)
ax.set_title('Correlation between Total Amount Withdrawn and Other Columns')
ax.set_ylabel('Correlation')

# Leave room below the axes for the long, rotated column names
fig.subplots_adjust(bottom=0.4)

plt.setp(ax.get_xticklabels(), rotation=80)
plt.show()

In [None]:
# Keep only the columns whose name mentions the overall withdrawal count or the total amount
overall_keywords = ('No Of Withdrawals', 'Total amount Withdrawn')
total_amount_cards_columns_filtered = [
    col for col in selected_columns
    if any(keyword in col for keyword in overall_keywords)
]

# Pairwise correlation between the filtered columns
correlation_matrix = atm_data_day[total_amount_cards_columns_filtered].corr()

# Heatmap of the correlation matrix, one labelled tick per column on both axes
fig, ax = plt.subplots(figsize=(15, 7))
heatmap = ax.imshow(correlation_matrix, cmap='coolwarm', interpolation='none')
ax.set_title('Correlation Plot')
fig.colorbar(heatmap, ax=ax)  # colour scale for reference
tick_positions = range(len(total_amount_cards_columns_filtered))
ax.set_xticks(tick_positions)
ax.set_xticklabels(total_amount_cards_columns_filtered, rotation=90)
ax.set_yticks(tick_positions)
ax.set_yticklabels(total_amount_cards_columns_filtered)
plt.show()

"""
    The presented correlation plot offers valuable insights into the relationships among various 
    features associated with ATM cash flow dynamics. The heatmap-style visualization provides a 
    clear view of the correlation coefficients between different columns representing metrics 
    related to the number of withdrawals and total amount withdrawn for both XYZ Card and Other 
    Card transactions.

    The plot reveals a strong correlation among the different time-based moving averages
    (SMA and EMA), indicating the continuity and consistency of trends in withdrawal 
    patterns over time. Moreover, there is a noticeable positive correlation between SMAs and EMAs, 
    suggesting that the smoothed averages of withdrawal metrics align closely, supporting the notion 
    of stable and predictable behavior in cash flow.
"""

In [None]:
# Correlation of the overall count/amount columns with 'Total amount Withdrawn', sorted ascending
correlation_values = (
    atm_data_day[total_amount_cards_columns_filtered]
    .corrwith(atm_data_day['Total amount Withdrawn'])
    .sort_values()
)

fig, ax = plt.subplots(figsize=(15, 7))
correlation_values.plot.bar(color='blue', ax=ax)
ax.set_title('Correlation between Total Amount Withdrawn and Other Columns')
ax.set_ylabel('Correlation')

# Leave room below the axes for the long, rotated column names
fig.subplots_adjust(bottom=0.4)

plt.setp(ax.get_xticklabels(), rotation=80)
plt.show()

"""
    The presented bar plot provides valuable insights into the correlations between 
    the target variable, which is the 'Total amount Withdrawn', and various other 
    features related to ATM transactions. The y-axis represents the correlation 
    coefficients, indicating the strength and direction of the relationship.

    The plot highlights a favorable correlation between the 'Total amount Withdrawn' 
    and different time-based moving averages (SMA and EMA) derived from the ATM 
    transaction metrics. The positive correlation indicates that changes in the 
    smoothed averages of transaction metrics, such as 'No Of Withdrawals_SMA_X' 
    and 'No Of Withdrawals_EMA_X', are associated with corresponding changes in 
    the total amount withdrawn from ATMs. This alignment suggests that trends and 
    patterns captured by these features provide valuable predictive insights into 
    the cash flow dynamics of ATMs. Incorporating these features into the model's 
    training can enhance its ability to capture the underlying patterns and 
    fluctuations in ATM transactions, leading to improved predictions and informed 
    decision-making.
"""

"""
    The presented bar plot not only reveals the strong correlation between the 'Total 
    amount Withdrawn' and various time-based moving averages (SMA and EMA) features but 
    also highlights that the correlation for the lag columns is also noteworthy. While 
    the correlation coefficients for the lag columns may not be as high as those for the 
    smoothed averages, they still indicate a meaningful relationship between historical 
    transaction metrics and the total amount withdrawn from ATMs.

    The positive correlation between the lag columns (e.g., 'Total amount Withdrawn_lag_1', 
    'No Of Withdrawals_lag_1', etc.) and the 'Total amount Withdrawn' suggests that past 
    transaction data holds predictive power for forecasting the future cash flow of ATMs. 
    This finding aligns with the concept that historical transaction behavior can provide 
    insights into the trends and patterns that drive ATM usage and, consequently, the cash flow.

    These insights underscore the importance of incorporating a combination of time-related 
    features, including lag columns and moving averages, into predictive models. By doing so, 
    these models can effectively capture both short-term fluctuations and longer-term trends, 
    leading to more accurate and robust predictions of ATM cash flow dynamics.
"""

In [None]:
# Keep only the columns whose name contains 'XYZ'
XYZ_amount_cards_columns_filtered = list(filter(lambda col: 'XYZ' in col, selected_columns))

# Pairwise correlation between the XYZ Card columns
correlation_matrix = atm_data_day[XYZ_amount_cards_columns_filtered].corr()

# Heatmap of the correlation matrix, one labelled tick per column on both axes
fig, ax = plt.subplots(figsize=(15, 7))
heatmap = ax.imshow(correlation_matrix, cmap='coolwarm', interpolation='none')
ax.set_title('Correlation Plot')
fig.colorbar(heatmap, ax=ax)  # colour scale for reference
tick_positions = range(len(XYZ_amount_cards_columns_filtered))
ax.set_xticks(tick_positions)
ax.set_xticklabels(XYZ_amount_cards_columns_filtered, rotation=90)
ax.set_yticks(tick_positions)
ax.set_yticklabels(XYZ_amount_cards_columns_filtered)
plt.show()

In [None]:
# Correlation of the XYZ Card columns with 'Total amount Withdrawn', sorted ascending
correlation_values = (
    atm_data_day[XYZ_amount_cards_columns_filtered]
    .corrwith(atm_data_day['Total amount Withdrawn'])
    .sort_values()
)

fig, ax = plt.subplots(figsize=(15, 7))
correlation_values.plot.bar(color='blue', ax=ax)
ax.set_title('Correlation between Total Amount Withdrawn and Other Columns')
ax.set_ylabel('Correlation')

# Leave room below the axes for the long, rotated column names
fig.subplots_adjust(bottom=0.4)

plt.setp(ax.get_xticklabels(), rotation=80)
plt.show()

"""
    The generated correlation plots and bar plots, which focus on features related 
    to XYZ Card transactions, echo and reinforce the key insights observed in the 
    previous visualizations. These plots confirm that the correlations between the 
    'Total amount Withdrawn' and various XYZ Card-specific features, such as counts, 
    moving averages, and exponential moving averages, are consistent with the trends 
    identified earlier.

    The high positive correlation between the 'Total amount Withdrawn' and these XYZ
    Card-related features supports the notion that the transaction behaviors associated 
    with XYZ Card usage have a significant impact on the overall cash flow of ATMs. The 
    strength of these correlations underscores the potential predictive power that these 
    features hold for forecasting future cash flow dynamics.

    Collectively, these analyses emphasize the robustness and reliability of the identified 
    insights. The consistent patterns observed across different categories of features 
    (including overall transaction counts, counts by card type, moving averages, and exponential 
    moving averages) provide a comprehensive view of the factors influencing ATM cash flow. 
    By leveraging these insights, data-driven predictions can be made more accurate and actionable, 
    facilitating better resource allocation and decision-making within the context of ATM management 
    and operations.

"""

In [None]:
# Keep only the columns whose name contains 'Other'
Other_amount_cards_columns_filtered = list(filter(lambda col: 'Other' in col, selected_columns))

# Pairwise correlation between the Other Card columns
correlation_matrix = atm_data_day[Other_amount_cards_columns_filtered].corr()

# Heatmap of the correlation matrix, one labelled tick per column on both axes
fig, ax = plt.subplots(figsize=(15, 7))
heatmap = ax.imshow(correlation_matrix, cmap='coolwarm', interpolation='none')
ax.set_title('Correlation Plot')
fig.colorbar(heatmap, ax=ax)  # colour scale for reference
tick_positions = range(len(Other_amount_cards_columns_filtered))
ax.set_xticks(tick_positions)
ax.set_xticklabels(Other_amount_cards_columns_filtered, rotation=90)
ax.set_yticks(tick_positions)
ax.set_yticklabels(Other_amount_cards_columns_filtered)
plt.show()

In [None]:
# Correlation of the Other Card columns with 'Total amount Withdrawn', sorted ascending
correlation_values = (
    atm_data_day[Other_amount_cards_columns_filtered]
    .corrwith(atm_data_day['Total amount Withdrawn'])
    .sort_values()
)

fig, ax = plt.subplots(figsize=(15, 7))
correlation_values.plot.bar(color='blue', ax=ax)
ax.set_title('Correlation between Total Amount Withdrawn and Other Columns')
ax.set_ylabel('Correlation')

# Leave room below the axes for the long, rotated column names
fig.subplots_adjust(bottom=0.4)

plt.setp(ax.get_xticklabels(), rotation=80)
plt.show()

"""
    The correlation plots and bar plots generated for features associated with 
    "Other Card" transactions exhibit a similar pattern of insights as observed 
    in the previous analyses. These visualizations underscore the consistent 
    relationships between different aspects of "Other Card" transaction behavior 
    and the overall cash flow dynamics of ATMs.

    The strong positive correlations observed between the 'Total amount Withdrawn' 
    and these "Other Card"-related features reinforce the notion that transaction 
    trends linked to "Other Card" usage play a pivotal role in shaping the overall 
    cash flow. This alignment between multiple types of features and their correlations 
    with the target variable emphasizes their predictive potential for forecasting 
    future cash flow patterns accurately.

    The convergence of insights across various categories of features, including 
    transaction counts, moving averages, and exponential moving averages for "Other 
    Card" transactions, substantiates the robustness of the conclusions drawn. By 
    considering these insights collectively, data-driven predictions can be enriched 
    with a comprehensive understanding of the drivers behind ATM cash flow dynamics. 
    As a result, decision-makers can make informed choices that optimize the performance 
    and management of ATM networks.
"""

In [None]:
"""
    The next step is to build a predictive model that can accurately forecast the cash flow for each ATM.
"""
# Put the rows in chronological order (the index is the transaction date at this point)
atm_data_day = atm_data_day.sort_index()
atm_data_day

In [None]:
"""
    The features for the input data 'x' have been meticulously selected based 
    on a comprehensive analysis that involves evaluating various graphs and 
    conducting exploratory data analysis (EDA). Additionally, different data 
    selection techniques such as filter, wrapper, and intrinsic methods have 
    been employed to ensure the relevance and significance of the chosen features. 
    This approach aims to enhance the predictive power of the model by incorporating 
    attributes that exhibit meaningful correlations with the target variable 
    'Total amount Withdrawn'.
"""

# Feature matrix 'x': weekday dummies, working-day dummies, lag features and the
# SMA / EMA features, assembled group by group in that order.
weekday_dummy_features = ['Weekday_Friday', 'Weekday_Monday', 'Weekday_Saturday',
                          'Weekday_Sunday', 'Weekday_Thursday', 'Weekday_Tuesday',
                          'Weekday_Wednesday']
working_day_dummy_features = ['Working Day_H', 'Working Day_W']
amount_metric_names = ['Total amount Withdrawn', 'Amount withdrawn XYZ Card', 'Amount withdrawn Other Card']
count_metric_names = ['No Of Withdrawals', 'No Of XYZ Card Withdrawals', 'No Of Other Card Withdrawals']

# Lag columns list the amounts first; the smoothed columns list the counts first
lag_features = [f'{metric}_lag_1' for metric in amount_metric_names + count_metric_names]
sma_features = [f'{metric}_SMA_{window}' for metric in count_metric_names + amount_metric_names for window in (3, 7, 10)]
ema_features = [f'{metric}_EMA_{span}' for metric in count_metric_names + amount_metric_names for span in (10, 30, 90)]

feature_columns = (weekday_dummy_features + working_day_dummy_features
                   + lag_features + sma_features + ema_features)

# Selecting specific columns as features for the input data 'x'
x = atm_data_day[feature_columns]

# Selecting the 'Total amount Withdrawn' column as the target variable 'y'
y = atm_data_day['Total amount Withdrawn']

In [None]:
# Row counts of the feature frame and of the target
print("Length of DataFrame x:", len(x))
print("Length of DataFrame y:", len(y))

# Missing-value report: per-column counts for x, a single count for y.
# Each report is preceded by the same blank-line separator.
for heading, null_counts in (("NULL values in DataFrame x:", x.isna().sum()),
                             ("NULL values in DataFrame y:", y.isna().sum())):
    print('\n\n')
    print(heading)
    print(null_counts)


In [None]:
# Splitting the data into training, validation, and test sets based on date ranges.
#
# Label slices such as x[:'2016-01-02'] and x['2016-01-02':'2017-01-02'] include
# BOTH endpoints, so slicing adjacent ranges by label puts the rows dated
# 2016-01-02 into training and validation, and the rows dated 2017-01-02 into
# validation and test. Half-open boolean masks on the index keep the sets disjoint.
validation_start = '2016-01-02'
test_start = '2017-01-02'

is_train = x.index < validation_start
is_val = (x.index >= validation_start) & (x.index < test_start)
is_test = x.index >= test_start

# Rows strictly before '2016-01-02' for training
x_train = x.loc[is_train]
y_train = y.loc[is_train]

# Rows from '2016-01-02' up to, but not including, '2017-01-02' for validation
x_val = x.loc[is_val]
y_val = y.loc[is_val]

# Rows from '2017-01-02' onwards for testing
x_test = x.loc[is_test]
y_test = y.loc[is_test]

# Training + validation rows (everything before the test period) for the K-Fold models
x_kfold = x.loc[is_train | is_val]
y_kfold = y.loc[is_train | is_val]

# The same pre-test rows for the time series split models
x_time = x.loc[is_train | is_val]
y_time = y.loc[is_train | is_val]

In [None]:
# Initialize Lazy Regressor to quickly evaluate multiple models
reg = LazyRegressor()

# Fit on the training split and score on the validation split. Per the
# lazypredict usage docs, fit() returns a pair (score table, predictions); the
# pair is kept under the original name.
models_summary = reg.fit(x_train, x_val, y_train, y_val)

# Print the summary report of model performance (the score table only, so the
# report is not printed as a raw tuple)
print(models_summary[0])

In [None]:
# Define a list of base models for stacking.
# RANSACRegressor samples random subsets while fitting, so it is given a fixed
# random_state (the same seed the XGBoost and K-Fold cells use) to make the
# stacked scores reproducible from run to run.
base_models = [
    ('Lasso', Lasso()),
    ('BayesianRidge', BayesianRidge()),
    ('RidgeCV', RidgeCV()),
    ('RANSACRegressor', RANSACRegressor(random_state=42)),
    ('ridge', Ridge())
]

# Define a meta-model for stacking
meta_model = LinearRegression()

# Initialize lists to store base model predictions
base_predictions_train = []
base_predictions_val = []

# Fit every base model on the training split and record its predictions for
# both the training rows and the validation rows
for name, model in base_models:
    model.fit(x_train, y_train)
    train_preds = model.predict(x_train)
    val_preds = model.predict(x_val)
    base_predictions_train.append(train_preds)
    base_predictions_val.append(val_preds)

# Stack the base model predictions horizontally: one column per base model
stacked_train_preds = np.column_stack(base_predictions_train)
stacked_val_preds = np.column_stack(base_predictions_val)

# Fit the meta-model on stacked predictions.
# NOTE(review): these are in-sample predictions (each base model predicts the
# rows it was fitted on), so the training score below is optimistic; the
# validation score is the one to compare across models.
meta_model.fit(stacked_train_preds, y_train)

# Predictions using the meta-model on validation and training sets
meta_val_preds = meta_model.predict(stacked_val_preds)
meta_train_preds = meta_model.predict(stacked_train_preds)

# Calculate and print scores and errors for validation and training sets
score_valid = meta_model.score(stacked_val_preds, y_val)
print('Validation Score:', score_valid)

score_train = meta_model.score(stacked_train_preds, y_train)
print('Training Score:', score_train)

mse_valid = mean_squared_error(y_val, meta_val_preds)
print('Validation Mean Squared Error:', mse_valid)

mae_valid = mean_absolute_error(y_val, meta_val_preds)
print('Validation Mean Absolute Error:', mae_valid)

mse_train = mean_squared_error(y_train, meta_train_preds)
print('Training Mean Squared Error:', mse_train)

mae_train = mean_absolute_error(y_train, meta_train_preds)
print('Training Mean Absolute Error:', mae_train)

In [None]:
# Collect each base model's predictions for the test set.
# The base models are used exactly as fitted in the stacking cell: the
# meta-model's weights were learned from those fitted models' outputs, so
# refitting here is at best redundant and, for a randomized estimator such as
# RANSAC, can feed the meta-model inputs from a different model than the one
# it was trained against.
base_predictions_test = [model.predict(x_test) for name, model in base_models]

# Stack the base model predictions horizontally for the test set
stacked_test_preds = np.column_stack(base_predictions_test)

# Predict using the meta-model on stacked predictions for the test set
meta_test_preds = meta_model.predict(stacked_test_preds)

# Calculate and print scores and errors for the test set
score_test = meta_model.score(stacked_test_preds, y_test)
print('Test Score:', score_test)

mse_test = mean_squared_error(y_test, meta_test_preds)
print('Test Mean Squared Error:', mse_test)

mae_test = mean_absolute_error(y_test, meta_test_preds)
print('Test Mean Absolute Error:', mae_test)

# Create a scatter plot for actual vs. predicted values on the test set
plt.figure(figsize=(10, 6))
plt.scatter(y_test, meta_test_preds, alpha=0.5)
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Actual vs. Predicted Values (Test)')
plt.grid()
plt.show()

# Give the predictions the test index, then order both series by index so the
# two lines are drawn over the same x positions
sorted_y_test = y_test.sort_index()
sorted_meta_test_preds = pd.Series(meta_test_preds, index=y_test.index)
sorted_meta_test_preds = sorted_meta_test_preds.sort_index()

# Create a line plot for sorted actual values and sorted predicted values over time
plt.figure(figsize=(20, 6))
plt.plot(sorted_y_test.index, sorted_y_test.values, label='Actual Values', linewidth=2)
plt.plot(sorted_y_test.index, sorted_meta_test_preds, label='Predicted Values', linewidth=2)

plt.xlabel('Time')
plt.ylabel('Values')
plt.title('Actual vs. Predicted Values Over Time (Test)')
plt.legend()
plt.grid()
plt.xticks(rotation=45)
plt.show()

"""
    The close alignment between the validation, training, and test scores indicates 
    consistent performance across different datasets. This consistency is a positive 
    indicator that the model is neither significantly overfitting nor underfitting.

    In conclusion, the stacked ensemble model demonstrates good generalization ability, 
    producing competitive results on both validation and test datasets. This suggests 
    that the ensemble's aggregated predictions from various base models are able to 
    effectively capture the underlying patterns in the data, leading to accurate and 
    reliable predictions on new data points.
"""

In [None]:
# Fit a plain linear regression on the training split
model_reg = LinearRegression()
model_reg.fit(x_train, y_train)

# Predict the validation rows and measure the errors
y_pred_valid = model_reg.predict(x_val)
mse_valid = mean_squared_error(y_val, y_pred_valid)
mae_valid = mean_absolute_error(y_val, y_pred_valid)
print('Validation Mean Squared Error:', mse_valid)
print('Validation Mean Absolute Error:', mae_valid)

# R^2 on the rows the model was fitted on and on the validation rows
print('Train Score:', model_reg.score(x_train, y_train))
print('Validation Score:', model_reg.score(x_val, y_val))

# Scatter of predicted against actual validation values
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_val, y_pred_valid, alpha=0.5)
ax.set_xlabel('Actual Values')
ax.set_ylabel('Predicted Values')
ax.set_title('Actual vs. Predicted Values')
ax.grid()
plt.show()

# Attach the validation index to the predictions, then order both series by index
y_pred_valid = pd.Series(y_pred_valid, index=y_val.index)
sorted_y_val = y_val.sort_index()
sorted_y_pred_valid = y_pred_valid.sort_index()

# Actual and predicted values drawn as two lines over the sorted index
fig, ax = plt.subplots(figsize=(20, 6))
ax.plot(sorted_y_val.index, sorted_y_val.values, label='Actual Values', linewidth=2)
ax.plot(sorted_y_val.index, sorted_y_pred_valid, label='Predicted Values', linewidth=2)
ax.set_xlabel('Time')
ax.set_ylabel('Values')
ax.set_title('Actual vs. Predicted Values Over Time')
ax.legend()
ax.grid()
plt.xticks(rotation=45)
plt.show()


"""
    The relatively close alignment between the average training and 
    validation scores is a positive sign, indicating that the model 
    is likely capturing meaningful patterns and not simply memorizing 
    the training data.
"""

In [None]:
# Score the already-fitted linear regression (model_reg) on the test set
y_pred_test = model_reg.predict(x_test)

# Error metrics followed by the R^2 score for the test set
mse_test = mean_squared_error(y_test, y_pred_test)
mae_test = mean_absolute_error(y_test, y_pred_test)
print('Test Mean Squared Error:', mse_test)
print('Test Mean Absolute Error:', mae_test)
print('Test Score:', model_reg.score(x_test, y_test))

# Scatter of predicted against actual test values
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred_test, alpha=0.5)
ax.set_xlabel('Actual Values')
ax.set_ylabel('Predicted Values')
ax.set_title('Actual vs. Predicted Values')
ax.grid()
plt.show()

# Attach the test index to the predictions, then order both series by index
y_pred_test = pd.Series(y_pred_test, index=y_test.index)
sorted_y_test = y_test.sort_index()
sorted_y_pred_test = y_pred_test.sort_index()

# Actual and predicted values drawn as two lines over the sorted index
fig, ax = plt.subplots(figsize=(20, 6))
ax.plot(sorted_y_test.index, sorted_y_test.values, label='Actual Values', linewidth=2)
ax.plot(sorted_y_test.index, sorted_y_pred_test, label='Predicted Values', linewidth=2)
ax.set_xlabel('Time')
ax.set_ylabel('Values')
ax.set_title('Actual vs. Predicted Values Over Time')
ax.legend()
ax.grid()
plt.xticks(rotation=45)
plt.show()


In [None]:
# Fit a seeded XGBoost regressor on the training split
xgb_regressor = xgb.XGBRegressor(random_state=42)
xgb_regressor.fit(x_train, y_train)

# Predict the validation rows and report the error metrics
y_pred_valid = xgb_regressor.predict(x_val)
mse_valid = mean_squared_error(y_val, y_pred_valid)
mae_valid = mean_absolute_error(y_val, y_pred_valid)
print('Validation Mean Squared Error:', mse_valid)
print('Validation Mean Absolute Error:', mae_valid)

# R^2 on the training rows and on the validation rows
r2_train = xgb_regressor.score(x_train, y_train)
r2_valid = xgb_regressor.score(x_val, y_val)
print('Average Train Score:', r2_train)
print('Average Validation Score:', r2_valid)


"""
    Given the difference between the training and validation scores, 
    there exists some level of overfitting, where the model performs 
    exceptionally well on the training data but relatively less well on 
    unseen validation data.
"""

In [None]:
# Initialize k-fold cross-validation with 5 splits, shuffling, and a fixed random state.
# NOTE(review): shuffled folds put later rows in training and earlier rows in
# validation; with lag / moving-average features this may flatter the scores —
# compare with the TimeSeriesSplit results before relying on them.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Initialize lists to store metrics for each fold
mse_scores = []
mae_scores = []
train_scores = []
val_scores = []

# Perform k-fold cross-validation
for train_idx, val_idx in kfold.split(x_kfold):
    # Fold-local names are used on purpose: the notebook-level x_train / x_val /
    # y_train / y_val hold the date-based split and must not be overwritten by
    # whatever the last fold happens to contain.
    x_fold_train, x_fold_val = x_kfold.iloc[train_idx], x_kfold.iloc[val_idx]
    y_fold_train, y_fold_val = y_kfold.iloc[train_idx], y_kfold.iloc[val_idx]

    # Train a linear regression model on this fold's training rows.
    # After the loop, KF_linear_model is the model fitted on the LAST fold.
    KF_linear_model = LinearRegression()
    KF_linear_model.fit(x_fold_train, y_fold_train)

    # Make predictions on this fold's validation rows
    y_pred_val = KF_linear_model.predict(x_fold_val)

    # Calculate evaluation metrics for this fold
    mse = mean_squared_error(y_fold_val, y_pred_val)
    mae = mean_absolute_error(y_fold_val, y_pred_val)
    train_score = KF_linear_model.score(x_fold_train, y_fold_train)
    val_score = KF_linear_model.score(x_fold_val, y_fold_val)

    # Append metrics to respective lists
    mse_scores.append(mse)
    mae_scores.append(mae)
    train_scores.append(train_score)
    val_scores.append(val_score)

# Calculate and print the average metrics over all folds
avg_mse = np.mean(mse_scores)
avg_mae = np.mean(mae_scores)
avg_train_score = np.mean(train_scores)
avg_val_score = np.mean(val_scores)

print('Average MSE:', avg_mse)
print('Average MAE:', avg_mae)
print('Average Train Score:', avg_train_score)
print('Average Validation Score:', avg_val_score)

"""
    The closely aligned average training and validation scores suggest that the model 
    is effectively capturing patterns in the data and demonstrating a strong ability 
    to generalize to new data. These results are positive indicators of the model's 
    potential reliability in real-world scenarios.
"""

In [None]:
# Predict the test set with KF_linear_model, i.e. the model left over from the
# final iteration of the K-Fold loop
y_pred_test = KF_linear_model.predict(x_test)

# Error metrics and R^2 score for the test set
mse_test = mean_squared_error(y_test, y_pred_test)
mae_test = mean_absolute_error(y_test, y_pred_test)
r2_test = KF_linear_model.score(x_test, y_test)
print('Test Mean Squared Error:', mse_test)
print('Test Mean Absolute Error:', mae_test)
print('Average Test Score:', r2_test)

# Scatter of predicted against actual test values
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, y_pred_test, alpha=0.5)
ax.set_xlabel('Actual Values')
ax.set_ylabel('Predicted Values')
ax.set_title('Actual vs. Predicted Values (Test)')
ax.grid()
plt.show()

# Order the actual values and the index-aligned predictions by index
sorted_y_test = y_test.sort_index()
sorted_y_pred_test = pd.Series(y_pred_test, index=y_test.index).sort_index()

# Actual and predicted values drawn as two lines over the sorted index
fig, ax = plt.subplots(figsize=(20, 6))
ax.plot(sorted_y_test.index, sorted_y_test.values, label='Actual Values', linewidth=2)
ax.plot(sorted_y_test.index, sorted_y_pred_test, label='Predicted Values', linewidth=2)
ax.set_xlabel('Time')
ax.set_ylabel('Values')
ax.set_title('Actual vs. Predicted Values Over Time (Test)')
ax.legend()
ax.grid()
plt.xticks(rotation=45)
plt.show()

"""
    Given that the test R2 score is relatively close to the average train 
    and validation scores, it appears that the model's performance on the 
    test set is in line with its performance on the training and validation 
    sets. This consistency is a positive sign and suggests that the model 
    has not suffered from significant overfitting or underfitting.
"""

In [None]:
# Initialize k-fold cross-validation with 5 splits, shuffling, and a fixed random state.
# NOTE(review): shuffled folds put later rows in training and earlier rows in
# validation; with lag / moving-average features this may flatter the scores —
# compare with the TimeSeriesSplit results before relying on them.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Initialize lists to store evaluation metrics for each fold
mse_scores = []
mae_scores = []
train_scores = []
val_scores = []

# Perform k-fold cross-validation
for train_idx, val_idx in kfold.split(x_kfold):
    # Fold-local names are used on purpose: the notebook-level x_train / x_val /
    # y_train / y_val hold the date-based split and must not be overwritten by
    # whatever the last fold happens to contain.
    x_fold_train, x_fold_val = x_kfold.iloc[train_idx], x_kfold.iloc[val_idx]
    y_fold_train, y_fold_val = y_kfold.iloc[train_idx], y_kfold.iloc[val_idx]

    # Train a seeded XGBoost regressor on this fold's training rows
    KF_xgb_regressor = xgb.XGBRegressor(random_state=42)
    KF_xgb_regressor.fit(x_fold_train, y_fold_train)

    # Make predictions on this fold's validation rows
    y_pred_val = KF_xgb_regressor.predict(x_fold_val)

    # Calculate evaluation metrics for this fold
    mse = mean_squared_error(y_fold_val, y_pred_val)
    mae = mean_absolute_error(y_fold_val, y_pred_val)
    train_score = KF_xgb_regressor.score(x_fold_train, y_fold_train)
    val_score = KF_xgb_regressor.score(x_fold_val, y_fold_val)

    # Append metrics to respective lists
    mse_scores.append(mse)
    mae_scores.append(mae)
    train_scores.append(train_score)
    val_scores.append(val_score)

# Calculate and print the average metrics over all folds
avg_mse = np.mean(mse_scores)
avg_mae = np.mean(mae_scores)
avg_train_score = np.mean(train_scores)
avg_val_score = np.mean(val_scores)

print('Average MSE:', avg_mse)
print('Average MAE:', avg_mae)
print('Average Train Score:', avg_train_score)
print('Average Validation Score:', avg_val_score)

"""
    The noticeable difference between the average training and 
    validation scores suggests the possibility of overfitting. 
    The model seems to have been trained to fit the training 
    data very closely, which may lead to reduced performance 
    on new data. 
"""

In [None]:
# Two-split TimeSeriesSplit over the pre-test data
tscv = TimeSeriesSplit(n_splits=2)

# Per-fold metric collectors
mse_scores, mae_scores = [], []
train_scores, val_scores = [], []

# Note: this loop rebinds the notebook-level x_train / x_val / y_train / y_val
# to each fold's rows; after the loop they hold the last fold's subsets.
for train_idx, val_idx in tscv.split(x_time):
    x_train = x_time.iloc[train_idx]
    x_val = x_time.iloc[val_idx]
    y_train = y_time.iloc[train_idx]
    y_val = y_time.iloc[val_idx]

    # Fit a fresh linear regression on this fold's training rows
    time_series_reg_model = LinearRegression()
    time_series_reg_model.fit(x_train, y_train)

    # Predict this fold's validation rows
    y_pred_val = time_series_reg_model.predict(x_val)

    # Errors on the validation rows, R^2 on both parts of the fold
    mse = mean_squared_error(y_val, y_pred_val)
    mae = mean_absolute_error(y_val, y_pred_val)
    train_score = time_series_reg_model.score(x_train, y_train)
    val_score = time_series_reg_model.score(x_val, y_val)

    mse_scores += [mse]
    mae_scores += [mae]
    train_scores += [train_score]
    val_scores += [val_score]

# Arithmetic mean of each metric across the folds
avg_mse = sum(mse_scores) / len(mse_scores)
avg_mae = sum(mae_scores) / len(mae_scores)
avg_train_score = sum(train_scores) / len(train_scores)
avg_val_score = sum(val_scores) / len(val_scores)

print('Average MSE:', avg_mse)
print('Average MAE:', avg_mae)
print('Average Train Score:', avg_train_score)
print('Average Validation Score:', avg_val_score)


In [None]:
# Train the time series model on the entire pre-test data (x_time, y_time).
# x_train / y_train are not used here: at this point they are only the subset
# left behind by the last cross-validation fold, not the full training data.
time_series_reg_model.fit(x_time, y_time)

# Make predictions on the test set
y_pred_test = time_series_reg_model.predict(x_test)

# Calculate Mean Squared Error for the test set
mse_test = mean_squared_error(y_test, y_pred_test)
print('Test Mean Squared Error:', mse_test)

# Calculate Mean Absolute Error for the test set
mae_test = mean_absolute_error(y_test, y_pred_test)
print('Test Mean Absolute Error:', mae_test)

# Calculate R-squared (R2) score for the test set
r2_test = time_series_reg_model.score(x_test, y_test)
print('Average Test Score:', r2_test)

# Create a scatter plot for actual vs. predicted values on the test set
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred_test, alpha=0.5)
plt.xlabel('Actual Values')
plt.ylabel('Predicted Values')
plt.title('Actual vs. Predicted Values (Test)')
plt.grid()
plt.show()

# Give the predictions the test index, then order both series by index so the
# two lines are drawn over the same x positions
sorted_y_test = y_test.sort_index()
sorted_y_pred_test = pd.Series(y_pred_test, index=y_test.index)
sorted_y_pred_test = sorted_y_pred_test.sort_index()

# Create a line plot for sorted actual values and sorted predicted values over time
plt.figure(figsize=(20, 6))
plt.plot(sorted_y_test.index, sorted_y_test.values, label='Actual Values', linewidth=2)
plt.plot(sorted_y_test.index, sorted_y_pred_test, label='Predicted Values', linewidth=2)

plt.xlabel('Time')
plt.ylabel('Values')
plt.title('Actual vs. Predicted Values Over Time (Test)')
plt.legend()
plt.grid()
plt.xticks(rotation=45)
plt.show()

"""
    Given that the test R2 score is relatively close to the average train 
    and validation scores, it appears that the model's performance on the 
    test set is in line with its performance on the training and validation 
    sets. This consistency is a positive sign and suggests that the model 
    has not suffered from significant overfitting or underfitting. However,
    the difference between the average training and test scores is not
    negligible, so the model might be slightly overfitting.
"""


In [None]:
# Initialize time series split with 2 splits
time_series_splitter = TimeSeriesSplit(n_splits=2)

# Initialize lists to store evaluation metrics
mse_scores = []
mae_scores = []
train_scores = []
val_scores = []

# Perform time series cross-validation
for train_idx, val_idx in time_series_splitter.split(x_time):
    # Fold-local names are used on purpose, so the notebook-level x_train /
    # x_val / y_train / y_val are not silently replaced by this loop's last fold.
    x_fold_train, x_fold_val = x_time.iloc[train_idx], x_time.iloc[val_idx]
    y_fold_train, y_fold_val = y_time.iloc[train_idx], y_time.iloc[val_idx]

    # Initialize and train a seeded XGBoost regressor on this fold's training rows
    xgb_time_split_model = xgb.XGBRegressor(random_state=42)
    xgb_time_split_model.fit(x_fold_train, y_fold_train)

    # Make predictions on this fold's validation rows
    y_pred_val = xgb_time_split_model.predict(x_fold_val)

    # Calculate evaluation metrics
    mse = mean_squared_error(y_fold_val, y_pred_val)
    mae = mean_absolute_error(y_fold_val, y_pred_val)
    train_score = xgb_time_split_model.score(x_fold_train, y_fold_train)
    val_score = xgb_time_split_model.score(x_fold_val, y_fold_val)

    mse_scores.append(mse)
    mae_scores.append(mae)
    train_scores.append(train_score)
    val_scores.append(val_score)

# Calculate and print the average metrics over all folds
avg_mse = np.mean(mse_scores)
avg_mae = np.mean(mae_scores)
avg_train_score = np.mean(train_scores)
avg_val_score = np.mean(val_scores)

print('Average MSE:', avg_mse)
print('Average MAE:', avg_mae)
print('Average Train Score:', avg_train_score)
print('Average Validation Score:', avg_val_score)

"""
    These results indicate a significant risk of overfitting, as evidenced by the 
    substantial gap between the train and validation scores. The model seems to be 
    fitting the training data almost perfectly but struggles to generalize well to 
    unseen data, which is reflected in the lower validation score.
"""

In [None]:
"""
    In conclusion, after a thorough exploration of various predictive models and a 
    comprehensive evaluation of their performance, the chosen model for forecasting 
    ATM cash flow is Linear Regression with K-Fold cross-validation. This decision 
    is grounded in several key observations that validate the reliability of this 
    model for addressing the specific problem at hand.

    Firstly, the Linear Regression model demonstrated the highest Test score (86%) 
    among the considered models. This achievement suggests that the model's predictive
    capability extends beyond the training data and performs consistently well on 
    previously unseen data, which is a crucial aspect in ensuring the model's 
    generalization to real-world scenarios.

    Furthermore, the model's performance exhibited the least disparity between Test
    and Train Scores (91.5% - 86% = 5.5%). This careful balancing act is instrumental
    in avoiding the overfitting of the model. By achieving a close alignment between 
    training and test performance, the Linear Regression model effectively captures 
    the underlying patterns in the data without succumbing to the pitfalls of 
    memorization or lack of adaptability to new information.

    In addition, the Linear Regression model showcased the least Mean Absolute Error 
    (MAE) when compared to alternative models (69368.45). MAE is a critical metric as 
    it quantifies the average absolute difference between the predicted and actual 
    values. The lower MAE indicates that the model's predictions are consistently 
    closer to the actual values, implying a higher level of accuracy in its forecasts.

    Taking into account the combined strengths of achieving the highest Test score, 
    mitigating overfitting concerns, and yielding the least MAE, the Linear Regression 
    model with K-fold cross-validation emerges as a reliable and robust solution for 
    the ATM cash flow forecasting problem. Its consistent performance across different 
    data subsets and its ability to capture meaningful patterns in the data position it 
    as a valuable tool for decision-making and planning in this context.

    It is worth noting that while the Linear Regression model has demonstrated its 
    reliability within the current context, continued monitoring and periodic reevaluation 
    of its performance against changing data patterns is essential to ensure its continued 
    accuracy and relevance.
"""