In [None]:
# general imports
import os

# conda packages
import pandas as pd
from pandas.tseries.offsets import DateOffset
import matplotlib.pyplot as plt
import matplotlib.dates as md
import numpy as np
from sktime.datasets import load_airline
from sktime.forecasting.base import ForecastingHorizon
from sktime.forecasting.arima import AutoARIMA

# local imports
from src.load_data import add_date_columns, train_val_test_split, find_incomplete_clients
from src.plotting import plot_nr_credict_applications
from src.arima import make_input, Arima1d

# sizes of the train / validation / test partitions passed to train_val_test_split
# (presumably counted in months — TODO confirm against src.load_data)
nr_train, nr_val, nr_test = 26, 3, 3

## Load data

In [None]:
# Read both raw tables from the local `data` directory.
data_dir = 'data'
df_customers = pd.read_csv(os.path.join(data_dir, 'customers.csv'))
df_applications = pd.read_csv(os.path.join(data_dir, 'credit_applications.csv'))

## Pre-process data

In [None]:
# Run both tables through the project helper `add_date_columns`, customers first.
# NOTE(review): presumably this derives the `yearmonth` / `yearmonth_dt` columns
# used further down — confirm in src.load_data.
df_customers, df_applications = (
    add_date_columns(table) for table in (df_customers, df_applications)
)

## Missing values:

### Notice that the same (`client_nr`, `yearmonth`) entries are missing in both tables:


In [None]:
# Number of distinct `yearmonth` values in the applications table; it is used as
# the expected per-client entry count for BOTH tables below.
nr_months = len(np.unique(df_applications['yearmonth']))

# non-null `yearmonth` entries per client, for each table
month_counts_cust = df_customers.groupby('client_nr')['yearmonth'].count()
month_counts_appl = df_applications.groupby('client_nr')['yearmonth'].count()

# boolean masks: True for clients with fewer entries than there are distinct months
missing_in_cust = month_counts_cust.lt(nr_months)
missing_in_appl = month_counts_appl.lt(nr_months)

# show the entry counts of the flagged clients, customers table first
display(month_counts_cust.loc[missing_in_cust])
display(month_counts_appl.loc[missing_in_appl])

In [None]:
# inspect the application rows of a single client (client_nr 68)
df_applications.loc[df_applications['client_nr'] == 68]

### Remove customers that have missing values of `nr_credit_applications`

In [None]:
# Partition the clients into two groups using the project helper; `complete_clients`
# is what the following cell uses to filter both tables.
# NOTE(review): presumably "complete" means the client has an entry for every
# month in both tables — confirm in src.load_data.find_incomplete_clients.
complete_clients, incomplete_clients = find_incomplete_clients(df_applications, df_customers)

In [None]:
# Keep only the rows whose client appears in `complete_clients`, in both tables.
appl_is_complete = df_applications['client_nr'].isin(complete_clients)
cust_is_complete = df_customers['client_nr'].isin(complete_clients)

df_applications = df_applications.loc[appl_is_complete]
df_customers = df_customers.loc[cust_is_complete]

## Train / Val / Test split

In [None]:
# Split both tables with the same partition sizes. Each call yields four objects,
# unpacked here as train, val, tv and test.
# NOTE(review): `tv` presumably is train and val combined — confirm in src.load_data.
split_sizes = (nr_train, nr_val, nr_test)

df_cust_train, df_cust_val, df_cust_tv, df_cust_test = train_val_test_split(
    df_customers, *split_sizes
)
df_appl_train, df_appl_val, df_appl_tv, df_appl_test = train_val_test_split(
    df_applications, *split_sizes
)

# show the customers train/val table
display(df_cust_tv)

In [None]:
# show the applications counterpart of the customers train/val table
display(df_appl_tv)

## Visualize

### Total number of credit applications

In [None]:
# plot all companies:
nr_credit_applications = df_appl_tv.groupby(['yearmonth_dt']).nr_credit_applications.sum()
volume_credit_trx = df_cust_tv.groupby(['yearmonth_dt']).volume_credit_trx.sum()
volume_debit_trx = df_cust_tv.groupby(['yearmonth_dt']).volume_debit_trx.sum()
total_nr_trx = df_cust_tv.groupby(['yearmonth_dt']).total_nr_trx.sum()
# plot_nr_credict_applications(total_nr_applications, 'All companies')

In [None]:
def _plot_stacked_series(panels, figsize):
    """Draw one line plot per panel, stacked vertically in a single figure.

    Parameters
    ----------
    panels : list of (title, ylabel, series, color) tuples
        One entry per subplot, top to bottom. `color` may be None to keep
        matplotlib's default line colour.
    figsize : tuple
        Figure size forwarded to `plt.subplots`.

    Returns
    -------
    (fig, axs, lines)
        The figure, the axes as returned by `plt.subplots`, and the list of
        plotted line handles in panel order.
    """
    fig, axs = plt.subplots(nrows=len(panels), figsize=figsize)
    lines = []
    # atleast_1d: plt.subplots returns a bare Axes (not an array) for one row
    for ax, (title, ylabel, series, color) in zip(np.atleast_1d(axs), panels):
        ax.set_title(title)
        ax.set_xlabel('')
        ax.set_ylabel(ylabel)
        line, = ax.plot(series)
        if color is not None:
            line.set_color(color)
        lines.append(line)
    # Compute the layout only after titles, labels and data exist, so the
    # spacing between panels actually accounts for them.
    fig.tight_layout(pad=3.0)
    return fig, axs, lines


# Monthly totals over all clients, one panel per quantity.
fig, axs, (l0, l1, l2, l3) = _plot_stacked_series(
    [
        ('Number of Applications', 'Applications', nr_credit_applications, None),
        ('Volume of Credit Transactions', 'Euros', volume_credit_trx, 'r'),
        ('Volume of Debit Transactions', 'Euros', volume_debit_trx, 'g'),
        ('Total Number of Transactions', '', total_nr_trx, 'y'),
    ],
    figsize=(7.5, 8.5),
)

# savefig does not create missing directories; make sure the target exists so
# the notebook also runs on a fresh checkout
os.makedirs('plots', exist_ok=True)
fig.savefig('plots/all_clients.png')
plt.show()

### Number of applications per client


In [None]:
# Total number of credit applications per client in the train/val table;
# display only the clients whose total is positive.
appl_per_client = df_appl_tv.groupby(['client_nr'])['nr_credit_applications'].sum()
appl_per_client.loc[appl_per_client > 0]

In [None]:
# plot company three:
company_three = df_appl_tv[df_appl_tv.client_nr == 3]
company_three_nr_applications = company_three.groupby(['yearmonth_dt']).nr_credit_applications.sum()
fix, ax = plt.subplots(figsize=(8, 4))
ax.plot(company_three_nr_applications, 'ro')
plt.title('Client #3')
ax.set_xlabel('')
ax.set_ylabel('Applications')
plt.savefig('plots/client_nr3_applications.png')
plt.show()