In [73]:
# Import the libraries used to load and analyse the dataset
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Columns of the statement extract that this cell actually uses.
STATEMENT_COLUMNS = ['current_account_nbr', 'billing_cycle_date', 'payment_hist_1_12_mths']


def build_payment_history(statements):
    """Build one concatenated payment-history string per account.

    For every statement row the left-most character of
    ``payment_hist_1_12_mths`` is extracted (presumably the status of the
    most recent month -- confirm against the data dictionary).  Those
    characters are then joined per ``current_account_nbr`` in ascending
    ``billing_cycle_date`` order, so the character from the smallest
    billing date comes first.

    NOTE(review): rows are ordered by the ``billing_cycle_date`` values as
    given.  If they are strings, the order is chronological only when the
    format sorts lexicographically (e.g. ISO ``YYYY-MM-DD``) -- confirm.

    Parameters
    ----------
    statements : pandas.DataFrame
        Must contain the columns listed in ``STATEMENT_COLUMNS``.  The
        frame itself is not modified.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``ordered``: every input row sorted by account and billing date,
        with two added columns, ``payment_hist_last_mth`` (the extracted
        character) and ``payment_hist_n_mths`` (the account's joined
        string, repeated on each of its rows).
        ``per_account``: the first row of each account from ``ordered``.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    """
    ordered = statements.sort_values(by=['current_account_nbr', 'billing_cycle_date'])

    # `.str[0]` returns NaN for missing or empty history values.  Replace
    # those with '' so the join below never meets a non-string element
    # (str.join raises TypeError on NaN); such months contribute nothing.
    ordered['payment_hist_last_mth'] = (
        ordered['payment_hist_1_12_mths'].str[0].fillna('')
    )

    # transform() broadcasts each account's joined string back onto all of
    # that account's rows, keeping `ordered` the same length as the input.
    ordered['payment_hist_n_mths'] = (
        ordered.groupby('current_account_nbr')['payment_hist_last_mth'].transform(''.join)
    )

    # Every row of an account carries the same joined string, so keeping
    # the first row per account is enough.
    per_account = ordered.drop_duplicates(subset=['current_account_nbr'])
    return ordered, per_account


# `__name__` is "__main__" both in a notebook cell and when run as a script,
# so df, df_sorted and df_unique are still defined as globals afterwards.
if __name__ == "__main__":
    # Load only the needed columns.  Reading the identifier and the history
    # code as text prevents pandas from inferring a numeric dtype (which
    # would drop leading zeros and break the `.str` accessor).
    df = pd.read_csv(
        'data/statement_fact_20250325.csv',
        usecols=STATEMENT_COLUMNS,
        dtype={'current_account_nbr': str, 'payment_hist_1_12_mths': str},
    )

    # usecols keeps the file's column order; enforce the expected order.
    df = df[STATEMENT_COLUMNS]

    df_sorted, df_unique = build_payment_history(df)

    # Display the result
    print(df_unique[['current_account_nbr', 'payment_hist_n_mths']])

    df_unique.to_csv('data/payment_hist_n_mths.csv', index=False)


       current_account_nbr                                payment_hist_n_mths
196629    00iP5U82D8XwVQ9G  AQBQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQQ...
40225     00oyr3QppAzjLws4  AQQQQQIZZZZZAIQQR#AZZZQQQQQQQQQQIQIIIIIIIQIIII...
617691    01SEhQXHbPJRc1Go                           ZZZZZZZZZZZZZZZZZZZZZZZZ
361226    01wbqz3obYPYxLvR                                               \\\\
357731    020qpQKv0isIguXC                          AIIIIIIIIIIZZZZZZZZZZZZZZ
...                    ...                                                ...
116043    zzBy2qNM78aRV580                  IIIIIQIIQIIIIQIQQQQI-AIQQIIIIII0Z
12245     zzEuUBBmvGiVnabb  -QQAQQQRQQQQQAQQQZAQQQBQQQQQQQQQQQQQQQQQQQQQQQ...
84926     zzR9PvG7dY9u5iHU                                                 AQ
299364    zzXGgGu6ysGwGH1J                                                  \
445591    zzztTVczEiGgAEJi  IZZZZZZZZZZZZZZQQ1QIIQIIIIIIQQQIQIIIIIIIZZZZZZ...

[17613 rows x 2 columns]
