In [1]:
import numpy as np
import pandas as pd
import altair as alt
import seaborn as sns

In [2]:
# Read the inflow transactions and parse posted_date with the '%Y-%m-%d' format.
inflows = pd.read_parquet('data/ucsd-inflows.pqt')
inflows['posted_date'] = pd.to_datetime(inflows['posted_date'], format='%Y-%m-%d')

# Number of rows loaded.
len(inflows)

513115

In [3]:
# Rows whose memo is the paycheck placeholder are re-labelled as PAYCHECK.
has_paycheck_memo = inflows['memo'] == 'PAYCHECK_PLACEHOLDER'
inflows.loc[has_paycheck_memo, 'category'] = 'PAYCHECK'

In [4]:
# Transaction count per category, most frequent first.
inflows['category'].value_counts()

category
EXTERNAL_TRANSFER        156533
SELF_TRANSFER            110437
DEPOSIT                   61345
PAYCHECK                  59225
MISCELLANEOUS             55648
REFUND                    23220
INVESTMENT_INCOME         17325
SMALL_DOLLAR_ADVANCE      13621
OTHER_BENEFITS             7708
TAX                        3405
LOAN                       2513
UNEMPLOYMENT_BENEFITS      1961
INSURANCE                   174
Name: count, dtype: int64

- Categories to exclude because they are not considered income:

In [5]:
# Category labels (not column names) that are dropped from the income analysis.
ignore_cols = [
    'SMALL_DOLLAR_ADVANCE', 'TAX', 'INSURANCE', 'LOAN',
    'MISCELLANEOUS', 'REFUND', 'SELF_TRANSFER',
]

# Keep only the rows whose category is not in the ignore list.
is_ignored = inflows['category'].isin(ignore_cols)
inflows = inflows.loc[~is_ignored]


In [6]:
# Count transactions per remaining category. After reset_index() the frame
# carries the columns 'category' and 'count' (pandas >= 2 naming), which are
# the two fields encoded by the chart below.
category_dist = inflows.category.value_counts().reset_index()

alt.Chart(category_dist).mark_bar().encode(
    alt.X('category', axis=alt.Axis(labelAngle=90, title="Category")),
    alt.Y('count', title='Count', stack=None),
).properties(
    width=350,
    height=500,
    # The y-axis shows raw transaction counts, not shares of the total,
    # so the title says "counts" rather than "proportions".
    title="Transaction Counts by Category"
)

## Analysis on the Consumer Level:

- For each consumer, compute the number of inflow transactions, their total amount, and the set of categories those transactions fall under:

In [7]:
# One row per consumer: transaction count, total amount, and distinct categories seen.
per_consumer_spec = {'amount': ['count', 'sum'], 'category': 'unique'}
inflow_accs = inflows.groupby(by=['prism_consumer_id']).agg(per_consumer_spec)

- Seeing each consumer's number of transactions:

In [8]:
# Order consumers from fewest to most transactions.
inflow_accs.sort_values(by=('amount', 'count'), ascending=True)

Unnamed: 0_level_0,amount,amount,category
Unnamed: 0_level_1,count,sum,unique
prism_consumer_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
5578,1,100.00,[EXTERNAL_TRANSFER]
5697,1,500.00,[DEPOSIT]
4192,1,100.00,[DEPOSIT]
5789,1,2.58,[EXTERNAL_TRANSFER]
4386,1,2344.65,[PAYCHECK]
...,...,...,...
1166,757,364343.71,"[EXTERNAL_TRANSFER, PAYCHECK, DEPOSIT]"
2153,767,16082.15,"[EXTERNAL_TRANSFER, PAYCHECK]"
2910,855,59851.31,"[DEPOSIT, PAYCHECK, EXTERNAL_TRANSFER]"
5749,885,215456.77,"[DEPOSIT, EXTERNAL_TRANSFER, PAYCHECK]"


- Seeing each consumer's sum of transactions:

In [10]:
# Order consumers from smallest to largest total inflow amount.
inflow_accs.sort_values(by=('amount', 'sum'), ascending=True)

Unnamed: 0_level_0,amount,amount,category
Unnamed: 0_level_1,count,sum,unique
prism_consumer_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
5168,2,0.02,[INVESTMENT_INCOME]
5474,2,0.03,[INVESTMENT_INCOME]
5214,3,0.03,[INVESTMENT_INCOME]
824,5,0.05,[INVESTMENT_INCOME]
5718,1,0.06,[INVESTMENT_INCOME]
...,...,...,...
4901,59,2212199.35,"[EXTERNAL_TRANSFER, DEPOSIT]"
1094,376,2271179.03,"[EXTERNAL_TRANSFER, INVESTMENT_INCOME, DEPOSIT]"
1308,433,2947694.28,"[DEPOSIT, EXTERNAL_TRANSFER, PAYCHECK, INVESTM..."
158,534,4096693.97,"[EXTERNAL_TRANSFER, DEPOSIT, OTHER_BENEFITS, U..."


## Observations:
- Some inflow data for some consumers will be sparse since they only have a few transactions
- Will have to investigate EXTERNAL_TRANSFER transactions to determine whether they are actually income
    - Check for recurrence

## Investigating 'EXTERNAL_TRANSFER'
- Checking for recurrent external transfers to mark as income or not:

In [12]:
# External transfers only, reduced to the three columns used below and ordered
# chronologically within each consumer. ignore_index=True gives the result a
# fresh 0..n-1 index.
is_external = inflows['category'] == 'EXTERNAL_TRANSFER'
ext_trnsfr = (
    inflows.loc[is_external, ['prism_consumer_id', 'amount', 'posted_date']]
    .sort_values(by=['prism_consumer_id', 'posted_date'], ignore_index=True)
)

### Difference of time between transactions for each user:

In [13]:
# Per consumer, the gap in whole seconds between consecutive posted dates
# (first transfer gets 0). The result is a Series indexed by prism_consumer_id
# whose values are integer arrays, one entry per transfer.
#
# The seconds are derived from the timedelta itself rather than from
# `astype('int64') // 10**9`, which is only correct when posted_date is stored
# with nanosecond resolution.
time_diffs = ext_trnsfr.groupby('prism_consumer_id')['posted_date'].apply(
    lambda dates: (
        dates.diff()                # NaT for the first transfer of the consumer
        .dt.total_seconds()
        .fillna(0)                  # no predecessor -> gap of 0 seconds
        .astype('int64')
        .to_numpy()
    )
)

In [14]:
# Time elapsed since the same consumer's previous external transfer.
# groupby().diff() returns a Series aligned on ext_trnsfr's own index, so the
# assignment does not rely on row order or on a freshly reset index (the
# explode()/reset_index() round trip matched rows purely by position).
# The first transfer of each consumer has no predecessor; its gap is set to 0.
ext_trnsfr['time_diff'] = (
    ext_trnsfr.groupby('prism_consumer_id')['posted_date']
    .diff()
    .fillna(pd.Timedelta(0))
)

### Difference in amounts between transactions for each user:

In [15]:
# Per consumer, the absolute change in amount between consecutive transfers.
# The first transfer of each consumer is given a change of 0.
amount_diffs = (
    ext_trnsfr
    .groupby('prism_consumer_id')['amount']
    .apply(lambda amounts: np.abs(np.ediff1d(amounts, to_begin=0)))
)

In [16]:
# Absolute change in amount versus the same consumer's previous external
# transfer. Computed with groupby().diff() so that:
#   * the column is numeric (Series.explode() yields object dtype), and
#   * values are aligned on ext_trnsfr's index instead of by row position.
# The first transfer of each consumer has no predecessor; its change is set to 0.
ext_trnsfr['amount_diff'] = (
    ext_trnsfr.groupby('prism_consumer_id')['amount']
    .diff()
    .abs()
    .fillna(0.0)
)

### Setting thresholds for the time and amount differences to mark transactions as income:

In [17]:
# Inspect the frame now that the time_diff and amount_diff columns are attached.
ext_trnsfr

Unnamed: 0,prism_consumer_id,amount,posted_date,time_diff,amount_diff
0,0,37000.00,2022-02-14,0 days,0.0
1,0,25.00,2022-02-18,4 days,36975.0
2,0,75.00,2022-02-24,6 days,50.0
3,0,100.00,2022-02-25,1 days,25.0
4,0,300.00,2022-03-17,20 days,200.0
...,...,...,...,...,...
156528,5941,8.66,2023-01-21,1 days,13.99
156529,5941,267.13,2023-01-23,2 days,258.47
156530,5941,2.00,2023-01-24,1 days,265.13
156531,5941,207.16,2023-01-24,0 days,205.16


In [18]:
# Amount of the previous transfer *of the same consumer*. An ungrouped
# shift(1) would pair each consumer's first transfer with the last transfer of
# the preceding consumer in the frame.
prev_amount = ext_trnsfr.groupby('prism_consumer_id')['amount'].shift(1)

# amount_diff can arrive as object dtype (e.g. when it was built through
# Series.explode()); coerce it so percent_change is a float column.
amount_diff = pd.to_numeric(ext_trnsfr['amount_diff'])

# A first transfer has no predecessor: its amount_diff is 0, and dividing by
# the fill value 1 keeps its percent change at 0.
ext_trnsfr['percent_change'] = amount_diff / prev_amount.fillna(1) * 100

# Check if differences are within 10-20% (both bounds inclusive).
# NOTE(review): changes below 10% fall outside this band, so a transfer that
# repeats with a nearly identical amount is not flagged — confirm this is intended.
ext_trnsfr['amount_threshold'] = ext_trnsfr['percent_change'].between(10, 20)

In [19]:
# Flag transfers posted no more than three days after the consumer's previous
# one. A consumer's first transfer carries a gap of 0 days and is therefore
# flagged as well.
ext_trnsfr['days_threshold'] = ext_trnsfr['time_diff'].le(pd.Timedelta(days=3))

ext_trnsfr

Unnamed: 0,prism_consumer_id,amount,posted_date,time_diff,amount_diff,percent_change,amount_threshold,days_threshold
0,0,37000.00,2022-02-14,0 days,0.0,0.0,False,True
1,0,25.00,2022-02-18,4 days,36975.0,99.932432,False,False
2,0,75.00,2022-02-24,6 days,50.0,200.0,False,False
3,0,100.00,2022-02-25,1 days,25.0,33.333333,False,True
4,0,300.00,2022-03-17,20 days,200.0,200.0,False,False
...,...,...,...,...,...,...,...,...
156528,5941,8.66,2023-01-21,1 days,13.99,61.766004,False,True
156529,5941,267.13,2023-01-23,2 days,258.47,2984.642032,False,True
156530,5941,2.00,2023-01-24,1 days,265.13,99.251301,False,True
156531,5941,207.16,2023-01-24,0 days,205.16,10258.0,False,True


### Marking recurrent external transfers as income:

In [21]:
# Display the frame with both threshold flags.
# NOTE(review): no step that actually marks rows as income is visible before
# this display — TODO confirm whether that logic lives in another cell.
ext_trnsfr

Unnamed: 0,prism_consumer_id,amount,posted_date,time_diff,amount_diff,percent_change,amount_threshold,days_threshold
0,0,37000.00,2022-02-14,0 days,0.0,0.0,False,True
1,0,25.00,2022-02-18,4 days,36975.0,99.932432,False,False
2,0,75.00,2022-02-24,6 days,50.0,200.0,False,False
3,0,100.00,2022-02-25,1 days,25.0,33.333333,False,True
4,0,300.00,2022-03-17,20 days,200.0,200.0,False,False
...,...,...,...,...,...,...,...,...
156528,5941,8.66,2023-01-21,1 days,13.99,61.766004,False,True
156529,5941,267.13,2023-01-23,2 days,258.47,2984.642032,False,True
156530,5941,2.00,2023-01-24,1 days,265.13,99.251301,False,True
156531,5941,207.16,2023-01-24,0 days,205.16,10258.0,False,True
