In [1]:
import numpy as np
import pandas as pd
import altair as alt
import seaborn as sns

In [2]:
# Load the inflow transactions and parse posted_date using the YYYY-MM-DD format.
inflows = pd.read_parquet('data/ucsd-inflows.pqt')
inflows['posted_date'] = pd.to_datetime(inflows['posted_date'], format='%Y-%m-%d')

# Number of rows loaded.
len(inflows)

513115

In [3]:
# Rows whose memo is the paycheck placeholder are relabelled to the PAYCHECK category.
inflows.loc[inflows['memo'].eq('PAYCHECK_PLACEHOLDER'), 'category'] = 'PAYCHECK'

In [4]:
# Number of transactions per category, most frequent first.
inflows['category'].value_counts()

category
EXTERNAL_TRANSFER        156533
SELF_TRANSFER            110437
DEPOSIT                   61345
PAYCHECK                  59225
MISCELLANEOUS             55648
REFUND                    23220
INVESTMENT_INCOME         17325
SMALL_DOLLAR_ADVANCE      13621
OTHER_BENEFITS             7708
TAX                        3405
LOAN                       2513
UNEMPLOYMENT_BENEFITS      1961
INSURANCE                   174
Name: count, dtype: int64

- What categories to ignore, since they're not considered income:

In [5]:
# Categories that are not considered income; their rows are dropped below.
ignore_cols = [
    'SMALL_DOLLAR_ADVANCE', 'TAX', 'INSURANCE', 'LOAN',
    'MISCELLANEOUS', 'REFUND', 'SELF_TRANSFER',
]

# Keep only the transactions whose category is not in the ignore list.
inflows = inflows.loc[~inflows['category'].isin(ignore_cols)]


In [6]:
# Share of transactions per remaining category, expressed in percent (0-100).
# The column names are set explicitly rather than relying on the names that
# value_counts() assigns to its result, which differ between pandas versions.
category_dist = (
    inflows['category']
    .value_counts(normalize=True)
    .mul(100)
    .rename_axis('category')
    .reset_index(name='proportion')
)

# Bar chart of the per-category share of transactions.
alt.Chart(category_dist).mark_bar().encode(
    alt.X('category', axis=alt.Axis(labelAngle=270, title="Category")),
    # The plotted values are percentages, so the axis title states the unit.
    alt.Y('proportion', title='Proportion (%)', stack=None),
).properties(
    width=350,
    height=500,
    title="Category Proportions"
)

## Analysis on the Consumer Level:

- Seeing how many transactions a consumer has, and the sums of each consumer's transaction history, as well as the categories each consumer's bank transactions fall under:

In [7]:
# One row per consumer: transaction count, total amount, and the distinct categories seen.
inflow_accs = (
    inflows
    .groupby('prism_consumer_id')
    .agg({'amount': ['count', 'sum'], 'category': 'unique'})
)

- Seeing each consumer's number of transactions:

In [8]:
# Consumers ordered from fewest to most transactions.
inflow_accs.sort_values(by=('amount', 'count'), ascending=True)

Unnamed: 0_level_0,amount,amount,category
Unnamed: 0_level_1,count,sum,unique
prism_consumer_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
5578,1,100.00,[EXTERNAL_TRANSFER]
5697,1,500.00,[DEPOSIT]
4192,1,100.00,[DEPOSIT]
5789,1,2.58,[EXTERNAL_TRANSFER]
4386,1,2344.65,[PAYCHECK]
...,...,...,...
1166,757,364343.71,"[EXTERNAL_TRANSFER, PAYCHECK, DEPOSIT]"
2153,767,16082.15,"[EXTERNAL_TRANSFER, PAYCHECK]"
2910,855,59851.31,"[DEPOSIT, PAYCHECK, EXTERNAL_TRANSFER]"
5749,885,215456.77,"[DEPOSIT, EXTERNAL_TRANSFER, PAYCHECK]"


In [9]:
# Length of each consumer's array of unique categories.
inflow_accs['num_of_categories'] = inflow_accs[('category', 'unique')].map(len)

# Consumers ordered from most to fewest categories.
inflow_accs['num_of_categories'].sort_values(ascending=False)

prism_consumer_id
421     6
541     6
3802    6
1119    6
4308    6
       ..
5718    1
4608    1
4611    1
4995    1
5214    1
Name: num_of_categories, Length: 2967, dtype: int64

In [10]:
# How many consumers have exactly 6 categories.
inflow_accs['num_of_categories'].eq(6).sum()

10

In [11]:
# Ids of the consumers whose transactions cover exactly 6 categories.
all_catgries = inflow_accs.loc[inflow_accs['num_of_categories'].eq(6)].index
all_catgries

Index([9, 421, 541, 1119, 1488, 1518, 1571, 3802, 4308, 4849], dtype='int64', name='prism_consumer_id')

In [12]:
# Aggregated rows of the consumers listed in all_catgries.
inflow_accs.loc[all_catgries]

Unnamed: 0_level_0,amount,amount,category,num_of_categories
Unnamed: 0_level_1,count,sum,unique,Unnamed: 4_level_1
prism_consumer_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
9,207,178844.31,"[OTHER_BENEFITS, EXTERNAL_TRANSFER, INVESTMENT...",6
421,230,73899.12,"[DEPOSIT, INVESTMENT_INCOME, UNEMPLOYMENT_BENE...",6
541,452,775568.37,"[PAYCHECK, EXTERNAL_TRANSFER, OTHER_BENEFITS, ...",6
1119,231,118360.84,"[EXTERNAL_TRANSFER, PAYCHECK, OTHER_BENEFITS, ...",6
1488,145,129050.59,"[PAYCHECK, UNEMPLOYMENT_BENEFITS, OTHER_BENEFI...",6
1518,85,92012.78,"[EXTERNAL_TRANSFER, DEPOSIT, INVESTMENT_INCOME...",6
1571,114,136581.25,"[OTHER_BENEFITS, EXTERNAL_TRANSFER, PAYCHECK, ...",6
3802,66,40100.49,"[PAYCHECK, OTHER_BENEFITS, DEPOSIT, EXTERNAL_T...",6
4308,115,90921.72,"[EXTERNAL_TRANSFER, OTHER_BENEFITS, PAYCHECK, ...",6
4849,240,149071.67,"[EXTERNAL_TRANSFER, DEPOSIT, UNEMPLOYMENT_BENE...",6


    - 10 consumers' transactions fall under all the categories in consideration

In [13]:
# Top 10 Consumers Who Have the Highest Inflow:
# sorted by total amount, then by number of categories, both descending.
inflow_accs.sort_values(
    by=[('amount', 'sum'), 'num_of_categories'],
    ascending=False,
).head(10)

Unnamed: 0_level_0,amount,amount,category,num_of_categories
Unnamed: 0_level_1,count,sum,unique,Unnamed: 4_level_1
prism_consumer_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
37,294,8262010.27,"[EXTERNAL_TRANSFER, INVESTMENT_INCOME, PAYCHEC...",5
158,534,4096693.97,"[EXTERNAL_TRANSFER, DEPOSIT, OTHER_BENEFITS, U...",4
1308,433,2947694.28,"[DEPOSIT, EXTERNAL_TRANSFER, PAYCHECK, INVESTM...",4
1094,376,2271179.03,"[EXTERNAL_TRANSFER, INVESTMENT_INCOME, DEPOSIT]",3
4901,59,2212199.35,"[EXTERNAL_TRANSFER, DEPOSIT]",2
1769,60,1748620.01,"[DEPOSIT, PAYCHECK, EXTERNAL_TRANSFER, INVESTM...",4
1288,65,1722381.82,"[INVESTMENT_INCOME, DEPOSIT, PAYCHECK, EXTERNA...",4
646,235,1470188.67,"[PAYCHECK, EXTERNAL_TRANSFER, INVESTMENT_INCOM...",4
1232,446,1461516.56,"[EXTERNAL_TRANSFER, DEPOSIT, PAYCHECK, INVESTM...",4
62,49,1430446.32,"[PAYCHECK, DEPOSIT, INVESTMENT_INCOME, EXTERNA...",4


    - Having transactions in more categories doesn't mean a consumer has a higher inflow than others

- Seeing each consumer's sum of transactions:

In [14]:
# Consumers ordered from lowest to highest total inflow amount.
inflow_accs.sort_values(by=('amount', 'sum'), ascending=True)

Unnamed: 0_level_0,amount,amount,category,num_of_categories
Unnamed: 0_level_1,count,sum,unique,Unnamed: 4_level_1
prism_consumer_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
5168,2,0.02,[INVESTMENT_INCOME],1
5474,2,0.03,[INVESTMENT_INCOME],1
5214,3,0.03,[INVESTMENT_INCOME],1
824,5,0.05,[INVESTMENT_INCOME],1
5718,1,0.06,[INVESTMENT_INCOME],1
...,...,...,...,...
4901,59,2212199.35,"[EXTERNAL_TRANSFER, DEPOSIT]",2
1094,376,2271179.03,"[EXTERNAL_TRANSFER, INVESTMENT_INCOME, DEPOSIT]",3
1308,433,2947694.28,"[DEPOSIT, EXTERNAL_TRANSFER, PAYCHECK, INVESTM...",4
158,534,4096693.97,"[EXTERNAL_TRANSFER, DEPOSIT, OTHER_BENEFITS, U...",4


In [15]:
# Consumers ordered by total amount, then number of categories, both descending.
inflow_accs.sort_values(
    by=[('amount', 'sum'), 'num_of_categories'],
    ascending=False,
)

Unnamed: 0_level_0,amount,amount,category,num_of_categories
Unnamed: 0_level_1,count,sum,unique,Unnamed: 4_level_1
prism_consumer_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
37,294,8262010.27,"[EXTERNAL_TRANSFER, INVESTMENT_INCOME, PAYCHEC...",5
158,534,4096693.97,"[EXTERNAL_TRANSFER, DEPOSIT, OTHER_BENEFITS, U...",4
1308,433,2947694.28,"[DEPOSIT, EXTERNAL_TRANSFER, PAYCHECK, INVESTM...",4
1094,376,2271179.03,"[EXTERNAL_TRANSFER, INVESTMENT_INCOME, DEPOSIT]",3
4901,59,2212199.35,"[EXTERNAL_TRANSFER, DEPOSIT]",2
...,...,...,...,...
5718,1,0.06,[INVESTMENT_INCOME],1
824,5,0.05,[INVESTMENT_INCOME],1
5214,3,0.03,[INVESTMENT_INCOME],1
5474,2,0.03,[INVESTMENT_INCOME],1


## Observations:
- Some inflow data for some consumers will be sparse since they only have a few transactions
- Will have to investigate EXTERNAL_TRANSFER transactions to actually see if they're income (should do this for all categories)
    - Check for recurrence (look into days and dollar amounts between transactions)

## Investigating 'EXTERNAL_TRANSFERS'
- Checking for recurrent external transfers to mark as income or not:

In [16]:
# External transfers only, ordered by consumer and then posting date,
# with a fresh 0..n-1 index.
ext_trnsfr = (
    inflows
    .loc[inflows['category'] == 'EXTERNAL_TRANSFER', ['prism_consumer_id', 'amount', 'posted_date']]
    .sort_values(by=['prism_consumer_id', 'posted_date'], ignore_index=True)
)

### Difference of time between transactions for each user:

In [17]:
# Per consumer: array of gaps between consecutive posting dates, with a leading 0
# for the first transfer. The integer timestamps are floor-divided by 10**9,
# which presumably converts nanoseconds to seconds — TODO confirm the datetime unit.
time_diffs = (
    ext_trnsfr
    .groupby('prism_consumer_id')['posted_date']
    .apply(lambda dates: np.ediff1d(dates.astype('int64').floordiv(10**9), to_begin=0))
)

In [18]:
# Time elapsed since the same consumer's previous external transfer.
# groupby(...).diff() is aligned on ext_trnsfr's own index, so the result does not
# depend on the frame's row order matching the positions of an exploded per-group
# array, and it needs no round trip through integer epoch seconds.
# The first transfer of each consumer has no predecessor and is given 0 days.
ext_trnsfr['time_diff'] = (
    ext_trnsfr
    .groupby('prism_consumer_id')['posted_date']
    .diff()
    .fillna(pd.Timedelta(0))
)

### Difference in amounts between transactions for each user:

In [19]:
# Per consumer: array of absolute differences between consecutive transfer
# amounts, with a leading 0 for the first transfer.
amount_diffs = (
    ext_trnsfr
    .groupby('prism_consumer_id')['amount']
    .apply(lambda amounts: np.abs(np.ediff1d(amounts, to_begin=0)))
)

In [20]:
# Absolute change in amount versus the same consumer's previous external transfer.
# Computed with an index-aligned groupby diff so the column is a numeric (float)
# Series rather than the object-dtype result of exploding per-group arrays, and
# so it does not rely on positional alignment with the frame.
# The first transfer of each consumer has no predecessor and is given 0.
ext_trnsfr['amount_diff'] = (
    ext_trnsfr
    .groupby('prism_consumer_id')['amount']
    .diff()
    .abs()
    .fillna(0.0)
)

### Setting thresholds for the time and amount differences to mark transactions as income:

In [21]:
# Display the external-transfer frame, now including time_diff and amount_diff.
ext_trnsfr

Unnamed: 0,prism_consumer_id,amount,posted_date,time_diff,amount_diff
0,0,37000.00,2022-02-14,0 days,0.0
1,0,25.00,2022-02-18,4 days,36975.0
2,0,75.00,2022-02-24,6 days,50.0
3,0,100.00,2022-02-25,1 days,25.0
4,0,300.00,2022-03-17,20 days,200.0
...,...,...,...,...,...
156528,5941,8.66,2023-01-21,1 days,13.99
156529,5941,267.13,2023-01-23,2 days,258.47
156530,5941,2.00,2023-01-24,1 days,265.13
156531,5941,207.16,2023-01-24,0 days,205.16


In [22]:
# Change in amount relative to the same consumer's previous transfer, in percent.
# The previous amount is taken per consumer: an ungrouped shift(1) would use the
# last amount of the preceding consumer on each consumer's first row.
# Rows without a previous transfer fall back to a denominator of 1.
# amount_diff is cast to float so the result is numeric even if that column
# is stored with object dtype.
ext_trnsfr['percent_change'] = (
    ext_trnsfr['amount_diff'].astype('float64')
    / ext_trnsfr.groupby('prism_consumer_id')['amount'].shift(1).fillna(1)
    * 100
)

# Check if differences are within 10-20%
# NOTE(review): between(10, 20) is inclusive on both ends and is False for
# changes below 10% (e.g. identical consecutive amounts) — confirm this is intended.
ext_trnsfr['amount_threshold'] = ext_trnsfr['percent_change'].between(10, 20)

In [23]:
# True where the gap to the previous row's transfer is at most 3 days.
ext_trnsfr['days_threshold'] = ext_trnsfr['time_diff'].le(pd.Timedelta(days=3))

# Display the frame with the new flag column.
ext_trnsfr

Unnamed: 0,prism_consumer_id,amount,posted_date,time_diff,amount_diff,percent_change,amount_threshold,days_threshold
0,0,37000.00,2022-02-14,0 days,0.0,0.0,False,True
1,0,25.00,2022-02-18,4 days,36975.0,99.932432,False,False
2,0,75.00,2022-02-24,6 days,50.0,200.0,False,False
3,0,100.00,2022-02-25,1 days,25.0,33.333333,False,True
4,0,300.00,2022-03-17,20 days,200.0,200.0,False,False
...,...,...,...,...,...,...,...,...
156528,5941,8.66,2023-01-21,1 days,13.99,61.766004,False,True
156529,5941,267.13,2023-01-23,2 days,258.47,2984.642032,False,True
156530,5941,2.00,2023-01-24,1 days,265.13,99.251301,False,True
156531,5941,207.16,2023-01-24,0 days,205.16,10258.0,False,True


### Marking recurrent external transfers as income:
- Consider people who either earn hourly or are on a salary:

In [24]:
# Display the external-transfer frame with the percent_change and threshold columns.
ext_trnsfr

Unnamed: 0,prism_consumer_id,amount,posted_date,time_diff,amount_diff,percent_change,amount_threshold,days_threshold
0,0,37000.00,2022-02-14,0 days,0.0,0.0,False,True
1,0,25.00,2022-02-18,4 days,36975.0,99.932432,False,False
2,0,75.00,2022-02-24,6 days,50.0,200.0,False,False
3,0,100.00,2022-02-25,1 days,25.0,33.333333,False,True
4,0,300.00,2022-03-17,20 days,200.0,200.0,False,False
...,...,...,...,...,...,...,...,...
156528,5941,8.66,2023-01-21,1 days,13.99,61.766004,False,True
156529,5941,267.13,2023-01-23,2 days,258.47,2984.642032,False,True
156530,5941,2.00,2023-01-24,1 days,265.13,99.251301,False,True
156531,5941,207.16,2023-01-24,0 days,205.16,10258.0,False,True


## Sort by prism_consumer_id, posted_date, then category:

In [25]:
# All remaining inflows, ordered by consumer, then posting date, then category,
# with a fresh 0..n-1 index.
df = (
    inflows[['prism_consumer_id', 'amount', 'posted_date', 'category']]
    .sort_values(by=['prism_consumer_id', 'posted_date', 'category'], ignore_index=True)
)

In [26]:
# Per consumer: array of gaps between consecutive posting dates, with a leading 0
# for the first transaction. The integer timestamps are floor-divided by 10**9,
# which presumably converts nanoseconds to seconds — TODO confirm the datetime unit.
time_diffs = (
    df
    .groupby('prism_consumer_id')['posted_date']
    .apply(lambda dates: np.ediff1d(dates.astype('int64').floordiv(10**9), to_begin=0))
)

In [27]:
# Time elapsed since the same consumer's previous transaction (any category).
# groupby(...).diff() is aligned on df's own index, so the result does not depend
# on the frame's row order matching the positions of an exploded per-group array,
# and it needs no round trip through integer epoch seconds.
# The first transaction of each consumer has no predecessor and is given 0 days.
df['time_diff'] = (
    df
    .groupby('prism_consumer_id')['posted_date']
    .diff()
    .fillna(pd.Timedelta(0))
)

In [28]:
# Per consumer: array of absolute differences between consecutive transaction
# amounts, with a leading 0 for the first transaction.
amount_diffs = (
    df
    .groupby('prism_consumer_id')['amount']
    .apply(lambda amounts: np.abs(np.ediff1d(amounts, to_begin=0)))
)

In [29]:
# Absolute change in amount versus the same consumer's previous transaction.
# Computed with an index-aligned groupby diff so the column is a numeric (float)
# Series rather than the object-dtype result of exploding per-group arrays, and
# so it does not rely on positional alignment with the frame.
# The first transaction of each consumer has no predecessor and is given 0.
df['amount_diff'] = (
    df
    .groupby('prism_consumer_id')['amount']
    .diff()
    .abs()
    .fillna(0.0)
)

In [30]:
# Display the all-category frame, now including time_diff and amount_diff.
df

Unnamed: 0,prism_consumer_id,amount,posted_date,category,time_diff,amount_diff
0,0,37000.00,2022-02-14,EXTERNAL_TRANSFER,0 days,0.0
1,0,0.04,2022-02-17,INVESTMENT_INCOME,3 days,36999.96
2,0,25.00,2022-02-18,EXTERNAL_TRANSFER,1 days,24.96
3,0,2331.71,2022-02-18,PAYCHECK,0 days,2306.71
4,0,75.00,2022-02-24,EXTERNAL_TRANSFER,6 days,2256.71
...,...,...,...,...,...,...
304092,5941,8.66,2023-01-21,EXTERNAL_TRANSFER,1 days,13.99
304093,5941,267.13,2023-01-23,EXTERNAL_TRANSFER,2 days,258.47
304094,5941,2.00,2023-01-24,EXTERNAL_TRANSFER,1 days,265.13
304095,5941,207.16,2023-01-24,EXTERNAL_TRANSFER,0 days,205.16


In [31]:
# Change in amount relative to the same consumer's previous transaction, in percent.
# The previous amount is taken per consumer: an ungrouped shift(1) would use the
# last amount of the preceding consumer on each consumer's first row.
# Rows without a previous transaction fall back to a denominator of 1.
# amount_diff is cast to float so the result is numeric even if that column
# is stored with object dtype.
df['percent_change'] = (
    df['amount_diff'].astype('float64')
    / df.groupby('prism_consumer_id')['amount'].shift(1).fillna(1)
    * 100
)

# Check if differences are within 10-20%
# NOTE(review): between(10, 20) is inclusive on both ends and is False for
# changes below 10% (e.g. identical consecutive amounts) — confirm this is intended.
df['amount_threshold'] = df['percent_change'].between(10, 20)

In [32]:
# True where the gap to the previous row's transaction is at most 3 days.
df['days_threshold'] = df['time_diff'].le(pd.Timedelta(days=3))

# Display the frame with the new flag column.
df

Unnamed: 0,prism_consumer_id,amount,posted_date,category,time_diff,amount_diff,percent_change,amount_threshold,days_threshold
0,0,37000.00,2022-02-14,EXTERNAL_TRANSFER,0 days,0.0,0.0,False,True
1,0,0.04,2022-02-17,INVESTMENT_INCOME,3 days,36999.96,99.999892,False,True
2,0,25.00,2022-02-18,EXTERNAL_TRANSFER,1 days,24.96,62400.0,False,True
3,0,2331.71,2022-02-18,PAYCHECK,0 days,2306.71,9226.84,False,True
4,0,75.00,2022-02-24,EXTERNAL_TRANSFER,6 days,2256.71,96.783477,False,False
...,...,...,...,...,...,...,...,...,...
304092,5941,8.66,2023-01-21,EXTERNAL_TRANSFER,1 days,13.99,61.766004,False,True
304093,5941,267.13,2023-01-23,EXTERNAL_TRANSFER,2 days,258.47,2984.642032,False,True
304094,5941,2.00,2023-01-24,EXTERNAL_TRANSFER,1 days,265.13,99.251301,False,True
304095,5941,207.16,2023-01-24,EXTERNAL_TRANSFER,0 days,205.16,10258.0,False,True
