# Payments and charges: a pandas implementation
In Data/payments.csv we have a stream of payment records.
In Data/charges.csv we have a stream of charge records.

The payment records are identified only by their payment date and carry no reference to a charge record; they must be allocated, in the order they arrive, to the charge records ordered by their bill dates.

This is a port of the SQL implementation (written in the notebook "SQL implementation for Payments and charges.ipynb") to Python, using the pandas module to transform the tables, join them, compute new columns, ...

Prerequisites:
- pandas: the overall solution is based on DataFrames
- numpy: not essential, used to create and compute new columns starting from the existing ones


In [182]:
import numpy as np
import pandas as pd
# Load the two input streams; the date columns are parsed into datetimes at read time
payments = pd.read_csv(
    'Data/payments.csv',
    parse_dates=['payment_date'],
)
charges = pd.read_csv(
    'Data/charges.csv',
    parse_dates=['scheduled_payment_date'],
)


In [183]:
# Payments reshaped into adjacent segments, one run of segments per customer
# ==========================================================================
# Final column layout, ordered by geometric meaning: lower bound, size, upper bound, then the date
columns = 'customer_id', 'payment_lower_extreme', 'payment_segment_size', 'payment_upper_extreme', 'payment_date'

payment_segments = (
    payments
    # drop the columns that play no role in building the segments
    .drop(columns=['payment_status', 'rent_payment', 'utilities_payment', 'late_fee_payment'])
    # the total payment is the size (length) of the segment
    .rename(columns={'total_payment': 'payment_segment_size'})
    # per customer, order the segments along the time line
    # (the size is a tie-breaker when two records share customer and date)
    .sort_values(['customer_id', 'payment_date', 'payment_segment_size'])
    # upper bound = per-customer running total of the segment sizes
    .assign(
        payment_upper_extreme=lambda seg: seg.groupby('customer_id')['payment_segment_size'].cumsum()
    )
    # lower bound = upper bound of the previous segment of the same customer;
    # the first segment of each customer has no predecessor and starts at 0
    .assign(
        payment_lower_extreme=lambda seg: seg.groupby('customer_id')['payment_upper_extreme'].shift(1).fillna(0)
    )
    .reindex(columns=columns)
)

payment_segments


Unnamed: 0,customer_id,payment_lower_extreme,payment_segment_size,payment_upper_extreme,payment_date
11,1001,0.0,200,200,2021-03-28
12,1001,200.0,2166,2366,2021-04-03
13,1001,2366.0,200,2566,2021-04-19
14,1001,2566.0,2166,4732,2021-04-23
15,1001,4732.0,2166,6898,2021-05-29
16,1001,6898.0,200,7098,2021-06-17
0,1002,0.0,2083,2083,2020-12-28
1,1002,2083.0,2083,4166,2021-01-13
2,1002,4166.0,2083,6249,2021-01-27
3,1002,6249.0,2083,8332,2021-02-12


In [184]:
# Charges reshaped into adjacent segments, one run of segments per customer
# =========================================================================
# Final column layout, ordered by geometric meaning: lower bound, size, upper bound, then the date
columns = 'customer_id', 'charge_lower_extreme', 'charge_segment_size', 'charge_upper_extreme', 'charge_date'

charge_segments = (
    charges
    # drop the columns that play no role in building the segments
    .drop(columns=['scheduled_rent', 'scheduled_utilities', 'scheduled_late_fee'])
    # the scheduled total is the size (length) of the segment; the scheduled date is the charge date
    .rename(columns={
        'scheduled_total_payment': 'charge_segment_size',
        'scheduled_payment_date': 'charge_date',
    })
    # per customer, order the segments along the time line
    # (the size is a tie-breaker when two records share customer and date)
    .sort_values(['customer_id', 'charge_date', 'charge_segment_size'])
    # upper bound = per-customer running total of the segment sizes
    .assign(
        charge_upper_extreme=lambda seg: seg.groupby('customer_id')['charge_segment_size'].cumsum()
    )
    # lower bound = upper bound of the previous segment of the same customer;
    # the first segment of each customer has no predecessor and starts at 0
    .assign(
        charge_lower_extreme=lambda seg: seg.groupby('customer_id')['charge_upper_extreme'].shift(1).fillna(0)
    )
    .reindex(columns=columns)
)

charge_segments


Unnamed: 0,customer_id,charge_lower_extreme,charge_segment_size,charge_upper_extreme,charge_date
24,1001,0.0,2166,2166,2021-04-03
25,1001,2166.0,2166,4332,2021-04-24
26,1001,4332.0,2166,6498,2021-05-29
27,1001,6498.0,2166,8664,2021-06-29
28,1001,8664.0,2166,10830,2021-07-14
...,...,...,...,...,...
51,1006,3550.0,1050,4600,2021-06-29
52,1006,4600.0,1050,5650,2021-07-14
53,1006,5650.0,1050,6700,2021-07-29
54,1006,6700.0,1050,7750,2021-08-14


In [185]:
# <payment segments overlapping charge segments> is equivalent to <how payments map to charges>
# ==============================================================================================
import numpy as np

# Inner join on customer_id: every payment segment of a customer is paired with
# every charge segment of the same customer (a per-customer cross product)
payment_allocation = pd.merge(payment_segments, charge_segments, on='customer_id', how='inner')

# Keep only the overlapping segment pairs, so the filtering condition is:
# payment_segments.lower_extreme < charge_segments.upper_extreme AND payment_segments.upper_extreme > charge_segments.lower_extreme
# The explicit .copy() makes the filtered rows an independent DataFrame, so the column
# assignments below do not raise SettingWithCopyWarning on pandas without copy-on-write.
payment_allocation = payment_allocation[
    (payment_allocation['payment_lower_extreme'] < payment_allocation['charge_upper_extreme'])
    & (payment_allocation['payment_upper_extreme'] > payment_allocation['charge_lower_extreme'])
].copy()

# Columns describing the <overlapping segment>:
#  Lower is the MAX(charge_segments.lower_extreme, payment_segments.lower_extreme)
payment_allocation['overlapping_lower_extreme'] = payment_allocation[['payment_lower_extreme', 'charge_lower_extreme']].max(axis=1)

#  Upper is the MIN(charge_segments.upper_extreme, payment_segments.upper_extreme)
payment_allocation['overlapping_upper_extreme'] = payment_allocation[['payment_upper_extreme', 'charge_upper_extreme']].min(axis=1)

# Overlapping segment size is upper - lower => it exactly matches the amount of the payment allocated to the charge
payment_allocation['allocation_amount'] = payment_allocation['overlapping_upper_extreme'] - payment_allocation['overlapping_lower_extreme']

# The status (PAID / UNPAID) of the charge and the debt or credit situation in the context of the current payment row:
# a charge is UNPAID while its upper bound lies beyond the cumulative payments made so far
payment_allocation['charge_status'] = np.where(payment_allocation['charge_upper_extreme'] > payment_allocation['payment_upper_extreme'],
                                               'UNPAID',
                                               'PAID')
# Debt is what is still missing to reach the end of the charge (NaN when the charge is paid)
payment_allocation['current_debt'] = np.where(payment_allocation['charge_upper_extreme'] > payment_allocation['payment_upper_extreme'],
                                              payment_allocation['charge_upper_extreme'] - payment_allocation['payment_upper_extreme'],
                                              np.nan)
# Credit is what was paid beyond the end of the charge (NaN when the charge is unpaid)
payment_allocation['current_credit'] = np.where(payment_allocation['charge_upper_extreme'] <= payment_allocation['payment_upper_extreme'],
                                                payment_allocation['payment_upper_extreme'] - payment_allocation['charge_upper_extreme'],
                                                np.nan)

payment_allocation


Unnamed: 0,customer_id,payment_lower_extreme,payment_segment_size,payment_upper_extreme,payment_date,charge_lower_extreme,charge_segment_size,charge_upper_extreme,charge_date,overlapping_lower_extreme,overlapping_upper_extreme,allocation_amount,charge_status,current_debt,current_credit
0,1001,0.0,200,200,2021-03-28,0.0,2166,2166,2021-04-03,0.0,200,200.0,UNPAID,1966.0,
12,1001,200.0,2166,2366,2021-04-03,0.0,2166,2166,2021-04-03,200.0,2166,1966.0,PAID,,200.0
13,1001,200.0,2166,2366,2021-04-03,2166.0,2166,4332,2021-04-24,2166.0,2366,200.0,UNPAID,1966.0,
25,1001,2366.0,200,2566,2021-04-19,2166.0,2166,4332,2021-04-24,2366.0,2566,200.0,UNPAID,1766.0,
37,1001,2566.0,2166,4732,2021-04-23,2166.0,2166,4332,2021-04-24,2566.0,4332,1766.0,PAID,,400.0
38,1001,2566.0,2166,4732,2021-04-23,4332.0,2166,6498,2021-05-29,4332.0,4732,400.0,UNPAID,1766.0,
50,1001,4732.0,2166,6898,2021-05-29,4332.0,2166,6498,2021-05-29,4732.0,6498,1766.0,PAID,,400.0
51,1001,4732.0,2166,6898,2021-05-29,6498.0,2166,8664,2021-06-29,6498.0,6898,400.0,UNPAID,1766.0,
63,1001,6898.0,200,7098,2021-06-17,6498.0,2166,8664,2021-06-29,6898.0,7098,200.0,UNPAID,1566.0,
72,1002,0.0,2083,2083,2020-12-28,0.0,2083,2083,2020-12-27,0.0,2083,2083.0,PAID,,0.0


In [193]:
# Present the allocation with the same result set as the SQL implementation:
# the segment sizes are exposed under their business names (amounts) ...
payment_allocation.rename(
    columns={
        'payment_segment_size': 'payment_amount',
        'charge_segment_size': 'charge_amount',
    },
    inplace=True,
)

# ... and only the columns present in the SQL result are shown, in that order
payment_allocation.filter(items=[
    'customer_id',
    'payment_date',
    'payment_amount',
    'allocation_amount',
    'charge_date',
    'charge_amount',
    'charge_status',
    'current_debt',
    'current_credit',
])


Unnamed: 0,customer_id,payment_date,payment_amount,allocation_amount,charge_date,charge_amount,charge_status,current_debt,current_credit
0,1001,2021-03-28,200,200.0,2021-04-03,2166,UNPAID,1966.0,
12,1001,2021-04-03,2166,1966.0,2021-04-03,2166,PAID,,200.0
13,1001,2021-04-03,2166,200.0,2021-04-24,2166,UNPAID,1966.0,
25,1001,2021-04-19,200,200.0,2021-04-24,2166,UNPAID,1766.0,
37,1001,2021-04-23,2166,1766.0,2021-04-24,2166,PAID,,400.0
38,1001,2021-04-23,2166,400.0,2021-05-29,2166,UNPAID,1766.0,
50,1001,2021-05-29,2166,1766.0,2021-05-29,2166,PAID,,400.0
51,1001,2021-05-29,2166,400.0,2021-06-29,2166,UNPAID,1766.0,
63,1001,2021-06-17,200,200.0,2021-06-29,2166,UNPAID,1566.0,
72,1002,2020-12-28,2083,2083.0,2020-12-27,2083,PAID,,0.0
