### Objetivo: "Como o PIX está se saindo dentro do LCGBR Bank. Para isso, ele deseja sua ajuda para criar indicadores que podem ser usados para rastrear o desempenho do produto. Quais métricas você sugeriria para acompanhá-lo e por quê?"

![db_arch](table_diagram.png)

In [4]:
import pandas as pd

### Carregando a base

In [5]:
df = pd.read_csv('csv/pix_movements.csv')
df_time = pd.read_csv('csv/d_time.csv')

In [6]:
df

Unnamed: 0,id,account_id,in_or_out,pix_amount,pix_requested_at,pix_completed_at,status
0,1362907709468179968,509281836645315264,pix_out,1894.77,1579693633580,1.579694e+12,completed
1,2246794118022659072,509281836645315264,pix_out,419.79,1587309244550,1.587309e+12,completed
2,165696026045637376,509281836645315264,pix_in,943.61,1609306218320,1.609306e+12,completed
3,919545932589046400,509281836645315264,pix_in,124.66,1586015734430,1.586016e+12,completed
4,2001212142240523008,509281836645315264,pix_out,775.96,1590974667000,1.590975e+12,completed
...,...,...,...,...,...,...,...
249877,2268414690237561344,674961947612972032,pix_out,1596.65,1579357052400,1.579357e+12,completed
249878,3152128518248888320,674961947612972032,pix_out,453.79,1581142919280,1.581143e+12,completed
249879,1800577140516179968,674961947612972032,pix_out,45.89,1608954790060,1.608955e+12,completed
249880,160814583687737312,674961947612972032,pix_in,1165.90,1598609653080,1.598610e+12,completed


In [7]:
df_time

Unnamed: 0,time_id,action_timestamp,week_id,month_id,year_id,weekday_id
0,1578150055960,2020-01-04T15:00:55.960Z,102414000,507,1024140,2535
1,1579693646070,2020-01-22T11:47:26.070Z,102415521,507,1024140,1014
2,1579073618020,2020-01-15T07:33:38.020Z,102415014,507,1024140,1014
3,1602776879440,2020-10-15T15:47:59.440Z,102434787,5070,1024140,1521
4,1587309252480,2020-04-19T15:14:12.480Z,102421605,2028,1024140,3042
...,...,...,...,...,...,...
975561,1581142919280,2020-02-08T06:21:59.280Z,102416535,1014,1024140,2535
975562,1588976036360,2020-05-08T22:13:56.360Z,102423126,2535,1024140,2028
975563,1608954790060,2020-12-26T03:53:10.060Z,102439857,6084,1024140,2535
975564,1598609653080,2020-08-28T10:14:13.080Z,102431238,4056,1024140,2028


In [8]:
df.columns

Index(['id', 'account_id', 'in_or_out', 'pix_amount', 'pix_requested_at',
       'pix_completed_at', 'status'],
      dtype='object')

In [9]:
df_time.columns

Index(['time_id', 'action_timestamp', 'week_id', 'month_id', 'year_id',
       'weekday_id'],
      dtype='object')

### Unificando campos

In [10]:
df_merged = df.merge(df_time[['time_id', 'action_timestamp']], left_on='pix_requested_at', right_on='time_id', how='left')
df_merged = df_merged.rename(columns={'action_timestamp': 'timestamp_requested'})
df_merged['timestamp_requested'] = pd.to_datetime(df_merged['timestamp_requested'], errors='coerce')
df_merged['date_requested'] = df_merged['timestamp_requested'].dt.strftime('%Y-%m-%d')
df_merged['time_requested'] = df_merged['timestamp_requested'].dt.strftime('%H:%M:%S')
df_merged = df_merged.drop(columns=['timestamp_requested', 'time_id'])
df_merged

Unnamed: 0,id,account_id,in_or_out,pix_amount,pix_requested_at,pix_completed_at,status,date_requested,time_requested
0,1362907709468179968,509281836645315264,pix_out,1894.77,1579693633580,1.579694e+12,completed,2020-01-22,11:47:13
1,2246794118022659072,509281836645315264,pix_out,419.79,1587309244550,1.587309e+12,completed,2020-04-19,15:14:04
2,165696026045637376,509281836645315264,pix_in,943.61,1609306218320,1.609306e+12,completed,2020-12-30,05:30:18
3,919545932589046400,509281836645315264,pix_in,124.66,1586015734430,1.586016e+12,completed,2020-04-04,15:55:34
4,2001212142240523008,509281836645315264,pix_out,775.96,1590974667000,1.590975e+12,completed,2020-06-01,01:24:27
...,...,...,...,...,...,...,...,...,...
249877,2268414690237561344,674961947612972032,pix_out,1596.65,1579357052400,1.579357e+12,completed,2020-01-18,14:17:32
249878,3152128518248888320,674961947612972032,pix_out,453.79,1581142919280,1.581143e+12,completed,2020-02-08,06:21:59
249879,1800577140516179968,674961947612972032,pix_out,45.89,1608954790060,1.608955e+12,completed,2020-12-26,03:53:10
249880,160814583687737312,674961947612972032,pix_in,1165.90,1598609653080,1.598610e+12,completed,2020-08-28,10:14:13


In [11]:
df_merged_date = df_merged.merge(df_time[['time_id', 'action_timestamp']], left_on='pix_completed_at', right_on='time_id', how='left')
df_merged_date = df_merged_date.rename(columns={'action_timestamp': 'timestamp_completed'})
df_merged_date['timestamp_completed'] = pd.to_datetime(df_merged_date['timestamp_completed'], errors='coerce')
df_merged_date['date_completed'] = df_merged_date['timestamp_completed'].dt.strftime('%Y-%m-%d')
df_merged_date['time_completed'] = df_merged_date['timestamp_completed'].dt.strftime('%H:%M:%S')
df_merged_date = df_merged_date.drop(columns=['timestamp_completed', 'time_id'])
df_merged_date

Unnamed: 0,id,account_id,in_or_out,pix_amount,pix_requested_at,pix_completed_at,status,date_requested,time_requested,date_completed,time_completed
0,1362907709468179968,509281836645315264,pix_out,1894.77,1579693633580,1.579694e+12,completed,2020-01-22,11:47:13,2020-01-22,11:47:26
1,2246794118022659072,509281836645315264,pix_out,419.79,1587309244550,1.587309e+12,completed,2020-04-19,15:14:04,2020-04-19,15:14:12
2,165696026045637376,509281836645315264,pix_in,943.61,1609306218320,1.609306e+12,completed,2020-12-30,05:30:18,2020-12-30,05:30:27
3,919545932589046400,509281836645315264,pix_in,124.66,1586015734430,1.586016e+12,completed,2020-04-04,15:55:34,2020-04-04,15:55:47
4,2001212142240523008,509281836645315264,pix_out,775.96,1590974667000,1.590975e+12,completed,2020-06-01,01:24:27,2020-06-01,01:24:32
...,...,...,...,...,...,...,...,...,...,...,...
249877,2268414690237561344,674961947612972032,pix_out,1596.65,1579357052400,1.579357e+12,completed,2020-01-18,14:17:32,2020-01-18,14:17:43
249878,3152128518248888320,674961947612972032,pix_out,453.79,1581142919280,1.581143e+12,completed,2020-02-08,06:21:59,2020-02-08,06:22:04
249879,1800577140516179968,674961947612972032,pix_out,45.89,1608954790060,1.608955e+12,completed,2020-12-26,03:53:10,2020-12-26,03:53:26
249880,160814583687737312,674961947612972032,pix_in,1165.90,1598609653080,1.598610e+12,completed,2020-08-28,10:14:13,2020-08-28,10:14:23


In [12]:
df_merged_date = df_merged_date.drop(columns=['pix_requested_at', 'pix_completed_at'])

In [13]:
df_merged_date

Unnamed: 0,id,account_id,in_or_out,pix_amount,status,date_requested,time_requested,date_completed,time_completed
0,1362907709468179968,509281836645315264,pix_out,1894.77,completed,2020-01-22,11:47:13,2020-01-22,11:47:26
1,2246794118022659072,509281836645315264,pix_out,419.79,completed,2020-04-19,15:14:04,2020-04-19,15:14:12
2,165696026045637376,509281836645315264,pix_in,943.61,completed,2020-12-30,05:30:18,2020-12-30,05:30:27
3,919545932589046400,509281836645315264,pix_in,124.66,completed,2020-04-04,15:55:34,2020-04-04,15:55:47
4,2001212142240523008,509281836645315264,pix_out,775.96,completed,2020-06-01,01:24:27,2020-06-01,01:24:32
...,...,...,...,...,...,...,...,...,...
249877,2268414690237561344,674961947612972032,pix_out,1596.65,completed,2020-01-18,14:17:32,2020-01-18,14:17:43
249878,3152128518248888320,674961947612972032,pix_out,453.79,completed,2020-02-08,06:21:59,2020-02-08,06:22:04
249879,1800577140516179968,674961947612972032,pix_out,45.89,completed,2020-12-26,03:53:10,2020-12-26,03:53:26
249880,160814583687737312,674961947612972032,pix_in,1165.90,completed,2020-08-28,10:14:13,2020-08-28,10:14:23


In [15]:
df_completed = df[df['status'] == 'completed']
df_completed['pix_amount'].describe()

count    237283.000000
mean        999.612931
std         578.207597
min           0.010000
25%         497.275000
50%         999.050000
75%        1501.840000
max        1999.990000
Name: pix_amount, dtype: float64

### Analisando a base

In [15]:
df_merged_date.columns.tolist()

['id',
 'account_id',
 'in_or_out',
 'pix_amount',
 'status',
 'date_requested',
 'time_requested',
 'date_completed',
 'time_completed']

In [18]:
df_merged_date['status'].value_counts(normalize=True) * 100

completed    94.95802
failed        5.04198
Name: status, dtype: float64

In [19]:
total_transactions = df_merged_date.groupby('account_id').size().rename("total_pix")
status_proportion = df_merged_date.groupby(['account_id', 'status']).size().unstack(fill_value=0)
status_proportion['ratio'] = status_proportion['completed'] / status_proportion.sum(axis=1)

result = total_transactions.to_frame().join(status_proportion[['ratio']])

result = result.sort_values(by=['total_pix'], ascending=False).reset_index()

result

Unnamed: 0,account_id,total_pix,ratio
0,3133285803131044352,151,0.953642
1,2684657256168998400,147,0.972789
2,1558252945903991040,145,0.917241
3,3201760463155440128,144,0.944444
4,2129013948968909312,141,0.950355
...,...,...,...
3974,3380326633499297280,1,1.000000
3975,1513068701380254464,1,1.000000
3976,623863459555141504,1,1.000000
3977,1784983368547723520,1,1.000000


In [20]:
result['total_pix'].describe()

count    3979.000000
mean       62.800201
std        36.728305
min         1.000000
25%        31.000000
50%        62.000000
75%        94.000000
max       151.000000
Name: total_pix, dtype: float64

In [21]:
worst_ratio = result.sort_values(by=['ratio'], ascending=True).reset_index(drop=True)

worst_ratio

Unnamed: 0,account_id,total_pix,ratio
0,3241979217776357376,1,0.000000
1,1304562713001272832,5,0.600000
2,2579703769389719552,5,0.600000
3,2631555412936298496,3,0.666667
4,62725486856205360,9,0.666667
...,...,...,...
3974,2777661816274882560,18,1.000000
3975,407514318012190336,18,1.000000
3976,863861660636230656,17,1.000000
3977,1789505082597483776,18,1.000000


In [None]:
df_status_failed = df_merged_date.loc[df_merged_date['status'] == 'failed']
df_status_failed = df_status_failed.drop(columns=['date_completed', 'time_completed'])
df_status_failed

Unnamed: 0,id,account_id,in_or_out,pix_amount,status,date_requested,time_requested
84,445284370681548352,2969674447809961,pix_in,923.74,failed,2020-03-17,17:48:50
103,2691862463156810752,2969674447809961,pix_out,1353.67,failed,2020-11-30,06:36:38
139,3116117030374974464,2969674447809961,pix_out,14.49,failed,2020-04-02,05:17:18
215,1942164848974726656,5763135580788529,pix_in,806.71,failed,2020-04-12,15:06:20
219,840256070701326080,5763135580788529,pix_in,504.92,failed,2020-01-29,15:56:25
...,...,...,...,...,...,...,...
249824,205416295080949056,3402164927390067200,pix_out,355.53,failed,2020-06-28,11:24:10
249834,1760757337067899392,3402164927390067200,pix_out,1603.83,failed,2020-02-04,10:09:44
249843,478576301554644032,3402164927390067200,pix_out,1095.47,failed,2020-07-25,09:28:03
249845,2346912906066982400,3402164927390067200,pix_in,1824.26,failed,2020-09-20,01:13:24


In [23]:
df_status_failed = df_status_failed.loc[df_status_failed['status'] == 'failed']
df_status_failed['time_requested_format'] = pd.to_datetime(df_status_failed['time_requested']).dt.time
df_status_failed['horario_comercial'] = df_status_failed['time_requested_format'].apply(
    lambda x: '1' if pd.to_datetime("09:00:00").time() <= x <= pd.to_datetime("18:00:00").time() else '0'
)

df_status_failed

Unnamed: 0,id,account_id,in_or_out,pix_amount,status,date_requested,time_requested,time_requested_format,horario_comercial
84,445284370681548352,2969674447809961,pix_in,923.74,failed,2020-03-17,17:48:50,17:48:50,1
103,2691862463156810752,2969674447809961,pix_out,1353.67,failed,2020-11-30,06:36:38,06:36:38,0
139,3116117030374974464,2969674447809961,pix_out,14.49,failed,2020-04-02,05:17:18,05:17:18,0
215,1942164848974726656,5763135580788529,pix_in,806.71,failed,2020-04-12,15:06:20,15:06:20,1
219,840256070701326080,5763135580788529,pix_in,504.92,failed,2020-01-29,15:56:25,15:56:25,1
...,...,...,...,...,...,...,...,...,...
249824,205416295080949056,3402164927390067200,pix_out,355.53,failed,2020-06-28,11:24:10,11:24:10,1
249834,1760757337067899392,3402164927390067200,pix_out,1603.83,failed,2020-02-04,10:09:44,10:09:44,1
249843,478576301554644032,3402164927390067200,pix_out,1095.47,failed,2020-07-25,09:28:03,09:28:03,1
249845,2346912906066982400,3402164927390067200,pix_in,1824.26,failed,2020-09-20,01:13:24,01:13:24,0


In [25]:
df_status_failed.columns.tolist()

['id',
 'account_id',
 'in_or_out',
 'pix_amount',
 'status',
 'date_requested',
 'time_requested',
 'time_requested_format',
 'horario_comercial']

In [24]:
df_status_failed['pix_amount'].describe()

count    12599.000000
mean      1004.384532
std        582.780284
min          0.010000
25%        495.685000
50%       1008.740000
75%       1508.730000
max       1999.820000
Name: pix_amount, dtype: float64

In [None]:
df_status_failed['in_or_out'].value_counts(normalize=True)* 100

pix_in     50.130963
pix_out    49.869037
Name: in_or_out, dtype: float64

In [None]:
comercial_counts = df_status_failed['horario_comercial'].value_counts(normalize=True)* 100

comercial_counts

0    61.655687
1    38.344313
Name: horario_comercial, dtype: float64

In [None]:
in_out_comercial_counts = df_status_failed.groupby(['in_or_out', 'horario_comercial']).size().unstack(fill_value=0)
in_out_comercial_percentages = in_out_comercial_counts.div(in_out_comercial_counts.sum(axis=1), axis=0) * 100

in_out_comercial_percentages

horario_comercial,0,1
in_or_out,Unnamed: 1_level_1,Unnamed: 2_level_1
pix_in,61.431286,38.568714
pix_out,61.881267,38.118733


## Pix por estado

`pix_movements['account_id']` -> `accounts['account_id']`   ['account_id', 'customer_id']   
`accounts['customer_id']` -> `customers['customer_id']`     ['customer_id', 'customer_city']    
`customers['customer_city']` -> `city['city_id']`           ['city_id', 'state_id']  
`city['state_id']` -> `state['state_id']`  

In [33]:
df_pix_movements = df_merged_date.copy()
df_accounts = pd.read_csv('csv/accounts.csv')
df_customers = pd.read_csv('csv/customers.csv')
df_city = pd.read_csv('csv/city.csv')
df_state = pd.read_csv('csv/state.csv')

df_pix_movements

Unnamed: 0,id,account_id,in_or_out,pix_amount,status,date_requested,time_requested,date_completed,time_completed
0,286774842876319872,2569200459575096,pix_out,1489.65,completed,2020-11-26,12:00:02,2020-11-26,12:00:17
1,1621009960637386752,2569200459575096,pix_out,171.82,completed,2020-06-02,14:13:08,2020-06-02,14:13:17
2,2303082362584393984,2569200459575096,pix_in,1963.65,completed,2020-01-11,23:00:19,2020-01-11,23:00:39
3,2505496043020497920,2569200459575096,pix_out,1383.17,completed,2020-03-24,08:39:14,2020-03-24,08:39:25
4,2587782110525443072,2569200459575096,pix_out,457.52,completed,2020-10-05,07:53:51,2020-10-05,07:53:59
...,...,...,...,...,...,...,...,...,...
249877,706826583476606208,3402757528457126912,pix_in,698.84,completed,2020-10-31,08:48:55,2020-10-31,08:48:58
249878,207982978451739520,3402757528457126912,pix_out,364.77,completed,2020-01-12,20:31:40,2020-01-12,20:31:43
249879,2282019638356964864,3402757528457126912,pix_in,409.68,completed,2020-06-25,01:55:47,2020-06-25,01:55:53
249880,2249799862805598464,3402757528457126912,pix_out,636.71,completed,2020-04-17,05:45:22,2020-04-17,05:45:41


In [None]:
df_pix_state = df_pix_movements.merge(df_accounts[['account_id', 'customer_id']], left_on='account_id', right_on='account_id', how='left')
df_pix_state = df_pix_state.merge(df_customers[['customer_id', 'customer_city']], left_on='customer_id', right_on='customer_id', how='left')
df_pix_state = df_pix_state.rename(columns={'customer_city': 'city_id'})
df_pix_state = df_pix_state.merge(df_city[['city_id', 'state_id']], left_on='city_id', right_on='city_id', how='left')

In [None]:
df_pix_state = df_pix_state.merge(df_state[['state_id', 'state']], left_on='state_id', right_on='state_id', how='left')
df_pix_state = df_pix_state.drop(columns=['customer_id', 'city_id', 'state_id'])


df_pix_state

Unnamed: 0,id,account_id,in_or_out,pix_amount,status,date_requested,time_requested,date_completed,time_completed,state
0,286774842876319872,2569200459575096,pix_out,1489.65,completed,2020-11-26,12:00:02,2020-11-26,12:00:17,SP
1,1621009960637386752,2569200459575096,pix_out,171.82,completed,2020-06-02,14:13:08,2020-06-02,14:13:17,SP
2,2303082362584393984,2569200459575096,pix_in,1963.65,completed,2020-01-11,23:00:19,2020-01-11,23:00:39,SP
3,2505496043020497920,2569200459575096,pix_out,1383.17,completed,2020-03-24,08:39:14,2020-03-24,08:39:25,SP
4,2587782110525443072,2569200459575096,pix_out,457.52,completed,2020-10-05,07:53:51,2020-10-05,07:53:59,SP
...,...,...,...,...,...,...,...,...,...,...
249877,706826583476606208,3402757528457126912,pix_in,698.84,completed,2020-10-31,08:48:55,2020-10-31,08:48:58,SP
249878,207982978451739520,3402757528457126912,pix_out,364.77,completed,2020-01-12,20:31:40,2020-01-12,20:31:43,SP
249879,2282019638356964864,3402757528457126912,pix_in,409.68,completed,2020-06-25,01:55:47,2020-06-25,01:55:53,SP
249880,2249799862805598464,3402757528457126912,pix_out,636.71,completed,2020-04-17,05:45:22,2020-04-17,05:45:41,SP


In [None]:
df_pix_state['state'].value_counts(normalize=True) * 100

MG    14.555670
SP    11.357361
RS     9.208346
SC     7.757261
GO     7.392289
TO     6.224938
BA     5.886779
CE     5.643864
MA     5.061589
RN     4.772253
PR     3.671733
PE     3.107467
RJ     2.930583
MT     2.628040
PB     2.150615
ES     1.938115
PI     1.207770
MS     1.077308
AL     0.958853
SE     0.887219
PA     0.849201
AP     0.732746
Name: state, dtype: float64

In [None]:
df_pix_state_status_failed = df_pix_state.loc[df_pix_state['status'] == 'failed']
df_pix_state_status_failed = df_pix_state_status_failed.drop(columns=['date_completed', 'time_completed'])


df_pix_state_status_failed = df_pix_state_status_failed.loc[df_pix_state_status_failed['status'] == 'failed']
df_pix_state_status_failed['time_requested_format'] = pd.to_datetime(df_pix_state_status_failed['time_requested']).dt.time
df_pix_state_status_failed['horario_comercial'] = df_pix_state_status_failed['time_requested_format'].apply(
    lambda x: '1' if pd.to_datetime("09:00:00").time() <= x <= pd.to_datetime("18:00:00").time() else '0'
)

df_pix_state_status_failed['state'].value_counts()

MG    1899
SP    1409
RS    1152
GO     933
SC     932
TO     783
BA     728
CE     684
MA     658
RN     638
PR     440
PE     389
RJ     365
MT     338
ES     255
PB     249
PI     165
MS     153
SE     120
AL     116
PA     101
AP      92
Name: state, dtype: int64

In [None]:

success_failure_counts = df_pix_state.groupby(['state', 'in_or_out']).size().unstack(fill_value=0)
success_failure_percentages = success_failure_counts.div(success_failure_counts.sum(axis=1), axis=0) * 100
success_failure_percentages.columns = ['Failed', 'Completed']

print(success_failure_percentages)

          Failed  Completed
state                      
AL     50.417362  49.582638
AP     49.208083  50.791917
BA     50.156356  49.843644
CE     49.691555  50.308445
ES     48.771423  51.228577
GO     50.313989  49.686011
MA     49.438646  50.561354
MG     50.401408  49.598592
MS     49.665676  50.334324
MT     50.449216  49.550784
PA     50.942507  49.057493
PB     49.906959  50.093041
PE     49.839021  50.160979
PI     49.304175  50.695825
PR     50.386921  49.613079
RJ     48.996313  51.003687
RN     50.524109  49.475891
RS     49.926119  50.073881
SC     49.886504  50.113496
SE     50.383401  49.616599
SP     49.809725  50.190275
TO     50.009643  49.990357
