In [1]:
import numpy as np
import pandas as pd
import ppscore as pps
import matplotlib.pyplot as plt

In [2]:
%%time
df = pd.read_csv("../data/interim/seggregated/payments.csv")
labels = pd.read_csv("../data/raw/train_labels.csv")

CPU times: user 3.35 s, sys: 408 ms, total: 3.75 s
Wall time: 3.77 s


In [3]:
%%time
# checking shape of the dataframe
df.shape, labels.shape

CPU times: user 20 µs, sys: 0 ns, total: 20 µs
Wall time: 22.4 µs


((5531451, 4), (458913, 2))

In [4]:
%%time
df.columns

CPU times: user 8 µs, sys: 5 µs, total: 13 µs
Wall time: 16 µs


Index(['customer_ID', 'P_2', 'P_3', 'P_4'], dtype='object')

In [5]:
%%time
# Let's encode customer_ID for efficient memory management.
labels['encoded_customerID'] = range(0, len(labels))

CPU times: user 4.86 ms, sys: 471 µs, total: 5.33 ms
Wall time: 3.68 ms


In [6]:
%%time
labels.head()

CPU times: user 173 µs, sys: 95 µs, total: 268 µs
Wall time: 246 µs


Unnamed: 0,customer_ID,target,encoded_customerID
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0,0
1,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,0,1
2,00001b22f846c82c51f6e3958ccd81970162bae8b007e8...,0,2
3,000041bdba6ecadd89a52d11886e8eaaec9325906c9723...,0,3
4,00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8a...,0,4


In [7]:
%%time
custid_mapper = labels[['customer_ID', 'encoded_customerID']].set_index('customer_ID').to_dict()
custid_mapper = custid_mapper['encoded_customerID']

CPU times: user 281 ms, sys: 32.2 ms, total: 313 ms
Wall time: 311 ms


In [8]:
%%time
df['encoded_customerID'] = df['customer_ID'].map(custid_mapper)
df = df.drop(['customer_ID'], axis=1)

CPU times: user 552 ms, sys: 35.8 ms, total: 588 ms
Wall time: 585 ms


In [9]:
%%time
res_df = pd.merge(df, labels[['encoded_customerID', 'target']], on='encoded_customerID', how='left')

CPU times: user 390 ms, sys: 183 ms, total: 574 ms
Wall time: 578 ms


In [10]:
%%time
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5531451 entries, 0 to 5531450
Data columns (total 4 columns):
 #   Column              Dtype  
---  ------              -----  
 0   P_2                 float64
 1   P_3                 float64
 2   P_4                 float64
 3   encoded_customerID  int64  
dtypes: float64(3), int64(1)
memory usage: 168.8 MB
CPU times: user 6.15 ms, sys: 1.01 ms, total: 7.16 ms
Wall time: 6.44 ms


In [11]:
%%time
# Column Profiling: data type, null percentage, describe (min, max, [5,25,50,75,95], mean, std, variance)

def col_profiling(col_series):
    res_dict = {'datatype': col_series.dtype,
                'null_pct': col_series.isna().sum()*100/len(col_series),
                'min_val': col_series.min(),
                'max_val': col_series.max(),
                'mean_val': col_series.mean(),
                'std': col_series.std(),
                'variance': col_series.std(),
                '5_pct': col_series.quantile(q=0.05),
                '25_pct': col_series.quantile(q=0.25),
                '50_pct': col_series.quantile(q=0.5),
                '75_pct': col_series.quantile(q=0.75),
                '95_pct': col_series.quantile(q=0.95),
                'pct_uniq': col_series.nunique() / len(col_series),
                'num_unique': col_series.nunique()}
    return res_dict

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 5.48 µs


In [12]:
%%time
# Column Profiling on delinquency variables
ls = []
for item in res_df:
    if res_df[item].dtype == np.float64 or res_df[item].dtype == np.int64:
        res = col_profiling(res_df[item])
        res['column_name'] = item
        ls.append(res)
    else:
        print(f"categorical column is {item}")

CPU times: user 6.56 s, sys: 771 ms, total: 7.33 s
Wall time: 7.35 s


In [13]:
%%time
col_profile_df = pd.DataFrame(ls).set_index('column_name')

CPU times: user 1.37 ms, sys: 399 µs, total: 1.77 ms
Wall time: 1.65 ms


In [14]:
%%time
col_profile_df = col_profile_df.reset_index()
col_profile_df

CPU times: user 694 µs, sys: 201 µs, total: 895 µs
Wall time: 846 µs


Unnamed: 0,column_name,datatype,null_pct,min_val,max_val,mean_val,std,variance,5_pct,25_pct,50_pct,75_pct,95_pct,pct_uniq,num_unique
0,P_2,float64,0.831337,-0.4589548,1.01,0.656334,0.244649,0.244649,0.221304,0.480331,0.694295,0.864816,0.976297,0.9916866,5485466
1,P_3,float64,5.450505,-1.51969,2.428051,0.601289,0.170799,0.170799,0.307036,0.540866,0.618303,0.683826,0.831308,0.945495,5229959
2,P_4,float64,0.0,2.668066e-09,1.26939,0.143927,0.338272,0.338272,0.000584,0.002924,0.005849,0.008766,0.964281,1.0,5531451
3,encoded_customerID,int64,0.0,0.0,458912.0,229419.424355,132490.213791,132490.213791,22922.0,114669.0,229380.0,344209.0,435977.0,0.08296431,458913
4,target,int64,0.0,0.0,1.0,0.249097,0.43249,0.43249,0.0,0.0,0.0,0.0,1.0,3.615688e-07,2


In [17]:
%%time
all_uniq = col_profile_df.loc[col_profile_df['pct_uniq']==1, 'column_name'].values.tolist()

CPU times: user 405 µs, sys: 112 µs, total: 517 µs
Wall time: 515 µs


In [18]:
%%time
all_uniq_df = df[all_uniq+['encoded_customerID']]

CPU times: user 11.8 ms, sys: 16 ms, total: 27.8 ms
Wall time: 27.3 ms


In [19]:
all_uniq_df

Unnamed: 0,P_4,encoded_customerID
0,0.007554,0
1,0.004832,0
2,0.006561,0
3,0.009559,0
4,0.008156,0
...,...,...
5531446,0.003754,458912
5531447,0.009763,458912
5531448,0.005516,458912
5531449,0.002678,458912


In [20]:
%%time
res_df['target'] = res_df['target'].astype('category')
lss = []
for item in res_df:
    if item in ['target', 'encoded_customerID']:
        continue
    else:
        lss.append(pps.score(res_df, item, 'target'))

CPU times: user 1.16 s, sys: 102 ms, total: 1.26 s
Wall time: 1.26 s


In [21]:
pps_score_result = pd.DataFrame(lss)

In [22]:
col_profile_df = pd.merge(col_profile_df, pps_score_result[['x', 'ppscore', 'baseline_score', 'model_score']], left_on='column_name', right_on='x',
                          how='left')

In [23]:
col_profile_df['ppscore'] = np.round(col_profile_df['ppscore'], 2)

In [24]:
col_profile_df.to_csv("../data/metadata/column_profiling_payment.csv", index=False)