In [1]:
import numpy as np
import pandas as pd
import ppscore as pps
import matplotlib.pyplot as plt

In [3]:
%%time
df = pd.read_csv("../data/interim/seggregated/balance.csv")
labels = pd.read_csv("../data/raw/train_labels.csv")

CPU times: user 26.7 s, sys: 9.42 s, total: 36.2 s
Wall time: 36.8 s


In [4]:
%%time
# checking shape of the dataframe
df.shape, labels.shape

CPU times: user 30 µs, sys: 12 µs, total: 42 µs
Wall time: 112 µs


((5531451, 41), (458913, 2))

In [5]:
%%time
df.columns

CPU times: user 5 µs, sys: 2 µs, total: 7 µs
Wall time: 8.82 µs


Index(['customer_ID', 'B_1', 'B_2', 'B_3', 'B_4', 'B_5', 'B_6', 'B_7', 'B_8',
       'B_9', 'B_10', 'B_11', 'B_12', 'B_13', 'B_14', 'B_15', 'B_16', 'B_17',
       'B_18', 'B_19', 'B_20', 'B_21', 'B_22', 'B_23', 'B_24', 'B_25', 'B_26',
       'B_27', 'B_28', 'B_29', 'B_30', 'B_31', 'B_32', 'B_33', 'B_36', 'B_37',
       'B_38', 'B_39', 'B_40', 'B_41', 'B_42'],
      dtype='object')

In [6]:
%%time
labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458913 entries, 0 to 458912
Data columns (total 2 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   customer_ID  458913 non-null  object
 1   target       458913 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 7.0+ MB
CPU times: user 52.4 ms, sys: 0 ns, total: 52.4 ms
Wall time: 54.7 ms


In [7]:
%%time
# Let's encode customer_ID for efficient memory management.
labels['encoded_customerID'] = range(0, len(labels))

CPU times: user 2.48 ms, sys: 0 ns, total: 2.48 ms
Wall time: 1.65 ms


In [8]:
%%time
labels.head()

CPU times: user 102 µs, sys: 42 µs, total: 144 µs
Wall time: 165 µs


Unnamed: 0,customer_ID,target,encoded_customerID
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0,0
1,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,0,1
2,00001b22f846c82c51f6e3958ccd81970162bae8b007e8...,0,2
3,000041bdba6ecadd89a52d11886e8eaaec9325906c9723...,0,3
4,00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8a...,0,4


In [9]:
%%time
custid_mapper = labels[['customer_ID', 'encoded_customerID']].set_index('customer_ID').to_dict()
custid_mapper = custid_mapper['encoded_customerID']

CPU times: user 293 ms, sys: 2.49 ms, total: 296 ms
Wall time: 305 ms


In [10]:
%%time
df['encoded_customerID'] = df['customer_ID'].map(custid_mapper)
df = df.drop(['customer_ID'], axis=1)

CPU times: user 755 ms, sys: 191 ms, total: 946 ms
Wall time: 952 ms


In [11]:
%%time
res_df = pd.merge(df, labels[['encoded_customerID', 'target']], on='encoded_customerID', how='left')

CPU times: user 1.13 s, sys: 662 ms, total: 1.79 s
Wall time: 1.89 s


In [12]:
%%time
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5531451 entries, 0 to 5531450
Data columns (total 41 columns):
 #   Column              Dtype  
---  ------              -----  
 0   B_1                 float64
 1   B_2                 float64
 2   B_3                 float64
 3   B_4                 float64
 4   B_5                 float64
 5   B_6                 float64
 6   B_7                 float64
 7   B_8                 float64
 8   B_9                 float64
 9   B_10                float64
 10  B_11                float64
 11  B_12                float64
 12  B_13                float64
 13  B_14                float64
 14  B_15                float64
 15  B_16                float64
 16  B_17                float64
 17  B_18                float64
 18  B_19                float64
 19  B_20                float64
 20  B_21                float64
 21  B_22                float64
 22  B_23                float64
 23  B_24                float64
 24  B_25                floa

In [13]:
%%time
res_df['encoded_customerID'].duplicated().sum()

CPU times: user 56.6 ms, sys: 2.57 ms, total: 59.1 ms
Wall time: 57.4 ms


5072538

In [14]:
%%time
# Column Profiling: data type, null percentage, describe (min, max, [5,25,50,75,95], mean, std, variance)

def col_profiling(col_series):
    res_dict = {'datatype': col_series.dtype,
                'null_pct': col_series.isna().sum()*100/len(col_series),
                'min_val': col_series.min(),
                'max_val': col_series.max(),
                'mean_val': col_series.mean(),
                'std': col_series.std(),
                'variance': col_series.std(),
                '5_pct': col_series.quantile(q=0.05),
                '25_pct': col_series.quantile(q=0.25),
                '50_pct': col_series.quantile(q=0.5),
                '75_pct': col_series.quantile(q=0.75),
                '95_pct': col_series.quantile(q=0.95),
                'pct_uniq': col_series.nunique() / len(col_series),
                'num_unique': col_series.nunique()}
    return res_dict

CPU times: user 3 µs, sys: 1 µs, total: 4 µs
Wall time: 7.15 µs


In [15]:
%%time
# Column Profiling on delinquency variables
ls = []
for item in res_df:
    if res_df[item].dtype == np.float64 or res_df[item].dtype == np.int64:
        res = col_profiling(res_df[item])
        res['column_name'] = item
        ls.append(res)
    else:
        print(f"categorical column is {item}")

CPU times: user 1min 13s, sys: 38.7 ms, total: 1min 13s
Wall time: 1min 13s


In [16]:
%%time
col_profile_df = pd.DataFrame(ls).set_index('column_name')

CPU times: user 2.21 ms, sys: 310 µs, total: 2.52 ms
Wall time: 8.41 ms


In [17]:
%%time
col_profile_df = col_profile_df.reset_index()
col_profile_df

CPU times: user 176 µs, sys: 364 µs, total: 540 µs
Wall time: 461 µs


Unnamed: 0,column_name,datatype,null_pct,min_val,max_val,mean_val,std,variance,5_pct,25_pct,50_pct,75_pct,95_pct,pct_uniq,num_unique
0,B_1,float64,0.0,-7.588799,1.32406,0.12401,0.211987,0.211987,0.002132,0.008864,0.03133,0.125902,0.603555,1.0,5531451
1,B_2,float64,0.036446,9.19228e-09,1.01,0.621489,0.401488,0.401488,0.019755,0.105331,0.814333,1.002403,1.008479,0.9996355,5529435
2,B_3,float64,0.036446,6.285293e-09,1.625262,0.132539,0.234993,0.234993,0.001211,0.005228,0.009777,0.155051,0.707351,0.9996355,5529435
3,B_4,float64,0.0,3.099332e-09,19.803286,0.172554,0.222415,0.222415,0.003583,0.0275,0.082226,0.238882,0.622921,1.0,5531451
4,B_5,float64,0.0,2.804822e-11,144.207023,0.083112,0.397043,0.397043,0.001897,0.00728,0.015377,0.053718,0.344357,1.0,5531451
5,B_6,float64,0.004212,-0.005178168,1214.516926,0.152117,1.47677,1.47677,0.006297,0.020487,0.083385,0.191929,0.334243,0.9999579,5531218
6,B_7,float64,0.0,-2.652748,1.25275,0.186084,0.230419,0.230419,0.008727,0.028247,0.075746,0.270932,0.685931,1.0,5531451
7,B_8,float64,0.402571,1.153704e-08,1.019079,0.450581,0.496941,0.496941,0.000902,0.004507,0.009021,1.004175,1.00866,0.9959743,5509183
8,B_9,float64,0.0,6.72607e-09,27.424859,0.189524,0.286274,0.286274,0.001153,0.005753,0.025878,0.334297,0.65127,1.0,5531451
9,B_10,float64,0.0,-0.002958481,4097.440729,0.2316,4.799846,4.799846,0.0101,0.028991,0.110555,0.295539,0.308041,1.0,5531451


In [18]:
%%time
res_df['target'] = res_df['target'].astype('category')
lss = []
for item in res_df:
    if item in ['target', 'encoded_customerID']:
        continue
    else:
        lss.append(pps.score(res_df, item, 'target'))

CPU times: user 14 s, sys: 1.23 s, total: 15.3 s
Wall time: 15.3 s


In [19]:
pps_score_result = pd.DataFrame(lss)

In [20]:
col_profile_df = pd.merge(col_profile_df, pps_score_result[['x', 'ppscore', 'baseline_score', 'model_score']], left_on='column_name', right_on='x',
                          how='left')

In [21]:
col_profile_df['ppscore'] = np.round(col_profile_df['ppscore'], 2)

In [22]:
col_profile_df.to_csv("../data/metadata/column_profiling_balance.csv", index=False)