In [1]:
import numpy as np
import pandas as pd
import ppscore as pps
import matplotlib.pyplot as plt

In [2]:
%%time
df = pd.read_csv("../data/interim/seggregated/risk.csv")
labels = pd.read_csv("../data/raw/train_labels.csv")

CPU times: user 23.6 s, sys: 2.49 s, total: 26.1 s
Wall time: 26.3 s


In [3]:
%%time
# checking shape of the dataframe
df.shape, labels.shape

CPU times: user 24 µs, sys: 0 ns, total: 24 µs
Wall time: 26 µs


((5531451, 29), (458913, 2))

In [4]:
%%time
df.columns

CPU times: user 8 µs, sys: 0 ns, total: 8 µs
Wall time: 9.78 µs


Index(['customer_ID', 'R_1', 'R_2', 'R_3', 'R_4', 'R_5', 'R_6', 'R_7', 'R_8',
       'R_9', 'R_10', 'R_11', 'R_12', 'R_13', 'R_14', 'R_15', 'R_16', 'R_17',
       'R_18', 'R_19', 'R_20', 'R_21', 'R_22', 'R_23', 'R_24', 'R_25', 'R_26',
       'R_27', 'R_28'],
      dtype='object')

In [6]:
%%time
labels.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458913 entries, 0 to 458912
Data columns (total 2 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   customer_ID  458913 non-null  object
 1   target       458913 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 7.0+ MB
CPU times: user 55.4 ms, sys: 0 ns, total: 55.4 ms
Wall time: 53.7 ms


In [7]:
%%time
# Let's encode customer_ID for efficient memory management.
labels['encoded_customerID'] = range(0, len(labels))

CPU times: user 1.31 ms, sys: 0 ns, total: 1.31 ms
Wall time: 907 µs


In [8]:
%%time
labels.head()

CPU times: user 117 µs, sys: 25 µs, total: 142 µs
Wall time: 145 µs


Unnamed: 0,customer_ID,target,encoded_customerID
0,0000099d6bd597052cdcda90ffabf56573fe9d7c79be5f...,0,0
1,00000fd6641609c6ece5454664794f0340ad84dddce9a2...,0,1
2,00001b22f846c82c51f6e3958ccd81970162bae8b007e8...,0,2
3,000041bdba6ecadd89a52d11886e8eaaec9325906c9723...,0,3
4,00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8a...,0,4


In [9]:
%%time
custid_mapper = labels[['customer_ID', 'encoded_customerID']].set_index('customer_ID').to_dict()
custid_mapper = custid_mapper['encoded_customerID']

CPU times: user 280 ms, sys: 7.02 ms, total: 287 ms
Wall time: 284 ms


In [10]:
%%time
df['encoded_customerID'] = df['customer_ID'].map(custid_mapper)
df = df.drop(['customer_ID'], axis=1)

CPU times: user 729 ms, sys: 139 ms, total: 868 ms
Wall time: 865 ms


In [11]:
%%time
res_df = pd.merge(df, labels[['encoded_customerID', 'target']], on='encoded_customerID', how='left')

CPU times: user 756 ms, sys: 747 ms, total: 1.5 s
Wall time: 1.52 s


In [12]:
%%time
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5531451 entries, 0 to 5531450
Data columns (total 29 columns):
 #   Column              Dtype  
---  ------              -----  
 0   R_1                 float64
 1   R_2                 float64
 2   R_3                 float64
 3   R_4                 float64
 4   R_5                 float64
 5   R_6                 float64
 6   R_7                 float64
 7   R_8                 float64
 8   R_9                 float64
 9   R_10                float64
 10  R_11                float64
 11  R_12                float64
 12  R_13                float64
 13  R_14                float64
 14  R_15                float64
 15  R_16                float64
 16  R_17                float64
 17  R_18                float64
 18  R_19                float64
 19  R_20                float64
 20  R_21                float64
 21  R_22                float64
 22  R_23                float64
 23  R_24                float64
 24  R_25                floa

In [13]:
%%time
# Column Profiling: data type, null percentage, describe (min, max, [5,25,50,75,95], mean, std, variance)

def col_profiling(col_series):
    res_dict = {'datatype': col_series.dtype,
                'null_pct': col_series.isna().sum()*100/len(col_series),
                'min_val': col_series.min(),
                'max_val': col_series.max(),
                'mean_val': col_series.mean(),
                'std': col_series.std(),
                'variance': col_series.std(),
                '5_pct': col_series.quantile(q=0.05),
                '25_pct': col_series.quantile(q=0.25),
                '50_pct': col_series.quantile(q=0.5),
                '75_pct': col_series.quantile(q=0.75),
                '95_pct': col_series.quantile(q=0.95),
                'pct_uniq': col_series.nunique() / len(col_series),
                'num_unique': col_series.nunique()}
    return res_dict

CPU times: user 3 µs, sys: 0 ns, total: 3 µs
Wall time: 5.72 µs


In [14]:
%%time
# Column Profiling on delinquency variables
ls = []
for item in res_df:
    if res_df[item].dtype == np.float64 or res_df[item].dtype == np.int64:
        res = col_profiling(res_df[item])
        res['column_name'] = item
        ls.append(res)
    else:
        print(f"categorical column is {item}")

CPU times: user 56.4 s, sys: 15.9 ms, total: 56.5 s
Wall time: 56.5 s


In [15]:
%%time
col_profile_df = pd.DataFrame(ls).set_index('column_name')

CPU times: user 2.24 ms, sys: 0 ns, total: 2.24 ms
Wall time: 2.16 ms


In [16]:
%%time
col_profile_df = col_profile_df.reset_index()
col_profile_df

CPU times: user 673 µs, sys: 70 µs, total: 743 µs
Wall time: 724 µs


Unnamed: 0,column_name,datatype,null_pct,min_val,max_val,mean_val,std,variance,5_pct,25_pct,50_pct,75_pct,95_pct,pct_uniq,num_unique
0,R_1,float64,0.0,1.534223e-09,3.256284,0.078803,0.226397,0.226397,0.00058,0.002896,0.005782,0.008661,0.507362,0.9999998,5531450
1,R_2,float64,0.0,8.265748e-10,1.01,0.047518,0.201784,0.201784,0.000523,0.002613,0.005223,0.007837,0.009922,1.0,5531451
2,R_3,float64,0.0,2.238322e-09,11.602601,0.120829,0.210516,0.210516,0.00094,0.0047,0.009401,0.200336,0.503688,1.0,5531451
3,R_4,float64,0.0,4.024685e-10,1.01,0.031204,0.15977,0.15977,0.000515,0.002566,0.005133,0.0077,0.009757,1.0,5531451
4,R_5,float64,0.0,1.154589e-09,35.004515,0.034594,0.256807,0.256807,0.000514,0.002575,0.005149,0.007722,0.009778,1.0,5531451
5,R_6,float64,0.0,4.650681e-10,14.448201,0.058531,0.641566,0.641566,0.000518,0.002592,0.005187,0.007782,0.009852,1.0,5531451
6,R_7,float64,1.8e-05,8.67629e-10,449.988885,0.088064,1.550873,1.550873,0.000514,0.002569,0.005136,0.007703,0.009757,0.9999996,5531449
7,R_8,float64,0.0,1.738091e-09,38.008079,0.03812,0.303642,0.303642,0.00051,0.002554,0.005106,0.007658,0.009702,1.0,5531451
8,R_9,float64,94.349891,3.121485e-07,1.509999,0.22989,0.189174,0.189174,0.006244,0.169122,0.172749,0.17634,0.509133,0.05650109,312533
9,R_10,float64,0.0,2.23325e-09,21.003672,0.064775,0.305694,0.305694,0.000524,0.002625,0.005247,0.00787,0.009968,1.0,5531451


In [17]:
%%time
all_uniq = col_profile_df[col_profile_df['pct_uniq']==1].index.tolist()

CPU times: user 729 µs, sys: 77 µs, total: 806 µs
Wall time: 777 µs


In [18]:
%%time
all_uniq_df = df[all_uniq+['encoded_customerID']]

KeyError: '[1, 2, 3, 4, 5, 7, 9, 10, 12, 14, 15, 16, 17, 18, 20, 21, 22, 23, 24, 27] not in index'

In [19]:
%%time
all_uniq_df.head()

NameError: name 'all_uniq_df' is not defined

In [20]:
%%time
res_df['target'] = res_df['target'].astype('category')
lss = []
for item in res_df:
    if item in ['target', 'encoded_customerID']:
        continue
    else:
        lss.append(pps.score(res_df, item, 'target'))

CPU times: user 9.8 s, sys: 0 ns, total: 9.8 s
Wall time: 9.79 s


In [21]:
pps_score_result = pd.DataFrame(lss)

In [22]:
col_profile_df = pd.merge(col_profile_df, pps_score_result[['x', 'ppscore', 'baseline_score', 'model_score']], left_on='column_name', right_on='x',
                          how='left')

In [23]:
col_profile_df['ppscore'] = np.round(col_profile_df['ppscore'], 2)

In [24]:
col_profile_df.to_csv("../data/metadata/column_profiling_risk.csv", index=False)