-
Notifications
You must be signed in to change notification settings - Fork 0
/
process_cc_balance.R
131 lines (115 loc) · 5.95 KB
/
process_cc_balance.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
library(dplyr)
## Read data
cc_balance <- readRDS("dat/cc_balance.RDS")
#application_train <- readRDS("dat/application_train.RDS")
get_recent_cc_data = function(df) {
df <- filter(df, MONTHS_BALANCE == max(MONTHS_BALANCE,
na.rm = TRUE)) %>%
select(SK_ID_CURR, SK_ID_PREV, AMT_BALANCE, AMT_CREDIT_LIMIT_ACTUAL,
AMT_DRAWINGS_ATM_CURRENT, AMT_DRAWINGS_CURRENT,
AMT_DRAWINGS_OTHER_CURRENT, AMT_DRAWINGS_POS_CURRENT,
AMT_RECEIVABLE_PRINCIPAL, AMT_TOTAL_RECEIVABLE,
CNT_DRAWINGS_ATM_CURRENT, CNT_DRAWINGS_CURRENT,
CNT_DRAWINGS_OTHER_CURRENT, CNT_DRAWINGS_POS_CURRENT,
NAME_CONTRACT_STATUS, SK_DPD, SK_DPD_DEF) %>%
rename(SK_DPD_CC = SK_DPD, SK_DPD_DEF_CC = SK_DPD_DEF,
NAME_CONTRACT_STATUS_CC = NAME_CONTRACT_STATUS)
}
# FEATURE ENGINEERING
# Somehow figure out how well someone is paying off their previous loan
# Key:
# Single Hash: original features
## Double Hash: engineered feature, summary entry
### Triple Hash: engineered feature for each row
# SK_ID_PREV
# SK_ID_CURR
# MONTHS_BALANCE: most recent
# AMT_BALANCE: most recent
## debt_time_corr: calculated from MONTHS_BALANCE and AMT_BALANCE (unused)
# AMT_CREDIT_LIMIT_ACTUAL: most recent and max
# AMT_DRAWINGS_ATM_CURRENT: most recent
# AMT_DRAWINGS_CURRENT: most recent
# AMT_DRAWINGS_OTHER_CURRENT: most recent
# AMT_DRAWINGS_POS_CURRENT: most recent
# AMT_INST_MIN_REGULARITY
# AMT_PAYMENT_CURRENT: drop (not as useful as AMT_PAYMENT_TOTAL_CURRENT I think)
# AMT_PAYMENT_TOTAL_CURRENT
### amt_above_min_payment: AMT_PAYMENT_TOTAL_CURRENT - AMT_INST_MIN_REGULARITY
## amt_above_min_payment_mean: mean(amt_above_min_payment)
# AMT_RECEIVABLE_PRINCIPAL: most recent
# AMT_RECIVABLE: drop (not as useful as AMT_TOTAL_RECEIVABLE I think)
# AMT_TOTAL_RECEIVABLE: most recent
### interest: AMT_TOTAL_RECEIVABLE - AMT_RECEIVABLE_PRINCIPAL
## interest_total: sum(interest)
## interest_mean: mean(interest)
# CNT_DRAWINGS_ATM_CURRENT: most recent
# CNT_DRAWINGS_CURRENT: most recent
# CNT_DRAWINGS_OTHER_CURRENT: most recent
# CNT_DRAWINGS_POS_CURRENT: most recent
# CNT_INSTALMENT_MATURE_CUM: drop
# NAME_CONTRACT_STATUS: most recent
# SK_DPD: sum and mean and recent
# SK_DPD_DEF: sum and mean and recent
cc_balance <- select(cc_balance, -c(AMT_RECIVABLE, AMT_PAYMENT_CURRENT)) %>%
mutate(amt_above_min_payment =
AMT_PAYMENT_TOTAL_CURRENT - AMT_INST_MIN_REGULARITY,
interest = AMT_TOTAL_RECEIVABLE - AMT_RECEIVABLE_PRINCIPAL)
cc_balance_recent <- get_recent_cc_data(cc_balance)
cc_balance_summary <- cc_balance %>%
group_by(SK_ID_CURR, SK_ID_PREV) %>%
#filter(SK_ID_CURR == 126868) %>% # Testing one loan
summarize(AMT_CREDIT_LIMIT_ACTUAL_MAX = max(AMT_CREDIT_LIMIT_ACTUAL,
na.rm = TRUE),
amt_above_min_payment_mean = mean(amt_above_min_payment,
na.rm = TRUE),
interest_total = sum(interest, na.rm = TRUE),
interest_mean = mean(interest, na.rm = TRUE),
SK_DPD_CC_SUM = sum(SK_DPD, na.rm = TRUE),
SK_DPD_CC_MEAN = mean(SK_DPD, na.rm = TRUE),
SK_DPD_DEF_CC_SUM = sum(SK_DPD_DEF, na.rm = TRUE),
SK_DPD_DEF_CC_MEAN = mean(SK_DPD_DEF, na.rm = TRUE))
# This block takes forever to run...maybe not the best way to do it
# cc_balance_correlations <- cc_balance %>%
# group_by(SK_ID_CURR, SK_ID_PREV) %>%
# summarize(debt_time_corr = debt_time_corr(.))
# Join the datasets
cc_balance_summary <- left_join(cc_balance_recent, cc_balance_summary,
by = c("SK_ID_PREV", "SK_ID_CURR"))
# There are still some SK_ID_CURRs that have multiple SK_ID_PREVs. Collapse to 1:1
# Wasn't sure how to make NAME_CONTRACT_STATUS into a single value
cc_mult <- cc_balance_summary %>%
group_by(SK_ID_CURR) %>%
filter(n() > 1) %>%
summarize(AMT_CREDIT_LIMIT_ACTUAL_MAX = sum(AMT_CREDIT_LIMIT_ACTUAL_MAX),
amt_above_min_payment_mean = mean(amt_above_min_payment_mean),
interest_total = sum(interest_total),
interest_mean = mean(interest_mean),
SK_DPD_CC_SUM = sum(SK_DPD_CC_SUM),
SK_DPD_CC_MEAN = mean(SK_DPD_CC_MEAN),
SK_DPD_DEF_CC_SUM = sum(SK_DPD_DEF_CC_SUM),
SK_DPD_DEF_CC_MEAN = mean(SK_DPD_DEF_CC_MEAN),
AMT_BALANCE = sum(AMT_BALANCE),
AMT_CREDIT_LIMIT_ACTUAL = sum(AMT_CREDIT_LIMIT_ACTUAL),
AMT_DRAWINGS_ATM_CURRENT = sum(AMT_DRAWINGS_ATM_CURRENT),
AMT_DRAWINGS_CURRENT = sum(AMT_DRAWINGS_CURRENT),
AMT_DRAWINGS_OTHER_CURRENT = sum(AMT_DRAWINGS_OTHER_CURRENT),
AMT_DRAWINGS_POS_CURRENT = sum(AMT_DRAWINGS_POS_CURRENT),
AMT_RECEIVABLE_PRINCIPAL = sum(AMT_RECEIVABLE_PRINCIPAL),
AMT_TOTAL_RECEIVABLE = sum(AMT_TOTAL_RECEIVABLE),
CNT_DRAWINGS_ATM_CURRENT = sum(CNT_DRAWINGS_ATM_CURRENT),
CNT_DRAWINGS_CURRENT = sum(CNT_DRAWINGS_CURRENT),
CNT_DRAWINGS_OTHER_CURRENT = sum(CNT_DRAWINGS_OTHER_CURRENT),
CNT_DRAWINGS_POS_CURRENT = sum(CNT_DRAWINGS_POS_CURRENT),
SK_DPD_CC = sum(SK_DPD_CC),
SK_DPD_DEF_CC = sum(SK_DPD_DEF_CC),
NAME_CONTRACT_STATUS_CC = "Unknown")
# Drop SK_ID_PREV column, then filter out the SK_ID_CURRs with multiple entries;
# Join to consolidated table (cc_mult)
cc_balance_summary <- cc_balance_summary %>%
select(-SK_ID_PREV) %>%
filter(!(SK_ID_CURR %in% cc_mult$SK_ID_CURR)) %>%
union(cc_mult)
# Join to training set
df <- left_join(df, cc_balance_summary, by = "SK_ID_CURR")
df$NAME_CONTRACT_STATUS_CC <- as.factor(df$NAME_CONTRACT_STATUS_CC)
rm(cc_balance_summary, cc_mult, cc_balance_recent, cc_balance)