In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from lightgbm.sklearn import LGBMClassifier
from sklearn.metrics import mean_squared_error, mean_absolute_error, log_loss, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from scipy import sparse
from scipy.stats import kurtosis
import time
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
import datetime

In [2]:
# Load the training listings, asking pandas to parse the three date columns.
# NOTE(review): rows without a repayment hold the literal `\N` in repay_date / repay_amt
# (visible in the preview output), so repay_date may remain an object column - confirm its dtype.
train_df = pd.read_csv('dataset/train.csv', parse_dates=['auditing_date', 'due_date', 'repay_date'])


In [3]:
train_df.head()

Unnamed: 0,user_id,listing_id,auditing_date,due_date,due_amt,repay_date,repay_amt
0,748147,3163926,2018-04-25,2018-05-25,72.1167,2018-05-25,72.1167
1,672952,3698760,2018-06-09,2018-07-09,258.7045,2018-07-08,258.7045
2,404196,2355665,2018-02-18,2018-03-18,307.927,\N,\N
3,342769,1994522,2018-01-13,2018-02-13,252.9809,2018-02-13,252.9809
4,828139,3602352,2018-06-01,2018-07-01,107.6503,2018-06-25,107.6503


In [4]:
def plus_1_day(s):
    """Return ``s`` shifted forward by exactly one day."""
    one_day = datetime.timedelta(days=1)
    return s + one_day

In [5]:
# A default (no repayment) is marked with the literal '\N'; treat it as repaid one day after due_date.
# Parsing the column first keeps it a uniform datetime64 dtype instead of a str/Timestamp mix, and the
# fill is vectorised rather than a row-wise apply. Any value that cannot be parsed as a date is
# treated the same way as '\N'.
train_df['repay_date'] = pd.to_datetime(train_df['repay_date'], errors='coerce').fillna(
    train_df['due_date'] + pd.Timedelta(days=1)
)

In [6]:
train_df.head()

Unnamed: 0,user_id,listing_id,auditing_date,due_date,due_amt,repay_date,repay_amt
0,748147,3163926,2018-04-25,2018-05-25,72.1167,2018-05-25,72.1167
1,672952,3698760,2018-06-09,2018-07-09,258.7045,2018-07-08,258.7045
2,404196,2355665,2018-02-18,2018-03-18,307.927,2018-03-19,\N
3,342769,1994522,2018-01-13,2018-02-13,252.9809,2018-02-13,252.9809
4,828139,3602352,2018-06-01,2018-07-01,107.6503,2018-06-25,107.6503


In [7]:
# A default is marked with '\N' (or is already missing); its repaid amount is 0.
# pd.to_numeric turns every non-numeric marker into NaN in one vectorised pass before the fill.
train_df['repay_amt'] = pd.to_numeric(train_df['repay_amt'], errors='coerce').fillna(0).astype('float32')

In [8]:
train_df.head()

Unnamed: 0,user_id,listing_id,auditing_date,due_date,due_amt,repay_date,repay_amt
0,748147,3163926,2018-04-25,2018-05-25,72.1167,2018-05-25,72.116699
1,672952,3698760,2018-06-09,2018-07-09,258.7045,2018-07-08,258.704498
2,404196,2355665,2018-02-18,2018-03-18,307.927,2018-03-19,0.0
3,342769,1994522,2018-01-13,2018-02-13,252.9809,2018-02-13,252.980896
4,828139,3602352,2018-06-01,2018-07-01,107.6503,2018-06-25,107.650299


In [9]:
# 错误做法,用还款日-成立日
# train_df['label'] = (train_df['repay_date'] - train_df['auditing_date']).dt.days
# train_df['label'].value_counts(sort=False)

In [10]:
# Build the label: how many days before due_date the repayment happened (0 = repaid on the due date).
train_df['label'] = train_df['due_date'].sub(train_df['repay_date']).dt.days

In [11]:
train_df.head()

Unnamed: 0,user_id,listing_id,auditing_date,due_date,due_amt,repay_date,repay_amt,label
0,748147,3163926,2018-04-25,2018-05-25,72.1167,2018-05-25,72.116699,0
1,672952,3698760,2018-06-09,2018-07-09,258.7045,2018-07-08,258.704498,1
2,404196,2355665,2018-02-18,2018-03-18,307.927,2018-03-19,0.0,-1
3,342769,1994522,2018-01-13,2018-02-13,252.9809,2018-02-13,252.980896,0
4,828139,3602352,2018-06-01,2018-07-01,107.6503,2018-06-25,107.650299,6


In [12]:
train_df['label'].value_counts(sort=False)

 0     408187
 1     121085
 2      59430
 3      56404
 4      26425
 5      21380
 6      17568
 7      14797
 8      12993
 9      11393
 10      9984
 11      9002
 12      8219
 13      7688
 14      6920
 15      6443
 16      6231
 17      5832
 18      5492
 19      5108
 20      4788
 21      4504
 22      4295
 23      4197
 24      3922
 25      3934
 26      3930
 27      4102
 28      4677
 29      5645
 30      9865
 31      8368
-1     117192
Name: label, dtype: int64

In [13]:
train_df['label'].nunique()

33

In [14]:
# train_df.loc[train_df['repay_amt'] == -1, 'label'] = 32  (wrong approach, kept for reference)
# Defaults carry label -1 (repay_date = due_date + 1 day); move them to class 32 so labels run 0..32.
# The result is assigned back instead of calling replace(..., inplace=True) on the column selection,
# which may act on a temporary object and leave train_df unchanged under pandas copy-on-write.
train_df['label'] = train_df['label'].replace(-1, 32)

In [15]:
train_df['label'].value_counts(sort=False)

0     408187
1     121085
2      59430
3      56404
4      26425
5      21380
6      17568
7      14797
8      12993
9      11393
10      9984
11      9002
12      8219
13      7688
14      6920
15      6443
16      6231
17      5832
18      5492
19      5108
20      4788
21      4504
22      4295
23      4197
24      3922
25      3934
26      3930
27      4102
28      4677
29      5645
30      9865
31      8368
32    117192
Name: label, dtype: int64

In [16]:
# Classification target as a NumPy array: one class id (0..32) per training row.
clf_labels = train_df['label'].values

In [17]:
clf_labels.shape

(1000000,)

In [18]:
# Actually repaid amount per training row (0 for defaults); used only to score predictions.
amt_labels = train_df['repay_amt'].values

In [19]:
# Remove the target-related columns so that train_df holds only columns shared with the test set.
train_df.drop(columns=['label', 'repay_amt', 'repay_date'], inplace=True)


In [20]:
amt_labels

array([ 72.1167, 258.7045,   0.    , ..., 258.7045, 140.7993, 180.9695],
      dtype=float32)

In [21]:
train_df['due_amt']

0           72.1167
1          258.7045
2          307.9270
3          252.9809
4          107.6503
5          201.0499
6         3730.9948
7         1040.5302
8          133.9311
9          394.3774
10         145.1170
11         199.9221
12         292.3439
13         104.1686
14         259.7528
15         634.1695
16        1131.5995
17         195.7454
18        1027.7126
19         195.7454
20         320.5188
21         109.0835
22         131.6416
23         225.1639
24         133.9311
25         483.4584
26        1389.7108
27         486.1024
28         209.1453
29         190.6062
            ...    
999970     323.8476
999971     394.6893
999972     460.9070
999973     728.3711
999974      84.7085
999975    1062.6251
999976      97.3003
999977     102.1022
999978      90.4321
999979     510.0640
999980    2069.3485
999981     112.5742
999982     291.9417
999983     107.5574
999984     533.2005
999985     556.0866
999986     168.4249
999987     250.6915
999988     509.3960


In [22]:
# One-column frame with the due amount per training row; the CV loop slices it by fold index.
train_due_amt_df = train_df[['due_amt']]

In [23]:
# Number of training rows; later used to split the stacked train+test matrix again.
train_num = len(train_df)


In [24]:
# Load the test listings (no repay_* columns) with both date columns parsed as datetimes.
test_df = pd.read_csv('dataset/test.csv', parse_dates=['auditing_date', 'due_date'])

In [25]:
test_df.head()

Unnamed: 0,user_id,listing_id,auditing_date,due_date,due_amt
0,498765,5431438,2019-03-12,2019-04-12,138.5903
1,34524,5443211,2019-03-15,2019-04-15,208.0805
2,821741,5461707,2019-03-22,2019-04-22,421.2097
3,263534,5472320,2019-03-26,2019-04-26,212.6537
4,238853,5459750,2019-03-21,2019-04-21,817.4593


In [106]:
# Key columns of the test set that the submission needs. An explicit copy is taken because later
# cells add probability columns to `sub`; without it pandas flags those writes as assignments on a
# slice of test_df (SettingWithCopyWarning).
sub = test_df[['listing_id', 'auditing_date', 'due_amt', 'due_date']].copy()


In [107]:
sub.head()

Unnamed: 0,listing_id,auditing_date,due_amt,due_date
0,5431438,2019-03-12,138.5903,2019-04-12
1,5443211,2019-03-15,208.0805,2019-04-15
2,5461707,2019-03-22,421.2097,2019-04-22
3,5472320,2019-03-26,212.6537,2019-04-26
4,5459750,2019-03-21,817.4593,2019-04-21


In [28]:
# Stack train and test (train rows first, fresh 0..n-1 index) so features are built once for both.
df = pd.concat([train_df, test_df], axis=0, ignore_index=True)


In [29]:
test_df.shape

(130000, 5)

In [30]:
df.shape

(1130000, 5)

In [31]:
df.tail()

Unnamed: 0,user_id,listing_id,auditing_date,due_date,due_amt
1129995,580040,5287412,2019-02-01,2019-03-01,1227.8448
1129996,495129,5461576,2019-03-22,2019-04-22,174.9248
1129997,233442,5319333,2019-02-10,2019-03-10,168.3364
1129998,20165,5336095,2019-02-15,2019-03-15,350.2759
1129999,265473,5460170,2019-03-21,2019-04-21,293.8277


In [32]:
# Per-listing attributes (term, rate, principal), keyed by listing_id.
listing_info_df = pd.read_csv('dataset/listing_info.csv')


In [33]:
listing_info_df.head()

Unnamed: 0,user_id,listing_id,auditing_date,term,rate,principal
0,316610,1556649,2017-11-26,9,7.6,4800
1,62002,1556633,2017-11-26,6,7.6,4000
2,192135,1556629,2017-11-26,12,8.0,8660
3,487382,1556628,2017-11-26,9,7.6,4780
4,235186,1556627,2017-11-26,9,7.6,1480


In [34]:
# user_id and auditing_date already exist in df; dropping them here avoids duplicated columns in the merge.
listing_info_df.drop(columns=['user_id', 'auditing_date'], inplace=True)


In [35]:
listing_info_df.head()

Unnamed: 0,listing_id,term,rate,principal
0,1556649,9,7.6,4800
1,1556633,6,7.6,4000
2,1556629,12,8.0,8660
3,1556628,9,7.6,4780
4,1556627,9,7.6,1480


In [36]:
# Attach term / rate / principal to every listing; a left join keeps all rows of df.
df = df.merge(listing_info_df, on='listing_id', how='left')


In [37]:
df.head()

Unnamed: 0,user_id,listing_id,auditing_date,due_date,due_amt,term,rate,principal
0,748147,3163926,2018-04-25,2018-05-25,72.1167,9,7.2,630
1,672952,3698760,2018-06-09,2018-07-09,258.7045,9,7.2,2260
2,404196,2355665,2018-02-18,2018-03-18,307.927,9,7.2,2690
3,342769,1994522,2018-01-13,2018-02-13,252.9809,9,7.2,2210
4,828139,3602352,2018-06-01,2018-07-01,107.6503,6,8.6,630


In [38]:
# User profile snapshots; reg_mon and insertdate are parsed as datetimes.
user_info_df = pd.read_csv('dataset/user_info.csv', parse_dates=['reg_mon', 'insertdate'])


In [39]:
user_info_df.head()

Unnamed: 0,user_id,reg_mon,gender,age,cell_province,id_province,id_city,insertdate
0,483833,2017-04-01,男,19,c29,c26,c26241,2018-12-11
1,156772,2016-05-01,男,31,c11,c11,c11159,2018-02-13
2,173388,2016-05-01,男,34,c02,c02,c02182,2018-08-21
3,199107,2016-07-01,女,25,c09,c09,c09046,2018-06-05
4,122560,2016-03-01,男,23,c05,c05,c05193,2018-04-02


In [40]:
# Give the snapshot date a table-specific name; user_taglist.csv has an 'insertdate' column as well.
user_info_df = user_info_df.rename(columns={'insertdate': 'info_insert_date'})


In [41]:
user_info_df['user_id'].value_counts().head()

670044    3
567844    3
799738    3
66268     3
336069    3
Name: user_id, dtype: int64

In [42]:
# A user can have several snapshots (see the value_counts output above); keep only the most recent
# one per user by sorting newest-first and dropping later duplicates.
# NOTE(review): rows sharing the same info_insert_date are ordered by the default (unstable) sort - confirm
# that ties do not matter.
user_info_df = user_info_df.sort_values(by='info_insert_date', ascending=False).drop_duplicates('user_id').reset_index(drop=True)


In [43]:
user_info_df['user_id'].value_counts().head()

2047      1
798931    1
901357    1
903404    1
913643    1
Name: user_id, dtype: int64

In [44]:
user_info_df.head()

Unnamed: 0,user_id,reg_mon,gender,age,cell_province,id_province,id_city,info_insert_date
0,907196,2018-09-01,男,30,c04,c11,c11076,2019-03-30
1,504119,2017-05-01,男,30,c02,c02,c02139,2019-03-30
2,909870,2018-10-01,男,25,c02,c02,c02321,2019-03-30
3,542229,2017-06-01,男,35,c09,c09,c09205,2019-03-30
4,554821,2017-06-01,男,27,c04,c04,c04344,2019-03-30


In [45]:
# Attach the latest profile snapshot of each user; users without a profile get NaN in these columns.
df = df.merge(user_info_df, on='user_id', how='left')


In [46]:
df.head()

Unnamed: 0,user_id,listing_id,auditing_date,due_date,due_amt,term,rate,principal,reg_mon,gender,age,cell_province,id_province,id_city,info_insert_date
0,748147,3163926,2018-04-25,2018-05-25,72.1167,9,7.2,630,2017-12-01,男,21,c20,c20,c20089,2018-04-24
1,672952,3698760,2018-06-09,2018-07-09,258.7045,9,7.2,2260,2017-09-01,男,37,c14,c17,c17250,2018-06-04
2,404196,2355665,2018-02-18,2018-03-18,307.927,9,7.2,2690,2017-02-01,男,24,c04,c04,c04251,2018-02-17
3,342769,1994522,2018-01-13,2018-02-13,252.9809,9,7.2,2210,2016-12-01,男,23,c17,c17,c17246,2018-01-12
4,828139,3602352,2018-06-01,2018-07-01,107.6503,6,8.6,630,2018-04-01,男,23,c02,c17,c17096,2018-05-31


In [47]:
# Per-user tag lists ('|'-separated ids) with their snapshot date parsed as a datetime.
user_tag_df = pd.read_csv('dataset/user_taglist.csv', parse_dates=['insertdate'])


In [48]:
user_tag_df.head()

Unnamed: 0,user_id,taglist,insertdate
0,113401,4707|473|3498|4759|1654|298|2869|1164|212|1885...,2018-10-03
1,378358,751|2207|1100|2099|1832|1911|5347|2254|171|360...,2018-11-30
2,434838,877|3795|5628|70|2684|691|719|4228|631|1541|12...,2018-03-25
3,577061,2431|3242|340|1823|4020|4357|164|620|2168|1192...,2018-05-25
4,566753,3980|3125|1819|1333|1177|3972|621|5800|3632|16...,2018-12-02


In [49]:
# Rename the snapshot date so it stays distinguishable after the tag table is merged into df.
user_tag_df = user_tag_df.rename(columns={'insertdate': 'tag_insert_date'})


In [50]:
# Keep one tag list per user: sort newest-first, then drop the later (older) duplicates.
user_tag_df = user_tag_df.sort_values(by='tag_insert_date', ascending=False).drop_duplicates('user_id').reset_index(drop=True)


In [51]:
# Attach taglist and tag_insert_date; users without tags get NaN.
df = df.merge(user_tag_df, on='user_id', how='left')


In [52]:
# Historical repayment log, one row per (listing, instalment). In the preview output an unpaid
# instalment shows the placeholder repay_date 2200-01-01.
repay_log_df = pd.read_csv('dataset/user_repay_logs.csv', parse_dates=['due_date', 'repay_date'])


In [53]:
repay_log_df.head()

Unnamed: 0,user_id,listing_id,order_id,due_date,due_amt,repay_date,repay_amt
0,748483,1858122,6,2018-06-29,528.6365,2018-06-20,528.6365
1,748483,1858122,4,2018-04-29,528.6365,2200-01-01,528.6365
2,748483,1858122,7,2018-07-29,528.6365,2018-06-20,528.6365
3,748483,1858122,5,2018-05-29,528.6365,2018-05-29,528.6365
4,748483,1858122,1,2018-01-29,528.6365,2018-01-28,528.6365


In [54]:
# Keep only rows with order_id == 1 (presumably the first instalment of each listing - TODO confirm).
repay_log_df = repay_log_df[repay_log_df['order_id'] == 1].reset_index(drop=True)


In [55]:
# Flag each logged instalment: 1 if it was repaid, 0 if repay_date is the '2200-01-01' placeholder.
repay_log_df['repay'] = repay_log_df['repay_date'].astype('str').ne('2200-01-01').astype('int64')


In [56]:
repay_log_df.head()

Unnamed: 0,user_id,listing_id,order_id,due_date,due_amt,repay_date,repay_amt,repay
0,748483,1858122,1,2018-01-29,528.6365,2018-01-28,528.6365,1
1,369368,3900565,1,2018-08-03,385.5078,2018-07-22,385.5078,1
2,749102,1699160,1,2018-01-11,338.5357,2017-12-11,338.5357,1
3,385257,2204015,1,2018-03-03,1106.1967,2018-03-02,1106.1967,1
4,648677,3811960,1,2018-07-23,385.5078,2018-07-04,385.5078,1


In [57]:
# Days between the actual repayment and the due date (positive = repaid early).
repay_log_df['early_repay_days'] = repay_log_df['due_date'].sub(repay_log_df['repay_date']).dt.days


In [58]:
repay_log_df.head()

Unnamed: 0,user_id,listing_id,order_id,due_date,due_amt,repay_date,repay_amt,repay,early_repay_days
0,748483,1858122,1,2018-01-29,528.6365,2018-01-28,528.6365,1,1
1,369368,3900565,1,2018-08-03,385.5078,2018-07-22,385.5078,1,12
2,749102,1699160,1,2018-01-11,338.5357,2017-12-11,338.5357,1,31
3,385257,2204015,1,2018-03-03,1106.1967,2018-03-02,1106.1967,1,1
4,648677,3811960,1,2018-07-23,385.5078,2018-07-04,385.5078,1,19


In [59]:
# Collapse every negative value (late or never repaid) into the single sentinel -1.
repay_log_df['early_repay_days'] = repay_log_df['early_repay_days'].where(
    repay_log_df['early_repay_days'] >= 0, -1
)


In [60]:
# Only user_id, due_amt, repay and early_repay_days are needed for the per-user aggregates.
repay_log_df.drop(columns=['listing_id', 'order_id', 'due_date', 'repay_date', 'repay_amt'], inplace=True)

In [61]:
# Group the logged instalments by user; the aggregation cells below reuse this GroupBy object.
group = repay_log_df.groupby('user_id', as_index=False)


In [62]:
# Per-user share of logged instalments that were repaid.
# Named aggregation replaces the dict-renaming form of SeriesGroupBy.agg, which pandas >= 1.0
# rejects with "nested renamer is not supported".
repay_log_df = repay_log_df.merge(
    group['repay'].agg(repay_mean='mean'), on='user_id', how='left'
)

In [63]:
repay_log_df.head()

Unnamed: 0,user_id,due_amt,repay,early_repay_days,repay_mean
0,748483,528.6365,1,1,1.0
1,369368,385.5078,1,12,1.0
2,749102,338.5357,1,31,1.0
3,385257,1106.1967,1,1,1.0
4,648677,385.5078,1,19,1.0


In [64]:
# Per-user statistics of early_repay_days.
# Named aggregation replaces the dict-renaming form of SeriesGroupBy.agg, which pandas >= 1.0
# rejects with "nested renamer is not supported"; output column names and order are unchanged.
repay_log_df = repay_log_df.merge(
    group['early_repay_days'].agg(
        early_repay_days_max='max', early_repay_days_median='median', early_repay_days_sum='sum',
        early_repay_days_mean='mean', early_repay_days_std='std'
    ), on='user_id', how='left'
)

In [65]:
repay_log_df.head()

Unnamed: 0,user_id,due_amt,repay,early_repay_days,repay_mean,early_repay_days_max,early_repay_days_median,early_repay_days_sum,early_repay_days_mean,early_repay_days_std
0,748483,528.6365,1,1,1.0,14,0.5,15,3.75,6.849574
1,369368,385.5078,1,12,1.0,18,12.0,34,11.333333,7.023769
2,749102,338.5357,1,31,1.0,31,31.0,31,31.0,
3,385257,1106.1967,1,1,1.0,12,2.0,19,3.8,4.658326
4,648677,385.5078,1,19,1.0,31,18.0,501,17.892857,10.640034


In [66]:
# Per-user statistics of the historical due amounts (kurtosis via scipy, range via np.ptp).
# Named aggregation replaces the dict-renaming form of SeriesGroupBy.agg, which pandas >= 1.0
# rejects with "nested renamer is not supported"; output column names and order are unchanged.
repay_log_df = repay_log_df.merge(
    group['due_amt'].agg(
        due_amt_max='max', due_amt_min='min', due_amt_median='median',
        due_amt_mean='mean', due_amt_sum='sum', due_amt_std='std',
        due_amt_skew='skew', due_amt_kurt=kurtosis, due_amt_ptp=np.ptp
    ), on='user_id', how='left'
)

In [67]:
# Drop the row-level columns; only user_id and the per-user aggregates remain.
repay_log_df.drop(columns=['repay', 'early_repay_days', 'due_amt'], inplace=True)

In [68]:
repay_log_df.shape

(2768985, 17)

In [69]:
# The aggregates were merged back on user_id, so all rows of a user are identical now: keep one each.
repay_log_df = repay_log_df.drop_duplicates('user_id').reset_index(drop=True)


In [70]:
repay_log_df.shape

(796610, 17)

In [71]:
# Attach the repayment-history aggregates; users without any logged instalment get NaN.
df = df.merge(repay_log_df, on='user_id', how='left')


In [72]:
# Columns handled as categorical: integer-encoded first, one-hot encoded later.
cate_cols = ['gender', 'cell_province', 'id_province', 'id_city']


In [73]:
# Integer-encode each categorical column in order of first appearance.
# (LightGBM should also accept these directly, e.g. lgb.train(categorical_feature=cate_cols).)
# pd.factorize yields the same codes as the previous unique()/nunique() mapping when nothing is
# missing, and still works when NaN is present (coded as -1). The old mapping came up one entry
# short in that case, because unique() counts NaN while nunique() does not, and the int32 cast
# then failed on the resulting NaN.
for f in cate_cols:
    df[f] = pd.factorize(df[f])[0].astype('int32')

In [74]:
# Due amount per day of the repayment window (auditing_date -> due_date).
# The window length must come from df itself: train_df covers only the training rows, so dividing by
# a train_df-based Series left every test row NaN after index alignment.
df['due_amt_per_days'] = df['due_amt'] / (df['due_date'] - df['auditing_date']).dt.days


In [75]:
# Datetime columns that are expanded into calendar parts and then dropped.
date_cols = ['auditing_date', 'due_date', 'reg_mon', 'info_insert_date', 'tag_insert_date']


In [76]:
# Expand every date column into calendar-part features. The order in which the columns are created
# fixes the feature order of the final matrix, so the statements are kept in their original order.
for f in date_cols:
    # Year is extracted only for the user-level dates.
    if f in ['reg_mon', 'info_insert_date', 'tag_insert_date']:
        df[f + '_year'] = df[f].dt.year
    # Month is extracted for every date column.
    df[f + '_month'] = df[f].dt.month
    # Day-of-month and weekday are skipped for reg_mon (its previewed values are all the 1st of a month).
    if f in ['auditing_date', 'due_date', 'info_insert_date', 'tag_insert_date']:
        df[f + '_day'] = df[f].dt.day
        df[f + '_dayofweek'] = df[f].dt.dayofweek

In [77]:
# The raw datetime columns are no longer needed once their calendar parts exist.
df = df.drop(columns=date_cols)


In [79]:
# Turn the '|'-separated tag ids into space-separated tokens (a missing taglist becomes the string 'nan').
df['taglist'] = df['taglist'].astype('str').str.strip().str.replace('|', ' ', regex=False).str.strip()


In [81]:
# Bag-of-tags sparse matrix: keep tags that occur in at least 10 rows and in at most 90% of rows.
# NOTE(review): CountVectorizer's default token_pattern only keeps tokens of two or more characters,
# so single-character tag ids would be dropped, and rows without tags contribute the token 'nan' -
# confirm both are intended.
tag_cv = CountVectorizer(min_df=10, max_df=0.9).fit_transform(df['taglist'])


In [82]:
tag_cv

<1130000x5977 sparse matrix of type '<class 'numpy.int64'>'
	with 55377908 stored elements in Compressed Sparse Row format>

In [83]:
# Identifiers and the raw tag string are not model features (the tags live on in tag_cv).
df.drop(columns=['user_id', 'listing_id', 'taglist'], inplace=True)


In [84]:
# One-hot encode the categorical columns (alternatively LightGBM could take them via
# lgb.train(categorical_feature=cate_cols)).
# dtype='uint8' is pinned explicitly: pandas >= 2.0 defaults to bool dummies, which turns df.values
# into an object array when mixed with the numeric columns and breaks the sparse.hstack that follows.
df = pd.get_dummies(df, columns=cate_cols, dtype='uint8')


In [85]:
# Join the dense features and the tag counts into a single float32 CSR matrix.
# From here on `df` is a scipy sparse matrix, no longer a DataFrame.
df = sparse.hstack((df.values, tag_cv), format='csr', dtype='float32')


In [86]:
# Undo the earlier train/test stacking: the first train_num rows are the training rows.
train_values, test_values = df[:train_num], df[train_num:]


In [None]:
# 5-fold stratified cross-validation of a 33-class LightGBM classifier (classes 0..32).
# Collected per fold: out-of-fold class probabilities, an out-of-fold repayment-amount estimate,
# and the fold-averaged class probabilities for the test set.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=2019)
clf = LGBMClassifier(
    learning_rate=0.05,
    n_estimators=10000,  # upper bound only; early stopping picks the actual number of trees
    subsample=0.8,
    subsample_freq=1,
    colsample_bytree=0.8,
    random_state=2019
)
amt_oof = np.zeros(train_num)
prob_oof = np.zeros((train_num, 33))
test_pred_prob = np.zeros((test_values.shape[0], 33))
for i, (trn_idx, val_idx) in enumerate(skf.split(train_values, clf_labels)):
    print(i, 'fold...')
    t = time.time()

    trn_x, trn_y = train_values[trn_idx], clf_labels[trn_idx]
    val_x, val_y = train_values[val_idx], clf_labels[val_idx]
    val_repay_amt = amt_labels[val_idx]
    val_due_amt = train_due_amt_df.iloc[val_idx]

    # NOTE(review): early_stopping_rounds / verbose as fit() arguments belong to the older LightGBM
    # sklearn API; newer releases expect callbacks instead - confirm against the installed version.
    clf.fit(
        trn_x, trn_y,
        eval_set=[(trn_x, trn_y), (val_x, val_y)],
        early_stopping_rounds=100, verbose=5
    )
    # shape = (n_val, 33)
    val_pred_prob_everyday = clf.predict_proba(val_x, num_iteration=clf.best_iteration_)
    prob_oof[val_idx] = val_pred_prob_everyday
    # Probability assigned to the true class of every validation row, gathered with one vectorised
    # index operation instead of a Python-level loop over all rows.
    val_pred_prob_today = val_pred_prob_everyday[np.arange(len(val_y)), val_y]
    # Estimated repaid amount = due amount * probability of the true class.
    # NOTE(review): for defaults (class 32) the true amount is 0, so a confident, correct prediction
    # of class 32 increases this error - confirm that the metric is meant to behave this way.
    val_pred_repay_amt = val_due_amt['due_amt'].values * val_pred_prob_today
    print('val rmse:', np.sqrt(mean_squared_error(val_repay_amt, val_pred_repay_amt)))
    print('val mae:', mean_absolute_error(val_repay_amt, val_pred_repay_amt))
    amt_oof[val_idx] = val_pred_repay_amt
    # Average the test predictions over the folds.
    test_pred_prob += clf.predict_proba(test_values, num_iteration=clf.best_iteration_) / skf.n_splits

    print('runtime: {}\n'.format(time.time() - t))

print('\ncv rmse:', np.sqrt(mean_squared_error(amt_labels, amt_oof)))
print('cv mae:', mean_absolute_error(amt_labels, amt_oof))
print('cv logloss:', log_loss(clf_labels, prob_oof))
print('cv acc:', accuracy_score(clf_labels, np.argmax(prob_oof, axis=1)))

In [None]:
# # 原本的输出
# prob_cols = ['prob_{}'.format(i) for i in range(33)]
# for i, f in enumerate(prob_cols):
#     sub[f] = test_pred_prob[:, i]
# sub_example = pd.read_csv('dataset/submission.csv', parse_dates=['repay_date'])
# sub_example = sub_example.merge(sub, on='listing_id', how='left')
# sub_example['days'] = (sub_example['repay_date'] - sub_example['auditing_date']).dt.days
# # shape = (-1, 33)
# test_prob = sub_example[prob_cols].values
# test_labels = sub_example['days'].values
# test_prob = [test_prob[i][test_labels[i]] for i in range(test_prob.shape[0])]
# sub_example['repay_amt'] = sub_example['due_amt'] * test_prob
# sub_example[['listing_id', 'repay_date', 'repay_amt']].to_csv('sub.csv', index=False)



In [151]:
import pickle
# Persist the fold-averaged test probabilities so the submission cells can be redone without retraining.
with open('test_pred_prob.pkl', 'wb') as f:
    pickle.dump(test_pred_prob, f)

In [139]:
# Column names prob_0 .. prob_32, one per class of the classifier.
prob_cols = ['prob_{}'.format(i) for i in range(33)]


In [None]:
# Attach the class probabilities to the test listings, one column per class, in prob_cols order.
sub = sub.assign(**dict(zip(prob_cols, test_pred_prob.T)))

In [89]:
# Submission template: in the preview output it has one row per (listing_id, candidate repay_date).
sub_example = pd.read_csv('dataset/submission.csv', parse_dates=['repay_date'])


In [91]:
# Bring the columns of `sub` (listing dates, due amount, class probabilities) onto every template row.
sub_example = sub_example.merge(sub, on='listing_id', how='left')


In [121]:
def add_1_month(s):
    """Return midnight of the date one calendar month after ``s``.

    The day of month is kept when the target month has it and otherwise clamped to that month's
    last day (e.g. Jan 31 -> Feb 28, or Feb 29 in a leap year; Mar 31 -> Apr 30).

    Parameters
    ----------
    s : datetime.date, datetime.datetime or pandas.Timestamp
        Any object exposing ``year``, ``month`` and ``day``.

    Returns
    -------
    datetime.datetime
        The shifted date at 00:00:00; any time-of-day component of ``s`` is discarded.
    """
    import calendar  # local import: not part of the notebook's import cell

    year, month = s.year, s.month + 1
    if month == 13:  # December rolls over into January of the next year
        year, month = year + 1, 1
    # monthrange knows the month length for every year, replacing the hard-coded 2012/2016
    # leap-year list and the platform-dependent '%F' strftime round trip.
    last_day = calendar.monthrange(year, month)[1]
    return datetime.datetime(year, month, min(s.day, last_day))

In [123]:
from tqdm import tqdm, tqdm_notebook
# Register tqdm's progress_apply on pandas objects.
# NOTE(review): this instantiates a tqdm_notebook bar just to call .pandas(), which is presumably
# the source of the empty progress widget in the cell output - confirm.
tqdm_notebook().pandas()

HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))




In [124]:
# due_date = auditing_date + 1 calendar month, with the day clamped to the end of shorter months.
# A vectorised DateOffset replaces the row-by-row Python apply over the whole template (millions of rows).
sub_example['due_date'] = sub_example['auditing_date'] + pd.DateOffset(months=1)

HBox(children=(IntProgress(value=0, max=3987078), HTML(value='')))




In [125]:
sub_example.head()

Unnamed: 0,listing_id,repay_amt,repay_date,auditing_date,due_amt,days,due_date
0,5431438,4.3309,2019-03-12,2019-03-12,138.5903,0,2019-04-12
1,5431438,4.3309,2019-03-13,2019-03-12,138.5903,1,2019-04-12
2,5431438,4.3309,2019-03-14,2019-03-12,138.5903,2,2019-04-12
3,5431438,4.3309,2019-03-15,2019-03-12,138.5903,3,2019-04-12
4,5431438,4.3309,2019-03-16,2019-03-12,138.5903,4,2019-04-12


In [137]:
# Same definition as the training label: days between the candidate repay_date and due_date.
sub_example['days'] = sub_example['due_date'].sub(sub_example['repay_date']).dt.days


In [138]:
sub_example.head()

Unnamed: 0,listing_id,repay_amt,repay_date,auditing_date,due_amt,days,due_date
0,5431438,4.3309,2019-03-12,2019-03-12,138.5903,31,2019-04-12
1,5431438,4.3309,2019-03-13,2019-03-12,138.5903,30,2019-04-12
2,5431438,4.3309,2019-03-14,2019-03-12,138.5903,29,2019-04-12
3,5431438,4.3309,2019-03-15,2019-03-12,138.5903,28,2019-04-12
4,5431438,4.3309,2019-03-16,2019-03-12,138.5903,27,2019-04-12


In [None]:
# Class-probability matrix aligned with the template rows, shape (n_rows, 33).
test_prob = sub_example[prob_cols].values


In [119]:
# Class index of each template row (its 'days' value), used to pick the matching probability.
test_labels = sub_example['days'].values


datetime.datetime(2018, 4, 2, 0, 0)

In [None]:
# For every template row pick the predicted probability of that row's own class (its 'days' value).
# One vectorised gather replaces a Python loop over millions of rows. Reading the probabilities
# straight from sub_example also makes the cell safe to re-run, since it no longer consumes the
# 2-D test_prob that it then overwrites.
test_prob = sub_example[prob_cols].values[np.arange(len(test_labels)), test_labels]


In [None]:
# Expected repayment on each candidate date = due amount * probability of repaying on that date.
sub_example['repay_amt'] = sub_example['due_amt'] * test_prob


'2019-06-24 17-33-18'

In [None]:
# Write the submission under a timestamped file name. The f-string is delimited with double quotes
# because reusing single quotes for the nested strftime format is a SyntaxError before Python 3.12.
sub_example[['listing_id', 'repay_date', 'repay_amt']].to_csv(f"sub_{datetime.datetime.now().strftime('%Y-%m-%d %H-%M-%S')}.csv", index=False)


# 下面这行的结果告诉我一下

In [None]:
pd.Series(test_prob).describe()

# 下面这段会额外生成四个提交文件,全部提交看看

In [None]:
# Write one extra submission per threshold: probabilities at or below the threshold count as
# "no repayment on that day" (amount 0). The thresholds can be tuned from the describe() output above.
for threshold in [0.1, 0.15, 0.2, 0.25]:
    # The cut-off is applied per row with np.where; a plain `x if cond else y` asks for the truth
    # value of the whole probability vector and raises.
    sub_example['repay_amt'] = np.where(
        np.asarray(test_prob) > threshold, sub_example['due_amt'] * test_prob, 0
    )
    # Double-quoted f-string: nesting single quotes inside it is a SyntaxError before Python 3.12.
    sub_example[['listing_id', 'repay_date', 'repay_amt']].to_csv(f"sub_{datetime.datetime.now().strftime('%Y-%m-%d %H-%M-%S')}_{threshold}.csv", index=False)
