In [1]:
from datetime import datetime
import numpy as np
import pandas as pd
from tqdm import tqdm
import json
from sklearn.preprocessing import LabelEncoder

In [2]:
data_dir = '../../data/processed/'

## data loading for sequential embedding

In [3]:
%%time
sequential_data = []

line_number = 0
max_lines = 5000

with open('../../data/raw/data_train.json', 'r') as f:
    for line in f:
        line_number += 1
        if line_number > max_lines:
            break
        sequential_data.append(json.loads(line))

Wall time: 397 ms


- the unique key for each sample is the combination of userid and application time
- applicants might apply more than one time (same userid, differenty application time)

In [4]:
%%time
sequential_driver = {} 

sequential_behavior = {}

for item in sequential_data:   
    user_id = item[0]
    application_time = int(item[1]['order_info']['order_time'])
    sequential_driver.update({f"{user_id}|{application_time}" : item[1]['order_info']})
    sub_data = [x for x in item[1]['data'] if x['petime']<=application_time-100]
    # we only keep data occurs before application time. "-100" is not neccessary for offline data cleaning.
    ## but sometimes we use this trick for online calculation to avoid network slowdown 
    sequential_behavior.update({f"{user_id}|{application_time}":sub_data})

Wall time: 41.9 ms


### driver understanding

In [5]:
driver = pd.DataFrame(sequential_driver).T.reset_index()
driver['user_id'] = driver['index'].apply(lambda x : x.split('|')[0])
driver['application_time'] = driver['index'].apply(lambda x : x.split('|')[1])

In [6]:
driver['new_client'].value_counts()

# we normally separate new client (0) and old client (1) because they have different data pattern and data density

0.0    3027
1.0    1973
Name: new_client, dtype: int64

In [7]:
# converting unix time to real time
driver['application_date'] = pd.to_datetime(driver['order_time'],unit='ms')

In [8]:
driver.head()

Unnamed: 0,index,overdue,new_client,order_time,label,user_id,application_time,application_date
0,56f889ee11df4a72955147cb2f29a638|1509322980000,0.0,0.0,1509323000000.0,0.0,56f889ee11df4a72955147cb2f29a638,1509322980000,2017-10-30 00:23:00
1,82ba63c78d5543b7b2fd1b44412ea954|1507609140000,1.0,0.0,1507609000000.0,0.0,82ba63c78d5543b7b2fd1b44412ea954,1507609140000,2017-10-10 04:19:00
2,d84540c274cc43b894997f633fcf47b9|1509373080000,1.0,0.0,1509373000000.0,0.0,d84540c274cc43b894997f633fcf47b9,1509373080000,2017-10-30 14:18:00
3,b8206ff0ea1f4cf4abda18d0e0145497|1507529520000,0.0,1.0,1507530000000.0,0.0,b8206ff0ea1f4cf4abda18d0e0145497,1507529520000,2017-10-09 06:12:00
4,6e5f6b151edd4d40b51ca3b75e392f8d|1506805920000,0.0,1.0,1506806000000.0,0.0,6e5f6b151edd4d40b51ca3b75e392f8d,1506805920000,2017-09-30 21:12:00


In [9]:
a=sequential_behavior["b8206ff0ea1f4cf4abda18d0e0145497|1507529520000"]

In [10]:
driver['label'].value_counts()

0.0    4609
1.0     391
Name: label, dtype: int64

In [11]:
driver.groupby('label')['overdue'].describe()
# we only regard sample with more than 5 overdue days as bad sample (1)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.0,4609.0,0.305272,0.744698,0.0,0.0,0.0,0.0,5.0
1.0,391.0,20.992327,9.770045,6.0,12.0,20.0,29.0,38.0


In [12]:
driver['user_id'].nunique()

4967

In [13]:
driver['user_id'].size

5000

In [14]:
driver.head()

Unnamed: 0,index,overdue,new_client,order_time,label,user_id,application_time,application_date
0,56f889ee11df4a72955147cb2f29a638|1509322980000,0.0,0.0,1509323000000.0,0.0,56f889ee11df4a72955147cb2f29a638,1509322980000,2017-10-30 00:23:00
1,82ba63c78d5543b7b2fd1b44412ea954|1507609140000,1.0,0.0,1507609000000.0,0.0,82ba63c78d5543b7b2fd1b44412ea954,1507609140000,2017-10-10 04:19:00
2,d84540c274cc43b894997f633fcf47b9|1509373080000,1.0,0.0,1509373000000.0,0.0,d84540c274cc43b894997f633fcf47b9,1509373080000,2017-10-30 14:18:00
3,b8206ff0ea1f4cf4abda18d0e0145497|1507529520000,0.0,1.0,1507530000000.0,0.0,b8206ff0ea1f4cf4abda18d0e0145497,1507529520000,2017-10-09 06:12:00
4,6e5f6b151edd4d40b51ca3b75e392f8d|1506805920000,0.0,1.0,1506806000000.0,0.0,6e5f6b151edd4d40b51ca3b75e392f8d,1506805920000,2017-09-30 21:12:00


### data understanding

In [15]:
%%time
# converting data into dataframe 
## this is not neccessary during modeling phase cause we can use data_generator (presented in data usage)

behavior = []
for user_keys in sequential_behavior:    
    user_id, application_time = user_keys.split("|")
    for item in sequential_behavior[user_keys]:    
        subitem = item.copy()
        subitem.update({"user_id":user_id, "application_time":application_time})
        behavior.append(subitem)

Wall time: 78.8 ms


In [16]:
behavior = pd.DataFrame(behavior)

In [17]:
behavior.to_pickle("x_train.pkl")

In [18]:
behavior[40:70]
#- process id `pid` : as long as the app is open, pid will be the same, no matter whether it's running or hanging in the background. It will be reset if the app is killed.
# session id `sid` : every time when an app is recovered from background, sid will be reset.

Unnamed: 0,pname,pstime,petime,pid,sid,user_id,application_time
40,bind_debit_card,1507192491471,1507192492080,1507192437437X6356,1507192437995,d84540c274cc43b894997f633fcf47b9,1509373080000
41,loan_index,1507192852076,1507192867020,1507192437437X6356,1507192810192-26685,d84540c274cc43b894997f633fcf47b9,1509373080000
42,loan_index,1507192871348,1507192875537,1507192437437X6356,1507192871339-3233,d84540c274cc43b894997f633fcf47b9,1509373080000
43,loan_index,1507192879948,1507192881293,1507192437437X6356,1507192879940-3819,d84540c274cc43b894997f633fcf47b9,1509373080000
44,loan_submission,1507192881420,1507192885233,1507192437437X6356,1507192879940-3819,d84540c274cc43b894997f633fcf47b9,1509373080000
45,loan_index,1507192885316,1507192918124,1507192437437X6356,1507192879940-3819,d84540c274cc43b894997f633fcf47b9,1509373080000
46,loan_index,1508280658201,1508280662085,1508280646788X10764,1508280647318,d84540c274cc43b894997f633fcf47b9,1509373080000
47,loan_index,1508292424840,1508292427610,1508292424448X25496,1508292424829,d84540c274cc43b894997f633fcf47b9,1509373080000
48,loan_index,1508292450329,1508292463360,1508292424448X25496,1508292424829,d84540c274cc43b894997f633fcf47b9,1509373080000
49,loan_index,1508292476849,1508292478229,1508292424448X25496,1508292476841-12536,d84540c274cc43b894997f633fcf47b9,1509373080000


In [19]:
behavior

Unnamed: 0,pname,pstime,petime,pid,sid,user_id,application_time
0,loan_index,1508169825905,1508169827989,1508169825083X3005,1508169825895,56f889ee11df4a72955147cb2f29a638,1509322980000
1,loan_submission,1508169828161,1508169832016,1508169825083X3005,1508169825895,56f889ee11df4a72955147cb2f29a638,1509322980000
2,login,1508159705613,1508159717957,1508159674269X6362,1508159674838,56f889ee11df4a72955147cb2f29a638,1509322980000
3,loan_index,1508159718330,1508159727708,1508159674269X6362,1508159674838,56f889ee11df4a72955147cb2f29a638,1509322980000
4,loan_index,1508159730618,1508159735192,1508159674269X6362,1508159674838,56f889ee11df4a72955147cb2f29a638,1509322980000
...,...,...,...,...,...,...,...
137787,loan_index,1507025631503,1507025633556,1507025265754X13178,1507025631494-8561,dbaf814462be46c18613bed7fcb5c589,1508168640000
137788,loan_submission,1507025633826,1507025641681,1507025265754X13178,1507025631494-8561,dbaf814462be46c18613bed7fcb5c589,1508168640000
137789,loan_index,1507025641812,1507025661303,1507025265754X13178,1507025631494-8561,dbaf814462be46c18613bed7fcb5c589,1508168640000
137790,loan_index,1507025670790,1507025672833,1507025265754X13178,1507025670783-8643,dbaf814462be46c18613bed7fcb5c589,1508168640000


In [20]:
behavior.shape, behavior.user_id.nunique()

((137792, 7), 3693)

In [21]:
len([x.split('|')[0] for x in sequential_behavior if len(sequential_behavior[x])==0])

# there might be several application without any behavior data due to many reasons (which can be ignored)
# data inclusion logic for keys with empty data will be presented in data usage

1285

In [22]:
behavior = behavior.sort_values(['user_id', 'application_time', 'petime'])

In [23]:
behavior['pname'].value_counts()

loan_index         64375
personal_info      15596
id_verify          14167
contacts_info      12245
loan_submission     8388
operator            7788
bind_debit_card     6208
biometric_auto      4328
login               3993
register             575
biometric_auth       129
Name: pname, dtype: int64

### page view path overview

In [24]:
page_name = set(behavior['pname'].value_counts().index)

In [25]:
page_trannsition_overview = {
    f"from_{x}" : {
        f"to_{y}": 0 for y in page_name
    }
    for x in page_name
}

In [26]:
%%time
for keys in sequential_behavior:
    
    subdata = sequential_behavior[keys]
    subdata.sort(key=lambda x : x['petime']) 
    if len(subdata)<2:
        pass
    
    else:
        prev_state = subdata[0]['pname']
        for item in subdata[1:]:
            current_state = item['pname']
            page_trannsition_overview[f"from_{prev_state}"][f"to_{current_state}"] += 1
            prev_state = item['pname']

Wall time: 92.8 ms


## data usage

In [27]:
le = LabelEncoder()
le.fit(['bind_credit_card',
 'bind_debit_card',
'biometric_auth', 
 'biometric_auto',
 'contacts_info',
 'id_verify',
 'loan_index',
 'loan_submission',
 'login',
 'operator',
 'personal_info',
 'register',
 'unknown'])

LabelEncoder()

In [28]:
## here's the logic for which data should be dropped entirely and which data should be included as missing valued data
user_id_with_data = set([x.split("|")[0] for x in sequential_behavior if len(sequential_behavior[x])!=0])
user_id_without_data = set([x.split("|")[0] for x in sequential_behavior if len(sequential_behavior[x])==0])
user_id_without_data_should_keep = [x for x in user_id_without_data if x in user_id_with_data]


In [29]:
final_user_id = list(user_id_with_data) + list(set(user_id_without_data_should_keep))

In [30]:
len(final_user_id),len(set(final_user_id))####

(3704, 3693)

In [31]:
from sklearn import preprocessing
#test
X = ['bind_credit_card',
 'bind_debit_card',
 'biometric_auth', 
 'biometric_auto',
 'contacts_info',
 'id_verify',
 'loan_index',
 'loan_submission',
 'login',
 'operator',
 'personal_info',
 'register',
 'unknown']
X=np.array(X)
X=X[:,np.newaxis]
enc = preprocessing.OneHotEncoder(categories='auto')
enc.fit(X)
onehotlabels = enc.transform(X).toarray()
onehotlabels.shape
onehotlabels

array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]])

In [32]:
def data_process(sequence_for_a_single_application):
    sequence_for_a_single_application.sort(key=lambda x : x['petime'])
    page_sequence = [x['pname'] for x in sequence_for_a_single_application]
    pstart = [x['pstime'] for x in sequence_for_a_single_application]
    pend = ([x['petime'] for x in sequence_for_a_single_application])
    page_stay_time = [(y-x)/1000 if (y-x)>0 and (y-x)//1000<600 else -1 for x,y in zip(pstart, pend)]
  
    # calculate the time elapse between two pages, if the page_lagg_time is negative or larger than 600s, page_lagg_time = -1
    page_lagg_time = [(x-y)/1000 if (x-y)>=0 and (x-y)//1000<600 else -1 for x,y in zip(pstart[1:], pend[:-1])]
    page_lagg_time_padd=[0]
    page_lagg_time_padd.extend(page_lagg_time)
    
    pids= []
    pid=''
    for x in sequence_for_a_single_application:
        if x.get("pid"):
            if x['pid']==pid:
                pids.append(0)
            else:
                pids.append(1)
                pid=x['pid']
        else:
            pids.append(-1)
    
    sids=[]
    sid=''
    for x in sequence_for_a_single_application:
        if x.get("sid"):
            if x['sid']==sid:
                sids.append(0)
            else:
                sids.append(1)
                sid=x['sid']
        else:
            sids.append(-1)
            
    
    page_sequence=np.array(page_sequence).reshape(-1, 1)
    page_sequence = enc.transform(page_sequence).A
    page_sequence=page_sequence.reshape(page_sequence.shape[1],-1)
    return page_sequence, page_stay_time,page_lagg_time_padd,pids,sids

In [33]:
def get_data(x):
    sequence_data=[]
    overdue=[]
    #new_client=[]
    #order_time=[]
    #label=[]
    #tmp_dict={"overdue":overdue,"new_client":new_client,"order_time":order_time,"lable":lable}
    unique_user_session=list(set(x.keys()))
    for keys in unique_user_session:
        if x[keys]==[]:
            continue
        page_sequence, page_stay_time, page_lagg_time,pids,sids=data_process(x[keys])
        single_entry=np.vstack((page_sequence, page_stay_time, page_lagg_time,pids,sids)).T
        sequence_data.append(single_entry)
    '''    for i in ['overdue', 'new_client', 'order_time', 'label']:
            if sequential_driver[keys].get(i):
                tmp_dict[i].append(sequential_driver[keys][i])
            else:
                tmp_dict[i].append(-1)   
    overdue=tmp_dict['overdue']
    new_client=tmp_dict['new_client']
    order_time=tmp_dict['order_time']
    label=tmp_dict['label']'''
    return sequence_data

In [34]:
sequence_data=get_data(sequential_behavior)

In [35]:
store=[]
for i in sequential_behavior.keys():
    if sequential_behavior[i]!=[]:
        store.append(i)
        

In [36]:
driver.columns

Index(['index', 'overdue', 'new_client', 'order_time', 'label', 'user_id',
       'application_time', 'application_date'],
      dtype='object')

In [37]:
driver=driver.set_index("index")

In [38]:
label=driver.T[store].T['label'].values

In [39]:
testing=label[10000:20000]

# DTW and KNN

In [40]:
!pip install fastdtw



In [41]:
import fastdtw
from sklearn.metrics import roc_curve

In [42]:
# below are 
def distance(X,x):
    '''
    gnerating distance for points based on DTW
    X: list or array of points
    x: point. 
    
    fastdtw is faster than dtw method
    '''
    store=[]
    for i in X:
#         store.append(fastdtw.dtw(i,x)[0])
        store.append(fastdtw.fastdtw(i,x)[0])
    return np.array(store)
def find_neighbors(x,X,k,distance):
    distances=distance(X,x)
    indexes=np.argpartition(distances,k)[0:k]
    return indexes
def most_common(X):
    values,counts=np.unique(X,return_counts=True)
    return values[counts.argmax()]
def most_common_proba(X):
    '''
    generating the probability for the labels
    '''
    values,counts=np.unique(X,return_counts=True)
    if len(values)==1:
        if values[0]==1:
            return 0,1
        else:
            return 1,0
    if values[0]==0:
        return counts[0]/np.sum(counts),counts[1]/np.sum(counts)
    else:
        return counts[1]/np.sum(counts),counts[10]/np.sum(counts)

class KNeighbors:
    def __init__(self,k,distance):
        '''
        k: nearest piont
        distance: function to generate distance
        '''
        self.k=k
        self.distance=distance
    def fit(self,X,Y):
        self.X=X
        self.Y=Y
    def predict(self,X):
        y=[]
        for i1 in range(len(X)):
            x=X[i1]
            indexes=find_neighbors(x,self.X,self.k,self.distance)
            votes=self.Y[indexes]
            y.append(most_common(votes))
        return np.array(y)
    def predict_proba(self,X):
        y=[]
        for i1 in range(len(X)):
            x=X[i1]
            indexes=find_neighbors(x,self.X,self.k,self.distance)
            votes=self.Y[indexes]
            y.append(most_common_proba(votes))
        return np.array(y)
    
def get_KS(y_prob,y_true):
    '''
    return the best threshold, maximum ks
    '''
    fpr,tpr,threshold=roc_curve(y_true,y_prob)
    ks=(tpr-fpr)
    max_=np.argmax(ks)
    return threshold[max_],np.max(ks)

In [43]:
store=[]
for i in range(10):
    store.append(most_common_proba(label))

In [44]:
# generating the training dataset, mixture of specific portion of 1 and 0 in the label
# pay attention not to using too large dataset, it will take a large time to train 
idx=np.arange(len(label))
idx_0=idx[label==0]
idx_1=idx[label==1]
np.random.shuffle(idx_0)
np.random.shuffle(idx_1)
new_id=np.concatenate([idx_0[:1500],idx_1[:300]],axis=0)
np.random.shuffle(new_id)
print("total:",sum(label[new_id]),"shape: ",len(new_id))

total: 254.0 shape:  1754


In [45]:
X= [sequence_data[i] for i in new_id]
Y=label[new_id]
Y=Y.astype("int")

In [46]:
test=KNeighbors(5,distance)

In [47]:
test.fit(X,Y)

test_dataset

In [48]:
new_id = np.concatenate([idx_0[2500:2600], idx_1[500:520]], axis=0)
np.random.shuffle(new_id)
print("total:", sum(label[new_id]), "shape: ", len(new_id))

total: 0.0 shape:  100


In [49]:
test_X = [sequence_data[i] for i in new_id]
test_Y = label[new_id]
test_Y = test_Y.astype("int")

In [50]:
prob = test.predict_proba(test_X)

In [51]:
get_KS(prob[:, 1], test_Y)



(1.4, nan)