# <font color='#CC3D3D'> 0.3 Merge_Loan_Log </font>
    
- log_data.csv와 loan_apply.csv 파일을 합쳐 로그 데이터에 개별 유저가 대출 신청을 했는지 여부 구분
- 대출 신청을 실제로 한 데이터
    - <span style="color:blue"> **log_applied.csv** </span> 생성
- 대출 신청을 실제로 하지 않은 데이터
    - <span style="color:blue"> **log_non_applied.csv** </span> 생성
- 위의 두 데이터를 합친 데이터
    - <span style="color:blue"> **log_applied_history.csv** </span> 생성

# Import

In [1]:
import os
import gc
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl  # for adjusting default settings
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm  # for font-related settings
from IPython.display import display
import warnings

## for dimension reduction or feature selection
from sklearn.decomposition import PCA
from sklearn.linear_model import Lasso

# Silence all warnings for the rest of the session.
warnings.filterwarnings("ignore")
# Do not truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
%matplotlib inline

# Data Load

In [2]:
# Read the raw event log and the loan-application table from ../Data.
log = pd.read_csv(
    filepath_or_buffer="../Data/2022빅콘테스트_데이터분석리그_데이터분석분야_퓨처스부문_데이터셋_220908/log_data.csv",
    engine="python",
)
apply_date = pd.read_csv(
    filepath_or_buffer="../Data/loan_apply.csv",
    engine="python",
)

In [3]:
def AddDate(df:pd.DataFrame, key:str, add_day:bool=False, add_time:bool=False) -> None:
    """Parse ``df[key]`` as datetimes and append its calendar parts, in place.

    The column ``key`` is overwritten with its ``pd.to_datetime`` conversion,
    then ``{key}_year`` and ``{key}_month`` are always added.

    Args:
        df: Frame to modify; it is mutated and nothing is returned.
        key: Name of the column holding the date/time values.
        add_day: Also add ``{key}_day``.
        add_time: Also add ``{key}_hour``, ``{key}_min`` and ``{key}_sec``.
    """
    df[key] = pd.to_datetime(df[key])
    parts = df[key].dt

    # Insertion order of this dict is the order in which columns are appended.
    new_columns = {
        f"{key}_year": parts.year,
        f"{key}_month": parts.month,
    }
    if add_day:
        new_columns[f"{key}_day"] = parts.day
    if add_time:
        new_columns[f"{key}_hour"] = parts.hour
        new_columns[f"{key}_min"] = parts.minute
        new_columns[f"{key}_sec"] = parts.second

    for column_name, values in new_columns.items():
        df[column_name] = values

* apply한 날을 일단 추출

In [4]:
# Print a summary of the event log: index range, column dtypes and memory usage.
log.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17843993 entries, 0 to 17843992
Data columns (total 6 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   user_id         int64 
 1   event           object
 2   timestamp       object
 3   mp_os           object
 4   mp_app_version  object
 5   date_cd         object
dtypes: int64(1), object(5)
memory usage: 816.8+ MB


In [5]:
# Print a summary of the loan-application table: non-null counts, dtypes and memory usage.
apply_date.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 668681 entries, 0 to 668680
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   user_id      668681 non-null  float64
 1   insert_date  668681 non-null  object 
 2   is_applied   668681 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 15.3+ MB


# Log Engineering

In [6]:
# Drop the OS / app-version columns; the visible cells of this notebook never
# read them. Reassigning with errors='ignore' keeps the cell re-runnable: an
# in-place drop raises KeyError the second time because the columns are gone.
log = log.drop(columns=['mp_os', 'mp_app_version'], errors='ignore')

# Reduce both date columns to plain calendar dates (datetime.date objects) so
# that the two tables can later be joined on (user_id, date).
log['date_cd'] = pd.to_datetime(log['date_cd']).dt.date

# A later cell renames apply_date's 'insert_date' to 'date_cd'. Skip the
# conversion when that rename has already happened so re-running this cell
# does not fail on the missing column.
if 'insert_date' in apply_date.columns:
    apply_date['insert_date'] = pd.to_datetime(apply_date['insert_date']).dt.date

In [7]:
# Number of distinct users in the event log, then in the loan-application table.
display(
    log['user_id'].nunique(),
    apply_date['user_id'].nunique(),
)

584636

259328

In [8]:
# Preview the first rows of the event log after dropping the unused columns.
log.head()

Unnamed: 0,user_id,event,timestamp,date_cd
0,576409,StartLoanApply,2022-03-25 11:12:09,2022-03-25
1,576409,ViewLoanApplyIntro,2022-03-25 11:12:09,2022-03-25
2,72878,EndLoanApply,2022-03-25 11:14:44,2022-03-25
3,645317,OpenApp,2022-03-25 11:15:09,2022-03-25
4,645317,UseLoanManage,2022-03-25 11:15:11,2022-03-25


In [9]:
# Preview the first rows of the loan-application table.
apply_date.head()

Unnamed: 0,user_id,insert_date,is_applied
0,9.0,2022-05-21,0
1,11.0,2022-03-24,1
2,11.0,2022-04-20,1
3,14.0,2022-04-18,0
4,17.0,2022-03-06,1


In [10]:
# Order events by user and then by calendar date, and renumber the rows from 0.
log = (
    log
    .sort_values(by=['user_id', 'date_cd'])
    .reset_index(drop=True)
)
log.head()

Unnamed: 0,user_id,event,timestamp,date_cd
0,1,GetCreditInfo,2022-05-03 14:52:28,2022-05-03
1,1,GetCreditInfo,2022-05-03 14:52:35,2022-05-03
2,1,UseLoanManage,2022-06-16 23:58:41,2022-06-16
3,1,Login,2022-06-16 23:58:41,2022-06-16
4,1,GetCreditInfo,2022-06-16 23:58:42,2022-06-16


In [11]:
# Give the application date the same column name as the log's date column so
# the two tables can be merged on ['user_id', 'date_cd'].
# The rename is done by name (not by overwriting .columns positionally) and
# before sorting, so the cell does not depend on column order and can be
# re-run: once 'insert_date' is already renamed, rename() is simply a no-op.
apply_date = (
    apply_date
    .rename(columns={'insert_date': 'date_cd'})
    .sort_values(by=['user_id', 'date_cd'])
    .reset_index(drop=True)
)
apply_date.head()

Unnamed: 0,user_id,date_cd,is_applied
0,9.0,2022-05-21,0
1,11.0,2022-03-24,1
2,11.0,2022-04-20,1
3,14.0,2022-04-18,0
4,17.0,2022-03-06,1


In [12]:
# Left-join the application flag onto every log event of the same user and
# date; events with no matching application row get is_applied = NaN.
left = pd.merge(left=log, right=apply_date, on=['user_id', 'date_cd'], how='left')

# DataFrame.info() prints its report and returns None, so it is called
# directly; wrapping it in display() rendered an extra "None" in the output.
left.info()

# Remove fully identical rows (same user, event, timestamp, date and flag).
left = left.drop_duplicates()
display(left)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17843993 entries, 0 to 17843992
Data columns (total 5 columns):
 #   Column      Dtype  
---  ------      -----  
 0   user_id     int64  
 1   event       object 
 2   timestamp   object 
 3   date_cd     object 
 4   is_applied  float64
dtypes: float64(1), int64(1), object(3)
memory usage: 816.8+ MB


None

Unnamed: 0,user_id,event,timestamp,date_cd,is_applied
0,1,GetCreditInfo,2022-05-03 14:52:28,2022-05-03,
1,1,GetCreditInfo,2022-05-03 14:52:35,2022-05-03,
2,1,UseLoanManage,2022-06-16 23:58:41,2022-06-16,
3,1,Login,2022-06-16 23:58:41,2022-06-16,
4,1,GetCreditInfo,2022-06-16 23:58:42,2022-06-16,
...,...,...,...,...,...
17843988,879696,GetCreditInfo,2022-03-14 05:35:47,2022-03-14,1.0
17843989,879696,GetCreditInfo,2022-03-14 05:37:22,2022-03-14,1.0
17843990,879698,OpenApp,2022-05-24 22:33:24,2022-05-24,
17843991,879698,StartLoanApply,2022-05-24 22:33:32,2022-05-24,


In [13]:
# Split the merged log by the application flag.
# Rows whose is_applied is NaN (no matching application row) satisfy neither
# comparison and therefore end up in neither frame.
left_applied = left.loc[left['is_applied'].gt(0)]
# Despite its name, right_applied holds the rows flagged as NOT applied.
right_applied = left.loc[left['is_applied'].eq(0)]

In [14]:
# Print the summary of the applied subset first, then the non-applied subset.
for subset in (left_applied, right_applied):
    subset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4096724 entries, 9 to 17843989
Data columns (total 5 columns):
 #   Column      Dtype  
---  ------      -----  
 0   user_id     int64  
 1   event       object 
 2   timestamp   object 
 3   date_cd     object 
 4   is_applied  float64
dtypes: float64(1), int64(1), object(3)
memory usage: 187.5+ MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2405232 entries, 6 to 17843935
Data columns (total 5 columns):
 #   Column      Dtype  
---  ------      -----  
 0   user_id     int64  
 1   event       object 
 2   timestamp   object 
 3   date_cd     object 
 4   is_applied  float64
dtypes: float64(1), int64(1), object(3)
memory usage: 110.1+ MB


# Deployment CSV

In [15]:
# Write the applied subset, the non-applied subset and the full merged log
# (in that order) as CSV files without the index column.
for frame, csv_path in (
    (left_applied, "../Data/log_applied.csv"),
    (right_applied, "../Data/log_non_applied.csv"),
    (left, '../Data/log_applied_history.csv'),
):
    frame.to_csv(csv_path, index=False)