# <font color='#CC3D3D'> 0.2 Extract_Applied_Or_Not </font>
    
- loan_result.csv와 user_spec.csv 파일을 합쳐 loan_result의 application id에 user id를 달아줌
- 시간 정보를 추출하여 유저별로 각 일자에 대출 신청을 했는지 하지 않았는지 구분
    - <span style="color:blue"> **loan_apply.csv** </span> 생성

# Import

In [1]:
import os
import gc
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as mpl  # 기본 설정 만지는 용도
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm  # 폰트 관련 용도
from IPython.display import display
import warnings

## for dimension reduction or feature selection
from sklearn.decomposition import PCA
from sklearn.linear_model import Lasso

# NOTE(review): this silences every warning for the rest of the notebook, including pandas'
# SettingWithCopyWarning — consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")
# Do not truncate columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
%matplotlib inline

# Data Load

In [2]:
# Both raw tables live in the same contest dataset folder; keep the folder in one place so the
# two reads cannot drift apart. The resulting path strings are identical to the original ones.
DATASET_DIR = "../Data/2022빅콘테스트_데이터분석리그_데이터분석분야_퓨처스부문_데이터셋_220908"

# engine="python" is kept exactly as the original notebook used it.
loan = pd.read_csv(f"{DATASET_DIR}/loan_result.csv", engine="python")
user = pd.read_csv(f"{DATASET_DIR}/user_spec.csv", engine="python")

In [3]:
# Look up the user_spec rows recorded for user_id 879698.
user[user['user_id'] == 879698]

Unnamed: 0,application_id,user_id,birth_year,gender,insert_time,credit_score,yearly_income,income_type,company_enter_month,employment_type,houseown_type,desired_amount,purpose,personal_rehabilitation_yn,personal_rehabilitation_complete_yn,existing_loan_cnt,existing_loan_amt


# Log Data Preprocessing

In [4]:
def AddDate(df:pd.DataFrame, key:str, add_day:bool=False, add_time:bool=False) -> None:
    """Parse column ``key`` as datetimes and add its calendar parts as new columns.

    The frame is modified in place and nothing is returned. ``df[key]`` is overwritten
    with the result of ``pd.to_datetime``; the derived columns are named
    ``{key}_year`` and ``{key}_month``, plus ``{key}_day`` when ``add_day`` is true
    and ``{key}_hour`` / ``{key}_min`` / ``{key}_sec`` when ``add_time`` is true.

    Args:
        df: Frame to mutate.
        key: Name of the column holding the timestamps.
        add_day: Also add the day-of-month column.
        add_time: Also add the hour, minute and second columns.
    """
    df[key] = pd.to_datetime(df[key])
    stamps = df[key].dt

    # (column suffix, datetime accessor attribute), in the order the columns are appended.
    parts = [("year", "year"), ("month", "month")]
    if add_day:
        parts.append(("day", "day"))
    if add_time:
        parts.extend([("hour", "hour"), ("min", "minute"), ("sec", "second")])

    for suffix, attribute in parts:
        df[f"{key}_{suffix}"] = getattr(stamps, attribute)

* apply한 날을 일단 추출 - 6월 이전의 데이터에 대해서만

In [5]:
# Calendar date (time of day dropped) of each row's loanapply_insert_time.
apply_timestamps = pd.to_datetime(loan['loanapply_insert_time'])
loan['insert_date'] = apply_timestamps.dt.date

In [6]:
# Column dtypes and memory footprint of the loan table after adding insert_date.
loan.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13527363 entries, 0 to 13527362
Data columns (total 8 columns):
 #   Column                 Dtype  
---  ------                 -----  
 0   application_id         int64  
 1   loanapply_insert_time  object 
 2   bank_id                int64  
 3   product_id             int64  
 4   loan_limit             float64
 5   loan_rate              float64
 6   is_applied             float64
 7   insert_date            object 
dtypes: float64(3), int64(3), object(2)
memory usage: 825.6+ MB


In [7]:
# Keep only rows logged in a month before June. Building the boolean mask directly avoids
# adding and then dropping a temporary 'month' column in place on a filtered slice, and
# .copy() makes `loan` an independent frame so later column assignments are unambiguous.
# NOTE(review): only the month is checked, not the year — assumes all rows fall in one year; confirm.
is_before_june = pd.to_datetime(loan['insert_date']).dt.month < 6
loan = loan[is_before_june].copy()

In [8]:
# Missing-value count per column of the filtered loan table.
loan.isnull().sum()

application_id              0
loanapply_insert_time       0
bank_id                     0
product_id                  0
loan_limit               5738
loan_rate                5738
is_applied                  0
insert_date                 0
dtype: int64

In [9]:
# application_id -> user_id lookup built from user_spec. Zipping the two columns replaces a
# Python loop that did two .iloc lookups per row; the resulting mapping is the same (if an
# application_id repeats, the last row wins, as before).
app_user = dict(zip(user['application_id'], user['user_id']))
len(app_user)

1394216

In [10]:
# Number of distinct application_id values in user_spec (compare with len(app_user)).
user['application_id'].nunique()

1394216

In [11]:
# Number of distinct application_id values present in the filtered loan table.
loan['application_id'].nunique()

728997

In [12]:
# Attach a user_id to every loan row by looking its application_id up in app_user;
# application_ids missing from the lookup become NaN.
mapped_user_ids = loan['application_id'].map(app_user)
loan['user_id'] = mapped_user_ids
loan.head()

Unnamed: 0,application_id,loanapply_insert_time,bank_id,product_id,loan_limit,loan_rate,is_applied,insert_date,user_id
13284,2157865,2022-05-09 08:44:59,54,235,20000000.0,16.5,1.0,2022-05-09,346970.0
13285,576643,2022-05-09 10:54:53,54,235,11000000.0,16.5,0.0,2022-05-09,545882.0
13286,576643,2022-05-09 10:54:53,11,118,3000000.0,20.0,0.0,2022-05-09,545882.0
13287,2136706,2022-05-09 10:41:06,42,216,10000000.0,13.5,0.0,2022-05-09,558819.0
13288,2136706,2022-05-09 10:41:07,25,169,22000000.0,15.9,0.0,2022-05-09,558819.0


In [13]:
# Missing-value count per column, now including the mapped user_id.
loan.isnull().sum()

application_id              0
loanapply_insert_time       0
bank_id                     0
product_id                  0
loan_limit               5738
loan_rate                5738
is_applied                  0
insert_date                 0
user_id                   113
dtype: int64

* loan의 113개 행은 application id가 user_spec에 없어 user id가 매핑되지 않는다!

In [14]:
# Missing-value count per column of user_spec.
user.isnull().sum()

application_id                               0
user_id                                      0
birth_year                               12961
gender                                   12961
insert_time                                  0
credit_score                            105115
yearly_income                               90
income_type                                 85
company_enter_month                     171760
employment_type                             85
houseown_type                               85
desired_amount                              85
purpose                                     85
personal_rehabilitation_yn              587461
personal_rehabilitation_complete_yn    1203354
existing_loan_cnt                       198556
existing_loan_amt                       313774
dtype: int64

In [15]:
# Loan rows whose application_id had no user_id in the lookup, ordered by date.
# The chain builds a new frame at every step, so nothing is assigned into (or mutated in
# place on) a filtered slice of `loan`.
app_nulls = (
    loan[loan['user_id'].isna()]
    .assign(insert_date=lambda frame: pd.to_datetime(frame['insert_date']))
    .sort_values(by='insert_date')
    .reset_index(drop=True)
)

app_nulls.tail()

Unnamed: 0,application_id,loanapply_insert_time,bank_id,product_id,loan_limit,loan_rate,is_applied,insert_date,user_id
108,450452,2022-05-20 17:53:11,22,124,,,1.0,2022-05-20,
109,889541,2022-05-20 17:53:12,22,124,,,1.0,2022-05-20,
110,1756845,2022-05-20 17:54:05,22,124,,,1.0,2022-05-20,
111,1907865,2022-05-20 17:53:11,22,124,,,1.0,2022-05-20,
112,1979792,2022-05-20 17:53:59,22,124,,,1.0,2022-05-20,


In [16]:
# Discard the loan rows that could not be tied to a user, then re-check the null counts.
loan = loan.dropna(subset=['user_id'])
loan.isna().sum()

application_id              0
loanapply_insert_time       0
bank_id                     0
product_id                  0
loan_limit               5625
loan_rate                5625
is_applied                  0
insert_date                 0
user_id                     0
dtype: int64

In [17]:
# Only the user, the calendar date and the applied flag are needed from here on.
apply_columns = ['user_id', 'insert_date', 'is_applied']
loan_apply = loan.loc[:, apply_columns]
display(loan_apply)

Unnamed: 0,user_id,insert_date,is_applied
13284,346970.0,2022-05-09,1.0
13285,545882.0,2022-05-09,0.0
13286,545882.0,2022-05-09,0.0
13287,558819.0,2022-05-09,0.0
13288,558819.0,2022-05-09,0.0
...,...,...,...
13519634,109899.0,2022-05-16,0.0
13519635,109899.0,2022-05-16,0.0
13519636,109899.0,2022-05-16,0.0
13519637,109899.0,2022-05-16,0.0


In [18]:
# Collapse identical (user_id, insert_date, is_applied) rows into a single row each.
loan_apply = loan_apply.drop_duplicates()
display(loan_apply)

Unnamed: 0,user_id,insert_date,is_applied
13284,346970.0,2022-05-09,1.0
13285,545882.0,2022-05-09,0.0
13287,558819.0,2022-05-09,0.0
13304,558819.0,2022-05-09,1.0
13327,341662.0,2022-05-09,0.0
...,...,...,...
13519613,386186.0,2022-05-16,0.0
13519617,879596.0,2022-05-16,1.0
13519618,879596.0,2022-05-16,0.0
13519620,601384.0,2022-05-16,1.0


In [19]:
# Order by user and then by date, and renumber the rows from 0.
loan_apply = (
    loan_apply
    .sort_values(by=['user_id', 'insert_date'])
    .reset_index(drop=True)
)
loan_apply

Unnamed: 0,user_id,insert_date,is_applied
0,9.0,2022-05-21,0.0
1,11.0,2022-03-24,0.0
2,11.0,2022-03-24,1.0
3,11.0,2022-04-20,0.0
4,11.0,2022-04-20,1.0
...,...,...,...
965661,879695.0,2022-05-27,0.0
965662,879695.0,2022-05-27,1.0
965663,879696.0,2022-03-14,0.0
965664,879696.0,2022-03-14,1.0


In [20]:
# Sum the applied flags per (user, date) pair; the group keys become the index.
# Presumably is_applied only holds 0/1, so a positive sum means "applied at least once that day".
daily_keys = ['user_id', 'insert_date']
loan_total = loan_apply.groupby(by=daily_keys).agg('sum')
loan_total

Unnamed: 0_level_0,Unnamed: 1_level_0,is_applied
user_id,insert_date,Unnamed: 2_level_1
9.0,2022-05-21,0.0
11.0,2022-03-24,1.0
11.0,2022-04-20,1.0
14.0,2022-04-18,0.0
17.0,2022-03-06,1.0
...,...,...
879693.0,2022-05-17,0.0
879693.0,2022-05-20,1.0
879695.0,2022-05-27,1.0
879696.0,2022-03-14,1.0


In [21]:
# Move the (user_id, insert_date) group keys out of the index and back into ordinary columns.
loan_total = loan_total.reset_index()
loan_total

Unnamed: 0,user_id,insert_date,is_applied
0,9.0,2022-05-21,0.0
1,11.0,2022-03-24,1.0
2,11.0,2022-04-20,1.0
3,14.0,2022-04-18,0.0
4,17.0,2022-03-06,1.0
...,...,...,...
668676,879693.0,2022-05-17,0.0
668677,879693.0,2022-05-20,1.0
668678,879695.0,2022-05-27,1.0
668679,879696.0,2022-03-14,1.0


In [22]:
# Turn the per-day sum into a 0/1 flag: 1 when the user applied at least once on that date.
# The vectorised comparison replaces a per-element Python lambda and yields the same values.
loan_total['is_applied'] = (loan_total['is_applied'] > 0).astype('int')
loan_total.head()

Unnamed: 0,user_id,insert_date,is_applied
0,9.0,2022-05-21,0
1,11.0,2022-03-24,1
2,11.0,2022-04-20,1
3,14.0,2022-04-18,0
4,17.0,2022-03-06,1


In [23]:
# Final check of row count, null counts and dtypes before writing the file.
loan_total.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 668681 entries, 0 to 668680
Data columns (total 3 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   user_id      668681 non-null  float64
 1   insert_date  668681 non-null  object 
 2   is_applied   668681 non-null  int32  
dtypes: float64(1), int32(1), object(1)
memory usage: 12.8+ MB


# Deployment CSV

In [24]:
# Write the per-user, per-date applied flag table to loan_apply.csv, without the row index.
loan_total.to_csv('../Data/loan_apply.csv', index=False)