## import Library

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import font_manager
%matplotlib inline

font_fname = 'C:/Windows/Fonts/malgun.ttf'
font_family = font_manager.FontProperties(fname=font_fname).get_name()
plt.rcParams["font.family"] = font_family

In [2]:
data = pd.read_csv('./data/KLoSA_08_lt.csv', low_memory=False)
data.head()

Unnamed: 0,pid,hhid,HHID20,CID20,ed,w08wgt_c,w08wgt_p,w01,w02,w03,...,w08G014,w08G026,w08G027,w08G028,w08G029,w08G030,w08G031,w08G032,w08Ba001,w08Ba002
0,11,1,10000000,11,833,1600.303883,2189.837105,1,1.0,1.0,...,,80,60,,60.0,70,4,20,1.0,
1,21,2,20000000,21,833,3180.749884,3919.849664,1,1.0,1.0,...,,50,40,60.0,50.0,30,5,10,1.0,
2,22,2,20000000,21,833,4525.161503,5994.598161,1,1.0,1.0,...,,60,50,60.0,,60,4,20,1.0,
3,41,4,40000000,41,833,1239.297401,1695.84006,1,1.0,1.0,...,,70,60,60.0,70.0,60,5,10,1.0,
4,42,4,40000000,41,833,1069.049862,1207.745056,1,1.0,1.0,...,,60,60,70.0,,60,6,5,1.0,


## Simple Model

In [3]:
New_elder = data[(data['w08A002_age'] >= 60) & (data['w08A002_age'] < 65)]
New_elder

Unnamed: 0,pid,hhid,HHID20,CID20,ed,w08wgt_c,w08wgt_p,w01,w02,w03,...,w08G014,w08G026,w08G027,w08G028,w08G029,w08G030,w08G031,w08G032,w08Ba001,w08Ba002
38,421,42,420000000,421,596,2767.325355,3410.359116,1,1.0,1.0,...,,60,80,80.0,80.0,80,4,30,1.0,
39,422,42,420000000,421,596,5157.259687,6831.954930,1,1.0,1.0,...,,80,80,80.0,,80,4,50,1.0,
40,431,43,430000000,431,596,4553.861303,6032.617527,1,1.0,1.0,...,,80,80,80.0,,80,4,30,1.0,
41,432,43,430000000,431,596,2687.264496,3311.694794,1,1.0,1.0,...,,80,80,80.0,80.0,80,4,20,1.0,
54,621,62,620000000,621,689,9450.982963,11647.074980,1,1.0,1.0,...,,80,80,80.0,80.0,80,4,30,1.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5690,61471,6147,61470000000,61471,178,2079.293363,1987.650894,1,1.0,1.0,...,,90,80,,60.0,80,6,0,1.0,
5692,61491,6149,61490000000,61491,178,2708.116682,2618.884551,1,1.0,1.0,...,,80,80,80.0,,60,6,0,1.0,
5693,61492,6149,61490000000,61491,178,1972.718448,1885.773146,1,1.0,1.0,...,,90,80,70.0,70.0,70,6,5,1.0,
5695,61502,6150,61500000000,61501,373,3865.213957,3737.855604,1,1.0,1.0,...,,60,70,80.0,,80,4,10,1.0,


In [4]:
# 고유번호, 가구원수, 결혼상태, 지역, 자녀수, 건강상태, ADL, 인지기능점수, 건보/의료급여 가입여부, 현재 경활상태
# 구직활동여부, 은퇴여부, 지난해 임금소득, 월평균 생활비, 거주주택 자산가치, 가구순자산, 주관적 계층의식, 월평균 용돈
temp_cols = ['pid','w08hhsize', 'w08marital', 'w08region3','w08Ba003','w08C001','w08adl','w08mmse','w08C301','w08present_ecotype',
            'w08job_search','w08retired','w08wage','w08E201','w08residence','w08hhnetassets','w08G031','w08G032']
New_elder = New_elder[temp_cols]

In [7]:
New_elder.shape

(1118, 18)

In [5]:
New_elder.isna().sum()

pid                     0
w08hhsize               0
w08marital              0
w08region3              0
w08Ba003                0
w08C001                 0
w08adl                  0
w08mmse                41
w08C301                 0
w08present_ecotype      0
w08job_search         611
w08retired            611
w08wage               750
w08E201               396
w08residence          571
w08hhnetassets         11
w08G031                 0
w08G032                 0
dtype: int64

In [6]:
for col in temp_cols:
    print(New_elder[col].value_counts())

# marital ; 1: 혼인 중, 2: 별거, 3: 이혼, 4: 사별, 5: 결혼한 적 없음
# C001 ; 1~5 건강상태: 1 최상 5가 매우 나쁨
# adl ; 일상 수행 능력, 높을수록 안 좋음
# C301 ; 가입하지 않았다면 5
# job_search ; 구직활동 중이 아니라면 5.0
# retired ; 0: 노동시장 비참여자, 1: 은퇴, 2: 부분은퇴
# G031 ; 주관적 계층의식, 1~6


30721    1
13651    1
58722    1
40282    1
7512     1
        ..
56002    1
4803     1
29381    1
51911    1
14332    1
Name: pid, Length: 1118, dtype: int64
4    339
3    302
2    287
5    119
1     39
6     25
7      6
8      1
Name: w08hhsize, dtype: int64
1    986
4     70
3     38
5     17
2      7
Name: w08marital, dtype: int64
1    478
2    439
3    201
Name: w08region3, dtype: int64
2    710
3    181
1    142
0     47
4     28
5      9
6      1
Name: w08Ba003, dtype: int64
3    526
4    414
2     94
5     70
1     14
Name: w08C001, dtype: int64
0    1113
7       4
5       1
Name: w08adl, dtype: int64
30.0    361
29.0    189
28.0    133
27.0    110
26.0     63
25.0     60
24.0     50
23.0     31
22.0     23
20.0     15
21.0     15
19.0      8
18.0      8
15.0      4
16.0      3
17.0      3
14.0      1
Name: w08mmse, dtype: int64
1    1084
5      34
Name: w08C301, dtype: int64
1    611
3    497
2     10
Name: w08present_ecotype, dtype: int64
5.0    478
1.0     29
Name: w08job_se

In [9]:
New_elder['w08mmse'].fillna(New_elder['w08mmse'].mean(), inplace=True)
New_elder['w08job_search'].fillna(5.0, inplace=True)
New_elder['w08retired'].fillna(0, inplace=True)
New_elder['w08wage'].fillna(New_elder['w08wage'].mean(), inplace=True)
New_elder['w08E201'].fillna(New_elder['w08E201'].mean(), inplace=True)
New_elder['w08residence'].fillna(New_elder['w08residence'].mean(), inplace=True)
New_elder['w08hhnetassets'].fillna(New_elder['w08hhnetassets'].mean(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().fillna(


In [13]:
New_elder['w08C301'].replace(5, 0, inplace=True)
New_elder['w08job_search'].replace(5.0, 0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().replace(


In [15]:
New_elder.head()

Unnamed: 0,pid,w08hhsize,w08marital,w08region3,w08Ba003,w08C001,w08adl,w08mmse,w08C301,w08present_ecotype,w08job_search,w08retired,w08wage,w08E201,w08residence,w08hhnetassets,w08G031,w08G032
38,421,4,1,1,2,4,0,29.0,1,3,0.0,0.0,2508.168478,400.0,180000.0,180000.0,4,30
39,422,4,1,1,2,4,0,30.0,1,1,0.0,0.0,3600.0,214.32133,24620.917733,180000.0,4,50
40,431,4,1,1,2,4,0,27.0,1,1,0.0,0.0,2160.0,214.32133,24620.917733,31000.0,4,30
41,432,4,1,1,2,4,0,29.0,1,3,0.0,1.0,2508.168478,300.0,30000.0,31000.0,4,20
54,621,4,1,1,2,4,0,27.0,1,1,0.0,0.0,2508.168478,500.0,24620.917733,117300.0,4,30


#### Preprocessing

In [38]:
data.columns[data.isna().sum() > len(data)*0.2]

Index(['w08A035_01', 'w08A035_02', 'w08A035_03', 'w08A035_04', 'w08A035_05',
       'w08A035_06', 'w08A035_07', 'w08A036_1', 'w08livewith', 'w08livewithnm',
       ...
       'w08G008', 'w08G009', 'w08G010', 'w08G011', 'w08G012', 'w08G013',
       'w08G014', 'w08G028', 'w08G029', 'w08Ba002'],
      dtype='object', length=250)

In [None]:
# 삭제 변수
data.drop(['hhid','HHID20','CID20','ed','w01','w02','w03','w04','w05','w06','w07','w08type','w08panel_n','w08A035_01',
        'w08A035_02','w08A035_03','w08A035_04','w08A035_05','w08A035_06','w08A035_07','w08A036','w08A036_1','w08region2',
        'w08mniw_y','w08mniw_m','w08mniw_d','w08Ba068','w08Ba069','w08Ba070','w08bb009','w08bb015','w08bp1_2','w08_target1',
        'w08bp3','w08bp4','w08bp5','w08bp6','w08bm3','w08bm4','w08bm5','w08bm6','w08bb_adl2','w08bb_adl_num2','w08bb_adl3','w08bb_adl3',
        'w08fromchildren','w08tochildren','w08fromparent','w08toparent','w08fromothers','w08toothers']axis=1)

In [None]:
# 변수명 변경
data.rename(columns={'w08wgt_c':'weight_c',
                    'w08wgt_p':'weight_p',
                    'w08_fam1':'vrt_of_fam',
                    'w08_fam2':'str_of_fam',
                    'w08hhsize':'num_of_fam',
                    'w08edu':'edu',
                    'w08gender1':'gender',
                    'w08A002y':'year',
                    'w08A002m':'month',
                    'w08A002_age':'age',
                    'w08marital':'married',
                    'w08year2':'married_year',
                    'w08edu_s':'married_edu',
                    'w08_ecoact_s':'married_job'
                    'w08A030':'religion',
                    'w08A032':'friendly',
                    'w08enu_type':'living_type',
                    'w08region1':'sido_code',
                    'w08region3':'size_of_city',
                    'w08Ba003':'num_of_children',
                    'w08livewith':'live_with_chi',
                    'w08livenear':'live_near_chi',
                    'w08livewithnm':'live_nm_chi',
                    'w08contact1':'meet_chi',
                    'w08contact2':'contact_chi',
                    'w08Ba075':'is_carechi',
                    'w08Ba076':'num_of_carechi',
                    'w08Ba_resp':'is_Ba_ok',
                    'w08s_sum':'num_of_bs',
                    'w08l_sum':'lv_num_bs',
                    'w08bp1':'lv_of_parent',
                    'w08np1_1':'is_lvwwith_pr',
                    'w08bb_adl1':'is_ADL_fam',
                    'w08bb_adl_num1':'num_of_ADL_fam',
                    'w08transferfrom':'given_money',
                    'w08transferto':'give_money',
                    })