#### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Read the raw 2020 survey CSV into a DataFrame and preview the first rows.
survey_csv_path = './data/2020년_노인실태조사_ver2.0(최종공개용).csv'
data = pd.read_csv(survey_csv_path)
data.head()

Unnamed: 0,PID,HNO,AREA,EDM,ANS_TYPE,ANS_SUB1,ANS_SUB2,ANS_SUB3,S9_0,S9_2,...,RES_YY,RES_MM,RES_AGE,RES_MAR,RES_EDU1,RES_EDU2,RES_YN,RES_SUB,f_wgt1,s_wgt1
0,7501,1,35,2,0,99,999,1,0,99,...,1952,10,67,1,4,9,1,9,393.325736,0.512734
1,7601,1,35,2,0,99,999,1,0,99,...,1951,12,68,3,2,0,1,9,393.325736,0.512734
2,7802,2,23,1,0,99,999,1,1,1,...,1954,3,66,2,5,12,1,9,587.163656,0.765418
3,7801,1,23,1,0,99,999,1,0,99,...,1954,1,66,2,5,12,1,9,640.64343,0.835133
4,8001,1,35,1,0,99,999,1,0,99,...,1951,12,68,3,3,6,1,9,493.769927,0.643671


#### Data Preprocessing

In [3]:
# Feature engineering: derive the model columns from the raw survey codes.
# Every step assigns its result explicitly. Chained
# `data[col].replace(..., inplace=True)` is avoided because it does not update
# the frame under pandas Copy-on-Write.


def tercile_codes(values):
    """Bin `values` into three equal-frequency groups and return integer codes.

    Returns an array with 0 for the lowest third, 1 for the middle third and
    2 for everything else (the top third).
    NOTE: anything that is in neither of the first two bins falls through to 2,
    which would include a missing value.
    `pd.qcut` raises if the quantile edges are not unique (e.g. when a third or
    more of the values share the minimum).
    """
    bins = pd.qcut(values, 3, labels=['하', '중', '상'])
    return np.where(bins == '하', 0,
                    np.where(bins == '중', 1, 2))


# Gender (0: female, 1: male)
data['Gender'] = np.where(data['RES_SEX'] == 1, 1, 0)

# Age and religion: values are kept as-is, only the column names change.
data = data.rename(columns={'RES_AGE': 'Age', 'D8': 'Religion'})

# Job (0: economically inactive, 1: retired, 2: wage worker - low skill,
#      3: wage worker - mid skill, 4: wage worker - high skill,
#      5: self-employed, 6: unpaid family worker)
# np.select takes the first matching condition, so the order below is the
# priority order; rows matching nothing become NaN.
job_rules = [
    (data['E1_1'].isin([11, 12, 13, 14, 15, 21, 22, 23, 24, 25, 26, 27, 28]), 4),
    (data['E1_1'].isin([31, 32, 33, 39, 61, 62, 63, 72, 73, 74, 75, 76, 77, 78, 79,
                        81, 82, 83, 84, 85, 86, 87, 88, 89, 100]), 3),
    (data['E1_1'].isin([41, 42, 43, 44, 51, 52, 53, 91, 92, 93, 94, 95, 99]), 2),
    (data['E1_3'].isin([3, 4]), 5),
    (data['E1_3'] == 6, 6),
    (data['E1'] == 2, 1),
    (data['E1_1'] == 999, 0),  # presumably the "not applicable" code - verify against the codebook
]
data['Job'] = np.select([rule for rule, _ in job_rules],
                        [code for _, code in job_rules],
                        default=np.nan)

# Marital status (0: not married, 1: married)
# Derived without overwriting RES_MAR so that re-running this cell gives the
# same result (recoding RES_MAR in place would turn every value into 0 on a
# second run).
data['Married'] = data['RES_MAR'].replace([1, 3, 4, 5], 0).replace(2, 1)

# Education (0: elementary, 1: middle school, 2: high school, 3: college or higher)
data['Edu'] = np.where(data['RES_EDU1'] >= 6, 3,
                       np.where(data['RES_EDU1'] == 5, 2,
                                np.where(data['RES_EDU1'] == 4, 1, 0)))

# Health (0: activity limited, 1: mostly not limited, 2: no limitation at all)
data['Health'] = np.where(data['B1'] < 3, 2,
                          np.where(data['B1'] == 3, 1, 0))

# Satisfaction (0: low, 1: medium, 2: high)
# Code 9 is zeroed before summing (presumably a non-response code - TODO confirm).
satisfaction_items = ['H16_1', 'H16_2', 'H16_3', 'H16_4', 'H16_7']
data[satisfaction_items] = data[satisfaction_items].replace(9, 0)

# skipna=False keeps the semantics of adding the columns with `+`.
data['Self_conf_Score'] = data[satisfaction_items].sum(axis=1, skipna=False)
data['Self_conf'] = tercile_codes(data['Self_conf_Score'])

# Income (0: low, 1: medium, 2: high)
# The result of replace() must be assigned back; otherwise the 999998/999999
# codes stay in the column and distort the tercile boundaries.
data['J3b_3_13'] = data['J3b_3_13'].replace([999998, 999999], 0)
data['Earn'] = tercile_codes(data['J3b_3_13'])

# Facility use during the past year (0: not used, 1: used)
# D11_1_1: senior center, D11_1_2: senior welfare center,
# D11_1_3: general welfare center, D11_1_4: senior classes,
# D11_1_5: public leisure/culture facility, D11_1_6: private culture facility
facility_items = ['D11_1_1', 'D11_1_2', 'D11_1_3', 'D11_1_4', 'D11_1_5', 'D11_1_6']
data[facility_items] = data[facility_items].replace(2, 0)

# Number of facility types used (sum of the six 0/1 flags).
data['Activity'] = data[facility_items].sum(axis=1, skipna=False)

# Type of senior job the respondent would like to do in the future (target)
data['target'] = data['E5_1']

In [4]:
# Keep respondents whose Age is at least 69, then narrow the frame to the
# engineered feature columns plus the target.
model_columns = [
    'Gender', 'Age', 'Religion', 'Job', 'Married', 'Edu',
    'Health', 'Self_conf', 'Earn', 'Activity', 'target',
]
is_age_69_plus = data['Age'].ge(69)
data = data[is_age_69_plus][model_columns]

In [5]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() emits an extra "None" line.
data.info()

# Number of missing values per column.
data.isna().sum()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7147 entries, 5 to 10076
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Gender     7147 non-null   int32  
 1   Age        7147 non-null   int64  
 2   Religion   7147 non-null   int64  
 3   Job        7138 non-null   float64
 4   Married    7147 non-null   int64  
 5   Edu        7147 non-null   int32  
 6   Health     7147 non-null   int32  
 7   Self_conf  7147 non-null   int32  
 8   Earn       7147 non-null   int32  
 9   Activity   7147 non-null   int64  
 10  target     7147 non-null   int64  
dtypes: float64(1), int32(5), int64(5)
memory usage: 530.4 KB
None


Gender       0
Age          0
Religion     0
Job          9
Married      0
Edu          0
Health       0
Self_conf    0
Earn         0
Activity     0
target       0
dtype: int64

In [6]:
# Remove every row that still has a missing value in any column.
data = data.dropna()

In [7]:
# Discard rows whose Age is the 999 code or whose target is the 9 code,
# then print a summary of what remains.
has_valid_age = data["Age"] != 999
has_valid_target = data["target"] != 9
data = data[has_valid_age & has_valid_target]

data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1531 entries, 5 to 10071
Data columns (total 11 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   Gender     1531 non-null   int32  
 1   Age        1531 non-null   int64  
 2   Religion   1531 non-null   int64  
 3   Job        1531 non-null   float64
 4   Married    1531 non-null   int64  
 5   Edu        1531 non-null   int32  
 6   Health     1531 non-null   int32  
 7   Self_conf  1531 non-null   int32  
 8   Earn       1531 non-null   int32  
 9   Activity   1531 non-null   int64  
 10  target     1531 non-null   int64  
dtypes: float64(1), int32(5), int64(5)
memory usage: 113.6 KB


In [8]:
# Display the (rows, columns) size of the frame after filtering.
data.shape

(1531, 11)

In [9]:
# Frequency of each Married code in the remaining rows.
data['Married'].value_counts()

1    779
0    752
Name: Married, dtype: int64

In [10]:
# Write the preprocessed frame to CSV as UTF-8, leaving out the row index.
data.to_csv('./data/Elderly.csv', index=False, encoding='utf-8')