## Import libraries

In [46]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import font_manager
%matplotlib inline
import os
import warnings
import glob
warnings.filterwarnings('ignore')

# Use a Hangul-capable font so Korean labels render in matplotlib figures.
# The path is Windows-specific, so it is applied only when the file exists;
# a missing font file would otherwise make this cell fail on other machines.
font_fname = 'C:/Windows/Fonts/malgun.ttf'
if os.path.exists(font_fname):
    font_family = font_manager.FontProperties(fname=font_fname).get_name()
    plt.rcParams["font.family"] = font_family
else:
    # Keep matplotlib's default font; Korean text may not render correctly.
    font_family = None
    print(f'Font file not found: {font_fname}; keeping the default matplotlib font.')

#### Data Load

In [63]:
# Load every wave file (Lt*.csv) of the survey into its own DataFrame.
data_lst = []

# os.path.join picks the platform separator and avoids the invalid "\L"
# escape that the hand-built "\Lt*.csv" pattern contained.
path = os.path.join(os.path.abspath('GAP'), 'New_Aging_Survey', 'data')
# glob returns matches in arbitrary order; sort so that the position in
# data_lst is stable, because a later cell derives the wave number from it.
all_files = sorted(glob.glob(os.path.join(path, 'Lt*.csv')))

for file in all_files:
    # Use the file stem (e.g. "Lt08") rather than a fixed character offset,
    # which only worked for one specific absolute path length.
    var_name = os.path.splitext(os.path.basename(file))[0]
    # Each frame stays reachable both as a global (later cells use Lt08)
    # and through data_lst.
    globals()[var_name] = pd.read_csv(file, low_memory=False)
    data_lst.append(globals()[var_name])

In [3]:
# Rows of Lt08 whose w08A002_age is in [60, 65).
# .copy() makes New_elder an independent frame, so the columns added in later
# cells are not written onto a slice of Lt08 (SettingWithCopy).
New_elder = Lt08[(Lt08['w08A002_age'] >= 60) & (Lt08['w08A002_age'] < 65)].copy()

In [4]:
# Categories as listed by the author: employed, unemployed, economically inactive
New_elder['w08present_ecotype'].value_counts()

1    611
3    497
2     10
Name: w08present_ecotype, dtype: int64

In [5]:
# Categories as listed by the author: not in the labor market, retired, partially retired
# (which code maps to which label is not shown here — confirm against the codebook)
New_elder['w08retired'].value_counts()

1.0    219
0.0    205
2.0     83
Name: w08retired, dtype: int64

In [6]:
# Lower value = needed, higher value = not needed
New_elder['w08d_com077'].value_counts()

2.0    355
3.0    134
1.0     94
4.0     28
Name: w08d_com077, dtype: int64

In [7]:
# 1.0 managers, 2.0 professionals, 3.0 clerical workers, 4.0 service workers, 5.0 sales workers,
# 6.0 skilled agricultural, forestry and fishery workers, 7.0 craft and related trades workers,
# 8.0 plant/machine operators and assemblers, 9.0 elementary (simple labor) workers, -9.0 career military
New_elder['w08job'].value_counts()

 9.0    146
 4.0    107
 7.0     71
 5.0     69
 8.0     68
 6.0     42
 2.0     36
 1.0     26
 3.0     24
-9.0     20
Name: w08job, dtype: int64

In [8]:
# Categories as listed by the author: wage worker, self-employed, unpaid family worker, not working
New_elder['w08emp'].value_counts()

1.0    329
2.0    233
3.0     49
4.0      2
Name: w08emp, dtype: int64

In [9]:
# Categories as listed by the author: college or above, high school, middle school, elementary school
# NOTE(review): confirm the order of these labels against the survey codebook.
New_elder['w08edu'].value_counts()

3    557
2    208
4    183
1    170
Name: w08edu, dtype: int64

In [10]:
# ADL reads the same way as IADL: 0 means no help is needed,
# values toward 7 mean more difficulty in daily living.
print(New_elder['w08adl'].value_counts())
print(New_elder['w08iadl'].value_counts())

0    1113
7       4
5       1
Name: w08adl, dtype: int64
0     1078
2       12
1       10
3       10
10       3
4        1
5        1
6        1
8        1
9        1
Name: w08iadl, dtype: int64


In [11]:
# Health status items; a smaller value is better.
print(New_elder['w08C001'].value_counts())
print(New_elder['w08C005'].value_counts())

3    526
4    414
2     94
5     70
1     14
Name: w08C001, dtype: int64
3    626
4    268
2    203
1     21
Name: w08C005, dtype: int64


In [12]:
# Asset item; per the author, the remaining asset items look the same.
New_elder['w08pnetassets'].value_counts()

1000.0     25
2000.0     24
5000.0     23
500.0      21
20000.0    20
           ..
26500.0     1
10850.0     1
21190.0     1
7900.0      1
3200.0      1
Name: w08pnetassets, Length: 417, dtype: int64

In [13]:
# Subjective expectation item; per the author, the remaining response items look the same.
New_elder['w08G026'].value_counts()

70     312
80     277
60     190
50     156
40      79
90      47
30      29
20       9
0        8
100      6
10       5
Name: w08G026, dtype: int64

#### Data Preprocessing

In [14]:
# present_ecotype: current economic-activity status; retired: retired / partially retired;
# job: occupation class; emp: wage worker / self-employed / unpaid family worker.
# Conditions are checked in order and the first match wins.
job_conditions = [
    New_elder['w08present_ecotype'] == 3,
    New_elder['w08retired'].isin([1.0, 2.0]),
    New_elder['w08job'].isin([1.0, 2.0]),
    # -9.0 (not 9.0) belongs to this group, matching preprocessing(); with 9.0
    # listed here the 9.0 entry of the low-skill group below could never match.
    New_elder['w08job'].isin([3.0, 6.0, 7.0, 8.0, -9.0]),
    New_elder['w08job'].isin([4.0, 5.0, 9.0]),
    New_elder['w08emp'] == 2.0,
    New_elder['w08emp'] == 3.0,
]
job_labels = ['비경활', '은퇴', '임금_고숙련', '임금_중숙련', '임금_저숙련', '자영업', '무급가족종사자']
# default=None keeps unmatched rows as real missing values; mixing np.nan with
# the string labels made NumPy store the text 'nan' instead.
New_elder['Job'] = np.select(job_conditions, job_labels, default=None)

In [15]:
# edu: respondent's education level.
# NOTE(review): the code-to-label mapping below is kept exactly as written;
# confirm it against the survey codebook.
edu_code = New_elder['w08edu']
New_elder['Edu'] = np.select(
    [edu_code == 1, edu_code == 2, edu_code == 3],
    ['대학교 이상', '고등', '중등'],
    default='초등',
)

In [16]:
# adl: ADL index, iadl: IADL index, C001: health status,
# C005: activity limitation caused by health status.
# NOTE(review): the lower cut-off here is < 8 while preprocessing() uses < 7 —
# confirm which one is intended.
health_score = (New_elder['w08adl'] + New_elder['w08iadl']
                + New_elder['w08C001'] + New_elder['w08C005'])
New_elder['Health_Score'] = health_score
print(New_elder['Health_Score'].value_counts())

New_elder['Health'] = np.select(
    [New_elder['Health_Score'] < 8, New_elder['Health_Score'] > 10],
    ['전혀 지장없음', '활동에 제한 있음'],
    default='그렇지 않은 편',
)

6     416
7     404
5     139
8     117
4      17
9      13
10      5
12      2
20      2
23      2
22      1
Name: Health_Score, dtype: int64


In [17]:
# pnetassets: personal net assets, pinc: total personal income in the previous year.
New_elder['Earn_Score'] = New_elder['w08pnetassets'] + New_elder['w08pinc']

# Quartile cut-offs: above the 75% value -> '상', below the 25% value -> '하', otherwise '중'.
earn_stats = New_elder['Earn_Score'].describe()
New_elder['Earn'] = np.select(
    [New_elder['Earn_Score'] > earn_stats['75%'],
     New_elder['Earn_Score'] < earn_stats['25%']],
    ['상', '하'],
    default='중',
)

In [18]:
# G026: own health, G027: own economic situation, G028: relationship with spouse,
# G029: relationship with children, G030: overall quality of life.
conf_score = (New_elder['w08G026'] + New_elder['w08G027'] + New_elder['w08G028']
              + New_elder['w08G029'] + New_elder['w08G030'])
New_elder['Self_conf_Score'] = conf_score
New_elder['Self_conf'] = np.select(
    [New_elder['Self_conf_Score'] > 350, New_elder['Self_conf_Score'] < 200],
    ['상', '하'],
    default='중',
)

In [19]:
# Keep only the derived categorical columns (this rebinds New_elder, so the raw
# w08* columns are no longer available on it afterwards).
derived_cols = ['Job', 'Edu', 'Health', 'Earn', 'Self_conf']
New_elder = New_elder[derived_cols]

In [21]:
def preprocessing(data, period):
    """Derive the categorical analysis columns for one survey wave.

    Rows are restricted to respondents whose birth-year column
    (``w{period}A002y``) is in [1959, 1964). Six label columns are then
    derived from the wave's raw columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw wave table containing the ``w{period}*`` columns read below.
        It is not modified.
    period : str
        Wave suffix used inside the column names, e.g. ``'08'``.

    Returns
    -------
    pandas.DataFrame
        New frame with columns ``Gender, Job, Edu, Health, Earn, Self_conf``
        and a fresh 0..n-1 index.
    """
    prefix = f'w{period}'

    # data = data[(data[f'w{period}A002_age'] >= 60) & (data[f'w{period}A002_age'] < 65)]
    birth_year = data[f'{prefix}A002y']
    # .copy(): the assignments below must act on an independent frame, not on
    # a slice of the caller's DataFrame (avoids SettingWithCopy issues).
    data = data[(birth_year >= 1959) & (birth_year < 1964)].copy()

    data['Gender'] = np.where(data[f'{prefix}gender1'] == 1, '남성', '여성')

    # Conditions are checked in order; the first match wins and anything
    # unmatched falls through to the default label.
    job = data[f'{prefix}job']
    data['Job'] = np.select(
        [
            data[f'{prefix}present_ecotype'] == 3,
            data[f'{prefix}retired'].isin([1.0, 2.0]),
            job.isin([1.0, 2.0]),
            job.isin([3.0, 6.0, 7.0, 8.0, -9.0]),
            job.isin([4.0, 5.0, 9.0]),
            data[f'{prefix}emp'] == 2.0,
        ],
        ['비경활', '은퇴', '임금_고숙련', '임금_중숙련', '임금_저숙련', '자영업'],
        default='무급가족종사자',
    )

    edu = data[f'{prefix}edu']
    data['Edu'] = np.select(
        [edu == 1, edu == 2, edu == 3],
        ['대학교 이상', '고등', '중등'],
        default='초등',
    )

    data['Health_Score'] = (data[f'{prefix}adl'] + data[f'{prefix}iadl']
                            + data[f'{prefix}C001'] + data[f'{prefix}C005'])
    data['Health'] = np.select(
        [data['Health_Score'] < 7, data['Health_Score'] > 10],
        ['전혀 지장없음', '활동에 제한 있음'],
        default='그렇지 않은 편',
    )

    data['Earn_Score'] = data[f'{prefix}pnetassets'] + data[f'{prefix}pinc']
    # Quartiles are computed once, on the filtered rows only.
    earn_q1 = data['Earn_Score'].quantile(0.25)
    earn_q3 = data['Earn_Score'].quantile(0.75)
    data['Earn'] = np.select(
        [data['Earn_Score'] > earn_q3, data['Earn_Score'] < earn_q1],
        ['상', '하'],
        default='중',
    )

    data['Self_conf_Score'] = (data[f'{prefix}G026'] + data[f'{prefix}G027']
                               + data[f'{prefix}G028'] + data[f'{prefix}G029']
                               + data[f'{prefix}G030'])
    data['Self_conf'] = np.select(
        [data['Self_conf_Score'] > 350, data['Self_conf_Score'] < 200],
        ['상', '하'],
        default='중',
    )

    result = data[['Gender', 'Job', 'Edu', 'Health', 'Earn', 'Self_conf']]
    return result.reset_index(drop=True)

In [22]:
def to_int(data):
    """Replace the label columns of ``data`` with integer codes.

    For each column, the listed labels are encoded as 0, 1, 2, ... in the
    order given below; every other value receives the next integer (the
    number of listed labels). The columns are overwritten on ``data`` itself
    and the same object is returned.
    """
    # column -> labels in code order; anything not listed gets len(labels)
    label_order = {
        'Gender': ['남성'],
        'Job': ['비경활', '은퇴', '임금_고숙련', '임금_중숙련', '임금_저숙련', '자영업'],
        'Edu': ['초등', '중등', '고등'],
        'Health': ['활동에 제한 있음', '그렇지 않은 편'],
        'Earn': ['하', '중'],
        'Self_conf': ['하', '중'],
    }
    for column, labels in label_order.items():
        current = data[column]
        data[column] = np.select(
            [current == label for label in labels],
            list(range(len(labels))),
            default=len(labels),
        )
    return data

In [66]:
# Preprocess every loaded wave and expose the results as data_1, data_2, ...
# Assumes data_lst is ordered by wave, so position i corresponds to wave i + 1.
for idx, df in enumerate(data_lst):
    # Zero-pad to two digits: '01'..'09' as before, and '10' (not '010')
    # from the tenth wave onward.
    globals()['data_{}'.format(idx+1)] = preprocessing(df, f'{idx + 1:02d}')

In [75]:
# Write each preprocessed frame to ./data as a UTF-8 CSV without the index.
# NOTE(review): enumerate starts at 0, so data_1 is written to data_0.csv,
# data_2 to data_1.csv, and so on — confirm this offset is intended.
aft_lst = [data_1, data_2, data_3, data_4,
           data_5, data_6, data_7, data_8]
for idx, data in enumerate(aft_lst):
    csv_path = f'./data/data_{idx}.csv'
    data.to_csv(csv_path, index=False, encoding='utf-8')