## Import Libraries

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import font_manager
%matplotlib inline
import os
import warnings
import glob
warnings.filterwarnings('ignore')

# Use the font stored in malgun.ttf as Matplotlib's default family
# (presumably so that the Korean labels used below render in plots).
font_fname = 'C:/Windows/Fonts/malgun.ttf'
font_props = font_manager.FontProperties(fname=font_fname)
font_family = font_props.get_name()
plt.rc('font', family=font_family)

#### Data Load

In [7]:
# Load every file matching data/Lt*.csv. Each frame is published as a
# module-level variable named after its file (e.g. Lt08.csv -> Lt08) and is
# also appended to data_lst.
data_lst = []

path = os.path.abspath('data')
# os.path.join avoids the invalid "\L" escape sequence and the Windows-only
# separator; sorting makes the order of data_lst deterministic, because
# glob.glob() returns matches in arbitrary order.
all_files = sorted(glob.glob(os.path.join(path, 'Lt*.csv')))

for file in all_files:
    # Take the variable name from the file name itself instead of a
    # hard-coded character offset that only fits one absolute directory.
    var_name = os.path.splitext(os.path.basename(file))[0]
    globals()[var_name] = pd.read_csv(file, low_memory=False)
    data_lst.append(globals()[var_name])

#### Data Preprocessing

In [8]:
# Keep the wave-08 respondents whose w08A002y value (presumably the birth
# year) lies in [1957, 1962).
# .copy() makes New_elder an independent frame, so the column assignments in
# the following cells do not write into a slice of Lt08.
New_elder = Lt08[(Lt08['w08A002y'] >= 1957) & (Lt08['w08A002y'] < 1962)].copy()

In [9]:
# present_ecotype: current economic-activity status (1: employed, 2: unemployed, 3: economically inactive)
# retired: retirement / partial retirement (0: not in the labor market, 1: retired, 2: partially retired)
# job: occupation class (1: managers, 2: professionals, 3: clerical, 4: service workers, 5: sales workers,
#      6: skilled agricultural/forestry/fishery workers, 7: craft workers, 8: machine operators,
#      9: elementary occupations, -9: professional soldiers)
# emp: employment type (1: wage worker, 2: self-employed, 3: unpaid family worker, 4: not working)
#
# Rules are listed from highest to lowest priority; the first matching rule
# decides the label, and rows matching no rule stay missing (NaN).
_job_code = New_elder['w08job']
_job_rules = [
    ('비경활', New_elder['w08present_ecotype'] == 3),
    ('은퇴', New_elder['w08retired'].isin([1.0, 2.0])),
    ('임금_고숙련', _job_code.isin([1.0, 2.0])),
    # -9 (professional soldiers) belongs here, matching preprocessing().
    # The previous list contained 9.0 instead, which shadowed the
    # low-skilled rule for elementary occupations.
    ('임금_중숙련', _job_code.isin([3.0, 6.0, 7.0, 8.0, -9.0])),
    ('임금_저숙련', _job_code.isin([4.0, 5.0, 9.0])),
    ('자영업', New_elder['w08emp'] == 2.0),
    ('무급가족종사자', New_elder['w08emp'] == 3.0),
]
# An object Series keeps unmatched rows as real missing values; np.nan mixed
# into a string np.where does not produce one.
_job_label = pd.Series(np.nan, index=New_elder.index, dtype=object)
# Apply the rules in reverse so higher-priority labels overwrite lower ones.
for _label, _mask in reversed(_job_rules):
    _job_label[_mask.to_numpy()] = _label
New_elder['Job'] = _job_label.to_numpy()

In [10]:
# edu: respondent's education level (4: elementary, 3: middle school, 2: high school, 1: college or above)
# Any value other than 1, 2 or 3 (including missing) falls through to '초등'.
_edu_code = New_elder['w08edu']
New_elder['Edu'] = np.select(
    [_edu_code == 1, _edu_code == 2, _edu_code == 3],
    ['대학교 이상', '고등', '중등'],
    default='초등',
)

In [11]:
# C005: whether health interferes with work (1: strongly agree ... 4: strongly disagree)
# Codes 1-2 are merged; anything other than 1, 2 or 3 falls through to '전혀 지장 없음'.
_limit_code = New_elder['w08C005']
New_elder['Health'] = np.select(
    [_limit_code.isin([1, 2]), _limit_code == 3],
    ['활동에 제한 있음', '그렇지 않은 편'],
    default='전혀 지장 없음',
)

In [12]:
# hhinc: total household income, hhsize: number of household members
# Income is divided by the square root of household size, then cut into
# three equal-frequency groups labelled low / middle / high.
_size_root = np.sqrt(New_elder['w08hhsize'])
New_elder['Earn_Score'] = New_elder['w08hhinc'].div(_size_root)
New_elder['Earn'] = pd.qcut(New_elder['Earn_Score'], q=3, labels=['하', '중', '상'])

In [13]:
# G026: own health, G027: own economic situation, G028: relationship with spouse,
# G029: relationship with children, G030: overall quality of life (each a 0-100 score)
# The five scores are added left to right, so a missing item makes the total
# missing; a missing total ends up in the middle group.
_total = New_elder['w08G026']
for _item in ('w08G027', 'w08G028', 'w08G029', 'w08G030'):
    _total = _total + New_elder[_item]
New_elder['Self_conf_Score'] = _total
New_elder['Self_conf'] = np.select(
    [New_elder['Self_conf_Score'] > 350, New_elder['Self_conf_Score'] < 200],
    ['상', '하'],
    default='중',
)

In [14]:
# Keep only the derived categorical columns.
_selected_columns = ['Job', 'Edu', 'Health', 'Earn', 'Self_conf']
New_elder = New_elder[_selected_columns]

In [15]:
def preprocessing(data, period):
    """Build the categorical analysis table for one survey wave.

    The input frame is left untouched: no columns are added to it and no
    rows are dropped from it.

    Args:
        data: DataFrame holding the wave's raw columns, named
            ``w{period}<variable>`` (hhinc, hhsize, A002y, gender1,
            present_ecotype, retired, job, emp, edu, C005, G026..G030).
        period: Wave identifier used in the column prefix, e.g. ``'08'``.

    Returns:
        A new DataFrame with a fresh 0..n-1 index and the columns
        ``Gender``, ``Job``, ``Edu``, ``Health``, ``Earn`` (categorical) and
        ``Self_conf``. It contains only rows whose income group could be
        computed and whose ``A002y`` value lies in [1955, 1964).

    Raises:
        KeyError: If one of the required wave columns is missing.
    """
    prefix = f'w{period}'

    # Income divided by the square root of household size, cut into three
    # equal-frequency groups. The cut points come from the whole input frame,
    # i.e. they are computed before the A002y filter is applied.
    earn_score = data[prefix + 'hhinc'] / np.sqrt(data[prefix + 'hhsize'])
    earn = pd.qcut(earn_score, 3, labels=['하', '중', '상'])

    year = data[prefix + 'A002y']
    keep = (earn.notna() & (year >= 1955) & (year < 1964)).to_numpy()
    rows = data.loc[keep]

    gender = np.where(rows[prefix + 'gender1'] == 1, '남성', '여성')

    # First matching rule wins.
    # NOTE(review): every row that matches no rule is labelled
    # '무급가족종사자', including rows whose emp code is not 3 — confirm
    # that this catch-all is intended.
    job_code = rows[prefix + 'job']
    job = np.select(
        [
            rows[prefix + 'present_ecotype'] == 3,
            rows[prefix + 'retired'].isin([1.0, 2.0]),
            job_code.isin([1.0, 2.0]),
            job_code.isin([3.0, 6.0, 7.0, 8.0, -9.0]),
            job_code.isin([4.0, 5.0, 9.0]),
            rows[prefix + 'emp'] == 2.0,
        ],
        ['비경활', '은퇴', '임금_고숙련', '임금_중숙련', '임금_저숙련', '자영업'],
        default='무급가족종사자',
    )

    edu_code = rows[prefix + 'edu']
    edu = np.select(
        [edu_code == 1, edu_code == 2, edu_code == 3],
        ['대학교 이상', '고등', '중등'],
        default='초등',
    )

    limit_code = rows[prefix + 'C005']
    health = np.select(
        [limit_code.isin([1, 2]), limit_code == 3],
        ['활동에 제한 있음', '그렇지 않은 편'],
        default='전혀 지장 없음',
    )

    # Plain addition: a missing item makes the total missing, which then
    # falls into the middle group.
    conf_score = sum(
        rows[prefix + item]
        for item in ('G026', 'G027', 'G028', 'G029', 'G030')
    )
    self_conf = np.select(
        [conf_score > 350, conf_score < 200],
        ['상', '하'],
        default='중',
    )

    # Every value below is a plain array of the same length, so the result
    # gets a fresh RangeIndex; .array keeps the categorical dtype of Earn.
    return pd.DataFrame({
        'Gender': gender,
        'Job': job,
        'Edu': edu,
        'Health': health,
        'Earn': earn[keep].array,
        'Self_conf': self_conf,
    })

In [16]:
def to_int(data):
    """Replace the categorical labels of the analysis table with integer codes.

    The columns ``Gender``, ``Job``, ``Edu``, ``Health``, ``Earn`` and
    ``Self_conf`` are overwritten in place. Within each column, any value
    that is not one of the listed labels receives that column's fallback code.

    Args:
        data: DataFrame containing the six label columns.

    Returns:
        The same DataFrame object, now holding integer codes.
    """
    # column -> (label-to-code table, code for every other value)
    codebook = {
        'Gender': ({'남성': 0}, 1),
        'Job': (
            {
                '비경활': 0,
                '은퇴': 1,
                '임금_고숙련': 4,
                '임금_중숙련': 3,
                '임금_저숙련': 2,
                '자영업': 5,
            },
            6,
        ),
        'Edu': ({'초등': 0, '중등': 1, '고등': 2}, 3),
        'Health': ({'활동에 제한 있음': 0, '그렇지 않은 편': 1}, 2),
        'Earn': ({'하': 0, '중': 1}, 2),
        'Self_conf': ({'하': 0, '중': 1}, 2),
    }
    for column, (codes, fallback) in codebook.items():
        labels = data[column]
        matches = [(labels == label).to_numpy() for label in codes]
        data[column] = np.select(matches, list(codes.values()), default=fallback)
    return data

In [17]:
# Only wave 08 is processed here. (An earlier, disabled version of this cell
# looped over data_lst and called preprocessing(df, '0' + str(idx + 1)) for
# every frame, storing the results as data_1, data_2, ...)
data = preprocessing(data=Lt08, period='08')

In [18]:
# Write the wave-08 table to disk as UTF-8 CSV without the index column.
# (An earlier, disabled version of this cell wrote one data_{idx}.csv file
# per processed wave.)
data.to_csv('./data/data_08.csv', index=False, encoding='utf-8')