# 사용자 설문 데이터 - 성별 및 연령대 별 더미 데이터 생성

In [1]:
import numpy as np
import pandas as pd
import random
import re

# 경고 메시지를 무시
import warnings
warnings.filterwarnings(action='ignore')

In [2]:
# 모듈화한 list 혹은 dict 불러오기
from com_code import survey_function_com_codes, survey_disease_com_codes, survey_allergy_com_codes
from dummy_survey_variable import sex_age_group_ratings, adult_function_weight, pregnancy_ratings, function_rankings

print(survey_function_com_codes)
print(survey_disease_com_codes)
print(survey_allergy_com_codes)

print(sex_age_group_ratings)
print(adult_function_weight)
print(pregnancy_ratings)
print(function_rankings)

['HF00', 'HF01', 'HF02', 'HF03', 'HF04', 'HF05', 'HF06', 'HF07', 'HF08', 'HF09', 'HF10', 'HF11', 'HF12', 'HF13', 'HF14', 'HF15', 'HF16', 'HF17', 'HF18', 'HF19', 'HF20', 'HF21', 'HF22', 'HF23', 'HF24', 'HF25']
['DI00', 'DI01', 'DI02', 'DI03', 'DI04', 'DI05', 'DI06', 'DI07', 'DI08', 'DI09', 'DI10', 'DI11', 'DI12', 'DI13', 'DI14', 'DI15', 'DI16', 'DI17']
['AL00', 'AL01', 'AL02', 'AL03', 'AL04', 'AL05', 'AL06', 'AL07', 'AL08', 'AL09', 'AL10', 'AL11', 'AL12', 'AL13', 'AL14', 'AL15', 'AL16', 'AL17', 'AL18', 'AL19', 'AL20']
[{'survey_sex': 'm', 'survey_age_group': 0, 'survey_pregnancy_code': 0.0, 'survey_operation_code': 0.01, 'survey_alcohol_code': 0.0, 'survey_smoking_code': 0.0, 'HF01': 0.0, 'HF02': 0.0, 'HF03': 0.0, 'HF04': 0.1, 'HF05': 0.0, 'HF06': 0.0, 'HF07': 0.2, 'HF08': 0.0, 'HF09': 0.0, 'HF10': 0.5, 'HF11': 0.0, 'HF12': 0.0, 'HF13': 0.0, 'HF14': 0.0, 'HF15': 0.0, 'HF16': 0.0, 'HF17': 0.0, 'HF18': 0.0, 'HF19': 0.0, 'HF20': 0.5, 'HF21': 0.8, 'HF22': 0.0, 'HF23': 0.0, 'HF24': 0.0, 'AL0

## 1. 성별 및 연령대 별 더미 데이터 생성 비율 값을 가진 데이터프레임 생성

In [3]:
def create_dummy_rating_df(sex_age_group_ratings, adult_function_weight):
    # 더미 데이터 생성 비율 데이터프레임 생성
    dummy_rating_df = pd.DataFrame(sex_age_group_ratings)

    # 성인일 경우 건강기능 비율에 가중치 적용
    for idx, row in dummy_rating_df.iterrows():
        if row['survey_age_group'] > 10:
            for col, weight in adult_function_weight.items():
                dummy_rating_df.loc[idx, col] = row[col] * weight
                
    return dummy_rating_df

In [4]:
# 함수 적용 및 확인
dummy_rating_df = create_dummy_rating_df(sex_age_group_ratings, adult_function_weight)
dummy_rating_df

Unnamed: 0,survey_sex,survey_age_group,survey_pregnancy_code,survey_operation_code,survey_alcohol_code,survey_smoking_code,HF01,HF02,HF03,HF04,...,DI08,DI09,DI10,DI11,DI12,DI13,DI14,DI15,DI16,DI17
0,m,0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.1,...,0.0,0.01,0.01,0.005,0.01,0.01,0.001,0.001,0.001,0.0
1,m,10,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.05,...,0.0,0.01,0.01,0.005,0.01,0.01,0.001,0.001,0.001,0.0
2,m,20,0.0,0.009,0.631,0.376,0.0297,0.29316,0.0,0.002254,...,0.0,0.022,0.023,0.004,0.009,0.109,0.001,0.001,0.001,0.0
3,m,30,0.0,0.016,0.748,0.427,0.03765,0.323174,0.022545,0.001633,...,0.003,0.017,0.05,0.01,0.009,0.097,0.001,0.001,0.001,0.0
4,m,40,0.0,0.016,0.748,0.427,0.03765,0.323174,0.022545,0.001633,...,0.011,0.021,0.023,0.014,0.009,0.104,0.005,0.001,0.001,0.0
5,m,50,0.0,0.028,0.713,0.369,0.03675,0.252676,0.027555,0.001449,...,0.026,0.03,0.017,0.025,0.016,0.099,0.005,0.001,0.001,0.0
6,m,60,0.0,0.075,0.667,0.286,0.03555,0.155654,0.1002,0.001219,...,0.109,0.022,0.029,0.037,0.016,0.074,0.022,0.001,0.001,0.0
7,f,0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.1,...,0.0,0.01,0.01,0.005,0.01,0.01,0.001,0.001,0.001,0.0
8,f,10,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.05,...,0.0,0.01,0.01,0.005,0.01,0.01,0.001,0.001,0.001,0.0
9,f,20,0.014,0.015,0.612,0.094,0.0243,0.302932,0.0,0.001242,...,0.0,0.025,0.03,0.004,0.012,0.111,0.001,0.001,0.001,0.05


In [5]:
dummy_rating_df = pd.DataFrame(sex_age_group_ratings)

# 성인일 경우 건강기능 비율에 가중치 적용
for idx, row in dummy_rating_df.iterrows():
    if row['survey_age_group'] > 10:
        for col, weight in adult_function_weight.items():
            dummy_rating_df.loc[idx, col] = row[col] * weight
            
dummy_rating_df

Unnamed: 0,survey_sex,survey_age_group,survey_pregnancy_code,survey_operation_code,survey_alcohol_code,survey_smoking_code,HF01,HF02,HF03,HF04,...,DI08,DI09,DI10,DI11,DI12,DI13,DI14,DI15,DI16,DI17
0,m,0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.1,...,0.0,0.01,0.01,0.005,0.01,0.01,0.001,0.001,0.001,0.0
1,m,10,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.05,...,0.0,0.01,0.01,0.005,0.01,0.01,0.001,0.001,0.001,0.0
2,m,20,0.0,0.009,0.631,0.376,0.0297,0.29316,0.0,0.002254,...,0.0,0.022,0.023,0.004,0.009,0.109,0.001,0.001,0.001,0.0
3,m,30,0.0,0.016,0.748,0.427,0.03765,0.323174,0.022545,0.001633,...,0.003,0.017,0.05,0.01,0.009,0.097,0.001,0.001,0.001,0.0
4,m,40,0.0,0.016,0.748,0.427,0.03765,0.323174,0.022545,0.001633,...,0.011,0.021,0.023,0.014,0.009,0.104,0.005,0.001,0.001,0.0
5,m,50,0.0,0.028,0.713,0.369,0.03675,0.252676,0.027555,0.001449,...,0.026,0.03,0.017,0.025,0.016,0.099,0.005,0.001,0.001,0.0
6,m,60,0.0,0.075,0.667,0.286,0.03555,0.155654,0.1002,0.001219,...,0.109,0.022,0.029,0.037,0.016,0.074,0.022,0.001,0.001,0.0
7,f,0,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.1,...,0.0,0.01,0.01,0.005,0.01,0.01,0.001,0.001,0.001,0.0
8,f,10,0.0,0.01,0.0,0.0,0.0,0.0,0.0,0.05,...,0.0,0.01,0.01,0.005,0.01,0.01,0.001,0.001,0.001,0.0
9,f,20,0.014,0.015,0.612,0.094,0.0243,0.302932,0.0,0.001242,...,0.0,0.025,0.03,0.004,0.012,0.111,0.001,0.001,0.001,0.05


## 2. 성별 및 연령대 별 더미 데이터 데이터프레임 생성
- `dummy_rating_df`의 비율을 활용

In [6]:
# 이진 값으로 더미 데이터를 생성하는 함수 정의
# 1: 값이 있음, 0 : 값이 없음
def ratelist(total, rate):
    li = []

    o = 1
    x = 0

    _rate = rate
    _count = total

    for i in range(1,_count):
        if i <= _rate * _count:
            li.append(o)
        else:
            li.append(x)
    
    random.shuffle(li)
    return li

In [7]:
dummy_cols = ['survey_sex', 'survey_age_group', 'survey_pregnancy_code', 'survey_operation_code', 'survey_alcohol_code', 'survey_smoking_code']
dummy_cols += survey_function_com_codes + survey_disease_com_codes + survey_allergy_com_codes

check_dummy_cols = ['survey_pregnancy_code', 'survey_operation_code', 'survey_alcohol_code', 'survey_smoking_code',
                     'HF01', 'HF02', 'HF03', 'HF04', 'HF05', 'HF06', 'HF07', 'HF08', 'HF09', 'HF10', 'HF11', 'HF12', 
                     'HF13', 'HF14', 'HF15', 'HF16', 'HF17', 'HF18', 'HF19', 'HF20', 'HF21', 'HF22', 'HF23', 'HF24', 
                     'AL01', 'AL02', 'AL03', 'AL04', 'AL05', 'AL06', 'AL07', 'AL08', 'AL09', 'AL10', 
                     'AL11', 'AL12', 'AL13', 'AL14', 'AL15', 'AL16', 'AL17', 'AL18', 'AL19', 'AL20',
                     'DI01', 'DI02', 'DI03', 'DI04', 'DI05', 'DI06', 'DI07', 'DI08', 'DI09', 'DI10',
                     'DI11', 'DI12', 'DI13', 'DI14', 'DI15', 'DI16', 'DI17']

In [8]:
# 성별 및 연령대 별로 코드 비율에 따라 random 값을 생성하여 사용자 설문 데이터 프레임 생성하는 함수 정의
def create_survey_df(dummy_rating_df, dummy_cols, check_dummy_cols, sample_num):
    global pregnancy_ratings, survey_function_com_codes, survey_disease_com_codes, survey_allergy_com_codes
    
    # 1. 성별 및 연령대 별로 코드별 비율에 맞춰 랜덤으로 더미 데이터 생성 => dummy_survey_df 데이터프레임 생성 
    dummy_survey_df = pd.DataFrame(columns=dummy_cols)

    for idx, row in dummy_rating_df.iterrows():
        dummy_df = pd.DataFrame(columns=dummy_cols)
        
        # 성별, 연령대별 비율만큼 건강기능 및 주의사항 코드 값을 0 또는 1로 생성
        for col in check_dummy_cols:
            dummy_df[col] = ratelist(sample_num + 1, row[col]) 
        
        # 비율에 해당하는 성별, 연령대 값 넣기    
        dummy_df.loc[dummy_df['survey_sex'].isnull(), 'survey_sex'] = row['survey_sex'] 
        dummy_df.loc[dummy_df['survey_age_group'].isnull(), 'survey_age_group'] = row['survey_age_group'] 
        
        # 각 성별 및 연령대별 sample_num개의 샘플 데이터프레임을 하나로 합치기
        dummy_survey_df = pd.concat([dummy_survey_df, dummy_df], ignore_index=True)

        # 'HF25' (임신/태아) 컬럼값 설정 - 'survey_pregnancy_code' 값이 0인 rows는 'HF25' 값은 0
        dummy_survey_df.loc[dummy_survey_df['survey_pregnancy_code'] == 0, 'HF25'] = 0
    
    # 2. 'survey_pregnancy_code' 값이 1인 row만 'HF25', 알콜 여부, 흡연 여부 데이터를 비율에 맞게 다시 생성
    pregnancy_mask = dummy_survey_df['survey_pregnancy_code'] == 1  
    for col, rating in pregnancy_ratings.items():
        dummy_survey_df.loc[pregnancy_mask, col] = ratelist(len(dummy_survey_df[pregnancy_mask]) + 1, rating)

    return dummy_survey_df

In [9]:
# 함수 적용
dummy_survey_df = create_survey_df(dummy_rating_df, dummy_cols, check_dummy_cols, 1000)
dummy_survey_df

Unnamed: 0,survey_sex,survey_age_group,survey_pregnancy_code,survey_operation_code,survey_alcohol_code,survey_smoking_code,HF00,HF01,HF02,HF03,...,AL11,AL12,AL13,AL14,AL15,AL16,AL17,AL18,AL19,AL20
0,m,0,0,0,0,0,,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,m,0,0,0,0,0,,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,m,0,0,0,0,0,,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,m,0,0,0,0,0,,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,m,0,0,0,0,0,,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13995,f,60,0,0,0,0,,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13996,f,60,0,0,0,0,,0,0,1,...,0,0,0,0,0,0,0,0,0,0
13997,f,60,0,0,1,0,,0,1,1,...,0,0,0,0,0,0,0,0,0,0
13998,f,60,0,0,0,0,,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### 코드성 컬럼 값 수정
- 코드 컬럼 중 사용자 설문에서 선택한 값이 모두 0일 경우 해당사항없음을 의미하는 zero 코드 컬럼값을 1로 지정, 아닐 경우 0으로 지정
- 건강기능 코드가 5개 초과 값을 가지는 경우, 해당 컬럼에 대해 랜덤하게 초과된 컬럼값을 0으로 변경하여 값이 1인 경우가 5개로 수정

In [10]:
# 코드 컬럼 중 선택한 값이 없거나(모두 0), 건강기능 코드가 5개 초과 값을 가지는 경우 값을 변경하는 함수 정의
def update_code_col(df, update_col, check_cols, zero_code, count_check_cols=None):
    for idx, row in df.iterrows():
        cols_to_check = [code for code in check_cols if code != zero_code]
        # 1. 건강기능으로 값이 1인 경우가 5개 초과인 컬럼 찾기 -> 해당 컬럼에 대해 랜덤하게 0으로 변경하여 값이 1인 경우가 5개로 수정
        if count_check_cols:
            exceed_columns = []
            if row[cols_to_check].sum() > 5:
                for col in survey_function_com_codes:
                    if row[col] == 1:
                        exceed_columns.append(col)
                        
                indices_to_change = row[cols_to_check][row[cols_to_check] == 1].sample(frac=1).index[:row[cols_to_check].sum() - 5]
                df.loc[idx, indices_to_change] = 0
        
        # 2. 코드값이 모두 0일 경우 zero 코드(해당사항 없음)에 값을 1로 설정            
        all_zero = all(row[col] == 0 for col in cols_to_check)
        df.loc[idx, update_col] = 1 if all_zero else 0
    return df

In [11]:
# 건강기능, 기저질환, 알레르기 관련 코드 컬럼에 대해 함수 적용
dummy_survey_df = update_code_col(dummy_survey_df, 'HF00', survey_function_com_codes, 'HF00', count_check_cols=True)
dummy_survey_df = update_code_col(dummy_survey_df, 'AL00', survey_allergy_com_codes, 'AL00')
dummy_survey_df = update_code_col(dummy_survey_df, 'DI00', survey_disease_com_codes, 'DI00')

In [12]:
# 확인
dummy_survey_df.loc[dummy_survey_df['HF00'] == 1, survey_function_com_codes] # 1947 행
dummy_survey_df.loc[dummy_survey_df['AL00'] == 1, survey_allergy_com_codes] # 13645 행
dummy_survey_df.loc[dummy_survey_df['DI00'] == 1, survey_disease_com_codes] # 5682 행

Unnamed: 0,DI00,DI01,DI02,DI03,DI04,DI05,DI06,DI07,DI08,DI09,DI10,DI11,DI12,DI13,DI14,DI15,DI16,DI17
0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13879,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
13932,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
13934,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
13937,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [13]:
# 건강기능으로 값이 1인 경우가 5개 초과인 컬럼이 없는지 확인
check_idxs = []
for idx, row in dummy_survey_df.iterrows():
    if row[survey_function_com_codes].sum() > 5:
        print('5개 초과인 경우가 존재 O')
        check_idxs.append(idx)

if not check_idxs:
    print('5개 초과인 경우가 존재 X')

5개 초과인 경우가 존재 X


### 코드값 업데이트
- DB에 넣을 용으로 코드 값 업데이트

In [14]:
def update_row(row):  
    # 임신 상태 업데이트
    if row['survey_pregnancy_code'] == 0:
        row['survey_pregnancy_code'] = 'P0'
    else:
        row['survey_pregnancy_code'] = f'P{random.choices([1, 2, 3], [0.05, 0.05, 0.9])[0]}'
    
    # 수술 상태 업데이트
    if row['survey_operation_code'] == 0:
        row['survey_operation_code'] = 'O0'
    else:
        row['survey_operation_code'] = f'O{random.choices([1, 2, 3, 9], [0.04, 0.45, 0.45, 0.01])[0]}' # 'O9' 추가 - 비율 수정
    
    # 음주 상태 업데이트
    if row['survey_alcohol_code'] == 0:
        row['survey_alcohol_code'] = 'A0'
    else:
        row['survey_alcohol_code'] = f'A{random.choices([1, 2, 3, 9], [0.5, 0.3, 0.15, 0.05])[0]}' # 'O9' 추가 - 비율 수정
    
    # 흡연 상태 업데이트
    if row['survey_smoking_code'] == 0:
        row['survey_smoking_code'] = 'S0'
    else:
        row['survey_smoking_code'] = f'S{random.choices([1, 9], [0.95, 0.05])[0]}' # 코드 컬럼으로 수정, 'S9' 추가 - 비율 생성
    
    return row

In [15]:
# 함수 적용
dummy_survey_df = dummy_survey_df.apply(update_row, axis=1)

### 다중 값을 가지는 코드성 컬럼에 대해 JSON 컬럼 생성
- 건강기능, 기저질환, 알레르기 코드 관련
- 건강기능 코드의 경우 선택한 건강기능(고민)의 순위에 대한 내용으로 넣는다.
  - 순위는 random 하게 생성

In [16]:
def create_json_survey_code_col(df, code_list, code_json_col, com_code_grp, function_rankings=None):
    # 코드 JSON 컬럼 데이터 생성
    # key : 순위, value : 코드(코드컬럼명)
    code_json = []
    for idx, row in df.iterrows():
        code_dict_data = {}
        if (com_code_grp == 'FUNCTION') and (function_rankings):
            check_code_list = row[code_list][row[code_list] == 1].sample(frac=1).index.tolist() # 순위 무작위 index
            for i, code in enumerate(check_code_list):
                code_dict_data[function_rankings[i]] = code
        else:
            code_dict_data[com_code_grp] = code_list
            
        code_json.append(code_dict_data)
    
    # 코드 JSON 컬럼 생성
    df[code_json_col] = code_json
    
    return df

In [17]:
# 함수 적용
dummy_survey_df = create_json_survey_code_col(dummy_survey_df, survey_function_com_codes, 'survey_function_code', com_code_grp='FUNCTION', function_rankings=function_rankings)
dummy_survey_df = create_json_survey_code_col(dummy_survey_df, survey_allergy_com_codes, 'survey_allergy_code', com_code_grp='ALLERGY')
dummy_survey_df = create_json_survey_code_col(dummy_survey_df, survey_disease_com_codes, 'survey_disease_code', com_code_grp='DISEASE')

In [18]:
# 확인
dummy_survey_df[survey_function_com_codes +  ['survey_function_code']][4000:4010]

Unnamed: 0,HF00,HF01,HF02,HF03,HF04,HF05,HF06,HF07,HF08,HF09,...,HF17,HF18,HF19,HF20,HF21,HF22,HF23,HF24,HF25,survey_function_code
4000,0,0,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,"{'1st': 'HF17', '2nd': 'HF24', '3rd': 'HF02'}"
4001,0,1,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,1,0,"{'1st': 'HF05', '2nd': 'HF01', '3rd': 'HF18', ..."
4002,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,{'1st': 'HF06'}
4003,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,{'1st': 'HF02'}
4004,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,{'1st': 'HF00'}
4005,0,0,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"{'1st': 'HF05', '2nd': 'HF02'}"
4006,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,{'1st': 'HF00'}
4007,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,{'1st': 'HF05'}
4008,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,{'1st': 'HF02'}
4009,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,{'1st': 'HF20'}


## `survey_id` 컬럼 생성
- PK로 사용될 `survey_id` 컬럼 생성

In [19]:
dummy_survey_df['survey_id'] = [id for id in range(1, len(dummy_survey_df) + 1)]

## 그 외 컬럼 생성
- 성별 및 연령대 컬럼을 기준으로 'user_id', 'profile_id', 'survey_height', 'survey_weight' 값 생성
- 'survey_created_at' 생성

- 질병관리청 국민건강영양조사 2022 자료를 바탕으로 user_id, profill_id 생성

In [95]:
# csv 파일 데이터 로드
data_path = 'C:\\Users\\user\\working\\Our-family-pharmacist\\data\\질병관리청_국민건강영양조사_2022.csv'
raw_fam_df = pd.read_csv(data_path)
raw_fam_df

Unnamed: 0,ID,ID_fam,sex,age,age_month,fam_rela,ID_F,ID_M,HE_ht,HE_wt
0,A901215301,A9012153,1,61,,1,,,179.9,86.7
1,A901215302,A9012153,1,57,,9,,,179.3,82
2,A901225501,A9012255,2,39,,1,,,159.9,59
3,A901225502,A9012255,2,19,,3,,A901225501,169.7,63.1
4,A901227201,A9012272,1,70,,1,,,171.4,55.8
...,...,...,...,...,...,...,...,...,...,...
7085,R904351302,R9043513,1,25,,3,,,177.7,86.9
7086,R904353001,R9043530,1,45,,1,,,175.3,105.2
7087,R904353002,R9043530,2,43,,2,,,167,54.8
7088,R904353003,R9043530,1,8,,3,R904353001,R904353002,131.6,27.5


In [96]:
raw_fam_df.shape

(7090, 10)

In [158]:
# dummy_survey_df 에 매칭할 수 있도록 컬럼을 생성하는 함수 정의
def create_user_profile_data(df, id_fam_col, sex_col, age_col):
    # ID_fam이 같은 경우 같은 user_id 값을 생성
    # ID_fam 컬럼의 값에 대한 unique한 정수 매핑 딕셔너리 생성
    id_fam_mapping = {}
    count = 1 # user_id는 1 이상의 정수 값을 가진다.

    for id_fam in df[id_fam_col].unique():
        if id_fam not in id_fam_mapping:
            id_fam_mapping[id_fam] = count
            count += 1
    
    # ID_fam 컬럼의 값을 정수로 변환하여 user_id에 해당하는 새로운 컬럼에 할당
    df['user_id'] = df[id_fam_col].map(id_fam_mapping)
    
    # # profile_id 값을 1부터 정수 값을 차례대로 부여
    # df['profile_id'] = range(1, len(df) + 1)
    
    # 성별 컬럼 생성
    df['survey_sex'] = df[sex_col].apply(lambda x: 'm' if x == 1 else 'f')
 
    # 연령대 컬럼 생성
    # 6세 미만 영유아에 해당하는 행 제거
    df.drop(df[df[age_col]<=8].index, axis=0, inplace=True)

    age_group_mask = [
        (df['age'] >= 5) & (df['age'] <= 11),
        (df['age'] >= 12) & (df['age'] <= 18),
        (df['age'] >= 19) & (df['age'] <= 29),
        (df['age'] >= 30) & (df['age'] <= 39),
        (df['age'] >= 40) & (df['age'] <= 49),
        (df['age'] >= 50) & (df['age'] <= 59),
        (df['age'] >= 60)
    ]

    age_group_labels = [0, 10, 20, 30, 40, 50, 60]
    df['survey_age_group'] = np.select(age_group_mask, age_group_labels, default=np.nan)
    df['survey_age_group'] = df['survey_age_group'].astype(int)
    
    # 컬럼명 변경
    df.rename(columns={'HE_ht' : 'survey_height', 'HE_wt' : 'survey_weight'}, inplace=True)
    
    # ' ' 공백 문자열로 들어가있는 결측치를 nan 값으로 대체
    df.loc[df['survey_height'] == ' ', 'survey_height'] = np.nan
    df.loc[df['survey_weight'] == ' ', 'survey_weight'] = np.nan        
    
    return df

In [159]:
fam_df = create_user_profile_data(raw_fam_df, 'ID_fam', 'sex', 'age')[['user_id', 'survey_sex', 'survey_age_group', 'survey_height', 'survey_weight']]
fam_df

Unnamed: 0,user_id,survey_sex,survey_age_group,survey_height,survey_weight
0,1,m,60,179.9,86.7
1,1,m,50,179.3,82
2,2,f,30,159.9,59
3,2,f,20,169.7,63.1
4,3,m,60,171.4,55.8
...,...,...,...,...,...
7083,3422,f,10,,
7084,3423,f,50,152.7,54.6
7085,3424,m,20,177.7,86.9
7086,3425,m,40,175.3,105.2


In [160]:
dummy_survey_df.groupby(['survey_sex', 'survey_age_group']).size().reset_index(name='count')

Unnamed: 0,survey_sex,survey_age_group,count
0,f,0,1000
1,f,10,1000
2,f,20,1000
3,f,30,1000
4,f,40,1000
5,f,50,1000
6,f,60,1000
7,m,0,1000
8,m,10,1000
9,m,20,1000


In [75]:
fam_df.groupby(['survey_sex', 'survey_age_group']).size().reset_index(name='count')

Unnamed: 0,survey_sex,survey_age_group,count
0,f,0,100
1,f,10,209
2,f,20,380
3,f,30,383
4,f,40,546
5,f,50,631
6,f,60,1387
7,m,0,116
8,m,10,233
9,m,20,350


In [98]:
# 기존 데이터프레임을 n회 반복 복사해 user_id 값만 unique 값으로 바꾼 행을 추가하여 행의 개수가 n배 되는 데이터프레임 생성
def concat_copy_df(df, repetition):
    count = 1
    while count < repetition:
        df2 = df.copy()
        
        df2['user_id'] += df['user_id'].max()
        
        df = pd.concat([df, df2], axis = 0, ignore_index=True)
        count += 1
    return df

In [161]:
fam_df = concat_copy_df(fam_df, 5)
fam_df.groupby(['survey_sex', 'survey_age_group']).size().reset_index(name='count')

Unnamed: 0,survey_sex,survey_age_group,count
0,f,0,1600
1,f,10,3344
2,f,20,6080
3,f,30,6128
4,f,40,8736
5,f,50,10096
6,f,60,22192
7,m,0,1856
8,m,10,3728
9,m,20,5600


In [162]:
# dummy_survey_df와 매칭할 수 있게 성별 및 연령대 별 1000개씩의 샘플만 갖도록 인덱싱
sex_labels = ['m', 'f']
age_group_labels = [0, 10, 20, 30, 40, 50, 60]

new_fam_df = pd.DataFrame(columns=fam_df.columns.tolist())

for sex_label in sex_labels:
    for age_group_label in age_group_labels:
        label = fam_df.loc[(fam_df['survey_sex'] == sex_label) & (fam_df['survey_age_group'] == age_group_label)]
        new_fam_df = pd.concat([new_fam_df, label[:1000]], axis=0, ignore_index=True)

new_fam_df.groupby(['survey_sex', 'survey_age_group']).size().reset_index(name='count')

Unnamed: 0,survey_sex,survey_age_group,count
0,f,0,1000
1,f,10,1000
2,f,20,1000
3,f,30,1000
4,f,40,1000
5,f,50,1000
6,f,60,1000
7,m,0,1000
8,m,10,1000
9,m,20,1000


In [163]:
# profile_id 추가
new_fam_df.sort_values(by=['user_id'], ignore_index=True, inplace=True)
new_fam_df['profile_id'] = range(1, len(new_fam_df) + 1)

In [164]:
# survey_created_at 추가 - 기존 데이터셋 값으로 넣어준다.
new_fam_df['survey_created_at'] = '2023-05-11 17:40:25'
new_fam_df['survey_created_at'] = pd.to_datetime(new_fam_df['survey_created_at'])
new_fam_df['survey_created_at']

0       2023-05-11 17:40:25
1       2023-05-11 17:40:25
2       2023-05-11 17:40:25
3       2023-05-11 17:40:25
4       2023-05-11 17:40:25
                ...        
13995   2023-05-11 17:40:25
13996   2023-05-11 17:40:25
13997   2023-05-11 17:40:25
13998   2023-05-11 17:40:25
13999   2023-05-11 17:40:25
Name: survey_created_at, Length: 14000, dtype: datetime64[ns]

In [171]:
# 매칭시킬 수 있게 성별 및 연령대 기준으로 정렬
new_fam_df.sort_values(by=['survey_sex', 'survey_age_group'], ascending=[False, True], ignore_index=True, inplace=True)
dummy_survey_df.sort_values(by=['survey_sex', 'survey_age_group'], ascending=[False, True], ignore_index=True, inplace=True)

# 필요한 추가 컬럼 추가
dummy_survey_df = pd.concat([dummy_survey_df, new_fam_df[['user_id', 'profile_id', 'survey_height', 'survey_weight', 'survey_created_at']]], axis=1)

In [172]:
dummy_survey_df

Unnamed: 0,survey_sex,survey_age_group,survey_pregnancy_code,survey_operation_code,survey_alcohol_code,survey_smoking_code,HF00,HF01,HF02,HF03,...,AL20,survey_function_code,survey_allergy_code,survey_disease_code,survey_id,user_id,profile_id,survey_height,survey_weight,survey_created_at
0,m,0,P0,O0,A0,S0,0,0,0,0,...,0,"{'1st': 'HF20', '2nd': 'HF04', '3rd': 'HF21'}","{'ALLERGY': ['AL00', 'AL01', 'AL02', 'AL03', '...","{'DISEASE': ['DI00', 'DI01', 'DI02', 'DI03', '...",1,34,66,148.5,59.1,2023-05-11 17:40:25
1,m,0,P0,O0,A0,S0,0,0,0,0,...,0,"{'1st': 'HF21', '2nd': 'HF10'}","{'ALLERGY': ['AL00', 'AL01', 'AL02', 'AL03', '...","{'DISEASE': ['DI00', 'DI01', 'DI02', 'DI03', '...",2,137,257,148.2,42,2023-05-11 17:40:25
2,m,0,P0,O0,A0,S0,0,0,0,0,...,0,{'1st': 'HF21'},"{'ALLERGY': ['AL00', 'AL01', 'AL02', 'AL03', '...","{'DISEASE': ['DI00', 'DI01', 'DI02', 'DI03', '...",3,236,444,,,2023-05-11 17:40:25
3,m,0,P0,O0,A0,S0,0,0,0,0,...,0,"{'1st': 'HF20', '2nd': 'HF10'}","{'ALLERGY': ['AL00', 'AL01', 'AL02', 'AL03', '...","{'DISEASE': ['DI00', 'DI01', 'DI02', 'DI03', '...",4,236,445,,,2023-05-11 17:40:25
4,m,0,P0,O0,A0,S0,0,0,0,0,...,0,"{'1st': 'HF04', '2nd': 'HF10'}","{'ALLERGY': ['AL00', 'AL01', 'AL02', 'AL03', '...","{'DISEASE': ['DI00', 'DI01', 'DI02', 'DI03', '...",5,386,723,153.5,56.5,2023-05-11 17:40:25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13995,f,60,P0,O0,A0,S0,0,0,0,0,...,0,{'1st': 'HF05'},"{'ALLERGY': ['AL00', 'AL01', 'AL02', 'AL03', '...","{'DISEASE': ['DI00', 'DI01', 'DI02', 'DI03', '...",13996,2645,5178,143.1,50.5,2023-05-11 17:40:25
13996,f,60,P0,O0,A0,S0,0,0,0,1,...,0,"{'1st': 'HF20', '2nd': 'HF03', '3rd': 'HF24'}","{'ALLERGY': ['AL00', 'AL01', 'AL02', 'AL03', '...","{'DISEASE': ['DI00', 'DI01', 'DI02', 'DI03', '...",13997,2646,5180,,,2023-05-11 17:40:25
13997,f,60,P0,O0,A1,S0,0,0,1,1,...,0,"{'1st': 'HF02', '2nd': 'HF03'}","{'ALLERGY': ['AL00', 'AL01', 'AL02', 'AL03', '...","{'DISEASE': ['DI00', 'DI01', 'DI02', 'DI03', '...",13998,2647,5183,147.7,74.8,2023-05-11 17:40:25
13998,f,60,P0,O0,A0,S0,0,0,0,0,...,0,"{'1st': 'HF05', '2nd': 'HF24', '3rd': 'HF17'}","{'ALLERGY': ['AL00', 'AL01', 'AL02', 'AL03', '...","{'DISEASE': ['DI00', 'DI01', 'DI02', 'DI03', '...",13999,2648,5185,157,58.9,2023-05-11 17:40:25


## DB insert 용으로 데이터프레임을 파일로 저장
- pickle, csv 파일 형태

In [173]:
survey_table = dummy_survey_df[['survey_id', 'user_id', 'profile_id', 
                                'survey_sex', 'survey_age_group', 'survey_pregnancy_code',
                                'survey_operation_code', 'survey_alcohol_code', 'survey_smoking_code',
                                'survey_allergy_code', 'survey_disease_code', 'survey_function_code', 
                                'survey_height', 'survey_weight', 'survey_created_at']]

survey_table

Unnamed: 0,survey_id,user_id,profile_id,survey_sex,survey_age_group,survey_pregnancy_code,survey_operation_code,survey_alcohol_code,survey_smoking_code,survey_allergy_code,survey_disease_code,survey_function_code,survey_height,survey_weight,survey_created_at
0,1,34,66,m,0,P0,O0,A0,S0,"{'ALLERGY': ['AL00', 'AL01', 'AL02', 'AL03', '...","{'DISEASE': ['DI00', 'DI01', 'DI02', 'DI03', '...","{'1st': 'HF20', '2nd': 'HF04', '3rd': 'HF21'}",148.5,59.1,2023-05-11 17:40:25
1,2,137,257,m,0,P0,O0,A0,S0,"{'ALLERGY': ['AL00', 'AL01', 'AL02', 'AL03', '...","{'DISEASE': ['DI00', 'DI01', 'DI02', 'DI03', '...","{'1st': 'HF21', '2nd': 'HF10'}",148.2,42,2023-05-11 17:40:25
2,3,236,444,m,0,P0,O0,A0,S0,"{'ALLERGY': ['AL00', 'AL01', 'AL02', 'AL03', '...","{'DISEASE': ['DI00', 'DI01', 'DI02', 'DI03', '...",{'1st': 'HF21'},,,2023-05-11 17:40:25
3,4,236,445,m,0,P0,O0,A0,S0,"{'ALLERGY': ['AL00', 'AL01', 'AL02', 'AL03', '...","{'DISEASE': ['DI00', 'DI01', 'DI02', 'DI03', '...","{'1st': 'HF20', '2nd': 'HF10'}",,,2023-05-11 17:40:25
4,5,386,723,m,0,P0,O0,A0,S0,"{'ALLERGY': ['AL00', 'AL01', 'AL02', 'AL03', '...","{'DISEASE': ['DI00', 'DI01', 'DI02', 'DI03', '...","{'1st': 'HF04', '2nd': 'HF10'}",153.5,56.5,2023-05-11 17:40:25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13995,13996,2645,5178,f,60,P0,O0,A0,S0,"{'ALLERGY': ['AL00', 'AL01', 'AL02', 'AL03', '...","{'DISEASE': ['DI00', 'DI01', 'DI02', 'DI03', '...",{'1st': 'HF05'},143.1,50.5,2023-05-11 17:40:25
13996,13997,2646,5180,f,60,P0,O0,A0,S0,"{'ALLERGY': ['AL00', 'AL01', 'AL02', 'AL03', '...","{'DISEASE': ['DI00', 'DI01', 'DI02', 'DI03', '...","{'1st': 'HF20', '2nd': 'HF03', '3rd': 'HF24'}",,,2023-05-11 17:40:25
13997,13998,2647,5183,f,60,P0,O0,A1,S0,"{'ALLERGY': ['AL00', 'AL01', 'AL02', 'AL03', '...","{'DISEASE': ['DI00', 'DI01', 'DI02', 'DI03', '...","{'1st': 'HF02', '2nd': 'HF03'}",147.7,74.8,2023-05-11 17:40:25
13998,13999,2648,5185,f,60,P0,O0,A0,S0,"{'ALLERGY': ['AL00', 'AL01', 'AL02', 'AL03', '...","{'DISEASE': ['DI00', 'DI01', 'DI02', 'DI03', '...","{'1st': 'HF05', '2nd': 'HF24', '3rd': 'HF17'}",157,58.9,2023-05-11 17:40:25


## 파일로 저장

In [176]:
# 피클 파일 및 csv 파일로 저장
import pickle

# 피클 파일 저장
with open('C:\\Users\\user\\working\\Our-family-pharmacist\\data\\survey_table.pkl', 'wb') as f:
    pickle.dump(survey_table, f)    
    
with open('C:\\Users\\user\\working\\Our-family-pharmacist\\data\\dummy_survey_df.pkl', 'wb') as f:
    pickle.dump(dummy_survey_df, f)
    
# csv 파일 저장
survey_table.to_csv('C:\\Users\\user\\working\\Our-family-pharmacist\\data\\survey_table.csv', index=False, encoding='utf-8-sig')