In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm

## Excel 파일 읽기

In [None]:
# Load the '위험률조회' sheet, then replace every missing cell (in any column)
# with 0.0 before showing the frame's shape and first rows.
df = pd.read_excel('./RiskRate.xlsx', sheet_name='위험률조회')
df = df.fillna(0.)
print(df.shape)
df.head()

In [None]:
# Keep an independent deep copy of the frame as loaded, so the original values
# remain available after `df` is filtered and reshaped in the cells below.
raw_df = df.copy(deep=True)

## 필요한 컬럼만 발라내기

In [None]:
# List every column name in the loaded sheet before choosing which to keep.
print(df.columns)

구분자는 1~4만 사용

In [None]:
# Narrow the frame to the identifying columns, the first four 구분 columns and
# the rate columns q0..q150, in that order.
id_columns = ['위험률명칭', '성별', '등급', '분류옵션']
sub_columns = [f"구분{i}" for i in range(1, 5)]
q_columns = [f'q{j}' for j in range(151)]

df = df[id_columns + sub_columns + q_columns]
print(df.shape)
df.head()

분류 옵션

    - 1차원 : 전처리하려는 내용
    - 2차원 : 이차위험률 (ex.재진단암)
    - 전체0 : 왜 있는지 모르겠음

In [None]:
# Show the distinct values present in the 분류옵션 column (sorted by np.unique).
np.unique(df['분류옵션'].values)

In [None]:
# Inspect the rows whose 분류옵션 is '전체0'.
df.loc[df['분류옵션'] == '전체0']

In [None]:
# Only rows whose 분류옵션 is '1차원' are processed from here on; once filtered,
# the column carries no further information, so it is dropped.
is_one_dim = df['분류옵션'].eq('1차원')
df = df[is_one_dim].drop(columns=['분류옵션'])
print(df.shape)
df.head()

위험률명칭 컬럼

    위험률명만 뽑아내기

In [None]:
# Keep only the part of each name after its last underscore
# (the whole name when it contains no underscore).
df['위험률명칭'] = df['위험률명칭'].map(lambda name: name.rsplit('_', 1)[-1])
df.head()

위험률명에 > 들어간 경우 제거

In [None]:
# Discard every row whose risk name contains a '>' character.
no_gt_mask = df['위험률명칭'].map(lambda name: ">" not in name).to_numpy()
df = df.loc[no_gt_mask]
print(df.shape)
df.sample(5)

In [None]:
# Spot-check: display the rows for one specific risk name.
df.loc[df['위험률명칭'] == "상해중환자실입원율(1-10일)"]

성별 컬럼

    성별 컬럼이 ALL인 경우, 성별 1과 2 두 행으로 복제해서 넣기

In [None]:
# Show the distinct values present in the 성별 (sex) column.
np.unique(df['성별'].values)

In [None]:
# Rows marked 성별 == 'ALL' apply to both sexes: emit each of them twice,
# first as sex '1' and then as sex '2'.
#
# DataFrame.append was removed in pandas 2.0, so the copies are stacked with
# pd.concat. The sex value is set by column name via assign() instead of a
# per-row iloc[i, 1] loop that depended on 성별 being the second column.
all_sex_rows = df.loc[df['성별'] == 'ALL']
print(all_sex_rows.shape)
df2 = pd.concat(
    [
        all_sex_rows.assign(**{'성별': '1'}),
        all_sex_rows.assign(**{'성별': '2'}),
    ],
    ignore_index=True,  # same 0..2n-1 index the original assigned by hand
)
print(df2.shape)
df2.head()

In [None]:
# Replace the 'ALL' rows with the sex-specific copies held in df2.
df = df.loc[df['성별'] != 'ALL']
print(df.shape)
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the frames the
# same way and, like append, keeps each frame's own index labels.
df = pd.concat([df, df2])

print(df.shape)
df.head()

In [None]:
# Flag "single-rate" rows: rows whose q0..q150 values are all equal to q0.
#
# The rate columns are picked by name rather than by position (row[7] /
# row[8:]). The positional form also swept in any column appended after the
# rates (단일률여부, 위험률키), so re-running this cell marked every row as
# non-single. The comparison is vectorised instead of a Python double loop.
# NOTE(review): all 151 rate columns are compared, while the export further
# down only writes 121 of them — confirm that is the intended window.
single_rate_cols = [f'q{j}' for j in range(151)]
rate_matrix = df[single_rate_cols].to_numpy()
# Compare every column against the q0 column, broadcast across the row.
df['단일률여부'] = (rate_matrix == rate_matrix[:, [0]]).all(axis=1)

위험률키 (성별을 제외하고 위험률명칭·등급·구분1~4를 `|`로 이어 붙인 키)

In [None]:
# Print the distinct values of each of the four 구분 columns.
for sub_no in range(1, 5):
    sub_col = f'구분{sub_no}'
    print(f'{sub_col} ---> ', np.unique(df[sub_col].values))

In [None]:
# Build one composite key per row: name|grade|구분1|구분2|구분3|구분4.
# 성별 is not part of the key, so the male and female rows of the same rate
# end up with the same key string.
key_columns = ['위험률명칭', '등급', '구분1', '구분2', '구분3', '구분4']
riskKeys = [
    f"{name}|{grade}|{s1}|{s2}|{s3}|{s4}"
    for name, grade, s1, s2, s3, s4 in df[key_columns].values
]
df['위험률키'] = riskKeys

In [None]:
# Preview the frame now that the 위험률키 column has been added.
df.head()

In [None]:
# Order the rows by risk key and renumber the index 0..n-1.
df = df.sort_values(by='위험률키').reset_index(drop=True)

In [None]:
# Preview the frame after sorting by 위험률키.
df.head()

In [None]:
# Index each row's rates by risk key, one dictionary per sex.
# Only q0..q120 (121 values) are carried forward from here.
cols = ['위험률명칭','성별', '등급', '구분1', '구분2', '구분3', '구분4']
result_male = {}
result_female = {}

qx_columns = [f"q{i}" for i in range(121)]
for record in df[cols + ['단일률여부'] + qx_columns].values:
    name, sex_code, grade, s1, s2, s3, s4, single_flag = record[:8]
    rate_key = f"{name}|{grade}|{s1}|{s2}|{s3}|{s4}"
    entry = {'qx' : record[8:129], 'isSingle' : single_flag}

    # Sex code 1 goes to the male table; every other code lands in the
    # female table. A later row with the same key and sex overwrites an
    # earlier one.
    bucket = result_male if int(sex_code) == 1 else result_female
    bucket[rate_key] = entry

In [None]:
# Write one pipe-delimited line per (risk key, age) with the male and female
# rates side by side; single-rate keys collapse to one line with age 'ZZ'.
# A sex with no entry for a key is written as 0.0 at every age.
# The file is opened with the platform default text encoding, as before.
with open('result.csv', 'w') as f:
    f.write("위험률명칭|등급| 구분1| 구분2|구분3|구분4|연령|남자|여자 \n")
    # riskKeys holds one entry per DataFrame row, so a key shared by the male
    # and female rows appears more than once. dict.fromkeys drops the repeats
    # while keeping first-seen order, so each key is exported exactly once.
    for key in dict.fromkeys(riskKeys):
        male = result_male.get(key)
        female = result_female.get(key)
        if male is None and female is None:
            # Same failure the original lookup produced for an unknown key.
            raise KeyError(key)

        qx_male = male['qx'] if male is not None else [0.] * 121
        qx_female = female['qx'] if female is not None else [0.] * 121

        # Collapse to a single 'ZZ' line only when every sex that has data is
        # single-rate; taking the male flag alone would discard age-varying
        # female rates (and vice versa).
        isSingle = all(
            entry['isSingle'] for entry in (male, female) if entry is not None
        )

        if isSingle:
            f.write(key + f'|ZZ|{qx_male[0]}|{qx_female[0]} \n')
        else:
            for s in range(121):
                # Index by age s; the original wrote the age-0 rate on
                # every line.
                f.write(key + f'|{s}|{qx_male[s]}|{qx_female[s]} \n')
