# Preprocessor

In [1]:
import os, sys
from pathlib import Path

# The notebook's working directory is assumed to sit one level below the
# project root; put that root on sys.path so the `src` package is importable.
project_path = Path.cwd().parent
sys.path.append(project_path.as_posix())

from src.MyModule.utils import *
import pandas as pd

# load_config is not defined in this notebook; presumably it is provided by the
# star import of src.MyModule.utils above — TODO confirm.
config = load_config()

# Entry of the config holding the raw file names; it is indexed by a dataset
# key (e.g. 'lung_pt_bsnf') further down in the notebook.
file_names = config['lung_file_name']

def load_raw_file(file_name):
    '''
    Load a raw data file from ``<project_path>/data/raw`` with pandas.

    Parameters
    ----------
    file_name : str
        Name of the file inside the raw data directory. The reader is chosen
        from the extension reported by ``check_extension`` (matched
        case-insensitively): ``xlsx`` -> ``pd.read_excel``,
        ``csv`` -> ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The object returned by the selected pandas reader.

    Raises
    ------
    ValueError
        If the extension is missing or not supported, or if the pandas reader
        fails. In the latter case the original exception is chained as
        ``__cause__`` so the real reason stays visible in the traceback.
    '''
    file_path = project_path.joinpath('data/raw').joinpath(file_name)

    # Dispatch table: extension -> pandas reader.
    readers = {'xlsx': pd.read_excel, 'csv': pd.read_csv}

    extension = check_extension(file_name)
    # check_extension may return None when no extension is found.
    key = extension.lower() if isinstance(extension, str) else extension
    reader = readers.get(key)
    if reader is None:
        # Fail loudly here instead of returning None and crashing later at
        # the first attribute access on the "DataFrame".
        raise ValueError(
            f"unsupported file extension {extension!r} for {file_name!r}; "
            f"expected one of {sorted(readers)}"
        )

    try:
        return reader(file_path)
    except Exception as err:
        # Same exception type and message as before, but keep the cause.
        raise ValueError("error in reading the file") from err

    
def check_extension(file_name : str) : 
    '''
    Return the extension of ``file_name`` without the leading dot.

    The extension is taken from the last dot of the final path component, so
    ``"a.b.csv"`` yields ``"csv"`` and dots in directory names are ignored.

    Parameters
    ----------
    file_name : str
        File name or path to inspect.

    Returns
    -------
    str or None
        The extension (may be an empty string for a name ending in a dot), or
        ``None`` when the name has no extension; in that case a message is
        printed.

    Raises
    ------
    AssertionError
        If ``file_name`` is not a string.
    '''
    assert isinstance(file_name, str), "file name must be string"

    # os.path.splitext splits on the LAST dot of the last path component,
    # unlike str.split('.')[1], which returns the second dot-separated piece.
    _, suffix = os.path.splitext(file_name)
    if not suffix:
        print("the file name does not contain extension")
        return None

    # Drop the leading dot that splitext keeps on the suffix.
    return suffix[1:]
        

In [2]:
# File name configured under the 'lung_pt_bsnf' key.
pt_bsnf_file_name = file_names['lung_pt_bsnf']

# Read the raw table and discard the CRTN_DT column in one step.
pt_bsnf = load_raw_file(pt_bsnf_file_name).drop(columns="CRTN_DT")

In [3]:
from src.MyModule.preprocessing import *

In [4]:
# Build the static-data preprocessor for the pt_bsnf table.
# NOTE(review): the keyword `statndard_date_column` looks like a misspelling of
# "standard"; it has to match the PreprocessStatic signature, which is not
# visible in this notebook — confirm before renaming.
staticprocessor = PreprocessStatic(
    static_data=pt_bsnf,
    date_columns=[
        'BSPT_BRYM',
        'BSPT_FRST_DIAG_YMD',
        'BSPT_FRST_OPRT_YMD',
        'BSPT_FRST_ANCN_TRTM_STRT_YMD',
        'BSPT_FRST_RDT_STRT_YMD',
        'BSPT_DEAD_YMD',
        'CENTER_LAST_VST_YMD',
    ],
    statndard_date_column='BSPT_BRYM',
)

In [5]:
# Run the preprocessing on the static table. The call is the last expression
# of the cell, so its return value is rendered as the cell output.
staticprocessor.preprocess()

Unnamed: 0,CENTER_CD,IRB_APRV_NO,PT_SBST_NO,BSPT_SEX_CD,BSPT_BRYM,BSPT_FRST_DIAG_YMD,BSPT_FRST_DIAG_CD,BSPT_FRST_DIAG_NM,BSPT_IDGN_AGE,BSPT_FRST_OPRT_YMD,...,BSPT_FRST_ANCN_TRTM_STRT_YMD_mu,BSPT_FRST_ANCN_TRTM_STRT_YMD_std,BSPT_FRST_RDT_STRT_YMD_mu,BSPT_FRST_RDT_STRT_YMD_std,BSPT_DEAD_YMD_mu,BSPT_DEAD_YMD_std,CENTER_LAST_VST_YMD_mu,CENTER_LAST_VST_YMD_std,OVRL_SRVL_DTRN_DCNT_mu,OVRL_SRVL_DTRN_DCNT_std
0,90,4-2021-05-20,RN00075817,M,0,1.110416,C343,"Lower lobe, bronchus or lung",1.141836,,...,1.693319e-16,1.000092,1.554231e-17,1.000091,6.318892e-17,1.000117,-4.315785e-16,1.000045,-4.636884e-17,1.000045
1,90,4-2021-05-20,RN00100695,M,0,1.147205,C343,"Lower lobe, bronchus or lung",1.141836,,...,1.693319e-16,1.000092,1.554231e-17,1.000091,6.318892e-17,1.000117,-4.315785e-16,1.000045,-4.636884e-17,1.000045
2,90,4-2021-05-20,RN00104968,F,0,0.730952,C343,"Lower lobe, bronchus or lung",0.766025,0.906418,...,1.693319e-16,1.000092,1.554231e-17,1.000091,6.318892e-17,1.000117,-4.315785e-16,1.000045,-4.636884e-17,1.000045
3,90,4-2021-05-20,RN00110825,F,0,-0.144003,C342,"Middle lobe, bronchus or lung",-0.173502,,...,1.693319e-16,1.000092,1.554231e-17,1.000091,6.318892e-17,1.000117,-4.315785e-16,1.000045,-4.636884e-17,1.000045
4,90,4-2021-05-20,RN00083644,M,0,-0.493625,C341,"Upper lobe, bronchus or lung",-0.455361,,...,1.693319e-16,1.000092,1.554231e-17,1.000091,6.318892e-17,1.000117,-4.315785e-16,1.000045,-4.636884e-17,1.000045
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11143,90,4-2021-05-20,RN00092310,M,0,-0.461467,C348,Overlapping lesion of bronchus and lung,-0.455361,,...,1.693319e-16,1.000092,1.554231e-17,1.000091,6.318892e-17,1.000117,-4.315785e-16,1.000045,-4.636884e-17,1.000045
11144,90,4-2021-05-20,RN00042625,M,0,-0.324603,C343,"Lower lobe, bronchus or lung",-0.361408,,...,1.693319e-16,1.000092,1.554231e-17,1.000091,6.318892e-17,1.000117,-4.315785e-16,1.000045,-4.636884e-17,1.000045
11145,90,4-2021-05-20,RN00052813,F,0,-0.114418,C341,"Upper lobe, bronchus or lung",-0.079550,0.026539,...,1.693319e-16,1.000092,1.554231e-17,1.000091,6.318892e-17,1.000117,-4.315785e-16,1.000045,-4.636884e-17,1.000045
11146,90,4-2021-05-20,RN00004040,F,0,1.064366,C343,"Lower lobe, bronchus or lung",1.047883,,...,1.693319e-16,1.000092,1.554231e-17,1.000091,6.318892e-17,1.000117,-4.315785e-16,1.000045,-4.636884e-17,1.000045
