In [747]:
from crawl_utils.html_request import * 
from crawl_utils.url_extractor import *

def strip_all(text):
    '''
    input : text(str)
    output : text(str)

    strip all blank in the text
    '''
    return text.strip().replace(" ", "").replace("\n", "").replace("\t", "").replace('\r',"")


def get_row(row):
    '''
    input : row(bs4.element.ResultSet)
    output : element(list)

    get list of text from row
    '''
    element = []
    for r in row:
        if r.has_attr('colspan'):
            element += [strip_all(r.text) for _ in range(int(r['colspan']))]
        else:
            element += [strip_all(r.text)]
    return element

def get_table_element(dom):
    '''
    input : dom
    output : list

    get element whose tag name is 'td' or 'th' or 'dd'
    '''
    return dom.find_all(['td', 'th', 'dd'])


def get_template(row):
    '''
    input : row(bs4.element.ResultSet)
    output : template(list)

    get text whose tag has attribute rowspan, otherwise None
    '''
    template = []
    rowspan_list = []
    for r in row:
        if r.has_attr('rowspan'):
            rowspan_list.append(int(r["rowspan"]))
            if r.has_attr('colspan'):
                template += [r.text for _ in range(int(r["colspan"]))]
            else:
                template += [r.text]
        elif r.has_attr('colspan'):
            template += [None for _ in range(int(r["colspan"]))]
        else:
            template += [None]
    return template, rowspan_list

def merge_template(template, row):
    '''
    input : template(list), row(list)
    output : merged(list)

    given template(mostly row having rowspan element),
    merge it with row
    '''
    merged = []
    for temp in template:
        if temp is not None:
            merged += [temp]
        else:
            merged += [row.pop(0)]
    return merged
    
    
def how_many_not_None(lst):
    return len([_ for _ in lst if _ is not None])

def tbody_parsing(rows):
    '''
    input : rows(list of bs4.element.ResultSet)
    output : element_list(list)

    parse 
    '''
    element = []
    element_list = []
    if rows:
        standard_len = max(len(get_table_element(r)) for r in rows)
        template, _ = get_template(get_table_element(rows[0]))
        for row in rows:
            emnt = get_table_element(row)
            if emnt:
                if any(e for e in emnt if e.has_attr('rowspan')):
                    template_sub, rowspan_list = get_template(emnt)
                if len(emnt) == standard_len:
                    template, rowspan_list = get_template(emnt)
                    element = get_row(emnt)
                else:
                    if how_many_not_None(template) + how_many_not_None(get_row(emnt)) == standard_len:
                        element = merge_template(template, get_row(emnt))
                    else:   
                        try:
                            template_ = merge_template(template, template_sub)
                            element = merge_template(template_, get_row(emnt))
                        except:
                            element = ['' for _ in range(standard_len)]

            element_list.append(element)
        return element_list



def get_table_column(table):
    '''
    input : table(dom obejct)
    output : column(list)
    
    given table dom, get column list 
    if having double column, merge first and second
    '''
    columns = []
    columns_final = []
    trs = table.find_all('tr')
    if len(trs) > 1:
        for tr in trs:
            columns.append(tr.find_all(['th', 'td']))
        first_row, _ = get_template(columns[0])
        if any(first_row):
            second_row = [strip_all(c.text) for c in columns[1]]
            return (merge_template(first_row, second_row), 2)
            
        else:
            return (get_row(columns[0]), 1)
    else:
        return (get_row(trs[0].find_all(['th', 'td', 'dd'])), 1)

def table_parsing(url):
    '''
    input : url(str)
    output : list of table(DataFrame)

    get list of table given url
    '''
    table_df_list = []
    columns = []
    soup = parsing(url)
    if soup:
        tables = soup.find_all('table')
        for table in tables:
            columns_body, n = get_table_column(table)
            rows = table.find_all('tr')
            element_list = tbody_parsing(rows)
            columns, n = get_table_column(table)
            len_element = max(len(e) for e in element_list)
            for _ in range(n):
                element_list.pop(0)
            if not columns:
                columns = [str(_) for _ in range(len_element)]
            columns = list(map(strip_all, columns))
            i = 1
            table_df = pd.DataFrame()
            for key, value in zip(columns, zip(*element_list)):
                if key not in table_df:
                    table_df[key] = [v for v in list(value)]         
                else:
                    table_df["{}_{}".format(key, i)] = [v for v in list(value)] 
                    i += 1
            table_df_list.append(table_df)
    table_df_list = [table for table in table_df_list if any(table)]
    return table_df_list


In [752]:
link = 'http://www.hanmihospital.com/index.php?mid=certificate'

t = table_parsing(link)[0]

In [755]:
t.to_csv('카티스뎀.csv')

In [736]:
# link = 'http://barobone.kr/06_customer/customer04.html'
# link = 'http://www.disc21.co.kr/doc/02inform/inform6.php'
table = parsing(link).find_all('table')[0]

['분류', None, None, None, '특이사항'] ['분류', None, None, None, '특이사항']
['치료재료대', '카티스템(Cartistem)', None, None, '\xa0'] ['BMAC(자가골수줄기세포치료)', None, None, '\xa0']
['치료재료대', '카티스템(Cartistem)', None, None, '\xa0'] [None, '\xa0']
['치료재료대', '카티스템(Cartistem)', None, None, '\xa0'] []
['치료재료대', '카티스템(Cartistem)', None, None, '\xa0'] []
['치료재료대', '카티스템(Cartistem)', None, None, '\xa0'] []
['치료재료대', '카티스템(Cartistem)', None, None, '\xa0'] []
['치료재료대', '카티스템(Cartistem)', None, None, '\xa0'] []
['치료재료대', '카티스템(Cartistem)', None, None, '\xa0'] []
['치료재료대', '카티스템(Cartistem)', None, None, '\xa0'] []
['치료재료대', '카티스템(Cartistem)', None, None, '\xa0'] []
['치료재료대', '카티스템(Cartistem)', None, None, '\xa0'] []
['치료재료대', '카티스템(Cartistem)', None, None, '\xa0'] []
['치료재료대', '카티스템(Cartistem)', None, None, '\xa0'] []
['치료재료대', '카티스템(Cartistem)', None, None, '\xa0'] []
['치료재료대', '카티스템(Cartistem)', None, None, '\xa0'] []
['치료재료대', '카티스템(Cartistem)', None, None, '\xa0'] []
['치료재료대', '카티스템(Cartistem)', None, None, '\xa0'] []


Unnamed: 0,분류,명칭,구분,비용,특이사항
0,상급병실료차액,상급병실차액-1인실(5병동),1인실,140000,
1,식대,보호자식,,5500,
2,식대,공기밥,,1000,
3,초음파검사료,ULTRASONOGRAM-초음파,"초음파검사-근골격,연부",30000,급여인정기준외실시한경우비급여
4,초음파검사료,ULTRASONOGRAM-초음파1,"초음파검사-근골격,연부-정맥혈관",50000,급여인정기준외실시한경우비급여
...,...,...,...,...,...
168,제증명 수수료,의사소견서-노인장기요양,(일반20%),7110,
169,제증명 수수료,의사소견서-노인장기요양,(수급권자10%),3550,
170,제증명 수수료,의사소견서-노인장기요양,(의뢰서없이내원),35570,
171,제증명 수수료,의무기록사본,,1000,


In [687]:
rows = table.find_all('tr')
for r in rows:
    print(get_table_element(r))
    

[<th class="pe_qF pe_qL" rowspan="2" scope="col">분류</th>, <th class="pe_qF pe_qL" scope="col">항목</th>, <th class="pe_qF pe_qL" colspan="2" scope="col">가격정보(단위:원)</th>, <th class="pe_qF pe_qL" rowspan="2" scope="col">특이사항</th>]
[<th class="pe_qF pe_qL" scope="col">명칭</th>, <th class="pe_qF pe_qL" scope="col">구분</th>, <th class="pe_qF pe_qL" scope="col">비용</th>]
[<td class="tl111">상급병실료 차액</td>, <td class="tl111">상급병실차액-1인실(5병동)</td>, <td class="tl111">1인실</td>, <td class="tl1111">140,000</td>, <td> </td>]
[<td class="tl111" rowspan="2">식대</td>, <td class="tl111">보호자식</td>, <td class="tl111"> </td>, <td class="tl1111">5,500</td>, <td> </td>]
[<td class="tl111">공기밥</td>, <td class="tl111"> </td>, <td class="tl1111">1,000</td>, <td> </td>]
[<td class="tl111" rowspan="6">초음파검사료</td>, <td class="tl111">ULTRASONOGRAM -초음파</td>, <td class="tl111">초음파검사-근골격,연부</td>, <td class="tl1111">30,000</td>, <td class="tl111">급여 인정기준 외<br/>
			실시한 경우 비급여</td>]
[<td class="tl111">ULTRASONOGRAM -초음파1</td>, 

In [427]:
tbody_parsing(rows)

[['분류', '기본항목', '세부항목', '단위', '가격', '비고'],
 ['시술', 'PROLO(증식치료)', '', '1회', '60,000', ''],
 ['시술', 'ESWT(체외충격파)', '', '1회', '60,000', ''],
 ['수술및수술재료대', '술후신발', '', '1개', '60,000', ''],
 ['수술 및수술 재료대', 'KD', '', '1개', '200,000', ''],
 ['수술 및수술 재료대', 'KD', '', '2개', '350,000', ''],
 ['수술 및수술 재료대', '인조골(미네날본)', '', '1개', '500,000', ''],
 ['수술 및수술 재료대', 'Prostop', '', '1개', '1,500,000', ''],
 ['수술 및수술 재료대', '무지외반증수술(일반)', '', '1족지', '6,000,000', ''],
 ['수술 및수술 재료대', '소건막류수술(일반)', '', '1족지', '6,000,000', ''],
 ['수술 및수술 재료대', '단지증수술(일반)', '', '1족지', '5,000,000', ''],
 ['수술 및수술 재료대', '장지증수술(일반)', '', '1족지', '3,000,000', ''],
 ['수술 및수술 재료대', '기타족부일반수술', '', '1족', '6,000,000', ''],
 ['수술 및수술 재료대', 'REGENSEAL', '', '1개', '500,000', ''],
 ['수술 및수술 재료대', '유착방지제', '', '1개', '200,000', ''],
 ['수술 및수술 재료대', '습윤드레싱(Sorbact)', '', '1개', '20,000', ''],
 ['보조기및보호대', '기성깔창', '', '1족', '60,000', ''],
 ['보조기 및보호대', 'Heelcup', '', '1개', '40,000', ''],
 ['보조기 및보호대', '쎄라밴드', '', '1개', '15,000', ''],
 ['보조기 및보

In [None]:
get_template()

In [336]:
link = 'http://www.disc21.co.kr/doc/02inform/inform6.php'
table = parsing(link).find_all('table')[0]

In [337]:
get_table_column(table2[0])

(['분류', '기본항목', '세부항목', '단위', '가격', '비고'], 1)

In [340]:
table2[0]

<table cellpadding="0" cellspacing="0" class="table_default" summary="비급여 항목 현황표 입니다.">
<caption>비급여 항목 현황표 입니다.</caption>
<colgroup>
<col style="width:20%"/>
<col style="width:25%"/>
<col style="width:15%"/>
<col style="width:13.75%"/>
<col style="width:13.75%"/>
<col style="width:13.75%"/>
</colgroup>
<thead>
<tr>
<th scope="col">분류</th>
<th scope="col">기본항목</th>
<th scope="col">세부항목</th>
<th scope="col">단위</th>
<th scope="col">가격</th>
<th scope="col">비고</th>
</tr>
</thead>
<tbody>
<tr>
<th rowspan="2" scope="row">시술</th>
<td>PROLO(증식치료)</td>
<td></td>
<td>1회</td>
<td>60,000</td>
<td></td>
</tr>
<tr>
<td>ESWT(체외충격파)</td>
<td></td>
<td>1회</td>
<td>60,000</td>
<td></td>
</tr>
<tr>
<th rowspan="13" scope="row">수술 및<br/>수술 재료대</th>
<td>술후신발</td>
<td></td>
<td>1개</td>
<td>60,000</td>
<td></td>
</tr>
<tr>
<td rowspan="2">KD</td>
<td rowspan="2"></td>
<td>1개</td>
<td>200,000</td>
<td></td>
</tr>
<tr>
<td>2개</td>
<td>350,000</td>
<td></td>
</tr>
<tr>
<td>인조골(미네날본)</td>
<td></td>
<td>1개</td>


In [425]:
link2 = 'http://www.disc21.co.kr/doc/02inform/inform6.php'
table2 = parsing(link2).find_all('table')
print(get_table_column(table2[0]))
table_parsing(link2)[0]

(['분류', '분류', '분류', '명칭', '코드', '금액', '최저비용', '최고비용', '특이사항'], 2)


Unnamed: 0,분류,명칭,코드,금액,최저비용,최고비용,특이사항
0,,헬리코박터파일로리검사,B4151,11600,,,
1,,헬리코박터파일로리검사+생검료,B4151,23040,,,E7611010포함
2,,인플루엔자A.B바이러스항원검사(현장검사),CZ394,20000,,,
3,,당화알부민검사,CZ241,20000,,,
4,,AntiCCPAb(류마티스관절염검사),CZ432,50000,,,
5,일반화학검사,IMA(허혈성변형알부민)TEST,CZ246,50000,,,
6,"외피,근골기능검사",동작분석역동적근전도1.2,EZ773,,60000.0,100000.0,
7,"외피,근골기능검사",체열촬영[D.I.T.I],EZ776,,50000.0,150000.0,
8,수면내시경,위수면내시경관리료,VB0300001,30000,,,
9,수면내시경,대장수면내시경관리료,VB0300002,50000,,,


In [321]:
get_table_column(table)

(['분류', '분류', '분류', '명칭', '코드', '금액', '최저비용', '최고비용', '특이사항'], 2)

In [322]:
table_parsing(link2)[3]

Unnamed: 0,상호명 :의료법인해담의료재단 | 대표자:유만훈주 소:충남천안시서북구검은들1길12(불당동702번지)대표전화:041-559-9999/FAX:041-559-9998/E-mail:naeunhospital@hanmail.net COPYRIGHT(C)2018www.barun.comALLRIGHTRESERVED.
0,상호명 :의료법인해담의료재단 | 대표자:유만훈주 소:충남천안시서북구...
1,상호명 :의료법인해담의료재단 | 대표자:유만훈주 소:충남천안시서북구...
2,
3,상호명 :의료법인해담의료재단 | 대표자:유만훈
4,주 소:충남천안시서북구검은들1길12(불당동702번지)
5,대표전화:041-559-9999/FAX:041-559-9998/E-mail:naeu...
6,COPYRIGHT(C)2018www.barun.comALLRIGHTRESERVED.
7,
8,COPYRIGHT(C)2018www.barun.comALLRIGHTRESERVED.
9,


In [263]:
link3 = 'http://jeonju.wooridul.co.kr/?page_id=3247'
table3 = parsing(link3).find_all('table')[]
table3

<table border="0" cellpadding="0" cellspacing="0" class="basic-table issue-table1">
<thead>
<tr>
<th></th>
<th></th>
<th colspan="2">항목</th>
<th colspan="6">가격정보(단위: 원)</th>
<th></th>
</tr>
</thead>
<tbody>
<tr>
<th width="10%">중분류</th>
<th width="10%">분류</th>
<th width="10%">명칭</th>
<th width="5%">코드</th>
<th width="5%">구분</th>
<th width="8%">비용</th>
<th width="8%">최저비용</th>
<th width="8%">최고비용</th>
<th width="12%">치료재료대<br/>포함여부</th>
<th width="12%">약제비<br/>포함여부</th>
<th width="12%">특이사항</th>
</tr>
<tr>
<td>기본진료료</td>
<td>상급병실료 차액</td>
<td>1인실</td>
<td>AB902</td>
<td>일반</td>
<td>100,000</td>
<td></td>
<td></td>
<td></td>
<td></td>
<td></td>
</tr>
<tr>
<td>검사료</td>
<td>감염증 기타 검사</td>
<td>인플루엔자 A·B 바이러스항원검사 [현장검사]</td>
<td>CZ394</td>
<td></td>
<td>28,000</td>
<td></td>
<td></td>
<td></td>
<td></td>
<td>노490</td>
</tr>
<tr>
<td>검사료</td>
<td>자가면역질환검사</td>
<td>항CCP항체[IgG]</td>
<td>CZ432</td>
<td></td>
<td>42,900</td>
<td></td>
<td></td>
<td></td>
<td></td>
<td>노432</td>
</tr>
<tr>
<td>검사료

In [264]:
get_table_column(table3)

(['',
  '',
  '항목',
  '항목',
  '가격정보(단위:원)',
  '가격정보(단위:원)',
  '가격정보(단위:원)',
  '가격정보(단위:원)',
  '가격정보(단위:원)',
  '가격정보(단위:원)',
  ''],
 1)

In [256]:
table_parsing(link)[0]

Unnamed: 0,분류,명칭,코드,금액,최저비용,최고비용,특이사항
0,,헬리코박터파일로리검사,B4151,11600,,,
1,,헬리코박터파일로리검사+생검료,B4151,23040,,,E7611010포함
2,,인플루엔자A.B바이러스항원검사(현장검사),CZ394,20000,,,
3,,당화알부민검사,CZ241,20000,,,
4,,AntiCCPAb(류마티스관절염검사),CZ432,50000,,,
5,일반화학검사,IMA(허혈성변형알부민)TEST,CZ246,50000,,,
6,"외피,근골기능검사",동작분석역동적근전도1.2,EZ773,,60000.0,100000.0,
7,"외피,근골기능검사",체열촬영[D.I.T.I],EZ776,,50000.0,150000.0,
8,수면내시경,위수면내시경관리료,VB0300001,30000,,,
9,수면내시경,대장수면내시경관리료,VB0300002,50000,,,


In [82]:
thead

[<thead>
 <tr>
 <th class="b_blue bg_blue" scope="col" width="102"></th>
 <th class="b_blue bg_blue" colspan="2" scope="col" width="196">항목</th>
 <th class="b_blue bg_blue" colspan="7" scope="col" width="595">가격정보(단위 : 원)</th>
 </tr>
 </thead>,
 <thead>
 <tr>
 <th class="b_blue bg_blue" scope="col" width="102"></th>
 <th class="b_blue bg_blue" colspan="2" scope="col" width="196">항목</th>
 <th class="b_blue bg_blue" colspan="7" scope="col" width="595">가격정보(단위 : 원)</th>
 </tr>
 </thead>,
 <thead>
 <tr>
 <th class="b_blue bg_blue" scope="col" width="102"></th>
 <th class="b_blue bg_blue" colspan="2" scope="col" width="196">항목</th>
 <th class="b_blue bg_blue" colspan="7" scope="col" width="595">가격정보(단위 : 원)</th>
 </tr>
 </thead>,
 <thead>
 <tr>
 <th class="b_blue bg_blue" scope="col" width="102"></th>
 <th class="b_blue bg_blue" colspan="2" scope="col" width="196">항목</th>
 <th class="b_blue bg_blue" colspan="7" scope="col" width="595">가격정보(단위 : 원)</th>
 </tr>
 </thead>,
 <thead>
 <tr>
 <th 

In [69]:
parsed[0]

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,분류,명칭,코드,구분,비용,최저비용,최고비용,치료재료대 포함여부,약제비 포함여부,특이사항
1,뇨검사,Urine HCG-요임신반응검사(정성),B0260,,1870,,,X,X,
2,혈액학검사,혈소판 응집능검사[교류저항혈소판응집]_Trap,BZ078,,50000,,,X,X,
3,혈액학검사,비)성장호르몬-핵의학적방법,C7342007,,30000,,,X,X,
4,혈액화학검사,의뢰)혈액암(남)검사,TMTEST1,,90000,,,X,X,
5,혈액화학검사,의뢰)혈액암(여)검사,TMTEST,,90000,,,X,X,
6,혈액화학검사,비)혈소판 약물 반응검사 (아스피린),BZ071,,50000,,,X,X,
7,혈액화학검사,의뢰)Testosterone 비급여,D3710060A,,50000,,,X,X,
8,혈액화학검사,의뢰)TBPE-약물및독물검사(정성)-비급여,C450139+,,20000,,,X,X,
9,혈액화학검사,의뢰)Cannabinoids-약물및독물검사(정성)-비급여,C450139A+,,20000,,,X,X,


In [63]:
get_table_column_thead(tbody)

['뇨검사', 'UrineHCG-요임신반응검사(정성)', 'B0260', '', '1,870', '', '', 'X', 'X', '']

In [34]:
table_parsing('http://www.yonserang.com/template/etc/non_pay_guide.php')[0]

Unnamed: 0,분류,명칭,코드,구분,비용,최저비용,최대비용,치료대포함여부,약제비포함여부,특이사항
0,상급병실료차액,병실료차액,AB9000,1인실,190000,,,,,
