In [40]:
import glob
import os
import pandas as pd
import shutil as sh
from pprint import pprint
from bs4 import BeautifulSoup

In [160]:
def get_tds_from_object(obj, data_=dict()):
    for row in obj:
        try:
            tds = row.find_all('td')
            attr_name = tds[0].text.strip().replace(':', '').replace('\n', '').replace(';', ',')
            attr_value = tds[1].text.replace('\n', ' ').strip().replace(';', ',')
            data_[attr_name] = attr_value
        except Exception as err:
            pretty_print(row)
            print(err)
    return data_

In [161]:
def get_identification_data(obj):
    rows_ = obj.find_all('tr')
    data = get_tds_from_object(rows_[1:], dict())
    return data

In [162]:
def get_culture_space_time_data(obj):
    rows_ = obj.find_all('tr')
    data = get_tds_from_object(rows_, dict())
    
    return data

In [163]:
def get_measurements_physical_description_data(obj):
    rows_ = obj.find_all('tr')
    rows_ = rows_[1:]
    rows_ = rows_[:8] + rows_[9:]
    data = get_tds_from_object(rows_, dict())

    return data

In [164]:
def get_conservation_data(obj):
    rows_ = obj.find_all('tr')
    data = dict()
    non_conservation_data_complete_text = rows_[0].text
    if non_conservation_data_complete_text == 'No presenta ficha de conservación completa':
        data['Ficha de conservación completa'] = False
        data = get_tds_from_object(rows_[1:], data)
    else:
        data['Ficha de conservación completa'] = True
        data = get_tds_from_object(rows_, data)
    
    return data

In [165]:
def get_cataloging_data(obj):
    rows_ = obj.find_all('tr')
    data = get_tds_from_object(rows_, dict())
    
    return data


In [166]:
def get_additional_resources_data(obj):
    rows_ = obj.find_all('tr')
    data = get_tds_from_object(rows_, dict())
    
    return data

In [167]:
def get_exhibitions_data(obj):
    rows_ = obj.find_all('tr')
    data = get_tds_from_object(rows_, dict())
    
    return data

In [168]:
def get_ceramic_technique_data(obj):
    rows_ = obj.find_all('tr')
    data = get_tds_from_object(rows_, dict())
    return data

In [169]:
def get_iconography_data(obj):
    rows_ = obj.find_all('tr')
    data = get_tds_from_object(rows_, dict())
    return data

In [170]:
def get_comments_data(obj):
    data_ = dict()
    tds = obj.find_all('td')
    for i_td in range(0, len(tds), 2):
        attr_name = tds[i_td].text.replace(':', '').replace('\n', '').replace(' () ', '').strip().replace(';', ',')
        attr_value = tds[i_td + 1].text.replace('\n', ' ').strip().replace(';', ',')
        data_[attr_name] = attr_value
    return data_

In [171]:
def get_anatomical_zones_ceramic_data(table_name, obj, section=None):
    
    data = dict()
    
    tds_headers = obj[0].find_all('td')
    col_1_name = tds_headers[1].text.strip().replace(';', ',')
    col_2_name = tds_headers[2].text.strip().replace(';', ',')
    for elem in obj[1:]:
        tds = elem.find_all('td')
        row_name = tds[0].text.strip().replace(';', ',')
        row_value_1 = tds[1].text.replace('\n', ' ').replace(';', ',')
        row_value_2 = tds[2].text.replace('\n', ' ').replace(';', ',')

        if section is not None:
            data_key_1 = '_'.join([table_name, section, col_1_name, row_name])
            data_key_2 = '_'.join([table_name, section, col_2_name, row_name])
        else:
            data_key_1 = '_'.join([table_name, col_1_name, row_name])
            data_key_2 = '_'.join([table_name, col_2_name, row_name])
        data[data_key_1] = row_value_1
        data[data_key_2] = row_value_2
         
    return data

In [172]:
def modify_image_file(obj_identifier, path, img_filename, ):
    '''
    sufijo 'a': imagen frontal
    '''
    if path.endswith('.html'):
        base_img_folder_path = path.replace('.html', '_archivos')
    elif path.endswith('.htm'):
        base_img_folder_path = path.replace('.htm', '_archivos')
    img_folder_path = os.path.join(base_img_folder_path, img_filename)
    img_path = os.path.join('data', 'images', f'{obj_identifier}a.jpg')
    sh.copy2(img_folder_path, img_path)
    
    return img_path


In [173]:
def get_file_paths():
    file_paths = list()

    folders = os.listdir('data')
    for f in folders:
        if f == '.DS_Store' or f == 'images':
            continue
        folder_path = os.path.join('data', f)
        html_files = glob.glob(f'{folder_path}/*.html') if glob.glob(f'{folder_path}/*.html') else glob.glob(f'{folder_path}/*.htm')
        for _file_path in html_files:
            filename_splitted = _file_path.split(' ')
            if filename_splitted[1] == 'ADMINISTRADOR':
                file_paths.append(_file_path)
    return file_paths

In [174]:
def pretty_print(list):
    print(f'Largo lista: {len(list)}')
    for i, elem in enumerate(list):
        pprint(f'{i}: {elem}')
        print()
        print()
        print()
        print()
        print()
        print()

In [175]:
def extract_data(html, path):
    soup = BeautifulSoup(html, 'html.parser')
    object_divs = soup.find_all('div', class_='bloque')
    df_data = list()
    for obj in object_divs:
        
        img_filename = obj.find('img').get('src').split('/')[-1]
        tbody = obj.find_all('tbody')[0]
        trs_inside_tbody = tbody.find_all('tr', recursive=False)

        ### INICIO 'IDENTIFICACION' TO 'EXPOSICIONES'
        table_with_first_info = trs_inside_tbody[0].find_all('table')[0].find_all('table')
        
        data_dict = dict()
        identification_data = get_identification_data(table_with_first_info[2])
        culture_space_time_data = get_culture_space_time_data(table_with_first_info[5])
        measurements_physical_description_data = get_measurements_physical_description_data(table_with_first_info[6])
        conservation_data = get_conservation_data(table_with_first_info[10])
        cataloging_data = get_cataloging_data(table_with_first_info[11])
        additional_resources_data = get_additional_resources_data(table_with_first_info[12])
        exhibitions_data = get_exhibitions_data(table_with_first_info[13])
        ### FIN 'IDENTIFICACION' TO 'EXPOSICIONES'

        ### INICIO 'TECNICA' TO 'ZONAS CERAMICAS ANATOMICAS'
        base_table = trs_inside_tbody[0].find_all('table')[15].find_all('table')

        # TÉCNICA
        tecnique_data = get_ceramic_technique_data(base_table[0])


        # Se chequea si hay tabla con zonas anatómicas
        if len(trs_inside_tbody) == 3:

            # ORIFICIO
            table_with_second_info_orificio = base_table[1]
            table_name = table_with_second_info_orificio.select('td > strong')[0].text.strip()
            anatomical_zone_table_rows = table_with_second_info_orificio.find('table', class_='tableBorder').findAll('tr')
            orificio_data = get_anatomical_zones_ceramic_data(table_name, anatomical_zone_table_rows)
            orificio_data[f'{table_name}_Forma'] = table_with_second_info_orificio.findAll('table', {'class': 'borde'})[0].find_all('td')[1].text.strip().replace(';', ',')

            # CUERPO
            table_with_second_info_cuerpo = base_table[4]
            table_name = table_with_second_info_cuerpo.select('td > strong')[0].text.strip()
            secciones_cuerpo = table_with_second_info_cuerpo.find_all('table', class_='tableBorder')
            cuerpo_data = dict()
            for c, sec_cuerpo in enumerate(secciones_cuerpo):
                anatomical_zone_table_rows = sec_cuerpo.findAll('tr')
                secc_data = get_anatomical_zones_ceramic_data(table_name, anatomical_zone_table_rows, f'seccion{c+1}')
                cuerpo_data.update(secc_data)
                cuerpo_data[f'{table_name}_seccion{c+1}_Forma'] = table_with_second_info_cuerpo.findAll('table', {'class': 'borde'})[c].find_all('td')[1].text.strip().replace(';', ',')

            # BASE 
            base_table_base = trs_inside_tbody[0].find_all('table')[26].find_all('table')
            table_with_second_info_base = base_table_base[0]
            table_name = table_with_second_info_base.select('td > strong')[0].text.strip()
            anatomical_zone_table_rows = table_with_second_info_base.find('table', class_='tableBorder').findAll('tr')
            _base_data = get_anatomical_zones_ceramic_data(table_name, anatomical_zone_table_rows)
            _base_data[f'{table_name}_Forma'] = table_with_second_info_base.findAll('table', {'class': 'borde'})[0].find_all('td')[1].text.strip().replace(';', ',')

            ### FIN 'TECNICA' TO 'ZONAS CERAMICAS ANATOMICAS'

            ### INICIO 'COMENTARIOS'
            table_with_third_info = trs_inside_tbody[2].find_all('table')[0]
            comments_data = get_comments_data(table_with_third_info)
            iconography_data = {}
            ### FIN 'COMENTARIOS'
        
        else:
            #pprint(trs_inside_tbody)
            orificio_data = {}
            cuerpo_data = {}
            _base_data = {}
            comments_data = {}
            iconography_data = get_iconography_data(base_table[1])
        


        data_dict.update(identification_data)
        data_dict.update(culture_space_time_data)
        data_dict.update(measurements_physical_description_data)
        data_dict.update(conservation_data)
        data_dict.update(cataloging_data)
        data_dict.update(additional_resources_data)
        data_dict.update(exhibitions_data)
        data_dict.update(tecnique_data)
        data_dict.update(comments_data)
        data_dict.update(orificio_data)
        data_dict.update(cuerpo_data)
        data_dict.update(_base_data)
        data_dict.update(iconography_data)
        data_dict['File path'] = path
        data_dict['Image path'] = modify_image_file(data_dict['Código Catalogación'], path, img_filename)
        print(data_dict['Código Catalogación'])
        df_data.append(data_dict)

    return df_data
            
    

In [176]:
def run(file_path: str = None):

    try:
        os.mkdir(os.path.join('data', 'images'))
    except Exception as err:
        print(err)

    if file_path is not None:
        data = open(file_path, 'r')
        df_data = extract_data(data, file_path)
    else:
        file_paths = get_file_paths()
        print('Extracting data!!')
        df_data = list()
        for path in file_paths:
            print(path)
            with open(path, 'r') as _file:
                data = _file.read()

            page_data = extract_data(data, path)
            df_data.extend(page_data)
    
    return df_data
            
        

# MAIN CODE

In [177]:

#data = run('data/cajamarca_7/1 ADMINISTRADOR COLECCIONES VIRTUALES - MUSEO LARCO.html')
#data = run('data/moche_6/623 ADMINISTRADOR COLECCIONES VIRTUALES - MUSEO LARCO.htm')
data = run()
df = pd.DataFrame(data)
print(len(df))
df.head()

[Errno 17] File exists: 'data/images'
Extracting data!!
data/sican_7/7 ADMINISTRADOR COLECCIONES VIRTUALES - MUSEO LARCO.htm
ML020107
ML020108
ML020109
ML020110
ML020111
ML020112
ML020113
ML020114
ML020115
ML020116
ML020117
ML020118
ML020119
ML020120
ML020121
ML020122
ML020123
ML020124
ML020125
ML020126
data/sican_7/82 ADMINISTRADOR COLECCIONES VIRTUALES - MUSEO LARCO.htm
ML101419
ML101420
ML200026
ML200027
ML200028
ML200029
ML200030
ML200032
ML200033
ML200035
ML200036
ML200037
ML300042
ML300043
ML400874
data/sican_7/58 ADMINISTRADOR COLECCIONES VIRTUALES - MUSEO LARCO.htm
ML024182
ML024183
ML024184
ML024185
ML024186
ML024187
ML024188
ML024189
ML024190
ML024191
ML024192
ML024193
ML024194
ML024195
ML024196
ML024197
ML024198
ML024199
ML024200
ML024201
data/sican_7/69 ADMINISTRADOR COLECCIONES VIRTUALES - MUSEO LARCO.htm
ML040198
ML040202
ML040205
ML040321
ML040323
ML100106
ML100107
ML100109
ML100112
ML100113
ML100114
ML100115
ML100116
ML100166
ML100239
ML100240
ML100551
ML100552
ML100638

Unnamed: 0,Código Catalogación,Código anterior,Nro reg. nacional,Nro antiguo reg. INC,Código de Ubicación,Inscripciones,Situación,Valor (US$),Cultura / Estilo,Región,...,FormaCilíndrico,INTERNOEXTERNOTécnica DecoraciónPintado ColorCrema y rojoPersonaje 1Personaje 2AcciónEscena,INTERNOEXTERNOTécnica DecoraciónColorPersonaje 1Personaje 2AcciónEscena,INTERNOEXTERNOTécnica DecoraciónPintado ColorGrisPersonaje 1Personaje 2AcciónEscena,INTERNOEXTERNOTécnica DecoraciónPintado ColorCremaPersonaje 1Personaje 2AcciónEscena,FormaComplejo/Escultórico,INTERNOEXTERNOTécnica DecoraciónPintadoEscultórico ColorCremaPersonaje 1Personaje 2AcciónEscena,INTERNOEXTERNOTécnica DecoraciónPintado ColorCrema y RojoPersonaje 1Personaje 2AcciónEscena,Comentario #3,Comentario #4
0,ML020107,,34758,,173-006-004,7794/,activo,,Lambayeque,Costa Norte,...,,,,,,,,,,
1,ML020108,,34759,,173-006-005,Tarmona 23-7-1940,activo,,Lambayeque,Costa Norte,...,,,,,,,,,,
2,ML020109,,34760,,173-006-006,7783/,activo,,Lambayeque,Costa Norte,...,,,,,,,,,,
3,ML020110,,34761,,173-006-007,7805/,activo,,Lambayeque,Costa Norte,...,,,,,,,,,,
4,ML020111,,34762,,173-006-008,B 1911 J 1,activo,,Lambayeque,Costa Norte,...,,,,,,,,,,


In [178]:
pretty_print(df.columns.tolist())

Largo lista: 115
'0: Código Catalogación'






'1: Código anterior'






'2: Nro reg. nacional'






'3: Nro antiguo reg. INC'






'4: Código de Ubicación'






'5: Inscripciones'






'6: Situación'






'7: Valor (US$)'






'8: Cultura / Estilo'






'9: Región'






'10: Cronología'






'11: Valle'






'12: Sitio'






'13: Material Primario'






'14: Tipo de Material'






'15: Materiales secundarios'






'16: Categoría Morfofuncional'






'17: Alto'






'18: Largo'






'19: Ancho'






'20: Peso'






'21: Escena Principal'






'22: Descripción'






'23: Ficha de conservación completa'






'24: Examinador'






'25: Estado actual de conservación'






'26: Constancia del estado'






'27: Fecha de catalogación'






'28: Catalogador'






'29: Recursos Adicionales'






'30: Exposiciones'






'31: Manufactura'






'32: Acabado'






'33: ORIFICIO_INTERNO_Técnica Decoración'






'34: ORIFICIO_EXTERNO_Técnica Decoración'






'35: OR

In [183]:
df_length = len(df)
columns_to_drop = []

for column in df.columns.tolist():
    
    nan_values = len(df[df[column].isnull()])
    empty_values = len(df[df[column] == ''])
    if (nan_values / df_length) > 0.95 or (empty_values / df_length) > 0.95:
        print(f'nan_values "{column}" pctg: {(nan_values / df_length)}')
        print(f'empty_values "{column}" pctg: {(nan_values / df_length)}')
        print()
        columns_to_drop.append(column)

pretty_print(columns_to_drop)

nan_values "Nro antiguo reg. INC" pctg: 0.0
empty_values "Nro antiguo reg. INC" pctg: 0.0

nan_values "Materiales secundarios" pctg: 0.0
empty_values "Materiales secundarios" pctg: 0.0

nan_values "Recursos Adicionales" pctg: 0.0
empty_values "Recursos Adicionales" pctg: 0.0

nan_values "Exposiciones" pctg: 0.0
empty_values "Exposiciones" pctg: 0.0

nan_values "Forma" pctg: 0.9626783754116356
empty_values "Forma" pctg: 0.9626783754116356

nan_values "Unión" pctg: 0.9631067441300099
empty_values "Unión" pctg: 0.9631067441300099

nan_values "Recubrimiento" pctg: 0.9631067441300099
empty_values "Recubrimiento" pctg: 0.9631067441300099

nan_values "Comentario #1" pctg: 0.9816604642445985
empty_values "Comentario #1" pctg: 0.9816604642445985

nan_values "Comentario #2" pctg: 0.9986613477550802
empty_values "Comentario #2" pctg: 0.9986613477550802

nan_values "INTERNOEXTERNOTécnica DecoraciónPintado ColorNaranjaPersonaje 1Personaje 2AcciónEscena" pctg: 0.9999196808653048
empty_values "INTERN

In [184]:
columns_to_drop.remove('Materiales secundarios')
columns_to_drop.remove('Recursos Adicionales')
columns_to_drop.remove('Forma')
columns_to_drop.remove('Exposiciones')
columns_to_drop.remove('Unión')
columns_to_drop.remove('Recubrimiento')
columns_to_drop.remove('Comentario #1')
columns_to_drop.remove('Comentario #2')
columns_to_drop.remove('Comentario #3')
columns_to_drop.remove('Comentario #4')
columns_to_drop.remove('Técnica Decoración')
columns_to_drop.remove('Escena')


In [185]:

modified_df = df.drop(columns_to_drop, axis=1)


In [186]:
len(modified_df)

37351

In [187]:
df.to_csv('raw_data_1.csv', header=True, sep=';', index=False)

In [117]:
#data = run('data/cajamarca_7/1 ADMINISTRADOR COLECCIONES VIRTUALES - MUSEO LARCO.html')
data2 = run('data/moche_6/623 ADMINISTRADOR COLECCIONES VIRTUALES - MUSEO LARCO.htm')
#data2 = run()
df2 = pd.DataFrame(data2)
print(len(df2))
df2.head()

[Errno 17] File exists: 'data/images'
ML013592
ML013593
ML013594
ML013595
ML013596
ML013597
ML013598
ML013599
ML013600
ML013601
Largo lista: 1
'0: <td colspan="5"><br/></td>'






list index out of range
Largo lista: 3
'0: \n'






('1: <td align="left" bgcolor="#888888" colspan="5" '
 'style="background-color:#888888;color:#ffffff;font-size:12px;"><strong>ORIFICIO</strong></td>')






'2: \n'






list index out of range
Largo lista: 1
'0: <td colspan="5"><br/></td>'






list index out of range
ML013602
11


Unnamed: 0,Código Catalogación,Código anterior,Nro reg. nacional,Nro antiguo reg. INC,Código de Ubicación,Inscripciones,Situación,Valor (US$),Cultura / Estilo,Región,...,Image path,Forma,INTERNOEXTERNOTécnica DecoraciónPintado ColorNaranjaPersonaje 1Personaje 2AcciónEscena,Unnamed: 15,Técnica Decoración,Color,Personaje 1,Personaje 2,Acción,Escena
0,ML013592,XXc-000-003,28208,,054-001-XCP,44# 3 Casa 279 ML 13 26 A-112 Mo-112/112,activo,8000,Mochica,Costa Norte,...,data/images/ML013592a.jpg,,,,,,,,,
1,ML013593,XXc,28209,,054-004-XCP,J. 30696 22.7.43 Tanguche Tanguche J.,activo,8000,Mochica,Costa Norte,...,data/images/ML013593a.jpg,,,,,,,,,
2,ML013594,XXc,28210,,047-007-XCP,A-47 Mo-47/47,activo,8500,Mochica,Costa Norte,...,data/images/ML013594a.jpg,,,,,,,,,
3,ML013595,XXc,28211,,053-001-XCP,A-111 Mo-111/111,activo,8000,Mochica,Costa Norte,...,data/images/ML013595a.jpg,,,,,,,,,
4,ML013596,XXc-000-032,28212,,049-002-XCP,Casa 32 A-77 Mo-77/77 R-2,activo,8000,Mochica,Costa Norte,...,data/images/ML013596a.jpg,,,,,,,,,


In [109]:
pretty_print(df2.columns.tolist())

Largo lista: 96
'0: Código Catalogación'






'1: Código anterior'






'2: Nro reg. nacional'






'3: Nro antiguo reg. INC'






'4: Código de Ubicación'






'5: Inscripciones'






'6: Situación'






'7: Valor (US$)'






'8: Cultura / Estilo'






'9: Región'






'10: Cronología'






'11: Valle'






'12: Sitio'






'13: Material Primario'






'14: Tipo de Material'






'15: Materiales secundarios'






'16: Categoría Morfofuncional'






'17: Alto'






'18: Largo'






'19: Ancho'






'20: Peso'






'21: Escena Principal'






'22: Descripción'






'23: Ficha de conservación completa'






'24: Examinador'






'25: Estado actual de conservación'






'26: Constancia del estado'






'27: Fecha de catalogación'






'28: Catalogador'






'29: Recursos Adicionales'






'30: Exposiciones'






'31: Manufactura'






'32: Acabado'






'33: ORIFICIO_INTERNO_Técnica Decoración'






'34: ORIFICIO_EXTERNO_Técnica Decoración'






'35: ORI