<h3 align="center"><FONT size="7pt">ARTHENA - Parsing Challenge</FONT></h3>
<br><h3 align="center"><FONT size="5pt">Pierre-Charles Paret-Van Wolput</FONT></h3></br>

### Import part

In [1]:
import os
from os import listdir
from html.parser import HTMLParser
import json
import sys
import numpy as np

In [2]:
path = './data/2015-03-18/'

### Parser class modification

I used an HTML parser whose methods I can override to produce expected results. It could also be done by starting the whole string parsing if needed.

This parser is case specific.

In [19]:
class MyHTMLParser(HTMLParser):
    
    #Variables to know when we are treating specific strings
    is_name = False
    is_title = False
    
    #Specific variables during a single file processing
    name = ""
    title = ""
    count_h3 = 0
    
    #Variables for storage
    dict_artists_works = {}
    
    
    #Getter for output
    def get_artists_works_array_of_dicts(self):
        dict_artists_works = self.dict_artists_works
        print(dict_artists_works)
        artists = list(dict_artists_works) #access the keys of the dict
        array_dicts = [{} for x in range(len(artists))]
        
        #fill the array of new dicts 
        for i in range(len(artists)):
            artist = artists[i]
            dict = {'artist': artist, 'works': dict_artists_works[artist]}
            array_dicts[i] = dict
        
        return(array_dicts)
    
    #Adds a artwork to the dicts
    def add_artwork(self):
        name = self.name
        if name in self.dict_artists_works:
            self.dict_artists_works[name].append(self.title)
        else:
            self.dict_artists_works[name] = [self.title]
        
        
    
    #(Re)setter for variables used to locate informations (name, etc.)
    def reset_infile_variables(self):
        self.is_name = False
        self.is_title = False
        self.name = ""
        self.title = ""
        self.count_h3 = 0
    
    #Method treating beginning of tags to know what to do with the data 
    def handle_starttag(self, tag, attrs):
        #reset the variables that are file-related
        if tag == 'html':
            self.reset_infile_variables()
        #warn that next data to treat is name
        if tag == 'h2':
            self.is_name = True
        #warn that next data to treat is title
        if (tag=='h3') :
            if self.count_h3 == 0:
                self.is_title = True
            self.count_h3 += 1
    
    #Method treating beginning of tags to stop the special treatment of the data
    def handle_endtag(self, tag):
        #reset the variables that are file-related and add new object
        if tag == 'html':
            self.add_artwork()
            self.reset_infile_variables()
        #warn that we are done treating name
        if tag == 'h2':
            self.is_name = False
        #warn that we are done treating title
        if tag=='h3':
            self.is_title = False

    def handle_data(self, data):
        #name treatement
        if self.is_name:
            string_components = data.split(' ')
            string_name = "" 
            for s in string_components:
                if (s[0].isalpha()):
                    string_name = string_name + s + ' '
            string_name = string_name[0:len(string_name)-1] #remove space after end of last name
            self.name = string_name
        #title treatment
        if self.is_title:
            string_components = data.split(' ')
            self.title = data 

In [20]:
def main_parsing_HTML_to_JSON(path_directory):
    
    filenames = os.listdir(path_directory)
    
    parser = MyHTMLParser()
    encoder = json.JSONEncoder()
    
    #Parser feeding
    for filename in filenames:
        
        #File Opening
        f = open(path + filename, 'r')
        string_file = f.read()
        f.close()
        print(string_file)
        
        print('Opening finished for file' + filename)
        
        parser.feed(string_file)
    
    #Preparation of outputs of the function
    array_artists_dicts = parser.get_artists_works_array_of_dicts()
    array_artists_json = encoder.encode(array_artists_dicts)
        
    return(array_artists_json)

In [21]:
out = main_parsing_HTML_to_JSON(path)

<html>
    <title>
        Rembrandt Harmensz. van Rijn: Christ at Emmaus: The smaller Plate
    </title>
    <body>
      <h2>Rembrandt Harmensz. van Rijn</h2>
      <h3>Christ at Emmaus: The smaller Plate</h3>
        <div>
          Rembrandt Harmensz. van Rijn
          Christ at Emmaus: The smaller Plate
          etching with drypoint, 1634, without watermark, a good impression of this scarce print, New Hollstein's only state, beginning to show some wear in places, trimmed to or on the platemark, generally in very good condition
          P., S. 104 x 75 mm.
        </div>
      <h3>Price realised</h3>
      <div>GBP 6,875</div>
    </body>
</html>

Opening finished for filelot1.html
<html>
    <title>
        Marc Chagall: Self-Portrait
    </title>
    <body>
      <h2>Marc Chagall</h2>
      <h3>Self-Portrait</h3>
        <div>
          Marc Chagall
          Self-Portrait
          lithograph in colours, 1960, on Arches wove paper, signed in pencil, numbered 35/40, published

In [22]:
sys.stdout.write(out)

[{"artist": "Rembrandt Harmensz. van Rijn", "works": ["Christ at Emmaus: The smaller Plate"]}, {"artist": "Marc Chagall", "works": ["Self-Portrait"]}, {"artist": "Pablo Picasso", "works": ["Quatre Femmes nues et T\u00eate sculpt\u00e9e, from: La Suite Vollard"]}, {"artist": "Joan Mir\u00f3", "works": ["Femme et Chien devant la Lune"]}, {"artist": "Louis Marcoussis", "works": ["La Table"]}]