<h3 align="center"><FONT size="7pt">ARTHENA - Parsing Challenge</FONT></h3>
<br><h3 align="center"><FONT size="5pt">Pierre-Charles Paret-Van Wolput</FONT></h3></br>

### Import part

In [1]:
import os
from os import listdir
from html.parser import HTMLParser
import json
import sys
import numpy as np

In [2]:
path = './data/2015-03-18/'

### Parser class modification

I used an HTML parser whose methods I can override to produce expected results. It could also be done by starting the whole string parsing if needed.

This parser is case specific.

In [3]:
class MyHTMLParser(HTMLParser):
    
    #Variables to know when we are treating specific strings
    is_name = False
    is_title = False
    is_price = False
    next_div_is_price = False #warns that the next div tag encountered gives the price
    count_h3 = 0 #counts the number of h3 tags encountered
    
    #Specific variables during a single file processing
    name = ""
    title = ""
    price = ""
    
    #Variables for storage
    dict_artists_works = {}
    
    
    #Getter for output
    def get_artists_works_array_of_dicts(self):
        dict_artists_works = self.dict_artists_works
        artists = list(dict_artists_works) #access the keys of the dict
        array_dicts = [{} for x in range(len(artists))]
        
        #fill the array of new dicts 
        for i in range(len(artists)):
            artist = artists[i]
            dict = {"artist": artist, "works": dict_artists_works[artist]}
            array_dicts[i] = dict
        
        return(array_dicts)
    
    #Adds a artwork to the dicts
    def add_artwork(self):
        name = self.name
        artwork = {"title": self.title, "price": self.price}
        if name in self.dict_artists_works:
            self.dict_artists_works[name].append(artwork)
        else:
            self.dict_artists_works[name] = [artwork]
           
    
    #(Re)setter for variables used to locate informations (name, etc.)
    def reset_infile_variables(self):
        self.is_name = False
        self.is_title = False
        self.name = ""
        self.title = ""
        self.price = ""
        self.count_h3 = 0       
    
    #Method treating beginning of tags to know what to do with the data 
    def handle_starttag(self, tag, attrs):
        #reset the variables that are file-related
        if tag == 'html':
            self.reset_infile_variables()
        #warn that next data to treat is name
        if tag == 'h2':
            self.is_name = True
        #warn that next data to treat is title
        if tag == 'h3':
            if self.count_h3 == 0:
                self.is_title = True
            if self.count_h3 == 1:
                self.next_div_is_price = True
            self.count_h3 += 1
        #warn that next data to treat is price
        if (tag == 'div') & self.next_div_is_price:
            self.is_price = True
            self.next_div_is_price = False
    
    #Method treating beginning of tags to stop the special treatment of the data
    def handle_endtag(self, tag):
        #reset the variables that are file-related and add new object
        if tag == 'html':
            self.add_artwork()
            self.reset_infile_variables()
        #warn that we are done treating name
        if tag == 'h2':
            self.is_name = False
        #warn that we are done treating title
        if tag == 'h3':
            self.is_title = False
        #warn that we are done treating price
        if tag == 'div':
            self.is_price = False

    def handle_data(self, data):
        #name treatement
        if self.is_name:
            string_components = data.split(' ')
            string_name = "" 
            for s in string_components:
                if (s[0].isalpha()):
                    string_name = string_name + s + ' '
            string_name = string_name[0:len(string_name)-1] #remove space after end of last name
            self.name = string_name
        #title treatment
        if self.is_title:
            self.title = data
        #price treatment
        if self.is_price:
            self.price = data

In [4]:
def main_parsing_HTML_to_JSON(path_directory):
    
    filenames = os.listdir(path_directory)
    
    parser = MyHTMLParser()
    encoder = json.JSONEncoder()
    
    #Parser feeding
    for filename in filenames:
        
        #File Opening
        f = open(path + filename, 'r')
        string_file = f.read()
        f.close()
        
        print('Opening finished for file' + filename)
        
        parser.feed(string_file)
    
    #Preparation of outputs of the function
    array_artists_dicts = parser.get_artists_works_array_of_dicts()
    array_artists_json = encoder.encode(array_artists_dicts)
        
    return(array_artists_json)

In [5]:
out = main_parsing_HTML_to_JSON(path)

Opening finished for filelot1.html
Opening finished for filelot5.html
Opening finished for filelot4.html
Opening finished for filelot3.html
Opening finished for filelot2.html


In [6]:
sys.stdout.write(out)

[{"artist": "Rembrandt Harmensz. van Rijn", "works": [{"title": "Christ at Emmaus: The smaller Plate", "price": "GBP 6,875"}]}, {"artist": "Marc Chagall", "works": [{"title": "Self-Portrait", "price": "GBP 6,000"}]}, {"artist": "Pablo Picasso", "works": [{"title": "Quatre Femmes nues et T\u00eate sculpt\u00e9e, from: La Suite Vollard", "price": "USD 25,000"}]}, {"artist": "Joan Mir\u00f3", "works": [{"title": "Femme et Chien devant la Lune", "price": "GBP 11,250"}]}, {"artist": "Louis Marcoussis", "works": [{"title": "La Table", "price": "GBP 9,200"}]}]