In [69]:
import requests
from bs4 import BeautifulSoup
import string
import re
import os
from lxml import etree as ET
from lxml.builder import ElementMaker

In [3]:
# Page that the cells below scrape for ingredient names and descriptions.
request_url = 'https://www.kindsnacks.com/ingredients-a-z'

# Bound the wait so a stalled connection cannot hang the kernel, and fail
# loudly on an HTTP error instead of parsing an error page further down.
response = requests.get(request_url, timeout=30)
response.raise_for_status()

In [52]:
# Anything that is not an ASCII letter, digit, underscore, space or hyphen.
NAME_BAD_CHARS = re.compile(r'([^a-zA-Z0-9_ -])')
# Runs of two or more spaces; collapsed to one so they yield a single hyphen.
_SPACE_RUNS = re.compile(r' {2,}')


def clean_name(name, camel_case=False):
    """Turn a display name into an identifier-safe string.

    Characters outside ``[a-zA-Z0-9_ -]`` are dropped, runs of spaces are
    collapsed to a single space and surrounding spaces are removed.

    Args:
        name: The raw text to convert.
        camel_case: When false (the default) the result is lower-cased with
            spaces replaced by hyphens (``"Almond Butter"`` ->
            ``"almond-butter"``). When true the words are title-cased,
            spaces/hyphens/underscores are removed and the first character is
            lower-cased (``"almond butter"`` -> ``"almondButter"``).

    Returns:
        The cleaned string; empty when no allowed character survives.
    """
    name = NAME_BAD_CHARS.sub('', name)
    name = _SPACE_RUNS.sub(' ', name).strip()
    if not camel_case:
        return name.replace(' ', '-').lower()

    name = name.title().replace(' ', '').replace('-', '').replace('_', '')
    # Slice instead of indexing: name[0] raised IndexError on an empty result.
    return name[:1].lower() + name[1:]

In [40]:
# The lowercase ASCII letters in alphabetical order, one single-character
# string per entry.
alpha_list = [letter for letter in string.ascii_lowercase]

In [60]:
# Element factory for the library import document. The same URI is used as the
# element namespace and, under the `None` prefix, as the document's default
# namespace.
E = ElementMaker(
    namespace="http://www.demandware.com/xml/impex/library/2006-10-31",
    nsmap={None: "http://www.demandware.com/xml/impex/library/2006-10-31"},
)

In [78]:
soup = BeautifulSoup(response.text, 'html.parser')

# Accumulators for the generated <folder> and <content> elements; both are
# spliced into the <library> root once the whole page has been walked.
FOLDERS = []
CONTENT = []

for count, alpha in enumerate(alpha_list):
    # Each letter's section is looked up by element id ("a", "b", ...).
    alpha_soup = soup.find(id=alpha)

    # Letters with no section on the page produce no folder.
    if alpha_soup is None:
        continue

    # get_text() rather than .string: .string is None as soon as the heading
    # contains nested markup, which broke the concatenation below.
    folder_name = alpha_soup.h1.get_text().strip()
    folder_id = 'ingredients-' + folder_name
    FOLDERS.append(
        E("folder",
          E("display-name", folder_name),
          E("online-flag", "true"),
          E("parent", "ingredients-a-z"),
          # Position is the letter's index in the alphabet, so skipped
          # letters leave gaps in the sequence.
          E("position", str(count)),
          **{"folder-id": folder_id}
         )
    )

    for heading in alpha_soup.find_all('h3'):
        ingredient_name = heading.get_text().strip()
        # The description is taken from the first <p> that follows the heading
        # in document order.
        body_tag = heading.find_next('p')

        # A heading without text or without a following paragraph cannot
        # become a valid content asset; report it instead of crashing or
        # writing a literal "None" into the XML.
        if not ingredient_name or body_tag is None:
            print("skipping ingredient without name or description:",
                  repr(ingredient_name))
            continue

        ingredient_id = clean_name(ingredient_name)
        # get_text() keeps the text of paragraphs that contain inline tags
        # (links, emphasis), where .string would have been None.
        ingredient_body = body_tag.get_text().strip()

        CONTENT.append(
            E("content",
              E("display-name", ingredient_name),
              E("online-flag", "true"),
              E("searchable-flag", "false"),
              E("custom-attributes",
                E("custom-attribute",
                  E("value", ingredient_body),
                  **{"attribute-id": "body"}
                 )
               ),
              E("folder-links",
                E("classification-link",
                  **{"folder-id": folder_id})
               ),
              **{"content-id": ingredient_id}
             )
        )

# Root document: a delete instruction for the "ingredients" folder, the
# "ingredients-a-z" parent folder, then every per-letter folder and every
# ingredient content asset.
INGREDIENTS = E("library",
                E("folder",
                  **{"folder-id": "ingredients", "mode": "delete"}),
                E("folder",
                  E("display-name", "Ingredients A-Z"),
                  E("description", "take a look at everything we put in our snacks"),
                  E("online-flag", "true"),
                  E("template", "rendering/folder/glossary"),
                  **{"folder-id": "ingredients-a-z"}
                 ),
                *FOLDERS,
                *CONTENT,
                **{"library-id": "KINDSnacksSharedLibrary"}
               )

In [79]:
# Validate the generated tree against the library XSD. Violations are printed
# for review; the export below is still written either way.
schema = ET.XMLSchema(file="../schemas/library.xsd")
if not schema.validate(INGREDIENTS):
    print(schema.error_log)

# Serialize before opening the target: opening in 'wb' mode truncates the
# file, so a serialization error must not be able to leave an empty export.
str_xml = ET.tostring(INGREDIENTS, xml_declaration=True, pretty_print=True, encoding="UTF-8")

with open(os.path.join("..", "data", "ingredients.xml"), 'wb') as f:
    f.write(str_xml)