# Get Data

## Imports and Setup


In [1]:
# Imports
import requests
from bs4 import BeautifulSoup
from lxml import etree
from time import sleep
import cssselect
import re

In [2]:
# Set constants for use later

# URL prefix of a weave page; get_articles appends the article number to it.
BASE_URL = "https://www.mailleartisans.org/weaves/weavedisplay.php?key="

# Article-number bounds.
# NOTE(review): presumably the first and last valid keys on the site (the test
# cell treats 1488 as an error page) -- no visible cell reads these yet; confirm.
FIRST_ARTICLE = 1
LAST_ARTICLE = 1487

In [3]:

# Create function to get multiple articles
def get_articles(articles, delay: float = 10, timeout: float = 30) -> dict:
    """Download the page of every requested article, pausing between requests.

    Parameters
    ----------
    articles : iterable
        Article keys; each one is appended to ``BASE_URL`` to build the URL.
    delay : float, optional
        Seconds to sleep after every request, successful or not (default 10).
    timeout : float, optional
        Seconds ``requests.get`` may wait on the server (default 30). Without
        a timeout a stalled connection would block the crawl indefinitely.

    Returns
    -------
    dict
        Maps each article key to the raw response body (``bytes``) when the
        server answered with status 200, and to the string ``'failure'`` when
        it answered with another status or the request itself raised.
    """
    out = dict()
    for i in articles:
        print(f"Getting article {i}")

        try:
            response = requests.get(f"{BASE_URL}{i}", timeout=timeout)
        except requests.RequestException as error:
            # A single network error must not discard the pages already
            # fetched; record it like any other failed download and move on.
            print(f"Request for article {i} raised {error!r}")
            response = None

        if response is not None and response.status_code == 200:
            # Only the status code is checked here; the body is stored as-is.
            out[i] = response.content
            print(f"Got article {i} sleeping for {delay} seconds.")
        else:
            out[i] = 'failure'
            print(f"Could not get article {i} sleeping for {delay} seconds.")

        # Pause after every request to keep the load on the site low.
        sleep(delay)

    return out


## Test Getting Data From Articles

### Define Function To Get Data From Articles

In [47]:

def get_articles_and_tags(tree) -> dict:
    """Pull the related-article listing and the weave tags out of a parsed page.

    ``tree`` only needs an ``xpath`` method; ``article_parser`` passes in the
    lxml tree it builds.

    Returns a dict with two entries, in this order:

    * ``'Articles'`` -- a dict mapping each heading to the list of titles
      found under it, or ``None`` when the page has no ``<b>`` heading text.
    * ``'Tags'`` -- the stripped tag strings.

    Raises ``ValueError`` (from ``list.index``) when one of the three section
    marker strings is missing from the page text.
    """
    result = dict()

    # Every text node of the content block, without the bare bracket nodes
    text_nodes = [
        node
        for node in tree.xpath('/html/body/div[5]/div/table/tr/td[2]/div[2]//text()')
        if node not in ('[', ']')
    ]

    # Positions of the section markers that separate tags, articles and gallery
    tags_marker = text_nodes.index('Weave Tags (Click to Search Weaves) ')
    articles_marker = text_nodes.index('Related Articles ')
    gallery_marker = text_nodes.index('Random Gallery Items Tagged as using this Weave ')

    # --- Articles ---------------------------------------------------------

    # Text that sits between the articles marker and the gallery marker
    related_nodes = text_nodes[articles_marker + 1:gallery_marker]

    # Bold text inside the content block acts as the article headings
    heading_texts = tree.xpath('/html/body/div[5]/div/table/tr/td[2]/div[2]/b//text()')

    if len(heading_texts) == 0:
        result['Articles'] = None
    else:
        # Split the article text into one group per heading. Whatever precedes
        # the first heading is dropped as soon as that heading shows up.
        groups = list()
        current = list()
        seen_heading = False
        for node in related_nodes:
            if node in heading_texts:
                if seen_heading:
                    groups.append(current)
                current = [node]
                seen_heading = True
            else:
                current.append(node)
        groups.append(current)

        # Within a group the first line is the heading, the rest are titles
        by_heading = dict()
        for group in groups:
            pieces = ''.join(group).split('\n')
            by_heading[pieces[0]] = [title for title in pieces[1:] if title != '']

        result['Articles'] = by_heading

    # --- Tags -------------------------------------------------------------

    # Text between the tags marker and the articles marker, separators removed
    tag_nodes = text_nodes[tags_marker + 1:articles_marker]
    result['Tags'] = [node.strip() for node in tag_nodes if node not in ('\n', ', ')]

    return result



def article_parser(article: bytes, art_number: int) -> dict:
    """Parse one downloaded weave page into a flat record.

    Parameters
    ----------
    article : bytes
        Raw page body as stored by ``get_articles``. The ``'failure'``
        sentinel that ``get_articles`` stores for a failed download is
        accepted too and yields an empty record.
    art_number : int
        Article key; copied into the record as ``'Article Number'``.

    Returns
    -------
    dict
        Keys ``'Article Number'``, ``'Weave Title'``, ``'Max AR'``,
        ``'Ideal AR'``, ``'Min AR'``, ``'Date Uploaded'``, ``'Last Edited'``,
        ``'Articles'`` and ``'Tags'``. When the page is not a weave article
        every key except ``'Article Number'`` is ``None``.

    Raises
    ------
    IndexError, AttributeError, ValueError
        When a page passes the existence check but does not have the expected
        layout (missing title element, missing AR/date labels, missing
        section markers).
    """
    # Create output dict
    out = dict()
    out['Article Number'] = art_number

    # Field values used when there is no article to parse
    missing = {
        'Weave Title':   None,
        'Max AR':        None,
        'Ideal AR':      None,
        'Min AR':        None,
        'Date Uploaded': None,
        'Last Edited':   None,
        'Articles':      None,
        'Tags':          None,
    }

    # A failed download carries no HTML at all; report it as an empty record
    # instead of crashing further down on the missing title element.
    if isinstance(article, str) and article == 'failure':
        out.update(missing)
        return out

    # Convert article into an etree
    soup = BeautifulSoup(article, 'html.parser')
    dom = etree.HTML(str(soup.html))

    # Parse etree to find values of interest

    ## Determine if an article exists: an <h2> at this position marks a page
    ## without a weave article.
    if len(dom.xpath('/html/body/div[5]/div/h2')) > 0:
        # Update rather than rebind `out` so 'Article Number' is kept and the
        # record has the same keys as a successfully parsed one.
        out.update(missing)
        return out

    ## Weave Title
    path = '/html/body/div[5]/div/table/tr/td[2]/div[2]/font'
    out['Weave Title'] = dom.xpath(path)[0].text

    ## Get AR Values

    ### Get AR val string from page (newline nodes and the last node dropped)
    path = '/html/body/div[5]/div/table/tr/td[1]/table/tr[2]/td/div[not(@class)][2]//text()'
    ar_val_string = ' '.join([i for i in dom.xpath(path) if i != '\n'][:-1])

    ### Extract Aspect Ratio Values from the AR val string.
    ### The dots in the labels are escaped so they only match a literal '.'.
    max_ar = re.search(r'Max\. AR :(.*) Ideal AR :', ar_val_string).group(1).replace(' ', '')
    ideal_ar = re.search(r' Ideal AR :(.*)Min\. AR :', ar_val_string).group(1).replace(' ', '')
    min_ar = re.search(r'Min\. AR :(.*)', ar_val_string).group(1).replace(' ', '')

    ### Add AR values to output dictionary
    out.update({'Max AR': max_ar, 'Ideal AR': ideal_ar, 'Min AR': min_ar})

    ## Get Dates

    ### Get date string
    path = '/html/body/div[5]/div/table/tr/td[1]/table/tr[2]/td/div[not(@class)][3]//text()'
    date_string = ''.join(dom.xpath(path)).replace('\n', '')

    ### Extract Dates from date string
    date_uploaded = re.search(r'Date Uploaded(.*)Last Edited', date_string).group(1).strip()
    last_edited = re.search(r'Last Edited(.*)', date_string).group(1).strip()

    ### Add dates to output dictionary
    out.update({'Date Uploaded': date_uploaded, 'Last Edited': last_edited})

    ## Get Article and Tag information
    out.update(get_articles_and_tags(dom))

    return out



### Get Test Articles


In [5]:
# Get some articles and store them to avoid repeated calls to the website

# Article numbers chosen to cover every combination of AR fields seen so far.
to_get = [
    1,                 # single AR, no Min, no Max
    2,                 # multi AR, no Min, no Max
    6,                 # single AR, no Max
    11,                # multi AR, no Max
    189,               # no ARs given
    1086,              # single AR, all ARs given
    1173,              # multi AR, all values given
    LAST_ARTICLE + 1,  # error page with no AR values (first key past LAST_ARTICLE)
]

# Downloaded once and kept in `arts` so the parsing cells can be re-run
# without hitting the website again.
arts = get_articles(to_get)



Getting article 1
Got article 1 sleeping for 10 seconds.
Getting article 2
Got article 2 sleeping for 10 seconds.
Getting article 6
Got article 6 sleeping for 10 seconds.
Getting article 11
Got article 11 sleeping for 10 seconds.
Getting article 189
Got article 189 sleeping for 10 seconds.
Getting article 1086
Got article 1086 sleeping for 10 seconds.
Getting article 1173
Got article 1173 sleeping for 10 seconds.
Getting article 1488
Got article 1488 sleeping for 10 seconds.


### Test Single Article

In [48]:

# Smoke test on one cached page: article 1086 is the test case listed as
# having a single AR with all AR values given.
article_parser(arts[1086], 1086)


{'Article Number': 1086,
 'Weave Title': 'Cloudy Day',
 'Max AR': '4.5',
 'Ideal AR': '3.9',
 'Min AR': '3.6',
 'Date Uploaded': 'January 14, 2011, 6:04 pm',
 'Last Edited': 'January 10, 2016, 8:21 pm',
 'Articles': {'Weave Tutorials': ['Cloudy Day Tutorial']},
 'Tags': ['European', 'Persian', 'Sheet', 'Variant', 'Rhino', 'Mage']}

### XPath Testing

In [6]:
# Goal: Get data on articles

# Build the tree to explore in this cell. `dom` otherwise only exists as a
# local variable inside article_parser, so on a fresh kernel this cell would
# fail with a NameError. Article 1086 is an arbitrary choice among the cached
# test pages; swap the key to inspect another page.
dom = etree.HTML(str(BeautifulSoup(arts[1086], 'html.parser').html))

path = '/html/body/div[5]/div/table/tr/td[2]/div[2]'

# Index of the match to inspect when the path selects several elements
num = 0
results = dom.xpath(path)
result = results[num]

print(f"Current path results: {results}")

print(f"Selected Element at path results{result}")

print("Children of selected element at path results:")

# Iterating an lxml element yields its direct children
for i in result:
    print('\t',i, sep='')


Current path results: [<Element div at 0x228ab544580>]
Selected Element at path results<Element div at 0x228ab544580>
Children of selected element at path results:
	<Element font at 0x228ab544180>
	<Element br at 0x228aae309c0>
	<Element br at 0x228ab544740>
	<Element u at 0x228ab544f00>
	<Element br at 0x228ab544b80>
	<Element a at 0x228ab544940>
	<Element a at 0x228ab544680>
	<Element br at 0x228ab544dc0>
	<Element a at 0x228ab547500>
	<Element br at 0x228ab544f40>
	<Element br at 0x228ab547dc0>
	<Element a at 0x228ab547480>
	<Element br at 0x228ab547200>
	<Element br at 0x228ab547d00>
	<Element br at 0x228ab547c00>
	<Element br at 0x228ab5477c0>
	<Element a at 0x228ab547a40>
	<Element br at 0x228aae309c0>
	<Element br at 0x228ab547280>
	<Element h3 at 0x228ab5473c0>
	<Element a at 0x228ab547340>
	<Element a at 0x228ab547b80>
	<Element a at 0x228ab547580>
	<Element a at 0x228ab5474c0>
	<Element a at 0x228ab547c40>
	<Element a at 0x228ab547140>
	<Element br at 0x228aae309c0>
	<Element

### Test All Articles

In [50]:

# Run the parser over every cached test page; `arts` maps article number to
# the downloaded page, so the key doubles as the article number argument.
for key, article in arts.items():
    print(f'Article: {key}')
    print(article_parser(article, key))
    # Blank line between records
    print()


Article: 1
{'Article Number': 1, 'Weave Title': 'Trizantine', 'Max AR': '', 'Ideal AR': '5.2', 'Min AR': '', 'Date Uploaded': 'May 3, 2008, 6:36 pm', 'Last Edited': 'June 22, 2017, 11:45 am', 'Articles': {'Weave Tutorials': ['Trizantine (CGI)']}, 'Tags': ['European', 'Chain', 'Progression', 'Alpha']}

Article: 2
{'Article Number': 2, 'Weave Title': 'Hizashi 1', 'Max AR': '', 'Ideal AR': '8.0|3.0', 'Min AR': '', 'Date Uploaded': 'December 28, 2008, 7:41 pm', 'Last Edited': 'August 2, 2011, 7:13 pm', 'Articles': {'Weave Tutorials': ['Hizashi Subfamily']}, 'Tags': ['Japanese', 'Sheet', 'Radial', 'Modification']}

Article: 6
{'Article Number': 6, 'Weave Title': 'European 4 in 1', 'Max AR': '', 'Ideal AR': '4.0', 'Min AR': '2.83', 'Date Uploaded': 'April 6, 2010, 9:19 am', 'Last Edited': 'December 14, 2015, 10:29 pm', 'Articles': {'Weave Tutorials': ['European 4 in 1 (CGI; chain)', 'European 4 in 1 (CGI; One at a Time)', 'European 4 in 1 (CGI; ribbon)', 'European 4 in 1 (Ribbon)', 'European