# Get Data

## Imports and Setup


In [1]:
# Imports
import requests
from bs4 import BeautifulSoup
from lxml import etree
from time import sleep
import cssselect
import re
import datetime
import glob

In [2]:
# Set constants for use later
BASE_URL = "https://www.mailleartisans.org/weaves/weavedisplay.php?key="
FIRST_ARTICLE = 1
LAST_ARTICLE = 1487

In [3]:

# Create function to download articles
def download_articles(articles: iter) -> None:
    for article in articles:
        for i in range(1, 4):
            sleep(10)
            response = requests.get(f"{BASE_URL}{article}")
            now = datetime.datetime.now()
            if response.status_code == 200:
                print(f'{now}: Article {article} attempt {i}/3 succeeded moving on in 10 seconds.')
                with open(f'../articles/article_{article}.html', mode='wb') as outfile:
                    outfile.write(response.content)
                break
            elif i < 3:
                print(f'{now}: Article {article} attempt {i}/3 failed trying again in 10 seconds.')
            else:
                print(f'{now}: Article {article} attempt {i}/3 failed moving on in 10 seconds.')



## Test Getting Data From Articles

### Define Function To Get Data From Articles

In [4]:

def get_articles_and_tags(tree) -> dict:
    out = dict()

    # Get Article and Tag information
    path = '/html/body/div[5]/div/table/tr/td[2]/div[2]//text()'
    lines = [i for i in tree.xpath(path) if i not in ('[', ']')]

    ## Get indicies to split the tag and article data
    tags_start = lines.index('Weave Tags (Click to Search Weaves) ')
    article_start = lines.index('Related Articles ')
    gallery_start = lines.index('Random Gallery Items Tagged as using this Weave ')


    # Get Articles

    ## Get only article lines
    article_lines = lines[article_start+1:gallery_start]

    ## Get Article headings
    path = '/html/body/div[5]/div/table/tr/td[2]/div[2]/b//text()'
    headings = tree.xpath(path)

    if len(headings) > 0:
        ## Parse list and headings into dict of lists of articles by heading

        ### Set starting values
        tmp = list()
        level = list()
        start = True

        ### Split lines into on list per heading
        for i in article_lines:
            if (i in headings) and start:   
                level = list()
                level.append(i)
                start = False
            elif (i in headings) and not start:
                tmp.append(level)
                level = list()
                level.append(i)
            else:
                level.append(i)
        tmp.append(level)

        ## Clean up list into list of article titles
        lol = [''.join(i).split('\n') for i in tmp]

        ### Turn list into dictionary
        tmp = dict()
        for i in lol:
            tmp[i[0]] = [j for j in i[1:] if j != '']

        out['Articles'] = tmp
    else:
        out['Articles'] = None

    # Get Tags
    
    ## Get tag lines
    tag_lines = lines[tags_start+1:article_start]
    
    ## Remove unecessary info and clean tag lines
    out['Tags'] = [i.strip() for i in tag_lines if i not in ('\n', ', ')] 

    return out



def article_parser(article: bytes, art_number: int) -> dict:
    # Create output dict
    out = dict()
    out['Article Number'] = art_number

    # Convert article into an etree
    soup = BeautifulSoup(article, 'html.parser')
    dom = etree.HTML(str(soup.html))


    # Parse etree to find values of interest

    ## Determine if an article exists
    if len(dom.xpath('/html/body/div[5]/div/h2')) > 0:
        out = {
            'Weave Title':   None,
            'Max AR':        None,
            'Ideal AR':      None,
            'Min AR':        None,
            'Date Uploaded': None,
            'Last Edited':   None,
            'Articles':      None,
            'Tags':          None,
        }
        return out
    else:
        ## Weave Title
        path = '/html/body/div[5]/div/table/tr/td[2]/div[2]/font'
        title = dom.xpath(path)[0].text
        out['Weave Title'] = title

        ## Get AR Values

        ### Get AR val string from page
        path = '/html/body/div[5]/div/table/tr/td[1]/table/tr[2]/td/div[not(@class)][2]//text()'
        ar_val_string = ' '.join([i for i in dom.xpath(path) if i != '\n'][:-1])

        ### Extract Aspect Ratio Values from the AR val string
        max_ar = re.search('%s(.*)%s' % ('Max. AR :', ' Ideal AR :'), ar_val_string).group(1).replace(' ', '')
        ideal_ar = re.search('%s(.*)%s' % (' Ideal AR :', 'Min. AR :'), ar_val_string).group(1).replace(' ', '')
        min_ar = re.search('%s(.*)' % ('Min. AR :'), ar_val_string).group(1).replace(' ', '')

        ### Add AR values to output dictionary
        out.update({'Max AR': max_ar, 'Ideal AR': ideal_ar, 'Min AR': min_ar})


        ## Get Dates

        ### Get date string
        path = '/html/body/div[5]/div/table/tr/td[1]/table/tr[2]/td/div[not(@class)][3]//text()'
        results = dom.xpath(path)
        date_string = ''.join(results).replace('\n', '')

        ### Extract Dates from date string
        date_uploaded = re.search('%s(.*)%s' % ('Date Uploaded', 'Last Edited'), date_string).group(1).strip()
        last_edited = re.search('%s(.*)' % ('Last Edited'), date_string).group(1).strip()

        ### Add AR values to output dictionary
        out.update({'Date Uploaded': date_uploaded, 'Last Edited': last_edited})


        ## Get Article and Tag information
        out.update(get_articles_and_tags(dom))

        return out



### Get Test Articles


In [5]:

# Get some articles and store them to avoid repeated calls to the website

to_get = [
    1,    # single AR no Min no Max
    2,    # multi AR no Min no Max
    6,    # single AR no Max
    11,   # multi AR no Max
    189,  # No ARs given
    1086, # Single ar all ARs given
    1173, # multi AR all values given
    1488, # Error page with No AR values
]

# Download test articles
download_articles(to_get)


2023-04-05 08:31:10.716649: Article 1 attempt 1/3 succeeded moving on in 10 seconds.
2023-04-05 08:31:20.874008: Article 2 attempt 1/3 succeeded moving on in 10 seconds.
2023-04-05 08:31:31.035498: Article 6 attempt 1/3 succeeded moving on in 10 seconds.
2023-04-05 08:31:41.215062: Article 11 attempt 1/3 succeeded moving on in 10 seconds.
2023-04-05 08:31:51.377708: Article 189 attempt 1/3 succeeded moving on in 10 seconds.
2023-04-05 08:32:01.526538: Article 1086 attempt 1/3 succeeded moving on in 10 seconds.
2023-04-05 08:32:11.683812: Article 1173 attempt 1/3 succeeded moving on in 10 seconds.
2023-04-05 08:32:21.822933: Article 1488 attempt 1/3 succeeded moving on in 10 seconds.


### Test Single Article

In [6]:

with open('../articles/article_1086.html', mode='r') as article:
    print(article_parser(article.read(), 1086))


{'Article Number': 1086, 'Weave Title': 'Cloudy Day', 'Max AR': '4.5', 'Ideal AR': '3.9', 'Min AR': '3.6', 'Date Uploaded': 'January 14, 2011, 6:04 pm', 'Last Edited': 'January 10, 2016, 8:21 pm', 'Articles': {'Weave Tutorials': ['Cloudy Day Tutorial']}, 'Tags': ['European', 'Persian', 'Sheet', 'Variant', 'Rhino', 'Mage']}


### Xpath Testing

In [7]:
# Goal: Get data on articles

# path = '/html/body/div[5]/div/table/tr/td[2]/div[2]'

# num = 0
# results = dom.xpath(path)
# result = results[num]

# print(f"Current path results: {dom.xpath(path)}")

# print(f"Selected Element at path results{result}")

# print("Children of selected element at path results:")

# for i in result:
#     print('\t',i, sep='')


### Test All Articles

In [8]:

# Iterate through all test articles
for article_path in glob.glob('../articles/*'):
    article_number = int(article_path[:-5].split('_')[1])
    with open(article_path, mode='r') as article:
        print(article_parser(article.read(), article_number))


{'Article Number': 1, 'Weave Title': 'Trizantine', 'Max AR': '', 'Ideal AR': '5.2', 'Min AR': '', 'Date Uploaded': 'May 3, 2008, 6:36 pm', 'Last Edited': 'June 22, 2017, 11:45 am', 'Articles': {'Weave Tutorials': ['Trizantine (CGI)']}, 'Tags': ['European', 'Chain', 'Progression', 'Alpha']}
{'Article Number': 1086, 'Weave Title': 'Cloudy Day', 'Max AR': '4.5', 'Ideal AR': '3.9', 'Min AR': '3.6', 'Date Uploaded': 'January 14, 2011, 6:04 pm', 'Last Edited': 'January 10, 2016, 8:21 pm', 'Articles': {'Weave Tutorials': ['Cloudy Day Tutorial']}, 'Tags': ['European', 'Persian', 'Sheet', 'Variant', 'Rhino', 'Mage']}
{'Article Number': 11, 'Weave Title': 'Dragonscale', 'Max AR': '', 'Ideal AR': '3.9|6.1', 'Min AR': '3.7|5.7', 'Date Uploaded': 'August 6, 2008, 4:14 am', 'Last Edited': 'July 16, 2019, 7:53 pm', 'Articles': {'Weave Tutorials': ['Dragonscale (CGI; Preclose Large Rings)', 'Dragonscale (CGI; Preclose Small Rings)', 'Dragonscale', 'Dragonscale Expansion (Horizontal Direction)', 'Drago