# Get Data

## Imports and Setup


In [1]:
# Imports
import requests
from bs4 import BeautifulSoup
from lxml import etree
from time import sleep
import cssselect
import re

In [2]:
# Set constants for use later
BASE_URL = "https://www.mailleartisans.org/weaves/weavedisplay.php?key="
FIRST_ARTICLE = 1
LAST_ARTICLE = 1487

In [3]:

# Create function to get multiple articles
def get_articles(articles) -> dict:
    out = dict()
    for i in articles:
        print(f"Getting article {i}")
        response = requests.get(f"{BASE_URL}{i}")
        if response.status_code == 200:
            out[i] = response.content
            print(f"Got article {i} sleeping for 10 seconds.")
        else:
            out[i] = 'failure'
            print(f"Could not get article {i} sleeping for 10 seconds.")
        sleep(10)
    
    return out


## Test Getting Data From Articles


### Get Test Articles


In [8]:
# Get some articles and store them to avoid repeated calls to the website

to_get = [
    1,    # single AR no Min no Max
    2,    # multi AR no Min no Max
    6,    # single AR no Max
    11,   # multi AR no Max
    189,  # No ARs given
    1086, # Single ar all ARs given
    1173, # multi AR all values given
    1488, # Error page with No AR values
]

arts = get_articles(to_get)



Getting article 1
Got article 1 sleeping for 10 seconds.
Getting article 2
Got article 2 sleeping for 10 seconds.
Getting article 6
Got article 6 sleeping for 10 seconds.
Getting article 11
Got article 11 sleeping for 10 seconds.
Getting article 189
Got article 189 sleeping for 10 seconds.
Getting article 1086
Got article 1086 sleeping for 10 seconds.
Getting article 1173
Got article 1173 sleeping for 10 seconds.
Getting article 1488
Got article 1488 sleeping for 10 seconds.


### Test Single Article

In [9]:
# Select Article
article = arts[1]

# Convert first article into an etree
soup = BeautifulSoup(article, 'html.parser')
dom = etree.HTML(str(soup.html))

# Parse etree to find values of interest

## Weave Title
path = '/html/body/div[5]/div/table/tr/td[2]/div[2]/font'
title = dom.xpath(path)[0].text
print(f'Weave Title: {title}')

## Get AR Values

### Get AR val string from page
path = '/html/body/div[5]/div/table/tr/td[1]/table/tr[2]/td/div[not(@class)][2]//text()'
ar_val_string = ' '.join([i for i in dom.xpath(path) if i != '\n'][:-1])

### Extract max ar value from ar string
max_ar = re.search('%s(.*)%s' % ('Max. AR :', ' Ideal AR :'), ar_val_string).group(1).replace(' ', '')

### Extract max ar value from ar string
ideal_ar = re.search('%s(.*)%s' % (' Ideal AR :', 'Min. AR :'), ar_val_string).group(1).replace(' ', '')

min_ar = re.search('%s(.*)' % ('Min. AR :'), ar_val_string).group(1).replace(' ', '')

print(f'Max AR: {max_ar}, Ideal AR: {ideal_ar}, Min AR: {min_ar}')



IndexError: list index out of range

### Xpath Testing

In [6]:
# New Goal get all text from weave data box
# Weave data box is second 


path = '/html/body/div[5]/div/table/tr/td[1]/table/tr[2]/td/div[not(@class)][2]//text()'

num = 0
results = dom.xpath(path)
result = results[num]

print(f"Current path results: {dom.xpath(path)}")

print(f"Selected Element at path results{result}")

print("Children of selected element at path results:")

for i in result:
    print('\t',i, sep='')


Current path results: ['\n', 'Max. AR', ': ', '\n', 'Ideal AR', ': ', '5.2', '\n', 'Min. AR', ':  ', '\n', 'Hover over an AR to see a list of ring sizes.', '\n']
Selected Element at path results

Children of selected element at path results:
	



### Test All Articles

In [7]:

# Iterate through all articles
for key, value in arts.items():
    print(key)

    # Convert article to etree
    soup = BeautifulSoup(value, 'html.parser')
    dom = etree.HTML(str(soup.html))

    # Parse etree to find values of interest

    ## Weave Title
    path = '/html/body/div[5]/div/table/tr/td[2]/div[2]/font'
    title = dom.xpath(path)[0].text
    print(f'Weave Title: {title}')
    
    ## Get AR Values

    ### Get AR val string from page
    path = '/html/body/div[5]/div/table/tr/td[1]/table/tr[2]/td/div[not(@class)][2]//text()'
    ar_val_string = ' '.join([i for i in dom.xpath(path) if i != '\n'][:-1])

    ### Extract max ar value from ar string
    max_ar = re.search('%s(.*)%s' % ('Max. AR :', ' Ideal AR :'), ar_val_string).group(1).replace(' ', '')

    ### Extract max ar value from ar string
    ideal_ar = re.search('%s(.*)%s' % (' Ideal AR :', 'Min. AR :'), ar_val_string).group(1).replace(' ', '')

    min_ar = re.search('%s(.*)' % ('Min. AR :'), ar_val_string).group(1).replace(' ', '')

    print(f'Max AR: {max_ar}, Ideal AR: {ideal_ar}, Min AR: {min_ar}')



1
Weave Title: Trizantine
Max AR: , Ideal AR: 5.2, Min AR: 
2
Weave Title: Hizashi 1
Max AR: , Ideal AR: 8.0|3.0, Min AR: 
6
Weave Title: European 4 in 1
Max AR: , Ideal AR: 4.0, Min AR: 2.83
11
Weave Title: Dragonscale
Max AR: , Ideal AR: 3.9|6.1, Min AR: 3.7|5.7
189
Weave Title: Captive Persian Round Sheet
Max AR: , Ideal AR: , Min AR: 
1086
Weave Title: Cloudy Day
Max AR: 4.5, Ideal AR: 3.9, Min AR: 3.6
1173
Weave Title: Onering
Max AR: 3.0|6.9, Ideal AR: 2.9|5.0, Min AR: 2.7|3.0
