# Get Data

## Imports and Setup


In [1]:
# Imports
import requests
from bs4 import BeautifulSoup
from lxml import etree
from time import sleep
# import cssselect
import re
import datetime
import json
import glob

In [2]:
# Set constants for use later
# URL prefix of a weave page; the article key is appended to it when downloading
BASE_URL = "https://www.mailleartisans.org/weaves/weavedisplay.php?key="
# First and last article keys to download (both ends are included in the range)
FIRST_ARTICLE = 1
# NOTE(review): presumably the highest key on the site when this was written — confirm
LAST_ARTICLE = 1487

## Get The Data


### Define Function To Download Data From M.A.I.L.


In [None]:

# Create function to download articles
def download_articles(articles: iter, *, attempts: int = 3, delay: float = 10, timeout: float = 30) -> None:
    """Download every requested article page and save it to ``../articles``.

    Each article is requested from ``BASE_URL`` + key. A response with status
    200 is written verbatim to ``../articles/article_<key>.html``; anything
    else (including a network error or timeout) counts as a failed attempt and
    is retried until ``attempts`` is exhausted, after which the article is
    skipped. Progress is reported with ``print``.

    Args:
        articles: any iterable of article keys (e.g. a ``range``).
        attempts: maximum number of requests per article.
        delay: seconds to wait before every request, to go easy on the server.
        timeout: seconds to wait for the server before giving up on a request.
    """
    for article in articles:
        for attempt in range(1, attempts + 1):
            # Pause before every request (including the first one)
            sleep(delay)
            try:
                response = requests.get(f"{BASE_URL}{article}", timeout=timeout)
            except requests.RequestException:
                # A dropped connection or timeout is just another failed
                # attempt; it must not abort the whole download run
                response = None
            now = datetime.datetime.now()
            if response is not None and response.status_code == 200:
                print(f'{now}: Article {article} attempt {attempt}/{attempts} succeeded moving on in {delay} seconds.')
                # Store the raw bytes so the page's own encoding is preserved
                with open(f'../articles/article_{article}.html', mode='wb') as outfile:
                    outfile.write(response.content)
                break
            elif attempt < attempts:
                print(f'{now}: Article {article} attempt {attempt}/{attempts} failed trying again in {delay} seconds.')
            else:
                print(f'{now}: Article {article} attempt {attempt}/{attempts} failed moving on in {delay} seconds.')



### Download All Articles


In [None]:

# Fetch every article key from FIRST_ARTICLE to LAST_ARTICLE inclusive
# (range() excludes its stop value, hence the +1)
download_articles(range(FIRST_ARTICLE, LAST_ARTICLE+1))


### Define Function To Get Data From Article

In [3]:

def get_articles_and_tags(tree) -> dict:
    """Extract the related articles and the weave tags from a parsed weave page.

    The text nodes of the page's info block are cut into sections at three
    fixed captions (tags, related articles, gallery). The tag section becomes
    a flat list; the article section is grouped under the bold headings found
    in the same block.

    Args:
        tree: parsed page exposing an ``xpath`` method.

    Returns:
        A dict with two keys:
        ``'Articles'`` - mapping of heading to list of article titles, or
        ``None`` when the block contains no bold headings;
        ``'Tags'`` - list of tag names with surrounding whitespace removed.

    Raises:
        ValueError: if one of the three section captions is missing.
    """
    # Every text node of the info block, without the bare square brackets
    lines = [
        text
        for text in tree.xpath('/html/body/div[5]/div/table/tr/td[2]/div[2]//text()')
        if text not in ('[', ']')
    ]

    # Positions of the captions that delimit the tag and article sections
    tags_at = lines.index('Weave Tags (Click to Search Weaves) ')
    articles_at = lines.index('Related Articles ')
    gallery_at = lines.index('Random Gallery Items Tagged as using this Weave ')

    # Bold text inside the block marks the article group headings
    headings = tree.xpath('/html/body/div[5]/div/table/tr/td[2]/div[2]/b//text()')

    if not headings:
        articles = None
    else:
        # Split the article section into one list per heading; anything
        # appearing before the first heading is dropped
        groups = []
        current = []
        heading_seen = False
        for line in lines[articles_at + 1:gallery_at]:
            if line not in headings:
                current.append(line)
                continue
            if heading_seen:
                groups.append(current)
            heading_seen = True
            current = [line]
        groups.append(current)

        # Re-join each group and split on newlines: the first piece is the
        # heading, the remaining non-empty pieces are the article titles
        articles = {}
        for group in groups:
            heading, *titles = ''.join(group).split('\n')
            articles[heading] = [title for title in titles if title != '']

    # Tags sit between the tag caption and the article caption; drop the
    # separator nodes and trim what is left
    tags = [
        tag.strip()
        for tag in lines[tags_at + 1:articles_at]
        if tag not in ('\n', ', ')
    ]

    return {'Articles': articles, 'Tags': tags}



def _text_between(text: str, start: str, end: str = '') -> str:
    """Return the part of ``text`` between the literal labels ``start`` and ``end``.

    Both labels are escaped so characters such as ``.`` are matched literally.
    With an empty ``end`` everything after ``start`` up to the end of the line
    is returned.

    Raises:
        AttributeError: if the labels are not found (``re.search`` returns None).
    """
    pattern = re.escape(start) + '(.*)' + re.escape(end)
    return re.search(pattern, text).group(1)


def article_parser(article: bytes) -> dict:
    """Parse one downloaded weave page into a dict of its fields.

    Args:
        article: raw HTML of the page, as saved to disk.

    Returns:
        A dict with the keys 'Weave Title', 'Max AR', 'Ideal AR', 'Min AR',
        'Date Uploaded', 'Last Edited', 'Articles' and 'Tags'. All values are
        ``None`` when the page carries an ``h2`` directly in the content
        container, which this code treats as "no article here".

    Raises:
        IndexError: if the title element is missing from an article page.
        AttributeError: if an expected AR or date label is missing.
        ValueError: if the tag/article section captions are missing.
    """
    # Convert article into an etree
    # NOTE(review): the BeautifulSoup round-trip presumably normalises the
    # markup before lxml sees it — confirm
    soup = BeautifulSoup(article, 'html.parser')
    dom = etree.HTML(str(soup.html))

    # Parse etree to find values of interest

    ## Determine if an article exists (an h2 here marks a page without one)
    if dom.xpath('/html/body/div[5]/div/h2'):
        return {
            'Weave Title':   None,
            'Max AR':        None,
            'Ideal AR':      None,
            'Min AR':        None,
            'Date Uploaded': None,
            'Last Edited':   None,
            'Articles':      None,
            'Tags':          None,
        }

    out = {}

    ## Weave Title
    path = '/html/body/div[5]/div/table/tr/td[2]/div[2]/font'
    out['Weave Title'] = dom.xpath(path)[0].text

    ## Get AR Values

    ### Get AR val string from page (bare newlines and the last text node are dropped)
    path = '/html/body/div[5]/div/table/tr/td[1]/table/tr[2]/td/div[not(@class)][2]//text()'
    ar_val_string = ' '.join([i for i in dom.xpath(path) if i != '\n'][:-1])

    ### Extract Aspect Ratio Values from the AR val string
    max_ar = _text_between(ar_val_string, 'Max. AR :', ' Ideal AR :').replace(' ', '')
    ideal_ar = _text_between(ar_val_string, ' Ideal AR :', 'Min. AR :').replace(' ', '')
    min_ar = _text_between(ar_val_string, 'Min. AR :').replace(' ', '')

    ### Add AR values to output dictionary
    out.update({'Max AR': max_ar, 'Ideal AR': ideal_ar, 'Min AR': min_ar})

    ## Get Dates

    ### Get date string
    path = '/html/body/div[5]/div/table/tr/td[1]/table/tr[2]/td/div[not(@class)][3]//text()'
    date_string = ''.join(dom.xpath(path)).replace('\n', '')

    ### Extract Dates from date string
    date_uploaded = _text_between(date_string, 'Date Uploaded', 'Last Edited').strip()
    last_edited = _text_between(date_string, 'Last Edited').strip()

    ### Add dates to output dictionary
    out.update({'Date Uploaded': date_uploaded, 'Last Edited': last_edited})

    ## Get Article and Tag information
    out.update(get_articles_and_tags(dom))

    return out


### Iterate Through Articles And Get Data

In [4]:

# Parsed data of every downloaded article, keyed by article number
out = dict()

# Iterate through all downloaded articles. Only files named
# article_<number>.html are picked up, so a stray file in the folder
# cannot break the number parsing below.
for article_path in glob.glob('../articles/article_*.html'):
    # Drop the '.html' suffix and take what follows the LAST underscore, so
    # underscores elsewhere in the path do not matter
    article_number = int(article_path[:-len('.html')].rsplit('_', 1)[1])
    with open(article_path, mode='rb') as article:
        out[article_number] = article_parser(article.read())


### Save The Data For Analysis

In [None]:

# Write the parsed articles to disk as indented JSON with sorted keys
with open('../data/chainmail_data.json', 'w') as json_file:
    json.dump(obj=out, fp=json_file, indent=4, sort_keys=True)


### Convert the Data to List


In [6]:

# Set headers for csv data: exactly one name per value in each row of
# data_2d_list (the two article groups get a column each)
headers = [
    'id',
    'weave_title',
    'max_ar',
    'ideal_ar',
    'min_ar',
    'date_uploaded',
    'last_edited',
    'weave_tutorials',
    'other_related_articles',
    'tags',
]

# Convert data into 2 dimensional list, one row per article.
# Article titles of each group are flattened into one '||'-separated string;
# a missing 'Articles' entry (None) yields an empty string.
data_2d_list = [
    [
        article_id,
        info['Weave Title'],
        info['Max AR'],
        info['Ideal AR'],
        info['Min AR'],
        info['Date Uploaded'],
        info['Last Edited'],
        '||'.join((info['Articles'] or {}).get('Weave Tutorials', [])),
        '||'.join((info['Articles'] or {}).get('Other Related Articles', [])),
        info['Tags'],
    ]
    for article_id, info in out.items()
]



{1: {'Weave Title': 'Trizantine',
  'Max AR': '',
  'Ideal AR': '5.2',
  'Min AR': '',
  'Date Uploaded': 'May 3, 2008, 6:36 pm',
  'Last Edited': 'June 22, 2017, 11:45 am',
  'Articles': {'Weave Tutorials': ['Trizantine (CGI)']},
  'Tags': ['European', 'Chain', 'Progression', 'Alpha']},
 10: {'Weave Title': 'European 12 in 2',
  'Max AR': '',
  'Ideal AR': '8.0',
  'Min AR': '6.9',
  'Date Uploaded': 'February 8, 2008, 1:08 am',
  'Last Edited': 'July 25, 2013, 12:18 am',
  'Articles': None,
  'Tags': ['European', 'Sheet', 'Kinged', 'Modification']},
 100: {'Weave Title': 'Persian 4 in 1 Sheet',
  'Max AR': '',
  'Ideal AR': '5.4',
  'Min AR': '',
  'Date Uploaded': 'February 22, 2005, 11:47 pm',
  'Last Edited': 'April 21, 2019, 3:02 pm',
  'Articles': {'Weave Tutorials': ['Persian 4 in 1 Sheet (Flip Method)',
    'Persian 4 in 1 Sheet (Eye Method)',
    'Half Persian 2 Sheet 4 in 1 Tutorial'],
   'Other Related Articles': ['Understanding the Persian Family [Persian,Connections,Weave