In [1]:
from lxml import html, etree

import requests
import time

# URL template for one page of a topic: {0} is the post offset, {1} the topic id.
url_form = 'http://forum.spore.com/jforum/posts/list/{0}/{1}.page'

# Downloaded pages, keyed by topic id; filled in by make_tree.
page_cache = dict()

In [2]:
def make_tree(offset, topic_id):
    """Fetch one page of a forum topic and parse it into an lxml HTML tree.

    Responses are memoised in the module-level ``page_cache`` so that asking
    for the same page twice costs a single HTTP request (``scrape_topic``
    asks for offset 0 twice: once to read the pagination, once to scrape).

    Parameters
    ----------
    offset : int
        Index of the first post of the wanted page. Pages are addressed in
        steps of 15 posts, so 0-based page ``k`` starts at offset ``15 * k``.
    topic_id : int or str
        Topic identifier substituted into ``url_form``.

    Returns
    -------
    The document root produced by ``lxml.html.fromstring`` from the response
    body. A fresh tree is parsed on every call, so callers may mutate it.
    """
    page_index = offset // 15

    # page_cache maps topic_id -> {page_index: response}. A dict (rather than
    # a list that is appended to) keeps every page under its own index even
    # when pages are requested out of order or more than once.
    topic_pages = page_cache.setdefault(topic_id, {})

    if page_index not in topic_pages:
        url = url_form.format(offset, topic_id)
        topic_pages[page_index] = requests.get(url)

    page = topic_pages[page_index]
    return html.fromstring(page.content)

In [3]:
def scrape_topic(topic_id):
    """Scrape every post of a forum topic.

    The first page is loaded to read the pagination links and learn how many
    pages the topic has; every page is then parsed post by post.

    Parameters
    ----------
    topic_id : int or str
        Topic identifier, forwarded to ``make_tree``.

    Returns
    -------
    list of dict
        One dict per post, in page order, with the keys:

        * ``'id'``      - ``name`` attribute of the second link in the post's
          ``.postinfo`` element
        * ``'user'``    - text of the matching ``.genmed`` element
        * ``'date'``    - stripped text of the matching ``.date`` element
        * ``'content'`` - UTF-8 bytes of the serialised ``.postbody`` element
        * ``'edits'``   - UTF-8 bytes of the first ``.gensmall`` element found
          under the post's ``#post_text_<id>`` node, or ``''`` if there is none

    Raises
    ------
    IndexError
        If a page does not have the expected structure (e.g. a post without a
        ``#post_text_<id>`` node or without a ``.postbody`` element).
    """
    first_page = make_tree(0, topic_id)

    page_numbers = [el.text_content() for el in first_page.cssselect('.pagination a')]
    # The last two pagination buttons are a "next page" arrow and "Go" button,
    # so the actual last page number is third from the end.
    # If the topic only has one page, there's no pagination buttons at all.
    page_count = int(page_numbers[-3]) if page_numbers else 1

    def scrape(offset, topic_id):
        """Yield one dict per post on the page that starts at ``offset``."""
        tree = make_tree(offset, topic_id)  # TODO: rate limiter
        print(offset//15 + 1, '/', page_count)

        usernames = [el.text_content() for el in tree.cssselect('.genmed')]
        # if not usernames: return None  # if no usernames, this is a void topic. bail out!

        dates = [el.text_content() for el in tree.cssselect('.date')]
        post_ids = [el.cssselect('a')[1].get('name') for el in tree.cssselect('.postinfo')]

        for i, post_id in enumerate(post_ids):
            root = tree.cssselect('#post_text_{}'.format(post_id))[0]
            body = root.cssselect('.postbody')[0]

            # Trim tabs and line breaks (but not spaces) from the ends of
            # every text node so the serialised markup is more compact.
            for el in body.iter('*'):
                if el.text:
                    el.text = el.text.strip('\t\n\r\f')
                if el.tail:
                    el.tail = el.tail.strip('\t\n\r\f')
                # An empty <textarea> has text None (and a stripped one may be
                # ''), so only rewrite line endings when there is text.
                if el.tag == 'textarea' and el.text:
                    el.text = el.text.replace('\r\n', '\r')  # ASSUME codeblock contains no tags

            attach = root.cssselect('.gensmall')

            yield {
                'id': post_id,
                'user': usernames[i],
                'date': dates[i].strip(),
                'content': etree.tostring(body, encoding='utf-8'),
                'edits': etree.tostring(attach[0], encoding='utf-8') if attach else '',
            }

    # Walk the pages in order and spool out all of their posts.
    return [post
            for page_index in range(page_count)
            for post in scrape(page_index * 15, topic_id)]

In [None]:
# Column order of the TSV files; the names are the keys of the post dicts
# returned by scrape_topic.
tsv_cols = [
    'id',
    'user',
    'date',
    'content',
    'edits',
]
tsv_headers = '\t'.join(tsv_cols)

# FEED scraper in background
# TODO: collect changes in title of thread from its replies

# Topic ids to scrape. Fill this in before running the cells below: the
# scraping loop iterates over it and the roundtrip test reads topics[0].
topics = list()
print(len(topics))

In [None]:
# Scrape every topic in `topics` and write it to ./<topic id>.tsv as UTF-8,
# one post per line with the columns listed in tsv_cols.
for i, thread in enumerate(topics):
    print('topic {} ({}/{})'.format(thread, i+1, len(topics)))

    # Scrape BEFORE opening the output file: mode 'wb' truncates, so opening
    # first would wipe a previously scraped file if the scrape then failed.
    # scrape_topic returns a complete list, so nothing is lost by waiting.
    topic = scrape_topic(thread)

    with open('./{}.tsv'.format(thread), 'wb') as tsv:
        tsv.write((tsv_headers + '\n').encode('utf-8'))

        for post in topic:
            # Some fields are already UTF-8 bytes (serialised markup), the
            # others are str and get encoded here.
            fields = [
                post[col] if isinstance(post[col], bytes) else post[col].encode('utf-8')
                for col in tsv_cols
            ]
            tsv.write(b'\t'.join(fields) + b'\n')

    # time.sleep(1)  # to avoid overwhelming the poor server :(

#### Test serialization roundtrip

In [65]:
import csv

# 20495, 22  # taxonomy shop
# 45652, 1   # short-lived evolution game reboot
# 300, 800   # autohotkey script to fix graphics?
# 34634, 7148 # linux joke

# Read the first topic's TSV back into a list of dicts (one per post).
# The files are written as UTF-8, so decode them as UTF-8 instead of relying
# on the platform default; newline='' is what the csv module asks for so that
# it can do its own line-ending handling.
with open('./{}.tsv'.format(topics[0]), 'r', encoding='utf-8', newline='') as f:
    ret = list( csv.DictReader(f, delimiter='\t') )

In [None]:
# Print index, author and date of every row read back from the TSV.
for row_number, row in enumerate(ret):
    print(row_number, row['user'], row['date'])

In [None]:
from IPython.core.display import HTML
# strip markup (alternative view: join each element's text and tail)
#print( '\n'.join([(el.text or '') + ' ' + (el.tail or '')
#                  for el in etree.fromstring(ret[800]['content']).iter('*')]) )

# TODO: trim tabs but not spaces.
# Render the stored markup of the first row as rich output.
HTML(data=ret[0]['content'])

#### Repair utilities

In [None]:
import os
# Scan every .tsv file below the current directory and report:
#   "<path> codeblock" - some row's content contains the text 'textarea'
#   "<path> trouble"   - the file could not be read or checked (for example
#                        a short row whose 'content' column is missing)
for root, _dirs, files in os.walk('.'):
    for file in files:
        if not file.endswith(".tsv"):
            continue

        # os.walk descends into sub-directories, so the file must be opened
        # relative to `root`; normpath drops the leading './' so files in the
        # current directory are still reported by their bare name.
        path = os.path.normpath(os.path.join(root, file))

        try:
            with open(path, 'r', encoding='utf-8', newline='') as f:
                # `rows`, not `ret`: keep the rows loaded by the roundtrip
                # cell intact so re-running that part of the notebook is safe.
                rows = list( csv.DictReader(f, delimiter='\t') )

            # any() stops at the first matching row, like the original break.
            if any('textarea' in row['content'] for row in rows):
                print(path, 'codeblock')
        except Exception:
            # Best-effort scan: report the file and carry on. `Exception`
            # (not a bare except) lets Ctrl-C / KeyboardInterrupt through.
            print(path, 'trouble')