In [1]:
from lxml import html, etree

import requests
import time

# URL template for one page of a topic: {0} receives the post offset and
# {1} the topic id (see the .format() call in make_tree).
url_form = 'http://forum.spore.com/jforum/posts/list/{0}/{1}.page'

# In-memory store of downloaded pages, populated by make_tree.
page_cache = dict()

In [2]:
def make_tree(offset, topic_id):
    """Download (or reuse) one page of a topic and parse it with lxml.

    Parameters
    ----------
    offset : int
        Offset of the first post on the page. Pages are addressed in steps of
        15 posts, so the 0-based page index is ``offset // 15``.
    topic_id : str or int
        Topic identifier substituted into ``url_form``.

    Returns
    -------
    The tree produced by ``lxml.html.fromstring`` for the page body. A new
    tree is parsed on every call, so callers may mutate it freely.

    Notes
    -----
    Responses are memoised in the module-level ``page_cache`` under the key
    ``(topic_id, page_index)``. Keying by page index (rather than appending to
    a per-topic list) makes the lookup correct for any fetch order, never
    stores the same page twice, and does not require page 0 to be fetched
    first.
    """
    cache_key = (topic_id, offset // 15)

    if cache_key in page_cache:
        page = page_cache[cache_key]
    else:
        page = requests.get(url_form.format(offset, topic_id))
        page_cache[cache_key] = page

    return html.fromstring(page.content)

In [53]:
def scrape_topic(topic_id):
    """Scrape every post of a topic, page by page.

    Parameters
    ----------
    topic_id : str or int
        Topic identifier, passed through to ``make_tree``.

    Returns
    -------
    list of dict
        One dict per post, in page order, with the keys:

        - ``'id'``: value of the ``name`` attribute of the second ``<a>``
          inside the post's ``.postinfo`` element,
        - ``'user'``: text of the i-th ``.genmed`` element on the page,
        - ``'date'``: stripped text of the i-th ``.date`` element,
        - ``'content'``: the post's ``.postbody`` element serialised to
          UTF-8 bytes,
        - ``'edits'``: the first ``.gensmall`` element inside the post
          serialised to UTF-8 bytes, or ``''`` when there is none.

    Progress (``page / page_count``) is printed as each page is processed.
    """
    first_tree = make_tree(0, topic_id)

    page_numbers = [el.text_content() for el in first_tree.cssselect('.pagination a')]
    # The last two pagination buttons are a "next page" arrow and "Go" button,
    # so the actual last page number is third from the end.
    # If the topic only has one page, there's no pagination buttons at all.
    page_count = int(page_numbers[-3]) if page_numbers else 1

    def scrape(offset, topic_id):
        """Yield one dict per post found on the page starting at ``offset``."""
        # The first page was already downloaded to read the pagination, so
        # reuse that tree instead of requesting the same page a second time.
        if offset == 0:
            tree = first_tree
        else:
            tree = make_tree(offset, topic_id) # TODO: rate limiter
        print(offset//15 + 1, '/', page_count)

        # NOTE(review): '.genmed' is assumed to match exactly one element per
        # post (the author name), in the same order as '.postinfo' -- confirm.
        usernames = [el.text_content() for el in tree.cssselect('.genmed')]
        dates = [el.text_content() for el in tree.cssselect('.date')]
        post_ids = [el.cssselect('a')[1].get('name') for el in tree.cssselect('.postinfo')]

        for i, post_id in enumerate(post_ids):
            root = tree.cssselect('#post_text_{}'.format(post_id))[0]

            content = root.cssselect('.postbody')
            for el in content[0].iter('*'):
                # Trim layout whitespace (but not spaces) around text nodes.
                if el.text: el.text = el.text.strip('\t\n\r\f')
                if el.tail: el.tail = el.tail.strip('\t\n\r\f')
                # An empty <textarea> has text None (or '' after stripping);
                # only rewrite line endings when there is text to rewrite.
                if el.tag == 'textarea' and el.text:
                    el.text = el.text.replace('\r\n', '\r') # ASSUME codeblock contains no tags

            attach = root.cssselect('.gensmall')

            yield {
                'id': post_id,
                'user': usernames[i],
                'date': dates[i].strip(),
                'content': etree.tostring(content[0], encoding='utf-8'),
                'edits': etree.tostring(attach[0], encoding='utf-8') if attach else '',
            }

    # One lazy generator per page; pages are only fetched while flattening.
    pages = [scrape(j*15, topic_id) for j in range(0, page_count)]

    return [post for page in pages for post in page] # spool out all posts

In [77]:
# Column order shared by the header row and every data row of the TSV files.
tsv_cols = ['id', 'user', 'date', 'content', 'edits']
tsv_headers = '\t'.join(tsv_cols)

# FEED scraper in background
# TODO: collect changes in title of thread from its replies

# Topic ids to scrape, kept as strings because they are only ever
# interpolated into URLs and file names.
topics = [
    '64027', '30084', '17489', '55189', '43567', '55225',
    '58125', '58991', '74506', '43568', '20495', '17610',
    '74664', '72414', '58559', '14795', '72634', '41538',
    '45708', '57096', '56446', '49553', '45652', '52134',
    '64863', '78116', '55544', '52386', '47166', '45328',
    '68928', '46926', '52686', '53024', '41427', '45383',
    '47032', '48660', '49839', '52783', '53762', '65289',
]
print( len(topics) )

42


In [None]:
# Scrape every topic and write it to ./<topic id>.tsv: one header row, then
# one tab-separated row per post with the columns in tsv_cols order.
for i,thread in enumerate(topics):
    print('topic {} ({}/{})'.format(thread, i+1, len(topics)))

    # Scrape before opening the output file: opening with 'wb' truncates it,
    # so a scrape that fails part-way would otherwise destroy a previous
    # export and leave a header-only file behind.
    posts = scrape_topic(thread)

    with open('./{}.tsv'.format(thread), 'wb') as tsv:
        tsv.write((tsv_headers + '\n').encode('utf-8'))

        for post in posts:
            # Fields may be bytes (serialised HTML) or str; normalise to bytes.
            fields = [
                post[col] if isinstance(post[col], bytes) else post[col].encode('utf-8')
                for col in tsv_cols
            ]
            # join() places separators by position instead of comparing
            # column names by identity with `is`.
            tsv.write(b'\t'.join(fields) + b'\n')

    # time.sleep(1)  # to avoid overwhelming the poor server :(

topic 64027 (1/42)
1 / 193


#### Test serialization roundtrip

In [65]:
import csv

# 20495, 22  # taxonomy shop
# 45652, 1   # short-lived evolution game reboot
# 300, 800   # autohotkey script to fix graphics?
# 34634, 7148 # linux joke

# Read the first topic's export back in. The files are written as UTF-8
# bytes, so decode them as UTF-8 explicitly rather than with the platform
# default, and pass newline='' as the csv module requires so that the reader
# sees the line endings untranslated.
with open('./{}.tsv'.format(topics[0]), 'r', encoding='utf-8', newline='') as f:
    ret = list( csv.DictReader(f, delimiter='\t') )

In [55]:
# List every parsed row as "<row number> <user> <date>" to eyeball the roundtrip.
for row_number, row in enumerate(ret):
    print(row_number, row['user'], row['date'])

0 Grimbot 09/10/2008 22:04:06
1 Zechs13 09/10/2008 22:20:56
2 ThomastheCat 09/10/2008 22:55:00
3 MadJack 09/10/2008 23:03:27
4 Sykotic 09/11/2008 00:42:44
5 MadJack 09/11/2008 02:48:01
6 Kowagaru 09/11/2008 02:54:39
7 Bluhman 09/11/2008 02:55:06
8 MadJack 09/11/2008 03:09:05
9 Yesrah 09/11/2008 03:35:08
10 Erazil 09/11/2008 06:11:51
11 SporeMasterKaliena 09/11/2008 08:43:11
12 Michlo 09/11/2008 08:52:47
13 General_Pothead 09/11/2008 10:06:42
14 Drish 09/11/2008 13:17:58
15 Grimbot 09/11/2008 15:22:57
16 Mustikos 09/11/2008 16:07:59
17 Drish 09/11/2008 16:53:56
18 Grimbot 09/11/2008 17:00:42
19 Mustikos 09/11/2008 17:30:02
20 morneau502 09/11/2008 20:35:29
21 maxisandrew 09/11/2008 22:04:51
22 MadJack 09/11/2008 22:11:31
23 General_Pothead 09/11/2008 22:52:34
24 bristoe 09/11/2008 23:06:06
25 Riamus 09/12/2008 00:33:44
26 Bluhman 09/12/2008 00:37:41
27 Michlo 09/12/2008 01:03:09
28 Unkpoot 09/12/2008 01:45:43
29 Grimbot 09/12/2008 02:19:34
30 Unkpoot 09/12/2008 02:25:52
31 lrdfrfx1 09/1

In [69]:
from IPython.core.display import HTML

# Alternative view, kept for reference: strip the markup and print only the
# text and tail of every element of one post.
#print( '\n'.join([(el.text or '') + ' ' + (el.tail or '')
#                  for el in etree.fromstring(ret[800]['content']).iter('*')]) )

# Render the stored HTML of the second row (index 1) as the cell output.
# TODO: trim tabs but not spaces.
HTML(ret[1]['content'])

#### Repair utilities

In [63]:
import os
# Walk the working tree and report, per .tsv file, whether any post contains
# a <textarea> code block ('codeblock') or whether the file could not be
# read or parsed ('trouble').
for root,dirs,files in os.walk('.'):
    for file in files:
        if not file.endswith(".tsv"):
            continue

        # os.walk yields names relative to `root`; join them so that files
        # found in subdirectories are opened from where they actually live.
        path = os.path.join(root, file)
        try:
            with open(path, 'r', encoding='utf-8', newline='') as f:
                # Local name on purpose: do not overwrite the `ret` that the
                # roundtrip cells rely on.
                rows = list( csv.DictReader(f, delimiter='\t') )
            for line in rows:
                # A short row leaves 'content' as None, which raises
                # TypeError here and is reported as trouble below.
                if 'textarea' in line['content']:
                    print(file, 'codeblock')
                    break
        except (OSError, UnicodeDecodeError, csv.Error, KeyError, TypeError) as exc:
            # Report and carry on with the next file, but let
            # KeyboardInterrupt and unrelated errors propagate.
            print(file, 'trouble', repr(exc))

55268.tsv codeblock
300.tsv codeblock
46878.tsv codeblock
55732.tsv codeblock
42791.tsv codeblock
35157.tsv codeblock
4822.tsv codeblock
34634.tsv codeblock
84432.tsv codeblock
38543.tsv codeblock
49452.tsv codeblock
56345.tsv codeblock
70930.tsv codeblock
62569.tsv codeblock
27177.tsv codeblock
47070.tsv codeblock
65694.tsv codeblock
17774.tsv codeblock
62736.tsv codeblock
40741.tsv codeblock
53381.tsv codeblock
81757.tsv codeblock
