In [1]:
from requests import get
import re
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
def simple_get(url, timeout=30):
    """
    Fetch `url` with an HTTP GET request and return the response body.

    Parameters:
        url: address of the page to download.
        timeout: seconds to wait on the server before giving up; a request
            without a timeout can block forever on a stalled connection.

    Returns:
        The raw response body (bytes) when `is_good_response` accepts the
        response, otherwise None. None is also returned, after the error has
        been passed to `log_error`, when the request itself fails.
    """
    try:
        # closing() guarantees the connection is released even when the
        # response is rejected and its body is never read.
        with closing(get(url, stream=True, timeout=timeout)) as resp:
            if is_good_response(resp):
                return resp.content
            return None

    except RequestException as e:
        # Report the failure instead of discarding it, so a crawl that
        # yields no data can be diagnosed.
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None


def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.

    A response qualifies when its status code is 200 and its Content-Type
    header contains "html" (case-insensitive). A response without a
    Content-Type header is treated as not HTML instead of raising KeyError.
    """
    # `or ''` also covers a header that is present but empty/None.
    content_type = (resp.headers.get('Content-Type') or '').lower()
    return resp.status_code == 200 and 'html' in content_type


def log_error(e):
    """
    Report an error by printing it to standard output.

    This is the single place where errors are reported, so swapping the
    print for a different sink only needs a change here.
    """
    message = str(e)
    print(message)



In [2]:
def _strip_move_number(tag):
    """Return the text of a move tag after its "<number>." prefix."""
    return tag.get_text().split(".", 1)[1]


def parse_html(url):
    """
    Scrape one game page and return its data as a CSV-ready row.

    Parameters:
        url: address of the game page to download and parse.

    Returns:
        [gid, black, white, black_rating, white_rating, move_string, winner]
        when the page title identifies a "Hex-Size 13" game. An empty list
        is returned when the page could not be fetched, is a different game
        type, or does not have the layout this parser expects.
    """
    try:
        raw_html = simple_get(url)
        if raw_html is None:
            # Nothing was downloaded; there is nothing to parse.
            return []
        html = BeautifulSoup(raw_html, 'html.parser')

        header = html.find('h3', class_='page-title').get_text()
        game_name = header.strip().partition("\r")[0]

        if game_name != "Hex-Size 13":
            return []
        # The game id follows '#' in the title and runs up to the next
        # carriage return. partition() keeps the whole id when no '\r'
        # follows, where slicing with find() == -1 would drop a character.
        gid = header.partition('#')[2].partition('\r')[0]

        game = html.find_all('div', class_='portlet-body')[3]  # for move list

        players = html.find_all('div', class_="col-xs-6 col-md-6")
        black = players[0].find('a').get_text()
        white = players[1].find('a').get_text()

        # NOTE(review): <br> is a void element, so get_text() on it is
        # likely empty -- confirm the rating columns are actually populated.
        black_rating = players[0].find_all('br')[1].get_text().strip()
        white_rating = players[1].find_all('br')[1].get_text().strip()

        moves = game.find_all('b')
        move_list = []

        if moves[1].get_text() == "2.swap":
            # A swap is recorded by marking the first move with '*'; the
            # swap entry itself is not added to the move string.
            move_list.append(_strip_move_number(moves[0]) + "*")
        else:
            move_list.append(_strip_move_number(moves[0]))
            move_list.append(_strip_move_number(moves[1]))

        # `turn` alternates once per remaining entry, so it ends as the side
        # that would move after the last listed entry; that side is reported
        # as the winner (presumably because the last entry is the loser's
        # resignation -- verify for games that end without one).
        turn = "black"
        for move in moves[2:]:
            m = _strip_move_number(move)

            if m != "resign":
                move_list.append(m)

            turn = "white" if turn == "black" else "black"

        winner = turn
        move_string = ''.join(move_list)
        return [gid, black, white, black_rating, white_rating, move_string, winner]
    except Exception:
        # Best effort: any page that does not match the expected layout is
        # skipped. Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit through so a long crawl can still be stopped.
        return []

In [None]:
import csv
import random

# Game ids to probe; parse_html returns [] for pages it cannot use, and
# those are skipped below.
nums = list(range(1800000, 2500000))
urls = ["http://littlegolem.net/jsp/game/game.jsp?gid=" + str(n) for n in nums]

# NOTE(review): gid 2002838 already lies inside the range above, so this
# page is requested twice and can produce a duplicate row -- confirm whether
# the example entry is still wanted.
urls.append("http://littlegolem.net/jsp/game/game.jsp?gid=2002838") #example
header = ["gid","black","white","black_rating","white_rating","move_list","winner"]

# newline='' is what the csv module expects (otherwise blank rows appear on
# Windows); utf-8 keeps non-ASCII player names from failing on writerow.
# The with-block closes the file, so no explicit close() is needed.
with open('games.csv', 'w', newline='', encoding='utf-8') as csvFile:
    writer = csv.writer(csvFile)
    writer.writerow(header)

    for url in urls:
        resp = parse_html(url)
        if resp:
            writer.writerow(resp)