#Using Trickery to Get Review Data from Steam

In [1]:
# Let's say that the URL for a game is the following
url = 'http://steamcommunity.com/app/730/homecontent/?userreviewsoffset=0&p=1&itemspage=1&screenshotspage=1&videospage=1&artpage=1&allguidepage=1&webguidepage=1&integratedguidepage=1&discussionspage=1&appid=730&appHubSubSection=10&appHubSubSection=10&l=english&browsefilter=toprated&filterLanguage=default&searchText=&forceanon=1'
# The game is actually Counter-Strike: Global Offensive (CS: GO)
# It's difficult to see, but there are a few parts of the URL that are of
# interest to us
# 1. '/app/730' tells us that the appid is 730, which corresponds to
#    Counter-Strike: Global Offensive (CS: GO)
#    - this info is also later in the URL: 'appid=730'
# 2. 'userreviewsoffset=0' - this is the start number for the review results
#    - if 0, that means that the reviews we see will be 0 through 9
#    - if 10, 10 through 19, etc.
# 3. there are numerous parts that end in 'page=1' - the number here refers
#    to an iterator that stores how many 10-review pages you've seen (i.e.,
#    usually by hovering at the bottom of the reviews results page to make
#    more visible)
#    - if userreviewsoffset is 0, then page should be 1,
#    - if userreviewsoffset is 10, then page should be 2,
#    - if userreviewsoffset is 20, then page should be 3,
#    - etc.
# So, this will allow us to make a for loop to go through all possible pages
# of reviews!

In [2]:
# Let's redefine the URL with format specifiers so that we can use the URL
# in a loop
# We'll set the game ID (appid), range beginning value (range_begin), and
# increment value (i)
# {0} is the game ID, {1} is the review offset and {2} is the page number
appid = '730'
range_begin = '0'
i = '1'
url = ('http://steamcommunity.com/app/{0}/homecontent/'
       '?userreviewsoffset={1}&p=1&itemspage={2}&screenshotspage={2}'
       '&videospage={2}&artpage={2}&allguidepage={2}&webguidepage={2}'
       '&integratedguidepage={2}&discussionspage={2}&appid={0}'
       '&appHubSubSection=10&appHubSubSection=10&l=english'
       '&browsefilter=toprated&filterLanguage=default&searchText='
       '&forceanon=1').format(appid, range_begin, i)
# The call form of print behaves the same for a single argument whether
# print is a statement or a function
print(url)

http://steamcommunity.com/app/730/homecontent/?userreviewsoffset=0&p=1&itemspage=1&screenshotspage=1&videospage=1&artpage=1&allguidepage=1&webguidepage=1&integratedguidepage=1&discussionspage=1&appid=730&appHubSubSection=10&appHubSubSection=10&l=english&browsefilter=toprated&filterLanguage=default&searchText=&forceanon=1


In [3]:
# The link above will not work if you try to go to it directly
# However, if you go to the link and then try to view the source HTML,
# you will be able to see the HTML

In [4]:
# We can read, parse, and then extract the content at the URL using
# lxml and requests modules
from lxml import html
import requests

In [5]:
# Let's use requests.get() to get the page
page = requests.get(url)

In [6]:
# Let's take a look at the attributes of the page object
[a for a in dir(page) if not a.startswith('_') and not a.endswith('_')]
# Don't worry about the code here, it's just a trick to see public
# methods for a requests object

['apparent_encoding',
 'close',
 'connection',
 'content',
 'cookies',
 'elapsed',
 'encoding',
 'headers',
 'history',
 'iter_content',
 'iter_lines',
 'json',
 'links',
 'ok',
 'raise_for_status',
 'raw',
 'reason',
 'request',
 'status_code',
 'text',
 'url']

In [7]:
# We see that there are attributes for the text, json, lines, etc.,
# so let's take a look at some of this stuff
page.text[:1000] # Here's the raw HTML

u'\t\t<div id="page1">\r\n\t\t<div class="apphub_Card interactable" style="display: none" onclick="ShowModalContent( \'http://steamcommunity.com/id/kadiv/recommended/730/?insideModal=1\', \'http://steamcommunity.com/id/kadiv/recommended/730/\', \'http://steamcommunity.com/id/kadiv/recommended/730/\',false );">\r\n\t<div class="apphub_CardContentMain">\r\n\t\t<div class="apphub_UserReviewCardContent">\r\n\t\t\t<div class="found_helpful">\r\n\t\t\t\t15,443 of 17,222 people (90%) found this review helpful<br>11 people found this review funny\t\t\t</div>\r\n\r\n\t\t\t<div class="vote_header">\r\n\t\t\t\t\t\t\t\t<div class="reviewInfo">\r\n\t\t\t\t\t<div class="thumb">\r\n\t\t\t\t\t\t<img src="http://steamcommunity-a.akamaihd.net/public/shared/images/userreviews/icon_thumbsUp.png?v=1" width="44" height="44">\r\n\t\t\t\t\t</div>\r\n\r\n\t\t\t\t\t\t\t\t\t\t<div class="title">Recommended</div>\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t<div class="hours">1,066.0 hrs on record</div>\r\n\t\t\t\t\t\t\t\t\t

In [8]:
# After looking at some of the other attributes, I've determined
# that the text attribute is probably the only thing that
# concerns us, so let's use it
text = page.text

In [9]:
# The text, as you can see from the view above, has lots of \r,
# \n, \t characters in it, which might not be good for HTML
# parsing in our case (I won't get into why, partly because I'm
# not completely sure I get why), so let's get rid of all such
# characters, replacing them with spaces instead
# To do this we will use the re module, which allows us to use
# regular expressions
import re
# While we're at it, it's best to get rid of all "<br>" tags since
# they could also present problems during parsing
# We can use the re.sub() method to find one regular expression and
# replace it with another in a given text
# Working from the inside out: swap every "<br>" string for a space,
# squeeze each run of newlines, tabs, carriage returns and spaces down
# to one single space, and finally trim the spaces left at either end
text = re.sub(r'[\n\t\r ]+', r' ', re.sub(r'\<br\>', r' ', text)).strip()

In [10]:
# Let's take a look at the cleaned-up version of the source HTML
text[:1000]

u'<div id="page1"> <div class="apphub_Card interactable" style="display: none" onclick="ShowModalContent( \'http://steamcommunity.com/id/kadiv/recommended/730/?insideModal=1\', \'http://steamcommunity.com/id/kadiv/recommended/730/\', \'http://steamcommunity.com/id/kadiv/recommended/730/\',false );"> <div class="apphub_CardContentMain"> <div class="apphub_UserReviewCardContent"> <div class="found_helpful"> 15,443 of 17,222 people (90%) found this review helpful 11 people found this review funny </div> <div class="vote_header"> <div class="reviewInfo"> <div class="thumb"> <img src="http://steamcommunity-a.akamaihd.net/public/shared/images/userreviews/icon_thumbsUp.png?v=1" width="44" height="44"> </div> <div class="title">Recommended</div> <div class="hours">1,066.0 hrs on record</div> </div> <div style="clear: left"></div> </div> <div class="apphub_CardTextContent"> <div class="date_posted">Posted: February 27, 2014</div> you either die a noob or live long enough to get called a hacker 

In [11]:
# Ok, that looks much nicer!

In [12]:
# Now, let's use lxml to get the tree by parsing the source html
tree = html.fromstring(text)

In [13]:
# Using the xpath method, we can find the content that resides
# between two tags
# To find the reviews, what we're actually looking for is between
# the <div> tags with attribute class="apphub_CardTextContent"
# And, to find the game-hours played values, we're looking for <div>
# tags with attribute class="hours"
# We can do that in the following way
reviews = tree.xpath('//div[@class="apphub_CardTextContent"]/text()')
hours_played_list = tree.xpath('//div[@class="hours"]/text()')

In [14]:
# These resulting lists should line up nicely, but they actually
# have different numbers of total items, unfortunately
# Observe:
# Call form of print: a single parenthesized argument prints identically
# whether print is a statement or a function
print("length of review list = {}\nlength of hours-played list = {}".format(
    len(reviews), len(hours_played_list)))

length of review list = 21
length of hours-played list = 10


In [15]:
# So, let's see what's happening here
# There should only be 10 reviews, so the hours-played list is right
# Let's take a look at the values
hours_played_list

['1,066.0 hrs on record',
 '675.3 hrs on record',
 '76.6 hrs on record',
 '923.4 hrs on record',
 '63.7 hrs on record',
 '936.2 hrs on record',
 '2,837.1 hrs on record',
 '213.0 hrs on record',
 '722.4 hrs on record',
 '170.7 hrs on record']

In [16]:
# That looks good, so let's now take a look at the reviews list
reviews

[' ',
 ' you either die a noob or live long enough to get called a hacker ',
 ' ',
 ' Kill someone with a P90 - "You\'re a fuc**** noob!! Noob weapon!!" Kill someone with a P90 through a smoke - "You\'re a fuc**** hacker!!" Kill someone with a AWP - "You\'re a fuc**** noob!! Noob weapon!!" Kill someone with a AWP through a door - "You\'re a fuc**** hacker!!" In a 1 vs 5 you die - "You\'re a fuc**** noob!!" In a 1 vs 5 you win - "You\'re a fuc**** hacker!!" Kill someone with a headshot - "Hacker!!" Get headshoted by someone - "Owned!!" and get teabagged Kill someone with a grenade - "Luck!!" Get killed by someone with a grenade - "AHAHAHAHA" Get teamkilled by someone - "Get out of the way you fuc**** idiot!!" Accidentally teamkill someone - "You\'re a fuc**** idiot!!" Blocked by someone - Dies Accidentally blocks someone - "Get out the way you fuc**** idiot!!" Decide to save - "You\'re a fuc**** coward!!" Decide not to save - "Save you fuc**** idiot!!" Kill someone while defending the b

In [17]:
# As you can see, there are, for some reason a bunch of elements
# in the list that only consist of a single space
# Let's get rid of those elements and see if we end up with ten
# reviews
# The line of code I use below utilizes something known in Python as
# a "list comprehension":
# Ex: [x - 1 for x in [1, 3, 4]] => [0, 2, 3]
# Ex2: [len(s) for s in ["one", "two", "three"]] => [3, 3, 5]
# Basically the first part (x - 1 or len(s)) is applied to every x/s
# (whatever you decide to call it) in the list, one at a time, and a
# list is output as a result
reviews = [r for r in reviews if r.strip()]
len(reviews)

10

In [18]:
# Ok, great, so, there are now 10 reviews in the list after getting
# rid of the weird no-text elements!

In [19]:
# Now what we want to do is create a list of 2-element tuples
# containing 1. the review text itself and 2. a float representing
# the game-hours played value corresponding to a given review
# Let's use another list comprehension for this (be forewarned, though,
# that this list comprehension will be much more complicated - at first
# glance, at least!)
review_tuples = [
    (review.strip(),
     # Keep only the part before the first space, drop the thousands
     # separators and typecast what is left as a float
     float(re.sub(r',', r'', hours_text.split(' ', 1)[0])))
    for review, hours_text in zip(reviews, hours_played_list)
]
# What I'm doing above is putting both values into a tuple and I'm
# iterating through both lists by "zipping" them together.
# The zip function is a really useful function that takes a number of
# lists and puts all elements at the same index in a tuple and does it
# for every index that occurs across all the input lists
# Ex. zip(["a", "b", "c"], [3, 4, 5]) => [("a", 3), ("b", 4), ("c", 5)]
# So, I'm iterating over a list that looks a little like what's in the
# result above and then I'm basically keeping the values in tuples the
# way that they currently are, but I'm applying some functions to the
# game-hours played values, namely 1. getting the number part only and
# 2. removing any commas in the string representing the number, and 3.
# typecasting the string as a float, so '1,066.0 hrs on record' =>
# 1066.0 (and I'm also applying the strip() method to the review part
# of the tuple)

In [20]:
# Finally, let's take a look at the resulting list
review_tuples

[('you either die a noob or live long enough to get called a hacker', 1066.0),
 ('Kill someone with a P90 - "You\'re a fuc**** noob!! Noob weapon!!" Kill someone with a P90 through a smoke - "You\'re a fuc**** hacker!!" Kill someone with a AWP - "You\'re a fuc**** noob!! Noob weapon!!" Kill someone with a AWP through a door - "You\'re a fuc**** hacker!!" In a 1 vs 5 you die - "You\'re a fuc**** noob!!" In a 1 vs 5 you win - "You\'re a fuc**** hacker!!" Kill someone with a headshot - "Hacker!!" Get headshoted by someone - "Owned!!" and get teabagged Kill someone with a grenade - "Luck!!" Get killed by someone with a grenade - "AHAHAHAHA" Get teamkilled by someone - "Get out of the way you fuc**** idiot!!" Accidentally teamkill someone - "You\'re a fuc**** idiot!!" Blocked by someone - Dies Accidentally blocks someone - "Get out the way you fuc**** idiot!!" Decide to save - "You\'re a fuc**** coward!!" Decide not to save - "Save you fuc**** idiot!!" Kill someone while defending the bomb 

In [21]:
# So, that's how we could get data for reviews occurring in one
# single page for one single game
# Can we design a function, which, given a game ID, can get all
# possible reviews?
# Let's try

In [33]:
# Might need this unicode module and maybe the newer version of the
# print method
from bs4 import UnicodeDammit
import time
import sys
codecs = ["windows-1252", "utf8", "ascii", "cp500", "cp850", "cp852",
          "cp858", "cp1140", "cp1250", "iso-8859-1", "iso8859_2",
          "iso8859_15", "iso8859_16", "mac_roman", "mac_latin2", "utf32",
          "utf16"]

In [34]:
def get_review_data_for_game(appid, time_out=0.5, limit=0, wait=60):
    '''
    Get lists of tuples representing review/game-hours played pairs for a
    given game.

    Review pages are requested one at a time, starting at review offset 0
    and page 1 and moving forward by 10 reviews/1 page on each iteration.
    Each page's results are yielded as soon as they have been parsed. The
    generator stops when a request times out, when the response is falsy
    or has no text, or when `limit` reviews have been yielded.

    :param appid: ID corresponding to a given game
    :type appid: str
    :param time_out: amount of time (in seconds) allowed to go by without
                     hearing a response while using requests.get() method
    :type time_out: float
    :param limit: the maximum number of reviews to collect; the last
                  list yielded is truncated so that the total never
                  exceeds this value
    :type limit: int (default: 0, which signifies all)
    :param wait: number of seconds to sleep before each request
    :type wait: int or float (default: 60)
    :yields: lists of (review text, hours played as a float) tuples, one
             list per page of results
    '''
    # {0} is the review offset, {1} is the page number and {2} is the
    # game ID
    url_template = ('http://steamcommunity.com/app/{2}/homecontent/'
                    '?userreviewsoffset={0}&p=1&itemspage={1}'
                    '&screenshotspage={1}&videospage={1}&artpage={1}'
                    '&allguidepage={1}&webguidepage={1}'
                    '&integratedguidepage={1}&discussionspage={1}'
                    '&appid={2}&appHubSubSection=10&appHubSubSection=10'
                    '&l=english&browsefilter=toprated'
                    '&filterLanguage=default&searchText=&forceanon=1')
    # Get reviews from each page that has content, starting at range_begin
    # = 0 and i = 1, yielding the list of review tuples as they're found
    range_begin = 0
    i = 1
    collected = 0  # Number of reviews yielded so far
    while True:
        # Get unique URL for values of range_begin and i
        url = url_template.format(range_begin, i, appid)
        # Pause before every request (including the first one)
        time.sleep(wait)
        # Try to get the URL content
        try:
            page = requests.get(url, timeout=time_out)
        except requests.exceptions.Timeout:
            print("There was a Timeout error...")
            break
        # If the response is falsy (requests defines a response's truth
        # value as its "ok" flag, so this also covers HTTP error statuses)
        # or if it has no text at all, there is nothing left to read and
        # we should break out of the loop
        if not page or not page.text.strip():
            break
        # Preprocess the HTML source a little bit: replace the string
        # "<br>" with a space and then replace any sequence of carriage
        # returns or whitespace characters with a single space
        text = re.sub(r'\<br\>', r' ', page.text.strip())
        text = re.sub(r'[\n\t\r ]+', r' ', text)
        # Run the text through UnicodeDammit and then encode it as ASCII
        # NOTE: the 'ignore' handler silently drops every character that
        # cannot be represented in ASCII, so the text is not kept as UTF-8
        text = UnicodeDammit(text,
                             codecs).unicode_markup.encode('ascii',
                                                           'ignore')
        # Get the parse tree from source html
        tree = html.fromstring(text.strip())
        # Get lists of review texts and values for game-hours played
        range_reviews = \
            tree.xpath('//div[@class="apphub_CardTextContent"]/text()')
        hours = tree.xpath('//div[@class="hours"]/text()')
        # Truncate the list of reviews by getting rid of elements that are
        # either empty or have only whitespace in them
        range_reviews = [x.strip() for x in range_reviews if x.strip()]
        # Zip the values together, converting each hours string to a
        # float on the way, e.g., '1,066.0 hrs on record' => 1066.0
        # NOTE(review): the pairing is purely positional, so it assumes
        # that every review contributes exactly one non-empty text node
        # and exactly one "hours" entry - confirm against real pages
        batch = [(review, float(played.split(' ', 1)[0].replace(',', '')))
                 for review, played in zip(range_reviews, hours)]
        # If a limit was defined, cut this page down so that the total
        # number of reviews yielded never goes over the limit
        if limit:
            batch = batch[:max(limit - collected, 0)]
        yield batch
        collected += len(batch)
        # Stop as soon as the limit has been reached
        if limit and collected >= limit:
            break
        range_begin += 10
        i += 1

In [None]:
# Alright, let's see if that worked
# We'll create an empty set of reviews and as each new set of reviews is
# generated by the function, we'll add in all of the reviews in the set
# to the set of reviews (if any are duplicates, they will not be added)
reviews = set()
for review_set in get_review_data_for_game('730',
                                           time_out=2.0,
                                           limit=150):
    # set.update() adds every tuple in the page at once (duplicates are
    # ignored) without building a throwaway list along the way
    reviews.update(review_set)