#Collecting Review/User Data from Steam

In [2]:
# Let's say that the URL for a game is the following
url = 'http://steamcommunity.com/app/730/homecontent/?userreviewsoffset=0&p=1&itemspage=1&screenshotspage=1&videospage=1&artpage=1&allguidepage=1&webguidepage=1&integratedguidepage=1&discussionspage=1&appid=730&appHubSubSection=10&appHubSubSection=10&l=english&browsefilter=toprated&filterLanguage=default&searchText=&forceanon=1'
# The game is actually Counter-Strike: Global Offensive (CS: GO)
# It's difficult to see, but there are a few parts of the URL that are of
# interest to us
# 1. '/app/730' tells us that the appid is 730, which corresponds to
#    Counter-Strike: Global Offensive (CS: GO)
#    - this info is also later in the URL: 'appid=730'
# 2. 'userreviewsoffset=0' - this is the start number for the review results
#    - if 0, that means that the reviews we see will be 0 through 9
#    - if 1, 10 through 19, etc.
# 3. there are numerous parts that end in 'page=1' - the number here refers
#    to an iterator that stores how many 10-review pages you've seen (i.e.,
#    usually by hovering at the bottom of the reviews results page to make
#    more visible)
#    - userreviewsoffset = 0, then page should be 1
#    - if userreviewsoffset = 10, then page should be 2,
#    - if userreviewsoffset = 20, then page should be 3,
#    - etc.
# So, this will allow us to make a for loop to go through all possible pages
# of reviews!

In [3]:
# Let's redefine the URL with format specifiers so that we use the URL in
# a loop
# We'll set the game ID (appid), range beginning value (range_begin), and
# increment value (i)
appid = '730'
range_begin = '0'
i = '1'
url = 'http://steamcommunity.com/app/{0}/homecontent/?userreviewsoffset={1}&p=1&itemspage={2}&screenshotspage={2}&videospage={2}&artpage={2}&allguidepage={2}&webguidepage={2}&integratedguidepage={2}&discussionspage={2}&appid={0}&appHubSubSection=10&appHubSubSection=10&l=english&browsefilter=toprated&filterLanguage=default&searchText=&forceanon=1'.format(appid, range_begin, i)
print(url)

http://steamcommunity.com/app/730/homecontent/?userreviewsoffset=0&p=1&itemspage=1&screenshotspage=1&videospage=1&artpage=1&allguidepage=1&webguidepage=1&integratedguidepage=1&discussionspage=1&appid=730&appHubSubSection=10&appHubSubSection=10&l=english&browsefilter=toprated&filterLanguage=default&searchText=&forceanon=1


In [4]:
# The link above will not work if you try to go to it directly
# However, if you go to the link and then try to view the source HTML,
# you will be able to see the HTML

In [5]:
# We can read, parse, and then extract the content at the URL using
# requests and bs4 (and lxml) modules
from bs4 import BeautifulSoup
from lxml import html
import requests

In [6]:
# Let's use requests.get() to get the page
page = requests.get(url)

In [7]:
# Let's take a look at the attributes of the page object
[a for a in dir(page) if not a.startswith('_') and not a.endswith('_')]
# Don't worry about the code here, it's just a trick to see public
# methods for a requests object

['apparent_encoding',
 'close',
 'connection',
 'content',
 'cookies',
 'elapsed',
 'encoding',
 'headers',
 'history',
 'is_permanent_redirect',
 'is_redirect',
 'iter_content',
 'iter_lines',
 'json',
 'links',
 'ok',
 'raise_for_status',
 'raw',
 'reason',
 'request',
 'status_code',
 'text',
 'url']

In [8]:
# We see that there are attributes for the text, json, lines, etc.,
# so let's take a look at some of this stuff
page.text[:1000] # Here's the raw HTML

'\t\t<div id="page1">\r\n\t\t<div class="apphub_Card interactable" style="display: none" onclick="ShowModalContent( \'http://steamcommunity.com/profiles/76561198092689293/recommended/730/?insideModal=1\', \'http://steamcommunity.com/profiles/76561198092689293/recommended/730/\', \'http://steamcommunity.com/profiles/76561198092689293/recommended/730/\',false );">\r\n\t<div class="apphub_CardContentMain">\r\n\t\t<div class="apphub_UserReviewCardContent">\r\n\t\t\t<div class="found_helpful">\r\n\t\t\t\t5,624 of 6,054 people (93%) found this review helpful<br>19 people found this review funny\t\t\t</div>\r\n\r\n\t\t\t<div class="vote_header">\r\n\t\t\t\t\t\t\t\t<div class="reviewInfo">\r\n\t\t\t\t\t<div class="thumb">\r\n\t\t\t\t\t\t<img src="http://steamcommunity-a.akamaihd.net/public/shared/images/userreviews/icon_thumbsUp.png?v=1" width="44" height="44">\r\n\t\t\t\t\t</div>\r\n\r\n\t\t\t\t\t\t\t\t\t\t<div class="title">Recommended</div>\r\n\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t<div class="hours

In [9]:
# After looking at some of the other attributes, I've determined
# that the text attribute is probably the only thing that
# concerns us, so let's use it
text = page.text

In [10]:
# The text, as you can see from the view above, has lots of \r,
# \n, \t characters in it, which might not be good for HTML
# parsing in our case (I won't get into why, partly because I'm
# not completely sure I get why), so let's get rid of all such
# characters, replacing them with spaces instead
# To do this we will use the re module, which allows us to use
# regular expressions
import re
# While we're at it, it's best to get rid of all "<br>" tags since
# they could also present problems during parsing
# We can use the re.sub() method to find one regular expression and
# replace it with another in a given text
text = re.sub(r'\<br\>', r' ', text) # Looks for the string "<br>"
    # and replaces it with a space
text = re.sub(r'[\n\t\r ]+', r' ', text) # Looks for any sequence
    # of carriage returns or whitespace characters and replaces
    # them with a single space
text = text.strip() # Get rid of spaces at either end

In [11]:
# Let's take a look at the cleaned-up version of the source HTML
text[:1000]

'<div id="page1"> <div class="apphub_Card interactable" style="display: none" onclick="ShowModalContent( \'http://steamcommunity.com/profiles/76561198092689293/recommended/730/?insideModal=1\', \'http://steamcommunity.com/profiles/76561198092689293/recommended/730/\', \'http://steamcommunity.com/profiles/76561198092689293/recommended/730/\',false );"> <div class="apphub_CardContentMain"> <div class="apphub_UserReviewCardContent"> <div class="found_helpful"> 5,624 of 6,054 people (93%) found this review helpful 19 people found this review funny </div> <div class="vote_header"> <div class="reviewInfo"> <div class="thumb"> <img src="http://steamcommunity-a.akamaihd.net/public/shared/images/userreviews/icon_thumbsUp.png?v=1" width="44" height="44"> </div> <div class="title">Recommended</div> <div class="hours">156.2 hrs on record</div> </div> <div style="clear: left"></div> </div> <div class="apphub_CardTextContent"> <div class="date_posted">Posted: February 13</div> If i had a dollar for 

In [12]:
# Ok, that looks much nicer!

In [13]:
# Now, let's try to use bs4 to extract data from the cleaned-up HTML
soup = BeautifulSoup(text, "lxml")

In [14]:
soup.contents[0]

<html><body><div id="page1"> <div class="apphub_Card interactable" onclick="ShowModalContent( 'http://steamcommunity.com/profiles/76561198092689293/recommended/730/?insideModal=1', 'http://steamcommunity.com/profiles/76561198092689293/recommended/730/', 'http://steamcommunity.com/profiles/76561198092689293/recommended/730/',false );" style="display: none"> <div class="apphub_CardContentMain"> <div class="apphub_UserReviewCardContent"> <div class="found_helpful"> 5,624 of 6,054 people (93%) found this review helpful 19 people found this review funny </div> <div class="vote_header"> <div class="reviewInfo"> <div class="thumb"> <img height="44" src="http://steamcommunity-a.akamaihd.net/public/shared/images/userreviews/icon_thumbsUp.png?v=1" width="44"/> </div> <div class="title">Recommended</div> <div class="hours">156.2 hrs on record</div> </div> <div style="clear: left"></div> </div> <div class="apphub_CardTextContent"> <div class="date_posted">Posted: February 13</div> If i had a dolla

In [55]:
# The parts of the review that we want are the following:
# 1) <div class="apphub_Card interactable"... ==> contains links to the reviewer's profile,
#    typically like the following entry: "onclick="ShowModalContent(
#    'http://steamcommunity.com/profiles/76561198092689293/recommended/730/?insideModal=1',
#    'http://steamcommunity.com/profiles/76561198092689293/recommended/730/',
#    'http://steamcommunity.com/profiles/76561198092689293/recommended/730/',false );""
#    - these links are notable for the following reasons:
#      i) is the number in the link itself the reviwer's ID number? if so, maybe it can be used
#         with Steam API...
#      ii) if the link is followed, the profile page lists some info that is not included here,
#          such as the reviewer's handle (ok, that is included here, but it's a little buried),
#          how many hours the reviewer has logged (for the game being reviewed) in the last two
#          weeks, whether or not the reviewer is "recommended" (at least seemingly), AND any
#          resulting discussion prompted by the review, etc.
#      iii) ALSO, the link itself can be shortened to lead you to the reviewer's profile page,
#           which has a lot of information, such as in-game achievements, level of player
#           (how many games owned), location of player (at least sometimes), and a lot more
# 2) <div div class="found_helpful"... ==> which will contain the number of people who found the
#    review helpful (and the total people who could have rated it (?) and the percentage
#    resulting from those two numbers) AND it will also contain the number of people who found
#    the review funny. typically, it will look like this "4,719 of 5,081 people (93%) found this
#    review helpful 73 people found this review funny"

# Given the fact that the first part includes the link all we really need to do is get the link,
# so let's just try to extract the links
review_sections = soup.find_all("div", "apphub_Card interactable")
review = review_sections[0]

In [56]:
# There should be about 10 such reviews
len(review_sections)

10

In [57]:
review.text

"    5,624 of 6,054 people (93%) found this review helpful 19 people found this review funny       Recommended 156.2 hrs on record     Posted: February 13 If i had a dollar for each time someone screamed at me in another language, i'd still have no money because i spent it on skins    \xa0         BakeACake 20 products in account     30    "

In [58]:
# Let's use the attrs attribute to get the list of links
link_str = review.attrs['onclick']
link = link_str.split(' ', 2)[1].strip('\',')
link

'http://steamcommunity.com/profiles/76561198092689293/recommended/730/?insideModal=1'

In [59]:
list(review.children)

[' ',
 <div class="apphub_CardContentMain"> <div class="apphub_UserReviewCardContent"> <div class="found_helpful"> 5,624 of 6,054 people (93%) found this review helpful 19 people found this review funny </div> <div class="vote_header"> <div class="reviewInfo"> <div class="thumb"> <img height="44" src="http://steamcommunity-a.akamaihd.net/public/shared/images/userreviews/icon_thumbsUp.png?v=1" width="44"/> </div> <div class="title">Recommended</div> <div class="hours">156.2 hrs on record</div> </div> <div style="clear: left"></div> </div> <div class="apphub_CardTextContent"> <div class="date_posted">Posted: February 13</div> If i had a dollar for each time someone screamed at me in another language, i'd still have no money because i spent it on skins </div> </div> <div class="UserReviewCardContent_Footer"> <div class="gradient"> </div> </div> </div>,
 ' ',
 <div class="apphub_CardContentAuthorBlock tall"> <div class="apphub_friend_block_container"> <a href="http://steamcommunity.com/pro

In [24]:
list(review.descendants)

[' ',
 <div class="apphub_CardContentMain"> <div class="apphub_UserReviewCardContent"> <div class="found_helpful"> 5,624 of 6,054 people (93%) found this review helpful 19 people found this review funny </div> <div class="vote_header"> <div class="reviewInfo"> <div class="thumb"> <img height="44" src="http://steamcommunity-a.akamaihd.net/public/shared/images/userreviews/icon_thumbsUp.png?v=1" width="44"/> </div> <div class="title">Recommended</div> <div class="hours">156.2 hrs on record</div> </div> <div style="clear: left"></div> </div> <div class="apphub_CardTextContent"> <div class="date_posted">Posted: February 13</div> If i had a dollar for each time someone screamed at me in another language, i'd still have no money because i spent it on skins </div> </div> <div class="UserReviewCardContent_Footer"> <div class="gradient"> </div> </div> </div>,
 ' ',
 <div class="apphub_UserReviewCardContent"> <div class="found_helpful"> 5,624 of 6,054 people (93%) found this review helpful 19 peo

In [25]:
review.getText(separator=",,,")

" ,,, ,,, ,,, 5,624 of 6,054 people (93%) found this review helpful 19 people found this review funny ,,, ,,, ,,, ,,, ,,, ,,, ,,,Recommended,,, ,,,156.2 hrs on record,,, ,,, ,,, ,,, ,,, ,,,Posted: February 13,,, If i had a dollar for each time someone screamed at me in another language, i'd still have no money because i spent it on skins ,,, ,,, ,,, ,,,\xa0,,, ,,, ,,, ,,, ,,, ,,, ,,, ,,, ,,, ,,,BakeACake,,, ,,,20 products in account,,, ,,, ,,, ,,, ,,, ,,,30,,, ,,, ,,, ,,, "

In [31]:
stripped_strings = list(review.stripped_strings)
stripped_strings

['5,624 of 6,054 people (93%) found this review helpful 19 people found this review funny',
 'Recommended',
 '156.2 hrs on record',
 'Posted: February 13',
 "If i had a dollar for each time someone screamed at me in another language, i'd still have no money because i spent it on skins",
 'BakeACake',
 '20 products in account',
 '30']

In [42]:
# Given the stripped_strings attribute, the relevant information can be extracted
# exceedingly easily
helpful_funny = stripped_strings[0]
recommended = stripped_strings[1]
hours = re.sub(r',',
               r'',
               stripped_strings[2].split()[0])
date_posted = stripped_strings[3][8:]
review = stripped_strings[4]
products_in_account = stripped_strings[6].split()[0]

In [43]:
recommended

'Recommended'

In [50]:
helpful = helpful_funny.split()[:9]
funny = helpful_funny.split()[9:]
found_helpful = re.sub(r',',
                       r'',
                       helpful[0])
total_helpful_candidates = re.sub(r',',
                                  r'',
                                  helpful[2])
helpful_percentage = float(found_helpful)/float(total_helpful_candidates)
print("found helpful: {}\ntotal people that could have found it helpful: {}"
      "\npercentage of people who found the review helpful: "
      "{}%".format(found_helpful,
                   total_helpful_candidates,
                   helpful_percentage))

found_funny = funny[0]
print("found review funny: {}".format(found_funny))

found helpful: 5624
total people that could have found it helpful: 6054
percentage of people who found the review helpful: 0.9289725801123224%
found review funny: 19


In [45]:
date_posted += ', 2015'

In [46]:
date_posted

'February 13, 2015'

In [51]:
products_in_account

'20'

In [60]:
# Let's define a dictionary with all of the stuff that's being collected
review_dict = dict(review_url=link,
                   recommended=recommended,
                   hours=hours,
                   date_posted=date_posted,
                   review=review,
                   products_in_account=products_in_account,
                   found_helpful=found_helpful,
                   total_found_helpful_candidates=total_helpful_candidates,
                   found_helpful_percentage=helpful_percentage,
                   found_funny=found_funny)
review_dict

{'date_posted': 'February 13, 2015',
 'found_funny': '19',
 'found_helpful': '5624',
 'found_helpful_percentage': 0.9289725801123224,
 'hours': '156.2',
 'products_in_account': '20',
 'recommended': 'Recommended',
 'review': <div class="apphub_Card interactable" onclick="ShowModalContent( 'http://steamcommunity.com/profiles/76561198092689293/recommended/730/?insideModal=1', 'http://steamcommunity.com/profiles/76561198092689293/recommended/730/', 'http://steamcommunity.com/profiles/76561198092689293/recommended/730/',false );" style="display: none"> <div class="apphub_CardContentMain"> <div class="apphub_UserReviewCardContent"> <div class="found_helpful"> 5,624 of 6,054 people (93%) found this review helpful 19 people found this review funny </div> <div class="vote_header"> <div class="reviewInfo"> <div class="thumb"> <img height="44" src="http://steamcommunity-a.akamaihd.net/public/shared/images/userreviews/icon_thumbsUp.png?v=1" width="44"/> </div> <div class="title">Recommended</div>

In [62]:
# Now, let's go to the review link and the profile link and collect other info
review_dict['steam_id_number'] = review_dict['review_url'].split('/')[4]
review_dict['profile_url'] = 'http://steamcommunity.com/profiles/{}'.format(review_dict['steam_id_number'])
review_dict
# Note the profule url actually cannot be constructed in this way all the time:
# sometimes a shortened form of the user-name is used in place of the ID number
# AND instead of the word "profiles" "id" is used in the URL. Due to this, it is
# probably a better idea to get the profile URL directly from the link that is
# provided rather than try to construct it.

{'date_posted': 'February 13, 2015',
 'found_funny': '19',
 'found_helpful': '5624',
 'found_helpful_percentage': 0.9289725801123224,
 'hours': '156.2',
 'products_in_account': '20',
 'profile_url': 'http://steamcommunity.com/profiles/76561198092689293',
 'recommended': 'Recommended',
 'review': <div class="apphub_Card interactable" onclick="ShowModalContent( 'http://steamcommunity.com/profiles/76561198092689293/recommended/730/?insideModal=1', 'http://steamcommunity.com/profiles/76561198092689293/recommended/730/', 'http://steamcommunity.com/profiles/76561198092689293/recommended/730/',false );" style="display: none"> <div class="apphub_CardContentMain"> <div class="apphub_UserReviewCardContent"> <div class="found_helpful"> 5,624 of 6,054 people (93%) found this review helpful 19 people found this review funny </div> <div class="vote_header"> <div class="reviewInfo"> <div class="thumb"> <img height="44" src="http://steamcommunity-a.akamaihd.net/public/shared/images/userreviews/icon_th

In [63]:
review_page = requests.get(review_dict['review_url'])
profile_page = requests.get(review_dict['profile_url'])
review_page_html = review_page.text
profile_page_html = profile_page.text
# Preprocess HTML
review_page_html = re.sub(r'\<br\>',
                          r' ',
                          review_page_html)
review_page_html = re.sub(r'[\n\t\r ]+',
                          r' ',
                          review_page_html)
review_page_html = review_page_html.strip()
profile_page_html = re.sub(r'\<br\>',
                           r' ',
                           profile_page_html)
profile_page_html = re.sub(r'[\n\t\r ]+',
                           r' ',
                           profile_page_html)
profile_page_html = profile_page_html.strip()

In [64]:
# Now use BeautifulSoup to parse the HTML
review_soup = BeautifulSoup(review_page_html, "lxml")
profile_soup = BeautifulSoup(profile_page_html, "lxml")

In [65]:
# Let's take a look first at the review page and see if there's anything there that
# can be collected
review_soup.contents

['html',
 <html> <head> <title>Steam Community :: BakeACake :: Review for Counter-Strike: Global Offensive</title> <link href="http://steamcommunity-a.akamaihd.net/public/shared/css/motiva_sans.css?v=F3z3QpekjE2f" rel="stylesheet" type="text/css"/> <link href="http://steamcommunity-a.akamaihd.net/public/shared/css/buttons.css?v=4iAytERcUqWU" rel="stylesheet" type="text/css"/> <link href="http://steamcommunity-a.akamaihd.net/public/shared/css/shared_global.css?v=57x2Z4EBPbIl" rel="stylesheet" type="text/css"/> <link href="http://steamcommunity-a.akamaihd.net/public/css/globalv2.css?v=yWZ8QbCWv1se" rel="stylesheet" type="text/css"/> <link href="http://steamcommunity-a.akamaihd.net/public/shared/css/motiva_sans.css?v=F3z3QpekjE2f" rel="stylesheet" type="text/css"/> <link href="http://steamcommunity-a.akamaihd.net/public/css/skin_1/workshop.css?v=KMIaI6YCOfoc" rel="stylesheet" type="text/css"/> <link href="http://steamcommunity-a.akamaihd.net/public/css/skin_1/workshop_itemdetails.css?v=M2

In [67]:
username_span = review_soup.find('span', 'profile_small_header_name')
review_dict['username'] = username_span.string
review_dict

{'date_posted': 'February 13, 2015',
 'found_funny': '19',
 'found_helpful': '5624',
 'found_helpful_percentage': 0.9289725801123224,
 'hours': '156.2',
 'products_in_account': '20',
 'profile_url': 'http://steamcommunity.com/profiles/76561198092689293',
 'recommended': 'Recommended',
 'review': <div class="apphub_Card interactable" onclick="ShowModalContent( 'http://steamcommunity.com/profiles/76561198092689293/recommended/730/?insideModal=1', 'http://steamcommunity.com/profiles/76561198092689293/recommended/730/', 'http://steamcommunity.com/profiles/76561198092689293/recommended/730/',false );" style="display: none"> <div class="apphub_CardContentMain"> <div class="apphub_UserReviewCardContent"> <div class="found_helpful"> 5,624 of 6,054 people (93%) found this review helpful 19 people found this review funny </div> <div class="vote_header"> <div class="reviewInfo"> <div class="thumb"> <img height="44" src="http://steamcommunity-a.akamaihd.net/public/shared/images/userreviews/icon_th

In [33]:
# Now let's get the number of hours the reviewer has played in the last 2 weeks
last_2_weeks_hours_div = review_soup.find('div', 'playTime')
review_dict['hours_last_2_weeks'] = re.sub(r',',
                                           r'',
                                           last_2_weeks_hours_div.string.split()[0])
review_dict

{'date_posted': 'February 13, 2015',
 'found_funny': '242',
 'found_helpful': '5163',
 'found_helpful_percentage': 0.9302702702702703,
 'hours': '151.9',
 'hours_last_2_weeks': '44.3',
 'products_in_account': '20',
 'profile_url': 'http://steamcommunity.com/profiles/76561198092689293',
 'recommended': 'Recommended',
 'review': "If i had a dollar for each time someone screamed at me in another language, i'd still have no money because i spent it on skins",
 'review_url': 'http://steamcommunity.com/profiles/76561198092689293/recommended/730/?insideModal=1',
 'steam_id_number': '76561198092689293',
 'total_found_helpful_candidates': '5550',
 'username': 'BakeACake'}

In [34]:
# Now let's get the comments (if any)
div_comments = review_soup.find('div', 'commentthread_count')
review_dict['num_comments'] = re.sub(r',',
                                     r'',
                                     list(div_comments.strings)[1])
review_dict

{'date_posted': 'February 13, 2015',
 'found_funny': '242',
 'found_helpful': '5163',
 'found_helpful_percentage': 0.9302702702702703,
 'hours': '151.9',
 'hours_last_2_weeks': '44.3',
 'num_comments': '27',
 'products_in_account': '20',
 'profile_url': 'http://steamcommunity.com/profiles/76561198092689293',
 'recommended': 'Recommended',
 'review': "If i had a dollar for each time someone screamed at me in another language, i'd still have no money because i spent it on skins",
 'review_url': 'http://steamcommunity.com/profiles/76561198092689293/recommended/730/?insideModal=1',
 'steam_id_number': '76561198092689293',
 'total_found_helpful_candidates': '5550',
 'username': 'BakeACake'}

In [35]:
# Note: We could try to collect the comment text, but it's a little complicated
# due to the fact that comments could stretch across multiple pages (10 per page)
# We will just save this for a later time since it's unclear if this data would
# be useful, anyway.

In [36]:
# Now let's try to look at the profile page and collect data from there

In [37]:
profile_soup.contents

['html',
 <html> <head> <meta content="text/html; charset=utf-8" http-equiv="Content-Type"/> <title>Steam Community :: BakeACake</title> <link href="/favicon.ico" rel="shortcut icon" type="image/x-icon"/> <link href="http://steamcommunity-a.akamaihd.net/public/shared/css/motiva_sans.css?v=F3z3QpekjE2f" rel="stylesheet" type="text/css"/> <link href="http://steamcommunity-a.akamaihd.net/public/shared/css/buttons.css?v=4iAytERcUqWU" rel="stylesheet" type="text/css"/> <link href="http://steamcommunity-a.akamaihd.net/public/shared/css/shared_global.css?v=N7ooHVKbii2d" rel="stylesheet" type="text/css"/> <link href="http://steamcommunity-a.akamaihd.net/public/css/globalv2.css?v=yWZ8QbCWv1se" rel="stylesheet" type="text/css"/> <link href="http://steamcommunity-a.akamaihd.net/public/css/skin_1/modalContent.css?v=.3ZY4VGdKrzFz" rel="stylesheet" type="text/css"/> <link href="http://steamcommunity-a.akamaihd.net/public/css/skin_1/profilev2.css?v=UK9TJRIBrQKu" rel="stylesheet" type="text/css"/> <li

In [38]:
# First, we can get the player's "level" (not exactly sure what the level pertains
# to, but we'll try to find out later)
review_dict['friend_player_level'] = profile_soup.find('div', 'friendPlayerLevel').string
review_dict

{'date_posted': 'February 13, 2015',
 'found_funny': '242',
 'found_helpful': '5163',
 'found_helpful_percentage': 0.9302702702702703,
 'friend_player_level': '5',
 'hours': '151.9',
 'hours_last_2_weeks': '44.3',
 'num_comments': '27',
 'products_in_account': '20',
 'profile_url': 'http://steamcommunity.com/profiles/76561198092689293',
 'recommended': 'Recommended',
 'review': "If i had a dollar for each time someone screamed at me in another language, i'd still have no money because i spent it on skins",
 'review_url': 'http://steamcommunity.com/profiles/76561198092689293/recommended/730/?insideModal=1',
 'steam_id_number': '76561198092689293',
 'total_found_helpful_candidates': '5550',
 'username': 'BakeACake'}

In [39]:
# Let's collect the game achievements summary data
achievements_string_split = list(profile_soup.find('span',
                                                   'game_info_achievement_summary').stripped_strings)[1].split()
review_dict['achievement_progress'] = dict(achievements=achievements_string_split[0],
                                           total_achievements_possible=achievements_string_split[2])
review_dict

{'achievement_progress': {'achievements': '88',
  'total_achievements_possible': '167'},
 'date_posted': 'February 13, 2015',
 'found_funny': '242',
 'found_helpful': '5163',
 'found_helpful_percentage': 0.9302702702702703,
 'friend_player_level': '5',
 'hours': '151.9',
 'hours_last_2_weeks': '44.3',
 'num_comments': '27',
 'products_in_account': '20',
 'profile_url': 'http://steamcommunity.com/profiles/76561198092689293',
 'recommended': 'Recommended',
 'review': "If i had a dollar for each time someone screamed at me in another language, i'd still have no money because i spent it on skins",
 'review_url': 'http://steamcommunity.com/profiles/76561198092689293/recommended/730/?insideModal=1',
 'steam_id_number': '76561198092689293',
 'total_found_helpful_candidates': '5550',
 'username': 'BakeACake'}

In [40]:
# We can also collect the total number of badges the player has
review_dict['num_badges'] = list(profile_soup.find('div', 'profile_badges').stripped_strings)[1]
review_dict

{'achievement_progress': {'achievements': '88',
  'total_achievements_possible': '167'},
 'date_posted': 'February 13, 2015',
 'found_funny': '242',
 'found_helpful': '5163',
 'found_helpful_percentage': 0.9302702702702703,
 'friend_player_level': '5',
 'hours': '151.9',
 'hours_last_2_weeks': '44.3',
 'num_badges': '4',
 'num_comments': '27',
 'products_in_account': '20',
 'profile_url': 'http://steamcommunity.com/profiles/76561198092689293',
 'recommended': 'Recommended',
 'review': "If i had a dollar for each time someone screamed at me in another language, i'd still have no money because i spent it on skins",
 'review_url': 'http://steamcommunity.com/profiles/76561198092689293/recommended/730/?insideModal=1',
 'steam_id_number': '76561198092689293',
 'total_found_helpful_candidates': '5550',
 'username': 'BakeACake'}

In [41]:
# Let's also collect the total number of reviews and screenshots (total number of
# games is also available, but it is the same as that stored under the
# products_in_account key)
profile_counts = list(profile_soup.find('div', 'profile_item_links').stripped_strings)
review_dict['num_screenshots'] = profile_counts[3]
review_dict['num_reviews'] = profile_counts[5]
review_dict

{'achievement_progress': {'achievements': '88',
  'total_achievements_possible': '167'},
 'date_posted': 'February 13, 2015',
 'found_funny': '242',
 'found_helpful': '5163',
 'found_helpful_percentage': 0.9302702702702703,
 'friend_player_level': '5',
 'hours': '151.9',
 'hours_last_2_weeks': '44.3',
 'num_badges': '4',
 'num_comments': '27',
 'num_reviews': '3',
 'num_screenshots': '113',
 'products_in_account': '20',
 'profile_url': 'http://steamcommunity.com/profiles/76561198092689293',
 'recommended': 'Recommended',
 'review': "If i had a dollar for each time someone screamed at me in another language, i'd still have no money because i spent it on skins",
 'review_url': 'http://steamcommunity.com/profiles/76561198092689293/recommended/730/?insideModal=1',
 'steam_id_number': '76561198092689293',
 'total_found_helpful_candidates': '5550',
 'username': 'BakeACake'}

In [42]:
# Finally, let's get the number of groups the reviewer is a member of and the
# number of friends the reviewer has on Steam
group_counts = list(profile_soup.find('div', 'profile_group_links').stripped_strings)
review_dict['num_groups'] = group_counts[1]
review_dict

{'achievement_progress': {'achievements': '88',
  'total_achievements_possible': '167'},
 'date_posted': 'February 13, 2015',
 'found_funny': '242',
 'found_helpful': '5163',
 'found_helpful_percentage': 0.9302702702702703,
 'friend_player_level': '5',
 'hours': '151.9',
 'hours_last_2_weeks': '44.3',
 'num_badges': '4',
 'num_comments': '27',
 'num_groups': '21',
 'num_reviews': '3',
 'num_screenshots': '113',
 'products_in_account': '20',
 'profile_url': 'http://steamcommunity.com/profiles/76561198092689293',
 'recommended': 'Recommended',
 'review': "If i had a dollar for each time someone screamed at me in another language, i'd still have no money because i spent it on skins",
 'review_url': 'http://steamcommunity.com/profiles/76561198092689293/recommended/730/?insideModal=1',
 'steam_id_number': '76561198092689293',
 'total_found_helpful_candidates': '5550',
 'username': 'BakeACake'}

In [43]:
friend_counts = list(profile_soup.find('div', 'profile_friend_links').stripped_strings)
review_dict['num_friends'] = friend_counts[1]
review_dict

{'achievement_progress': {'achievements': '88',
  'total_achievements_possible': '167'},
 'date_posted': 'February 13, 2015',
 'found_funny': '242',
 'found_helpful': '5163',
 'found_helpful_percentage': 0.9302702702702703,
 'friend_player_level': '5',
 'hours': '151.9',
 'hours_last_2_weeks': '44.3',
 'num_badges': '4',
 'num_comments': '27',
 'num_friends': '70',
 'num_groups': '21',
 'num_reviews': '3',
 'num_screenshots': '113',
 'products_in_account': '20',
 'profile_url': 'http://steamcommunity.com/profiles/76561198092689293',
 'recommended': 'Recommended',
 'review': "If i had a dollar for each time someone screamed at me in another language, i'd still have no money because i spent it on skins",
 'review_url': 'http://steamcommunity.com/profiles/76561198092689293/recommended/730/?insideModal=1',
 'steam_id_number': '76561198092689293',
 'total_found_helpful_candidates': '5550',
 'username': 'BakeACake'}

In [85]:
import sys
try:
    profile_soup.find('div', 'profile_friend_links').stripped_strings
    int("t")
except (AttributeError,
        ValueError) as e:
    print(e)

invalid literal for int() with base 10: 't'


In [44]:
# Now that we have figured out a way to collect a lot of data, let's create a
# function that will automatically create dictionaries like the one above

In [45]:
# Will need to make use of the UnicodeDammit method in teh bs4
# (BeautifulSoup) because some of the reviews/HTML source contains
# text in a non-ascii encoding
# I won't get too much into this, just know that it's often an issue and
# that we can deal with it by trying a to decode a text with a lot of
# different encoding and then re-encode it to ascii
# We will also be making use of the sys module, which is part of the
# standard library
# One other module we will need to make use of is the time module, another
# standard library module that allows us to make a program wait for a bit
# before moving on, which I'll talk about it in a minute
from bs4 import UnicodeDammit
import time
import sys
# These are the codecs that we will try to decode the HTML with. I'm
# probably going a little overboard, but I like to be exhaustive
codecs = ["windows-1252", "utf8", "ascii", "cp500", "cp850", "cp852",
          "cp858", "cp1140", "cp1250", "iso-8859-1", "iso8859_2",
          "iso8859_15", "iso8859_16", "mac_roman", "mac_latin2", "utf32",
          "utf16"]

In [46]:
# Imports
import re
import sys
import time
import requests
from bs4 import (BeautifulSoup,
                 UnicodeDammit)

# Define a couple useful regular expressions
SPACE = re.compile(r'[\s]+')
BREAKS_REGEX = re.compile(r'\<br\>')
COMMA = re.compile(r',')

# Codecs for use with UnicodeDammit
codecs = ["windows-1252", "utf8", "ascii", "cp500", "cp850", "cp852",
          "cp858", "cp1140", "cp1250", "iso-8859-1", "iso8859_2",
          "iso8859_15", "iso8859_16", "mac_roman", "mac_latin2", "utf32",
          "utf16"]

def get_review_data_for_game(appid, time_out=0.5, limit=0, sleep=10):
    '''
    Generate dictionaries for each review for a given game.

    The dictionaries will contain keys for the review text, the reviewer ID,
    the reviewer's user-name, the number of friends the reviewer has, the
    the number of reviews the reviewer has written, and much more.

    :param appid: ID corresponding to a given game
    :type appid: str
    :param timeout: amount of time allowed to go by without hearing
                    response while using requests.get() method
    :type timeout: float
    :param limit: the maximum number of reviews to collect
    :type limit: int (default: 0, which signifies all)
    :param sleep: amount of time to wait between reading different pages on
                  Steam
    :type sleep: int/float
    :yields: dictionary with keys for various pieces of data related to a
             single review, including the review itself, the number of hours
             the reviewer has played the game, etc.
    '''

    if limit == 0:
        limit = -1
    range_begin = 0
    i = 1
    reviews_count = 0
    while True:
        # Get unique URL for values of range_begin and i
        base_url = 'http://steamcommunity.com/app/{2}/homecontent/?user' \
                   'reviewsoffset={0}&p=1&itemspage={1}&screenshotspage' \
                   '={1}&videospage={1}&artpage={1}&allguidepage={1}&web' \
                   'guidepage={1}&integratedguidepage={1}&discussionspage' \
                   '={1}&appid={2}&appHubSubSection=10&appHubSubSection=' \
                   '10&l=english&browsefilter=toprated&filterLanguage=' \
                   'default&searchText=&forceanon=1'.format(range_begin,
                                                            i,
                                                            appid)
        # Get the URL content
        base_page = None
        time.sleep(sleep)
        # Get the HTML page; if there's a timeout error, then catch it and
        # exit out of the loop, effectively ending the function.
        try:
            base_page = requests.get(base_url,
                                     timeout=time_out)
        except requests.exceptions.Timeout as e:
            print("There was a Timeout error...")
            break
        # If there's nothing at this URL, page might have no value at all,
        # in which case we should break out of the loop
        # Another situation where we'd want to exit from the loop is if the
        # page.text contains only an empty string or a string that has only
        # a sequence of one or more spaces
        if not base_page:
            break
        elif not base_page.text.strip():
            break
        # Preprocess the HTML source, getting rid of "<br>" tags and
        # replacing any sequence of one or more carriage returns or
        # whitespace characters with a single space
        base_html = SPACE.sub(r' ',
                              BREAKS_REGEX.sub(r' ',
                                               base_page.text.strip()))
        # Try to decode the HTML to unicode and then re-encode the text
        # with ASCII, ignoring any characters that can't be represented
        # with ASCII
        base_html = UnicodeDammit(base_html,
                                  codecs).unicode_markup.encode('ascii',
                                                                'ignore')

        # Parse the source HTML with BeautifulSoup
        source_soup = BeautifulSoup(base_html,
                                    'lxml')
        reviews = soup.find_all('div',
                                'apphub_Card interactable')

        # Iterate over the reviews in the source HTML and find data for
        # each review, yielding a dictionary
        for review in reviews:

            # Get links to review URL, profile URL, Steam ID number
            review_url = review.attrs['onclick'].split(' ',
                                                       2)[1].strip("',")
            review_url_split = review_url.split('/')
            steam_id_number = review_url_split[4]
            profile_url = '/'.join(review_url_split[:5])

            # Get other data within the base reviews page
            stripped_strings = list(review.stripped_strings)
            # Parsing the HTML in this way depends on stripped_strings
            # having a length of at least 8
            if len(stripped_strings) >= 8:
                print(stripped_strings)
                # Extracting data from the text that supplies the number
                # of users who found the review helpful and/or funny
                # depends on a couple facts
                helpful_and_funny_list = stripped_strings[0].split()
                if (helpful_and_funny_list[8] == 'helpful'
                    and len(helpful_and_funny_list) == 15):
                    helpful = helpful_and_funny_list[:9]
                    funny = helpful_and_funny_list[9:]
                    num_found_helpful = int(COMMA.sub(r'',
                                                  helpful[0]))
                    num_voted_helpfulness = int(COMMA.sub(r'',
                                                          helpful[2]))
                    num_found_unhelpful = \
                        num_voted_helpfulness - num_found_helpful
                    found_helpful_percentage = \
                        float(num_found_helpful)/num_voted_helpfulness
                    num_found_funny = funny[0]
                recommended = stripped_strings[1]
                total_game_hours = COMMA.sub(r'',
                                             stripped_strings[2]
                                             .split()[0])
                date_posted = '{}, 2015'.format(stripped_strings[3][8:])
                review_text = ' '.join(stripped_strings[4:-3])
                num_games_owned = stripped_strings[-2].split()[0]
            else:
                sys.stderr.write('Found incorrect number of '
                                 '"stripped_strings" in review HTML '
                                 'element. stripped_strings: {}\n'
                                 'Continuing.'
                                 .format(stripped_strings))
                continue

            # Make dictionary for holding all the data related to the
            # review
            review_dict = \
                dict(review_url=review_url,
                     recommended=recommended,
                     total_game_hours=total_game_hours,
                     date_posted=date_posted,
                     review=review_text,
                     num_games_owned=num_games_owned,
                     num_found_helpful=num_found_helpful,
                     num_found_unhelpful=num_found_unhelpful,
                     num_voted_helpfulness=num_voted_helpfulness,
                     found_helpful_percentage=found_helpful_percentage,
                     num_found_funny=num_found_funny,
                     steam_id_number=steam_id_number,
                     profile_url=profile_url)

            # Follow links to profile and review pages and collect data
            # from there
            time.sleep(sleep)
            review_page = requests.get(review_dict['review_url'])
            time.sleep(sleep)
            profile_page = requests.get(review_dict['profile_url'])
            review_page_html = review_page.text
            profile_page_html = profile_page.text

            # Preprocess HTML and try to decode the HTML to unicode and
            # then re-encode the text with ASCII, ignoring any characters
            # that can't be represented with ASCII
            review_page_html = \
                SPACE.sub(r' ',
                          BREAKS_REGEX.sub(r' ',
                                           review_page_html.strip()))
            review_page_html = \
                UnicodeDammit(review_page_html,
                              codecs).unicode_markup.encode('ascii',
                                                            'ignore')
            profile_page_html = \
                SPACE.sub(r' ',
                          BREAKS_REGEX.sub(r' ',
                                           profile_page_html.strip()))
            profile_page_html = \
                UnicodeDammit(profile_page_html,
                              codecs).unicode_markup.encode('ascii',
                                                            'ignore')

            # Now use BeautifulSoup to parse the HTML
            review_soup = BeautifulSoup(review_page_html,
                                        'lxml')
            profile_soup = BeautifulSoup(profile_page_html,
                                         'lxml')

            # Get the user-name from the review page
            review_dict['username'] = \
                review_soup.find('span',
                                 'profile_small_header_name').string

            # Get the number of hours the reviewer played the game in the
            # last 2 weeks
            review_dict['hours_previous_2_weeks'] = \
                COMMA.sub(r'',
                          review_soup.find('div',
                                           'playTime').string.split()[0])

            # Get the number of comments users made on the review (if any)
            review_dict['num_comments'] = \
                COMMA.sub(r'',
                          list(review_soup
                               .find('div',
                                     'commentthread_count')
                               .strings)[1])

            # Get the reviewer's "level" (friend player level)
            friend_player_level = profile_soup.find('div',
                                                    'friendPlayerLevel')
            if friend_player_level:
                review_dict['friend_player_level'] = \
                    friend_player_level.string
            else:
                review_dict['friend_player_level'] = None

            # Get the game achievements summary data
            achievements = \
                profile_soup.find('span',
                                  'game_info_achievement_summary')
            if achievements:
                achievements = achievements.stripped_strings
                if achievements:
                    achievements = list(achievements)[1].split()
                    review_dict['achievement_progress'] = \
                        dict(num_achievements_attained=achievements[0],
                             num_achievements_possible=achievements[2])
                else:
                    review_dict['achievement_progress'] = \
                        dict(num_achievements_attained=None,
                             num_achievements_possible=None)
            else:
                review_dict['achievement_progress'] = \
                    dict(num_achievements_attained=None,
                         num_achievements_possible=None)

            # Get the number of badges the reviewer has earned on the site
            badges = profile_soup.find('div',
                                       'profile_badges')
            if badges:
                badges = badges.stripped_strings
                if badges:
                    review_dict['num_badges'] = list(badges)[1]
                else:
                    review_dict['num_badges'] = None
            else:
                review_dict['num_badges'] = None

            # Get the number of reviews the reviewer has written across all
            # games and the number of screenshots he/she has taken
            reviews_screens = profile_soup.find('div',
                                                'profile_item_links')
            if reviews_screens:
                reviews_screens = reviews_screens.stripped_strings
                if reviews_screens:
                    reviews_screens = list(reviews_screens)
                    review_dict['num_screenshots'] = reviews_screens[3]
                    review_dict['num_reviews'] = reviews_screens[5]
                else:
                    review_dict['num_screenshots'] = None
                    review_dict['num_reviews'] = None
            else:
                review_dict['num_screenshots'] = None
                review_dict['num_reviews'] = None

            # Get the number of groups the reviewer is part of on the site
            groups = profile_soup.find('div',
                                       'profile_group_links')
            if groups:
                groups = groups.stripped_strings
                if groups:
                    review_dict['num_groups'] = list(groups)[1]
                else:
                    review_dict['num_groups'] = None
            else:
                review_dict['num_groups'] = None

            # Get the number of friends the reviwer has on the site
            friends = profile_soup.find('div',
                                        'profile_friend_links')
            if friends:
                friends = friends.stripped_strings
                if friends:
                    review_dict['num_friends'] = list(friends)[1]
                else:
                    review_dict['num_friends'] = None
            else:
                review_dict['num_friends'] = None

            yield review_dict

            reviews_count += 1
            if reviews_count == limit:
                break

        if reviews_count == limit:
            break

        # Increment the range_begin and i variables, which will be used in
        # the generation of the next page of reviews
        range_begin += 10
        i += 1

In [47]:
# Alright, let's see if that worked
# We'll create an empty set of reviews and as each new set of reviews is
# generated by the function, we'll add in all of the reviews in the set
# to the set of reviews (if any are duplicates, they will not be added)
reviews = [review for review in get_review_data_for_game('730',
                                                         time_out=3.0,
                                                         limit=33)]

['5,163 of 5,550 people (93%) found this review helpful 242 people found this review funny', 'Recommended', '151.9 hrs on record', 'Posted: February 13', "If i had a dollar for each time someone screamed at me in another language, i'd still have no money because i spent it on skins", 'BakeACake', '20 products in account', '27']
['8,925 of 9,744 people (92%) found this review helpful 145 people found this review funny', 'Recommended', '341.3 hrs on record', 'Posted: February 1', "It's like roulette; fun until it turns into Russian.", 'Delta', '124 products in account', '69']
['15,517 of 17,300 people (90%) found this review helpful 1 person found this review funny', 'Recommended', '1,176.1 hrs on record', 'Posted: February 27, 2014', 'you either die a noob or live long enough to get called a hacker', 'compliKATIEd', '114 products in account', '174']
['3,571 of 3,956 people (90%) found this review helpful 42 people found this review funny', 'Recommended', '554.6 hrs on record', 'Posted: 

In [48]:
len(reviews)
reviews = list(reviews)
reviews

[{'achievement_progress': {'num_achievements_attained': '88',
   'num_achievements_possible': '167'},
  'date_posted': 'February 13, 2015',
  'found_helpful_percentage': 0.9302702702702703,
  'friend_player_level': '5',
  'hours_previous_2_weeks': '44.3',
  'num_badges': '4',
  'num_comments': '27',
  'num_found_funny': '242',
  'num_found_helpful': 5163,
  'num_found_unhelpful': 387,
  'num_friends': '70',
  'num_games_owned': '20',
  'num_groups': '21',
  'num_reviews': '3',
  'num_screenshots': '113',
  'num_voted_helpfulness': 5550,
  'profile_url': 'http://steamcommunity.com/profiles/76561198092689293',
  'recommended': 'Recommended',
  'review': "If i had a dollar for each time someone screamed at me in another language, i'd still have no money because i spent it on skins",
  'review_url': 'http://steamcommunity.com/profiles/76561198092689293/recommended/730/?insideModal=1',
  'steam_id_number': '76561198092689293',
  'total_game_hours': '151.9',
  'username': 'BakeACake'},
 {'ac

In [49]:
len(reviews)

33