# Part 1: Get top 50 top seller games with mixed reviews (< 80%)

The search page only loads 50 results at a time; more are loaded as you scroll down, so we use a browser-automation tool such as Selenium to scroll the page.

In [1]:
# Install selenium and chromedriver
# These are shell commands run through IPython's `!` escape; they assume an
# apt-based (Debian/Ubuntu) environment.
# NOTE(review): no versions are pinned, so a fresh run may install a newer
# selenium / chromedriver than the one this notebook was written against.
# Refresh the apt package index before installing
!apt-get update 
# Python bindings for Selenium
!pip install selenium
# System package that provides the chromedriver executable
!apt install chromium-chromedriver

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
0% [Connecting to archive.ubuntu.com (185.125.190.36)] [Connecting to security.0% [Connecting to archive.ubuntu.com (185.125.190.36)] [Connecting to security.0% [1 InRelease gpgv 3,626 B] [Connecting to archive.ubuntu.com (185.125.190.36                                                                               Get:2 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
0% [1 InRelease gpgv 3,626 B] [Waiting for headers] [2 InRelease 14.2 kB/88.7 k                                                                               Hit:3 http://archive.ubuntu.com/ubuntu bionic InRelease
Get:4 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease [15.9 kB]
Ign:5 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:6 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  I

In [2]:
# Selenium entry point, plus the Chrome options used to run the browser headless
from selenium import webdriver

options = webdriver.ChromeOptions()
# Command-line switches forwarded to Chrome when the driver starts
for chrome_flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
  options.add_argument(chrome_flag)

# Import other methods for scrolling and finding elements
from selenium.webdriver.common.by import By
import time
from bs4 import BeautifulSoup as bs

In [19]:
# Scrape the Steam "top sellers" search page for games with a low review score
# but a large number of reviews. The page loads more rows as it is scrolled, so
# we scroll, wait, and parse the newly loaded rows until enough games are found.
# Result: `items` maps game name -> {'id', 'review_score', 'review_count'}.

# Start the browser. chromedriver is resolved from PATH; passing its path as the
# first positional argument is deprecated in Selenium 4 and no longer accepted
# by newer releases.
driver = webdriver.Chrome(options=options)

# Variable to store games that meet the criteria (keyed by game name)
items = {}

# Number of games desired, review score upper limit, review count lower limit, scrolling amount upper limit
itemTargetCount = 25
score_thresh = 80
count_thresh = 20000
max_scroll = 100

# Seconds to wait after each scroll so the next batch of rows can load
scroll_pause = 3

# -th scroll for debugging and stop condition, and flag that ends the outer loop
scroll = 0
is_looping = True

# Number of result rows already examined, so every row is parsed only once
rows_seen = 0

# try/finally guarantees the browser session is closed even if scraping fails
try:
  # Load the page
  driver.get("https://store.steampowered.com/search/?filter=topsellers")

  # Initial page height; an unchanged height after a scroll means nothing new was loaded
  last_height = driver.execute_script("return document.body.scrollHeight")

  # Start the loop (infinite scrolling)
  while (itemTargetCount > len(items)) and (scroll < max_scroll) and (is_looping):

    # Debugging tool
    print(f'Scroll #{scroll} started.')

    # Scroll down to the bottom of the page
    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    # Wait for the page to load the next batch of rows
    time.sleep(scroll_pause)

    # If scrolling did not change the page height, this is the last pass.
    # The rows that are present are still parsed below before stopping, so the
    # initially loaded rows are not lost when the first scroll loads nothing.
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
      is_looping = False

    # Update the height
    last_height = new_height

    # Get the row boxes containing game information
    games = driver.find_elements(by=By.CLASS_NAME, value="search_result_row")

    # Iterate over the rows that have not been examined yet
    for game in games[rows_seen:]:
      # Parse the HTML using BeautifulSoup
      soup = bs(game.get_attribute('outerHTML'), 'html.parser')

      # Try getting the game info (name, id, review score, review count) if available.
      # Rows missing any of these pieces (no review summary, no app id,
      # unexpected tooltip text) are skipped.
      try:
        game_name = soup.find(class_="title").text
        game_id = soup.find("a")['data-ds-appid']
        tooltip = soup.find(class_="search_review_summary")['data-tooltip-html']
        # Score: the text between '<br>' and the first '%'
        review_score = int(tooltip.split('<br>')[1].split('%')[0])
        # Count: the first word after ' the ', with thousands separators removed
        review_count = int(tooltip.split(' the ')[1].split(' ')[0].replace(',', ''))
      except (AttributeError, KeyError, TypeError, IndexError, ValueError):
        continue

      # Select the "bad" games with decent amount of reviews as set by the thresholds
      # also remove games that come in bundles (more than 1 id tags)
      if (review_score < score_thresh) and (review_count > count_thresh) and ('Bundle' not in game_name) and (len(game_id.split(','))==1):
        items[game_name] = {'id': game_id, 'review_score': review_score, 'review_count': review_count}

        # Check game count after adding a new game
        if itemTargetCount == len(items):
          is_looping = False
          break

    # Remember how far we got so the next pass only looks at new rows
    rows_seen = len(games)

    # Debugging tool
    print(f'Scroll #{scroll} finished.')
    print(f'Number of games obtained = {len(items)} \n')
    scroll += 1

finally:
  # Close the session
  driver.quit()

Scroll #0 started.
Scroll #0 finished.
Number of games obtained = 4 

Scroll #1 started.
Scroll #1 finished.
Number of games obtained = 9 

Scroll #2 started.
Scroll #2 finished.
Number of games obtained = 10 

Scroll #3 started.
Scroll #3 finished.
Number of games obtained = 11 

Scroll #4 started.
Scroll #4 finished.
Number of games obtained = 13 

Scroll #5 started.
Scroll #5 finished.
Number of games obtained = 13 

Scroll #6 started.
Scroll #6 finished.
Number of games obtained = 16 

Scroll #7 started.
Scroll #7 finished.
Number of games obtained = 18 

Scroll #8 started.
Scroll #8 finished.
Number of games obtained = 21 

Scroll #9 started.
Scroll #9 finished.
Number of games obtained = 24 

Scroll #10 started.
Scroll #10 finished.
Number of games obtained = 25 



In [22]:
# Build a table of the selected games, most-reviewed first.
# (Uncomment the last line to also save the table as a csv file.)
import pandas as pd

# orient='index' turns each game (dict key) into a row, so numeric columns keep
# their integer dtype. Naming the columns explicitly keeps the sort below valid
# even when no game matched and `items` is empty.
df = (
    pd.DataFrame.from_dict(items, orient='index', columns=['id', 'review_score', 'review_count'])
    .sort_values(by='review_count', ascending=False)
    .reset_index()
    .rename(columns={'index': 'name'})
)

df

# df.to_csv('game_list_new.csv', index=False)

Unnamed: 0,name,id,review_score,review_count
0,Cyberpunk 2077,1091500,77,423516
1,DayZ,221100,72,263793
2,New World,1063730,67,199699
3,No Man's Sky,275850,74,190203
4,Battlefield™ 2042,1517290,30,113600
5,NARAKA: BLADEPOINT,1203220,79,110761
6,Battlefield V,1238810,70,100851
7,Grand Theft Auto IV: The Complete Edition,12210,77,94058
8,Dying Light 2 Stay Human,534380,78,81327
9,Elite Dangerous,359320,76,64208
