# Adapted from https://jovian.com/altamashwaseem04/scraping-top-selling-games-on-steam

#### Credits to altamashwaseem04 for the reference script, which formed the basis of this scraping script. 

## Using Selenium WebDriver to automate web page browsing

In [None]:
#Utilizing webdriver on selenium to browse webpages
import re #used to import the regular expressions module

#Using pandas for dataframe and csv conversion
import pandas as pd
from bs4 import BeautifulSoup #new import for using BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support.select import Select
from tqdm import tqdm  # Import tqdm for the progress bar

In [None]:
import os
from selenium import webdriver

In [None]:
#IMPORTANT, DOWNLOAD chromedriver.exe AND PATH IN YOUR LOCAL DEVICE
#Specify the path to the directory containing chromedriver.exe
webdriver_path = "Local Device - Path"

# Add the directory to the system PATH
os.environ["PATH"] += os.pathsep + webdriver_path

# Initialize the WebDriver
driver = webdriver.Chrome()

try:
    # Use the WebDriver as needed (e.g., navigate to a website)
    driver.get("https://www.example.com")

    # Perform your scraping operations
finally:
    # Close the WebDriver even when navigation or scraping raises, so the
    # browser process started above is not left running.
    driver.quit()


In [None]:
#webdriver creation
# Builds the module-level driver `wd` that the following cells use.
options = webdriver.ChromeOptions()
# Command-line switches passed to Chrome.
# NOTE(review): presumably --headless hides the browser window and the other two
# work around sandbox / shared-memory limits in containers - confirm against Chrome docs.
options.add_argument('--headless')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
wd = webdriver.Chrome(options=options)

In [None]:
#Initial url that we are using
url ='https://store.steampowered.com/search/?filter=topsellers'

# Load the page in the module-level driver `wd`
wd.get(url)

#Just double checking webdriver is currently at the correct page
wd.title

In [None]:
#Function to create webdriver and downloads page
def create_driver(url):
    '''Create a Chrome driver with the headless/no-sandbox/disable-dev-shm-usage switches, load url in it and return the driver'''
    chrome_options = webdriver.ChromeOptions()
    # Same three switches, applied in the same order
    for switch in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
        chrome_options.add_argument(switch)

    browser = webdriver.Chrome(options=chrome_options)
    browser.get(url)
    return browser

In [None]:
#To simulate scrolling through the page
# Cell-level version of the scroll loop; it works on the module-level driver `wd`.
import time

# Seconds to wait after each scroll before measuring the page again
SCROLL_PAUSE_TIME = 1

# Get scroll height
last_height = wd.execute_script("return document.body.scrollHeight")

# At most 6 scrolls; the index `i` itself is not used
for i in range(6):
    # Scroll down to bottom
    wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    # Wait to load page
    time.sleep(SCROLL_PAUSE_TIME)

    # Calculate new scroll height and compare with last scroll height
    new_height = wd.execute_script("return document.body.scrollHeight")
    # Height unchanged after the scroll: stop early
    if new_height == last_height:
        break
    last_height = new_height

In [None]:
# Show the last page height recorded by the scroll loop
last_height

In [None]:
#To simulate scrolling through the page
def scroll_page(wd, pause_time=2, max_scrolls=6):
  '''Scrolls the page loaded in the driver to the bottom repeatedly so that more results get loaded.

  wd          -- driver exposing execute_script(); it is scrolled in place
  pause_time  -- seconds to wait after each scroll before measuring the page again (default 2)
  max_scrolls -- upper bound on the number of scrolls (default 6)

  Stops early as soon as a scroll leaves document.body.scrollHeight unchanged.
  Returns None.
  '''
  # Get scroll height
  last_height = wd.execute_script("return document.body.scrollHeight")

  for _ in range(max_scrolls):
    # Scroll down to bottom
    wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")

    # Wait to load page
    time.sleep(pause_time)

    # Calculate new scroll height and compare with last scroll height
    new_height = wd.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
      # Nothing new was appended by this scroll
      break
    last_height = new_height
  return

## Getting all game titles from Steam Top Sellers

In [None]:
#Selects game titles
def get_games(wd):
    '''Takes the driver as an input and returns the list of <a> elements found inside the element with id "search_resultsRows"'''
    # Look up the results container first, then collect every link inside it
    return wd.find_element(By.ID, 'search_resultsRows').find_elements(By.TAG_NAME, 'a')

# Assuming you have initialized the WebDriver as wd

# Call the function to get the game elements
# `games` is the list of link elements returned by get_games for the page loaded in wd
games = get_games(wd)

In [None]:
#Creating an empty list
games_list = []

#looping through all the games
for game in games:
  #storing the extracted information inside a dictionary and adding it to the list
  games_list.append({
      'title': game.find_element(By.CLASS_NAME, 'title').text,
      'url': game.get_attribute('href'),
  })

In [None]:
# Number of games in steam's top sellers
# (counts the entries collected into games_list from the loaded page)
len(games_list)

In [None]:
#Parses web elements and places into games_list with game titles and url
def parse_data(games):
  '''Takes the web elements "games" as input and returns a list of dictionaries with the title and url of each game.

  Each element must support find_element(By.CLASS_NAME, 'title') and
  get_attribute('href'). An empty input gives an empty list.
  '''
  # find_element(By.CLASS_NAME, ...) is the locator form used by the rest of this
  # file; the old find_element_by_class_name helper no longer exists in Selenium 4.
  return [
      {
          'title': game.find_element(By.CLASS_NAME, 'title').text,
          'url': game.get_attribute('href'),
      }
      for game in games
  ]

In [None]:
#Converts the created list into a dataframe
# One row per dictionary in games_list, with 'title' and 'url' columns
gamesdf = pd.DataFrame(games_list)
print(gamesdf.head())

In [None]:
def clean_data(games_list, excluded_titles=None):
    '''Takes the list of games as an input, removes unwanted rows and returns the result as a dataframe.

    games_list      -- anything pd.DataFrame() accepts (list of dicts or a dataframe)
                       holding a 'title' column
    excluded_titles -- titles to remove in addition to empty ones; defaults to the
                       non-game store entries listed below

    The surviving rows keep their original index labels. An empty input gives an
    empty dataframe.
    '''
    if excluded_titles is None:
        excluded_titles = ['EA Play', 'Valve Index VR Kit', 'Steam Deck', 'Valve Index® Base Station']

    gamesdf = pd.DataFrame(games_list)
    if gamesdf.empty:
        # Nothing to filter (and possibly no 'title' column at all)
        return gamesdf.copy()

    titles = gamesdf['title']
    # Boolean mask instead of a label-indexed loop, so any index works
    unwanted = (titles == '') | titles.isin(list(excluded_titles))
    return gamesdf.loc[~unwanted].copy()

# Build the cleaned dataframe from the scraped list and show it
new_gamesdf = clean_data(games_list)
print(new_gamesdf)

In [None]:
# Dataframe that contains game titles and url links
# Here clean_data receives the dataframe gamesdf instead of the list; it passes its
# argument to pd.DataFrame() itself.
new_gamesdf = clean_data(gamesdf)
print(new_gamesdf)

## Defining functions that let the webdriver visit each individual game URL
### The script then extracts the relevant information from each page

In [None]:
#Skipping age restriction pages
def check_page(wd):
    '''Takes the driver as an input with a game url loaded, gets past the age gate if there is one and returns the driver.

    If the page contains an element with class 'glance_ctn_responsive_left' the
    driver is returned straight away. Otherwise the function tries to pick the
    year 1900 in the 'ageYear' drop-down, click 'view_product_page_btn' and wait
    4 seconds. Any Selenium error on the way is ignored (best effort); the same
    driver is always returned.
    '''
    try:
        # Already on a page with the info panel: nothing to do
        wd.find_element(By.CLASS_NAME, 'glance_ctn_responsive_left')
        return wd
    except WebDriverException:
        pass  # No info panel - try the age gate instead

    try:
        year_tag = wd.find_element(By.CLASS_NAME, 'agegate_birthday_selector')
        year = year_tag.find_element(By.ID, 'ageYear')
        Select(year).select_by_value('1900')
        wd.find_element(By.XPATH, '//*[@id="view_product_page_btn"]').click()
        # Give the page time to load after the click
        time.sleep(4)
    except WebDriverException:
        # Neither info panel nor a usable age gate: leave the page as it is.
        # Only Selenium errors are swallowed, so Ctrl+C still stops the run.
        pass

    return wd

In [None]:
from selenium.common.exceptions import NoSuchElementException  # Import the exception

def get_price(wd):
    '''Takes the driver and returns the original or default price of the game.

    Returns the text of the first '.game_purchase_price' element when it reads
    'free' or 'free to play' (any letter case). Otherwise returns the first price
    found in a '.game_area_purchase_game_wrapper' ('game_purchase_price' before
    'discount_original_price') with leading/trailing 'S' and '$' characters
    removed, or 'not available' when no wrapper has one.
    '''
    # Step 1: the first price element on the page decides whether the game is free
    try:
        first_price = wd.find_element(By.CSS_SELECTOR, '.game_purchase_price').text.strip()
        if first_price.lower() in ('free to play', 'free'):
            return first_price
    except NoSuchElementException:
        pass  # No such element on the page; fall through to the wrappers

    # Step 2: walk the purchase wrappers; within each, the plain price wins over
    # the pre-discount price
    for wrapper in wd.find_elements(By.CSS_SELECTOR, '.game_area_purchase_game_wrapper'):
        for price_class in ('game_purchase_price', 'discount_original_price'):
            try:
                return wrapper.find_element(By.CLASS_NAME, price_class).text.strip('S$')
            except NoSuchElementException:
                continue  # Try the next class, then the next wrapper

    return 'not available'


In [None]:
def get_discounted(wd, is_free_to_play, price):
    '''Takes the driver, a flag indicating if the game is free to play, and the price; returns the discounted price of the game.

    wd              -- driver with the game page loaded
    is_free_to_play -- when true, price is returned unchanged
    price           -- price string from get_price; 'free' / 'not available'
                       (any letter case) are also returned unchanged

    Otherwise the first '.game_area_purchase_game_wrapper' is inspected: if it
    contains no 'gameDlcBlocks' element and has a 'discount_final_price', the
    number in that text is returned (e.g. '12.00' or '1,299.00'). Every other case
    returns 'not available'.
    '''
    if is_free_to_play or price.lower() in ('free', 'not available'):
        return price

    try:
        # Find the first instance of 'game_area_purchase_game_wrapper'
        game_wrapper = wd.find_element(By.CSS_SELECTOR, '.game_area_purchase_game_wrapper')
    except NoSuchElementException:
        return 'not available'

    # A wrapper holding downloadable content (DLC) blocks is skipped
    if game_wrapper.find_elements(By.CLASS_NAME, 'gameDlcBlocks'):
        return 'not available'

    try:
        price_tag = game_wrapper.find_element(By.CLASS_NAME, 'discount_final_price')
    except NoSuchElementException:
        return 'not available'

    # Digits may be grouped with commas ("1,299.00"); a decimal part is required
    disc_price = re.search(r'\d[\d,]*\.\d+', price_tag.text)
    return disc_price.group() if disc_price else 'not available'


In [None]:
#Finds release date of game IF available
def get_release(wd):
    '''Takes the driver and returns the release date of the game.

    Reads the 'release_date' element inside 'glance_ctn_responsive_left', strips
    surrounding whitespace and removes the 'RELEASE DATE:\\n' label. Returns
    'not available' when Selenium cannot provide the element or its text.
    '''
    try:
        info_tag = wd.find_element(By.CLASS_NAME, 'glance_ctn_responsive_left')
        release_text = info_tag.find_element(By.CLASS_NAME, 'release_date').text
    except WebDriverException:
        # Only Selenium errors mean "no date"; Ctrl+C and the like still propagate
        return 'not available'

    return release_text.strip().replace('RELEASE DATE:\n','')

In [None]:
#Finds reviews % of game IF available
def get_reviews(wd):
    '''Takes the driver and returns the reviews of the game.

    Requires the 'glance_ctn_responsive_left' panel. The review text is taken from
    the second <div> under the element with id 'userReviews', falling back to the
    'user_reviews' element of the panel; the 'ALL REVIEWS:\\n' label is removed.
    Returns 'not available' when Selenium cannot provide any of these.
    '''
    try:
        info_tag = wd.find_element(By.CLASS_NAME, 'glance_ctn_responsive_left')
        try:
            # The XPath starts with '//', so it is matched against the whole
            # document rather than only inside info_tag
            reviews = info_tag.find_element(By.XPATH, '//*[@id="userReviews"]/div[2]').text
        except WebDriverException:
            reviews = info_tag.find_element(By.CLASS_NAME, 'user_reviews').text
    except WebDriverException:
        # Only Selenium errors mean "no reviews"; Ctrl+C and the like still propagate
        return 'not available'

    return reviews.replace('ALL REVIEWS:\n', '')

In [None]:
#Retrieves genre from page
def get_genre_from_html(html):
    '''Extract genres from the provided HTML snippet.

    Looks for a <b> tag whose string is exactly 'Genre:', takes the next <span>
    after it and joins the text of the <a> tags inside that span with ', '.
    Returns 'not available' when the <b> tag or the <span> is missing.
    '''
    soup = BeautifulSoup(html, 'html.parser')
    # `string=` is the current name of the argument formerly spelled `text=`
    genre_b_tag = soup.find('b', string='Genre:')
    if not genre_b_tag:
        return 'not available'

    span_tag = genre_b_tag.find_next('span')
    if not span_tag:
        return 'not available'

    return ', '.join(a.text for a in span_tag.find_all('a'))

In [None]:
#Using url found in above dataframe
# The 'url' column of the cleaned dataframe; get_all_games iterates over it
games_url = new_gamesdf['url']

In [None]:
#Webdriver will dive into the url link
def get_page(url):
    '''Load url in the module-level driver wd and return that same driver'''
    # No new browser is started here; the shared driver is simply navigated
    wd.get(url)
    return wd

In [None]:
#Webdriver goes into links to find info based on above user defined functions
def get_game_info(url):
    '''Takes the url and returns a dictionary with the price, discounted price, release date, reviews and genre of the game'''
    # Open the page, then let check_page deal with a possible age gate
    page_driver = check_page(get_page(url))

    base_price = get_price(page_driver)

    # The dictionary values are evaluated top to bottom, so the page is queried
    # in the order: discounted price, release date, reviews, genre
    return {
        'price': base_price,
        # get_discounted is told whether the price reads 'free to play'
        'discounted_price': get_discounted(page_driver, base_price.lower() == 'free to play', base_price),
        'release_date': get_release(page_driver),
        'reviews': get_reviews(page_driver),
        # The genre is parsed from the raw HTML of the page
        'genre': get_genre_from_html(page_driver.page_source),
    }


In [None]:
# Assume you have a function get_game_info(url) that fetches game information from a URL

def get_all_games(games_url):
    '''Calls get_game_info on every url in games_url (showing a tqdm progress bar) and returns the collected dictionaries as a dataframe'''
    # One dictionary per url, in the same order as games_url
    scraped_rows = [
        get_game_info(link)
        for link in tqdm(games_url, desc="Scraping games")
    ]
    return pd.DataFrame(scraped_rows)

## Main scraping script, generating dataframe with all relevant information

In [None]:
# THIS IS THE MAIN BLOCK THAT RETRIEVES ALL INFORMATION.
# Visits every url in games_url and collects one row of details per game
game_info_df = get_all_games(games_url)
game_info_df

In [None]:
# Splitting the DataFrame into three based on columns
# Each piece keeps the row index of game_info_df
game_infosplit_df = game_info_df[['release_date', 'reviews']]
game_price_df = game_info_df[['price', 'discounted_price']]
game_genre_df = game_info_df[['genre']]

# Check split works
print(game_infosplit_df)
print(game_price_df)
print(game_genre_df)

In [None]:
#new_gamesdf still carries the row labels it had before clean_data dropped rows,
#while the scraped dataframes are numbered 0..n-1 in scraping order. Concatenating
#on those labels would pair titles with the wrong rows (and drop some), so the
#titles are renumbered first to line up row by row.
titles_df = new_gamesdf.reset_index(drop=True)

#Concat title, url and release/review info together
games_info_result = pd.concat([titles_df, game_infosplit_df], axis=1, join='inner')
games_info_result.reset_index(drop=True, inplace=True)
print(games_info_result)

#Concat title and price info together
games_price_result = pd.concat([titles_df['title'], game_price_df], axis=1, join='inner')
games_price_result.reset_index(drop=True, inplace=True)
print(games_price_result)

#Concat title and genre info together
games_genre_result = pd.concat([titles_df['title'], game_genre_df], axis=1, join='inner')
games_genre_result.reset_index(drop=True, inplace=True)
print(games_genre_result)

## Converting dataframe into .csv files for further usage
### Rename files to preference

In [None]:
#Creating csv file
# One file per result dataframe, written without the index column
# (games_info.csv, games_price.csv, games_genre.csv)
games_info_result.to_csv('games_info' + '.csv', index=False)
games_price_result.to_csv('games_price' + '.csv', index=False)
games_genre_result.to_csv('games_genre' + '.csv', index=False)