# Scraping Steam to get game descriptions and tags

In [1]:
from bs4 import BeautifulSoup
import requests
import numpy as np
import time

import xml.etree.ElementTree

In [2]:
# Download the store page of one example app (id 396560, "Front Defense").
# A timeout is passed because requests.get() otherwise waits indefinitely
# when the server stops responding.
page = requests.get("http://store.steampowered.com/app/396560", timeout=30) #/Front_Defense/
# Parse the decoded response body with Python's built-in HTML parser.
soup = BeautifulSoup(page.text, 'html.parser') #page.content

## Getting the game description

In [3]:
# CSS attribute selector matching every element whose name attribute is "Description".
description_selector = '[name="Description"]'
description = soup.select(description_selector)

# Show the first match as raw markup.
print(description[0])

<meta content="“Front Defense” immerses you into the WW2 battlefield, When you shoot, throw grenades, Vive will track your location accurately. Destroy enemy with rocket tube and guns. Fully equipped troops with advanced weapons are waiting for you in the game. Make the best of your weapons to gain the victory." name="Description"/>


In [4]:
# Inspect the Python type of the first selected element.
type(description[0])

bs4.element.Tag

In [5]:
# Read the description from the tag's `content` attribute instead of slicing
# str(tag) at fixed offsets: those offsets are only right while the serialized
# tag starts with `<meta content="` and ends with `" name="Description"/>`.
text = description[0]['content']
text

'“Front Defense” immerses you into the WW2 battlefield, When you shoot, throw grenades, Vive will track your location accurately. Destroy enemy with rocket tube and guns. Fully equipped troops with advanced weapons are waiting for you in the game. Make the best of your weapons to gain the victory.'

## Getting the game genres

In [6]:
# Every element inside a <div> in the body whose class attribute is exactly "app_tag".
genres = soup.select('body div [class="app_tag"]')

# Print each tag label on its own line, stripped of surrounding whitespace.
for tag_element in genres:
    label = tag_element.get_text().strip()
    print(label)

Action
Violent
VR
World War II


## Getting a list of all relevant game ids

In [7]:
# Load the locally stored XML document into an ElementTree.
e = xml.etree.ElementTree.parse('data/api_steampowered.xml')

# Print every 'applist' element that findall() returns for this tree.
applist_nodes = e.findall('applist')
for node in applist_nodes:
    print(node)

In [8]:
# Keep a handle on the document's root element for the indexing done below.
root = e.getroot()

In [9]:
# Collect the text of the first child of every entry found under root[0].
# Iterating the elements directly replaces the index loop and drops the
# per-entry `name` lookup, whose value was never used.
# NOTE(review): presumably the first child of each entry is the app id -- confirm against the XML.
game_ids = [app[0].text for app in root[0]]

## Looping over all games

In [10]:
# Report how many ids were read from the XML file.
print('The number of games on steam is: ', len(game_ids))

The number of games on steam is:  44248


In [None]:
# Scrape the description and tags of every app id from START_INDEX onward.
# NOTE(review): 26143 looks like a manual resume offset from an interrupted run -- use 0 for a full scrape.
START_INDEX = 26143
# Seconds to wait for the server; without a timeout requests.get() can block forever.
REQUEST_TIMEOUT = 30

# Create the accumulators only when they do not exist yet: a fresh kernel no
# longer fails with NameError, and a resumed run keeps what it already collected.
if 'game_descriptions' not in globals():
    game_descriptions = []
    game_genres = []
    complete_genre_list = []
# Ids that actually yielded a description. Kept separate from `game_ids` so the
# list being scraped is not extended with duplicates of its own entries.
if 'scraped_game_ids' not in globals():
    scraped_game_ids = []

for game_id in game_ids[START_INDEX:]:
    # Sleep 1 or 2 seconds between requests (randint's upper bound is exclusive).
    sleep_time = np.random.randint(1, 3)
    time.sleep(sleep_time)

    # Request the page without following redirects; a network failure skips
    # this id instead of aborting the whole run.
    try:
        page = requests.get("http://store.steampowered.com/app/" + str(game_id),
                            allow_redirects=False, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as err:
        print('Skipping game_id', game_id, 'after request error:', err)
        continue
    soup = BeautifulSoup(page.text, 'html.parser')

    # Pages without a Description meta tag contribute nothing.
    description = soup.select('meta[name="Description"]')
    if len(description) == 0:
        continue

    # Read the attribute value directly rather than slicing the serialized
    # tag at fixed offsets, which depends on the attribute order in the markup.
    game_descriptions.append(description[0].get('content', ''))

    # Collect this game's tags and register any tag not seen before.
    # complete_genre_list stays a list because its order defines the tag indices used later.
    genres = [item.get_text().strip()
              for item in soup.select('body div [class="app_tag"]')]
    for genre in genres:
        if genre not in complete_genre_list:
            complete_genre_list.append(genre)

    scraped_game_ids.append(game_id)
    game_genres.append(genres)

In [None]:
import csv

# The csv module expects files opened with newline='' so that it controls line
# endings itself; the encoding is fixed to UTF-8 because the scraped text
# contains non-ASCII characters (e.g. curly quotes) that a platform default
# encoding may not be able to represent.

# All descriptions are written as the fields of one single CSV row.
with open('data/steam_game_descriptions.txt', 'w', newline='', encoding='utf-8') as fp:
    csv.writer(fp).writerow(game_descriptions)

# One row per game, one field per tag.
with open('data/steam_game_tags.txt', 'w', newline='', encoding='utf-8') as fp:
    csv.writer(fp).writerows(game_genres)

# The distinct tags as one row, in the order they were first seen.
with open('data/steam_complete_tag_list.txt', 'w', newline='', encoding='utf-8') as fp:
    csv.writer(fp).writerow(complete_genre_list)

In [None]:
# Number of descriptions collected by the scraping loop.
len(game_descriptions)

In [None]:
# Number of distinct tags seen across all scraped games.
len(complete_genre_list)

## Doing one-hot encoding of the genres

In [None]:
# Map every tag to its position in complete_genre_list; that position is the
# tag's column in the one-hot matrix.
tag_to_idx_dict = {
    tag_name: position
    for position, tag_name in enumerate(complete_genre_list)
}

In [None]:
# One row per collected description, one column per distinct tag, all zeros to start.
n_rows = len(game_descriptions)
n_cols = len(complete_genre_list)
tag_matrix = np.zeros((n_rows, n_cols))

# Set a 1 in each game's row at the column of every tag that game carries.
for row, tags_of_game in enumerate(game_genres):
    for tag_name in tags_of_game:
        tag_matrix[row, tag_to_idx_dict[tag_name]] = 1

In [None]:
# Persist the one-hot matrix; np.save appends the .npy extension to the given name.
# NOTE(review): "metrix" looks like a typo for "matrix" -- left as is because the
# path is runtime data that other code may rely on.
np.save('data/steam_tag_metrix_2', tag_matrix)