# Layout:
1. Import Packages/Libraries
2. Pre-Scrape
    + Read csv from Initial Scrape as DF
3. Scrape
    + Create List of Links to Parse
    + Build overall DataFrame
    + Iterate through Links
    + Use BeautifulSoup to build row dictionaries of form variable:value
    + Note: Need Selenium for 18+ content games
    + Attach dictionary to dataframe
    + Pickle dataframe
4. Feature Engineering
    + Set Dummy Variables (T/F Booleans)
    + Double Check
    + Drop Unneeded Variables
    + Double Check

# Import Packages/Libraries

In [1]:
import pandas as pd
import numpy as np
import dateutil.parser
from datetime import date
import re

from bs4 import BeautifulSoup
import requests
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import os

import seaborn as sns
import matplotlib.pyplot as plt

import statsmodels.api as sm
import statsmodels.formula.api as smf
import patsy
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.cross_validation import cross_val_score
from sklearn.cross_validation import train_test_split, KFold
from sklearn.metrics import mean_squared_error
from sklearn import preprocessing


import pickle
import time



# Pre-Scrape

## Read in CSV as DF

In [2]:
# Load the results of the initial scrape and discard the 'Unnamed: 0' column
# that comes along with the CSV, then preview the first rows.
df = pd.read_csv('steam3.csv').drop('Unnamed: 0', axis=1)
df.head()

Unnamed: 0,title,link,system,systems,reviews,release_date,discount,price
0,Warframe,https://store.steampowered.com/app/230410/Warf...,1,['win'],192592,1940,0,0.0
1,Clicker Heroes 2,https://store.steampowered.com/app/629910/Clic...,2,"['win', 'mac']",183,1,0,29.99
2,Tom Clancy's Rainbow Six® Siege,https://store.steampowered.com/app/359550/Tom_...,1,['win'],170518,959,0,14.99
3,Counter-Strike: Global Offensive,https://store.steampowered.com/app/730/Counter...,3,"['win', 'mac', 'linux']",2628652,2156,0,14.99
4,Grand Theft Auto V,https://store.steampowered.com/app/271590/Gran...,1,['win'],343780,1191,34,29.99


In [3]:
# Index by title FIRST, then keep only the first row seen for each title.
# Checking df.index.duplicated() before set_index only inspects the default
# row-number index produced by read_csv, which never repeats, so no duplicate
# was ever removed.
no_dup_df = (
    df.set_index('title')
    .loc[lambda frame: ~frame.index.duplicated(keep='first')]
)

# Scrape

In [4]:
SCRAPE_LIMIT = 15      # how many store links to fetch in this run
REQUEST_TIMEOUT = 30   # seconds to wait for a store page before giving up

cols2 = ['title', 'developer', 'user_tags', 'languages', 'genres', 'specs', 'lang_supports']
link_games = list(no_dup_df.link)


def _clean(text):
    """Remove newlines from ``text`` and strip leading/trailing tab characters."""
    return text.replace("\n", "").strip("\t")


def parse_game_page(soup):
    """Build one row dictionary (variable: value) from a parsed game page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed HTML of a single game page.

    Returns
    -------
    dict
        One entry per name in ``cols2``:
        ``title`` (str), ``developer`` (list of link texts),
        ``user_tags`` (list of str), ``languages`` (list of tuples of 0/1
        flags, one tuple per language row), ``genres`` (str),
        ``specs`` (list of str) and ``lang_supports``
        (dict mapping language name -> list of 0/1 flags).

    Raises
    ------
    AttributeError
        If the title element or a language-name cell is missing.
    IndexError
        If the details block or its second, colon-separated entry is missing.
    """
    row = {}
    row['title'] = soup.find('div', class_='apphub_AppName').text

    # Start from an empty list so a page without a developers block cannot
    # inherit the previous game's developers.
    developer = []
    for developers in soup.find_all('div', class_="summary column", id="developers_list"):
        developer = [anchor.text for anchor in developers.find_all('a')]
    row['developer'] = developer

    row['user_tags'] = [_clean(tag.text) for tag in soup.find_all('a', class_="app_tag")]

    # Both containers are created up front so they are always defined and
    # always describe the same set of language rows.
    lang_supports = {}
    all_lang = []
    for table in soup.find_all('table', class_="game_language_options"):
        for table_row in table.find_all('tr', style=True, class_=True):
            language = _clean(table_row.find('td', class_="ellipsis").text)
            # A cell counts as "supported" (1) when any text is left after cleaning.
            flags = [1 if _clean(cell.text) else 0
                     for cell in table_row.find_all('td', class_="checkcol")]
            lang_supports[language] = flags
            all_lang.append(tuple(flags))
    row['lang_supports'] = lang_supports
    row['languages'] = all_lang

    # Second non-empty tab-separated chunk of the first details block,
    # keeping the text after its first colon
    # (presumably the "Genre: ..." entry - confirm against a live page).
    details_text = soup.find_all('div', class_="details_block")[0].text
    details_parts = list(filter(None, details_text.replace("\n", "").split("\t")))
    row['genres'] = details_parts[1].split(":")[1]

    specs = []
    for spec_block in soup.find_all('div', class_="game_area_details_specs"):
        specs.extend(_clean(anchor.text) for anchor in spec_block.find_all('a'))
    row['specs'] = list(filter(None, specs))
    return row


# Collect row dictionaries in a list and build the frame once at the end;
# growing a DataFrame row by row is quadratic and DataFrame.append no longer
# exists in current pandas.
records = []
for link in link_games[0:SCRAPE_LIMIT]:
    try:
        response = requests.get(link, timeout=REQUEST_TIMEOUT)
        soup = BeautifulSoup(response.text, 'html5lib')
        records.append(parse_game_page(soup))
    except Exception as error:
        # Best effort: report the link together with the reason and move on.
        # (The Layout notes say 18+ content games need Selenium, so some
        # failures are expected here.) Catching Exception rather than using a
        # bare except keeps Ctrl-C / kernel interrupt working.
        print(link, repr(error))

df_gamepg = (
    pd.DataFrame(records, columns=cols2)
    .set_index('title')
    .rename(columns=lambda x: x.strip())  # result is now kept, not discarded
)
pd.options.display.max_rows = 4000
df_gamepg

https://store.steampowered.com/app/230410/Warframe/?snr=1_7_7_230_150_1
https://store.steampowered.com/app/359550/Tom_Clancys_Rainbow_Six_Siege/?snr=1_7_7_230_150_1
https://store.steampowered.com/app/271590/Grand_Theft_Auto_V/?snr=1_7_7_230_150_1
https://store.steampowered.com/app/570/Dota_2/?snr=1_7_7_230_150_1
https://store.steampowered.com/app/648350/Jurassic_World_Evolution/?snr=1_7_7_230_150_1
https://store.steampowered.com/app/578080/PLAYERUNKNOWNS_BATTLEGROUNDS/?snr=1_7_7_230_150_1


Unnamed: 0_level_0,developer,user_tags,languages,genres,specs,lang_supports
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Clicker Heroes 2,[Playsaurus],"[Early Access, Clicker, Casual, Indie, RPG, Ad...","[(1, 1, 1)]","Action, Adventure, Casual, Indie, RPG, Simula...","[Single-player, Stats]","{'English': [1, 1, 1]}"
Counter-Strike: Global Offensive,[Valve],"[FPS, Multiplayer, Shooter, Action, Team-Based...","[(1, 1, 0), (1, 0, 0), (1, 0, 0), (1, 0, 0), (...",Action,"[Multi-player, Steam Achievements, Full contro...","{'English': [1, 1, 0], 'Czech': [1, 0, 0], 'Da..."
MOTHERGUNSHIP,"[Grip Digital, Terrible Posture Games]","[Action, Indie, Crafting, Bullet Hell, FPS, Fa...","[(1, 1, 1), (1, 0, 1), (1, 0, 1), (1, 0, 1), (...","Action, Indie","[Single-player, Steam Achievements, Full contr...","{'English': [1, 1, 1], 'French': [1, 0, 1], 'I..."
Rocket League®,"[Psyonix, Inc.]","[Multiplayer, Racing, Soccer, Sports, Competit...","[(1, 0, 0), (1, 0, 0), (1, 0, 0), (1, 0, 0), (...","Action, Indie, Racing, Sports","[Single-player, Multi-player, Co-op, Shared/Sp...","{'English': [1, 0, 0], 'French': [1, 0, 0], 'I..."
Islands of Nyne: Battle Royale,[Define Human Studios],"[Early Access, Action, FPS, Indie, Multiplayer...","[(1, 1, 0)]","Action, Indie, Early Access","[Online Multi-Player, Partial Controller Suppo...","{'English': [1, 1, 0]}"
Team Fortress 2,[Valve],"[Free to Play, Multiplayer, FPS, Action, Shoot...","[(1, 1, 0), (1, 0, 0), (1, 0, 0), (1, 0, 0), (...","Action, Free to Play","[Multi-player, Cross-Platform Multiplayer, Ste...","{'English': [1, 1, 0], 'Danish': [1, 0, 0], 'D..."
Spintires: MudRunner,[Saber Interactive],"[Offroad, Driving, Simulation, Multiplayer, Ph...","[(1, 0, 0), (1, 0, 0), (1, 0, 0), (1, 0, 0), (...",Simulation,"[Single-player, Online Co-op, Steam Achievemen...","{'English': [1, 0, 0], 'French': [1, 0, 0], 'I..."
Divinity: Original Sin 2,[Larian Studios],"[RPG, Turn-Based, Co-op, Fantasy, Story Rich, ...","[(1, 1, 1), (1, 0, 1), (1, 0, 1), (1, 0, 1), (...","Adventure, RPG, Strategy","[Single-player, Multi-player, Online Multi-Pla...","{'English': [1, 1, 1], 'French': [1, 0, 1], 'G..."
Moonlighter,[Digital Sun],"[Adventure, Pixel Graphics, Action, Rogue-like...","[(1, 0, 1), (1, 0, 1), (1, 0, 1), (1, 0, 1), (...","Action, Adventure, Indie","[Single-player, Steam Achievements, Full contr...","{'English': [1, 0, 1], 'French': [1, 0, 1], 'G..."


# Feature Engineering

## Single and Multi Player

In [5]:
def spec_flags(specs_column, label):
    """Return a list of 0/1 flags, one per game, marking whether ``label``
    appears in that game's specs list.

    Parameters
    ----------
    specs_column : iterable of list of str
        One list of spec labels per game (the 'specs' column).
    label : str
        Exact spec label to look for.
    """
    return [1 if label in game_specs else 0 for game_specs in specs_column]


specs_column = df_gamepg['specs']
single = spec_flags(specs_column, 'Single-player')
multi = spec_flags(specs_column, 'Multi-player')
controller_support = spec_flags(specs_column, 'Full controller support')
# Previously tested 'Single-player' (copy-paste), so `cards` just duplicated `single`.
# NOTE(review): 'Steam Trading Cards' is assumed to be the store's label - confirm
# against the scraped specs.
cards = spec_flags(specs_column, 'Steam Trading Cards')
# Previously `'Single-player' in element or 'Partial Controller Support'`:
# a non-empty string literal is always truthy, so every game was flagged 1.
# NOTE(review): 'In-App Purchases' is assumed to be the store's label - confirm.
in_app_purchases = spec_flags(specs_column, 'In-App Purchases')
cloud = spec_flags(specs_column, 'Steam Cloud')
workshop = spec_flags(specs_column, 'Steam Workshop')
captions = spec_flags(specs_column, 'Captions available')

## Number of Languages

Count how many languages each game supports, and in which formats (interface, audio, subtitles).

In [6]:
def count_language_flags(language_lists, position):
    """Sum, per game, the 0/1 flag found at ``position`` of every language tuple.

    A game with an empty language list yields 0 instead of raising
    IndexError (transposing an empty list with zip leaves nothing to index).

    Parameters
    ----------
    language_lists : iterable of list of tuple
        One list of per-language flag tuples per game (the 'languages' column).
    position : int
        Index inside each tuple to add up.
    """
    return [sum(flags[position] for flags in lang_list) for lang_list in language_lists]


num_lang = [len(lang_list) for lang_list in df_gamepg['languages']]
# Tuple positions 0, 1 and 2 feed the interface, audio and subtitle counts.
lang_interface = count_language_flags(df_gamepg['languages'], 0)
lang_audio = count_language_flags(df_gamepg['languages'], 1)
lang_subtitles = count_language_flags(df_gamepg['languages'], 2)

## Number of Genres

Count each game's genres and flag whether it is listed as Indie or Action.

In [7]:
num_genres = []
indie = []
action = []
for genre_text in df_gamepg['genres']:
    # Strip the whitespace around every genre so membership no longer depends
    # on the leading space left by the comma split (" Action", " Indie").
    genre_names = [name.strip() for name in genre_text.split(",")]
    num_genres.append(len(genre_names))
    indie.append(1 if 'Indie' in genre_names else 0)
    action.append(1 if 'Action' in genre_names else 0)

## Putting Variables Into DataFrame

In [8]:
# Attach every engineered list to the page-level frame as a new column.
# Dict order fixes the order in which the columns are appended.
engineered_columns = {
    'single': single,
    'multi': multi,
    'controller_support': controller_support,
    'indie': indie,
    'action': action,
    'num_lang': num_lang,
    'interface_lang': lang_interface,
    'audio_lang': lang_audio,
    'subtitles_lang': lang_subtitles,
}
for column_name, column_values in engineered_columns.items():
    df_gamepg[column_name] = column_values

## Double Checking

In [9]:
# Double check: display the page-level frame with the engineered columns attached.
df_gamepg

Unnamed: 0_level_0,developer,user_tags,languages,genres,specs,lang_supports,single,multi,controller_support,indie,action,num_lang,interface_lang,audio_lang,subtitles_lang
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
Clicker Heroes 2,[Playsaurus],"[Early Access, Clicker, Casual, Indie, RPG, Ad...","[(1, 1, 1)]","Action, Adventure, Casual, Indie, RPG, Simula...","[Single-player, Stats]","{'English': [1, 1, 1]}",1,0,0,1,1,1,1,1,1
Counter-Strike: Global Offensive,[Valve],"[FPS, Multiplayer, Shooter, Action, Team-Based...","[(1, 1, 0), (1, 0, 0), (1, 0, 0), (1, 0, 0), (...",Action,"[Multi-player, Steam Achievements, Full contro...","{'English': [1, 1, 0], 'Czech': [1, 0, 0], 'Da...",0,1,1,0,1,25,25,1,0
MOTHERGUNSHIP,"[Grip Digital, Terrible Posture Games]","[Action, Indie, Crafting, Bullet Hell, FPS, Fa...","[(1, 1, 1), (1, 0, 1), (1, 0, 1), (1, 0, 1), (...","Action, Indie","[Single-player, Steam Achievements, Full contr...","{'English': [1, 1, 1], 'French': [1, 0, 1], 'I...",1,0,1,1,1,7,7,1,7
Rocket League®,"[Psyonix, Inc.]","[Multiplayer, Racing, Soccer, Sports, Competit...","[(1, 0, 0), (1, 0, 0), (1, 0, 0), (1, 0, 0), (...","Action, Indie, Racing, Sports","[Single-player, Multi-player, Co-op, Shared/Sp...","{'English': [1, 0, 0], 'French': [1, 0, 0], 'I...",1,1,1,1,1,12,12,0,0
Islands of Nyne: Battle Royale,[Define Human Studios],"[Early Access, Action, FPS, Indie, Multiplayer...","[(1, 1, 0)]","Action, Indie, Early Access","[Online Multi-Player, Partial Controller Suppo...","{'English': [1, 1, 0]}",0,0,0,1,1,1,1,1,0
Team Fortress 2,[Valve],"[Free to Play, Multiplayer, FPS, Action, Shoot...","[(1, 1, 0), (1, 0, 0), (1, 0, 0), (1, 0, 0), (...","Action, Free to Play","[Multi-player, Cross-Platform Multiplayer, Ste...","{'English': [1, 1, 0], 'Danish': [1, 0, 0], 'D...",0,1,0,0,1,26,26,1,0
Spintires: MudRunner,[Saber Interactive],"[Offroad, Driving, Simulation, Multiplayer, Ph...","[(1, 0, 0), (1, 0, 0), (1, 0, 0), (1, 0, 0), (...",Simulation,"[Single-player, Online Co-op, Steam Achievemen...","{'English': [1, 0, 0], 'French': [1, 0, 0], 'I...",1,0,1,0,0,9,9,0,0
Divinity: Original Sin 2,[Larian Studios],"[RPG, Turn-Based, Co-op, Fantasy, Story Rich, ...","[(1, 1, 1), (1, 0, 1), (1, 0, 1), (1, 0, 1), (...","Adventure, RPG, Strategy","[Single-player, Multi-player, Online Multi-Pla...","{'English': [1, 1, 1], 'French': [1, 0, 1], 'G...",1,1,1,0,0,7,7,1,7
Moonlighter,[Digital Sun],"[Adventure, Pixel Graphics, Action, Rogue-like...","[(1, 0, 1), (1, 0, 1), (1, 0, 1), (1, 0, 1), (...","Action, Adventure, Indie","[Single-player, Steam Achievements, Full contr...","{'English': [1, 0, 1], 'French': [1, 0, 1], 'G...",1,0,1,1,1,10,10,0,10


In [10]:
# Keep only the engineered numeric columns; the raw scraped columns
# (and num_lang) are left out of the final frame.
final_columns = [
    'single',
    'multi',
    'controller_support',
    'indie',
    'action',
    'interface_lang',
    'audio_lang',
    'subtitles_lang',
]
df_gamepg2 = df_gamepg[final_columns]

# Look at Final DF

In [11]:
# Display the final frame that holds only the engineered feature columns.
df_gamepg2

Unnamed: 0_level_0,single,multi,controller_support,indie,action,interface_lang,audio_lang,subtitles_lang
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Clicker Heroes 2,1,0,0,1,1,1,1,1
Counter-Strike: Global Offensive,0,1,1,0,1,25,1,0
MOTHERGUNSHIP,1,0,1,1,1,7,1,7
Rocket League®,1,1,1,1,1,12,0,0
Islands of Nyne: Battle Royale,0,0,0,1,1,1,1,0
Team Fortress 2,0,1,0,0,1,26,1,0
Spintires: MudRunner,1,0,1,0,0,9,0,0
Divinity: Original Sin 2,1,1,1,0,0,7,1,7
Moonlighter,1,0,1,1,1,10,0,10
