In [28]:
# set-up
import re
import urllib
import urllib.parse
import urllib.request
from collections import defaultdict

import pandas as pd
import requests
import seaborn as sns
from bs4 import BeautifulSoup
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.decomposition import IncrementalPCA
from sklearn.decomposition import KernelPCA
from sklearn.decomposition import SparsePCA
from surprise import Reader, Dataset
from surprise import SVDpp, SVD, accuracy
from surprise.model_selection import train_test_split

### Import Dataset

In [16]:
# Steam interaction log served from GitHub; the file has no header row, so the
# first four columns are selected by position and named explicitly.
user_url = (
    "https://raw.githubusercontent.com/mkivenson/"
    "Recommender-Systems/master/Video%20Game%20Recommendations/steam-200k.csv"
)
user_df = pd.read_csv(
    user_url,
    header=None,
    usecols=[0, 1, 2, 3],
    names=['user_id', 'game_title', 'action', 'hours_played'],
)
user_df.head()

Unnamed: 0,user_id,game_title,action,hours_played
0,151603712,The Elder Scrolls V Skyrim,purchase,1.0
1,151603712,The Elder Scrolls V Skyrim,play,273.0
2,151603712,Fallout 4,purchase,1.0
3,151603712,Fallout 4,play,87.0
4,151603712,Spore,purchase,1.0


### Clean-Up Dataset

In [17]:
# Spread each (user, game) pair's 'play' / 'purchase' rows into two columns;
# a pair that lacks one of the two actions gets 0 in that column.
user_df = (
    user_df
    .pivot_table(index=['user_id', 'game_title'],
                 columns='action',
                 values='hours_played')
    .reset_index()
    .fillna(0)
    .rename(index=str, columns={'play': 'hours_played', 'purchase': 'purchased'})
)
user_df.head()

action,user_id,game_title,hours_played,purchased
0,5250,Alien Swarm,4.9,1.0
1,5250,Cities Skylines,144.0,1.0
2,5250,Counter-Strike,0.0,1.0
3,5250,Counter-Strike Source,0.0,1.0
4,5250,Day of Defeat,0.0,1.0


### Create User-Item Matrix and Long Format

In [19]:
# Wide user x game matrix of the `purchased` value; pairs with no record are 0.
matrix = (
    user_df
    .pivot(index='user_id', columns='game_title', values='purchased')
    .fillna(0)
    .reset_index()
)
matrix.head()

game_title,user_id,007 Legends,0RBITALIS,1... 2... 3... KICK IT! (Drop That Beat Like an Ugly Baby),10 Second Ninja,"10,000,000",100% Orange Juice,1000 Amps,12 Labours of Hercules,12 Labours of Hercules II The Cretan Bull,...,rFactor 2,realMyst,realMyst Masterpiece Edition,resident evil 4 / biohazard 4,rymdkapsel,sZone-Online,samurai_jazz,the static speaks my name,theHunter,theHunter Primal
0,5250,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,76767,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,86540,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,103360,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,144736,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Define Top N Recommendations Functions from Surprise

In [20]:
def get_top_n(predictions, n=10):
    '''Group predictions by user and keep each user's n best-scored items.

    Args:
        predictions(iterable of 5-tuples): Entries unpack as
            (user id, item id, true rating, estimated rating, details).
        n(int): Maximum number of items kept per user. Default is 10.

    Returns:
        A defaultdict(list) keyed by user id. Each value is a list of
        (item id, estimated rating) tuples ordered from highest to lowest
        estimate and holding at most n entries.
    '''

    # Bucket every (item, estimate) pair under the user it belongs to.
    per_user = defaultdict(list)
    for user, item, _actual, estimate, _details in predictions:
        per_user[user].append((item, estimate))

    # Rank each bucket by estimate, best first, and truncate it to n entries.
    for user in per_user:
        ranked = sorted(per_user[user], key=lambda pair: pair[1], reverse=True)
        per_user[user] = ranked[:n]

    return per_user

### Apply SVD++ for Implicit Data

In [21]:
# Fixed seed so the train/test split and the SVD++ factor initialisation are
# identical on every Restart & Run All; without it RMSE and the top-N lists
# change from run to run.
SEED = 42

# NOTE(review): Reader() is left at its library default rating scale while
# `purchased` only holds small flag values after fillna(0) -- confirm the
# scale is what is intended before interpreting the RMSE.
reader = Reader()
user_result = Dataset.load_from_df(
    user_df[['user_id', 'game_title', 'purchased']],
    reader,
)

# Hold out 25% of the interactions for evaluation.
train, test = train_test_split(user_result,
                               test_size=0.25,
                               random_state=SEED)

algo_SVDpp = SVDpp(random_state=SEED)
algo_SVDpp.fit(train)
predictions = algo_SVDpp.test(test)

# Top-10 held-out titles per user, ranked by estimated rating.
get_top_n(predictions, n=10)

defaultdict(list,
            {113300324: [('Arma 2', 1.0320659339450156),
              ('Call of Duty Modern Warfare 2', 1.0287166886934642),
              ('Call of Duty Advanced Warfare - Multiplayer',
               1.0279045359808474),
              ('Left 4 Dead 2', 1.0094331482966732),
              ('Half-Life 2 Lost Coast', 1.006836941594029),
              ('Counter-Strike Global Offensive', 1.005445149059207),
              ('Day of Defeat Source', 1)],
             65229865: [('Crusader Kings II', 1.0754883221047937),
              ('Luxuria Superbia', 1.0585278437753625),
              ('Lightfish', 1.0350489320733198),
              ("King's Bounty Crossworlds", 1.0195773580396406),
              ('Leviathan Warships', 1.0168971158240245),
              ('War of the Roses Balance Beta', 1.0166201761960896),
              ('BioShock 2', 1.0114277551853583),
              ('Magicka Party Robes', 1.0113388576887368),
              ('GRID', 1.0066872744519038),
             

In [22]:
# RMSE of the SVD++ estimates on the held-out test predictions.
accuracy.rmse(predictions)


RMSE: 0.0166


0.016583598900575727

In [24]:
# Tabulate the prediction tuples to inspect the first few rows side by side.
pd.DataFrame(predictions).head()

Unnamed: 0,uid,iid,r_ui,est,details
0,113300324,Call of Duty Advanced Warfare - Multiplayer,1.0,1.027905,{'was_impossible': False}
1,65229865,Crayon Physics Deluxe,1.0,1.0,{'was_impossible': False}
2,67579257,Dead Space,1.0,1.0,{'was_impossible': False}
3,561758,Dark Messiah of Might & Magic Single Player,1.0,1.0,{'was_impossible': False}
4,62990992,METAL SLUG 3,1.0,1.000278,{'was_impossible': False}


In [25]:
# Top-10 estimated titles for one example user (raw id 99077905).
get_top_n(predictions, n=10)[99077905]

[('BioShock', 1.040083865643144),
 ('Warhammer 40,000 Dawn of War II  Retribution', 1.0353588508607967),
 ('Tomb Raider', 1.0297832516017364),
 ('Viscera Cleanup Detail Shadow Warrior', 1.0292455913785241),
 ('Kane & Lynch Dead Men', 1.0251458347596842),
 ('Serious Sam Double D XXL', 1.0208329734562827),
 ('Rise of the Argonauts', 1.0185621352047332),
 ('Nexuiz STUPID Mode', 1.0157152290531353),
 ('THE KING OF FIGHTERS 2002 UNLIMITED MATCH', 1.01410970715943),
 ('BioShock Infinite', 1.0108619486768307)]

In [32]:
# Distinct titles recorded in user_df for the same example user.
user_df.loc[user_df["user_id"] == 99077905, "game_title"].unique().tolist()

['Age of Empires II HD Edition',
 'Age of Empires II HD The Forgotten',
 'Alan Wake',
 "Alan Wake's American Nightmare",
 'Aliens vs. Predator',
 'Amnesia The Dark Descent',
 'ArcaniA',
 'Batman Arkham Asylum GOTY Edition',
 'Batman Arkham City GOTY',
 'Batman Arkham Origins',
 'BioShock',
 'BioShock 2',
 'BioShock Infinite',
 'Bloody Good Time',
 'Borderlands',
 'Borderlands 2',
 'Burnout Paradise The Ultimate Box',
 'Command and Conquer Red Alert 3',
 'Command and Conquer Red Alert 3 - Uprising',
 'Commandos 2 Men of Courage',
 'Commandos 3 Destination Berlin',
 'Commandos Behind Enemy Lines',
 'Commandos Beyond the Call of Duty',
 'Company of Heroes',
 'Company of Heroes (New Steam Version)',
 'Company of Heroes Opposing Fronts',
 'Crysis',
 'Crysis 2 Maximum Edition',
 'Crysis Warhead',
 'Crysis Wars',
 'DOOM 3 BFG Edition',
 'DRAGON BALL XENOVERSE',
 'Dark Souls Prepare to Die Edition',
 'Darksiders',
 'Darksiders II',
 'DayZ',
 'Dead Space',
 'Dead Space 2',
 "Deus Ex Human Revol

### Reduce Sparsity of Dataset

In [39]:
# Both filters below are commented out, so this cell leaves user_df unchanged
# and the two printed lengths are identical.
print("Initial length of dataframe:", len(user_df))
# Disabled: keep only users that have more than 25 rows.
#user_df = user_df.groupby('user_id').filter(lambda x: len(x) > 25)
# Disabled: keep only games that have more than 50 rows.
#user_df = user_df.groupby('game_title').filter(lambda x: len(x) > 50)
print("New length of dataframe:", len(user_df))

Initial length of dataframe: 128804
New length of dataframe: 128804


In [40]:
def get_game_details(game):
    '''Scrape the Metacritic PC "details" page for a single game title.

    Args:
        game(str): Game title. Runs of whitespace become hyphens and the
            result is lower-cased to form the URL slug.

    Returns:
        A tuple (game, summary, publisher, developer, genre, user_score,
        no_ratings) where user_score is a float and no_ratings an int.

    Raises:
        urllib.error.URLError: If the page cannot be fetched (an HTTPError,
            e.g. 404, when no page exists for the slug).
        LookupError: If an expected element or details row is missing from
            the page (includes IndexError from an empty selection).
        ValueError: If the user score text is not a number.
    '''
    user_agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7'
    headers = {'User-Agent': user_agent}

    # Make the game title into a url-friendly slug. Percent-encode it so that
    # titles containing quotes, '&', '%' or non-ASCII characters still form a
    # valid request URL.
    game_url = re.sub(r'\s+', '-', game).lower()

    # Web-scraping request; the context manager closes the HTTP response.
    URL = "https://www.metacritic.com/game/pc/" + urllib.parse.quote(game_url) + "/details"
    request = urllib.request.Request(URL, None, headers)
    with urllib.request.urlopen(request) as page:
        # Name the parser explicitly so parsing does not depend on which
        # optional parser libraries happen to be installed.
        soup = BeautifulSoup(page, 'html.parser')

    summary = soup.select("div.summary_detail.product_summary .data")[0].text

    # Publisher, with newlines and spaces stripped out.
    publisher = re.sub(r'\n| ', '', soup.select("li.summary_detail.publisher .data a")[0].text).lower()

    # Map each details-table label to its row position; the value cell at the
    # same position holds the matching data.
    labels = [th.text for th in soup.select("#main > div.product_details th")]
    cells = soup.select("#main > div.product_details td")
    positions = {label: idx for idx, label in enumerate(labels)}
    missing = [label for label in ('Genre(s):', 'Developer:') if label not in positions]
    if missing:
        # Fail with a clear message instead of an UnboundLocalError later on.
        raise LookupError("details table has no row labelled " + ", ".join(missing))

    # Genre and developer: keep word tokens only, space-joined and lower-cased.
    genre = " ".join(re.findall(r'\w+-?\w+', cells[positions['Genre(s):']].text)).lower()
    developer = " ".join(re.findall(r'\w+-?\w+', cells[positions['Developer:']].text)).lower()

    # User score.
    user_score = float(soup.select('.userscore_wrap.feature_userscore .metascore_w')[0].text)

    # Number of ratings; tolerate a thousands separator such as "1,234".
    count_text = soup.select('.userscore_wrap.feature_userscore .count a')[0].text
    no_ratings = int(re.findall(r'\d[\d,]*', count_text)[0].replace(',', ''))
    return (game, summary, publisher, developer, genre, user_score, no_ratings)

In [None]:
# Scrape details once per distinct title. Rows are collected in a list and the
# frame is built in a single call: row-by-row DataFrame.append is quadratic
# and no longer exists in pandas 2.x, and the previous call passed
# ignore_index to the DataFrame constructor, which raised TypeError on every
# game and left games_df empty.
game_records = []
failures = []

for game in user_df['game_title'].unique():
    try:
        game_records.append(get_game_details(game))
    except Exception as error:
        # Best effort: remember which title failed and why, then carry on.
        failures.append((game, error))

games_df = pd.DataFrame(game_records,
                        columns=['game_title', 'summary', 'publisher', 'developer',
                                 'genre', 'user_score', 'no_ratings'])
games_df.head()

game failed: arma-2
game failed: arma-2-operation-arrowhead
game failed: arma-2-operation-arrowhead-beta-(obsolete)
game failed: call-of-duty-black-ops---multiplayer
game failed: call-of-duty-modern-warfare-2---multiplayer
game failed: call-of-duty-modern-warfare-3---multiplayer
game failed: thief
game failed: thief---ghost
game failed: thief---opportunist
game failed: thief---predator
game failed: thief---the-bank-heist
game failed: thief-2
game failed: thief-deadly-shadows
game failed: thief-gold
game failed: age-of-empires-ii-hd-the-forgotten
game failed: alan-wake
game failed: alan-wake's-american-nightmare
game failed: arma-2
game failed: arma-2-operation-arrowhead
game failed: arma-2-operation-arrowhead-beta-(obsolete)
game failed: back-to-the-future-ep-1---it's-about-time
game failed: back-to-the-future-ep-2---get-tannen!
game failed: back-to-the-future-ep-3---citizen-brown
game failed: back-to-the-future-ep-4---double-visions
game failed: back-to-the-future-ep-5---outatime
game failed: deus-ex-game-of-the-year-edition
game failed: dirt-3-complete-edition
game failed: hector-ep-1
game failed: hector-ep-2
game failed: hector-ep-3
game failed: killer-is-dead
game failed: l.a.-noire
game failed: sam-&-max-301-the-penal-zone
game failed: sam-&-max-302-the-tomb-of-sammun-mak
game failed: sam-&-max-303-they-stole-max's-brain!
game failed: sam-&-max-304-beyond-the-alley-of-the-dolls
game failed: sam-&-max-305-the-city-that-dares-not-sleep
game failed: serious-sam-2
game failed: serious-sam-classic-the-first-encounter
game failed: serious-sam-classic-the-second-encounter
game failed: serious-sam-classics-revolution
game failed: serious-sam-double-d-xxl
game failed: skyrim-high-resolution-texture-pack