Code Written By: Nathan Lamberson & Michael Treacy

Method: Collaborative Filtering using Matrix Factorization

Code Modeled off of: https://www.kaggle.com/jwyang91/steam-game-recommender/notebook

This is accomplished by inferring a user's preference for a new game from that user's known preferences, shown as R = U * V.

This model (Model 2) was used to develop ideas for the refined recommender (Model 1) that created a rating system based on the hours.

In [1]:
# Import Libraries to complete operations
# Pandas for data processing
# Numpy for linear algebra
import pandas as pd
import numpy as np

# List all files in the data directory
import os
# Walk the relative './data' directory, which is where the CSV is read from
# further down, so the listing shows the files this notebook actually uses
for dirname, _, filenames in os.walk('./data'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [2]:
# Import Libraries to complete operations
# Pandas for data processing
# Numpy for linear algebra
# Tensorflow for ML
import pandas as pd
import numpy as np
import tensorflow as tf
import random
from collections import Counter
from sklearn.metrics import roc_curve, auc, average_precision_score

In [3]:
# Load the raw data and begin cleaning it
# Location of the Steam dataset downloaded from Kaggle
filePath = './data/steam-200k.csv'

# The file is read without a header row, so the column names are supplied here
columnNames = ['UserID', 'Game', 'Action', 'Hours', '0']
fileData = pd.read_csv(filePath, header = None, names = columnNames)

# Preview the first 5 rows of the data
fileData.head()

# Add a new 'HoursPlayed' column holding the 'Hours' values as float32
fileData['HoursPlayed'] = fileData['Hours'].astype('float32')

# A 'purchase' row with 1.0 hours only records that the game was bought and says
# nothing about play time, so its 'HoursPlayed' value is reset to 0
purchaseOnly = (fileData['Action'] == 'purchase') & (fileData['Hours'] == 1.0)
fileData.loc[purchaseOnly, 'HoursPlayed'] = 0

In [4]:
# Store the UserID column as integers
fileData['UserID'] = fileData['UserID'].astype('int')

# Order rows by user, game and hours so that, within each (user, game) pair,
# the row with the largest 'HoursPlayed' value comes last
fileData = fileData.sort_values(by = ['UserID', 'Game', 'HoursPlayed'])

# Keep only that last row per (UserID, Game) pair, collapsing repeated rows for
# the same user and game into a single entry
uniqueUserGames = fileData.drop_duplicates(subset = ['UserID', 'Game'], keep = 'last')

# Drop the columns that are no longer needed
cleanFileData = uniqueUserGames.drop(['Action', 'Hours', '0'], axis = 1)

# Show the first 5 rows of the cleaned data
cleanFileData.head()

Unnamed: 0,UserID,Game,HoursPlayed
65430,5250,Alien Swarm,4.9
65424,5250,Cities Skylines,144.0
65435,5250,Counter-Strike,0.0
65436,5250,Counter-Strike Source,0.0
65437,5250,Day of Defeat,0.0


In [5]:
# Count the distinct users and the distinct games in the cleaned data
numUsers = cleanFileData['UserID'].unique().size
numGames = cleanFileData['Game'].unique().size

# Report both totals
print('There are {0} users and {1} games in the data'.format(numUsers, numGames))

There are 12393 users and 5155 games in the data


In [6]:
# Fraction of the user-item matrix that holds an observed (user, game) entry.
# NOTE: despite the variable name, this is the filled fraction (density) of the
# matrix, not the empty fraction.
filledEntries = len(cleanFileData)
totalEntries = float(numUsers * numGames)
sparsity = filledEntries / totalEntries

# Report the filled fraction as a percentage
print('{:.2%} of the user-item matrix is filled'.format(sparsity))

0.20% of the user-item matrix is filled


In [7]:
# Count how many rows of the cleaned data belong to each user
userCounter = Counter(cleanFileData.UserID.tolist())

# Count how many rows of the cleaned data belong to each game
gameCounter = Counter(cleanFileData.Game.tolist())

# Create lookup tables between raw identifiers and contiguous matrix indices,
# numbered in order of first appearance in the cleaned data
user2index = {user: i for i, user in enumerate(cleanFileData.UserID.unique())}
index2user = {i: user for user, i in user2index.items()}

game2index = {game: i for i, game in enumerate(cleanFileData.Game.unique())}
index2game = {i: game for game, i in game2index.items()}

In [8]:
# Translate every row's UserID into its matrix row index (plain array, no header)
userIndex = cleanFileData['UserID'].map(user2index).values

# Translate every row's Game into its matrix column index, and also keep the
# result in the data frame as a 'GameIndex' column
gameIndex = cleanFileData['Game'].map(game2index).values
cleanFileData['GameIndex'] = gameIndex

# Hours played for every row, in the same row order as userIndex and gameIndex
hours = cleanFileData['HoursPlayed'].values

In [9]:
# All-zero (users x games) matrix used as a template for the matrices below
zeroMatrix = np.zeros((numUsers, numGames))

# Binary preference matrix: 1 wherever the cleaned data has a (user, game) entry
userGamePreference = np.zeros_like(zeroMatrix)
userGamePreference[userIndex, gameIndex] = 1

# Interaction matrix used for the confidence weights: hours played + 1 at every
# (user, game) entry, 0 everywhere else
userGameInteractions = np.zeros_like(zeroMatrix)
userGameInteractions[userIndex, gameIndex] = hours + 1

In [10]:
# Number of purchases hidden per evaluated user; the same k is used for the
# top-k precision
k = 5

# Per-user counts laid out as [games not owned, games owned]. A single vectorised
# sum replaces one np.bincount call per row, which is slow and produces rows of
# different lengths (and therefore fails) if any user row contains no 1.
ownedCounts = userGamePreference.astype(int).sum(axis = 1)
purchaseCounts = np.column_stack((userGamePreference.shape[1] - ownedCounts, ownedCounts))

# Users who own at least 2*k games (our baseline for prediction)
buyersIndex = np.where(purchaseCounts[:, 1] >= (2 * k))[0]

# Print the total number of users who bought 2*k or more games
print('{0} users bought {1} or more games'.format(len(buyersIndex), (2 * k)))

2189 users bought 10 or more games


In [11]:
# Hold out 20% of the eligible buyers for evaluation (not for training)
testFrac = 0.2

# Randomly pick that share of the buyers, without repetition
numHeldOut = int(np.ceil(len(buyersIndex) * testFrac))
testUsersIndex = np.random.choice(buyersIndex, size = numHeldOut, replace = False)

In [12]:
# Midpoint of the held-out users
halfway = len(testUsersIndex) // 2

# The first half becomes the validation users; the remainder stays as test users
valUsersIndex = testUsersIndex[:halfway]
testUsersIndex = testUsersIndex[halfway:]

In [13]:
# Data Processing Function
def dataProcess(data, train, test, userIndex, k):
    """Hide k random purchases per user from the training matrix.

    For every user in userIndex, k distinct columns where data equals 1 are
    picked at random. Those entries are set to 0 in train and copied from data
    into test. Both train and test are modified in place and also returned.

    np.random.choice raises ValueError when a user has fewer than k purchases.
    """
    for row in userIndex:
        # Column indices of everything this user has purchased
        owned = np.flatnonzero(data[row, :] == 1)

        # Random subset of k purchases to hold out
        hidden = np.random.choice(owned, size = k, replace = False)

        # Remove the held-out purchases from the training matrix ...
        train[row, hidden] = 0

        # ... and record them in the evaluation matrix
        test[row, hidden] = data[row, hidden]

    return train, test

In [14]:
# The training matrix starts as a full copy of the user game preference matrix
trainMatrix = np.array(userGamePreference)

# The validation and test matrices start out all zero
valMatrix = np.zeros_like(zeroMatrix)
testMatrix = np.zeros_like(zeroMatrix)

# Hide k purchases per held-out user from the training matrix and record them in
# the validation matrix (validation users) and the test matrix (test users)
trainMatrix, valMatrix = dataProcess(userGamePreference, trainMatrix, valMatrix, valUsersIndex, k)
trainMatrix, testMatrix = dataProcess(userGamePreference, trainMatrix, testMatrix, testUsersIndex, k)

In [15]:
# Show the non-zero test-matrix entries of the first test user
firstTestUser = testUsersIndex[0]
testMatrix[firstTestUser, testMatrix[firstTestUser, :].nonzero()[0]]

array([1., 1., 1., 1., 1.])

In [16]:
# Show the training-matrix values of the first test user at the positions that
# are non-zero in the test matrix
firstTestUser = testUsersIndex[0]
heldOutGames = testMatrix[firstTestUser, :].nonzero()[0]
trainMatrix[firstTestUser, heldOutGames]

array([0., 0., 0., 0., 0.])

In [17]:
# Start from an empty default graph
tf.reset_default_graph()

# Placeholder for the (users x games) preference matrix
preference = tf.placeholder(tf.float32, shape = (numUsers, numGames))

# Placeholder for the (users x games) interaction matrix (hours played)
interactions = tf.placeholder(tf.float32, shape = (numUsers, numGames))

# Placeholder for user indices; its shape is left unspecified
usersIndex = tf.placeholder(tf.int32, shape = None)

In [18]:
# Number of latent features learned per user and per game
numFeatures = 30

# User factor matrix (numUsers x numFeatures), initialised with small truncated-normal values
X = tf.Variable(tf.truncated_normal([numUsers, numFeatures], mean = 0, stddev = 0.05))

# Game factor matrix (numGames x numFeatures), initialised the same way
Y = tf.Variable(tf.truncated_normal([numGames, numFeatures], mean = 0, stddev = 0.05))

# Trainable single-element parameter that scales the interactions when the confidence
# matrix is built; initialised uniformly between 0 and 1
confidenceParam = tf.Variable(tf.random_uniform([1], 0, 1))

In [19]:
# Trainable per-user bias column (numUsers x 1)
userBias = tf.Variable(tf.truncated_normal([numUsers, 1], stddev = 0.2))

# Append the user bias column and a column of ones to the user factors.
# The column order here is [factors, user bias, ones].
XPlusBias = tf.concat([X, userBias, tf.ones((numUsers, 1), dtype = tf.float32)], axis = 1)

In [20]:
# Trainable per-game bias column (numGames x 1)
itemBias = tf.Variable(tf.truncated_normal([numGames, 1], stddev = 0.2))

# Append a column of ones and the game bias column to the game factors.
# The column order here is [factors, ones, game bias].
YPlusBias = tf.concat([Y, tf.ones((numGames, 1), dtype = tf.float32), itemBias], axis = 1)

In [21]:
# Predicted preference for every (user, game) pair: the product of the two bias-augmented
# matrices (the game matrix is transposed)
predPreference = tf.matmul(XPlusBias, YPlusBias, transpose_b = True)

# Confidence weight per entry: 1 plus the trainable confidenceParam times the interactions input
confidence = 1 + confidenceParam * interactions

In [22]:
# Cost of the model: the confidence-weighted sum of squared differences between the actual
# preferences and the predicted preferences
cost = tf.reduce_sum(tf.multiply(confidence, tf.square(tf.subtract(preference, predPreference))))

# The L2 regularizer over both factor matrices and both bias vectors
l2_square = tf.nn.l2_loss(X) + tf.nn.l2_loss(Y) + tf.nn.l2_loss(userBias) + tf.nn.l2_loss(itemBias)

# Weight of the regularizer in the total loss
lambda_c = 0.01

# Total training objective: weighted squared error plus the scaled regularizer
loss = cost + lambda_c * l2_square

In [23]:
# Learning rate passed to the optimizer
lr = 0.05

# Adagrad training step that minimizes the loss
optimize = tf.train.AdagradOptimizer(learning_rate = lr).minimize(loss)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


In [24]:
# Function to help calculate the top k precision
def topKPrecision(prediction, matrix, k, userIndex, exclude = None):
    """Return the mean precision@k over the given users.

    prediction: 2-D array of predicted scores, indexed [user, game].
    matrix:     2-D array whose non-zero entries mark the relevant games.
    k:          number of top-ranked games considered per user.
    userIndex:  iterable of row indices to evaluate.
    exclude:    optional 2-D array; games with a non-zero entry for a user are
                pushed to the bottom of that user's ranking, so they can only
                appear in the top k when fewer than k other games exist.
                Leaving it as None ranks every game (the original behaviour).

    For each user the precision is the number of top-k games that are
    relevant, divided by k.
    """
    precisions = []

    for user in userIndex:
        scores = prediction[user, :]

        if exclude is not None:
            # Already-known games get the worst possible score so that they
            # sort after every other game
            scores = np.where(exclude[user, :] != 0, -np.inf, scores)

        # Indices of the k highest-scoring games
        top_k = np.argsort(-scores)[:k]

        # Indices of the games marked as relevant for this user
        labels = matrix[user, :].nonzero()[0]

        # Share of the top-k games that are relevant
        precision = len(set(top_k) & set(labels)) / float(k)
        precisions.append(precision)
    return np.mean(precisions)

In [25]:
# Training session
# Number of optimisation steps; every step feeds the full matrices
iterations = 100

with tf.Session() as sess:
    # Initialise all variables before the first step
    sess.run(tf.global_variables_initializer())

    # Run one optimisation step per iteration on the training matrix and the interactions
    for i in range(iterations):
        sess.run(optimize, feed_dict = {preference: trainMatrix, interactions: userGameInteractions})

        # Report the current loss and precisions every 10 steps
        if i % 10 == 0:
            modLoss = sess.run(loss, feed_dict = {preference: trainMatrix, interactions: userGameInteractions})
            # The prediction is built from variables only, so no feed_dict is passed here
            modPred = predPreference.eval()
            # Both precisions are measured on the validation users: once against their entries
            # in the training matrix and once against their held-out entries in valMatrix.
            # topKPrecision ranks every game, including those still set in trainMatrix.
            trainPrecision = topKPrecision(modPred, trainMatrix, k, valUsersIndex)
            valPrecision = topKPrecision(modPred, valMatrix, k, valUsersIndex)
            print('Iterations {0}...'.format(i), 'Training Loss {:.2f}...'.format(modLoss), 'Train Precision {:.3f}...'.format(trainPrecision), 'Val Precision {:.3f}'.format(valPrecision))
    
    # Evaluate the final predictions on the test users and print the precision
    rec = predPreference.eval()
    testPrecision = topKPrecision(rec, testMatrix, k, testUsersIndex)
    print('\n')
    print('Overall Test Precision: {:.3f}'.format(testPrecision))

Iterations 0... Training Loss 3826740.25... Train Precision 0.078... Val Precision 0.016
Iterations 10... Training Loss 318520.19... Train Precision 0.373... Val Precision 0.022
Iterations 20... Training Loss 247674.06... Train Precision 0.457... Val Precision 0.037
Iterations 30... Training Loss 218792.69... Train Precision 0.515... Val Precision 0.043
Iterations 40... Training Loss 199725.83... Train Precision 0.553... Val Precision 0.047
Iterations 50... Training Loss 185168.55... Train Precision 0.580... Val Precision 0.052
Iterations 60... Training Loss 173109.50... Train Precision 0.603... Val Precision 0.050
Iterations 70... Training Loss 162511.11... Train Precision 0.624... Val Precision 0.048
Iterations 80... Training Loss 152735.88... Train Precision 0.637... Val Precision 0.048
Iterations 90... Training Loss 143328.58... Train Precision 0.645... Val Precision 0.050


Overall Test Precision: 0.058


In [26]:
# Testing: pick a few test users at random to inspect by hand
numExamples = 10
users = np.random.choice(testUsersIndex, size = numExamples, replace = False)

# For every user, game indices ordered from highest to lowest predicted preference
recGames = (-rec).argsort()

In [27]:
# For each sampled user, compare the top-k new recommendations with the held-out purchases
for user in users:
    # Print the UserID as a header
    print('User #{0} recommendations...'.format(index2user[user]))

    # Games still marked as purchased in the training matrix
    purchaseHistory = np.where(trainMatrix[user, :] != 0)[0]

    # Games held out for this user in the test matrix (computed once, used twice below)
    heldOut = np.where(testMatrix[user, :] != 0)[0]

    # All games for this user, best predicted first
    recommendations = recGames[user, :]

    # Drop games the user already has in the training matrix, then keep the best k.
    # np.isin is the replacement for the deprecated np.in1d.
    newRecommendations = recommendations[~np.isin(recommendations, purchaseHistory)][:k]

    # Print their recommendations and actual purchases, and the precision score
    print('Recommendations')
    print(', '.join([index2game[game] for game in newRecommendations]))
    print('\n')
    print('Actual Purchases')
    print(', '.join([index2game[game] for game in heldOut]))
    print('\n')
    print('Precision of {0}'.format(len(set(newRecommendations) & set(heldOut)) / float(k)))
    print('-------------------------------')
    print('\n')

User #107818793 recommendations...
Recommendations
Unturned, Dota 2, Trove, Counter-Strike Global Offensive, Left 4 Dead 2


Actual Purchases
ARK Survival Evolved, Prison Architect, The Forest, Unturned, Grand Theft Auto V


Precision of 0.2
-------------------------------


User #151600301 recommendations...
Recommendations
PlanetSide 2, Rocket League, Tom Clancy's Ghost Recon Phantoms - EU, ARK Survival Evolved, The Elder Scrolls V Skyrim


Actual Purchases
Dota 2, Don't Starve Together Beta, Unturned, Trove, Among Ripples


Precision of 0.0
-------------------------------


User #190191843 recommendations...
Recommendations
Team Fortress 2, DayZ, The Elder Scrolls V Skyrim, Dying Light, Heroes & Generals


Actual Purchases
Team Fortress 2, Let the Cat In, Mount Your Friends, AdVenture Capitalist, Metro 2033 Redux


Precision of 0.2
-------------------------------


User #125141344 recommendations...
Recommendations
Dota 2, Counter-Strike, Age of Empires II HD Edition, Total War SHOG