### Processing user-game data

In [1]:
import gzip

def parse(path):
    """Lazily yield one evaluated record per line of a gzip-compressed file.

    Parameters
    ----------
    path : str
        Path to a gzip file holding one Python expression per line
        (callers in this notebook index the records like dicts).

    Yields
    ------
    object
        The value obtained by evaluating each line in turn.

    Notes
    -----
    This is a generator: it can be iterated only once, so call ``parse``
    again for a second pass over the same file. The file handle is closed
    when the generator is exhausted, closed, or garbage-collected.
    """
    # `with` guarantees the gzip handle is released; the original left it open.
    with gzip.open(path, 'r') as handle:
        for line in handle:
            # SECURITY NOTE(review): eval() executes arbitrary code found in the
            # file. Only use on trusted data; ast.literal_eval would be a safer
            # choice if every line is a plain literal -- confirm before switching.
            yield eval(line)

In [2]:
# NOTE: despite the `df_` prefix this is not a DataFrame -- `parse` is a generator
# function, so `df_ui` is a lazy, single-use iterator over the file's records.
df_ui = parse('australian_users_items.json.gz')

In [3]:
# Maps int steam_id -> list of {'game_id': int, 'play_time': ...} dicts (filled by the loop over df_ui).
ui_dict = {}

In [4]:
# For every user record, keep only the game id (as int) and the total play time
# of each owned item, keyed by the user's steam id converted to int.
for user_record in df_ui:
    owned_games = [
        {'game_id': int(item['item_id']), 'play_time': item['playtime_forever']}
        for item in user_record['items']
    ]
    ui_dict[int(user_record['steam_id'])] = owned_games

In [5]:
import json

# Persist the user -> games mapping.
# Gotcha: JSON object keys are strings, so the int steam ids are written as str
# and must be converted back with int() after loading.
with open('ui_dict.json', 'w') as out_file:
    json.dump(ui_dict, out_file)

### Processing game data

In [6]:
# Lazy, single-use generator over the game records (not a DataFrame, despite the name).
df_games = parse('steam_games.json.gz')

In [7]:
from collections import defaultdict
tags = []
game_id_dict = defaultdict(list)
# Build the tag vocabulary: every distinct genre or spec label, in first-seen
# order. Records without an 'id' are ignored entirely.
for game in df_games:
    if 'id' not in game:
        continue
    # Genres are scanned before specs for each game, which fixes the ordering of `tags`.
    for field in ('genres', 'specs'):
        if field in game:
            for label in game[field]:
                if label not in tags:
                    tags.append(label)

In [8]:
"""Turn game id to 1-hot vectors"""

# `parse` returns a single-use generator, so a new one is created for this pass.
df_games = parse('steam_games.json.gz')
num_tags = len(tags)
# Each game gets a 0/1 list of length num_tags with a 1 at the position of every
# genre and spec it lists -- i.e. the vector may contain several ones (multi-hot).
for game in df_games:
    if 'id' not in game:
        continue
    game_id = int(game['id'])

    vec_1hot = [0] * num_tags
    for field in ('genres', 'specs'):
        if field in game:
            for label in game[field]:
                vec_1hot[tags.index(label)] = 1

    game_id_dict[game_id] = vec_1hot


In [9]:
# Save the tag vocabulary; list order matches the positions used in the game vectors.
tag_payload = {'tags': tags}
with open('game_tags.json', 'w') as out_file:
    json.dump(tag_payload, out_file)

In [10]:
# Save the game id -> tag vector mapping (int keys are serialised as JSON strings).
with open('game_dict.json', 'w') as out_file:
    json.dump(game_id_dict, out_file)

### Customer Embedding

In [27]:
import numpy as np

user_history = {}
user_type = {}
# user_history[user]: array stacking the tag vectors of the user's games that are
#   present in game_id_dict -- one row per game, one column per tag, i.e.
#   (T, num_tags); empty when none of the user's games are known.
# user_type[user]: play-time-weighted sum of those tag vectors, rescaled to sum
#   to 1 (left as all zeros when there is no play time to weigh by).
for user, owned_games in ui_dict.items():
    # Users without any items get no entry at all.
    if not owned_games:
        continue

    tag_vectors = []
    profile = np.zeros(num_tags)
    total_time = 0

    for owned in owned_games:
        gid = owned['game_id']
        if gid not in game_id_dict:
            continue
        tag_vector = game_id_dict[gid]
        tag_vectors.append(tag_vector)
        profile += np.array(tag_vector) * owned['play_time']
        total_time += owned['play_time']

    # First scale by total play time, then renormalise so the entries sum to 1;
    # both steps are skipped when their divisor would be zero.
    if total_time != 0:
        profile /= total_time
    profile_sum = profile.sum()
    if profile_sum != 0:
        profile /= profile_sum

    user_history[user] = np.array(tag_vectors)
    user_type[user] = profile



In [29]:
from sklearn.manifold import TSNE
import seaborn as sns


# Stack the per-user tag profiles into one matrix: one row per user (in
# user_type's iteration order), one column per tag.
X = np.zeros(shape=(len(user_type), num_tags))
for row, profile in enumerate(user_type.values()):
    X[row] = profile

# Project the profiles to 2-D with t-SNE; random_state is fixed so reruns are reproducible.
model = TSNE(n_components=2, random_state=0)
# Bind the embedding to a name: previously the result of fit_transform was only
# echoed as cell output and then lost to later cells.
X_embedded = model.fit_transform(X)
X_embedded


In [None]:
# Scatter plot of the 2-D t-SNE coordinates, one point per user.
# The previous version of this cell referenced names that do not exist in this
# notebook (`plt`, `df`, `rndperm`, columns "pca-one"/"pca-two"/"y") and could not run.
# NOTE(review): no label column is available here, so the points are not coloured
# by `hue`; add one (e.g. a cluster or dominant-tag label) if colouring is wanted.
embedding = model.embedding_  # set by the TSNE fit; shape (n_users, 2)
ax = sns.scatterplot(
    x=embedding[:, 0], y=embedding[:, 1],
    alpha=0.3
)
# Size the figure through the returned Axes so no extra plotting import is needed.
ax.figure.set_size_inches(16, 10)
ax.set(
    xlabel="t-SNE dimension 1",
    ylabel="t-SNE dimension 2",
    title="t-SNE projection of user tag profiles",
);