In [1]:
!pip install --upgrade tensorflow==1.12.0

Requirement already up-to-date: tensorflow==1.12.0 in /usr/local/lib/python3.6/dist-packages (1.12.0)


In [2]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
import json
import urllib
import itertools

from sklearn.preprocessing import MultiLabelBinarizer

# Disable most of the logging
tf.logging.set_verbosity(tf.logging.ERROR)
print(tf.__version__)

1.12.0


## Pre-processing

### Read in data

In [0]:
# From https://www.kaggle.com/kedokedokedo/vgsales
vgsales_df = pd.read_csv("vgsales.csv")

In [4]:
vgsales_df.head(3)

Unnamed: 0,Rank,Name,Platform,Year,Genre,Publisher,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales
0,1,Wii Sports,Wii,2006.0,Sports,Nintendo,41.49,29.02,3.77,8.46,82.74
1,2,Super Mario Bros.,NES,1985.0,Platform,Nintendo,29.08,3.58,6.81,0.77,40.24
2,3,Mario Kart Wii,Wii,2008.0,Racing,Nintendo,15.85,12.88,3.79,3.31,35.82


In [0]:
# From https://www.kaggle.com/nikdavis/steam-store-games
steam_df = pd.read_csv("steam.csv")

In [6]:
steam_df.head(3)

Unnamed: 0,appid,name,release_date,english,developer,publisher,platforms,required_age,categories,genres,steamspy_tags,achievements,positive_ratings,negative_ratings,average_playtime,median_playtime,owners,price
0,10,Counter-Strike,2000-11-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,124534,3339,17612,317,10000000-20000000,7.19
1,20,Team Fortress Classic,1999-04-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,3318,633,277,62,5000000-10000000,3.99
2,30,Day of Defeat,2003-05-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Valve Anti-Cheat enabled,Action,FPS;World War II;Multiplayer,0,3416,398,187,34,5000000-10000000,3.99


### Clean up data

Shuffle the data set and remove extrea feature columns

In [0]:
# Get unique values from a Pandas series containing semi-colon delimited strings.
def get_unique(series):
    return set(list(itertools.chain(*series.apply(lambda x: [c for c in x.split(';')]))))

def process_genre(df):    
    # repeat for genre column names (get_unique used to find unique genre names, 
    # not necessary but useful if keeping all of them)
    gen_cols = get_unique(df['genres'])
    
    # only keeping 'main' genres similar to steam store
    gen_cols = [
        # 'Accounting',
        'Action',
        'Adventure',
        # 'Animation & Modeling',
        # 'Audio Production',
        'Casual',
        # 'Design & Illustration',
        # 'Documentary',
        # 'Early Access',
        # 'Education',
        # 'Free to Play',
        # 'Game Development',
        # 'Gore',
        'Indie',
        'Massively Multiplayer',
        # 'Nudity',
        # 'Photo Editing',
        'RPG',
        'Racing',
        # 'Sexual Content',
        'Simulation',
        # 'Software Training',
        'Sports',
        'Strategy'
        # 'Tutorial',
        # 'Utilities',
        # 'Video Production',
        # 'Violent',
        # 'Web Publishing'
    ]
    
    gen_col_names = []
    
    # create new columns for each genre with 1s for games of that genre
    for col in sorted(gen_cols):
        col_name = col.lower().replace('&', 'and').replace(' ', '_')
        gen_col_names.append(col_name)
        
        df[col_name] = df['genres'].apply(lambda x: 1 if col in x.split(';') else 0)
        # alternate method using np.where:
        # df[col_name] = np.where(df['genres'].str.contains(col), 1, 0)
    
    # remove "non-games" based on genre
    # if a row has all zeros in the new genre columns, it most likely isn't a game, so remove (mostly software)
    gen_sums = df[gen_col_names].sum(axis=1)
    df = df[gen_sums > 0].copy()
    
    return df

VG Sales dataset

In [0]:
# Drop unwanted columns
unwanted_cols = ['Rank','Platform','Publisher', 'Year' ,'NA_Sales',
                 'EU_Sales','JP_Sales','Other_Sales','Global_Sales']
                 
vgsales_df.drop(unwanted_cols, inplace=True, axis=1)

In [0]:
# Rename columns to match steam df
vgsales_df.rename(columns={'Name':'name','Genre':'genres'}, inplace=True)

In [10]:
# Remove duplicates 
vgsales_df.drop_duplicates(subset='name',keep='last')

Unnamed: 0,name,genres
0,Wii Sports,Sports
2,Mario Kart Wii,Racing
3,Wii Sports Resort,Sports
4,Pokemon Red/Pokemon Blue,Role-Playing
6,New Super Mario Bros.,Platform
7,Wii Play,Misc
8,New Super Mario Bros. Wii,Platform
9,Duck Hunt,Shooter
10,Nintendogs,Simulation
11,Mario Kart DS,Racing


In [0]:
# Remap genres to match steam genres
vg_to_steam_mapping = {
    "Role-Playing": "RPG",
     "Misc": "Casual",
     "Fighting":"Action",
     "Shooter" : "Action"
     }

vgsales_df = vgsales_df.replace({"genres": vg_to_steam_mapping})

Steam dataset

In [0]:
# keep only rows marked as supporting english
steam_df = steam_df[steam_df['english'] == 1].copy()
    
# keep rows which don't contain 3 or more non-ascii characters in succession
steam_df = steam_df[~steam_df['name'].str.contains('[^\u0001-\u007F]{3,}')]

In [0]:
# Drop unwanted columns
unwanted_cols = ['appid','release_date'	,'english'	,'developer'	,'publisher'	,'platforms'	,
                 'required_age',	'categories','steamspy_tags'	,'achievements'	,'positive_ratings'	,
                 'negative_ratings'	,'average_playtime',	'median_playtime',	'owners',	'price']
                 
steam_df.drop(unwanted_cols, inplace=True,axis=1)

In [14]:
# Remove duplicates
steam_df.drop_duplicates(subset='name',keep='last')

Unnamed: 0,name,genres
0,Counter-Strike,Action
1,Team Fortress Classic,Action
2,Day of Defeat,Action
3,Deathmatch Classic,Action
4,Half-Life: Opposing Force,Action
5,Ricochet,Action
6,Half-Life,Action
7,Counter-Strike: Condition Zero,Action
8,Half-Life: Blue Shift,Action
9,Half-Life 2,Action


Combine datasets

In [0]:
# Combine the two datasets
df = pd.concat([vgsales_df,steam_df])

In [16]:
# Remove duplicates
df.drop_duplicates(subset='name',keep='last')

Unnamed: 0,name,genres
0,Wii Sports,Sports
2,Mario Kart Wii,Racing
3,Wii Sports Resort,Sports
4,Pokemon Red/Pokemon Blue,RPG
6,New Super Mario Bros.,Platform
7,Wii Play,Casual
8,New Super Mario Bros. Wii,Platform
9,Duck Hunt,Action
10,Nintendogs,Simulation
11,Mario Kart DS,Racing


In [0]:
# only keeping 'main' genres similar to steam store
mod_df = process_genre(df)

In [32]:
mod_df

Unnamed: 0,name,genres,action,adventure,casual,indie,massively_multiplayer,rpg,racing,simulation,sports,strategy
0,Ball Driver,Action;Casual;Indie;Racing;Sports,1,0,1,1,0,0,1,0,1,0
1,Space Run Galaxy,Indie;Strategy,0,0,0,1,0,0,0,0,0,1
3,Sad City 42,Action;Adventure;Casual;Indie;RPG,1,1,1,1,0,1,0,0,0,0
4,Advanced Gaming Platform::Epica,Violent;Gore;Action;Indie;RPG;Simulation;Early...,1,0,0,1,0,1,0,1,0,0
5,Rayon Riddles - Rise of the Goblin King,Adventure;Strategy,0,1,0,0,0,0,0,0,0,1
6,NBA Live 2005,Sports,0,0,0,0,0,0,0,0,1,0
7,Pharaonic,Action;Adventure;Indie;RPG,1,1,0,1,0,1,0,0,0,0
8,Thoroughbred Breeder II Plus,Simulation,0,0,0,0,0,0,0,1,0,0
9,Steamcraft,Action;Massively Multiplayer;Racing,1,0,0,0,1,0,1,0,0,0
10,Naughty Elves,Casual;Indie,0,0,1,1,0,0,0,0,0,0


In [0]:
# Shuffle the dataset
df = df.sample(frac=1).reset_index(drop=True)

### Splitting the data

80% for training 20% for validation


In [0]:
# Split data into training and validation set
train_size = int(len(df) * .8)
train_df = df.iloc[0:train_size]
valid_df = df.iloc[train_size:len(df)]

In [0]:
target_col = 'genres'
input_col = 'name'

train_inputs = train_df[input_col]
train_genres = train_df[target_col]

valid_inputs = valid_df[input_col]
valid_genres = valid_df[target_col]

### Formating labels

Convert genre strings to multi-hot vectors

In [21]:
encoder = MultiLabelBinarizer()
encoder.fit_transform(train_genres)
train_encoded = encoder.transform(train_genres)
valid_encoded = encoder.transform(valid_genres)
num_classes = len(encoder.classes_)

print(encoder.classes_)
print(train_encoded[0])

[' ' '&' ';' 'A' 'C' 'D' 'E' 'F' 'G' 'I' 'M' 'N' 'P' 'R' 'S' 'T' 'U' 'V'
 'W' 'a' 'b' 'c' 'd' 'e' 'f' 'g' 'h' 'i' 'l' 'm' 'n' 'o' 'p' 'r' 's' 't'
 'u' 'v' 'w' 'x' 'y' 'z']
[0 0 1 1 1 0 0 0 0 1 0 0 0 1 1 0 0 0 0 1 0 1 1 1 0 1 0 1 1 0 1 1 1 1 1 1 1
 0 0 0 0 0]


### Embedding layer
Using TF Hub universal-sentence-encoder module for pre-trained word embeddings

In [0]:
name_embeddings = hub.text_embedding_column("name", module_spec="https://tfhub.dev/google/universal-sentence-encoder/2", trainable=False)

## Language Model

### DNNEstimator Model

The first parameter we pass to our DNNEstimator is called a head, and defines the type of labels our model should expect. Since we want our model to output multiple labels, we’ll use multi_label_head here. Then we'll convert our features and labels to numpy arrays and instantiate our Estimator. batch_size and num_epochs are hyperparameters - you should experiment with different values to see what works best on your dataset.

In [0]:
multi_label_head = tf.contrib.estimator.multi_label_head(
    num_classes,
    loss_reduction=tf.losses.Reduction.SUM_OVER_BATCH_SIZE
)

In [0]:
features = {
  "name": np.array(train_inputs).astype(np.str)
}

labels = np.array(train_encoded).astype(np.int32)

train_input_fn = tf.estimator.inputs.numpy_input_fn(features, labels, shuffle=True, batch_size=32, num_epochs=25)

estimator = tf.contrib.estimator.DNNEstimator(
    head=multi_label_head,
    hidden_units=[64,10],
    feature_columns=[name_embeddings])

### Training

training and evaluation of the model

In [25]:
estimator.train(input_fn=train_input_fn)

<tensorflow.contrib.estimator.python.estimator.dnn.DNNEstimator at 0x7f69b2041e10>

In [26]:
# Define our eval input_fn and run eval
eval_input_fn = tf.estimator.inputs.numpy_input_fn({"name": np.array(valid_inputs).astype(np.str)}, valid_encoded.astype(np.int32), shuffle=False)
estimator.evaluate(input_fn=eval_input_fn)

{'auc': 0.9094411,
 'auc_precision_recall': 0.7611026,
 'average_loss': 0.32556504,
 'global_step': 26747,
 'loss': 0.3255545}

## Predictions

In [0]:
# Test our model on some raw name data
raw_test = [
    "Genital Jousting", # Action
    "Firewatch", # Adventure
    "Madden", # Sport
]

In [0]:
# Generate predictions
predict_input_fn = tf.estimator.inputs.numpy_input_fn({"name": np.array(raw_test).astype(np.str)}, shuffle=False)
results = estimator.predict(predict_input_fn)

In [29]:
# Display predictions
index = 0
text_classes = encoder.inverse_transform(encoder.fit_transform(train_genres))
for game_genres in results:
  print(raw_test[index])
  top_2 = game_genres['probabilities'].argsort()[-2:][::-1]
  for genre in top_2:
    text_genre = encoder.classes_[genre]
    print(text_genre + ': ' + str(round(game_genres['probabilities'][genre] * 100, 2)) + '%')
  index += 1
  print('')

Genital Jousting
n: 86.55%
t: 85.41%

Firewatch
n: 89.08%
t: 85.3%

Madden
o: 89.76%
t: 86.89%

