In [1]:
import os
import magonote_functions as mf
import random
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

In [2]:
game_info_csv = 'data/game_info.csv'
game_info_df = pd.read_csv(game_info_csv, sep='\t', index_col=0)
game_info_df

Unnamed: 0,title text,creator text,creator url,game url,full text
91140,Away - GBJam5,dgoldaraz,https://dgoldaraz.itch.io,https://dgoldaraz.itch.io/away-gbjam5,Away - GBJam5new I.HtmlEmbed('#html_embed_widg...
134059,Aftermath Demo,MattWillis,https://mattwillis.itch.io,https://mattwillis.itch.io/amdemo,Aftermath DemoA downloadable game for WindowsV...
91190,The Terror Of Matthew,VictorBurgos,https://victorburgos.itch.io,https://victorburgos.itch.io/the-terror-of-mat...,The Terror Of MatthewA downloadable game for W...
221027,Ninja Toy - World Hero,Rombus,https://rombus.itch.io,https://rombus.itch.io/ninja-toy-world-hero,Ninja Toy - World Heronew I.HtmlEmbed('#html_e...
144235,ScrapYard ALPHA,ByteRockers' Games,https://byterockers-games.itch.io,https://byterockers-games.itch.io/scrapyard,ScrapYard ALPHAA downloadable game for Windows...
140120,World Mower,Fellowship of the Game,https://fog-icmc.itch.io,https://fog-icmc.itch.io/world-mower,View all by Fellowship of the GameFollow Fello...
101684,The story about racoons,ThinkingMicrowave,https://thinkingmicrowave.itch.io,https://thinkingmicrowave.itch.io/the-story-ab...,View all by ThinkingMicrowaveFollow ThinkingMi...
136453,Jam's prison prova 2,jaspel_jam,https://jaspel-jam.itch.io,https://jaspel-jam.itch.io/jams-prison-prova-2,Jam's prison prova 2new I.HtmlEmbed('#html_emb...
94079,P.tree: Single-Cell Simulator,Zane Hedges,https://chemistrychrist.itch.io,https://chemistrychrist.itch.io/ptree-single-c...,P.tree: Single-Cell SimulatorA downloadable ga...
220320,META,digaly,https://digaly.itch.io,https://digaly.itch.io/meta,View all by digalyFollow digalyFollowing digal...


In [3]:
from sklearn.feature_extraction import text
add_stop_words = ('itchio', 'report', 'zip', 'view', 'comment', 'post', 'upvotes', 'account', 'post_id', 'report_url', 'io', 'game', 'nowname', 'priceclick', 'download', 'mb', 'downloadable', 'viewgame', 'user_tools', 'https', 'viewhtmlgame', 'start_maximized', 'htmlembed', 'play_after', 'itch', '_merchantsettings', 'document', 'apple', 'itunes', 'itunes_autolinkmaker', 'script', 'src', 'javascript', 'function', 'http', 'swf', 'flash', 'class', 'data', 'googleaccessid', 'gserviceaccount', 'signature', 'moonscript2', 'commondatastorage', 'div', 'js', 'span', 'default', 'var', 'usd', 'up_score', 'down_score', 'autolinkmaker', 'com', 'autolink','commentlog', 'html', 'play_url')
all_stop_words = text.ENGLISH_STOP_WORDS.union(add_stop_words)
itch_stop_words=set(all_stop_words)

In [7]:
vectorizer = TfidfVectorizer(stop_words=itch_stop_words)
X = vectorizer.fit_transform(game_info_df['full text'])
features = vectorizer.get_feature_names()
n_clu = 6
kmeans = KMeans(n_clusters=n_clu)
kmeans.fit(X)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
    n_clusters=6, n_init=10, n_jobs=1, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [8]:
top_centroids = kmeans.cluster_centers_.argsort()[:,-1:-11:-1]
for num, centroid in enumerate(top_centroids):
    print("%d: %s" % (num, ", ".join(features[i] for i in centroid)))

0: space, 3d, toolkit, new, maker, shooter, jam, shoot, leave, player
1: dare, ludum, 3d, 99, world, 38, small, new, purchase, apr
2: la, le, et, en, que, pour, les, jeu, el, des
3: 3d, rp, width, height, 90, false, new, type, id, freei
4: reply, days, ago, year, really, like, time, just, 3d, new
5: 3d, new, leave, freei, type, updated, id, loading, informationpublished, slug


In [9]:
vectorizer = TfidfVectorizer(stop_words=itch_stop_words, max_features=1000)
X = vectorizer.fit_transform(game_info_df['full text'])
features = vectorizer.get_feature_names()
kmeans = KMeans(n_clusters=n_clu)
kmeans.fit(X)
top_centroids = kmeans.cluster_centers_.argsort()[:,-1:-11:-1]
for num, centroid in enumerate(top_centroids):
    print("%d: %s" % (num, ", ".join(features[i] for i in centroid)))

0: reply, days, ago, year, 3d, new, really, like, time, thanks
1: 3d, new, leave, freei, type, updated, id, slug, loading, loader_outer
2: dare, ludum, 38, 3d, small, world, apr, new, 24, leave
3: 3d, width, height, rp, 90, false, new, freei, type, id
4: 99, purchase, minimum, morein, order, lessi, price, 00, windowsbuy, 3d
5: 3d, new, leave, play, player, space, type, freei, id, updated


In [11]:
assigned_cluster = kmeans.transform(X).argmin(axis=1)
# assigned_cluster

In [19]:
for i in range(kmeans.n_clusters):
    cluster = np.arange(0, X.shape[0])[assigned_cluster==i]
    sample_games = np.random.choice(cluster, 3, replace=False)
    # print(sample_games)
    print("cluster %d:" % i)
    for game in sample_games:
        print("    %s" % game_info_df.iloc[game]['title text'])

cluster 0:
    Less then a minute
    THAT LIZARD STOLE MY LUNCH
    Mine Swine
cluster 1:
    The Sorrows Of Young Werther
    The Cave
    Bubonic
cluster 2:
    It's A Small World
    Duoregi
    Escape From The Bottle
cluster 3:
    Dungeon Wizard
    Min hjärna Orginal Edition
    Block Breaker
cluster 4:
    The Communist Dogifesto
    Kimmy
    Tower Defence Demo
cluster 5:
    Steamed Hams but a Playable Remix
    Angry Cats
    Paddle Ping
