In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import ast
import re
import html
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA

In [2]:
# Load the detailed board-game table from the local data directory.
csv_path = 'data/games_detailed_info.csv'
df = pd.read_csv(csv_path)

In [3]:
# Inspect the column labels of the freshly loaded frame.
df.columns

Index(['Unnamed: 0', 'type', 'id', 'thumbnail', 'image', 'primary',
       'alternate', 'description', 'yearpublished', 'minplayers', 'maxplayers',
       'suggested_num_players', 'suggested_playerage',
       'suggested_language_dependence', 'playingtime', 'minplaytime',
       'maxplaytime', 'minage', 'boardgamecategory', 'boardgamemechanic',
       'boardgamefamily', 'boardgameexpansion', 'boardgameimplementation',
       'boardgamedesigner', 'boardgameartist', 'boardgamepublisher',
       'usersrated', 'average', 'bayesaverage', 'Board Game Rank',
       'Strategy Game Rank', 'Family Game Rank', 'stddev', 'median', 'owned',
       'trading', 'wanting', 'wishing', 'numcomments', 'numweights',
       'averageweight', 'boardgameintegration', 'boardgamecompilation',
       'Party Game Rank', 'Abstract Game Rank', 'Thematic Rank',
       'War Game Rank', 'Customizable Rank', 'Children's Game Rank',
       'RPG Item Rank', 'Accessory Rank', 'Video Game Rank', 'Amiga Rank',
       'Commod

In [4]:
# Per-column dtype and non-null count, to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19230 entries, 0 to 19229
Data columns (total 56 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Unnamed: 0                     19230 non-null  int64  
 1   type                           19230 non-null  object 
 2   id                             19230 non-null  int64  
 3   thumbnail                      19211 non-null  object 
 4   image                          19211 non-null  object 
 5   primary                        19230 non-null  object 
 6   alternate                      7685 non-null   object 
 7   description                    19229 non-null  object 
 8   yearpublished                  19230 non-null  int64  
 9   minplayers                     19230 non-null  int64  
 10  maxplayers                     19230 non-null  int64  
 11  suggested_num_players          19230 non-null  object 
 12  suggested_playerage            16884 non-null 

In [5]:
# Summary statistics for the first column, selected by position.
first_col = df.iloc[:, 0]
first_col.describe()

count    19230.000000
mean      9614.500000
std       5551.367174
min          0.000000
25%       4807.250000
50%       9614.500000
75%      14421.750000
max      19229.000000
Name: Unnamed: 0, dtype: float64

Although the column name is uninformative, its values run from 0 to 19229, matching the row index, so it looks like this is just a table id column. It can be dropped.

In [6]:
# Drop the leftover index column by name rather than by position: with
# `df.columns[0]`, re-running this cell would silently remove whichever
# column happens to be first at that point (e.g. `type`).
# `errors='ignore'` makes a re-run a no-op instead of a KeyError.
df = df.drop(columns='Unnamed: 0', errors='ignore')

I will also drop id.  The dataset contains multiple CSV files; as I am only exploring this one, id is not needed.

In [7]:
# `id` is only useful for joining against the dataset's other CSV files.
# Reassign instead of using `inplace=True` (no performance benefit, and it
# hides state changes); `errors='ignore'` keeps the cell safe to re-run.
df = df.drop(columns='id', errors='ignore')

Looking at the 'type' feature:

In [8]:
# Distinct values present in the `type` column.
df['type'].unique()

array(['boardgame'], dtype=object)

Since there is only one type, this feature carries no information, so I will drop it.

In [9]:
# `type` holds a single value ('boardgame') for every row, so it is removed.
# Reassign instead of using `inplace=True`; `errors='ignore'` keeps the cell
# safe to re-run after the column is already gone.
df = df.drop(columns='type', errors='ignore')

In [10]:
# Re-check the column labels after the drops above.
df.columns

Index(['thumbnail', 'image', 'primary', 'alternate', 'description',
       'yearpublished', 'minplayers', 'maxplayers', 'suggested_num_players',
       'suggested_playerage', 'suggested_language_dependence', 'playingtime',
       'minplaytime', 'maxplaytime', 'minage', 'boardgamecategory',
       'boardgamemechanic', 'boardgamefamily', 'boardgameexpansion',
       'boardgameimplementation', 'boardgamedesigner', 'boardgameartist',
       'boardgamepublisher', 'usersrated', 'average', 'bayesaverage',
       'Board Game Rank', 'Strategy Game Rank', 'Family Game Rank', 'stddev',
       'median', 'owned', 'trading', 'wanting', 'wishing', 'numcomments',
       'numweights', 'averageweight', 'boardgameintegration',
       'boardgamecompilation', 'Party Game Rank', 'Abstract Game Rank',
       'Thematic Rank', 'War Game Rank', 'Customizable Rank',
       'Children's Game Rank', 'RPG Item Rank', 'Accessory Rank',
       'Video Game Rank', 'Amiga Rank', 'Commodore 64 Rank', 'Arcade Rank',
      

In [11]:
# The mechanic column stores a Python-list literal as text; parse the entry
# at index label 0 to see its structure.
first_mechanics = df.loc[0, 'boardgamemechanic']
ast.literal_eval(first_mechanics)

['Action Points',
 'Cooperative Game',
 'Hand Management',
 'Point to Point Movement',
 'Set Collection',
 'Trading',
 'Variable Player Powers']

In [12]:
# Replace the missing description with an empty string, then decode HTML
# entities (html.unescape needs str input, so the fill must come first).
# Assign back to the column rather than calling fillna(inplace=True) on
# `df.description`: that is a chained in-place update, which pandas warns
# about and which leaves `df` unmodified under Copy-on-Write.
df['description'] = df['description'].fillna("").apply(html.unescape)

In [13]:
# TF-IDF vectoriser for the descriptions: accents are stripped via Unicode
# normalisation and the built-in English stop-word list is applied.
tfidf = TfidfVectorizer(
    stop_words='english',
    strip_accents='unicode',
)

In [14]:
# Learn the vocabulary from the descriptions and build the document-term matrix.
X = tfidf.fit_transform(df['description'])

In [15]:
# (rows, columns) of the matrix returned by fit_transform.
X.shape

(19230, 66436)

In [16]:
# First 20 vocabulary terms. `get_feature_names()` was deprecated in
# scikit-learn 1.0 and removed in 1.2; `get_feature_names_out()` replaces it.
# It returns an ndarray, so the slice is wrapped in list() to keep a list display.
list(tfidf.get_feature_names_out()[:20])

['00',
 '000',
 '0002',
 '000km',
 '000m',
 '000men',
 '001',
 '002',
 '007',
 '008',
 '00h',
 '01',
 '0100',
 '01077',
 '011',
 '0140280324',
 '0141037875',
 '017',
 '019',
 '02']

In [17]:
# Last 20 vocabulary terms. `get_feature_names()` was deprecated in
# scikit-learn 1.0 and removed in 1.2; `get_feature_names_out()` replaces it.
# It returns an ndarray, so the slice is wrapped in list() to keep a list display.
list(tfidf.get_feature_names_out()[-20:])

['τονες',
 'του',
 'τους',
 'τρα',
 'τραγοi',
 'τραγουδιστικi',
 'τρελi',
 'τρελizν',
 'των',
 'φρiœντισε',
 'φτιαχτεi',
 'φωνi',
 'χi',
 'χizρ',
 'χει',
 'χημεi',
 'χουν',
 'χρiœνος',
 'ων',
 'ωστiœσο']

In [None]:
# TODO: might want to use a regex to find descriptions containing non-alphanumeric characters and review them for removal

In [49]:
# Too sparse to use PCA

In [None]:
# PCA doesn't work; will have to try this by hand