# Data Exploration

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Games Data

In [2]:
# Load games data
df = pd.read_json('../Data/steam/gamesdata.json')
df.head()

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,discount_price,reviews_url,specs,price,early_access,id,developer,sentiment,metascore
0,Kotoshiro,"[Action, Casual, Indie, Simulation, Strategy]",Lost Summoner Kitty,Lost Summoner Kitty,http://store.steampowered.com/app/761140/Lost_...,2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",4.49,http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,False,761140.0,Kotoshiro,,
1,"Making Fun, Inc.","[Free to Play, Indie, RPG, Strategy]",Ironbound,Ironbound,http://store.steampowered.com/app/643980/Ironb...,2018-01-04,"[Free to Play, Strategy, Indie, RPG, Card Game...",,http://steamcommunity.com/app/643980/reviews/?...,"[Single-player, Multi-player, Online Multi-Pla...",Free To Play,False,643980.0,Secret Level SRL,Mostly Positive,
2,Poolians.com,"[Casual, Free to Play, Indie, Simulation, Sports]",Real Pool 3D - Poolians,Real Pool 3D - Poolians,http://store.steampowered.com/app/670290/Real_...,2017-07-24,"[Free to Play, Simulation, Sports, Casual, Ind...",,http://steamcommunity.com/app/670290/reviews/?...,"[Single-player, Multi-player, Online Multi-Pla...",Free to Play,False,670290.0,Poolians.com,Mostly Positive,
3,彼岸领域,"[Action, Adventure, Casual]",弹炸人2222,弹炸人2222,http://store.steampowered.com/app/767400/2222/,2017-12-07,"[Action, Adventure, Casual]",0.83,http://steamcommunity.com/app/767400/reviews/?...,[Single-player],0.99,False,767400.0,彼岸领域,,
4,,,Log Challenge,,http://store.steampowered.com/app/773570/Log_C...,,"[Action, Indie, Casual, Sports]",1.79,http://steamcommunity.com/app/773570/reviews/?...,"[Single-player, Full controller support, HTC V...",2.99,False,773570.0,,,


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32135 entries, 0 to 32134
Data columns (total 16 columns):
publisher         24083 non-null object
genres            28852 non-null object
app_name          32133 non-null object
title             30085 non-null object
url               32135 non-null object
release_date      30068 non-null object
tags              31972 non-null object
discount_price    225 non-null float64
reviews_url       32133 non-null object
specs             31465 non-null object
price             30758 non-null object
early_access      32135 non-null bool
id                32133 non-null float64
developer         28836 non-null object
sentiment         24953 non-null object
metascore         2677 non-null object
dtypes: bool(1), float64(2), object(13)
memory usage: 3.7+ MB


## Users/items Data

In [4]:
# Load users/items data
useritems = pd.read_json('../Data/steam/data.json')
useritems.head()

Unnamed: 0,user_id,items_count,steam_id,user_url,items
0,76561197970982479,277,76561197970982480,http://steamcommunity.com/profiles/76561197970...,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
1,js41637,888,76561198035864384,http://steamcommunity.com/id/js41637,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
2,evcentric,137,76561198007712560,http://steamcommunity.com/id/evcentric,"[{'item_id': '1200', 'item_name': 'Red Orchest..."
3,Riot-Punch,328,76561197963445856,http://steamcommunity.com/id/Riot-Punch,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
4,doctr,541,76561198002099488,http://steamcommunity.com/id/doctr,"[{'item_id': '300', 'item_name': 'Day of Defea..."


In [5]:
useritems.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88310 entries, 0 to 88309
Data columns (total 5 columns):
user_id        88310 non-null object
items_count    88310 non-null int64
steam_id       88310 non-null int64
user_url       88310 non-null object
items          88310 non-null object
dtypes: int64(2), object(3)
memory usage: 3.4+ MB


We notice that the `items` column is a list of dictionaries, with a dictionary for each game.

In [6]:
# Preview items column entry for first user
# Restrict to first 2 items in dictionary

useritems['items'][0][0:2]

[{'item_id': '10',
  'item_name': 'Counter-Strike',
  'playtime_forever': 6,
  'playtime_2weeks': 0},
 {'item_id': '20',
  'item_name': 'Team Fortress Classic',
  'playtime_forever': 0,
  'playtime_2weeks': 0}]

In [7]:
# Extract item id
useritems['items'][0][0]['item_id']

'10'

In [8]:
# Get all ids for one user
userzerogameids = []
for index, game in enumerate(useritems['items'][0]):
    userzerogameids.append(useritems['items'][0][index]['item_id'])
userzerogameids[:10]

['10', '20', '30', '40', '50', '60', '70', '130', '300', '240']

In [9]:
# Using list comprehension
gameids = [useritems['items'][0][index]['item_id'] for index, _ in enumerate(useritems['items'][0])]
gameids[:10]

['10', '20', '30', '40', '50', '60', '70', '130', '300', '240']

In [10]:
# Create column with itemDs
useritems['item_ids'] = useritems['items'].apply(lambda x: [x [index]['item_id'] for index, _ in enumerate(x)])

In [11]:
useritems.head()

Unnamed: 0,user_id,items_count,steam_id,user_url,items,item_ids
0,76561197970982479,277,76561197970982480,http://steamcommunity.com/profiles/76561197970...,"[{'item_id': '10', 'item_name': 'Counter-Strik...","[10, 20, 30, 40, 50, 60, 70, 130, 300, 240, 38..."
1,js41637,888,76561198035864384,http://steamcommunity.com/id/js41637,"[{'item_id': '10', 'item_name': 'Counter-Strik...","[10, 80, 100, 300, 30, 40, 60, 240, 280, 360, ..."
2,evcentric,137,76561198007712560,http://steamcommunity.com/id/evcentric,"[{'item_id': '1200', 'item_name': 'Red Orchest...","[1200, 1230, 1280, 1520, 220, 320, 340, 360, 3..."
3,Riot-Punch,328,76561197963445856,http://steamcommunity.com/id/Riot-Punch,"[{'item_id': '10', 'item_name': 'Counter-Strik...","[10, 20, 30, 40, 50, 60, 70, 130, 80, 100, 300..."
4,doctr,541,76561198002099488,http://steamcommunity.com/id/doctr,"[{'item_id': '300', 'item_name': 'Day of Defea...","[300, 20, 50, 70, 130, 10, 30, 40, 60, 80, 100..."


In [12]:
useritems.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88310 entries, 0 to 88309
Data columns (total 6 columns):
user_id        88310 non-null object
items_count    88310 non-null int64
steam_id       88310 non-null int64
user_url       88310 non-null object
items          88310 non-null object
item_ids       88310 non-null object
dtypes: int64(2), object(4)
memory usage: 4.0+ MB


In [13]:
# Get first user's 5 item_ids
useritems['item_ids'][0][:5]

['10', '20', '30', '40', '50']

In [14]:
# list of all ids - includes duplicates
idlist = np.concatenate(useritems['item_ids'])

In [15]:
len(idlist)

5153209

In [16]:
#unique ids
uniqueids = set(idlist)
len(uniqueids)

10978

In [18]:
# Define utility matrix with relevant columns
utilmatrix = useritems[['user_id', 'item_ids']]
utilmatrix.head()

Unnamed: 0,user_id,item_ids
0,76561197970982479,"[10, 20, 30, 40, 50, 60, 70, 130, 300, 240, 38..."
1,js41637,"[10, 80, 100, 300, 30, 40, 60, 240, 280, 360, ..."
2,evcentric,"[1200, 1230, 1280, 1520, 220, 320, 340, 360, 3..."
3,Riot-Punch,"[10, 20, 30, 40, 50, 60, 70, 130, 80, 100, 300..."
4,doctr,"[300, 20, 50, 70, 130, 10, 30, 40, 60, 80, 100..."


In [19]:
lst_col = 'item_ids'
utilmatrix = pd.DataFrame({col:np.repeat(utilmatrix[col].values, utilmatrix[lst_col].str.len())
                              for col in utilmatrix.columns.difference([lst_col])
                            }).assign(**{lst_col:np.concatenate(utilmatrix[lst_col].values)})[utilmatrix.columns.tolist()]

In [20]:
utilmatrix

Unnamed: 0,user_id,item_ids
0,76561197970982479,10
1,76561197970982479,20
2,76561197970982479,30
3,76561197970982479,40
4,76561197970982479,50
...,...,...
5153204,76561198329548331,346330
5153205,76561198329548331,373330
5153206,76561198329548331,388490
5153207,76561198329548331,521570


## Test DF

In [18]:
testdf = useritems[:2]
testdf.head()

Unnamed: 0,user_id,items_count,steam_id,user_url,items
0,76561197970982479,277,76561197970982480,http://steamcommunity.com/profiles/76561197970...,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
1,js41637,888,76561198035864384,http://steamcommunity.com/id/js41637,"[{'item_id': '10', 'item_name': 'Counter-Strik..."


In [36]:
testdf['items'][0][0]

{'item_id': '10',
 'item_name': 'Counter-Strike',
 'playtime_forever': 6,
 'playtime_2weeks': 0}

In [37]:
testdf['items'][0]

[{'item_id': '10',
  'item_name': 'Counter-Strike',
  'playtime_forever': 6,
  'playtime_2weeks': 0},
 {'item_id': '20',
  'item_name': 'Team Fortress Classic',
  'playtime_forever': 0,
  'playtime_2weeks': 0},
 {'item_id': '30',
  'item_name': 'Day of Defeat',
  'playtime_forever': 7,
  'playtime_2weeks': 0},
 {'item_id': '40',
  'item_name': 'Deathmatch Classic',
  'playtime_forever': 0,
  'playtime_2weeks': 0},
 {'item_id': '50',
  'item_name': 'Half-Life: Opposing Force',
  'playtime_forever': 0,
  'playtime_2weeks': 0},
 {'item_id': '60',
  'item_name': 'Ricochet',
  'playtime_forever': 0,
  'playtime_2weeks': 0},
 {'item_id': '70',
  'item_name': 'Half-Life',
  'playtime_forever': 0,
  'playtime_2weeks': 0},
 {'item_id': '130',
  'item_name': 'Half-Life: Blue Shift',
  'playtime_forever': 0,
  'playtime_2weeks': 0},
 {'item_id': '300',
  'item_name': 'Day of Defeat: Source',
  'playtime_forever': 4733,
  'playtime_2weeks': 0},
 {'item_id': '240',
  'item_name': 'Counter-Strike: S

In [46]:
testdf['gameids'] = testdf['items'].apply(lambda x: [x [index]['item_id'] for index, _ in enumerate(x)])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [47]:
testdf.head()

Unnamed: 0,user_id,items_count,steam_id,user_url,items,gameids
0,76561197970982479,277,76561197970982480,http://steamcommunity.com/profiles/76561197970...,"[{'item_id': '10', 'item_name': 'Counter-Strik...","[10, 20, 30, 40, 50, 60, 70, 130, 300, 240, 38..."
1,js41637,888,76561198035864384,http://steamcommunity.com/id/js41637,"[{'item_id': '10', 'item_name': 'Counter-Strik...","[10, 80, 100, 300, 30, 40, 60, 240, 280, 360, ..."


In [89]:
# list of all ids - includes duplicates
testidlist = np.concatenate(testdf['gameids'])

In [90]:
len(testidlist)

1165

In [91]:
#unique ids
testuniqueids = set(testidlist)
len(testuniqueids)

1021

In [120]:
# Define utility matrix with relevant columns
testutilmatrix = testdf[['user_id', 'gameids']]
testutilmatrix.head()

Unnamed: 0,user_id,gameids
0,76561197970982479,"[10, 20, 30, 40, 50, 60, 70, 130, 300, 240, 38..."
1,js41637,"[10, 80, 100, 300, 30, 40, 60, 240, 280, 360, ..."


In [122]:
lst_col = 'gameids'
testutilmatrix = pd.DataFrame({col:np.repeat(testutilmatrix[col].values, testutilmatrix[lst_col].str.len())
                              for col in testutilmatrix.columns.difference([lst_col])
                            }).assign(**{lst_col:np.concatenate(testutilmatrix[lst_col].values)})[testutilmatrix.columns.tolist()]

In [123]:
testutilmatrix 

Unnamed: 0,user_id,gameids
0,76561197970982479,10
1,76561197970982479,20
2,76561197970982479,30
3,76561197970982479,40
4,76561197970982479,50
...,...,...
1160,js41637,473450
1161,js41637,485330
1162,js41637,485890
1163,js41637,491330
