# Steamspy Data Cleaning

*This forms part of a larger series of posts for my [blog](http://nik-davis.github.io) on downloading, processing and analysing data from the Steam store. [See all posts here](http://nik-davis.github.io/tag/steam).*

In [29]:
# view software version information
# (records the interpreter, OS and library versions used for this run)

# http://raw.github.com/jrjohansson/version_information/master/version_information.py
# NOTE(review): running %load_ext when the extension is already loaded prints the
# "already loaded" notice; %reload_ext then reloads it.
%load_ext version_information
%reload_ext version_information

# report the versions of the third-party libraries this notebook imports
%version_information numpy, pandas

The version_information extension is already loaded. To reload it, use:
  %reload_ext version_information


Software,Version
Python,3.7.3 64bit [MSC v.1900 64 bit (AMD64)]
IPython,7.5.0
OS,Windows 10 10.0.18362 SP0
numpy,1.16.3
pandas,0.24.2
Fri Jun 07 17:39:11 2019 GMT Summer Time,Fri Jun 07 17:39:11 2019 GMT Summer Time


In [30]:
# standard library imports
from ast import literal_eval
import itertools
import time
import re

# third-party imports
import numpy as np
import pandas as pd

# customisations
# Use the fully-qualified option name: the abbreviated "max_columns" pattern is
# ambiguous on newer pandas releases (it matches more than one option) and raises.
pd.set_option("display.max_columns", 100)

In [31]:
# Load the raw SteamSpy download and preview the first rows.
raw_steamspy_data = pd.read_csv(filepath_or_buffer='../data/raw/steamspy_data.csv')
raw_steamspy_data.head(5)

Unnamed: 0,appid,name,developer,publisher,score_rank,positive,negative,userscore,owners,average_forever,average_2weeks,median_forever,median_2weeks,price,initialprice,discount,languages,genre,ccu,tags
0,10,Counter-Strike,Valve,Valve,,124534,3339,0,"10,000,000 .. 20,000,000",17612,709,317,26,999.0,999.0,0.0,"English, French, German, Italian, Spanish - Sp...",Action,14923,"{'Action': 2681, 'FPS': 2048, 'Multiplayer': 1..."
1,20,Team Fortress Classic,Valve,Valve,,3318,633,0,"5,000,000 .. 10,000,000",277,15,62,15,499.0,499.0,0.0,"English, French, German, Italian, Spanish - Sp...",Action,87,"{'Action': 208, 'FPS': 188, 'Multiplayer': 172..."
2,30,Day of Defeat,Valve,Valve,,3416,398,0,"5,000,000 .. 10,000,000",187,0,34,0,499.0,499.0,0.0,"English, French, German, Italian, Spanish - Spain",Action,130,"{'FPS': 138, 'World War II': 122, 'Multiplayer..."
3,40,Deathmatch Classic,Valve,Valve,,1273,267,0,"5,000,000 .. 10,000,000",258,0,184,0,499.0,499.0,0.0,"English, French, German, Italian, Spanish - Sp...",Action,4,"{'Action': 85, 'FPS': 71, 'Multiplayer': 58, '..."
4,50,Half-Life: Opposing Force,Gearbox Software,Valve,,5250,288,0,"5,000,000 .. 10,000,000",624,0,415,0,499.0,499.0,0.0,"English, French, German, Korean",Action,71,"{'FPS': 235, 'Action': 211, 'Sci-fi': 166, 'Si..."


In [None]:
raw_steamspy_data.isnull().sum()

In [None]:
# for col in raw_steamspy_data.columns:
#     print('\n\n', col, '\n\n')
#     display(raw_steamspy_data[raw_steamspy_data[col].isnull()].head())

In [None]:
# for col in raw_steamspy_data.columns:
#     print('----------------', col, '----------------')
#     display(raw_steamspy_data[col].value_counts().head())

In [None]:
raw_steamspy_data['tags'].max()

In [None]:
raw_steamspy_data['owners'].value_counts()

In [None]:
# Scratch check: print the first ten powers of ten (10**0 .. 10**9) on one line.
i = 0
while i < 10:
    print(10**i, end=', ')
    i += 1
    
# Powers of ten from 10**0 to 10**7, shown as the cell output.
ml = [10**i for i in range(8)]
ml

In [None]:
raw_steamspy_data['positive'].max()

In [None]:
pd.cut(raw_steamspy_data['positive'], bins=[10**i for i in range(8)]).value_counts()

In [None]:
pd.cut(raw_steamspy_data['negative'], bins=[10**i for i in range(8)]).value_counts()

In [None]:
# Ratio of positive to negative ratings per app.
# A zero negative count would produce inf (or NaN for 0/0); mark those rows as
# missing explicitly so they are excluded on purpose rather than by accident.
negative_nonzero = raw_steamspy_data['negative'].replace(0, np.nan)
div = raw_steamspy_data['positive'] / negative_nonzero

# Open-ended top bin so ratios above 100 are counted instead of silently dropped;
# include_lowest keeps apps whose ratio is exactly 0 (no positive ratings).
pd.cut(div, bins=[0, 1, 5, 10, 100, np.inf], include_lowest=True).value_counts()

In [None]:
raw_steamspy_data['userscore'].value_counts()

In [None]:
# userscore and score_rank are removed from the working frame
drop_score = raw_steamspy_data.drop(columns=['userscore', 'score_rank'])
drop_score.head()

In [None]:
drop_score['average_2weeks'].value_counts().head()

In [None]:
drop_score['median_2weeks'].value_counts().head()

In [None]:
# remove the two-week playtime columns
drop_2weeks = drop_score.drop(columns=['average_2weeks', 'median_2weeks'])
drop_2weeks.head()

In [None]:
drop_2weeks['average_forever'].value_counts().head()

In [None]:
drop_2weeks['median_forever'].value_counts().head()

In [None]:
drop_2weeks['price'].value_counts().head()

In [None]:
drop_2weeks['initialprice'].value_counts().head()

In [None]:
drop_2weeks['discount'].value_counts().head()

In [None]:
drop_2weeks[drop_2weeks['initialprice'].isnull()].head()

In [None]:
drop_2weeks[drop_2weeks['price'].isnull()]

In [None]:
drop_2weeks[drop_2weeks['initialprice'] < drop_2weeks['price']]

In [None]:
# keep only rows that have a price, then recount the remaining nulls per column
drop_price_nulls = drop_2weeks.dropna(subset=['price']).copy()
drop_price_nulls.isna().sum()

In [None]:
drop_price_nulls[drop_price_nulls['name'].isnull()]

In [None]:
# keep only rows that have a name, then recount the remaining nulls per column
drop_name_nulls = drop_price_nulls.dropna(subset=['name']).copy()
drop_name_nulls.isna().sum()

In [None]:
drop_name_nulls[drop_name_nulls['genre'].isnull()].head()

In [None]:
drop_name_nulls[drop_name_nulls['languages'].isnull()]

In [None]:
# a row is kept only when both languages and genre are present
drop_lang_genre_nulls = drop_name_nulls.dropna(subset=['languages', 'genre'], how='any')
drop_lang_genre_nulls.isna().sum()

In [None]:
drop_lang_genre_nulls[drop_lang_genre_nulls['name'].str.contains(r'\bdemo\b', flags=re.I)].head()

In [None]:
# exclude rows whose name contains the standalone word "demo" (case-insensitive)
drop_demos = drop_lang_genre_nulls.loc[
    ~drop_lang_genre_nulls['name'].str.contains(r'\bdemo\b', flags=re.IGNORECASE)
]

In [None]:
drop_demos[drop_demos['name'].str.contains(r'\bbeta\b', flags=re.I)].head()

In [None]:
# exclude rows whose name contains the standalone word "beta" (case-insensitive)
drop_betas = drop_demos.loc[
    ~drop_demos['name'].str.contains(r'\bbeta\b', flags=re.IGNORECASE)
].copy()
drop_betas.isna().sum()

In [None]:
drop_betas[drop_betas['developer'].isnull()]

In [None]:
drop_betas[drop_betas['publisher'].isnull()]

In [None]:
drop_betas[((drop_betas['publisher'].isnull()) | (drop_betas['developer'].isnull())) & (drop_betas['average_forever'] > 200)]

In [None]:
# number of rows where publisher and developer are both missing
print(len(drop_betas[drop_betas[['publisher', 'developer']].isnull().all(axis=1)]))

# preview of those rows
drop_betas[drop_betas[['publisher', 'developer']].isnull().all(axis=1)].head()

In [None]:
# drop a row only when publisher and developer are both missing
drop_dev_pub = drop_betas.dropna(subset=['publisher', 'developer'], how='all').copy()
drop_dev_pub.isna().sum()

In [None]:
# whichever of developer / publisher is still missing becomes the placeholder 'unknown'
drop_dev_pub['developer'] = drop_dev_pub['developer'].fillna('unknown')
drop_dev_pub['publisher'] = drop_dev_pub['publisher'].fillna('unknown')
drop_dev_pub.isna().sum()

In [None]:
# remove the ccu column
drop_ccu = drop_dev_pub.drop(columns='ccu')
drop_ccu.head()

In [None]:
drop_ccu.shape

In [None]:
# handle tags
tags = drop_ccu['tags']

# collects every distinct tag name as a key (the value 1 is just a placeholder)
tags_dict = {}

# Series.items() works on old and new pandas; iteritems() was removed in pandas 2.0
for i, row in tags.items():
    eval_row = literal_eval(row)

    # rows without tags parse to something other than a dict and are skipped
    if isinstance(eval_row, dict):
        # reuse the parsed value instead of parsing the string a second time
        for key in eval_row:
            tags_dict[key] = 1

tags_dict

In [None]:
# Positional lookup: after the row filtering above the index labels are no longer
# guaranteed to be contiguous, so the label 0 may not exist.
for k in literal_eval(tags.iloc[0]).keys():
    print(k)

In [None]:
# Temporarily widen the column display so the tag strings are not truncated.
with pd.option_context("display.max_colwidth", 500):
    display(drop_ccu['tags'].head())

In [None]:
# Parse the first row once (by position, since index labels may have gaps after
# filtering) and show its first five tag names followed by the full mapping.
first_tags = literal_eval(drop_ccu['tags'].iloc[0])

print(
    list(first_tags.keys())[:5],
    '\n',
    first_tags
)

In [None]:
def parse_tags(x):
    """Parse one raw ``tags`` cell into a dict.

    Parameters
    ----------
    x : str
        String form of a Python literal, e.g. ``"{'Action': 2681, 'FPS': 2048}"``.

    Returns
    -------
    dict
        The parsed mapping when the literal is a dict; an empty dict for every
        other case (a list literal, any other literal type, or a non-string
        input such as NaN). Always returning a dict keeps callers that go on to
        call ``.keys()`` on the result from failing on ``None``.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``literal_eval`` when ``x`` is a string that is not a
        valid Python literal.
    """
    # missing values reach here as float NaN, which literal_eval cannot parse
    if not isinstance(x, str):
        return {}

    x_eval = literal_eval(x)

    if isinstance(x_eval, dict):
        return x_eval

    if not isinstance(x_eval, list):
        # unexpected literal type: report it, but still hand back a usable value
        print(x_eval, x)

    return {}

# Work on a copy of the three columns needed and turn each tags string into a dict.
tag_data = drop_ccu.loc[:, ['appid', 'genre', 'tags']].copy()
tag_data['tags'] = tag_data['tags'].apply(parse_tags)

# every distinct tag name that appears in any row
tag_names = tag_data['tags'].apply(lambda x: x.keys())
cols = set(itertools.chain.from_iterable(tag_names))

# one column per tag, named in snake_case, holding that row's value for the tag (0 if absent)
for col in sorted(cols):
    col_name = col.lower().replace(' ', '_').replace('-', '_').replace("'", "")
    tag_data[col_name] = tag_data['tags'].apply(lambda x: x.get(col, 0))

tag_data = tag_data.drop(columns='tags')

tag_data.head()

In [None]:
# column totals over everything except genre
sums = tag_data.drop(columns='genre').sum()

# names of the columns whose total exceeds 50000
cols = sums.loc[sums.gt(50000)].index

tag_data[cols].head()

In [None]:
tag_data.columns[tag_data.max() == 0]

In [None]:
tag_data.iloc[:, 2:].max(axis=1).head()

In [None]:
tag_data['genre'].value_counts()

In [None]:
# attach the per-tag columns to the cleaned frame, matching rows on appid
tag_data_merge = tag_data.drop(columns='genre')
steamspy_data = pd.merge(drop_ccu, tag_data_merge, how='inner', on='appid')

steamspy_data.head()

In [None]:
steamspy_data.isnull().sum()

In [164]:
def _parse_tag_dict(raw):
    """Parse a raw ``tags`` cell into a dict; return ``{}`` when it holds no tag mapping."""
    # missing values arrive as float NaN, which literal_eval cannot parse
    if not isinstance(raw, str):
        return {}

    parsed = literal_eval(raw)
    return parsed if isinstance(parsed, dict) else {}


def _export_tag_table(df, path='../data/exports/steamspy_tag_data.csv'):
    """Write a CSV with ``appid`` plus one column per tag (0 where a row lacks the tag)."""
    tag_dicts = df['tags'].apply(_parse_tag_dict)
    tag_names = sorted(set(itertools.chain.from_iterable(tag_dicts)))

    # Build every column first and create the frame once; inserting several
    # hundred columns one at a time is slow and fragments the frame.
    tag_columns = {}
    for tag in tag_names:
        col_name = tag.lower().replace(' ', '_').replace('-', '_').replace("'", "")
        tag_columns[col_name] = [tags.get(tag, 0) for tags in tag_dicts]

    tag_data = pd.DataFrame(tag_columns, index=df.index)
    tag_data.insert(0, 'appid', df['appid'])

    tag_data.to_csv(path, index=False)
    print("Exported tag data to '{}'".format(path))


def process_tags(df, export=False):
    """Replace the raw ``tags`` column with its first three tag names.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``appid`` column and a ``tags`` column of dict/list
        literals stored as strings. The input frame is not modified.
    export : bool, default False
        When True, also write the full per-tag table to
        ``'../data/exports/steamspy_tag_data.csv'``.

    Returns
    -------
    pandas.DataFrame
        An independent copy in which ``tags`` is a ``';'``-joined string of the
        first three keys of each row's tag dict (in the order they appear in
        the string). Rows with no tags are removed.
    """
    df = df.copy()

    if export:
        _export_tag_table(df)

    def top_tags(raw):
        parsed = _parse_tag_dict(raw)

        # no tags (list literal, empty dict or missing value) -> treat as null
        if not parsed:
            return np.nan

        return ';'.join(list(parsed.keys())[:3])

    df['tags'] = df['tags'].apply(top_tags)

    # rows with null tags seem to be superseded by newer release, so remove (e.g. dead island)
    # .copy() so callers can assign new columns without a SettingWithCopyWarning
    df = df[df['tags'].notnull()].copy()

    return df


def process(df):
    """Clean the raw SteamSpy frame down to the columns kept for export.

    Steps, in order: drop rows with a missing (or literal ``'none'``) name,
    a missing developer, or missing languages; drop the columns not carried
    forward; reduce ``tags`` via ``process_tags`` (called with ``export=True``,
    so the full tag table is written to disk as a side effect); and rewrite
    ``owners`` from ``"10,000,000 .. 20,000,000"`` to ``"10000000-20000000"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw SteamSpy data. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame.
    """
    df = df.copy()

    # handle missing values
    df = df[(df['name'].notnull()) & (df['name'] != 'none')]
    df = df[df['developer'].notnull()]
    df = df[df['languages'].notnull()]

    df = df.drop(['genre', 'developer', 'publisher', 'score_rank', 'userscore', 'average_2weeks', 'median_2weeks', 'price', 'initialprice', 'discount', 'ccu'], axis=1)

    # keep the first three tags of each row (also exports the full tag table)
    df = process_tags(df, export=True)

    # regex=False: ' .. ' must be matched literally; as a regex each '.' would
    # match any character, and the default for `regex` differs between pandas
    # versions. assign() builds a new frame, so nothing is written to a slice.
    df = df.assign(
        owners=df['owners']
        .str.replace(',', '', regex=False)
        .str.replace(' .. ', '-', regex=False)
    )

    # could fill genres like this
    # df.loc[df['genre'].isnull(), 'genre'] = df.loc[df['genre'].isnull(), 'tags'].apply(lambda x: x.split(';')[0])

    # still some duplicates with same name but different appid

    return df


steamspy_data = process(raw_steamspy_data)
steamspy_data.head()

Exported tag data to '../data/exports/steamspy_tag_data.csv'


Unnamed: 0,appid,name,positive,negative,owners,average_forever,median_forever,languages,tags
0,10,Counter-Strike,124534,3339,10000000-20000000,17612,317,"English, French, German, Italian, Spanish - Sp...",Action;FPS;Multiplayer
1,20,Team Fortress Classic,3318,633,5000000-10000000,277,62,"English, French, German, Italian, Spanish - Sp...",Action;FPS;Multiplayer
2,30,Day of Defeat,3416,398,5000000-10000000,187,34,"English, French, German, Italian, Spanish - Spain",FPS;World War II;Multiplayer
3,40,Deathmatch Classic,1273,267,5000000-10000000,258,184,"English, French, German, Italian, Spanish - Sp...",Action;FPS;Multiplayer
4,50,Half-Life: Opposing Force,5250,288,5000000-10000000,624,415,"English, French, German, Korean",FPS;Action;Sci-fi


In [166]:
# tags = steamspy_data['tags'].apply(lambda x: literal_eval(x))

# tag_counts = {}

# def count_tags(x):
    
#     if isinstance(x, dict):
#         for key in x.keys():
#             tag_counts[key] = tag_counts.get(key, 0) + x[key]
    
#     return x

# tags.apply(count_tags)
# tag_series = pd.Series(tag_counts)

# top_tag_list = list(tag_series[tag_series > 100000].index)

In [165]:
# set(itertools.chain(*steamspy_data['tags'].str.split(';')))

In [167]:
steamspy_data['owners'].value_counts()

0-20000                19543
20000-50000             3191
50000-100000            1767
100000-200000           1458
200000-500000           1347
500000-1000000           547
1000000-2000000          308
2000000-5000000          209
5000000-10000000          49
10000000-20000000         22
20000000-50000000          3
50000000-100000000         2
100000000-200000000        1
Name: owners, dtype: int64

In [168]:
steamspy_data.to_csv('../data/exports/steamspy_clean.csv', index=False)

In [170]:
# cleaned Steam store data produced earlier in the series
steam_data = pd.read_csv('../data/exports/steam_data_clean.csv')

# Inner join on the app id; columns present in both frames keep the store
# version under the plain name and the SteamSpy version gets a suffix.
merged = pd.merge(
    steam_data,
    steamspy_data,
    how='inner',
    left_on='steam_appid',
    right_on='appid',
    suffixes=('', '_steamspy'),
)
merged.head()

Unnamed: 0,name,steam_appid,required_age,platforms,categories,genres,achievements,release_date,price,english,developer,publisher,appid,name_steamspy,positive,negative,owners,average_forever,median_forever,languages,tags
0,Counter-Strike,10,0,windows;mac;linux,Multi-player;Online Multi-Player;Local Multi-P...,Action,0,2000-11-01,7.19,1,Valve,Valve,10,Counter-Strike,124534,3339,10000000-20000000,17612,317,"English, French, German, Italian, Spanish - Sp...",Action;FPS;Multiplayer
1,Team Fortress Classic,20,0,windows;mac;linux,Multi-player;Online Multi-Player;Local Multi-P...,Action,0,1999-04-01,3.99,1,Valve,Valve,20,Team Fortress Classic,3318,633,5000000-10000000,277,62,"English, French, German, Italian, Spanish - Sp...",Action;FPS;Multiplayer
2,Day of Defeat,30,0,windows;mac;linux,Multi-player;Valve Anti-Cheat enabled,Action,0,2003-05-01,3.99,1,Valve,Valve,30,Day of Defeat,3416,398,5000000-10000000,187,34,"English, French, German, Italian, Spanish - Spain",FPS;World War II;Multiplayer
3,Deathmatch Classic,40,0,windows;mac;linux,Multi-player;Online Multi-Player;Local Multi-P...,Action,0,2001-06-01,3.99,1,Valve,Valve,40,Deathmatch Classic,1273,267,5000000-10000000,258,184,"English, French, German, Italian, Spanish - Sp...",Action;FPS;Multiplayer
4,Half-Life: Opposing Force,50,0,windows;mac;linux,Single-player;Multi-player;Valve Anti-Cheat en...,Action,0,1999-11-01,3.99,1,Gearbox Software,Valve,50,Half-Life: Opposing Force,5250,288,5000000-10000000,624,415,"English, French, German, Korean",FPS;Action;Sci-fi


In [171]:
# Drop the redundant columns, put the remaining ones in their final order and
# give the SteamSpy-derived fields their published names.
steam_clean = (
    merged
    .drop(columns=['name_steamspy', 'languages', 'steam_appid'])
    .loc[:, [
        'appid',
        'name',
        'release_date',
        'english',
        'developer',
        'publisher',
        'platforms',
        'required_age',
        'categories',
        'genres',
        'tags',
        'achievements',
        'positive',
        'negative',
        'average_forever',
        'median_forever',
        'owners',
        'price',
    ]]
    .rename(columns={
        'tags': 'steamspy_tags',
        'positive': 'positive_ratings',
        'negative': 'negative_ratings',
        'average_forever': 'average_playtime',
        'median_forever': 'median_playtime',
    })
)

steam_clean.head()

Unnamed: 0,appid,name,release_date,english,developer,publisher,platforms,required_age,categories,genres,steamspy_tags,achievements,positive_ratings,negative_ratings,average_playtime,median_playtime,owners,price
0,10,Counter-Strike,2000-11-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,124534,3339,17612,317,10000000-20000000,7.19
1,20,Team Fortress Classic,1999-04-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,3318,633,277,62,5000000-10000000,3.99
2,30,Day of Defeat,2003-05-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Valve Anti-Cheat enabled,Action,FPS;World War II;Multiplayer,0,3416,398,187,34,5000000-10000000,3.99
3,40,Deathmatch Classic,2001-06-01,1,Valve,Valve,windows;mac;linux,0,Multi-player;Online Multi-Player;Local Multi-P...,Action,Action;FPS;Multiplayer,0,1273,267,258,184,5000000-10000000,3.99
4,50,Half-Life: Opposing Force,1999-11-01,1,Gearbox Software,Valve,windows;mac;linux,0,Single-player;Multi-player;Valve Anti-Cheat en...,Action,FPS;Action;Sci-fi,0,5250,288,624,415,5000000-10000000,3.99


In [172]:
steam_clean.to_csv('../data/steam_clean.csv', index=False)