# Steamspy Data Cleaning

*This forms part of a larger series of posts for my [blog](http://nik-davis.github.io) on downloading, processing and analysing data from the Steam store. [See all posts here](http://nik-davis.github.io/tags/steam).*

In [1]:
# view software version information
# (records the interpreter and library versions this notebook was last run with)

# http://raw.github.com/jrjohansson/version_information/master/version_information.py
# ^ source of the `version_information` IPython extension used below
%load_ext version_information
%reload_ext version_information

# render a version table for the listed packages
%version_information numpy, pandas

Software,Version
Python,3.7.3 64bit [MSC v.1900 64 bit (AMD64)]
IPython,7.5.0
OS,Windows 10 10.0.17763 SP0
numpy,1.16.3
pandas,0.24.2
Mon Jun 03 15:35:19 2019 GMT Summer Time,Mon Jun 03 15:35:19 2019 GMT Summer Time


In [3]:
# standard library imports
from ast import literal_eval
import itertools
import time
import re

# third-party imports
import numpy as np
import pandas as pd

# customisations
# Show up to 100 columns when rendering wide frames. The fully-qualified
# option name is used because the bare "max_columns" pattern is ambiguous on
# newer pandas releases (it also matches "styler.render.max_columns") and
# raises an OptionError there.
pd.set_option("display.max_columns", 100)

In [4]:
# Load the raw SteamSpy export and preview the first five rows.
raw_steamspy_data = pd.read_csv(filepath_or_buffer='../data/raw/steamspy_data.csv')
raw_steamspy_data.head(5)

Unnamed: 0,appid,name,developer,publisher,score_rank,positive,negative,userscore,owners,average_forever,average_2weeks,median_forever,median_2weeks,price,initialprice,discount,languages,genre,ccu,tags
0,10,Counter-Strike,Valve,Valve,,124534,3339,0,"10,000,000 .. 20,000,000",17612,709,317,26,999.0,999.0,0.0,"English, French, German, Italian, Spanish - Sp...",Action,14923,"{'Action': 2681, 'FPS': 2048, 'Multiplayer': 1..."
1,20,Team Fortress Classic,Valve,Valve,,3318,633,0,"5,000,000 .. 10,000,000",277,15,62,15,499.0,499.0,0.0,"English, French, German, Italian, Spanish - Sp...",Action,87,"{'Action': 208, 'FPS': 188, 'Multiplayer': 172..."
2,30,Day of Defeat,Valve,Valve,,3416,398,0,"5,000,000 .. 10,000,000",187,0,34,0,499.0,499.0,0.0,"English, French, German, Italian, Spanish - Spain",Action,130,"{'FPS': 138, 'World War II': 122, 'Multiplayer..."
3,40,Deathmatch Classic,Valve,Valve,,1273,267,0,"5,000,000 .. 10,000,000",258,0,184,0,499.0,499.0,0.0,"English, French, German, Italian, Spanish - Sp...",Action,4,"{'Action': 85, 'FPS': 71, 'Multiplayer': 58, '..."
4,50,Half-Life: Opposing Force,Gearbox Software,Valve,,5250,288,0,"5,000,000 .. 10,000,000",624,0,415,0,499.0,499.0,0.0,"English, French, German, Korean",Action,71,"{'FPS': 235, 'Action': 211, 'Sci-fi': 166, 'Si..."


In [4]:
# Number of missing values in each column of the raw data.
raw_steamspy_data.isna().sum()

appid                  0
name                   5
developer            197
publisher            280
score_rank         29177
positive               0
negative               0
userscore              0
owners                 0
average_forever        0
average_2weeks         0
median_forever         0
median_2weeks          0
price                 29
initialprice          22
discount              22
languages             94
genre                152
ccu                    0
tags                   0
dtype: int64

In [5]:
# For every column, show up to five rows in which that column is missing.
# Columns without missing values render as an empty frame.
for column_name in raw_steamspy_data.columns:
    print('\n\n', column_name, '\n\n')
    rows_missing_value = raw_steamspy_data.loc[raw_steamspy_data[column_name].isna()]
    display(rows_missing_value.head())



 appid 




Unnamed: 0,appid,name,developer,publisher,score_rank,positive,negative,userscore,owners,average_forever,average_2weeks,median_forever,median_2weeks,price,initialprice,discount,languages,genre,ccu,tags




 name 




Unnamed: 0,appid,name,developer,publisher,score_rank,positive,negative,userscore,owners,average_forever,average_2weeks,median_forever,median_2weeks,price,initialprice,discount,languages,genre,ccu,tags
481,17760,,,,,0,0,0,"0 .. 20,000",0,0,0,0,0.0,0.0,0.0,,,0,[]
7235,396420,,,,,22,9,0,"20,000 .. 50,000",0,0,0,0,0.0,0.0,0.0,,,0,"{'Free to Play': 24, 'Action': 21, 'Indie': 21..."
9553,460250,,Jeroen Wimmers,Jeroen Wimmers,,44,4,0,"0 .. 20,000",0,0,0,0,899.0,899.0,0.0,"English, French, Italian, German, Spanish - Sp...","Casual, Indie",0,"{'Indie': 34, 'Casual': 24, 'Puzzle': 15, 'Min..."
22244,806160,,Paleno Games,Paleno Games,,4,13,0,"0 .. 20,000",0,0,0,0,99.0,99.0,0.0,"English, French, Italian, German, Spanish - Sp...","Action, Adventure, Casual, Indie",0,"{'Action': 22, 'Indie': 22, 'Casual': 21, 'Adv..."
27324,965340,,2nd Studio,2nd Studio,,31,1,0,"0 .. 20,000",0,0,0,0,199.0,199.0,0.0,"English, German, Danish, Japanese, Russian, Si...","Action, Indie, Simulation",0,"{'Indie': 32, 'Sexual Content': 31, 'Action': ..."




 developer 




Unnamed: 0,appid,name,developer,publisher,score_rank,positive,negative,userscore,owners,average_forever,average_2weeks,median_forever,median_2weeks,price,initialprice,discount,languages,genre,ccu,tags
26,852,ValveTestApp852,,,,0,0,0,"0 .. 20,000",0,0,0,0,0.0,0.0,0.0,,,0,[]
147,4330,Star Trek: DAC - Demo,,,,0,0,0,"0 .. 20,000",0,0,0,0,0.0,0.0,0.0,,,0,[]
256,8740,Puzzlegeddon,,,,0,0,0,"0 .. 20,000",0,0,0,0,0.0,0.0,0.0,,,1,[]
264,8955,Borderlands DLC: Claptrap's New Robot Revolution,,,,0,0,0,"0 .. 20,000",0,0,0,0,0.0,0.0,0.0,,,0,[]
285,9730,Tycoon City: New York,,Retroism,,58,102,0,"20,000 .. 50,000",0,0,0,0,999.0,999.0,0.0,English,,0,"{'Simulation': 34, 'Management': 18, 'City Bui..."




 publisher 




Unnamed: 0,appid,name,developer,publisher,score_rank,positive,negative,userscore,owners,average_forever,average_2weeks,median_forever,median_2weeks,price,initialprice,discount,languages,genre,ccu,tags
26,852,ValveTestApp852,,,,0,0,0,"0 .. 20,000",0,0,0,0,0.0,0.0,0.0,,,0,[]
67,2540,RIP - Trilogy,Elephant Games,,,199,131,0,"100,000 .. 200,000",1,0,2,0,499.0,499.0,0.0,English,"Casual, Indie",0,"{'Indie': 38, 'Casual': 35, 'Arcade': 14, 'Sho..."
68,2570,Vigil: Blood Bitterness,Freegamer,,,29,108,0,"50,000 .. 100,000",1,0,2,0,0.0,0.0,0.0,English,"Indie, RPG",0,"{'Indie': 27, 'RPG': 22, 'Puzzle': 7, 'Singlep..."
147,4330,Star Trek: DAC - Demo,,,,0,0,0,"0 .. 20,000",0,0,0,0,0.0,0.0,0.0,,,0,[]
190,6600,Bullet Candy,R C Knight,,,39,32,0,"50,000 .. 100,000",0,0,0,0,399.0,399.0,0.0,English,"Casual, Indie",0,"{'Indie': 27, 'Casual': 23, 'Twin Stick Shoote..."




 score_rank 




Unnamed: 0,appid,name,developer,publisher,score_rank,positive,negative,userscore,owners,average_forever,average_2weeks,median_forever,median_2weeks,price,initialprice,discount,languages,genre,ccu,tags
0,10,Counter-Strike,Valve,Valve,,124534,3339,0,"10,000,000 .. 20,000,000",17612,709,317,26,999.0,999.0,0.0,"English, French, German, Italian, Spanish - Sp...",Action,14923,"{'Action': 2681, 'FPS': 2048, 'Multiplayer': 1..."
1,20,Team Fortress Classic,Valve,Valve,,3318,633,0,"5,000,000 .. 10,000,000",277,15,62,15,499.0,499.0,0.0,"English, French, German, Italian, Spanish - Sp...",Action,87,"{'Action': 208, 'FPS': 188, 'Multiplayer': 172..."
2,30,Day of Defeat,Valve,Valve,,3416,398,0,"5,000,000 .. 10,000,000",187,0,34,0,499.0,499.0,0.0,"English, French, German, Italian, Spanish - Spain",Action,130,"{'FPS': 138, 'World War II': 122, 'Multiplayer..."
3,40,Deathmatch Classic,Valve,Valve,,1273,267,0,"5,000,000 .. 10,000,000",258,0,184,0,499.0,499.0,0.0,"English, French, German, Italian, Spanish - Sp...",Action,4,"{'Action': 85, 'FPS': 71, 'Multiplayer': 58, '..."
4,50,Half-Life: Opposing Force,Gearbox Software,Valve,,5250,288,0,"5,000,000 .. 10,000,000",624,0,415,0,499.0,499.0,0.0,"English, French, German, Korean",Action,71,"{'FPS': 235, 'Action': 211, 'Sci-fi': 166, 'Si..."




 positive 




Unnamed: 0,appid,name,developer,publisher,score_rank,positive,negative,userscore,owners,average_forever,average_2weeks,median_forever,median_2weeks,price,initialprice,discount,languages,genre,ccu,tags




 negative 




Unnamed: 0,appid,name,developer,publisher,score_rank,positive,negative,userscore,owners,average_forever,average_2weeks,median_forever,median_2weeks,price,initialprice,discount,languages,genre,ccu,tags




 userscore 




Unnamed: 0,appid,name,developer,publisher,score_rank,positive,negative,userscore,owners,average_forever,average_2weeks,median_forever,median_2weeks,price,initialprice,discount,languages,genre,ccu,tags




 owners 




Unnamed: 0,appid,name,developer,publisher,score_rank,positive,negative,userscore,owners,average_forever,average_2weeks,median_forever,median_2weeks,price,initialprice,discount,languages,genre,ccu,tags




 average_forever 




Unnamed: 0,appid,name,developer,publisher,score_rank,positive,negative,userscore,owners,average_forever,average_2weeks,median_forever,median_2weeks,price,initialprice,discount,languages,genre,ccu,tags




 average_2weeks 




Unnamed: 0,appid,name,developer,publisher,score_rank,positive,negative,userscore,owners,average_forever,average_2weeks,median_forever,median_2weeks,price,initialprice,discount,languages,genre,ccu,tags




 median_forever 




Unnamed: 0,appid,name,developer,publisher,score_rank,positive,negative,userscore,owners,average_forever,average_2weeks,median_forever,median_2weeks,price,initialprice,discount,languages,genre,ccu,tags




 median_2weeks 




Unnamed: 0,appid,name,developer,publisher,score_rank,positive,negative,userscore,owners,average_forever,average_2weeks,median_forever,median_2weeks,price,initialprice,discount,languages,genre,ccu,tags




 price 




Unnamed: 0,appid,name,developer,publisher,score_rank,positive,negative,userscore,owners,average_forever,average_2weeks,median_forever,median_2weeks,price,initialprice,discount,languages,genre,ccu,tags
639,29017,Blood Bowl 2 - Review,,,,0,0,0,"0 .. 20,000",0,0,0,0,,0.0,0.0,,,0,[]
2056,239490,America's Army: Proving Grounds Beta (Closed),,,,0,0,0,"0 .. 20,000",0,0,0,0,,0.0,0.0,,,0,[]
4261,321210,Blade Symphony Beta,,,,0,0,0,"0 .. 20,000",0,0,0,0,,,,,,0,[]
7076,391920,Ether One Redux,,,,0,0,0,"0 .. 20,000",0,0,0,0,,0.0,0.0,,,0,[]
7248,396740,Blood Bowl 2 - Preview,,,,0,0,0,"0 .. 20,000",0,0,0,0,,0.0,0.0,,,0,[]




 initialprice 




Unnamed: 0,appid,name,developer,publisher,score_rank,positive,negative,userscore,owners,average_forever,average_2weeks,median_forever,median_2weeks,price,initialprice,discount,languages,genre,ccu,tags
4261,321210,Blade Symphony Beta,,,,0,0,0,"0 .. 20,000",0,0,0,0,,,,,,0,[]
8774,439400,Legends of Callasia Demo,,,,0,0,0,"0 .. 20,000",0,0,0,0,,,,,,0,[]
10737,497960,Legends of Callasia Demo,,,,0,0,0,"0 .. 20,000",0,0,0,0,,,,,,0,[]
10790,499450,The Witcher 3: Wild Hunt Game of the Year Edition,,,,0,0,0,"0 .. 20,000",0,0,0,0,,,,,,0,[]
12076,530940,BIOHAZARD 7 resident evil グロテスクVer.,,,,0,0,0,"0 .. 20,000",0,0,0,0,,,,,,0,[]




 discount 




Unnamed: 0,appid,name,developer,publisher,score_rank,positive,negative,userscore,owners,average_forever,average_2weeks,median_forever,median_2weeks,price,initialprice,discount,languages,genre,ccu,tags
4261,321210,Blade Symphony Beta,,,,0,0,0,"0 .. 20,000",0,0,0,0,,,,,,0,[]
8774,439400,Legends of Callasia Demo,,,,0,0,0,"0 .. 20,000",0,0,0,0,,,,,,0,[]
10737,497960,Legends of Callasia Demo,,,,0,0,0,"0 .. 20,000",0,0,0,0,,,,,,0,[]
10790,499450,The Witcher 3: Wild Hunt Game of the Year Edition,,,,0,0,0,"0 .. 20,000",0,0,0,0,,,,,,0,[]
12076,530940,BIOHAZARD 7 resident evil グロテスクVer.,,,,0,0,0,"0 .. 20,000",0,0,0,0,,,,,,0,[]




 languages 




Unnamed: 0,appid,name,developer,publisher,score_rank,positive,negative,userscore,owners,average_forever,average_2weeks,median_forever,median_2weeks,price,initialprice,discount,languages,genre,ccu,tags
26,852,ValveTestApp852,,,,0,0,0,"0 .. 20,000",0,0,0,0,0.0,0.0,0.0,,,0,[]
147,4330,Star Trek: DAC - Demo,,,,0,0,0,"0 .. 20,000",0,0,0,0,0.0,0.0,0.0,,,0,[]
249,8350,Strong Bad's Cool Game for Attractive People: ...,Telltale Games,Telltale Games,,12,5,0,"0 .. 20,000",0,0,0,0,0.0,0.0,0.0,,Adventure,0,{'Adventure': 20}
250,8360,Strong Bad's Cool Game for Attractive People: ...,Telltale Games,Telltale Games,,11,6,0,"0 .. 20,000",0,0,0,0,0.0,0.0,0.0,,Adventure,0,{'Adventure': 21}
251,8370,Strong Bad's Cool Game for Attractive People: ...,Telltale Games,Telltale Games,,11,5,0,"0 .. 20,000",0,0,0,0,0.0,0.0,0.0,,Adventure,0,{'Adventure': 20}




 genre 




Unnamed: 0,appid,name,developer,publisher,score_rank,positive,negative,userscore,owners,average_forever,average_2weeks,median_forever,median_2weeks,price,initialprice,discount,languages,genre,ccu,tags
26,852,ValveTestApp852,,,,0,0,0,"0 .. 20,000",0,0,0,0,0.0,0.0,0.0,,,0,[]
147,4330,Star Trek: DAC - Demo,,,,0,0,0,"0 .. 20,000",0,0,0,0,0.0,0.0,0.0,,,0,[]
256,8740,Puzzlegeddon,,,,0,0,0,"0 .. 20,000",0,0,0,0,0.0,0.0,0.0,,,1,[]
264,8955,Borderlands DLC: Claptrap's New Robot Revolution,,,,0,0,0,"0 .. 20,000",0,0,0,0,0.0,0.0,0.0,,,0,[]
285,9730,Tycoon City: New York,,Retroism,,58,102,0,"20,000 .. 50,000",0,0,0,0,999.0,999.0,0.0,English,,0,"{'Simulation': 34, 'Management': 18, 'City Bui..."




 ccu 




Unnamed: 0,appid,name,developer,publisher,score_rank,positive,negative,userscore,owners,average_forever,average_2weeks,median_forever,median_2weeks,price,initialprice,discount,languages,genre,ccu,tags




 tags 




Unnamed: 0,appid,name,developer,publisher,score_rank,positive,negative,userscore,owners,average_forever,average_2weeks,median_forever,median_2weeks,price,initialprice,discount,languages,genre,ccu,tags


In [9]:
# For every column, show its five most frequent values under a banner line.
for col, column_values in raw_steamspy_data.items():
    print('----------------', col, '----------------')
    display(column_values.value_counts().head())

---------------- appid ----------------


397310    1
423620    1
31920     1
857800    1
730830    1
Name: appid, dtype: int64

---------------- name ----------------


Fruit Sudoku               4
SiN Episodes: Emergence    3
SQR                        3
none                       3
Crazy Machines 1.5         3
Name: name, dtype: int64

---------------- developer ----------------


Choice of Games               95
KOEI TECMO GAMES CO., LTD.    72
Ripknot Systems               62
Laush Dmitriy Sergeevich      53
Telltale Games                52
Name: developer, dtype: int64

---------------- publisher ----------------


Big Fish Games    231
Strategy First    139
Ubisoft           120
Square Enix       108
THQ Nordic        106
Name: publisher, dtype: int64

---------------- score_rank ----------------


100.0    12
97.0     11
96.0     11
95.0      7
98.0      7
Name: score_rank, dtype: int64

---------------- positive ----------------


1    1592
0    1360
2    1222
3    1166
4     949
Name: positive, dtype: int64

---------------- negative ----------------


0    3708
1    2905
2    2004
3    1589
4    1233
Name: negative, dtype: int64

---------------- userscore ----------------


0      29177
100        4
84         4
95         3
55         3
Name: userscore, dtype: int64

---------------- owners ----------------


0 .. 20,000           20234
20,000 .. 50,000       3218
50,000 .. 100,000      1785
100,000 .. 200,000     1471
200,000 .. 500,000     1373
Name: owners, dtype: int64

---------------- average_forever ----------------


0    22658
1      173
3       79
2       62
9       62
Name: average_forever, dtype: int64

---------------- average_2weeks ----------------


0      28526
1         37
7         33
273       22
8         12
Name: average_2weeks, dtype: int64

---------------- median_forever ----------------


0    22658
1      170
3       77
2       58
9       54
Name: median_forever, dtype: int64

---------------- median_2weeks ----------------


0      28526
1         37
7         33
273       22
8         13
Name: median_2weeks, dtype: int64

---------------- price ----------------


999.0    3878
0.0      3652
499.0    3399
99.0     2928
199.0    1939
Name: price, dtype: int64

---------------- initialprice ----------------


999.0    4038
0.0      3659
499.0    3547
99.0     3098
199.0    2017
Name: initialprice, dtype: int64

---------------- discount ----------------


0.0     27784
51.0      182
75.0      174
50.0      173
90.0      115
Name: discount, dtype: int64

---------------- languages ----------------


English                                              16330
English, Russian                                      1085
English, Simplified Chinese                            503
English, Japanese                                      463
English, French, Italian, German, Spanish - Spain      391
Name: languages, dtype: int64

---------------- genre ----------------


Action, Indie               1971
Casual, Indie               1567
Action, Adventure, Indie    1311
Adventure, Indie            1229
Action, Casual, Indie       1061
Name: genre, dtype: int64

---------------- ccu ----------------


0    22088
1     1391
2      835
3      523
4      343
Name: ccu, dtype: int64

---------------- tags ----------------


[]                                                     656
{'Indie': 21, 'Casual': 21}                            248
{'Action': 21, 'Indie': 21, 'Casual': 21}              185
{'Action': 21, 'Indie': 21}                            157
{'Adventure': 21, 'Casual': 21, 'Hidden Object': 5}     86
Name: tags, dtype: int64

In [10]:
# Largest entry of the tags column; the entries are strings here, so this is
# a lexicographic maximum rather than anything numeric.
raw_steamspy_data.loc[:, 'tags'].max()

"{'Zombies': 98, 'Adventure': 72, 'Survival': 62, 'Action': 58, 'Third Person': 48, 'Open World': 45, 'Gore': 43, 'Horror': 39, 'Singleplayer': 36, 'Multiplayer': 23, 'Cartoony': 19, 'Hack and Slash': 15, 'Stealth': 11, 'Atmospheric': 9, 'Co-op': 6, 'Anime': 6, 'Free to Play': 5, 'Survival Horror': 5}"

In [6]:
# Full frequency table of the owner-range labels, most common first.
raw_steamspy_data.loc[:, 'owners'].value_counts(ascending=False)

0 .. 20,000                   20234
20,000 .. 50,000               3218
50,000 .. 100,000              1785
100,000 .. 200,000             1471
200,000 .. 500,000             1373
500,000 .. 1,000,000            556
1,000,000 .. 2,000,000          311
2,000,000 .. 5,000,000          210
5,000,000 .. 10,000,000          49
10,000,000 .. 20,000,000         22
20,000,000 .. 50,000,000          3
50,000,000 .. 100,000,000         2
100,000,000 .. 200,000,000        1
Name: owners, dtype: int64

In [7]:
# Scratch check: print the first ten powers of ten on a single line ...
for exponent in range(10):
    print(10**exponent, end=', ')

# ... and collect the first eight of them in a list.
ml = [pow(10, exponent) for exponent in range(8)]
ml

1, 10, 100, 1000, 10000, 100000, 1000000, 10000000, 100000000, 1000000000, 

[1, 10, 100, 1000, 10000, 100000, 1000000, 10000000]

In [8]:
# Highest positive-review count across all rows.
raw_steamspy_data.loc[:, 'positive'].max()

2644404

In [9]:
# Count rows per order-of-magnitude bucket of positive reviews.
# Intervals are right-closed and the lowest edge is exclusive, so rows with
# 0 or 1 positive reviews fall outside (1, 10] and are not counted.
pd.cut(
    x=raw_steamspy_data['positive'],
    bins=[pow(10, exponent) for exponent in range(8)],
).value_counts()

(10, 100]              11031
(1, 10]                 7345
(100, 1000]             5567
(1000, 10000]           1904
(10000, 100000]          402
(100000, 1000000]         33
(1000000, 10000000]        1
Name: positive, dtype: int64

In [10]:
# Count rows per order-of-magnitude bucket of negative reviews.
# Intervals are right-closed and the lowest edge is exclusive, so rows with
# 0 or 1 negative reviews fall outside (1, 10] and are not counted.
pd.cut(
    x=raw_steamspy_data['negative'],
    bins=[pow(10, exponent) for exponent in range(8)],
).value_counts()

(1, 10]                9205
(10, 100]              9094
(100, 1000]            3506
(1000, 10000]           753
(10000, 100000]          60
(100000, 1000000]         4
(1000000, 10000000]       0
Name: negative, dtype: int64

In [11]:
# Order-of-magnitude buckets for the positive and negative review counts.
pos, neg = (
    pd.cut(raw_steamspy_data[review_column], bins=[10**i for i in range(8)])
    for review_column in ('positive', 'negative')
)

# Ratio of positive to negative reviews, bucketed into coarse bands.
# Rows with no negative reviews give inf (or NaN for 0/0); those, along with
# ratios of exactly 0 or above 100, land outside the bins and are not counted.
div = raw_steamspy_data['positive'].div(raw_steamspy_data['negative'])
pd.cut(div, bins=[0, 1, 5, 10, 100]).value_counts()

(1, 5]       12740
(0, 1]        4844
(5, 10]       4105
(10, 100]     3108
dtype: int64

In [12]:
# Full frequency table of userscore values, most common first.
raw_steamspy_data.loc[:, 'userscore'].value_counts(ascending=False)

0      29177
100        4
84         4
95         3
55         3
70         2
80         2
82         2
98         2
51         2
68         2
94         2
69         2
46         2
92         2
57         1
67         1
61         1
96         1
49         1
65         1
81         1
97         1
50         1
76         1
60         1
91         1
88         1
59         1
78         1
53         1
74         1
77         1
73         1
63         1
71         1
87         1
8          1
85         1
Name: userscore, dtype: int64

In [13]:
# New frame without the two score columns; the raw frame is left untouched.
drop_score = raw_steamspy_data.drop(columns=['userscore', 'score_rank'])
drop_score.head(5)

Unnamed: 0,appid,name,developer,publisher,positive,negative,owners,average_forever,average_2weeks,median_forever,median_2weeks,price,initialprice,discount,languages,genre,ccu,tags
0,10,Counter-Strike,Valve,Valve,124534,3339,"10,000,000 .. 20,000,000",17612,709,317,26,999.0,999.0,0.0,"English, French, German, Italian, Spanish - Sp...",Action,14923,"{'Action': 2681, 'FPS': 2048, 'Multiplayer': 1..."
1,20,Team Fortress Classic,Valve,Valve,3318,633,"5,000,000 .. 10,000,000",277,15,62,15,499.0,499.0,0.0,"English, French, German, Italian, Spanish - Sp...",Action,87,"{'Action': 208, 'FPS': 188, 'Multiplayer': 172..."
2,30,Day of Defeat,Valve,Valve,3416,398,"5,000,000 .. 10,000,000",187,0,34,0,499.0,499.0,0.0,"English, French, German, Italian, Spanish - Spain",Action,130,"{'FPS': 138, 'World War II': 122, 'Multiplayer..."
3,40,Deathmatch Classic,Valve,Valve,1273,267,"5,000,000 .. 10,000,000",258,0,184,0,499.0,499.0,0.0,"English, French, German, Italian, Spanish - Sp...",Action,4,"{'Action': 85, 'FPS': 71, 'Multiplayer': 58, '..."
4,50,Half-Life: Opposing Force,Gearbox Software,Valve,5250,288,"5,000,000 .. 10,000,000",624,0,415,0,499.0,499.0,0.0,"English, French, German, Korean",Action,71,"{'FPS': 235, 'Action': 211, 'Sci-fi': 166, 'Si..."


In [14]:
# Five most frequent values of the two-week average playtime.
drop_score['average_2weeks'].value_counts().iloc[:5]

0      28526
1         37
7         33
273       22
8         12
Name: average_2weeks, dtype: int64

In [15]:
# Five most frequent values of the two-week median playtime.
drop_score['median_2weeks'].value_counts().iloc[:5]

0      28526
1         37
7         33
273       22
8         13
Name: median_2weeks, dtype: int64

In [16]:
# New frame without the two-week playtime columns.
drop_2weeks = drop_score.drop(columns=['average_2weeks', 'median_2weeks'])
drop_2weeks.head(5)

Unnamed: 0,appid,name,developer,publisher,positive,negative,owners,average_forever,median_forever,price,initialprice,discount,languages,genre,ccu,tags
0,10,Counter-Strike,Valve,Valve,124534,3339,"10,000,000 .. 20,000,000",17612,317,999.0,999.0,0.0,"English, French, German, Italian, Spanish - Sp...",Action,14923,"{'Action': 2681, 'FPS': 2048, 'Multiplayer': 1..."
1,20,Team Fortress Classic,Valve,Valve,3318,633,"5,000,000 .. 10,000,000",277,62,499.0,499.0,0.0,"English, French, German, Italian, Spanish - Sp...",Action,87,"{'Action': 208, 'FPS': 188, 'Multiplayer': 172..."
2,30,Day of Defeat,Valve,Valve,3416,398,"5,000,000 .. 10,000,000",187,34,499.0,499.0,0.0,"English, French, German, Italian, Spanish - Spain",Action,130,"{'FPS': 138, 'World War II': 122, 'Multiplayer..."
3,40,Deathmatch Classic,Valve,Valve,1273,267,"5,000,000 .. 10,000,000",258,184,499.0,499.0,0.0,"English, French, German, Italian, Spanish - Sp...",Action,4,"{'Action': 85, 'FPS': 71, 'Multiplayer': 58, '..."
4,50,Half-Life: Opposing Force,Gearbox Software,Valve,5250,288,"5,000,000 .. 10,000,000",624,415,499.0,499.0,0.0,"English, French, German, Korean",Action,71,"{'FPS': 235, 'Action': 211, 'Sci-fi': 166, 'Si..."


In [17]:
# Five most frequent values of the all-time average playtime.
drop_2weeks['average_forever'].value_counts().iloc[:5]

0    22658
1      173
3       79
2       62
9       62
Name: average_forever, dtype: int64

In [18]:
# Five most frequent values of the all-time median playtime.
drop_2weeks['median_forever'].value_counts().iloc[:5]

0    22658
1      170
3       77
2       58
9       54
Name: median_forever, dtype: int64

In [19]:
# Five most frequent values of the current price.
drop_2weeks['price'].value_counts().iloc[:5]

999.0    3878
0.0      3652
499.0    3399
99.0     2928
199.0    1939
Name: price, dtype: int64

In [20]:
# Five most frequent values of the initial price.
drop_2weeks['initialprice'].value_counts().iloc[:5]

999.0    4038
0.0      3659
499.0    3547
99.0     3098
199.0    2017
Name: initialprice, dtype: int64

In [21]:
# Five most frequent discount values.
drop_2weeks['discount'].value_counts().iloc[:5]

0.0     27784
51.0      182
75.0      174
50.0      173
90.0      115
Name: discount, dtype: int64

In [22]:
# First five rows that have no initial price recorded.
drop_2weeks.loc[drop_2weeks['initialprice'].isna()].head(5)

Unnamed: 0,appid,name,developer,publisher,positive,negative,owners,average_forever,median_forever,price,initialprice,discount,languages,genre,ccu,tags
4261,321210,Blade Symphony Beta,,,0,0,"0 .. 20,000",0,0,,,,,,0,[]
8774,439400,Legends of Callasia Demo,,,0,0,"0 .. 20,000",0,0,,,,,,0,[]
10737,497960,Legends of Callasia Demo,,,0,0,"0 .. 20,000",0,0,,,,,,0,[]
10790,499450,The Witcher 3: Wild Hunt Game of the Year Edition,,,0,0,"0 .. 20,000",0,0,,,,,,0,[]
12076,530940,BIOHAZARD 7 resident evil グロテスクVer.,,,0,0,"0 .. 20,000",0,0,,,,,,0,[]


In [23]:
# Every row that has no current price recorded.
drop_2weeks.loc[drop_2weeks['price'].isna()]

Unnamed: 0,appid,name,developer,publisher,positive,negative,owners,average_forever,median_forever,price,initialprice,discount,languages,genre,ccu,tags
639,29017,Blood Bowl 2 - Review,,,0,0,"0 .. 20,000",0,0,,0.0,0.0,,,0,[]
2056,239490,America's Army: Proving Grounds Beta (Closed),,,0,0,"0 .. 20,000",0,0,,0.0,0.0,,,0,[]
4261,321210,Blade Symphony Beta,,,0,0,"0 .. 20,000",0,0,,,,,,0,[]
7076,391920,Ether One Redux,,,0,0,"0 .. 20,000",0,0,,0.0,0.0,,,0,[]
7248,396740,Blood Bowl 2 - Preview,,,0,0,"0 .. 20,000",0,0,,0.0,0.0,,,0,[]
7859,413010,Conflicks - Revolutionary Space Battles Demo,,,0,0,"0 .. 20,000",0,0,,0.0,0.0,,,0,[]
8141,421730,Block N Load PTR,,,0,0,"0 .. 20,000",0,0,,0.0,0.0,,,0,[]
8774,439400,Legends of Callasia Demo,,,0,0,"0 .. 20,000",0,0,,,,,,0,[]
8967,445730,Watch paint dry,,,0,0,"0 .. 20,000",0,0,,0.0,0.0,,,0,[]
10737,497960,Legends of Callasia Demo,,,0,0,"0 .. 20,000",0,0,,,,,,0,[]


In [24]:
# Sanity check: rows whose initial price is lower than the current price.
drop_2weeks.loc[drop_2weeks['initialprice'].lt(drop_2weeks['price'])]

Unnamed: 0,appid,name,developer,publisher,positive,negative,owners,average_forever,median_forever,price,initialprice,discount,languages,genre,ccu,tags


In [25]:
# Keep only rows that have a price (as an independent copy), then recount
# the missing values that remain per column.
drop_price_nulls = drop_2weeks.loc[drop_2weeks['price'].notna()].copy()
drop_price_nulls.isna().sum()

appid                0
name                 5
developer          168
publisher          251
positive             0
negative             0
owners               0
average_forever      0
median_forever       0
price                0
initialprice         0
discount             0
languages           65
genre              123
ccu                  0
tags                 0
dtype: int64

In [26]:
# Every remaining row that has no name.
drop_price_nulls.loc[drop_price_nulls['name'].isna()]

Unnamed: 0,appid,name,developer,publisher,positive,negative,owners,average_forever,median_forever,price,initialprice,discount,languages,genre,ccu,tags
481,17760,,,,0,0,"0 .. 20,000",0,0,0.0,0.0,0.0,,,0,[]
7235,396420,,,,22,9,"20,000 .. 50,000",0,0,0.0,0.0,0.0,,,0,"{'Free to Play': 24, 'Action': 21, 'Indie': 21..."
9553,460250,,Jeroen Wimmers,Jeroen Wimmers,44,4,"0 .. 20,000",0,0,899.0,899.0,0.0,"English, French, Italian, German, Spanish - Sp...","Casual, Indie",0,"{'Indie': 34, 'Casual': 24, 'Puzzle': 15, 'Min..."
22244,806160,,Paleno Games,Paleno Games,4,13,"0 .. 20,000",0,0,99.0,99.0,0.0,"English, French, Italian, German, Spanish - Sp...","Action, Adventure, Casual, Indie",0,"{'Action': 22, 'Indie': 22, 'Casual': 21, 'Adv..."
27324,965340,,2nd Studio,2nd Studio,31,1,"0 .. 20,000",0,0,199.0,199.0,0.0,"English, German, Danish, Japanese, Russian, Si...","Action, Indie, Simulation",0,"{'Indie': 32, 'Sexual Content': 31, 'Action': ..."


In [27]:
# Keep only rows that have a name (as an independent copy), then recount
# the missing values that remain per column.
drop_name_nulls = drop_price_nulls.loc[drop_price_nulls['name'].notna()].copy()
drop_name_nulls.isna().sum()

appid                0
name                 0
developer          166
publisher          249
positive             0
negative             0
owners               0
average_forever      0
median_forever       0
price                0
initialprice         0
discount             0
languages           63
genre              121
ccu                  0
tags                 0
dtype: int64

In [28]:
# Every remaining row that has no genre.
drop_name_nulls.loc[drop_name_nulls['genre'].isna()]

Unnamed: 0,appid,name,developer,publisher,positive,negative,owners,average_forever,median_forever,price,initialprice,discount,languages,genre,ccu,tags
26,852,ValveTestApp852,,,0,0,"0 .. 20,000",0,0,0.0,0.0,0.0,,,0,[]
147,4330,Star Trek: DAC - Demo,,,0,0,"0 .. 20,000",0,0,0.0,0.0,0.0,,,0,[]
256,8740,Puzzlegeddon,,,0,0,"0 .. 20,000",0,0,0.0,0.0,0.0,,,1,[]
264,8955,Borderlands DLC: Claptrap's New Robot Revolution,,,0,0,"0 .. 20,000",0,0,0.0,0.0,0.0,,,0,[]
285,9730,Tycoon City: New York,,Retroism,58,102,"20,000 .. 50,000",0,0,999.0,999.0,0.0,English,,0,"{'Simulation': 34, 'Management': 18, 'City Bui..."
371,12570,Hot Dish,Zemnott,ValuSoft,9,12,"20,000 .. 50,000",0,0,999.0,999.0,0.0,"English, French, German, Italian, Spanish - Spain",,0,{'Casual': 7}
372,12580,Dr. Daisy Pet Vet,Zemnott,ValuSoft,11,10,"0 .. 20,000",0,0,999.0,999.0,0.0,"English, French, German, Italian, Spanish - Spain",,0,"{'Casual': 5, 'Time Management': 5}"
387,13120,America's Army 3 Beta,,,0,0,"0 .. 20,000",0,0,0.0,0.0,0.0,,,0,[]
394,13260,Unreal Development Kit,,,0,0,"0 .. 20,000",0,0,0.0,0.0,0.0,,,0,[]
489,18310,Spectraball - Demo,,,0,0,"0 .. 20,000",0,0,0.0,0.0,0.0,,,0,[]


In [29]:
# Every remaining row that has no languages entry.
drop_name_nulls.loc[drop_name_nulls['languages'].isna()]

Unnamed: 0,appid,name,developer,publisher,positive,negative,owners,average_forever,median_forever,price,initialprice,discount,languages,genre,ccu,tags
26,852,ValveTestApp852,,,0,0,"0 .. 20,000",0,0,0.0,0.0,0.0,,,0,[]
147,4330,Star Trek: DAC - Demo,,,0,0,"0 .. 20,000",0,0,0.0,0.0,0.0,,,0,[]
249,8350,Strong Bad's Cool Game for Attractive People: ...,Telltale Games,Telltale Games,12,5,"0 .. 20,000",0,0,0.0,0.0,0.0,,Adventure,0,{'Adventure': 20}
250,8360,Strong Bad's Cool Game for Attractive People: ...,Telltale Games,Telltale Games,11,6,"0 .. 20,000",0,0,0.0,0.0,0.0,,Adventure,0,{'Adventure': 21}
251,8370,Strong Bad's Cool Game for Attractive People: ...,Telltale Games,Telltale Games,11,5,"0 .. 20,000",0,0,0.0,0.0,0.0,,Adventure,0,{'Adventure': 20}
252,8380,Strong Bad's Cool Game for Attractive People: ...,Telltale Games,Telltale Games,12,3,"0 .. 20,000",0,0,0.0,0.0,0.0,,Adventure,0,{'Adventure': 21}
256,8740,Puzzlegeddon,,,0,0,"0 .. 20,000",0,0,0.0,0.0,0.0,,,1,[]
264,8955,Borderlands DLC: Claptrap's New Robot Revolution,,,0,0,"0 .. 20,000",0,0,0.0,0.0,0.0,,,0,[]
387,13120,America's Army 3 Beta,,,0,0,"0 .. 20,000",0,0,0.0,0.0,0.0,,,0,[]
394,13260,Unreal Development Kit,,,0,0,"0 .. 20,000",0,0,0.0,0.0,0.0,,,0,[]


In [30]:
# Keep only rows that have both a languages entry and a genre.
# `.copy()` detaches the result from `drop_name_nulls`, matching the other
# filtering steps in this notebook, so later assignments on this frame cannot
# raise SettingWithCopyWarning or touch the parent frame.
drop_lang_genre_nulls = drop_name_nulls.dropna(subset=['languages', 'genre']).copy()

# Missing values still present per column after the filter.
drop_lang_genre_nulls.isnull().sum()

appid                0
name                 0
developer           92
publisher          195
positive             0
negative             0
owners               0
average_forever      0
median_forever       0
price                0
initialprice         0
discount             0
languages            0
genre                0
ccu                  0
tags                 0
dtype: int64

In [31]:
# First five rows whose name contains "demo" as a whole word, in any case.
drop_lang_genre_nulls.loc[
    drop_lang_genre_nulls['name'].str.contains(r'\bdemo\b', flags=re.IGNORECASE)
].head(5)

Unnamed: 0,appid,name,developer,publisher,positive,negative,owners,average_forever,median_forever,price,initialprice,discount,languages,genre,ccu,tags
1608,213390,Atooms to Moolecules Demo,BitSits Games,BitSits Games,9,6,"0 .. 20,000",0,0,0.0,0.0,0.0,English,"Indie, Casual",0,"{'Indie': 21, 'Casual': 21}"
1755,222500,Rail Adventures - VR Tech Demo,exosyphen studios,exosyphen studios,16,17,"20,000 .. 50,000",0,0,0.0,0.0,0.0,English,"Action, Free to Play, Indie, Racing",0,"{'Free to Play': 26, 'Action': 25, 'Racing': 2..."
8580,434430,Abbot's Book Demo,"The Abbot's Book, LLC","Blackthorn Media, LLC",155,18,"50,000 .. 100,000",0,0,0.0,0.0,0.0,English,"Adventure, Free to Play, Indie, RPG",1,"{'Adventure': 28, 'Free to Play': 25, 'RPG': 2..."
9771,465090,Don Bradman Cricket 17 Demo,Big Ant Studios,Big Ant Studios,3,2,"0 .. 20,000",0,0,0.0,0.0,0.0,English,"Simulation, Sports",0,"{'Simulation': 23, 'Sports': 22}"
11270,511360,UNCORPOREAL - Holographic Photography Demo,Uncorporeal Systems,Uncorporeal Systems,15,2,"0 .. 20,000",0,0,0.0,0.0,0.0,English,Education,0,{'Education': 21}


In [32]:
# Remove rows whose name contains "demo" as a whole word (case-insensitive).
# `.copy()` detaches the result from the parent frame, matching the other
# filtering steps in this notebook.
drop_demos = drop_lang_genre_nulls[
    ~drop_lang_genre_nulls['name'].str.contains(r'\bdemo\b', flags=re.I)
].copy()

In [33]:
# First five rows whose name contains "beta" as a whole word, in any case.
drop_demos.loc[
    drop_demos['name'].str.contains(r'\bbeta\b', flags=re.IGNORECASE)
].head(5)

Unnamed: 0,appid,name,developer,publisher,positive,negative,owners,average_forever,median_forever,price,initialprice,discount,languages,genre,ccu,tags
5093,343320,StaudSoft's Synthetic World Beta,StaudSoft,StaudSoft,12,5,"0 .. 20,000",0,0,999.0,999.0,0.0,English,"Action, Indie, RPG",0,"{'Action': 25, 'RPG': 24, 'Indie': 23, 'Crafti..."
9267,453000,TheWaveVR Beta,TheWaveVR,TheWaveVR,364,21,"50,000 .. 100,000",13,13,0.0,0.0,0.0,English,"Massively Multiplayer, Simulation, Early Access",13,"{'Early Access': 27, 'VR': 26, 'Massively Mult..."
9443,457550,Bigscreen Beta,"Bigscreen, Inc.","Bigscreen, Inc.",975,127,"200,000 .. 500,000",153,155,0.0,0.0,0.0,English,Simulation,55,"{'VR': 34, 'Simulation': 28, 'Utilities': 20, ..."
13327,564310,Serious Sam Fusion 2017 (beta),Croteam,"Devolver Digital, Croteam",1562,126,"0 .. 20,000",180,61,1499.0,1499.0,0.0,"English, French, Italian, German, Spanish - Sp...","Action, Indie",71,"{'Action': 50, 'Indie': 37, 'FPS': 29, 'Gore':..."
14871,604530,EmbodyMe Beta,EmbodyMe Inc.,EmbodyMe Inc.,10,7,"0 .. 20,000",0,0,0.0,0.0,0.0,English,"Casual, Free to Play, Massively Multiplayer, E...",0,"{'Early Access': 21, 'Free to Play': 23, 'Casu..."


In [34]:
# Remove the "beta" titles; copy so later assignments act on an independent frame.
drop_betas = drop_demos.loc[
    ~drop_demos['name'].str.contains(r'\bbeta\b', flags=re.I)
].copy()

# Remaining missing values per column.
drop_betas.isnull().sum()

appid                0
name                 0
developer           91
publisher          194
positive             0
negative             0
owners               0
average_forever      0
median_forever       0
price                0
initialprice         0
discount             0
languages            0
genre                0
ccu                  0
tags                 0
dtype: int64

In [35]:
# Rows that have no developer recorded.
drop_betas.loc[drop_betas['developer'].isnull()]

Unnamed: 0,appid,name,developer,publisher,positive,negative,owners,average_forever,median_forever,price,initialprice,discount,languages,genre,ccu,tags
330,11370,Nikopol: Secrets of the Immortals,,Meridian4,163,130,"100,000 .. 200,000",0,0,499.0,499.0,0.0,"English, French, German, Italian, Spanish - Spain","Adventure, Indie",0,"{'Adventure': 61, 'Point & Click': 43, 'Indie'..."
331,11390,Crash Time 2,,Meridian4,1148,1017,"500,000 .. 1,000,000",30,37,999.0,999.0,0.0,"English, French, German","Action, Racing",2,"{'Racing': 102, 'Action': 63, 'Open World': 57..."
379,12690,Hunting Unlimited 2010,,"ValuSoft, Retroism",142,39,"50,000 .. 100,000",0,0,999.0,999.0,0.0,English,"Simulation, Sports",9,"{'Simulation': 35, 'Hunting': 30, 'Sports': 20..."
742,33730,18 Wheels of Steel: Extreme Trucker,,"ValuSoft, Play Hard Games",108,56,"50,000 .. 100,000",0,0,999.0,999.0,0.0,English,Simulation,0,"{'Simulation': 33, 'Driving': 16}"
743,33750,Prison Tycoon 4: SuperMax,,"ValuSoft, Retroism",6,23,"0 .. 20,000",0,0,2499.0,2499.0,0.0,English,Simulation,0,"{'Simulation': 23, 'Management': 5}"
1618,214190,Minimum,,"Atari, Cubed Productions LLC",2608,1711,"200,000 .. 500,000",73,73,0.0,0.0,0.0,English,Action,0,"{'Action': 184, 'Third-Person Shooter': 137, '..."
1701,218980,Patterns,,,26,130,"0 .. 20,000",0,0,0.0,0.0,0.0,English,"Casual, Simulation, Strategy",0,"{'Casual': 26, 'Simulation': 22, 'Strategy': 2..."
1834,227020,Rise of Venice,,Kalypso Media Digital,203,155,"50,000 .. 100,000",230,431,1999.0,1999.0,0.0,"English, French, German, Italian, Spanish - Sp...","Simulation, Strategy",14,"{'Strategy': 52, 'Simulation': 46, 'Trading': ..."
1895,230860,Cannon Brawl,,Turtle Sandbox,618,64,"50,000 .. 100,000",41,41,1499.0,1499.0,0.0,English,"Action, Adventure, Indie, Strategy",2,"{'Indie': 62, 'Local Multiplayer': 58, '2D': 5..."
2011,237370,PlayClaw 5 - Game Recording and Streaming,,,273,125,"0 .. 20,000",0,0,2799.0,3999.0,30.0,"English, Russian, French, Hungarian, Polish, P...","Audio Production, Software Training, Utilities...",21,"{'Utilities': 41, 'Video Production': 40, 'Aud..."


In [36]:
# Rows that have no publisher recorded.
drop_betas.loc[drop_betas['publisher'].isnull()]

Unnamed: 0,appid,name,developer,publisher,positive,negative,owners,average_forever,median_forever,price,initialprice,discount,languages,genre,ccu,tags
67,2540,RIP - Trilogy,Elephant Games,,199,131,"100,000 .. 200,000",1,2,499.0,499.0,0.0,English,"Casual, Indie",0,"{'Indie': 38, 'Casual': 35, 'Arcade': 14, 'Sho..."
68,2570,Vigil: Blood Bitterness,Freegamer,,29,108,"50,000 .. 100,000",1,2,0.0,0.0,0.0,English,"Indie, RPG",0,"{'Indie': 27, 'RPG': 22, 'Puzzle': 7, 'Singlep..."
190,6600,Bullet Candy,R C Knight,,39,32,"50,000 .. 100,000",0,0,399.0,399.0,0.0,English,"Casual, Indie",0,"{'Indie': 27, 'Casual': 23, 'Twin Stick Shoote..."
385,12900,AudioSurf,Dylan Fitterer,,9040,401,"1,000,000 .. 2,000,000",2300,177,999.0,999.0,0.0,"English, Russian",Indie,31,"{'Music': 719, 'Rhythm': 475, 'Indie': 462, 'C..."
451,16300,Everyday Shooter,Queasy Games,,81,6,"20,000 .. 50,000",37,37,999.0,999.0,0.0,English,"Indie, Casual",0,"{'Indie': 31, 'Casual': 21, 'Twin Stick Shoote..."
623,26500,Cogs,Lazy 8 Studios,,648,87,"500,000 .. 1,000,000",38,36,599.0,599.0,0.0,"English, French, German, Italian, Polish, Russian","Indie, Casual",2,"{'Puzzle': 124, 'Indie': 87, 'Casual': 82, 'Si..."
894,40800,Super Meat Boy,Team Meat,,24133,1301,"2,000,000 .. 5,000,000",325,136,1499.0,1499.0,0.0,"English, Russian",Indie,74,"{'Platformer': 1218, 'Indie': 969, 'Difficult'..."
1183,72500,Arcadia,Joshyy,,34,41,"0 .. 20,000",0,0,499.0,499.0,0.0,"English, Dutch, Spanish - Spain","Casual, Indie",0,"{'Indie': 24, 'Casual': 22, ""Shoot 'Em Up"": 6}"
1239,94610,Hector: Episode 2,Straandlooper,,29,7,"0 .. 20,000",0,0,0.0,0.0,0.0,English,"Adventure, Casual",0,"{'Adventure': 22, 'Casual': 21}"
1348,116120,Lightfish,Eclipse Games,,117,27,"50,000 .. 100,000",0,0,499.0,499.0,0.0,English,"Action, Indie",0,"{'Indie': 30, 'Action': 25, 'Casual': 9, 'Puzz..."


In [37]:
# Titles missing a developer or a publisher that nevertheless have an
# average_forever value above 200.
drop_betas.loc[
    (drop_betas['publisher'].isnull() | drop_betas['developer'].isnull())
    & (drop_betas['average_forever'] > 200)
]

Unnamed: 0,appid,name,developer,publisher,positive,negative,owners,average_forever,median_forever,price,initialprice,discount,languages,genre,ccu,tags
385,12900,AudioSurf,Dylan Fitterer,,9040,401,"1,000,000 .. 2,000,000",2300,177,999.0,999.0,0.0,"English, Russian",Indie,31,"{'Music': 719, 'Rhythm': 475, 'Indie': 462, 'C..."
894,40800,Super Meat Boy,Team Meat,,24133,1301,"2,000,000 .. 5,000,000",325,136,1499.0,1499.0,0.0,"English, Russian",Indie,74,"{'Platformer': 1218, 'Indie': 969, 'Difficult'..."
1369,200910,Before the Echo,Iridium Studios,,1109,175,"100,000 .. 200,000",227,181,499.0,499.0,0.0,English,"Indie, RPG",3,"{'Rhythm': 96, 'RPG': 92, 'Indie': 79, 'Music'..."
1749,222160,Hamlet or the Last Game without MMORPG Feature...,Denis Galanin (mif2000),,265,150,"100,000 .. 200,000",322,395,499.0,499.0,0.0,"English, German, French, Italian, Korean, Span...","Adventure, Indie",1,"{'Adventure': 48, 'Indie': 45, 'Point & Click'..."
1834,227020,Rise of Venice,,Kalypso Media Digital,203,155,"50,000 .. 100,000",230,431,1999.0,1999.0,0.0,"English, French, German, Italian, Spanish - Sp...","Simulation, Strategy",14,"{'Strategy': 52, 'Simulation': 46, 'Trading': ..."
2054,239350,Spelunky,,Mossmouth,8560,851,"500,000 .. 1,000,000",606,763,1499.0,1499.0,0.0,"English, French, Italian, German, Spanish - Spain",Indie,270,"{'Rogue-like': 809, 'Platformer': 743, 'Indie'..."
2097,242550,Rayman Legends,,Ubisoft,3050,243,"200,000 .. 500,000",2974,3078,2999.0,2999.0,0.0,"English, French, Italian, German, Spanish - Sp...","Action, Adventure",69,"{'Platformer': 344, 'Local Co-Op': 181, 'Great..."
2175,246300,Paranormal,,Matthew C Cohen,287,166,"20,000 .. 50,000",214,214,999.0,999.0,0.0,English,"Action, Adventure, Indie, Simulation, Early Ac...",1,"{'Early Access': 36, 'Horror': 97, 'Indie': 46..."
2207,247910,Sniper Elite: Nazi Zombie Army 2,,Rebellion,3069,554,"200,000 .. 500,000",289,335,1499.0,1499.0,0.0,"English, French, Italian, German, Spanish - Sp...","Action, Adventure",23,"{'Zombies': 294, 'Action': 185, 'Co-op': 171, ..."
2361,253190,Kingdom Wars 2: Battles,Reverie World Studios,,611,337,"50,000 .. 100,000",1157,1157,999.0,999.0,0.0,"English, French, Italian, German, Spanish - Sp...","Indie, Simulation, Strategy",5,"{'Strategy': 75, 'Medieval': 65, 'RTS': 52, 'Z..."


In [38]:
# How many rows have neither a developer nor a publisher?
print(len(drop_betas[drop_betas['publisher'].isnull() & drop_betas['developer'].isnull()]))

# ...and a sample of them.
drop_betas[drop_betas['publisher'].isnull() & drop_betas['developer'].isnull()].head()

59


Unnamed: 0,appid,name,developer,publisher,positive,negative,owners,average_forever,median_forever,price,initialprice,discount,languages,genre,ccu,tags
1701,218980,Patterns,,,26,130,"0 .. 20,000",0,0,0.0,0.0,0.0,English,"Casual, Simulation, Strategy",0,"{'Casual': 26, 'Simulation': 22, 'Strategy': 2..."
2011,237370,PlayClaw 5 - Game Recording and Streaming,,,273,125,"0 .. 20,000",0,0,2799.0,3999.0,30.0,"English, Russian, French, Hungarian, Polish, P...","Audio Production, Software Training, Utilities...",21,"{'Utilities': 41, 'Video Production': 40, 'Aud..."
2201,247350,Artemis Spaceship Bridge Simulator,,,351,40,"50,000 .. 100,000",0,0,699.0,699.0,0.0,English,"Action, Indie, Simulation",1,"{'Simulation': 54, 'Indie': 37, 'Action': 33, ..."
2231,248730,A Walk in the Dark,,,300,69,"20,000 .. 50,000",0,0,699.0,699.0,0.0,English,"Action, Indie",0,"{'Indie': 42, 'Platformer': 32, 'Action': 30, ..."
2349,252770,Vox,,,93,300,"20,000 .. 50,000",102,108,999.0,999.0,0.0,English,"Action, Adventure, Indie, RPG, Early Access",0,"{'Early Access': 22, 'RPG': 33, 'Adventure': 3..."


In [39]:
# Discard only the rows where developer AND publisher are both missing;
# rows with at least one of the two are kept.
drop_dev_pub = drop_betas.loc[
    ~(drop_betas['publisher'].isnull() & drop_betas['developer'].isnull())
].copy()

drop_dev_pub.isnull().sum()

appid                0
name                 0
developer           32
publisher          135
positive             0
negative             0
owners               0
average_forever      0
median_forever       0
price                0
initialprice         0
discount             0
languages            0
genre                0
ccu                  0
tags                 0
dtype: int64

In [40]:
# Label the remaining missing developer / publisher values as 'unknown'.
drop_dev_pub['developer'] = drop_dev_pub['developer'].fillna('unknown')
drop_dev_pub['publisher'] = drop_dev_pub['publisher'].fillna('unknown')

# Confirm no missing values remain.
drop_dev_pub.isnull().sum()

appid              0
name               0
developer          0
publisher          0
positive           0
negative           0
owners             0
average_forever    0
median_forever     0
price              0
initialprice       0
discount           0
languages          0
genre              0
ccu                0
tags               0
dtype: int64

In [56]:
# Remove the `ccu` column; `drop_dev_pub` itself is left unchanged.
drop_ccu = drop_dev_pub.drop(columns='ccu')
drop_ccu.head()

Unnamed: 0,appid,name,developer,publisher,positive,negative,owners,average_forever,median_forever,price,initialprice,discount,languages,genre,tags
0,10,Counter-Strike,Valve,Valve,124534,3339,"10,000,000 .. 20,000,000",17612,317,999.0,999.0,0.0,"English, French, German, Italian, Spanish - Sp...",Action,"{'Action': 2681, 'FPS': 2048, 'Multiplayer': 1..."
1,20,Team Fortress Classic,Valve,Valve,3318,633,"5,000,000 .. 10,000,000",277,62,499.0,499.0,0.0,"English, French, German, Italian, Spanish - Sp...",Action,"{'Action': 208, 'FPS': 188, 'Multiplayer': 172..."
2,30,Day of Defeat,Valve,Valve,3416,398,"5,000,000 .. 10,000,000",187,34,499.0,499.0,0.0,"English, French, German, Italian, Spanish - Spain",Action,"{'FPS': 138, 'World War II': 122, 'Multiplayer..."
3,40,Deathmatch Classic,Valve,Valve,1273,267,"5,000,000 .. 10,000,000",258,184,499.0,499.0,0.0,"English, French, German, Italian, Spanish - Sp...",Action,"{'Action': 85, 'FPS': 71, 'Multiplayer': 58, '..."
4,50,Half-Life: Opposing Force,Gearbox Software,Valve,5250,288,"5,000,000 .. 10,000,000",624,415,499.0,499.0,0.0,"English, French, German, Korean",Action,"{'FPS': 235, 'Action': 211, 'Sci-fi': 166, 'Si..."


In [42]:
# (rows, columns) remaining after the cleaning steps above.
drop_ccu.shape

(28971, 15)

In [44]:
# handle tags: collect every distinct tag name found in the `tags` column
tags = drop_ccu['tags']

# Used as an insertion-ordered set: key = tag name, value = constant 1.
tags_dict = {}

# Iterate the Series values directly; the index label is not needed, and this
# avoids Series.iteritems(), which newer pandas releases no longer provide.
for row in tags:
    eval_row = literal_eval(row)

    # Only dict literals carry tag names; any other parsed value is skipped.
    if isinstance(eval_row, dict):
        # Reuse the dict parsed above instead of parsing the string a second time.
        for key in eval_row:
            tags_dict[key] = 1

tags_dict

{'Action': 1,
 'FPS': 1,
 'Multiplayer': 1,
 'Shooter': 1,
 'Classic': 1,
 'Team-Based': 1,
 'First-Person': 1,
 'Competitive': 1,
 'Tactical': 1,
 "1990's": 1,
 'e-sports': 1,
 'PvP': 1,
 'Military': 1,
 'Strategy': 1,
 'Score Attack': 1,
 'Survival': 1,
 'Old School': 1,
 'Assassin': 1,
 '1980s': 1,
 'Violent': 1,
 'Class-Based': 1,
 'Co-op': 1,
 'Fast-Paced': 1,
 'Online Co-Op': 1,
 'Retro': 1,
 'Remake': 1,
 'Mod': 1,
 'Funny': 1,
 'Adventure': 1,
 'World War II': 1,
 'War': 1,
 'Historical': 1,
 'Singleplayer': 1,
 'World War I': 1,
 'Arena Shooter': 1,
 'Sci-fi': 1,
 'Aliens': 1,
 'Atmospheric': 1,
 'Story Rich': 1,
 'Silent Protagonist': 1,
 'Great Soundtrack': 1,
 'Puzzle': 1,
 'Gore': 1,
 'Moddable': 1,
 'Masterpiece': 1,
 'Cyberpunk': 1,
 'Space': 1,
 'Conspiracy': 1,
 'Memes': 1,
 'Platformer': 1,
 '3D': 1,
 'Psychological Horror': 1,
 'Linear': 1,
 'Difficult': 1,
 'Open World': 1,
 'Simulation': 1,
 'Dark': 1,
 'Horror': 1,
 'Zombies': 1,
 'Short': 1,
 'Dystopian ': 1,
 'P

In [45]:
# List the tag names of the row labelled 0, one per line.
for tag_name in literal_eval(tags[0]):
    print(tag_name)

Action
FPS
Multiplayer
Shooter
Classic
Team-Based
First-Person
Competitive
Tactical
1990's
e-sports
PvP
Military
Strategy
Score Attack
Survival
Old School
Assassin
1980s
Violent


In [88]:
# NOTE: abandoned first attempt at building one column per tag, kept for
# reference only. As written it would not work: `drop_ccu[key] = value` assigns
# the scalar to the whole column, so every row would overwrite the values
# written for the previous row.
# for i, row in drop_ccu.iterrows():
#     row_tags = literal_eval(row['tags'])
    
#     if isinstance(row_tags, dict):
#         for key in tags_dict.keys():
#             if key in row_tags:
#                 drop_ccu[key] = row_tags[key]
#             else:
#                 drop_ccu[key] = 0
#     else:
#         for key in tags_dict.keys():
#             drop_ccu[key] = 0

# Show the first few raw tag strings with a wider column width; the option is
# only changed inside this `with` block.
with pd.option_context("display.max_colwidth", 500):
    display(drop_ccu['tags'].head())

0    {'Action': 2681, 'FPS': 2048, 'Multiplayer': 1659, 'Shooter': 1420, 'Classic': 1344, 'Team-Based': 943, 'First-Person': 799, 'Competitive': 790, 'Tactical': 734, "1990's": 564, 'e-sports': 550, 'PvP': 480, 'Military': 367, 'Strategy': 329, 'Score Attack': 200, 'Survival': 192, 'Old School': 164, 'Assassin': 151, '1980s': 144, 'Violent': 40}
1                      {'Action': 208, 'FPS': 188, 'Multiplayer': 172, 'Classic': 152, 'Shooter': 134, 'Class-Based': 124, 'Team-Based': 115, 'First-Person': 109, "1990's": 71, 'Co-op': 62, 'Competitive': 48, 'Old School': 46, 'Fast-Paced': 39, 'Online Co-Op': 28, 'Retro': 27, 'Remake': 27, 'Violent': 26, 'Mod': 24, 'Funny': 20, 'Adventure': 15}
2                                                                                {'FPS': 138, 'World War II': 122, 'Multiplayer': 115, 'Action': 99, 'Shooter': 95, 'War': 80, 'Team-Based': 79, 'Classic': 61, 'Class-Based': 55, 'First-Person': 50, 'Historical': 28, 'Military': 19, 'Singleplayer': 16, 'Ta

In [102]:
# For the row labelled 0: the first five tag names, then the full parsed dict.
print(list(literal_eval(drop_ccu['tags'][0]))[:5], '\n', literal_eval(drop_ccu['tags'][0]))

['Action', 'FPS', 'Multiplayer', 'Shooter', 'Classic'] 
 {'Action': 2681, 'FPS': 2048, 'Multiplayer': 1659, 'Shooter': 1420, 'Classic': 1344, 'Team-Based': 943, 'First-Person': 799, 'Competitive': 790, 'Tactical': 734, "1990's": 564, 'e-sports': 550, 'PvP': 480, 'Military': 367, 'Strategy': 329, 'Score Attack': 200, 'Survival': 192, 'Old School': 164, 'Assassin': 151, '1980s': 144, 'Violent': 40}


In [125]:
def parse_tags(x):
    """Parse one raw `tags` cell into a ``{tag_name: votes}`` dictionary.

    Parameters
    ----------
    x : str
        Python-literal text of the cell, e.g. ``"{'Action': 2681, 'FPS': 2048}"``.

    Returns
    -------
    dict
        The parsed dictionary when the literal is a dict. Every other case
        (a list literal, any other literal, or a non-string cell such as NaN)
        yields an empty dict, so callers can always use ``.keys()`` on the result.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``literal_eval`` when `x` is a string that is not a
        valid Python literal.
    """
    # Missing cells are not strings and cannot be parsed; treat them as "no tags".
    if not isinstance(x, str):
        return {}

    x_eval = literal_eval(x)

    # Previously an unexpected literal was printed and None was returned, which
    # then failed on the later `.keys()` call; non-dict values now mean "no tags".
    return x_eval if isinstance(x_eval, dict) else {}

# Work on a narrow copy so the cleaned frame itself is left untouched.
tag_data = drop_ccu[['appid', 'genre', 'tags']].copy()

# Replace each raw tag string with its parsed {tag: votes} dictionary.
tag_data['tags'] = tag_data['tags'].apply(parse_tags)

# Union of every tag name that appears on any row.
tag_names = tag_data['tags'].apply(lambda tag_votes: tag_votes.keys())
cols = set(itertools.chain.from_iterable(tag_names))

# One vote-count column per tag, added in alphabetical order of the raw tag name.
for col in sorted(cols):
    # Column name: lower-case, spaces and hyphens become underscores, apostrophes removed.
    col_name = col.lower()
    for old, new in ((' ', '_'), ('-', '_'), ("'", "")):
        col_name = col_name.replace(old, new)

    # Rows that do not carry this tag get 0 votes.
    tag_data[col_name] = tag_data['tags'].apply(lambda tag_votes: tag_votes.get(col, 0))

# The dictionaries are now fully expanded into columns.
tag_data = tag_data.drop('tags', axis=1)

tag_data.head()

Unnamed: 0,appid,genre,1980s,1990s,2.5d,2d,2d_fighter,360_video,3d,3d_platformer,3d_vision,4_player_local,4x,6dof,atv,abstract,action,action_rpg,action_adventure,addictive,adventure,agriculture,aliens,alternate_history,america,animation_&_modeling,anime,arcade,arena_shooter,artificial_intelligence,assassin,asynchronous_multiplayer,atmospheric,audio_production,bmx,base_building,baseball,based_on_a_novel,basketball,batman,battle_royale,beat_em_up,beautiful,benchmark,bikes,blood,board_game,bowling,building,bullet_hell,...,text_based,third_person,third_person_shooter,thriller,time_attack,time_management,time_manipulation,time_travel,top_down,top_down_shooter,touch_friendly,tower_defense,trackir,trading,trading_card_game,trains,transhumanism,turn_based,turn_based_combat,turn_based_strategy,turn_based_tactics,tutorial,twin_stick_shooter,typing,underground,underwater,unforgiving,utilities,vr,vr_only,vampire,video_production,villain_protagonist,violent,visual_novel,voice_control,voxel,walking_simulator,war,wargame,warhammer_40k,web_publishing,werewolves,western,word_game,world_war_i,world_war_ii,wrestling,zombies,e_sports
0,10,Action,144,564,0,0,0,0,0,0,0,0,0,0,0,0,2681,0,0,0,0,0,0,0,0,0,0,0,0,0,151,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,40,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,550
1,20,Action,0,71,0,0,0,0,0,0,0,0,0,0,0,0,208,0,0,0,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,26,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,30,Action,0,0,0,0,0,0,0,0,0,0,0,0,0,0,99,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,80,0,0,0,0,0,0,5,122,0,0,0
3,40,Action,0,0,0,0,0,0,0,0,0,0,0,0,0,0,85,0,0,0,0,0,0,0,0,0,0,0,22,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,50,Action,0,77,0,0,0,0,0,0,0,0,0,0,0,0,211,0,0,0,87,0,122,0,0,0,0,0,0,0,0,0,73,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [126]:
# Columns whose maximum is 0, i.e. tag columns that no row has any votes for.
tag_data.columns[tag_data.max().eq(0)]

Index([], dtype='object')

In [127]:
# Highest vote count per row across the tag columns; the first two columns
# (appid, genre) are skipped.
tag_data.iloc[:, 2:].max(axis=1).head()

0    2681
1     208
2     138
3      85
4     235
dtype: int64

In [128]:
# Frequency of each distinct `genre` string, most common first.
tag_data['genre'].value_counts()

Action, Indie                                                                                                                                                                                       1966
Casual, Indie                                                                                                                                                                                       1566
Action, Adventure, Indie                                                                                                                                                                            1309
Adventure, Indie                                                                                                                                                                                    1228
Action, Casual, Indie                                                                                                                                                                               

In [130]:
# `genre` already exists in drop_ccu, so leave it out of the right-hand frame
# to avoid a duplicated column after merging.
tag_data_merge = tag_data.drop(columns='genre')

# Attach the per-tag vote columns to the cleaned rows, matching on appid.
steamspy_data = pd.merge(drop_ccu, tag_data_merge, on='appid', how='inner')

steamspy_data.head()

Unnamed: 0,appid,name,developer,publisher,positive,negative,owners,average_forever,median_forever,price,initialprice,discount,languages,genre,tags,1980s,1990s,2.5d,2d,2d_fighter,360_video,3d,3d_platformer,3d_vision,4_player_local,4x,6dof,atv,abstract,action,action_rpg,action_adventure,addictive,adventure,agriculture,aliens,alternate_history,america,animation_&_modeling,anime,arcade,arena_shooter,artificial_intelligence,assassin,asynchronous_multiplayer,atmospheric,audio_production,bmx,base_building,baseball,...,text_based,third_person,third_person_shooter,thriller,time_attack,time_management,time_manipulation,time_travel,top_down,top_down_shooter,touch_friendly,tower_defense,trackir,trading,trading_card_game,trains,transhumanism,turn_based,turn_based_combat,turn_based_strategy,turn_based_tactics,tutorial,twin_stick_shooter,typing,underground,underwater,unforgiving,utilities,vr,vr_only,vampire,video_production,villain_protagonist,violent,visual_novel,voice_control,voxel,walking_simulator,war,wargame,warhammer_40k,web_publishing,werewolves,western,word_game,world_war_i,world_war_ii,wrestling,zombies,e_sports
0,10,Counter-Strike,Valve,Valve,124534,3339,"10,000,000 .. 20,000,000",17612,317,999.0,999.0,0.0,"English, French, German, Italian, Spanish - Sp...",Action,"{'Action': 2681, 'FPS': 2048, 'Multiplayer': 1...",144,564,0,0,0,0,0,0,0,0,0,0,0,0,2681,0,0,0,0,0,0,0,0,0,0,0,0,0,151,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,40,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,550
1,20,Team Fortress Classic,Valve,Valve,3318,633,"5,000,000 .. 10,000,000",277,62,499.0,499.0,0.0,"English, French, German, Italian, Spanish - Sp...",Action,"{'Action': 208, 'FPS': 188, 'Multiplayer': 172...",0,71,0,0,0,0,0,0,0,0,0,0,0,0,208,0,0,0,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,26,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,30,Day of Defeat,Valve,Valve,3416,398,"5,000,000 .. 10,000,000",187,34,499.0,499.0,0.0,"English, French, German, Italian, Spanish - Spain",Action,"{'FPS': 138, 'World War II': 122, 'Multiplayer...",0,0,0,0,0,0,0,0,0,0,0,0,0,0,99,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,80,0,0,0,0,0,0,5,122,0,0,0
3,40,Deathmatch Classic,Valve,Valve,1273,267,"5,000,000 .. 10,000,000",258,184,499.0,499.0,0.0,"English, French, German, Italian, Spanish - Sp...",Action,"{'Action': 85, 'FPS': 71, 'Multiplayer': 58, '...",0,0,0,0,0,0,0,0,0,0,0,0,0,0,85,0,0,0,0,0,0,0,0,0,0,0,22,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,50,Half-Life: Opposing Force,Gearbox Software,Valve,5250,288,"5,000,000 .. 10,000,000",624,415,499.0,499.0,0.0,"English, French, German, Korean",Action,"{'FPS': 235, 'Action': 211, 'Sci-fi': 166, 'Si...",0,77,0,0,0,0,0,0,0,0,0,0,0,0,211,0,0,0,87,0,122,0,0,0,0,0,0,0,0,0,73,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [131]:
# Check that the merge introduced no missing values in any column.
steamspy_data.isnull().sum()

appid                  0
name                   0
developer              0
publisher              0
positive               0
negative               0
owners                 0
average_forever        0
median_forever         0
price                  0
initialprice           0
discount               0
languages              0
genre                  0
tags                   0
1980s                  0
1990s                  0
2.5d                   0
2d                     0
2d_fighter             0
360_video              0
3d                     0
3d_platformer          0
3d_vision              0
4_player_local         0
4x                     0
6dof                   0
atv                    0
abstract               0
action                 0
                      ..
turn_based_tactics     0
tutorial               0
twin_stick_shooter     0
typing                 0
underground            0
underwater             0
unforgiving            0
utilities              0
vr                     0


In [81]:
def process_tags(df, n_tags=5):
    """Reduce the raw `tags` column to each row's most-voted tag names.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a `tags` column holding Python-literal strings such as
        ``"{'Action': 2681, 'FPS': 2048}"``.
    n_tags : int, default 5
        Maximum number of tag names to keep per row.

    Returns
    -------
    pandas.DataFrame
        A new frame in which `tags` is a ';'-joined string of up to `n_tags`
        tag names ordered by descending vote count. Rows whose `tags` value is
        not a dict literal are removed. The input frame is not modified.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``literal_eval`` for strings that are not valid literals.
    """
    df = df.copy()

    def parse_tags(x):
        """Return the top tag names joined by ';', or NaN when the row has no tag dict."""
        # Missing cells are not strings and cannot be parsed; mark them as missing.
        if not isinstance(x, str):
            return np.nan

        eval_x = literal_eval(x)

        if isinstance(eval_x, dict):
            # The stored key order is not always by votes (e.g. 'Early Access'
            # can be listed first with fewer votes than later tags), so rank
            # explicitly. sorted() is stable, so ties keep their stored order.
            ranked = sorted(eval_x, key=eval_x.get, reverse=True)
            return ';'.join(ranked[:n_tags])
        else:
            return np.nan

    df['tags'] = df['tags'].apply(parse_tags)

    # rows with null tags seem to be superseded by newer release, so remove (e.g. dead island)
    df = df[df['tags'].notnull()]

    return df

def process_owners(df):
    """Collapse each `owners` range to its lower bound followed by '+'.

    For example ``'10,000,000 .. 20,000,000'`` becomes ``'10,000,000+'``.
    A modified copy is returned; the input frame is left untouched.
    """
    def lower_bound_label(owner_range):
        # Everything before the first ' .. ' separator is the lower bound.
        return owner_range.partition(' .. ')[0] + '+'

    result = df.copy()
    result['owners'] = result['owners'].apply(lower_bound_label)
    return result

def process(df):
    """Run the full cleaning pipeline on the raw SteamSpy frame.

    Steps: keep rows that have a real name (not null and not the literal
    'none'), a developer and a languages value; drop the columns that are not
    needed; reduce `tags` via `process_tags`; simplify `owners` via
    `process_owners`. The input frame is not modified.
    """
    has_real_name = df['name'].notnull() & (df['name'] != 'none')
    keep = has_real_name & df['developer'].notnull() & df['languages'].notnull()

    unused_columns = [
        'genre', 'developer', 'publisher', 'score_rank', 'userscore',
        'average_2weeks', 'median_2weeks', 'price', 'initialprice',
        'discount', 'ccu',
    ]
    trimmed = df[keep].drop(unused_columns, axis=1)

    # keep top five tags, then turn the owners range into a lower-bound label
    cleaned = process_owners(process_tags(trimmed))

    # Idea (not applied): missing genres could be filled from the first tag, e.g.
    # df.loc[df['genre'].isnull(), 'genre'] = df.loc[df['genre'].isnull(), 'tags'].apply(lambda x: x.split(';')[0])
    # (`genre` would have to be kept in the frame for that to work.)

    # Known gap: some titles still appear more than once under the same name
    # with different appids.

    return cleaned


# Run the whole pipeline on the raw frame. This rebinds `steamspy_data`,
# replacing the wide merged tag table assigned to the same name earlier.
steamspy_data = process(raw_steamspy_data)
steamspy_data.head()

Unnamed: 0,appid,name,positive,negative,owners,average_forever,median_forever,languages,tags
0,10,Counter-Strike,124534,3339,"10,000,000+",17612,317,"English, French, German, Italian, Spanish - Sp...",Action;FPS;Multiplayer;Shooter;Classic
1,20,Team Fortress Classic,3318,633,"5,000,000+",277,62,"English, French, German, Italian, Spanish - Sp...",Action;FPS;Multiplayer;Classic;Shooter
2,30,Day of Defeat,3416,398,"5,000,000+",187,34,"English, French, German, Italian, Spanish - Spain",FPS;World War II;Multiplayer;Action;Shooter
3,40,Deathmatch Classic,1273,267,"5,000,000+",258,184,"English, French, German, Italian, Spanish - Sp...",Action;FPS;Multiplayer;Classic;Shooter
4,50,Half-Life: Opposing Force,5250,288,"5,000,000+",624,415,"English, French, German, Korean",FPS;Action;Sci-fi;Singleplayer;Classic


In [82]:
# Write the cleaned frame to CSV; index=False leaves the row index out of the file.
steamspy_data.to_csv('../data/steamspy_clean.csv', index=False)