## Import Libraries

In [2]:
# standard library imports
from ast import literal_eval
import itertools
import time
import re

# third-party imports
import numpy as np
import pandas as pd

# customisations
pd.set_option("display.max_columns", 100)

# SteamSpy Data 
Importing and Cleaning SteamSpy Data 
- `Step 1:` Identify Null Values & Missing Values, removing those that are irrelevant
- `Step 2:` Processing the data in certain columns
- `Step 3:` Exporting Cleaned SteamSpy DataSet
- `Step 4:` Export after Combining SteamApp & SteamSpy

In [3]:
# Import SteamSpy data
raw_steamspy_data = pd.read_csv('Uncleaned\steamspy_data.csv')

# print out number of rows and columns
print('Rows:', raw_steamspy_data.shape[0])
print('Columns:', raw_steamspy_data.shape[1])

# view first five rows
raw_steamspy_data.head()

Rows: 71193
Columns: 20


Unnamed: 0,appid,name,developer,publisher,score_rank,positive,negative,userscore,owners,average_forever,average_2weeks,median_forever,median_2weeks,price,initialprice,discount,languages,genre,ccu,tags
0,10,Counter-Strike,Valve,Valve,,229766.0,5992.0,0.0,"10,000,000 .. 20,000,000",9847,205,169,139,999.0,999.0,0.0,"English, French, German, Italian, Spanish - Sp...",Action,12919.0,"{'Action': 5468, 'FPS': 4894, 'Multiplayer': 3..."
1,20,Team Fortress Classic,Valve,Valve,,7005.0,1069.0,0.0,"5,000,000 .. 10,000,000",167,0,25,0,499.0,499.0,0.0,"English, French, German, Italian, Spanish - Sp...",Action,64.0,"{'Action': 762, 'FPS': 326, 'Multiplayer': 276..."
2,30,Day of Defeat,Valve,Valve,,6036.0,664.0,0.0,"5,000,000 .. 10,000,000",553,0,17,0,499.0,499.0,0.0,"English, French, German, Italian, Spanish - Spain",Action,96.0,"{'FPS': 797, 'World War II': 268, 'Multiplayer..."
3,40,Deathmatch Classic,Valve,Valve,,2415.0,511.0,0.0,"5,000,000 .. 10,000,000",70,0,13,0,499.0,499.0,0.0,"English, French, German, Italian, Spanish - Sp...",Action,3.0,"{'Action': 634, 'FPS': 149, 'Classic': 114, 'M..."
4,50,Half-Life: Opposing Force,Gearbox Software,Valve,,20697.0,1054.0,0.0,"2,000,000 .. 5,000,000",406,0,157,0,499.0,499.0,0.0,"English, French, German, Korean",Action,162.0,"{'FPS': 915, 'Action': 348, 'Classic': 279, 'S..."


## 1. SteamSpy Null Values & Unused Columns
- Step 1: Checking for NULL/Empty then 
- Step 2: Removing Columns that are not Important

In [4]:
# Check for NULL values in steamspy
print("Null values in steamspy:")
print(raw_steamspy_data.isnull().sum())
print("data shape: ", raw_steamspy_data.shape)

Null values in steamspy:
appid                  0
name                  12
developer            306
publisher            260
score_rank         71148
positive               9
negative               9
userscore              9
owners                 9
average_forever        0
average_2weeks         0
median_forever         0
median_2weeks          0
price                 38
initialprice          31
discount              31
languages             97
genre                388
ccu                    9
tags                   9
dtype: int64
data shape:  (71193, 20)


`score rank` that we are interested has many null values that require cleaning.

## 1.1 Handling missing values
1. `name`
2. `developer`
3. `publisher`
4. `price`
5. `languages`

In [12]:
print('Number of Games with NaN Names: ', raw_steamspy_data[raw_steamspy_data['name'].isnull()].shape[0])
raw_steamspy_data[raw_steamspy_data['name'].isnull()]

Number of Games with NaN Names:  12


Unnamed: 0,appid,name,developer,publisher,score_rank,positive,negative,userscore,owners,average_forever,average_2weeks,median_forever,median_2weeks,price,initialprice,discount,languages,genre,ccu,tags
486,17760,,,,,0.0,0.0,0.0,"0 .. 20,000",0,0,0,0,0.0,0.0,0.0,,,0.0,[]
9649,460250,,Jeroen Wimmers,Jeroen Wimmers,,160.0,20.0,0.0,"0 .. 20,000",0,0,0,0,0.0,0.0,0.0,"English, French, Italian, German, Spanish - Sp...","Casual, Indie",0.0,"{'Experimental': 90, 'Minimalist': 84, 'Relaxi..."
11130,506630,,,SA Industry,,1.0,0.0,0.0,"0 .. 20,000",0,0,0,0,0.0,0.0,0.0,"English, Not supported",,0.0,"{'Action': 21, 'Adventure': 21}"
13848,576960,,Nuclear Tales,,,110.0,26.0,0.0,"20,000 .. 50,000",139,0,273,0,299.0,1499.0,80.0,"English, Spanish - Spain, French, German","Adventure, Casual, Indie, RPG, Simulation, Str...",1.0,"{'Adventure': 377, 'Choose Your Own Adventure'..."
22544,806160,,Paleno Games,Paleno Games,,7.0,20.0,0.0,"20,000 .. 50,000",0,0,0,0,99.0,99.0,0.0,"English, French, Italian, German, Spanish - Sp...","Action, Adventure, Casual, Indie",0.0,"{'Action': 22, 'Indie': 22, 'Casual': 21, 'Adv..."
22546,806220,,"Jochen Heizmann, Emmanuel Henné, Intermediaware",Asylum Square,,30.0,14.0,0.0,"20,000 .. 50,000",0,0,0,0,0.0,0.0,0.0,English,"Action, Indie",1.0,"{'Side Scroller': 105, ""Shoot 'Em Up"": 99, 'Sh..."
28419,965340,,2nd Studio,2nd Studio,,55.0,8.0,0.0,"0 .. 20,000",0,0,0,0,199.0,199.0,0.0,"English, German, Danish, Japanese, Russian, Si...","Action, Indie, Simulation",0.0,"{'Indie': 34, 'Action': 31, 'Sexual Content': ..."
32263,1071920,,Ultimo Games,Ultimo Games,,19.0,0.0,0.0,"0 .. 20,000",0,0,0,0,399.0,399.0,0.0,English,"Action, Indie",0.0,"{'Indie': 33, 'Action': 33, 'Retro': 16, 'Arca..."
33828,1116910,,,,,12.0,16.0,0.0,"0 .. 20,000",0,0,0,0,699.0,699.0,0.0,,"Action, Adventure, Casual, Indie, RPG, Simulat...",0.0,"{'Casual': 33, 'Adventure': 31, 'Indie': 30, '..."
37415,1216770,,,Extreme Games,,1.0,0.0,0.0,"0 .. 20,000",0,0,0,0,0.0,0.0,0.0,"English, French",,0.0,"{'Casual': 104, 'Indie': 98, 'RPGMaker': 94, '..."


In [14]:
print('Number of Games with none Names: ', raw_steamspy_data[raw_steamspy_data['name'] == 'none'].shape[0])
raw_steamspy_data[raw_steamspy_data['name'] == 'none']

Number of Games with none Names:  2


Unnamed: 0,appid,name,developer,publisher,score_rank,positive,negative,userscore,owners,average_forever,average_2weeks,median_forever,median_2weeks,price,initialprice,discount,languages,genre,ccu,tags
5017,339860,none,,Green Sauce Games,,84.0,38.0,0.0,"50,000 .. 100,000",326,0,334,0,0.0,0.0,0.0,English,,0.0,"{'Indie': 53, 'RPG': 33, 'Adventure': 32, 'Mas..."
7443,398970,none,none,none,,9.0,23.0,0.0,"50,000 .. 100,000",228,0,272,0,0.0,0.0,0.0,English,"Adventure, Indie, Massively Multiplayer, RPG, ...",0.0,"{'Massively Multiplayer': 38, 'Strategy': 23, ..."


In [15]:
print('Number of Games with NaN Developers: ', raw_steamspy_data[raw_steamspy_data['developer'].isnull()].shape[0])
raw_steamspy_data[raw_steamspy_data['developer'].isnull()]

Number of Games with NaN Developers:  306


Unnamed: 0,appid,name,developer,publisher,score_rank,positive,negative,userscore,owners,average_forever,average_2weeks,median_forever,median_2weeks,price,initialprice,discount,languages,genre,ccu,tags
26,852,ValveTestApp852,,,,0.0,0.0,0.0,"0 .. 20,000",0,0,0,0,0.0,0.0,0.0,,,0.0,[]
145,4330,Star Trek: DAC - Demo,,,,0.0,0.0,0.0,"0 .. 20,000",0,0,0,0,0.0,0.0,0.0,,,0.0,[]
258,8740,Puzzlegeddon,,,,0.0,0.0,0.0,"0 .. 20,000",0,0,0,0,0.0,0.0,0.0,,,1.0,[]
266,8955,Borderlands DLC: Claptrap's New Robot Revolution,,,,0.0,0.0,0.0,"0 .. 20,000",0,0,0,0,0.0,0.0,0.0,,,0.0,[]
332,11390,Crash Time 2,,Meridian4,,1377.0,1039.0,0.0,"100,000 .. 200,000",296,42,249,42,499.0,499.0,0.0,"English, French, German","Action, Racing, Simulation",4.0,"{'Combat Racing': 251, 'Racing': 243, 'Sandbox..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71096,2840030,Sky Roller Coaster,,2kblater,,12.0,2.0,0.0,"0 .. 20,000",0,0,0,0,2999.0,2999.0,0.0,English,,2.0,"{'Action': 71, 'Adventure': 64, 'Casual': 63, ..."
71146,2852270,Ghost Ninja,,Anthony Russo,,12.0,1.0,0.0,"0 .. 20,000",0,0,0,0,2999.0,2999.0,0.0,English,,1.0,"{'Strategy': 210, 'Adventure': 205, 'Casual': ..."
71149,2852300,Ultimate Bicycle Stunts,,Anthony Russo,,13.0,1.0,0.0,"0 .. 20,000",0,0,0,0,2999.0,2999.0,0.0,English,,0.0,"{'Adventure': 290, 'Casual': 286, 'Simulation'..."
71150,2852310,Let's be a Tiger,,Anthony Russo,,15.0,3.0,0.0,"0 .. 20,000",0,0,0,0,2999.0,2999.0,0.0,English,,0.0,"{'Adventure': 211, 'Casual': 205, 'Simulation'..."


In [16]:
print('Number of Games with NaN Publisher: ', raw_steamspy_data[raw_steamspy_data['publisher'].isnull()].shape[0])
raw_steamspy_data[raw_steamspy_data['publisher'].isnull()]

Number of Games with NaN Publisher:  260


Unnamed: 0,appid,name,developer,publisher,score_rank,positive,negative,userscore,owners,average_forever,average_2weeks,median_forever,median_2weeks,price,initialprice,discount,languages,genre,ccu,tags
26,852,ValveTestApp852,,,,0.0,0.0,0.0,"0 .. 20,000",0,0,0,0,0.0,0.0,0.0,,,0.0,[]
66,2540,RIP - Trilogy,Elephant Games,,,226.0,150.0,0.0,"100,000 .. 200,000",6,0,6,0,499.0,499.0,0.0,English,"Casual, Indie",0.0,"{'Indie': 41, 'Casual': 37, 'Arcade': 17, 'Sho..."
76,2720,ThreadSpace: Hyperbol,Iocaine Studios,,,34.0,12.0,0.0,"0 .. 20,000",0,0,0,0,499.0,499.0,0.0,English,"Action, Indie, Strategy",1.0,"{'Multiplayer': 29, 'Action': 26, 'Strategy': ..."
145,4330,Star Trek: DAC - Demo,,,,0.0,0.0,0.0,"0 .. 20,000",0,0,0,0,0.0,0.0,0.0,,,0.0,[]
208,7260,Loki,Cyanide Studios,,,137.0,77.0,0.0,"20,000 .. 50,000",3,0,3,0,999.0,999.0,0.0,English,RPG,3.0,"{'RPG': 33, 'Hack and Slash': 16, 'Mythology':..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
69306,2621260,Pocket Race: Driver,"Eric Behme, Oliver Müller",,,0.0,1.0,0.0,"0 .. 20,000",0,0,0,0,399.0,399.0,0.0,"English, French, German, Spanish - Spain","Action, Casual, Racing",0.0,"{'Casual': 71, 'Racing': 65, 'Arcade': 42, 'Au..."
69839,2672990,The Line TD,Gliding Star Games,,,7.0,1.0,0.0,"0 .. 20,000",0,0,0,0,124.0,499.0,75.0,"English, German, Turkish","Action, Adventure, Casual, Indie, Simulation, ...",0.0,"{'Tower Defense': 281, 'Time Management': 275,..."
70243,2709240,Truck Offroad Sim,TR Studio,,,12.0,2.0,0.0,"0 .. 20,000",0,0,0,0,2999.0,2999.0,0.0,English,"Casual, Indie, Simulation, Sports",0.0,"{'Casual': 64, 'Simulation': 58, 'Sports': 55,..."
70862,2789510,FlightTracker3D,ClickPhase,,,4.0,1.0,0.0,"0 .. 20,000",0,0,0,0,499.0,499.0,0.0,English,Utilities,0.0,"{'Utilities': 20, 'Flight': 10}"


## 1.2 Removing unwanted columns
- We decided to remove the `score_rank` column as it contains too many missing values
- We also decided to remove the `userscore` column as most rows have a value of 0 here, so it won't be useful.

In [7]:
raw_steamspy_data['userscore'].value_counts().head()

userscore
0.0      71139
100.0        4
95.0         3
70.0         2
94.0         2
Name: count, dtype: int64

Three columns - `average_2weeks`, `median_2weeks`, and `ccu` - offer temporary insights (past 2 weeks and yesterday) and can be removed as they won't be relevant for future analysis.

In [8]:
drop_cols = [
    'score_rank', # too many missing values
    'userscore', # too little variance (most have 0)
    'genre', 'developer', 'publisher', 'price', 'initialprice', 'discount', # provided by Steam data
    'average_2weeks', 'median_2weeks', 'ccu' # not interested in temporally specific columns
]

## 2.1 Processing`Tags` Column
- "Tags" are interesting, but might overlap with "genres."  
- However, tags could offer unique details from players. 
- To explore this, we've separated the "tags" data for a closer look.  
- The first few rows show tags with vote counts, hinting at their potential value.

In [9]:
tags = raw_steamspy_data['tags']

print(tags[0])
tags.head()

{'Action': 5468, 'FPS': 4894, 'Multiplayer': 3440, 'Shooter': 3390, 'Classic': 2817, 'Team-Based': 1894, 'First-Person': 1734, 'Competitive': 1629, 'Tactical': 1368, "1990's": 1228, 'e-sports': 1213, 'PvP': 905, 'Old School': 802, 'Military': 644, 'Strategy': 628, 'Survival': 310, 'Score Attack': 294, '1980s': 274, 'Assassin': 235, 'Nostalgia': 176}


0    {'Action': 5468, 'FPS': 4894, 'Multiplayer': 3...
1    {'Action': 762, 'FPS': 326, 'Multiplayer': 276...
2    {'FPS': 797, 'World War II': 268, 'Multiplayer...
3    {'Action': 634, 'FPS': 149, 'Classic': 114, 'M...
4    {'FPS': 915, 'Action': 348, 'Classic': 279, 'S...
Name: tags, dtype: object

Tons of tags per game! To keep the main data tidy, we'll include just the top 3 tags. Like with Steam data, full tags (with votes) go in a separate file for easy merging later.  First, we are going to parse that stringy data with `literal_eval`.

In [10]:
eval_row = literal_eval(tags[0])

print(eval_row['Action'])
print(eval_row['FPS'])

5468
4894


We need a list of all unique tags, but each row only has its own tags.

We found a better approach:
- use `itertools.chain` to combine tags from all rows into a single sequence.
- set will then extract the unique tags from this sequence.

This is much faster than looping through each row! (Note: dictionaries only return the keys, not the values (votes), which is perfect for tags.)


In [11]:
# Replace NaN values with an empty dictionary
tags_filled = tags.fillna('{}')

# Now apply literal_eval
parsed_tags = tags_filled.apply(lambda x: literal_eval(x))

# Continue with the rest of the code
cols = set(itertools.chain(*parsed_tags))

print('Number of unique tags:', len(cols))
print('\nFirst few tags:', sorted(list(cols))[:5])

Number of unique tags: 450

First few tags: ['1980s', "1990's", '2.5D', '2D', '2D Fighter']


## 2.2 Extracting Tags and Votes:
- Dictionary Check: We'll use isinstance to identify rows containing tags (dictionaries). For non-dictionary rows, we'll assign empty dictionaries.
- Top Tags: We'll loop through the tags and extract the top 3 tag names (assuming order by votes) for each game. 
- Vote Counts: We'll check each tag name against the tags dictionary in each row to determine its corresponding vote count.

In [12]:
def parse_tags(x):
    if pd.isna(x):  # Check if x is NaN
        return {}    # Return an empty dictionary
    x = literal_eval(x)  # Otherwise, parse the string to a dictionary
    if isinstance(x, dict):  # Check if x is a dictionary
        return x
    elif isinstance(x, list):  # Check if x is a list
        return {}
    else:
        raise TypeError('Something other than dict or list found')

parsed_tags = tags.apply(parse_tags)
        
tag_data = pd.DataFrame()

# Create a list of columns
column_data = []

for col in sorted(cols):
    # standardise column names
    col_name = col.lower().replace(' ', '_').replace('-', '_').replace("'", "")
    
    # Append the column data to the list
    column_data.append(parsed_tags.apply(lambda x: x[col] if col in x.keys() else 0))

# Concatenate the columns to form the DataFrame
tag_data = pd.concat(column_data, axis=1)

# Rename columns
tag_data.columns = sorted(cols)

tag_data.head()


Unnamed: 0,1980s,1990's,2.5D,2D,2D Fighter,2D Platformer,360 Video,3D,3D Fighter,3D Platformer,3D Vision,4 Player Local,4X,6DOF,8-bit Music,ATV,Abstract,Action,Action RPG,Action RTS,Action Roguelike,Action-Adventure,Addictive,Adventure,Agriculture,Aliens,Alternate History,Ambient,America,Animation & Modeling,Anime,Arcade,Archery,Arena Shooter,Artificial Intelligence,Assassin,Asymmetric VR,Asynchronous Multiplayer,Atmospheric,Audio Production,Auto Battler,Automation,Automobile Sim,BMX,Base-Building,Baseball,Based On A Novel,Basketball,Batman,Battle Royale,...,Top-Down,Top-Down Shooter,Touch-Friendly,Tower Defense,TrackIR,Trading,Trading Card Game,Traditional Roguelike,Trains,Transhumanism,Transportation,Trivia,Turn-Based,Turn-Based Combat,Turn-Based Strategy,Turn-Based Tactics,Tutorial,Twin Stick Shooter,Typing,Underground,Underwater,Unforgiving,Utilities,VR,VR Only,Vampire,Vehicular Combat,Video Production,Vikings,Villain Protagonist,Violent,Visual Novel,Voice Control,Volleyball,Voxel,Walking Simulator,War,Wargame,Warhammer 40K,Web Publishing,Well-Written,Werewolves,Western,Wholesome,Word Game,World War I,World War II,Wrestling,Zombies,e-sports
0,274,1228,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5468,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,235,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1213
1,0,155,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,762,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,54,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,163,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,157,0,0,0,0,0,0,0,0,11,268,0,0,0
3,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,634,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,55,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,156,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,348,0,0,0,0,0,134,0,201,0,0,0,0,0,0,0,0,0,0,0,0,122,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


Top 3 keys (assuming order by votes) act as our top tags for each game (row). We'll join them with a semicolon (';') for a clean list.

In [13]:
def parse_tags(x):
    if pd.isna(x):  # Check if x is NaN
        return np.nan  # Return NaN if it is
    x = literal_eval(x)
    if isinstance(x, dict):
        return ';'.join(list(x.keys())[:3])
    else:
        return np.nan

tags.apply(parse_tags).head()

0          Action;FPS;Multiplayer
1          Action;FPS;Multiplayer
2    FPS;World War II;Multiplayer
3              Action;FPS;Classic
4              FPS;Action;Classic
Name: tags, dtype: object

## 2.3 Processing Owners Column
This column provides estimated owner ranges for each game due to Steam privacy restrictions, only estimation is given. Due to these constraints, we'll have to work with the available data.

In [14]:
owners = raw_steamspy_data['owners']
owners.head()

0    10,000,000 .. 20,000,000
1     5,000,000 .. 10,000,000
2     5,000,000 .. 10,000,000
3     5,000,000 .. 10,000,000
4      2,000,000 .. 5,000,000
Name: owners, dtype: object

We can address the `owners` column in two ways:

1. Split and Choose: Remove commas, split the data, and keep either the lower or upper bound.
2. Remove Entirely: Consider removing the column entirely due to its estimated nature.
3. Keep the formatting as it is 

In conclusion we decided to **keep the formatting as it is**. 
1. Spliting and choosing wouldn't be wise. For example, in the 0-20000 owners category, the number 0 and 20000 is a very big difference. If we keep the lower bound all the values will be 0, this might not be true for most games in this cat. The same could be said if we keep the upper bound, all values will be 20000 which might not be true as well.
2. Removing the column wouldn't be wise either. The owners column is a very important variable to consider when evaluating our problem. This column could be 1 of the indicators for a popular game.

In [15]:
owners_split = owners.str.replace(',', '').str.split(' .. ')
owners_split.apply(lambda x: int(x[0]) if isinstance(x, list) else np.nan).head()

0    10000000.0
1     5000000.0
2     5000000.0
3     5000000.0
4     2000000.0
Name: owners, dtype: float64

We reformatted for better readability and made it easier to do analysis on this column in the next part of the project

In [16]:
owners.str.replace(',', '').str.replace(' .. ', '-').head()

0    10000000-20000000
1     5000000-10000000
2     5000000-10000000
3     5000000-10000000
4      2000000-5000000
Name: owners, dtype: object

## 2.4 Processing Data Function and exporting tags
Process data function to handle the cleaning tasks. Most columns can be cleaned within this function. For the more complex "tags" column, we'll define a separate function later.

We'll then move on to cleaning the data, exporting full tags, and inspecting the results.

In [17]:
def process_tags(df, export=False):
    if export: 
        tag_data = df[['appid', 'tags']].copy()
        
        def parse_export_tags(x):
            if pd.isna(x):  # Check if x is NaN
                return {}    # Return an empty dictionary
            x = literal_eval(x)  # Otherwise, parse the string to a dictionary
            if isinstance(x, dict):  # Check if x is a dictionary
                return x
            elif isinstance(x, list):  # Check if x is a list
                return {}
            else:
                raise TypeError('Something other than dict or list found')

        tag_data['tags'] = tag_data['tags'].apply(parse_export_tags)

        cols = set(itertools.chain(*tag_data['tags']))

        # Create a list to hold the column data
        column_data = []
        for col in sorted(cols):
            col_name = col.lower().replace(' ', '_').replace('-', '_').replace("'", "")
            column_data.append(tag_data['tags'].apply(lambda x: x[col] if col in x.keys() else 0))

        # Concatenate the columns to form the DataFrame
        tag_data = pd.concat(column_data, axis=1)
        tag_data.columns = sorted(cols)

        tag_data.to_csv('Uncleaned/steamspy_tag_data.csv', index=False)
        print("Exported tag data to 'Uncleaned/steamspy_tag_data.csv'")
        
    def parse_tags(x):
        if pd.isna(x):  # Check if x is NaN
            return np.nan  # Return NaN if it is
        x = literal_eval(x)
        if isinstance(x, dict):
            return ';'.join(list(x.keys())[:3])
        else:
            return np.nan
    
    df['tags'] = df['tags'].apply(parse_tags)
    
    # Remove rows with null tags
    df = df[df['tags'].notnull()]
    
    return df

def process(df):
    df = df.copy()
    
    # handle missing values
    df = df[(df['name'].notnull()) & (df['name'] != 'none')]
    df = df[df['developer'].notnull()]
    df = df[df['languages'].notnull()]
    df = df[df['price'].notnull()]
    
    # remove unwanted columns
    df = df.drop([
        'genre', 'developer', 'publisher', 'score_rank', 'userscore', 'average_2weeks',
        'median_2weeks', 'price', 'initialprice', 'discount', 'ccu'
    ], axis=1)
    
    # keep top tags, exporting full tag data to file
    df = process_tags(df, export=True)
    
    # reformat owners column
    df['owners'] = df['owners'].str.replace(',', '').str.replace(' .. ', '-')
    
    return df


steamspy_data = process(raw_steamspy_data)
steamspy_data.head()

Exported tag data to 'Uncleaned/steamspy_tag_data.csv'


Unnamed: 0,appid,name,positive,negative,owners,average_forever,median_forever,languages,tags
0,10,Counter-Strike,229766.0,5992.0,10000000-20000000,9847,169,"English, French, German, Italian, Spanish - Sp...",Action;FPS;Multiplayer
1,20,Team Fortress Classic,7005.0,1069.0,5000000-10000000,167,25,"English, French, German, Italian, Spanish - Sp...",Action;FPS;Multiplayer
2,30,Day of Defeat,6036.0,664.0,5000000-10000000,553,17,"English, French, German, Italian, Spanish - Spain",FPS;World War II;Multiplayer
3,40,Deathmatch Classic,2415.0,511.0,5000000-10000000,70,13,"English, French, German, Italian, Spanish - Sp...",Action;FPS;Classic
4,50,Half-Life: Opposing Force,20697.0,1054.0,2000000-5000000,406,157,"English, French, German, Korean",FPS;Action;Classic


In [18]:
# inspect tag data
pd.read_csv('Uncleaned/steamspy_tag_data.csv').head()

Unnamed: 0,1980s,1990's,2.5D,2D,2D Fighter,2D Platformer,360 Video,3D,3D Fighter,3D Platformer,3D Vision,4 Player Local,4X,6DOF,8-bit Music,ATV,Abstract,Action,Action RPG,Action RTS,Action Roguelike,Action-Adventure,Addictive,Adventure,Agriculture,Aliens,Alternate History,Ambient,America,Animation & Modeling,Anime,Arcade,Archery,Arena Shooter,Artificial Intelligence,Assassin,Asymmetric VR,Asynchronous Multiplayer,Atmospheric,Audio Production,Auto Battler,Automation,Automobile Sim,BMX,Base-Building,Baseball,Based On A Novel,Basketball,Batman,Battle Royale,...,Top-Down,Top-Down Shooter,Touch-Friendly,Tower Defense,TrackIR,Trading,Trading Card Game,Traditional Roguelike,Trains,Transhumanism,Transportation,Trivia,Turn-Based,Turn-Based Combat,Turn-Based Strategy,Turn-Based Tactics,Tutorial,Twin Stick Shooter,Typing,Underground,Underwater,Unforgiving,Utilities,VR,VR Only,Vampire,Vehicular Combat,Video Production,Vikings,Villain Protagonist,Violent,Visual Novel,Voice Control,Volleyball,Voxel,Walking Simulator,War,Wargame,Warhammer 40K,Web Publishing,Well-Written,Werewolves,Western,Wholesome,Word Game,World War I,World War II,Wrestling,Zombies,e-sports
0,274,1228,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5468,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,235,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1213
1,0,155,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,762,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,54,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,163,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,157,0,0,0,0,0,0,0,0,11,268,0,0,0
3,0,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,634,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,55,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,156,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,348,0,0,0,0,0,134,0,201,0,0,0,0,0,0,0,0,0,0,0,0,122,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## 3. Merge and Export Cleaned Data
- Missing Values: Check for missing values in the cleaned data.
- Export Clean Data: Export the cleaned SteamSpy data to a CSV file.
- Merge Data: Explore options for combining the cleaned Steam data and SteamSpy data into a single, cohesive dataset for future analysis.

In [19]:
steamspy_data.to_csv('Cleaned/steamspy_cleaned.csv', index=False)

## 4. Combine SteamSpy and Steam Data
Cleaning Up:
- Remove duplicate SteamSpy information to the Steam Data (name, languages, AppID).
- Reindex the dataframe for proper order.
- Rename columns for clarity (better user experience for public data).

In [20]:
steam_data = pd.read_csv('Cleaned\steam_data_cleaned.csv')
    
merged = steam_data.merge(steamspy_data, left_on='steam_appid', right_on='appid', suffixes=('', '_steamspy'))
merged.head()

Unnamed: 0,name,steam_appid,platforms,categories,genres,achievements,release_date,price,english,developer,publisher,appid,name_steamspy,positive,negative,owners,average_forever,median_forever,languages,tags
0,Counter-Strike,10,windows;mac;linux,Multi-player;PvP;Online PvP;Shared/Split Scree...,Action,,2000-11-01,10.0,1,Valve,Valve,10,Counter-Strike,229766.0,5992.0,10000000-20000000,9847,169,"English, French, German, Italian, Spanish - Sp...",Action;FPS;Multiplayer
1,Team Fortress Classic,20,windows;mac;linux,Multi-player;PvP;Online PvP;Shared/Split Scree...,Action,,1999-04-01,5.25,1,Valve,Valve,20,Team Fortress Classic,7005.0,1069.0,5000000-10000000,167,25,"English, French, German, Italian, Spanish - Sp...",Action;FPS;Multiplayer
2,Day of Defeat,30,windows;mac;linux,Multi-player;Valve Anti-Cheat enabled;Family S...,Action,,2003-05-01,5.25,1,Valve,Valve,30,Day of Defeat,6036.0,664.0,5000000-10000000,553,17,"English, French, German, Italian, Spanish - Spain",FPS;World War II;Multiplayer
3,Deathmatch Classic,40,windows;mac;linux,Multi-player;PvP;Online PvP;Shared/Split Scree...,Action,,2001-06-01,5.25,1,Valve,Valve,40,Deathmatch Classic,2415.0,511.0,5000000-10000000,70,13,"English, French, German, Italian, Spanish - Sp...",Action;FPS;Classic
4,Half-Life: Opposing Force,50,windows;mac;linux,Single-player;Multi-player;Valve Anti-Cheat en...,Action,,1999-11-01,5.25,1,Gearbox Software,Valve,50,Half-Life: Opposing Force,20697.0,1054.0,2000000-5000000,406,157,"English, French, German, Korean",FPS;Action;Classic


In [21]:
# remove overlapping columns
steam_clean = merged.drop(['name_steamspy', 'languages', 'steam_appid'], axis=1)

#drop unwanted columns
#steam_clean = steam_clean.drop(['required_age','pc_requirements', 'mac_requirements', 'linux_requirements', 'support_info'], axis=1)

steam_clean = steam_clean.rename({ #for better readability
    'tags': 'steamspy_tags',
    'positive': 'positive_ratings',
    'negative': 'negative_ratings',
    'average_forever': 'average_playtime',
    'median_forever': 'median_playtime'
}, axis=1)

# Define the desired column order
new_column_order = [
    'appid',
    'name',
    'release_date',
    'english',
    'developer',
    'publisher',
    'platforms',
    'categories',
    'genres',
    'steamspy_tags',  
    'positive_ratings',  
    'negative_ratings',  
    'average_playtime',
    'median_playtime',
    'owners',
    'price'
]

# Reorder columns using reindex()
steam_clean = steam_clean.reindex(columns=new_column_order)

In [22]:
# export clean dataset
steam_clean.to_csv('Cleaned/steam_cleaned.csv', index=False)