## 2. Data preprocessing

### Import neccessary libraries

In [6]:
import pandas as pd
import numpy as np
import re
import json
from typing import List, Dict

Read collected datasets and view their shapes

In [7]:
steamspy_df = pd.read_csv('data/steamspy_data.csv')
steamstore_df = pd.read_csv('data/steamstore_data.csv')
steamspy_df.head()

Unnamed: 0,appid,name,developer,publisher,score_rank,positive,negative,userscore,owners,average_forever,average_2weeks,median_forever,median_2weeks,price,initialprice,discount,ccu
0,570,Dota 2,Valve,Valve,,1920937,433766,0,"200,000,000 .. 500,000,000",40585,1304,910,812,0,0,0,643177
1,730,Counter-Strike: Global Offensive,Valve,Valve,,7297590,1096059,0,"100,000,000 .. 200,000,000",32232,732,5620,313,0,0,0,969773
2,578080,PUBG: BATTLEGROUNDS,PUBG Corporation,"KRAFTON, Inc.",,1447873,1011012,0,"50,000,000 .. 100,000,000",22327,847,5729,288,0,0,0,322362
3,1172470,Apex Legends,Respawn,Electronic Arts,,651498,310966,0,"50,000,000 .. 100,000,000",9613,464,726,144,0,0,0,78658
4,440,Team Fortress 2,Valve,Valve,,995159,128852,0,"50,000,000 .. 100,000,000",11888,1401,354,306,0,0,0,41811


In [8]:
steamstore_df.head()

Unnamed: 0,appid,name,languages,metacritic,genres
0,570,Dota 2,"Bulgarian, Czech, Danish, Dutch, English<stron...","{'score': 90, 'url': 'https://www.metacritic.c...","[{'id': '1', 'description': 'Action'}, {'id': ..."
1,730,Counter-Strike 2,"Czech, Danish, Dutch, English<strong>*</strong...",,"[{'id': '1', 'description': 'Action'}, {'id': ..."
2,578080,PUBG: BATTLEGROUNDS,"English, Korean, Simplified Chinese, French, G...",,"[{'id': '1', 'description': 'Action'}, {'id': ..."
3,1172470,Apex Legends™,"English<strong>*</strong>, French<strong>*</st...","{'score': 88, 'url': 'https://www.metacritic.c...","[{'id': '1', 'description': 'Action'}, {'id': ..."
4,440,Team Fortress 2,"English<strong>*</strong>, Danish, Dutch, Finn...","{'score': 92, 'url': 'https://www.metacritic.c...","[{'id': '1', 'description': 'Action'}, {'id': ..."


Merge the two data together using appid and name

In [9]:
merged_df = pd.merge(steamspy_df, steamstore_df, on=['appid', 'name'], how='inner')
print(f"Number of games in merged_df: {merged_df.shape[0]}")
print(merged_df.isnull().sum())
merged_df = merged_df.dropna(subset=['languages','metacritic','genres'])
print(f"Number of games in merged_df after dropping : {merged_df.shape[0]}")

Number of games in merged_df: 9373
appid                 0
name                  0
developer            31
publisher            14
score_rank         9371
positive              0
negative              0
userscore             0
owners                0
average_forever       0
average_2weeks        0
median_forever        0
median_2weeks         0
price                 0
initialprice          0
discount              0
ccu                   0
languages             2
metacritic         6894
genres               26
dtype: int64
Number of games in merged_df after dropping : 2478


Preprocessing data

In [10]:
# Function to extract the Metacritic score from json string
def extract_metacritic_score(metacritic):
    if pd.isnull(metacritic):
        return None
    try:
        if isinstance(metacritic, str):
            metacritic = eval(metacritic)
        return metacritic.get('score')
    except:
        return None

# Function to count the number of languages
def count_languages(languages_str):
    if pd.isnull(languages_str) or not isinstance(languages_str, str):
        return 0
    languages_str = re.sub('<[^<]+?>', '', languages_str)
    return len(languages_str.split(','))

# Extract genres into a list
def extract_genres(genre_list):
    if pd.isnull(genre_list):
        return []
    try:
        if isinstance(genre_list, str):
            genre_list = eval(genre_list)
        return [genre['description'].lower() for genre in genre_list]
    except:
        return []
    
# Apply the functions
merged_df['metacritic'] = merged_df['metacritic'].apply(extract_metacritic_score)
merged_df['languages'] = merged_df['languages'].apply(count_languages)

merged_df['genre_list'] = merged_df['genres'].apply(extract_genres)
# List of top genres to create boolean columns for
top_genres = ['indie', 'action', 'casual', 'adventure', 'simulation',
              'rpg', 'strategy', 'sports', 'racing', 'massively multiplayer']
# Create boolean columns for each top genre
for genre in top_genres:
    merged_df[f'genre_{genre}'] = merged_df['genre_list'].apply(lambda x: int(genre in x))
    
selected_columns = ['appid', 'name', 'metacritic', 'userscore', 'positive', 'negative', 'median_forever', 'ccu', 'price', 'languages']
selected_columns += [f'genre_{genre}' for genre in top_genres]

# Create the final DataFrame with a copy
processed_df = merged_df[selected_columns].copy()

# Remove duplicates based on appid (keeping the first occurrence)
processed_df = processed_df.drop_duplicates(subset=['appid'], keep='first')

# Rename column
processed_df = processed_df.rename(columns={'median_forever': 'median_playtime'})

# Calculate userscore
processed_df['userscore'] = processed_df['positive'] / (processed_df['positive'] + processed_df['negative'])
processed_df = processed_df.drop(['positive', 'negative'], axis=1)

# Calculate median and replace zeros for playtime
median_of_median_playtime = int(processed_df[processed_df['median_playtime'] > 0]['median_playtime'].median())
processed_df.loc[processed_df['median_playtime'] == 0, 'median_playtime'] = median_of_median_playtime

# Apply log transformation to playtime
processed_df['median_playtime'] = processed_df['median_playtime'].astype('float64')
processed_df['median_playtime'] = np.log1p(processed_df['median_playtime'])

# Calculate median and replace zeros for concurrent users
median_of_ccu = int(processed_df[processed_df['ccu'] > 0]['ccu'].median())
processed_df.loc[processed_df['ccu'] == 0, 'ccu'] = median_of_ccu

# Apply log transformation to concurrent users
processed_df['ccu'] = processed_df['ccu'].astype('float64')
processed_df['ccu'] = np.log1p(processed_df['ccu'])

# Save processed data to a CSV file
processed_df.to_csv('data/processed_data.csv', index=False)