# Setup and Imports

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


# Loading the dataset
The original CSV had a header error: a comma was missing between "Discount" and "DLC count", which shifted the column names relative to their values. I manually added the comma and saved the corrected file as games_fixed.csv. Below, the dataset is loaded and head() is used to check that everything lines up correctly. The target variable "Estimated owners" is then moved to the last column.

In [2]:
# Read the hand-corrected CSV (the path is relative to the notebook's working directory)
gamesData = pd.read_csv('data/raw/games_fixed.csv')

# Preview the first rows to confirm the header and the values are aligned
gamesData.head()

Unnamed: 0,AppID,Name,Release date,Estimated owners,Peak CCU,Required age,Price,Discount,DLC count,About the game,...,Average playtime two weeks,Median playtime forever,Median playtime two weeks,Developers,Publishers,Categories,Genres,Tags,Screenshots,Movies
0,20200,Galactic Bowling,"Oct 21, 2008",0 - 20000,0,0,19.99,0,0,Galactic Bowling is an exaggerated and stylize...,...,0,0,0,Perpetual FX Creative,Perpetual FX Creative,"Single-player,Multi-player,Steam Achievements,...","Casual,Indie,Sports","Indie,Casual,Sports,Bowling",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...
1,655370,Train Bandit,"Oct 12, 2017",0 - 20000,0,0,0.99,0,0,THE LAW!! Looks to be a showdown atop a train....,...,0,0,0,Rusty Moyher,Wild Rooster,"Single-player,Steam Achievements,Full controll...","Action,Indie","Indie,Action,Pixel Graphics,2D,Retro,Arcade,Sc...",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...
2,1732930,Jolt Project,"Nov 17, 2021",0 - 20000,0,0,4.99,0,0,Jolt Project: The army now has a new robotics ...,...,0,0,0,Campião Games,Campião Games,Single-player,"Action,Adventure,Indie,Strategy",,https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...
3,1355720,Henosis™,"Jul 23, 2020",0 - 20000,0,0,5.99,0,0,HENOSIS™ is a mysterious 2D Platform Puzzler w...,...,0,0,0,Odd Critter Games,Odd Critter Games,"Single-player,Full controller support","Adventure,Casual,Indie","2D Platformer,Atmospheric,Surreal,Mystery,Puzz...",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...
4,1139950,Two Weeks in Painland,"Feb 3, 2020",0 - 20000,0,0,0.0,0,0,ABOUT THE GAME Play as a hacker who has arrang...,...,0,0,0,Unusual Games,Unusual Games,"Single-player,Steam Achievements","Adventure,Indie","Indie,Adventure,Nudity,Violent,Sexual Content,...",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...


In [3]:
# Reorder the columns so that the target sits in the last position

target = 'Estimated owners'
cols = gamesData.columns.drop(target).tolist()  # all other columns, original order kept
cols.append(target)
gamesData = gamesData[cols]

gamesData.head()

Unnamed: 0,AppID,Name,Release date,Peak CCU,Required age,Price,Discount,DLC count,About the game,Supported languages,...,Median playtime forever,Median playtime two weeks,Developers,Publishers,Categories,Genres,Tags,Screenshots,Movies,Estimated owners
0,20200,Galactic Bowling,"Oct 21, 2008",0,0,19.99,0,0,Galactic Bowling is an exaggerated and stylize...,['English'],...,0,0,Perpetual FX Creative,Perpetual FX Creative,"Single-player,Multi-player,Steam Achievements,...","Casual,Indie,Sports","Indie,Casual,Sports,Bowling",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...,0 - 20000
1,655370,Train Bandit,"Oct 12, 2017",0,0,0.99,0,0,THE LAW!! Looks to be a showdown atop a train....,"['English', 'French', 'Italian', 'German', 'Sp...",...,0,0,Rusty Moyher,Wild Rooster,"Single-player,Steam Achievements,Full controll...","Action,Indie","Indie,Action,Pixel Graphics,2D,Retro,Arcade,Sc...",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...,0 - 20000
2,1732930,Jolt Project,"Nov 17, 2021",0,0,4.99,0,0,Jolt Project: The army now has a new robotics ...,"['English', 'Portuguese - Brazil']",...,0,0,Campião Games,Campião Games,Single-player,"Action,Adventure,Indie,Strategy",,https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...,0 - 20000
3,1355720,Henosis™,"Jul 23, 2020",0,0,5.99,0,0,HENOSIS™ is a mysterious 2D Platform Puzzler w...,"['English', 'French', 'Italian', 'German', 'Sp...",...,0,0,Odd Critter Games,Odd Critter Games,"Single-player,Full controller support","Adventure,Casual,Indie","2D Platformer,Atmospheric,Surreal,Mystery,Puzz...",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...,0 - 20000
4,1139950,Two Weeks in Painland,"Feb 3, 2020",0,0,0.0,0,0,ABOUT THE GAME Play as a hacker who has arrang...,"['English', 'Spanish - Spain']",...,0,0,Unusual Games,Unusual Games,"Single-player,Steam Achievements","Adventure,Indie","Indie,Adventure,Nudity,Violent,Sexual Content,...",https://cdn.akamai.steamstatic.com/steam/apps/...,http://cdn.akamai.steamstatic.com/steam/apps/2...,0 - 20000


# Removing irrelevant or "cheat" features
Removed features that are clearly irrelevant, such as Movies, Website and Screenshots. Also removed features that would allow the model to "cheat". The goal of this project is to predict the popularity of a game before it has launched or anyone has played it. Features like Peak Concurrent Users are direct indicators of a game's popularity and are measured after launch, so these types of features are removed as well.

In [4]:
# List every column currently in the dataset before deciding which ones to drop

print(gamesData.columns)

Index(['AppID', 'Name', 'Release date', 'Peak CCU', 'Required age', 'Price',
       'Discount', 'DLC count', 'About the game', 'Supported languages',
       'Full audio languages', 'Reviews', 'Header image', 'Website',
       'Support url', 'Support email', 'Windows', 'Mac', 'Linux',
       'Metacritic score', 'Metacritic url', 'User score', 'Positive',
       'Negative', 'Score rank', 'Achievements', 'Recommendations', 'Notes',
       'Average playtime forever', 'Average playtime two weeks',
       'Median playtime forever', 'Median playtime two weeks', 'Developers',
       'Publishers', 'Categories', 'Genres', 'Tags', 'Screenshots', 'Movies',
       'Estimated owners'],
      dtype='object')


In [5]:
# Columns excluded from modelling: irrelevant fields and "cheat" (leakage) features

dropped_cols = [
    # identifiers
    'AppID', 'Name',
    # counts, ranks and scores (Peak CCU is called out in the section notes as leakage)
    'Peak CCU', 'Positive', 'Negative', 'Recommendations',
    'Score rank', 'User score', 'Metacritic score',
    # links and contact details
    'Metacritic url', 'Header image', 'Website', 'Support url', 'Support email',
    # review / notes fields
    'Reviews', 'Notes',
    # media
    'Screenshots', 'Movies',
    # playtime statistics
    'Average playtime forever', 'Median playtime forever',
    'Average playtime two weeks', 'Median playtime two weeks',
]

gamesData = gamesData.drop(dropped_cols, axis='columns')

# Show remaining features in the dataset
print(gamesData.columns)

Index(['Release date', 'Required age', 'Price', 'Discount', 'DLC count',
       'About the game', 'Supported languages', 'Full audio languages',
       'Windows', 'Mac', 'Linux', 'Achievements', 'Developers', 'Publishers',
       'Categories', 'Genres', 'Tags', 'Estimated owners'],
      dtype='object')


# Target Variable Analysis
Now analyzing the target variable 'Estimated owners' and seeing what we can do with it.

In [6]:
# Checking the values of the target variable
# NOTE: the labels are strings, so sort_index() orders them lexicographically
# (e.g. '100000 - 200000' is listed before '20000 - 50000'), not by numeric size.
gamesData['Estimated owners'].value_counts().sort_index()

Estimated owners
0 - 0                    22091
0 - 20000                70206
100000 - 200000           2664
1000000 - 2000000          525
10000000 - 20000000         38
100000000 - 200000000        1
20000 - 50000             8277
200000 - 500000           2206
2000000 - 5000000          336
20000000 - 50000000         21
50000 - 100000            4068
500000 - 1000000           921
5000000 - 10000000          93
50000000 - 100000000         5
Name: count, dtype: int64

There's a range of 0 - 0, which can include games that weren't released, were removed, etc. These rows will be removed as they are noisy data. A new column 'popularity_class' will be added and become the new target variable. It bins the 'Estimated owners' ranges above into three categories, based on the upper bound of each range:
- Low: upper bound of at most 50,000
- Medium: upper bound above 50,000 and at most 500,000
- High: upper bound above 500,000

The original 'Estimated owners' column will then be removed.

In [7]:
# 1. Keep only the rows whose owners range is something other than "0 - 0"
gamesData = gamesData.loc[gamesData['Estimated owners'].ne('0 - 0')].copy()

# 2. Define the classification function
def categorize_popularity(owners_range, low_max=50000, medium_max=500000):
    """Map an 'Estimated owners' range string to a popularity class.

    Only the upper bound of the range decides the class.

    Parameters
    ----------
    owners_range : str or missing
        Range of non-negative integers formatted as '<low> - <high>',
        e.g. '20000 - 50000'. Thousands separators (',') and any amount of
        whitespace around the dash are tolerated.
    low_max : int, default 50000
        Largest upper bound that is still labelled 'Low'.
    medium_max : int, default 500000
        Largest upper bound that is still labelled 'Medium'.

    Returns
    -------
    str or None
        'Low', 'Medium' or 'High'; None when owners_range is missing
        (NaN / None).

    Raises
    ------
    ValueError
        If the text is not exactly two integers separated by '-'.
    """
    if pd.isna(owners_range):
        return None

    cleaned = owners_range.replace(',', '')
    # Both bounds are parsed so that malformed input still fails loudly, even
    # though only the upper bound is used. int() ignores surrounding whitespace.
    _, high = (int(part) for part in cleaned.split('-'))

    if high <= low_max:
        return 'Low'
    if high <= medium_max:
        return 'Medium'
    return 'High'

# 3. Label every row with its popularity class
gamesData['popularity_class'] = gamesData['Estimated owners'].map(categorize_popularity)

# 4. Inspect how many games fall into each class
gamesData['popularity_class'].value_counts()


popularity_class
Low       78483
Medium     8938
High       1940
Name: count, dtype: int64

In [8]:
# 'popularity_class' is the target from here on, so discard the raw owners range

gamesData = gamesData.drop('Estimated owners', axis='columns')
gamesData.head()

Unnamed: 0,Release date,Required age,Price,Discount,DLC count,About the game,Supported languages,Full audio languages,Windows,Mac,Linux,Achievements,Developers,Publishers,Categories,Genres,Tags,popularity_class
0,"Oct 21, 2008",0,19.99,0,0,Galactic Bowling is an exaggerated and stylize...,['English'],[],True,False,False,30,Perpetual FX Creative,Perpetual FX Creative,"Single-player,Multi-player,Steam Achievements,...","Casual,Indie,Sports","Indie,Casual,Sports,Bowling",Low
1,"Oct 12, 2017",0,0.99,0,0,THE LAW!! Looks to be a showdown atop a train....,"['English', 'French', 'Italian', 'German', 'Sp...",[],True,True,False,12,Rusty Moyher,Wild Rooster,"Single-player,Steam Achievements,Full controll...","Action,Indie","Indie,Action,Pixel Graphics,2D,Retro,Arcade,Sc...",Low
2,"Nov 17, 2021",0,4.99,0,0,Jolt Project: The army now has a new robotics ...,"['English', 'Portuguese - Brazil']",[],True,False,False,0,Campião Games,Campião Games,Single-player,"Action,Adventure,Indie,Strategy",,Low
3,"Jul 23, 2020",0,5.99,0,0,HENOSIS™ is a mysterious 2D Platform Puzzler w...,"['English', 'French', 'Italian', 'German', 'Sp...",[],True,True,True,0,Odd Critter Games,Odd Critter Games,"Single-player,Full controller support","Adventure,Casual,Indie","2D Platformer,Atmospheric,Surreal,Mystery,Puzz...",Low
4,"Feb 3, 2020",0,0.0,0,0,ABOUT THE GAME Play as a hacker who has arrang...,"['English', 'Spanish - Spain']",[],True,True,False,17,Unusual Games,Unusual Games,"Single-player,Steam Achievements","Adventure,Indie","Indie,Adventure,Nudity,Violent,Sexual Content,...",Low
