# Import Libraries / Environment Setup

In [1]:
# Standard library
import os

# Third-party
# NOTE: `plt` must be the pyplot submodule, not the top-level matplotlib
# package; the plotting functions (plt.plot, plt.subplots, plt.show, ...)
# live in matplotlib.pyplot.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn


In [2]:
import kagglehub

# Download the Steam games dataset from Kaggle; `path` holds the local
# directory the files were saved to and is reused by the loading cell.
path = kagglehub.dataset_download("artermiloff/steam-games-dataset")
print(f"Dataset downloaded to: {path}")

  from .autonotebook import tqdm as notebook_tqdm


Dataset downloaded to: C:\Users\oscar\.cache\kagglehub\datasets\artermiloff\steam-games-dataset\versions\2


In [None]:
# Locate the cleaned March 2025 CSV inside the downloaded dataset directory
# and load it into the working DataFrame.
csv_file = os.path.join(path, 'games_march2025_cleaned.csv')
df = pd.read_csv(csv_file)

# Preview the first rows as the cell output.
df.head()

0       NaN
1       NaN
2       NaN
3       NaN
4       NaN
         ..
89613   NaN
89614   NaN
89615   NaN
89616   NaN
89617   NaN
Name: score_rank, Length: 89618, dtype: float64

# Data Preprocessing:

## Check for NA values:

In [None]:
# Number of missing values in each column (isna is the same check as isnull).
df.isna().sum()

appid                       0
name                        0
release_date                0
required_age                0
price                       0
dlc_count                   0
windows                     0
mac                         0
linux                       0
metacritic_score            0
achievements                0
recommendations             0
supported_languages         0
full_audio_languages        0
packages                    0
developers                  0
publishers                  0
categories                  0
genres                      0
user_score                  0
positive                    0
negative                    0
estimated_owners            0
average_playtime_forever    0
average_playtime_2weeks     0
median_playtime_forever     0
median_playtime_2weeks      0
discount                    0
peak_ccu                    0
tags                        0
pct_pos_total               0
num_reviews_total           0
pct_pos_recent              0
num_review

In [None]:
# Drop every column that contains at least one NA value, then drop the media
# columns that are not needed for the analysis.
#
# - The result is rebound to `df` instead of mutating in place, so re-running
#   this cell on an already-cleaned frame is a no-op rather than an error.
# - errors="ignore" covers the case where a media column has already been
#   removed (by the NA filter above, or by a previous run of this cell);
#   without it the drop raises KeyError.
df = (
    df
    .dropna(axis="columns")
    .drop(columns=['header_image', 'screenshots', 'movies'], errors="ignore")
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 89618 entries, 0 to 89617
Data columns (total 34 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   appid                     89618 non-null  int64  
 1   name                      89618 non-null  object 
 2   release_date              89618 non-null  object 
 3   required_age              89618 non-null  int64  
 4   price                     89618 non-null  float64
 5   dlc_count                 89618 non-null  int64  
 6   windows                   89618 non-null  bool   
 7   mac                       89618 non-null  bool   
 8   linux                     89618 non-null  bool   
 9   metacritic_score          89618 non-null  int64  
 10  achievements              89618 non-null  int64  
 11  recommendations           89618 non-null  int64  
 12  supported_languages       89618 non-null  object 
 13  full_audio_languages      89618 non-null  object 
 14  packag

In [None]:
# Parse 'release_date' into datetimes once, store it back, and derive the
# release year and month as separate columns for analysis.
release_dates = pd.to_datetime(df['release_date'])
df['release_date'] = release_dates
df['year'] = release_dates.dt.year
df['month'] = release_dates.dt.month

In [18]:
# Show the first five rows to confirm the new 'year' and 'month' columns.
df.head(n=5)

Unnamed: 0,appid,name,release_date,required_age,price,dlc_count,windows,mac,linux,metacritic_score,...,median_playtime_2weeks,discount,peak_ccu,tags,pct_pos_total,num_reviews_total,pct_pos_recent,num_reviews_recent,year,month
0,730,Counter-Strike 2,2012-08-21,0,0.0,1,True,False,True,0,...,350,0,1212356,"{'FPS': 90857, 'Shooter': 65397, 'Multiplayer'...",86,8632939,82,96473,2012,8
1,578080,PUBG: BATTLEGROUNDS,2017-12-21,0,0.0,0,True,False,False,0,...,0,0,616738,"{'Survival': 14838, 'Shooter': 12727, 'Battle ...",59,2513842,68,16720,2017,12
2,570,Dota 2,2013-07-09,0,0.0,2,True,True,True,90,...,892,0,555977,"{'Free to Play': 59933, 'MOBA': 20158, 'Multip...",81,2452595,80,29366,2013,7
3,271590,Grand Theft Auto V Legacy,2015-04-13,17,0.0,0,True,False,False,96,...,74,0,117698,"{'Open World': 32644, 'Action': 23539, 'Multip...",87,1803832,92,17517,2015,4
4,359550,Tom Clancy's Rainbow Six® Siege,2015-12-01,17,3.99,9,True,False,False,0,...,306,80,89916,"{'FPS': 9831, 'PvP': 9162, 'e-sports': 9072, '...",84,1168020,76,12608,2015,12
