In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the steam_sales CSV into a DataFrame and preview its first rows.
csv_path = '../Dataset/steam_sales.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Game Name,Rating,#Reviews,Discount%,Price (€),Original Price (€),Release Date,Windows,Linux,MacOS,Fetched At
0,PEAK,7.0,77683,-38.0,4.64,7.49,"16 Jun, 2025",1,0,0,2025-08-11 00:00
1,Cyberpunk 2077,7.0,772574,-65.0,20.99,59.99,"9 Dec, 2020",1,0,1,2025-08-11 00:00
2,Rust,7.0,1038433,-50.0,19.99,39.99,"8 Feb, 2018",1,0,1,2025-08-11 00:00
3,Microsoft Flight Simulator (2020) 40th Anniver...,5.0,63905,-60.0,27.99,69.99,"17 Aug, 2020",1,0,0,2025-08-11 00:00
4,Sid Meier's Civilization VII,4.0,34643,-30.0,48.99,69.99,"10 Feb, 2025",1,1,1,2025-08-11 00:00


In [3]:
# List the column labels of the loaded frame.
df.columns

Index(['Game Name', 'Rating', '#Reviews', 'Discount%', 'Price (€)',
       'Original Price (€)', 'Release Date', 'Windows', 'Linux', 'MacOS',
       'Fetched At'],
      dtype='object')

In [5]:
# Size of the frame as (rows, columns).
df.shape

(736, 11)

# Data Cleaning & Pre-Processing

In [9]:
# Count the missing values in each column.
df.isna().sum()

Game Name             0
Rating                0
#Reviews              0
Discount%             0
Price (€)             0
Original Price (€)    0
Release Date          0
Windows               0
Linux                 0
MacOS                 0
Fetched At            0
dtype: int64

In [10]:
# True if at least one row is flagged as a duplicate of another row.
df.duplicated().any()

np.False_

In [11]:
# Summarise each column's dtype and non-null count, plus memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 736 entries, 0 to 735
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Game Name           736 non-null    object 
 1   Rating              736 non-null    float64
 2   #Reviews            736 non-null    object 
 3   Discount%           736 non-null    float64
 4   Price (€)           736 non-null    float64
 5   Original Price (€)  736 non-null    float64
 6   Release Date        736 non-null    object 
 7   Windows             736 non-null    int64  
 8   Linux               736 non-null    int64  
 9   MacOS               736 non-null    int64  
 10  Fetched At          736 non-null    object 
dtypes: float64(4), int64(3), object(4)
memory usage: 63.4+ KB


In [28]:
# Convert the '#Reviews' column (read in as object dtype) to nullable integers.
review_text = df['#Reviews'].astype(str)
# Drop commas, then keep only the text before the first '.'.
review_text = review_text.str.replace(',', '', regex=True)
whole_part = review_text.str.split('.').str[0]
# Values that still are not numeric become <NA> instead of raising.
review_counts = pd.to_numeric(whole_part, errors='coerce')
df['#Reviews'] = review_counts.astype('Int64')

In [24]:
# Parse both date-like columns into datetime values.
# Release dates: the format is inferred per element ('mixed'); entries that
# cannot be parsed are turned into NaT rather than raising.
release_parsed = pd.to_datetime(df['Release Date'], errors='coerce', format='mixed')
df['Release Date'] = release_parsed
# Fetch timestamps: parsed with pandas' default settings (no coercion requested).
fetched_parsed = pd.to_datetime(df['Fetched At'])
df['Fetched At'] = fetched_parsed

In [29]:
# Show column dtypes and non-null counts again to check the converted columns.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 736 entries, 0 to 735
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Game Name           736 non-null    object        
 1   Rating              736 non-null    float64       
 2   #Reviews            736 non-null    Int64         
 3   Discount%           736 non-null    float64       
 4   Price (€)           736 non-null    float64       
 5   Original Price (€)  736 non-null    float64       
 6   Release Date        736 non-null    datetime64[ns]
 7   Windows             736 non-null    int64         
 8   Linux               736 non-null    int64         
 9   MacOS               736 non-null    int64         
 10  Fetched At          736 non-null    datetime64[ns]
dtypes: Int64(1), datetime64[ns](2), float64(4), int64(3), object(1)
memory usage: 64.1+ KB


In [33]:
# Swap the verbose, symbol-laden headers for short identifiers.
# Maps old column label -> new column label; unlisted columns keep their names.
cols = {
    '#Reviews': 'Reviews',
    'Discount%': 'Disc',
    'Price (€)': 'DiscPrice',
    'Original Price (€)': 'OriPrice',
    'Release Date': 'Release',
}
# Applied in place, so `df` itself carries the new labels afterwards.
df.rename(columns=cols, inplace=True)

In [32]:
# Check the column labels after renaming.
df.columns

Index(['Game Name', 'Rating', 'Reviews', 'Disc', 'DiscPrice', 'OriPrice',
       'Release', 'Windows', 'Linux', 'MacOS', 'Fetched At'],
      dtype='object')

# Exploratory Data Analysis

In [34]:
# Summary statistics (count, mean, min, quartiles, max, std) for the frame's columns.
df.describe()

Unnamed: 0,Rating,Reviews,Disc,DiscPrice,OriPrice,Release,Windows,Linux,MacOS,Fetched At
count,736.0,736.0,736.0,736.0,736.0,736,736.0,736.0,736.0,736
mean,6.419837,29403.376359,-50.884511,12.575163,27.31466,2021-05-27 09:58:41.739130624,0.997283,0.247283,0.330163,2025-08-14 15:58:33.260869376
min,3.0,13.0,-90.0,1.46,2.99,2006-12-07 00:00:00,0.0,0.0,0.0,2025-08-11 00:00:00
25%,5.0,1328.75,-75.0,5.99,17.7225,2019-01-13 18:00:00,1.0,0.0,0.0,2025-08-11 00:00:00
50%,7.0,5612.5,-50.0,9.99,23.99,2022-09-27 00:00:00,1.0,0.0,0.0,2025-08-11 00:00:00
75%,7.0,23375.75,-30.0,15.4025,34.99,2024-08-27 06:00:00,1.0,0.0,1.0,2025-08-18 12:52:00
max,8.0,1038433.0,-10.0,59.99,99.99,2025-08-15 00:00:00,1.0,1.0,1.0,2025-08-18 12:52:00
std,1.296395,83397.041983,22.801873,8.786765,14.8803,,0.052093,0.431726,0.470591,
