## Steam Data Cleaning Notebook

In [1]:
# import libraries

import pandas as pd
import numpy as np
import math as m
from ast import literal_eval
import itertools

import matplotlib.pyplot as plt
import seaborn as sns

import datetime
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import (VotingClassifier, 
                              RandomForestClassifier, 
                              AdaBoostClassifier, 
                              GradientBoostingClassifier, 
                              AdaBoostClassifier,
                              BaggingClassifier
                             )
from imblearn.pipeline import Pipeline as imbPipeline
from sklearn.pipeline import FeatureUnion
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_score, cross_validate
from imblearn.over_sampling import SMOTENC
from imblearn.under_sampling import ClusterCentroids
from sklearn.metrics import (accuracy_score, 
                             f1_score, 
                             recall_score, 
                             precision_score,
                             precision_recall_curve,
                             make_scorer,
                             confusion_matrix,
                             plot_confusion_matrix,
                             plot_roc_curve,
                             classification_report,
                             roc_curve,
                             auc,
                             plot_precision_recall_curve,
                             roc_auc_score,
                             log_loss
                             
)
# For plotting the tree.
from sklearn.tree import export_graphviz, plot_tree
# from pydotplus import graph_from_dot_data
from IPython.display import Image
from sklearn import tree

import warnings
warnings.filterwarnings('ignore')

import joblib
import xgboost as xgb
from xgboost import plot_importance

In [2]:
df2 = pd.read_csv('test/steamspy_data_full.csv')
df2.isnull().sum()

appid                 0
name                  0
developer            26
publisher            20
score_rank         5996
positive              0
negative              0
userscore             0
owners                0
average_forever       0
average_2weeks        0
median_forever        0
median_2weeks         0
price                 0
initialprice          0
discount              0
languages             2
genre                32
ccu                   0
tags                  0
dtype: int64

In [3]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6000 entries, 0 to 5999
Data columns (total 20 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   appid            6000 non-null   int64  
 1   name             6000 non-null   object 
 2   developer        5974 non-null   object 
 3   publisher        5980 non-null   object 
 4   score_rank       4 non-null      float64
 5   positive         6000 non-null   int64  
 6   negative         6000 non-null   int64  
 7   userscore        6000 non-null   int64  
 8   owners           6000 non-null   object 
 9   average_forever  6000 non-null   int64  
 10  average_2weeks   6000 non-null   int64  
 11  median_forever   6000 non-null   int64  
 12  median_2weeks    6000 non-null   int64  
 13  price            6000 non-null   int64  
 14  initialprice     6000 non-null   int64  
 15  discount         6000 non-null   int64  
 16  languages        5998 non-null   object 
 17  genre         

# Find Duplicates in Appid Column

In [4]:
df2.duplicated('appid').value_counts()

False    5993
True        7
dtype: int64

In [5]:
df2[df2.duplicated('appid')]

Unnamed: 0,appid,name,developer,publisher,score_rank,positive,negative,userscore,owners,average_forever,average_2weeks,median_forever,median_2weeks,price,initialprice,discount,languages,genre,ccu,tags
3310,322680,BLACKHOLE,FiolaSoft Studio,1C Entertainment,,857,133,0,"200,000 .. 500,000",125,0,146,0,899,899,0,"English, Czech, Spanish - Spain, French, Itali...","Action, Adventure, Indie",0,"{'Adventure': 77, 'Indie': 72, 'Puzzle-Platfor..."
3436,394970,DEADBOLT,Hopoo Games,Hopoo Games,,4084,174,0,"200,000 .. 500,000",218,0,321,0,999,999,0,English,"Action, Indie",16,"{'Pixel Graphics': 156, 'Stealth': 151, 'Great..."
4804,1020470,Evoland Legendary Edition,Shiro Games,Shiro Unlimited,,1352,296,0,"100,000 .. 200,000",42,0,64,0,1999,1999,0,"English, French, German","Action, Adventure, Casual, Indie, RPG",22,"{'RPG': 74, 'Adventure': 58, 'Action': 54, 'In..."
5001,2450,Bloody Good Time,Outerlight Ltd.,Ubisoft,,749,224,0,"100,000 .. 200,000",514,0,667,0,499,499,0,English,Action,2,"{'Action': 75, 'Multiplayer': 37, 'FPS': 35, '..."
5165,282400,SuperPower 2 Steam Edition,GolemLabs,THQ Nordic,,1818,456,0,"100,000 .. 200,000",944,0,547,0,999,999,0,"English, French, Italian, German, Spanish - Sp...","Simulation, Strategy",94,"{'Political Sim': 142, 'Strategy': 134, 'Simul..."
5262,339120,Fork Parker's Holiday Profit Hike,Dodge Roll,Devolver Digital,,551,113,0,"100,000 .. 200,000",114,0,26,0,0,0,0,English,"Action, Adventure, Free to Play, Indie",1,"{'Free to Play': 346, 'Platformer': 61, 'Indie..."
5453,474830,Time in Time,ErayTek,ErayTek,,54,19,0,"100,000 .. 200,000",416,0,444,0,199,199,0,English,"Casual, Indie, Strategy",23,"{'Indie': 126, 'Strategy': 120, 'Casual': 118,..."


In [6]:
df2[df2['appid'] == 322680]

Unnamed: 0,appid,name,developer,publisher,score_rank,positive,negative,userscore,owners,average_forever,average_2weeks,median_forever,median_2weeks,price,initialprice,discount,languages,genre,ccu,tags
2347,322680,BLACKHOLE,FiolaSoft Studio,1C Entertainment,,857,133,0,"200,000 .. 500,000",125,0,146,0,899,899,0,"English, Czech, Spanish - Spain, French, Itali...","Action, Adventure, Indie",0,"{'Adventure': 77, 'Indie': 72, 'Puzzle-Platfor..."
3310,322680,BLACKHOLE,FiolaSoft Studio,1C Entertainment,,857,133,0,"200,000 .. 500,000",125,0,146,0,899,899,0,"English, Czech, Spanish - Spain, French, Itali...","Action, Adventure, Indie",0,"{'Adventure': 77, 'Indie': 72, 'Puzzle-Platfor..."


In [7]:
df2 = df2.drop_duplicates(subset=['appid'], keep = 'first')

In [8]:
df2.duplicated('appid').value_counts()

False    5993
dtype: int64

## Check For Missing Values

In [9]:
df2[df2['developer'].isnull()]

Unnamed: 0,appid,name,developer,publisher,score_rank,positive,negative,userscore,owners,average_forever,average_2weeks,median_forever,median_2weeks,price,initialprice,discount,languages,genre,ccu,tags
391,247120,Portal 2 Sixense Perceptual Pack,,Sixense,,267,237,0,"1,000,000 .. 2,000,000",99,0,8,0,0,0,0,English,,3,"{'Adventure': 5002, 'Free to Play': 85, 'Actio..."
1174,214190,Minimum,,"Atari Inc., Cubed Productions LLC",,2453,1646,0,"200,000 .. 500,000",287,0,457,0,0,0,0,English,,0,"{'Action': 188, 'Third-Person Shooter': 137, '..."
1440,353380,Steam Link,,Anima Locus,,5784,1778,0,"200,000 .. 500,000",0,0,0,0,0,0,0,,,2,"{'Steam Machine': 430, 'Software': 36, 'Shoote..."
1503,396160,Secret Of Magia,,Senpai Studios,,380,415,0,"500,000 .. 1,000,000",258,0,255,0,199,199,0,English,,231,"{'RPGMaker': 58, 'Anime': 39, 'RPG': 36, 'Indi..."
1583,449680,SAMOLIOTIK,,,,2223,420,0,"500,000 .. 1,000,000",310,0,311,0,54,99,45,"English, Russian","Action, Casual, Indie",3,"{'Casual': 823, 'Indie': 75, 'Illuminati': 72,..."
1918,1108320,妄想破绽 Broken Delusion,,bilibili,,2509,1426,0,"500,000 .. 1,000,000",288,0,307,0,699,699,0,"English, Not supported, Japanese, Simplified C...",,3,"{'Indie': 40, 'Adventure': 38, 'Visual Novel':..."
2213,242550,Rayman Legends,,Ubisoft,,5183,511,0,"200,000 .. 500,000",1086,103,1106,103,2999,2999,0,"English, French, Italian, German, Spanish - Sp...","Action, Adventure",123,"{'Platformer': 420, 'Adventure': 270, 'Action'..."
2214,242940,Anachronox,,Square Enix,,346,131,0,"200,000 .. 500,000",302,0,450,0,97,699,86,English,,5,"{'RPG': 98, 'Cyberpunk': 64, 'Sci-fi': 59, 'Co..."
2226,247910,Sniper Elite: Nazi Zombie Army 2,,Rebellion,,3361,599,0,"200,000 .. 500,000",264,0,260,0,1499,1499,0,"English, French, Italian, German, Spanish - Sp...",,6,"{'Zombies': 325, 'Action': 215, 'Co-op': 196, ..."
2245,253650,Sparkle 2 Evo,,Forever Entertainment S. A.,,1421,545,0,"200,000 .. 500,000",160,0,164,0,499,499,0,English,"Action, Casual, Indie, Simulation",1,"{'Casual': 79, 'Indie': 68, 'Simulation': 57, ..."


In [10]:
# create def function
# Replace all missing values to unknown
df2['developer'] = df2['developer'].fillna('Unknown')
df2['developer'] = df2['developer'].replace('none', 'Unknown')
df2['developer'] = df2['developer'].fillna('Unknown')
df2['languages'] = df2['languages'].fillna('Unknown')
df2['publisher'] = df2['publisher'].fillna('Unknown')
df2['genre'] = df2['genre'].fillna('Unknown')
# Change Value type for positive and negative ratings
df2['positive'].astype(float)
df2['negative'].astype(float)

0       4995.0
1        905.0
2        557.0
3        417.0
4        675.0
         ...  
5995       7.0
5996       1.0
5997       3.0
5998       2.0
5999       1.0
Name: negative, Length: 5993, dtype: float64

In [11]:
# https://steamdb.info/blog/steamdb-rating/
# https://steamcommunity.com/discussions/forum/7/541907867783884152/
# Use source for getting review score and rating score
total = df2['positive'] + df2['negative']
df2['review_score'] = round((df2['positive'] / total),2)
df2['rating'] = round(df2['review_score'] - (df2['review_score'] - .5) * 2 ** (-np.log10(total + 1)), 2)*100


In [12]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5993 entries, 0 to 5999
Data columns (total 22 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   appid            5993 non-null   int64  
 1   name             5993 non-null   object 
 2   developer        5993 non-null   object 
 3   publisher        5993 non-null   object 
 4   score_rank       4 non-null      float64
 5   positive         5993 non-null   int64  
 6   negative         5993 non-null   int64  
 7   userscore        5993 non-null   int64  
 8   owners           5993 non-null   object 
 9   average_forever  5993 non-null   int64  
 10  average_2weeks   5993 non-null   int64  
 11  median_forever   5993 non-null   int64  
 12  median_2weeks    5993 non-null   int64  
 13  price            5993 non-null   int64  
 14  initialprice     5993 non-null   int64  
 15  discount         5993 non-null   int64  
 16  languages        5993 non-null   object 
 17  genre         

In [13]:
# check missing values for review score
df2[df2['review_score'].isnull()]

Unnamed: 0,appid,name,developer,publisher,score_rank,positive,negative,userscore,owners,average_forever,...,median_2weeks,price,initialprice,discount,languages,genre,ccu,tags,review_score,rating
112,21110,F.E.A.R.,"Monolith Productions, Inc., Timegate","Warner Bros. Games, Warner Bros. Interactive E...",,0,0,0,"1,000,000 .. 2,000,000",208,...,0,5499,5499,0,English,Action,12,[],,
113,21120,F.E.A.R.,"Monolith Productions, Inc., Timegate","Warner Bros. Games, Warner Bros. Interactive E...",,0,0,0,"1,000,000 .. 2,000,000",198,...,0,5499,5499,0,English,Action,9,[],,
3928,1294340,Wonhon: A Vengeful Spirit,BUSAN SANAI GAMES,Super.com,,0,0,0,"200,000 .. 500,000",0,...,0,1599,1599,0,"English, French, German, Spanish - Spain, Port...","Action, Adventure, Indie, Strategy",1,"{'Strategy': 589, 'Real Time Tactics': 414, 'D...",,
5013,8640,RACE On,SimBin,SimBin,,0,0,0,"100,000 .. 200,000",0,...,0,799,799,0,"English, French, German, Italian, Spanish - Spain",Racing,0,[],,


In [14]:
# Fill positive & negative score that have 0 values to 0
df2['review_score'] = df2['review_score'].fillna(0)
df2['rating'] = df2['rating'].fillna(0)

In [15]:
# drop score_rank column and userscore columns
df2.drop(columns = ['score_rank', 'userscore'], axis = 0, inplace=True )

## Create a Min, Max Owners

In [16]:
# split the '...' in between the owner range and replace it with a ',' in between
owners = df2['owners'].str.replace(',','').str.split(' .. ')

In [17]:
owners

0       [10000000, 20000000]
1        [5000000, 10000000]
2        [5000000, 10000000]
3        [5000000, 10000000]
4        [5000000, 10000000]
                ...         
5995        [100000, 200000]
5996        [100000, 200000]
5997        [100000, 200000]
5998        [100000, 200000]
5999        [100000, 200000]
Name: owners, Length: 5993, dtype: object

In [18]:
# set a variable for lower amount of owners
owners_min = owners.apply(lambda x:  int(x[0]))

In [19]:
owners_min

0       10000000
1        5000000
2        5000000
3        5000000
4        5000000
          ...   
5995      100000
5996      100000
5997      100000
5998      100000
5999      100000
Name: owners, Length: 5993, dtype: int64

In [20]:
# set a variable for max amount of owners
owners_max = owners.apply(lambda x: int(x[1]))

In [21]:
owners_max.value_counts()

200000       2150
500000       2032
1000000       867
2000000       495
5000000       299
10000000       90
20000000       37
50000000       18
100000000       4
200000000       1
Name: owners, dtype: int64

In [22]:
df2['owners_min'] = owners_min
df2['owners_max'] =  owners_max

In [23]:
df2

Unnamed: 0,appid,name,developer,publisher,positive,negative,owners,average_forever,average_2weeks,median_forever,...,initialprice,discount,languages,genre,ccu,tags,review_score,rating,owners_min,owners_max
0,10,Counter-Strike,Valve,Valve,194508,4995,"10,000,000 .. 20,000,000",12298,3385,202,...,999,0,"English, French, German, Italian, Spanish - Sp...",Action,14724,"{'Action': 5383, 'FPS': 4807, 'Multiplayer': 3...",0.97,96.0,10000000,20000000
1,20,Team Fortress Classic,Valve,Valve,5485,905,"5,000,000 .. 10,000,000",624,0,23,...,499,0,"English, French, German, Italian, Spanish - Sp...",Action,95,"{'Action': 746, 'FPS': 307, 'Multiplayer': 258...",0.86,83.0,5000000,10000000
2,30,Day of Defeat,Valve,Valve,5052,557,"5,000,000 .. 10,000,000",735,909,10,...,499,0,"English, French, German, Italian, Spanish - Spain",Action,134,"{'FPS': 789, 'World War II': 250, 'Multiplayer...",0.90,87.0,5000000,10000000
3,40,Deathmatch Classic,Valve,Valve,1876,417,"5,000,000 .. 10,000,000",1362,0,19,...,499,0,"English, French, German, Italian, Spanish - Sp...",Action,7,"{'Action': 630, 'FPS': 140, 'Classic': 108, 'M...",0.82,79.0,5000000,10000000
4,50,Half-Life: Opposing Force,Gearbox Software,Valve,13557,675,"5,000,000 .. 10,000,000",651,37,130,...,499,0,"English, French, German, Korean",Action,116,"{'FPS': 883, 'Action': 324, 'Classic': 252, 'S...",0.95,92.0,5000000,10000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,1810240,Do Something,StrelitziaGames,StrelitziaGames,3,7,"100,000 .. 200,000",0,0,0,...,0,0,"English, Simplified Chinese, Russian, Japanese...","Action, Adventure",1,"{'Zombies': 253, 'Action': 252, 'Survival Horr...",0.30,40.0,100000,200000
5996,1842410,Deadly Racing Duel,DDRACE,kovalevviktor,9,1,"100,000 .. 200,000",0,0,0,...,1099,0,English,Indie,0,"{'Difficult': 248, 'Rogue-like': 243, 'Pixel G...",0.90,71.0,100000,200000
5997,1873960,Dungeon Crawler,Jinxi,Jinxi,5,3,"100,000 .. 200,000",0,0,0,...,599,0,English,"Casual, Indie, RPG, Strategy",0,"{'Turn-Based Combat': 443, 'Strategy': 439, 'D...",0.62,56.0,100000,200000
5998,1889620,AI Roguelite,Max Loh,Max Loh,4,2,"100,000 .. 200,000",0,0,0,...,499,0,English,"Adventure, Indie, RPG, Early Access",0,"{'Early Access': 448, 'RPG': 407, 'Text-Based'...",0.67,58.0,100000,200000


In [24]:
df2.drop('owners', axis=1, inplace=True)

In [25]:
df2

Unnamed: 0,appid,name,developer,publisher,positive,negative,average_forever,average_2weeks,median_forever,median_2weeks,...,initialprice,discount,languages,genre,ccu,tags,review_score,rating,owners_min,owners_max
0,10,Counter-Strike,Valve,Valve,194508,4995,12298,3385,202,6680,...,999,0,"English, French, German, Italian, Spanish - Sp...",Action,14724,"{'Action': 5383, 'FPS': 4807, 'Multiplayer': 3...",0.97,96.0,10000000,20000000
1,20,Team Fortress Classic,Valve,Valve,5485,905,624,0,23,0,...,499,0,"English, French, German, Italian, Spanish - Sp...",Action,95,"{'Action': 746, 'FPS': 307, 'Multiplayer': 258...",0.86,83.0,5000000,10000000
2,30,Day of Defeat,Valve,Valve,5052,557,735,909,10,909,...,499,0,"English, French, German, Italian, Spanish - Spain",Action,134,"{'FPS': 789, 'World War II': 250, 'Multiplayer...",0.90,87.0,5000000,10000000
3,40,Deathmatch Classic,Valve,Valve,1876,417,1362,0,19,0,...,499,0,"English, French, German, Italian, Spanish - Sp...",Action,7,"{'Action': 630, 'FPS': 140, 'Classic': 108, 'M...",0.82,79.0,5000000,10000000
4,50,Half-Life: Opposing Force,Gearbox Software,Valve,13557,675,651,37,130,37,...,499,0,"English, French, German, Korean",Action,116,"{'FPS': 883, 'Action': 324, 'Classic': 252, 'S...",0.95,92.0,5000000,10000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,1810240,Do Something,StrelitziaGames,StrelitziaGames,3,7,0,0,0,0,...,0,0,"English, Simplified Chinese, Russian, Japanese...","Action, Adventure",1,"{'Zombies': 253, 'Action': 252, 'Survival Horr...",0.30,40.0,100000,200000
5996,1842410,Deadly Racing Duel,DDRACE,kovalevviktor,9,1,0,0,0,0,...,1099,0,English,Indie,0,"{'Difficult': 248, 'Rogue-like': 243, 'Pixel G...",0.90,71.0,100000,200000
5997,1873960,Dungeon Crawler,Jinxi,Jinxi,5,3,0,0,0,0,...,599,0,English,"Casual, Indie, RPG, Strategy",0,"{'Turn-Based Combat': 443, 'Strategy': 439, 'D...",0.62,56.0,100000,200000
5998,1889620,AI Roguelite,Max Loh,Max Loh,4,2,0,0,0,0,...,499,0,English,"Adventure, Indie, RPG, Early Access",0,"{'Early Access': 448, 'RPG': 407, 'Text-Based'...",0.67,58.0,100000,200000


## Languages

In [26]:
def filter_string(input_string):
    input_string = input_string.replace("'","")
    input_string = input_string.replace(" '","")
    return input_string.split(",")

filter_string("")

['']

In [27]:
# clean this after test-train split
df2['languages']

0       English, French, German, Italian, Spanish - Sp...
1       English, French, German, Italian, Spanish - Sp...
2       English, French, German, Italian, Spanish - Spain
3       English, French, German, Italian, Spanish - Sp...
4                         English, French, German, Korean
                              ...                        
5995    English, Simplified Chinese, Russian, Japanese...
5996                                              English
5997                                              English
5998                                              English
5999                                              English
Name: languages, Length: 5993, dtype: object

In [28]:
df2['languages'] = df2['languages'].str.replace(' ', '')

In [29]:
all_lang = []
list1 = list(df2['languages'])

all_lang = df2['languages'].apply(lambda x: x.split(',')).apply(lambda y:{ values:1 for values in y}).values

In [30]:
all_lang

array([{'English': 1, 'French': 1, 'German': 1, 'Italian': 1, 'Spanish-Spain': 1, 'SimplifiedChinese': 1, 'TraditionalChinese': 1, 'Korean': 1},
       {'English': 1, 'French': 1, 'German': 1, 'Italian': 1, 'Spanish-Spain': 1, 'Korean': 1, 'Russian': 1, 'SimplifiedChinese': 1, 'TraditionalChinese': 1},
       {'English': 1, 'French': 1, 'German': 1, 'Italian': 1, 'Spanish-Spain': 1},
       ..., {'English': 1}, {'English': 1}, {'English': 1}], dtype=object)

In [31]:
lang_fill = pd.DataFrame(list(all_lang)).fillna(0)
lang_fill.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5993 entries, 0 to 5992
Data columns (total 33 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   English               5993 non-null   float64
 1   French                5993 non-null   float64
 2   German                5993 non-null   float64
 3   Italian               5993 non-null   float64
 4   Spanish-Spain         5993 non-null   float64
 5   SimplifiedChinese     5993 non-null   float64
 6   TraditionalChinese    5993 non-null   float64
 7   Korean                5993 non-null   float64
 8   Russian               5993 non-null   float64
 9   Dutch                 5993 non-null   float64
 10  Danish                5993 non-null   float64
 11  Finnish               5993 non-null   float64
 12  Japanese              5993 non-null   float64
 13  Norwegian             5993 non-null   float64
 14  Polish                5993 non-null   float64
 15  Portuguese           

In [32]:
lang_fill.columns

Index(['English', 'French', 'German', 'Italian', 'Spanish-Spain',
       'SimplifiedChinese', 'TraditionalChinese', 'Korean', 'Russian', 'Dutch',
       'Danish', 'Finnish', 'Japanese', 'Norwegian', 'Polish', 'Portuguese',
       'Swedish', 'Thai', 'Turkish', 'Czech', 'Hungarian', 'Portuguese-Brazil',
       'Greek', 'Bulgarian', 'Romanian', 'Ukrainian', 'Spanish-LatinAmerica',
       'Vietnamese', 'Slovakian', 'Arabic', 'Spanish', 'Unknown',
       'Notsupported'],
      dtype='object')

In [33]:
game_lang = lang_fill
game_lang[['appid', 'name']] = df2[['appid', 'name']]

In [34]:
game_lang

Unnamed: 0,English,French,German,Italian,Spanish-Spain,SimplifiedChinese,TraditionalChinese,Korean,Russian,Dutch,...,Ukrainian,Spanish-LatinAmerica,Vietnamese,Slovakian,Arabic,Spanish,Unknown,Notsupported,appid,name
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,Counter-Strike
1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,Team Fortress Classic
2,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30.0,Day of Defeat
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,Deathmatch Classic
4,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,Half-Life: Opposing Force
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5988,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1692850.0,Historic Fighters
5989,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1693390.0,XOMBEE MPFPS
5990,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1696810.0,RAILROADS Online!
5991,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1728080.0,Roseblight


In [35]:
first_column = game_lang.pop('appid')
second_column = game_lang.pop('name')
game_lang.insert(0, 'name', first_column)
game_lang.insert(1,'appid', second_column)

In [36]:
game_lang

Unnamed: 0,name,appid,English,French,German,Italian,Spanish-Spain,SimplifiedChinese,TraditionalChinese,Korean,...,Bulgarian,Romanian,Ukrainian,Spanish-LatinAmerica,Vietnamese,Slovakian,Arabic,Spanish,Unknown,Notsupported
0,10.0,Counter-Strike,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,20.0,Team Fortress Classic,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,30.0,Day of Defeat,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,40.0,Deathmatch Classic,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,50.0,Half-Life: Opposing Force,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5988,1692850.0,Historic Fighters,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5989,1693390.0,XOMBEE MPFPS,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5990,1696810.0,RAILROADS Online!,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5991,1728080.0,Roseblight,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Genre

In [37]:
# Multi label this similar to languages
df2['genre'].value_counts()

Action                                                                                   444
Action, Indie                                                                            272
Action, Adventure                                                                        235
Adventure, Indie                                                                         234
Action, Adventure, Indie                                                                 228
                                                                                        ... 
Action, Adventure, Casual, Free to Play, Indie, Simulation, Early Access                   1
Action, Casual, Free to Play, Indie, Massively Multiplayer, RPG, Simulation, Strategy      1
Action, Indie, Simulation, Sports, Strategy                                                1
Racing, Simulation, Early Access                                                           1
Casual, Indie, Simulation, Sports, Early Access                       

In [38]:
steam_genres = df2[['appid','genre']]
steam_genres

Unnamed: 0,appid,genre
0,10,Action
1,20,Action
2,30,Action
3,40,Action
4,50,Action
...,...,...
5995,1810240,"Action, Adventure"
5996,1842410,Indie
5997,1873960,"Casual, Indie, RPG, Strategy"
5998,1889620,"Adventure, Indie, RPG, Early Access"


In [39]:
test_genre = steam_genres.copy()
test_genre

Unnamed: 0,appid,genre
0,10,Action
1,20,Action
2,30,Action
3,40,Action
4,50,Action
...,...,...
5995,1810240,"Action, Adventure"
5996,1842410,Indie
5997,1873960,"Casual, Indie, RPG, Strategy"
5998,1889620,"Adventure, Indie, RPG, Early Access"


In [40]:
def fill_col(df):
    df = df.str.replace(' ', '')
    col_dict = df.apply(lambda x: x.split(',')).apply(lambda y:{values: 1 for values in y}).values
    col_fill = pd.DataFrame(list(col_dict)).fillna(0)
    return col_fill

In [41]:
genres = fill_col(steam_genres['genre'])
genres[['appid', 'name']] = df2[['appid', 'name']]

In [42]:
genres

Unnamed: 0,Action,FreetoPlay,Strategy,Adventure,Animation&Modeling,VideoProduction,RPG,Indie,Simulation,MassivelyMultiplayer,...,Violent,Education,GameDevelopment,Movie,Nudity,Gore,SexualContent,Accounting,appid,name
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,Counter-Strike
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.0,Team Fortress Classic
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,30.0,Day of Defeat
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0,Deathmatch Classic
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0,Half-Life: Opposing Force
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5988,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1692850.0,Historic Fighters
5989,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1693390.0,XOMBEE MPFPS
5990,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1696810.0,RAILROADS Online!
5991,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1728080.0,Roseblight


In [43]:
def arrange_col(df, col1, col2):
    first_column = df.pop(col1)
    second_column = df.pop(col2)
    df.insert(0, col1, first_column)
    df.insert(1, col2, second_column)
    return df
    

In [44]:
arrange_col(genres, 'appid', 'name')

Unnamed: 0,appid,name,Action,FreetoPlay,Strategy,Adventure,Animation&Modeling,VideoProduction,RPG,Indie,...,SoftwareTraining,WebPublishing,Violent,Education,GameDevelopment,Movie,Nudity,Gore,SexualContent,Accounting
0,10.0,Counter-Strike,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,20.0,Team Fortress Classic,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,30.0,Day of Defeat,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,40.0,Deathmatch Classic,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,50.0,Half-Life: Opposing Force,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5988,1692850.0,Historic Fighters,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5989,1693390.0,XOMBEE MPFPS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5990,1696810.0,RAILROADS Online!,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5991,1728080.0,Roseblight,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [45]:
for value in genres.columns:
    print(value)
    print(genres[value].value_counts(ascending=False))
    print('========================================')

appid
581630.0     1
258180.0     1
325180.0     1
410980.0     1
1230570.0    1
            ..
1172450.0    1
71340.0      1
242880.0     1
808290.0     1
10.0         1
Name: appid, Length: 5986, dtype: int64
name
F.E.A.R.                                       3
Tropico Reloaded                               2
Fallout: New Vegas                             2
PICO PARK                                      2
Ironsight                                      2
                                              ..
Jets'n'Guns Gold                               1
8-Bit Commando                                 1
King's Bounty II                               1
Prince of Persia: The Two Thrones              1
The Elder Scrolls V: Skyrim Special Edition    1
Name: name, Length: 5977, dtype: int64
Action
0.0    3052
1.0    2941
Name: Action, dtype: int64
FreetoPlay
0.0    4947
1.0    1046
Name: FreetoPlay, dtype: int64
Strategy
0.0    4510
1.0    1483
Name: Strategy, dtype: int64
Adventure
0.0    376

In [46]:
genres

Unnamed: 0,appid,name,Action,FreetoPlay,Strategy,Adventure,Animation&Modeling,VideoProduction,RPG,Indie,...,SoftwareTraining,WebPublishing,Violent,Education,GameDevelopment,Movie,Nudity,Gore,SexualContent,Accounting
0,10.0,Counter-Strike,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,20.0,Team Fortress Classic,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,30.0,Day of Defeat,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,40.0,Deathmatch Classic,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,50.0,Half-Life: Opposing Force,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5988,1692850.0,Historic Fighters,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5989,1693390.0,XOMBEE MPFPS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5990,1696810.0,RAILROADS Online!,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5991,1728080.0,Roseblight,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Load Steam App Data

In [47]:
df1 = pd.read_csv('test/steam_app_data_full.csv')
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6000 entries, 0 to 5999
Data columns (total 39 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   type                     5992 non-null   object 
 1   name                     6000 non-null   object 
 2   steam_appid              6000 non-null   int64  
 3   required_age             5992 non-null   object 
 4   is_free                  5992 non-null   object 
 5   controller_support       1821 non-null   object 
 6   dlc                      2447 non-null   object 
 7   detailed_description     5989 non-null   object 
 8   about_the_game           5989 non-null   object 
 9   short_description        5989 non-null   object 
 10  fullgame                 0 non-null      float64
 11  supported_languages      5989 non-null   object 
 12  header_image             5992 non-null   object 
 13  website                  4703 non-null   object 
 14  pc_requirements         

In [48]:
df1.isnull().sum()

type                          8
name                          0
steam_appid                   0
required_age                  8
is_free                       8
controller_support         4179
dlc                        3553
detailed_description         11
about_the_game               11
short_description            11
fullgame                   6000
supported_languages          11
header_image                  8
website                    1297
pc_requirements               8
mac_requirements              8
linux_requirements            8
legal_notice               2759
drm_notice                 5846
ext_user_account_notice    5702
developers                   34
publishers                    8
demos                      5421
price_overview             1479
packages                   1301
package_groups                8
platforms                     8
metacritic                 3845
reviews                    4000
categories                   47
genres                       23
screensh

## Drop Unnecessary Columns

In [49]:
df1.columns

Index(['type', 'name', 'steam_appid', 'required_age', 'is_free',
       'controller_support', 'dlc', 'detailed_description', 'about_the_game',
       'short_description', 'fullgame', 'supported_languages', 'header_image',
       'website', 'pc_requirements', 'mac_requirements', 'linux_requirements',
       'legal_notice', 'drm_notice', 'ext_user_account_notice', 'developers',
       'publishers', 'demos', 'price_overview', 'packages', 'package_groups',
       'platforms', 'metacritic', 'reviews', 'categories', 'genres',
       'screenshots', 'movies', 'recommendations', 'achievements',
       'release_date', 'support_info', 'background', 'content_descriptors'],
      dtype='object')

In [50]:
cols_to_drop = ['supported_languages', 'header_image',
               'website', 'drm_notice', 'ext_user_account_notice', 'developers',
               'publishers', 'demos', 'screenshots', 'movies', 'background', 'content_descriptors', 'support_info',
               'genres', 'packages', 'dlc', 'fullgame', 'legal_notice', 'package_groups', 'controller_support']

df1.drop(columns = cols_to_drop, axis = 0, inplace = True)
df1

Unnamed: 0,type,name,steam_appid,required_age,is_free,detailed_description,about_the_game,short_description,pc_requirements,mac_requirements,linux_requirements,price_overview,platforms,metacritic,reviews,categories,recommendations,achievements,release_date
0,game,Counter-Strike,10,0,False,Disfruta del juego de acción en línea n° 1 en ...,Disfruta del juego de acción en línea n° 1 en ...,Disfruta del juego de acción en línea n° 1 en ...,{'minimum': '<p><strong>Mínimo:</strong> proce...,{'minimum': 'Minimum: OS X Snow Leopard 10.6....,"{'minimum': 'Minimum: Linux Ubuntu 12.04, Dual...","{'currency': 'EUR', 'initial': 819, 'final': 8...","{'windows': True, 'mac': True, 'linux': True}","{'score': 88, 'url': 'https://www.metacritic.c...",,"[{'id': 1, 'description': 'Multijugador'}, {'i...",{'total': 119660},,"{'coming_soon': False, 'date': '1 NOV 2000'}"
1,game,Team Fortress Classic,20,0,False,One of the most popular online action games of...,One of the most popular online action games of...,One of the most popular online action games of...,{'minimum': '\r\n\t\t\t<p><strong>Minimum:</st...,{'minimum': 'Minimum: OS X Snow Leopard 10.6....,"{'minimum': 'Minimum: Linux Ubuntu 12.04, Dual...","{'currency': 'USD', 'initial': 499, 'final': 4...","{'windows': True, 'mac': True, 'linux': True}",,,"[{'id': 1, 'description': 'Multi-player'}, {'i...",{'total': 4544},,"{'coming_soon': False, 'date': 'Apr 1, 1999'}"
2,game,Day of Defeat,30,0,False,Enlist in an intense brand of Axis vs. Allied ...,Enlist in an intense brand of Axis vs. Allied ...,Enlist in an intense brand of Axis vs. Allied ...,{'minimum': '\r\n\t\t\t<p><strong>Minimum:</st...,{'minimum': 'Minimum: OS X Snow Leopard 10.6....,"{'minimum': 'Minimum: Linux Ubuntu 12.04, Dual...","{'currency': 'EUR', 'initial': 399, 'final': 3...","{'windows': True, 'mac': True, 'linux': True}","{'score': 79, 'url': 'https://www.metacritic.c...",,"[{'id': 1, 'description': 'Multi-player'}, {'i...",{'total': 3159},,"{'coming_soon': False, 'date': '1 May, 2003'}"
3,game,Deathmatch Classic,40,0,False,Enjoy fast-paced multiplayer gaming with Death...,Enjoy fast-paced multiplayer gaming with Death...,Enjoy fast-paced multiplayer gaming with Death...,{'minimum': '\r\n\t\t\t<p><strong>Minimum:</st...,{'minimum': 'Minimum: OS X Snow Leopard 10.6....,"{'minimum': 'Minimum: Linux Ubuntu 12.04, Dual...","{'currency': 'USD', 'initial': 499, 'final': 4...","{'windows': True, 'mac': True, 'linux': True}",,,"[{'id': 1, 'description': 'Multi-player'}, {'i...",{'total': 1517},,"{'coming_soon': False, 'date': 'Jun 1, 2001'}"
4,game,Half-Life: Opposing Force,50,0,False,Return to the Black Mesa Research Facility as ...,Return to the Black Mesa Research Facility as ...,Return to the Black Mesa Research Facility as ...,{'minimum': '\r\n\t\t\t<p><strong>Minimum:</st...,{'minimum': 'Minimum: OS X Snow Leopard 10.6....,"{'minimum': 'Minimum: Linux Ubuntu 12.04, Dual...","{'currency': 'EUR', 'initial': 399, 'final': 3...","{'windows': True, 'mac': True, 'linux': True}",,,"[{'id': 2, 'description': 'Single-player'}, {'...",{'total': 11611},,"{'coming_soon': False, 'date': '1 Nov, 1999'}"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,game,Do Something,1810240,0,True,You buy a game that you can pass to the end. F...,You buy a game that you can pass to the end. F...,"After a few hours of epidemic, you go straight...",{'minimum': '<strong>Minimum:</strong><br><ul ...,{'minimum': '<strong>Minimum:</strong><br><ul ...,{'minimum': '<strong>Minimum:</strong><br><ul ...,,"{'windows': True, 'mac': False, 'linux': False}",,,"[{'id': 2, 'description': 'Single-player'}, {'...",,,"{'coming_soon': False, 'date': 'Dec 15, 2021'}"
5996,game,Deadly Racing Duel,1842410,0,False,Test your driving skills in this unusual racin...,Test your driving skills in this unusual racin...,Deadly Racing Duel - in the role of a racer wh...,{'minimum': '<strong>Minimum:</strong><br><ul ...,[],[],"{'currency': 'USD', 'initial': 1099, 'final': ...","{'windows': True, 'mac': False, 'linux': False}",,,"[{'id': 2, 'description': 'Single-player'}]",,,"{'coming_soon': False, 'date': 'Jan 31, 2022'}"
5997,game,Dungeon Crawler,1873960,0,False,"<img src=""https://cdn.akamai.steamstatic.com/s...","<img src=""https://cdn.akamai.steamstatic.com/s...","Dungeon Crawler is a round-based, third-person...",{'minimum': '<strong>Minimum:</strong><br><ul ...,{'minimum': '<strong>Minimum:</strong><br><ul ...,{'minimum': '<strong>Minimum:</strong><br><ul ...,"{'currency': 'USD', 'initial': 599, 'final': 5...","{'windows': True, 'mac': False, 'linux': False}",,,"[{'id': 2, 'description': 'Single-player'}]",,"{'total': 87, 'highlighted': [{'name': 'Zombie...","{'coming_soon': False, 'date': 'Feb 25, 2022'}"
5998,game,AI Roguelite,1889620,0,False,<strong>ATTENTION: This game requires an NVIDI...,<strong>ATTENTION: This game requires an NVIDI...,"Infinite text-based RPG, powered by cutting-ed...",{'minimum': '<strong>Minimum:</strong><br><ul ...,[],[],"{'currency': 'USD', 'initial': 499, 'final': 4...","{'windows': True, 'mac': False, 'linux': False}",,,"[{'id': 2, 'description': 'Single-player'}]",,,"{'coming_soon': False, 'date': 'Mar 2, 2022'}"


## Merge Two Datasets Together

In [51]:
df1.rename(columns = {'steam_appid':'appid'}, inplace = True)

In [52]:
df = df1.merge(df2, on=['appid', 'name'])

In [53]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5576 entries, 0 to 5575
Data columns (total 38 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   type                  5568 non-null   object 
 1   name                  5576 non-null   object 
 2   appid                 5576 non-null   int64  
 3   required_age          5568 non-null   object 
 4   is_free               5568 non-null   object 
 5   detailed_description  5565 non-null   object 
 6   about_the_game        5565 non-null   object 
 7   short_description     5565 non-null   object 
 8   pc_requirements       5568 non-null   object 
 9   mac_requirements      5568 non-null   object 
 10  linux_requirements    5568 non-null   object 
 11  price_overview        4178 non-null   object 
 12  platforms             5568 non-null   object 
 13  metacritic            1922 non-null   object 
 14  reviews               1917 non-null   object 
 15  categories           

In [54]:
df.isnull().sum()

type                       8
name                       0
appid                      0
required_age               8
is_free                    8
detailed_description      11
about_the_game            11
short_description         11
pc_requirements            8
mac_requirements           8
linux_requirements         8
price_overview          1398
platforms                  8
metacritic              3654
reviews                 3659
categories                44
recommendations         1427
achievements            1721
release_date               8
developer                  0
publisher                  0
positive                   0
negative                   0
average_forever            0
average_2weeks             0
median_forever             0
median_2weeks              0
price                      0
initialprice               0
discount                   0
languages                  0
genre                      0
ccu                        0
tags                       0
review_score  

## Type

In [55]:
# drop hardware
df['type'].value_counts()

game        5567
hardware       1
Name: type, dtype: int64

In [56]:
df = df.loc[df['type'] != 'hardware']
df['type'].value_counts()


game    5567
Name: type, dtype: int64

In [57]:
df['type'].isnull().sum()

8

In [58]:
df.dropna(subset = ['type'], inplace = True)

In [59]:
df['type'].value_counts()

game    5567
Name: type, dtype: int64

## Release Date

In [60]:
df[['name','release_date']]

Unnamed: 0,name,release_date
0,Counter-Strike,"{'coming_soon': False, 'date': '1 NOV 2000'}"
1,Team Fortress Classic,"{'coming_soon': False, 'date': 'Apr 1, 1999'}"
2,Day of Defeat,"{'coming_soon': False, 'date': '1 May, 2003'}"
3,Deathmatch Classic,"{'coming_soon': False, 'date': 'Jun 1, 2001'}"
4,Half-Life: Opposing Force,"{'coming_soon': False, 'date': '1 Nov, 1999'}"
...,...,...
5571,Do Something,"{'coming_soon': False, 'date': 'Dec 15, 2021'}"
5572,Deadly Racing Duel,"{'coming_soon': False, 'date': 'Jan 31, 2022'}"
5573,Dungeon Crawler,"{'coming_soon': False, 'date': 'Feb 25, 2022'}"
5574,AI Roguelite,"{'coming_soon': False, 'date': 'Mar 2, 2022'}"


In [61]:
release_date = df[['appid', 'name', 'release_date']]
release_date

Unnamed: 0,appid,name,release_date
0,10,Counter-Strike,"{'coming_soon': False, 'date': '1 NOV 2000'}"
1,20,Team Fortress Classic,"{'coming_soon': False, 'date': 'Apr 1, 1999'}"
2,30,Day of Defeat,"{'coming_soon': False, 'date': '1 May, 2003'}"
3,40,Deathmatch Classic,"{'coming_soon': False, 'date': 'Jun 1, 2001'}"
4,50,Half-Life: Opposing Force,"{'coming_soon': False, 'date': '1 Nov, 1999'}"
...,...,...,...
5571,1810240,Do Something,"{'coming_soon': False, 'date': 'Dec 15, 2021'}"
5572,1842410,Deadly Racing Duel,"{'coming_soon': False, 'date': 'Jan 31, 2022'}"
5573,1873960,Dungeon Crawler,"{'coming_soon': False, 'date': 'Feb 25, 2022'}"
5574,1889620,AI Roguelite,"{'coming_soon': False, 'date': 'Mar 2, 2022'}"


In [62]:
# release_date['release_date'] = release_date['release_date'].str.replace('{', '')
# release_date['release_date'] = release_date['release_date'].str.replace('}', '')
# release_date['release_date'] = release_date['release_date'].str.replace("''", '')
# release_date['release_date'] = release_date['release_date'].str.split(':', expand = True)[2]

In [63]:
# doing a bit a research these are re-releases games or they're dead so we will drop them
# release_date[release_date['release_date'] == ' ']

In [64]:
# pd.set_option("Display.max_rows", None)
# release_date['release_date'].value_counts()

In [65]:
def process_release_date(df):
    
    df.copy()

    df['release_date'] = df['release_date'].str.replace('{', '')
    df['release_date'] = df['release_date'].str.replace('}', '')
    df['release_date'] = df['release_date'].str.replace("''", '')
    df['release_date'] = df['release_date'].str.split(':', expand = True)[2]
    
    df['release_date'] = df['release_date'].replace(' ', np.nan)
    
    df['release_date'] = pd.to_datetime(df['release_date'], yearfirst = True, errors = 'coerce')
    
    df.dropna(axis = 0, subset = ['release_date'], inplace = True)

    return df

In [66]:
test = process_release_date(release_date)

In [67]:
test.isnull().sum()

appid           0
name            0
release_date    0
dtype: int64

## Platforms

In [68]:
df['platforms']

0         {'windows': True, 'mac': True, 'linux': True}
1         {'windows': True, 'mac': True, 'linux': True}
2         {'windows': True, 'mac': True, 'linux': True}
3         {'windows': True, 'mac': True, 'linux': True}
4         {'windows': True, 'mac': True, 'linux': True}
                             ...                       
5571    {'windows': True, 'mac': False, 'linux': False}
5572    {'windows': True, 'mac': False, 'linux': False}
5573    {'windows': True, 'mac': False, 'linux': False}
5574    {'windows': True, 'mac': False, 'linux': False}
5575    {'windows': True, 'mac': False, 'linux': False}
Name: platforms, Length: 5567, dtype: object

In [69]:
platforms = df[['appid', 'platforms']]

In [70]:
platforms.value_counts()

appid   platforms                                      
21090   {'windows': True, 'mac': False, 'linux': False}    3
322680  {'windows': True, 'mac': True, 'linux': True}      2
1300    {'windows': True, 'mac': False, 'linux': False}    2
394970  {'windows': True, 'mac': True, 'linux': True}      2
282400  {'windows': True, 'mac': False, 'linux': False}    2
                                                          ..
616030  {'windows': True, 'mac': False, 'linux': False}    1
616090  {'windows': True, 'mac': False, 'linux': False}    1
616220  {'windows': True, 'mac': False, 'linux': False}    1
616560  {'windows': True, 'mac': False, 'linux': False}    1
10      {'windows': True, 'mac': True, 'linux': True}      1
Length: 5556, dtype: int64

In [71]:
platforms.isnull().sum()

appid        0
platforms    0
dtype: int64

In [72]:
# look at the value of each platform row that is true for windows and mac
platforms_dict = {'windows': True, 'mac': True, 'linux': False}

# turn into new list based on the values of an existing list (list comprehension)
print([x for x in platforms_dict.keys() if platforms_dict[x]])

# create lsit of keys, join on ';'
';'.join(x for x in platforms_dict.keys() if platforms_dict[x])

['windows', 'mac']


'windows;mac'

In [73]:
def process_platforms(df):
    df = df.copy()
    
    def parse_platforms(x):
        
        d = literal_eval(x)
        
        return ';'.join(platforms_dict for platforms_dict in d.keys() if d[platforms_dict])
    
    df['platforms'] = df['platforms'].apply(parse_platforms)
    
    return df

In [74]:
platforms_df = process_platforms(platforms)

In [75]:
platforms_df['platforms'].value_counts()

windows              3334
windows;mac;linux    1325
windows;mac           760
windows;linux         148
Name: platforms, dtype: int64

In [76]:
platforms_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5567 entries, 0 to 5575
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   appid      5567 non-null   int64 
 1   platforms  5567 non-null   object
dtypes: int64(1), object(1)
memory usage: 130.5+ KB


In [77]:
process_platforms(platforms)

Unnamed: 0,appid,platforms
0,10,windows;mac;linux
1,20,windows;mac;linux
2,30,windows;mac;linux
3,40,windows;mac;linux
4,50,windows;mac;linux
...,...,...
5571,1810240,windows
5572,1842410,windows
5573,1873960,windows
5574,1889620,windows


In [78]:
# Multi-index (OHE) languages, platforms, genres

## Price

In [79]:
df['is_free'].value_counts()

False    4489
True     1078
Name: is_free, dtype: int64

In [80]:
df['price']

0        999
1        499
2        499
3        499
4        499
        ... 
5571       0
5572    1099
5573     599
5574     499
5575    1099
Name: price, Length: 5567, dtype: int64

In [81]:
def process_price(df):
    
    df = df.copy()
    df['price'] = df.loc[df['price'] > 0, 'price'] / 100
    df['price'] = df['price'].fillna(0)
#     for value in df['price']:
#         if value == 0:
#             df['is_free'] = True
#         else:
#             df['is_free'] = False
    
    return df 

In [82]:
df['price_overview']

0       {'currency': 'EUR', 'initial': 819, 'final': 8...
1       {'currency': 'USD', 'initial': 499, 'final': 4...
2       {'currency': 'EUR', 'initial': 399, 'final': 3...
3       {'currency': 'USD', 'initial': 499, 'final': 4...
4       {'currency': 'EUR', 'initial': 399, 'final': 3...
                              ...                        
5571                                                  NaN
5572    {'currency': 'USD', 'initial': 1099, 'final': ...
5573    {'currency': 'USD', 'initial': 599, 'final': 5...
5574    {'currency': 'USD', 'initial': 499, 'final': 4...
5575    {'currency': 'USD', 'initial': 1099, 'final': ...
Name: price_overview, Length: 5567, dtype: object

In [83]:
price_overview = df[['appid','name', 'price', 'is_free', 'initialprice','discount', 'price_overview']]

In [84]:
first_column = price_overview.pop('appid')
price_overview.insert(0, 'appid', first_column)

In [85]:
price_overview

Unnamed: 0,appid,name,price,is_free,initialprice,discount,price_overview
0,10,Counter-Strike,999,False,999,0,"{'currency': 'EUR', 'initial': 819, 'final': 8..."
1,20,Team Fortress Classic,499,False,499,0,"{'currency': 'USD', 'initial': 499, 'final': 4..."
2,30,Day of Defeat,499,False,499,0,"{'currency': 'EUR', 'initial': 399, 'final': 3..."
3,40,Deathmatch Classic,499,False,499,0,"{'currency': 'USD', 'initial': 499, 'final': 4..."
4,50,Half-Life: Opposing Force,499,False,499,0,"{'currency': 'EUR', 'initial': 399, 'final': 3..."
...,...,...,...,...,...,...,...
5571,1810240,Do Something,0,True,0,0,
5572,1842410,Deadly Racing Duel,1099,False,1099,0,"{'currency': 'USD', 'initial': 1099, 'final': ..."
5573,1873960,Dungeon Crawler,599,False,599,0,"{'currency': 'USD', 'initial': 599, 'final': 5..."
5574,1889620,AI Roguelite,499,False,499,0,"{'currency': 'USD', 'initial': 499, 'final': 4..."


In [86]:
def process_priceoverview(df):
    df = df.copy()
        
    def parse_price(x):
        if x is not np.nan:
            return literal_eval(x)
        else:
            return {'currency': 'USD', 'initial': -1, 'final': -1, 'discount_percent': -1}
    
    # evaluate as dictionary and set to -1 if missing
    df['price_overview'] = df['price_overview'].apply(parse_price)
    
    # Create columns from currency and initial values
    df['currency'] = df['price_overview'].apply(lambda x: x['currency'])
    df['initial_price'] = df['price_overview'].apply(lambda x: x['initial'])
    df['final_price'] = df['price_overview'].apply(lambda x: x['final'])
    df['discount_percent'] = df['price_overview'].apply(lambda x: x['discount_percent'])
    
    
    # Set price of free games to 0
    df.loc[df['is_free'], 'intial_price'] = 0
    
    return df

price_data = process_priceoverview(price_overview)[['appid','name', 'currency', 'initial_price', 
                                            'final_price', 'discount_percent','price', 'discount', 'is_free']]
price_data.head()

Unnamed: 0,appid,name,currency,initial_price,final_price,discount_percent,price,discount,is_free
0,10,Counter-Strike,EUR,819,819,0,999,0,False
1,20,Team Fortress Classic,USD,499,499,0,499,0,False
2,30,Day of Defeat,EUR,399,399,0,499,0,False
3,40,Deathmatch Classic,USD,499,499,0,499,0,False
4,50,Half-Life: Opposing Force,EUR,399,399,0,499,0,False


## Categories

In [87]:
df['categories'][1]

"[{'id': 1, 'description': 'Multi-player'}, {'id': 49, 'description': 'PvP'}, {'id': 36, 'description': 'Online PvP'}, {'id': 37, 'description': 'Shared/Split Screen PvP'}, {'id': 8, 'description': 'Valve Anti-Cheat enabled'}, {'id': 44, 'description': 'Remote Play Together'}]"

In [88]:
categories = df['categories']

In [89]:
categories

0       [{'id': 1, 'description': 'Multijugador'}, {'i...
1       [{'id': 1, 'description': 'Multi-player'}, {'i...
2       [{'id': 1, 'description': 'Multi-player'}, {'i...
3       [{'id': 1, 'description': 'Multi-player'}, {'i...
4       [{'id': 2, 'description': 'Single-player'}, {'...
                              ...                        
5571    [{'id': 2, 'description': 'Single-player'}, {'...
5572          [{'id': 2, 'description': 'Single-player'}]
5573          [{'id': 2, 'description': 'Single-player'}]
5574          [{'id': 2, 'description': 'Single-player'}]
5575          [{'id': 2, 'description': 'Single-player'}]
Name: categories, Length: 5567, dtype: object

In [90]:
categories = categories.str.replace('{', '')
categories = categories.str.replace('}', '')
categories = categories.str.replace(']', '')
categories = categories.str.replace('[', '')
categories = categories.str.replace("''", '')
categories = categories.str.split(',', expand = True)

In [91]:
categories

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,34,35,36,37,38,39,40,41,42,43
0,'id': 1,'description': 'Multijugador','id': 49,'description': 'JcJ','id': 36,'description': 'JcJ en línea','id': 37,'description': 'JcJ a pantalla (com)partida','id': 8,'description': 'Con sist. antitrampas de Valve',...,,,,,,,,,,
1,'id': 1,'description': 'Multi-player','id': 49,'description': 'PvP','id': 36,'description': 'Online PvP','id': 37,'description': 'Shared/Split Screen PvP','id': 8,'description': 'Valve Anti-Cheat enabled',...,,,,,,,,,,
2,'id': 1,'description': 'Multi-player','id': 8,'description': 'Valve Anti-Cheat enabled',,,,,,,...,,,,,,,,,,
3,'id': 1,'description': 'Multi-player','id': 49,'description': 'PvP','id': 36,'description': 'Online PvP','id': 37,'description': 'Shared/Split Screen PvP','id': 8,'description': 'Valve Anti-Cheat enabled',...,,,,,,,,,,
4,'id': 2,'description': 'Single-player','id': 1,'description': 'Multi-player','id': 8,'description': 'Valve Anti-Cheat enabled','id': 44,'description': 'Remote Play Together',,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5571,'id': 2,'description': 'Single-player','id': 28,'description': 'Full controller support',,,,,,,...,,,,,,,,,,
5572,'id': 2,'description': 'Single-player',,,,,,,,,...,,,,,,,,,,
5573,'id': 2,'description': 'Single-player',,,,,,,,,...,,,,,,,,,,
5574,'id': 2,'description': 'Single-player',,,,,,,,,...,,,,,,,,,,


In [92]:
category = df[['appid','categories']]


In [93]:
df[df['categories'].isnull()]

Unnamed: 0,type,name,appid,required_age,is_free,detailed_description,about_the_game,short_description,pc_requirements,mac_requirements,...,initialprice,discount,languages,genre,ccu,tags,review_score,rating,owners_min,owners_max
477,game,Blender,365670,0,True,Blender is a free and open source 3D creation ...,Blender is a free and open source 3D creation ...,Blender is the free and open source 3D creatio...,{'minimum': '<strong>Minimum:</strong><br><ul ...,{'minimum': '<strong>Minimum:</strong><br><ul ...,...,0,0,"English,French,Italian,German,Spanish-Spain,Ja...","Animation & Modeling, Design & Illustration, V...",6008,"{'Animation & Modeling': 503, 'Design & Illust...",0.97,95.0,2000000,5000000
656,game,Soundpad,629520,0,False,Play sounds in voice chats in high digital qua...,Play sounds in voice chats in high digital qua...,Play sounds in voice chats in high digital qua...,{'minimum': '<strong>Minimum:</strong><br><ul ...,[],...,499,0,"English,German,French,Korean,Portuguese-Brazil...","Audio Production, Utilities",14148,"{'Utilities': 82, 'Audio Production': 80, 'Mus...",0.97,95.0,1000000,2000000
948,game,Might & Magic: Heroes VI,48220,0,False,<h1>Complete Edition</h1><p>The Complete Editi...,"The adventure in Heroes VI, starting 400 years...",Heroes VI is a fast-paced epic story where Ang...,"{'minimum': '<ul class=""bb_ul""><li><strong>OS:...",[],...,1999,0,"English,Czech,French,German,Hungarian,Italian,...","RPG, Strategy",139,"{'Strategy': 257, 'Turn-Based Strategy': 170, ...",0.49,49.0,500000,1000000
1308,game,ShareX,400040,0,True,"ShareX is a free and <a href=""https://steamcom...","ShareX is a free and <a href=""https://steamcom...",ShareX is a free and open source program that ...,{'minimum': '<strong>Minimum:</strong><br><ul ...,[],...,0,0,"English,Turkish,French,German,Spanish-Spain,Du...","Audio Production, Design & Illustration, Photo...",1385,"{'Photo Editing': 92, 'Utilities': 86, 'Video ...",0.96,92.0,500000,1000000
1353,game,Aseprite,431730,0,False,Aseprite is a pixel art tool that lets you cre...,Aseprite is a pixel art tool that lets you cre...,Aseprite is a pixel-art tool to create 2D anim...,{'minimum': '<strong>Minimum:</strong><br><ul ...,{'minimum': '<strong>Minimum:</strong><br><ul ...,...,1999,0,English,"Animation & Modeling, Design & Illustration, G...",2894,"{'Pixel Graphics': 124, 'Design & Illustration...",0.99,96.0,500000,1000000
1657,game,OVR Advanced Settings,1009850,0,True,<h1>Overview</h1><p>Do all this without leavin...,OVR Advanced Settings is a desktop overlay app...,"A Dashboard Application for VR, that allows yo...",{'minimum': '<strong>Minimum:</strong><br><ul ...,{'minimum': '<strong>Minimum:</strong><br><ul ...,...,0,0,English,Utilities,1553,"{'VR': 77, 'Utilities': 67, 'Free to Play': 45...",0.96,91.0,500000,1000000
1989,game,Fuse,257400,0,True,<strong>Fuse</strong> is a standalone 3D chara...,<strong>Fuse</strong> is a standalone 3D chara...,Fuse is a standalone 3D character creator that...,{'minimum': '<strong>Minimum:</strong><br><ul ...,{'minimum': '<strong>Minimum:</strong><br><ul ...,...,0,0,English,"Animation & Modeling, Design & Illustration, U...",23,"{'Animation & Modeling': 159, 'Character Custo...",0.64,63.0,200000,500000
2342,game,Driver Booster 4 for Steam,545110,0,True,"Driver Booster 4 for STEAM, as a powerful and ...","Driver Booster 4 for STEAM, as a powerful and ...","Driver Booster 4 for STEAM, as a powerful &amp...",{'minimum': '<strong>Minimum:</strong><br><ul ...,[],...,0,0,"English,French,Italian,German,Spanish-Spain,Da...",Utilities,11,{'Utilities': 36},0.76,73.0,200000,500000
2479,game,Removed,783140,0,False,,,,[],[],...,0,0,"English,Notsupported",Unknown,0,"{'Simulation': 1105, 'Action': 35, 'Indie': 33}",0.61,57.0,200000,500000
2537,game,fpsVR,908520,0,False,<h1>Supported VR headsets</h1><p>Read about <s...,<strong>fpsVR</strong> is a utility applicatio...,IN-GAME utility for keeping track of FPS (Fram...,{'minimum': '<strong>Minimum:</strong><br><ul ...,{'minimum': '<strong>Minimum:</strong><br><ul ...,...,399,0,"English,Russian",Utilities,259,"{'Utilities': 39, 'VR': 23, 'Software': 16, 'F...",0.98,93.0,200000,500000


In [94]:
df['categories'][1]

"[{'id': 1, 'description': 'Multi-player'}, {'id': 49, 'description': 'PvP'}, {'id': 36, 'description': 'Online PvP'}, {'id': 37, 'description': 'Shared/Split Screen PvP'}, {'id': 8, 'description': 'Valve Anti-Cheat enabled'}, {'id': 44, 'description': 'Remote Play Together'}]"

In [95]:
description_category = "[{'id': 1, 'description': 'Multi-player'}, {'id': 49, 'description': 'PvP'}, {'id': 36, 'description': 'Online PvP'}, {'id': 37, 'description': 'Shared/Split Screen PvP'}, {'id': 8, 'description': 'Valve Anti-Cheat enabled'}, {'id': 44, 'description': 'Remote Play Together'}]"
[x['description'] for x in literal_eval(description_category)]

['Multi-player',
 'PvP',
 'Online PvP',
 'Shared/Split Screen PvP',
 'Valve Anti-Cheat enabled',
 'Remote Play Together']

In [96]:
category = df[['appid','name','categories']]

In [97]:
category

Unnamed: 0,appid,name,categories
0,10,Counter-Strike,"[{'id': 1, 'description': 'Multijugador'}, {'i..."
1,20,Team Fortress Classic,"[{'id': 1, 'description': 'Multi-player'}, {'i..."
2,30,Day of Defeat,"[{'id': 1, 'description': 'Multi-player'}, {'i..."
3,40,Deathmatch Classic,"[{'id': 1, 'description': 'Multi-player'}, {'i..."
4,50,Half-Life: Opposing Force,"[{'id': 2, 'description': 'Single-player'}, {'..."
...,...,...,...
5571,1810240,Do Something,"[{'id': 2, 'description': 'Single-player'}, {'..."
5572,1842410,Deadly Racing Duel,"[{'id': 2, 'description': 'Single-player'}]"
5573,1873960,Dungeon Crawler,"[{'id': 2, 'description': 'Single-player'}]"
5574,1889620,AI Roguelite,"[{'id': 2, 'description': 'Single-player'}]"


In [98]:
def process_categories(df):
    
    df = df.copy()
    
    df = df[df['categories'].notnull()]
    
    for col in ['categories']:
        df[col] = df[col].apply(lambda x: ';'.join(item['description'] for item in literal_eval(x)))
        
    return df


In [99]:
process_categories(category)

Unnamed: 0,appid,name,categories
0,10,Counter-Strike,Multijugador;JcJ;JcJ en línea;JcJ a pantalla (...
1,20,Team Fortress Classic,Multi-player;PvP;Online PvP;Shared/Split Scree...
2,30,Day of Defeat,Multi-player;Valve Anti-Cheat enabled
3,40,Deathmatch Classic,Multi-player;PvP;Online PvP;Shared/Split Scree...
4,50,Half-Life: Opposing Force,Single-player;Multi-player;Valve Anti-Cheat en...
...,...,...,...
5571,1810240,Do Something,Single-player;Full controller support
5572,1842410,Deadly Racing Duel,Single-player
5573,1873960,Dungeon Crawler,Single-player
5574,1889620,AI Roguelite,Single-player


### System Requirements

In [100]:
# requirements_cols = ['pc_requirements', 'mac_requirements', 'linux_requirements']

# def process_requirements(df):
    
#     df = df.copy()
    
#     df['pc_requirements', 'mac_requirements', 'linux_requirements']
    
#     process_requirements(df)

In [101]:
def clean_df(df):
    
    df = df.copy()
    
    df = process_categories(df)
    df = process_platforms(df)
    df = process_release_date(df)
    df = process_price(df)
    
    return df

    

In [102]:
steam_clean_data = clean_df(df)

In [103]:
steam_clean_data.isnull().sum()

type                       0
name                       0
appid                      0
required_age               0
is_free                    0
detailed_description       0
about_the_game             0
short_description          0
pc_requirements            0
mac_requirements           0
linux_requirements         0
price_overview          1344
platforms                  0
metacritic              3586
reviews                 3587
categories                 0
recommendations         1397
achievements            1662
release_date               0
developer                  0
publisher                  0
positive                   0
negative                   0
average_forever            0
average_2weeks             0
median_forever             0
median_2weeks              0
price                      0
initialprice               0
discount                   0
languages                  0
genre                      0
ccu                        0
tags                       0
review_score  

In [104]:
steam_clean_data.to_csv('data/steam_clean.csv', index=False)

In [105]:
df3 = pd.read_csv('test/games-features.csv')
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13357 entries, 0 to 13356
Data columns (total 78 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   QueryID                      13357 non-null  int64  
 1   ResponseID                   13357 non-null  int64  
 2   QueryName                    13356 non-null  object 
 3   ResponseName                 13357 non-null  object 
 4   ReleaseDate                  13357 non-null  object 
 5   RequiredAge                  13357 non-null  int64  
 6   DemoCount                    13357 non-null  int64  
 7   DeveloperCount               13357 non-null  int64  
 8   DLCCount                     13357 non-null  int64  
 9   Metacritic                   13357 non-null  int64  
 10  MovieCount                   13357 non-null  int64  
 11  PackageCount                 13357 non-null  int64  
 12  RecommendationCount          13357 non-null  int64  
 13  PublisherCount  