In [36]:
# import standard libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook display tweaks (a value of None means "no limit")
for _option_name, _option_value in (
    ('display.max_colwidth', 50),   # widest cell content shown before truncation
    ('display.max_rows', 4),        # rows rendered before the frame is elided
    ('display.max_columns', None),  # always show every column
):
    pd.set_option(_option_name, _option_value)


### Take a first look at the data: steam_games.json.gz

In [37]:
# Read the gzip-compressed, line-delimited JSON dump into a DataFrame
relative_path_steamGames = '../data/steam_games.json.gz'
steam_games = pd.read_json(
    relative_path_steamGames,
    lines=True,
    compression='gzip',
)

In [38]:
# Summarize column dtypes and non-null counts of the raw frame
steam_games.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120445 entries, 0 to 120444
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   publisher     24083 non-null  object 
 1   genres        28852 non-null  object 
 2   app_name      32133 non-null  object 
 3   title         30085 non-null  object 
 4   url           32135 non-null  object 
 5   release_date  30068 non-null  object 
 6   tags          31972 non-null  object 
 7   reviews_url   32133 non-null  object 
 8   specs         31465 non-null  object 
 9   price         30758 non-null  object 
 10  early_access  32135 non-null  float64
 11  id            32133 non-null  float64
 12  developer     28836 non-null  object 
dtypes: float64(2), object(11)
memory usage: 11.9+ MB


In [39]:
# Show rows that are completely unusable (every column is null)
all_null_mask = steam_games.isna().all(axis='columns')
steam_games[all_null_mask]

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer
0,,,,,,,,,,,,,
1,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
88308,,,,,,,,,,,,,
88309,,,,,,,,,,,,,


In [40]:
# Remove the rows in which every single column is null/NaN
steam_games.dropna(axis=0, how='all', inplace=True)

In [41]:
# Rows that still have at least one missing value
any_null_mask = steam_games.isna().any(axis='columns')
steam_games[any_null_mask]

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer
88314,,,Log Challenge,,http://store.steampowered.com/app/773570/Log_C...,,"[Action, Indie, Casual, Sports]",http://steamcommunity.com/app/773570/reviews/?...,"[Single-player, Full controller support, HTC V...",2.99,0.0,773570.0,
88316,,"[Free to Play, Indie, Simulation, Sports]",SNOW - All Access Basic Pass,SNOW - All Access Basic Pass,http://store.steampowered.com/app/774276/SNOW_...,2018-01-04,"[Free to Play, Indie, Simulation, Sports]",http://steamcommunity.com/app/774276/reviews/?...,"[Single-player, Multi-player, Online Multi-Pla...",9.99,0.0,774276.0,Poppermost Productions
...,...,...,...,...,...,...,...,...,...,...,...,...,...
120438,Riviysky,"[Casual, Indie]",BAE 2,BAE 2,http://store.steampowered.com/app/769330/BAE_2/,2018-01-04,"[Indie, Casual]",http://steamcommunity.com/app/769330/reviews/?...,[Single-player],,0.0,769330.0,Riviysky
120444,,,Maze Run VR,,http://store.steampowered.com/app/681550/Maze_...,,"[Early Access, Adventure, Indie, Action, Simul...",http://steamcommunity.com/app/681550/reviews/?...,"[Single-player, Stats, Steam Leaderboards, HTC...",4.99,1.0,681550.0,


In [42]:
# 'app_name' and 'url' have different non-null counts, and the URL embeds the app name.
# Inspect the rows without 'app_name' to see whether it can be imputed from 'url'.

missing_app_name = steam_games['app_name'].isna()
steam_games[missing_app_name]

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer
88384,,,,,http://store.steampowered.com/,,,,,19.99,0.0,,
90890,,"[Action, Indie]",,,http://store.steampowered.com/app/317160/_/,2014-08-26,"[Action, Indie]",http://steamcommunity.com/app/317160/reviews/?...,"[Single-player, Game demo]",,0.0,317160.0,


In [43]:
# Nothing can be recovered from 'url', so it is dropped together with the other columns
# the final project does not use. Strong assumption: without the space limitation these
# columns should be kept.

steam_games.drop(
    columns=['url', 'reviews_url', 'early_access', 'specs', 'publisher', 'developer'],
    inplace=True,
)

In [44]:
# Inspect the frame after dropping the unused columns
steam_games

Unnamed: 0,genres,app_name,title,release_date,tags,price,id
88310,"[Action, Casual, Indie, Simulation, Strategy]",Lost Summoner Kitty,Lost Summoner Kitty,2018-01-04,"[Strategy, Action, Indie, Casual, Simulation]",4.99,761140.0
88311,"[Free to Play, Indie, RPG, Strategy]",Ironbound,Ironbound,2018-01-04,"[Free to Play, Strategy, Indie, RPG, Card Game...",Free To Play,643980.0
...,...,...,...,...,...,...,...
120443,"[Casual, Indie]",EXIT 2 - Directions,EXIT 2 - Directions,2017-09-02,"[Indie, Casual, Puzzle, Singleplayer, Atmosphe...",4.99,658870.0
120444,,Maze Run VR,,,"[Early Access, Adventure, Indie, Action, Simul...",4.99,681550.0


In [45]:
# 'genres' and 'tags' appear to carry the same kind of information (theory: whoever
# entered the data did not know which field to use).
# Approach: merge both into a single column holding the de-duplicated union of labels.

columns_to_fill = ['genres', 'tags']

def get_values(row, columns_to_fill):
    """Combine two list-valued cells of a row into one de-duplicated list.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row as ``pd.Series``)
        Must be indexable by the two names in ``columns_to_fill``.
    columns_to_fill : sequence of two column names
        ``columns_to_fill[0]`` and ``columns_to_fill[1]`` are the cells to merge.

    Returns
    -------
    list or the raw second cell
        * both cells are lists -> union of their items, items of the second column
          first, without duplicates and in a deterministic order;
        * only the first cell is a list -> that list;
        * otherwise -> the second cell unchanged (a list, or its missing value).
    """
    first_cell = row[columns_to_fill[0]]
    second_cell = row[columns_to_fill[1]]

    # Check the cell itself: wrapping it in a Series and calling .any() never yields
    # a list, which would make every row fall through to the last branch.
    first_is_list = isinstance(first_cell, list)
    second_is_list = isinstance(second_cell, list)

    if first_is_list and second_is_list:
        # dict.fromkeys drops duplicates while keeping first-seen order
        return list(dict.fromkeys(second_cell + first_cell))
    if first_is_list:
        return first_cell
    return second_cell

# Build the combined 'tags&genres' column row by row with get_values

steam_games['tags&genres'] = steam_games.apply(
    lambda row: get_values(row, columns_to_fill),
    axis=1,
)

# The source columns are no longer needed
steam_games.drop(columns=['genres', 'tags'], inplace=True)


In [46]:
# Inspect the frame now that 'genres' and 'tags' are replaced by 'tags&genres'
steam_games

Unnamed: 0,app_name,title,release_date,price,id,tags&genres
88310,Lost Summoner Kitty,Lost Summoner Kitty,2018-01-04,4.99,761140.0,"[Strategy, Action, Indie, Casual, Simulation]"
88311,Ironbound,Ironbound,2018-01-04,Free To Play,643980.0,"[Free to Play, Strategy, Indie, RPG, Card Game..."
...,...,...,...,...,...,...
120443,EXIT 2 - Directions,EXIT 2 - Directions,2017-09-02,4.99,658870.0,"[Indie, Casual, Puzzle, Singleplayer, Atmosphe..."
120444,Maze Run VR,,,4.99,681550.0,"[Early Access, Adventure, Indie, Action, Simul..."


In [None]:
# Set every non-numeric 'price' to 0: many entries are text such as "Free To Play".
# Note: missing prices (NaN) also fail the numeric check and therefore become 0 too.

price_numeric = pd.to_numeric(steam_games['price'], errors='coerce')
# Single .loc assignment instead of chained indexing, which may write to a temporary copy
steam_games.loc[price_numeric.isna(), 'price'] = 0
steam_games

In [48]:
# Rename 'id' to 'item_id' so it matches the 'user_items.json.gz' source
# (its 'items' column holds an 'item_id' key)

steam_games.rename({'id': 'item_id'}, axis='columns', inplace=True)
steam_games

In [49]:
# Before converting 'item_id', first check the rows where it is missing.

missing_item_id = steam_games['item_id'].isna()
steam_games[missing_item_id]


Unnamed: 0,app_name,title,release_date,price,item_id,tags&genres
88384,,,,19.99,,
119271,Batman: Arkham City - Game of the Year Edition,Batman: Arkham City - Game of the Year Edition,2012-09-07,19.99,,"[Action, Open World, Batman, Adventure, Stealt..."


In [50]:
# These rows cannot be repaired, so they are removed

missing_id_mask = steam_games['item_id'].isnull()
rows_to_erase = steam_games.index[missing_id_mask]
steam_games.drop(labels=rows_to_erase, inplace=True)
steam_games

In [51]:
# With the missing ids removed, 'item_id' can be cast from float to int

item_id_rounded = steam_games['item_id'].round()
steam_games['item_id'] = item_id_rounded.astype(int)
steam_games

Unnamed: 0,app_name,title,release_date,price,item_id,tags&genres
88310,Lost Summoner Kitty,Lost Summoner Kitty,2018-01-04,4.99,761140,"[Strategy, Action, Indie, Casual, Simulation]"
88311,Ironbound,Ironbound,2018-01-04,0,643980,"[Free to Play, Strategy, Indie, RPG, Card Game..."
...,...,...,...,...,...,...
120443,EXIT 2 - Directions,EXIT 2 - Directions,2017-09-02,4.99,658870,"[Indie, Casual, Puzzle, Singleplayer, Atmosphe..."
120444,Maze Run VR,,,4.99,681550,"[Early Access, Adventure, Indie, Action, Simul..."


In [52]:
# Look for repeated 'item_id' values (anything with a count above 1 ends up at the bottom)

item_id_counts = steam_games['item_id'].value_counts()
item_id_counts.sort_values()

item_id
698100    1
773570    1
         ..
761480    1
612880    2
Name: count, Length: 32132, dtype: int64

In [53]:
# Show the rows sharing the repeated id found above
steam_games[steam_games['item_id'].eq(612880)]

Unnamed: 0,app_name,title,release_date,price,item_id,tags&genres
102204,Wolfenstein II: The New Colossus,Wolfenstein II: The New Colossus,2017-10-26,59.99,612880,"[Action, FPS, Gore, Violent, Alternate History..."
102883,Wolfenstein II: The New Colossus,Wolfenstein II: The New Colossus,2017-10-26,59.99,612880,"[Action, FPS, Gore, Violent, Alternate History..."


In [54]:
# The repeated rows are identical, so keep a single row per 'item_id'.
# keep='last' removes the earlier occurrence (label 102204 in this dataset) and,
# unlike dropping a hard-coded label, is safe to re-run.

steam_games.drop_duplicates(subset='item_id', keep='last', inplace=True)

In [55]:
# Sanity check: this selection should be empty (no null 'item_id' left)

still_missing_id = steam_games['item_id'].isna()
steam_games[still_missing_id]

Unnamed: 0,app_name,title,release_date,price,item_id,tags&genres


In [56]:
# Again, we suppose 'app_name' and 'title' may hold the same information.
# Approach: keep 'app_name' and fall back to 'title' when 'app_name' is missing.

columns_to_fill = ['app_name', 'title']

def get_values_notlist(row, columns_to_fill):
    """Coalesce two scalar cells of a row, preferring the first one.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row as ``pd.Series``)
        Must be indexable by the two names in ``columns_to_fill``.
    columns_to_fill : sequence of two column names
        ``columns_to_fill[0]`` is the preferred cell, ``columns_to_fill[1]`` the fallback.

    Returns
    -------
    The first cell, unless it is missing (None/NaN) or an empty string, in which case
    the second cell is returned as-is (it may itself be missing).
    """
    preferred = row[columns_to_fill[0]]
    fallback = row[columns_to_fill[1]]

    # Missing names arrive as None/NaN rather than '', so both cases must be checked
    if pd.isna(preferred) or preferred == '':
        return fallback
    return preferred

# Build the single 'app_name&title' column row by row with get_values_notlist

steam_games['app_name&title'] = steam_games.apply(
    lambda row: get_values_notlist(row, columns_to_fill),
    axis=1,
)

# The source columns are no longer needed

steam_games.drop(columns=['app_name', 'title'], inplace=True)
steam_games

In [58]:
# Rows whose 'app_name&title' already appeared in an earlier row
repeated_name_mask = steam_games['app_name&title'].duplicated(keep='first')
steam_games[repeated_name_mask]

Unnamed: 0,release_date,price,item_id,tags&genres,app_name&title
89819,2013-08-05,0,94304,"[Adventure, Indie, Casual]",The Dream Machine: Chapter 4
97764,2017-04-12,14.99,542220,"[Action, Indie, Casual]",Rumpus
...,...,...,...,...,...
119852,2009-12-04,0,41300,"[Free to Play, Action, Indie, Multiplayer, 2D,...",Altitude
120307,2017-12-29,1.99,746050,"[Casual, Action, Indie, RPG, Simulation, Singl...",MORE SWEATER? OK!


In [60]:
# Display options for the duplicate report below.
# NOTE(review): 'display.width' = 20 makes the printed frame wrap into one block per
# column; raise it if a single wide table is preferred.

pd.set_option('display.max_colwidth', 40)
pd.set_option('display.width', 20)

# Show up to 10 rows before the frame is elided

pd.set_option('display.max_rows', 10)

# Columns to check for duplicates (each listed once so no report is printed twice)

column_subset = ['app_name&title', 'item_id']

# Print every row that takes part in a duplicate, grouped by the checked column

print("\nInformation about duplicates in each column:")
for column in column_subset:
    # keep=False flags all members of a duplicate group, not only the repeats
    duplicates = steam_games[column].duplicated(keep=False)
    if duplicates.any():
        print(f"Column '{column}':")
        print(steam_games[duplicates].sort_values(by=column))

# we have 37 duplicated rows by 'app_name&title' column


Information about duplicates in each column:
Column 'app_name&title':
       release_date  \
107158         None   
118857   2012-11-13   
118489   2013-05-07   
118488   2013-05-07   
119713   2010-08-03   
...             ...   
110009   2016-09-05   
94892    2016-07-11   
100232   2017-10-17   
115981   2015-01-07   
115982   2015-01-07   

        price  \
107158   9.99   
118857   9.99   
118489  29.99   
118488      0   
119713   9.99   
...       ...   
110009  11.15   
94892    2.99   
100232   9.99   
115981      0   
115982  14.99   

        item_id  \
107158   599520   
118857   216173   
118489   219441   
118488   224850   
119713    63110   
...         ...   
110009   436260   
94892    494290   
100232   712230   
115981   329270   
115982   326670   

                                    tags&genres  \
107158                [Strategy, Action, Indie]   
118857  [Strategy, Action, Free to Play, Indie]   
118489                                 [Action]   
118488         

In [61]:
# Show the current state of steam_games
steam_games

Unnamed: 0,release_date,price,item_id,tags&genres,app_name&title
88310,2018-01-04,4.99,761140,"[Strategy, Action, Indie, Casual, Si...",Lost Summoner Kitty
88311,2018-01-04,0,643980,"[Free to Play, Strategy, Indie, RPG,...",Ironbound
88312,2017-07-24,0,670290,"[Free to Play, Simulation, Sports, C...",Real Pool 3D - Poolians
88313,2017-12-07,0.99,767400,"[Action, Adventure, Casual]",弹炸人2222
88314,,2.99,773570,"[Action, Indie, Casual, Sports]",Log Challenge
...,...,...,...,...,...
120440,2018-01-04,1.99,773640,"[Strategy, Indie, Casual, Simulation]",Colony On Mars
120441,2018-01-04,4.99,733530,"[Strategy, Indie, Casual]",LOGistICAL: South Africa
120442,2018-01-04,1.99,610660,"[Indie, Simulation, Racing]",Russian Roads
120443,2017-09-02,4.99,658870,"[Indie, Casual, Puzzle, Singleplayer...",EXIT 2 - Directions


In [None]:
# TODO: item_id values flagged for removal (not dropped by the cells above):
# 599520
# 224850
# 41300
# 565510                             


In [None]:
# Work on a copy with a fresh 0..n-1 index (the old labels are kept in an 'index' column)
df_copy = steam_games.copy().reset_index(drop=False)

# Coerce 'price' to numbers; text and missing prices become NaN and are then set to 0.
price_numeric = pd.to_numeric(df_copy['price'], errors='coerce')

# Rows of df_copy whose price was not numeric. They are located on df_copy itself,
# because labels taken from steam_games are not valid positions after reset_index.
non_numeric_values = df_copy.index[price_numeric.isna()]

# Only the 'price' cell is replaced; the rest of each row is left untouched
df_copy['price'] = price_numeric.fillna(0)


In [None]:
# Summary statistics of 'item_id' as a quick sanity check
df_copy['item_id'].describe()

count    3.213200e+04
mean     4.517524e+05
std      1.827147e+05
min      1.000000e+01
25%      3.292775e+05
50%      4.520300e+05
75%      5.933925e+05
max      2.028850e+06
Name: item_id, dtype: float64

In [None]:
# Parse 'release_date' as YYYY-MM-DD; anything that does not match becomes NaT

parsed_release_date = pd.to_datetime(
    df_copy['release_date'],
    errors='coerce',
    format='%Y-%m-%d',
)
df_copy['release_date'] = parsed_release_date
df_copy

Unnamed: 0,index,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,item_id,developer,tags&genres,app_name&title
0,88310,Kotoshiro,"[Action, Casual, Indie, Simulation, ...",Lost Summoner Kitty,Lost Summoner Kitty,http://store.steampowered.com/app/76...,2018-01-04,"[Strategy, Action, Indie, Casual, Si...",http://steamcommunity.com/app/761140...,[Single-player],4.99,0.0,761140,Kotoshiro,"[Strategy, Action, Indie, Casual, Si...",Lost Summoner Kitty
1,88311,"Making Fun, Inc.","[Free to Play, Indie, RPG, Strategy]",Ironbound,Ironbound,http://store.steampowered.com/app/64...,2018-01-04,"[Free to Play, Strategy, Indie, RPG,...",http://steamcommunity.com/app/643980...,"[Single-player, Multi-player, Online...",0,0.0,643980,Secret Level SRL,"[Free to Play, Strategy, Indie, RPG,...",Ironbound
2,88312,Poolians.com,"[Casual, Free to Play, Indie, Simula...",Real Pool 3D - Poolians,Real Pool 3D - Poolians,http://store.steampowered.com/app/67...,2017-07-24,"[Free to Play, Simulation, Sports, C...",http://steamcommunity.com/app/670290...,"[Single-player, Multi-player, Online...",0,0.0,670290,Poolians.com,"[Free to Play, Simulation, Sports, C...",Real Pool 3D - Poolians
3,88313,彼岸领域,"[Action, Adventure, Casual]",弹炸人2222,弹炸人2222,http://store.steampowered.com/app/76...,2017-12-07,"[Action, Adventure, Casual]",http://steamcommunity.com/app/767400...,[Single-player],0.99,0.0,767400,彼岸领域,"[Action, Adventure, Casual]",弹炸人2222
4,88314,,,Log Challenge,,http://store.steampowered.com/app/77...,NaT,"[Action, Indie, Casual, Sports]",http://steamcommunity.com/app/773570...,"[Single-player, Full controller supp...",2.99,0.0,773570,,"[Action, Indie, Casual, Sports]",Log Challenge
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32127,120440,Ghost_RUS Games,"[Casual, Indie, Simulation, Strategy]",Colony On Mars,Colony On Mars,http://store.steampowered.com/app/77...,2018-01-04,"[Strategy, Indie, Casual, Simulation]",http://steamcommunity.com/app/773640...,"[Single-player, Steam Achievements]",1.99,0.0,773640,"Nikita ""Ghost_RUS""","[Strategy, Indie, Casual, Simulation]",Colony On Mars
32128,120441,Sacada,"[Casual, Indie, Strategy]",LOGistICAL: South Africa,LOGistICAL: South Africa,http://store.steampowered.com/app/73...,2018-01-04,"[Strategy, Indie, Casual]",http://steamcommunity.com/app/733530...,"[Single-player, Steam Achievements, ...",4.99,0.0,733530,Sacada,"[Strategy, Indie, Casual]",LOGistICAL: South Africa
32129,120442,Laush Studio,"[Indie, Racing, Simulation]",Russian Roads,Russian Roads,http://store.steampowered.com/app/61...,2018-01-04,"[Indie, Simulation, Racing]",http://steamcommunity.com/app/610660...,"[Single-player, Steam Achievements, ...",1.99,0.0,610660,Laush Dmitriy Sergeevich,"[Indie, Simulation, Racing]",Russian Roads
32130,120443,SIXNAILS,"[Casual, Indie]",EXIT 2 - Directions,EXIT 2 - Directions,http://store.steampowered.com/app/65...,2017-09-02,"[Indie, Casual, Puzzle, Singleplayer...",http://steamcommunity.com/app/658870...,"[Single-player, Steam Achievements, ...",4.99,0.0,658870,"xropi,stev3ns","[Indie, Casual, Puzzle, Singleplayer...",EXIT 2 - Directions


In [None]:
# 'app_name' was merged into 'app_name&title' and dropped earlier, so the merged
# column is the one to check for missing names
df_copy[df_copy['app_name&title'].isna()]

Unnamed: 0,index,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,item_id,developer,tags&genres,app_name&title
2579,90890,,"[Action, Indie]",,,http://store.steampowered.com/app/31...,2014-08-26,"[Action, Indie]",http://steamcommunity.com/app/317160...,"[Single-player, Game demo]",0,0.0,317160,,"[Action, Indie]",


In [None]:
# Drop the rows without a usable name, selected by condition instead of a hard-coded
# label so the cell stays correct if the row order changes and is safe to re-run
df_copy.drop(index=df_copy.index[df_copy['app_name&title'].isna()], inplace=True)

In [None]:
# case=False performs a case-insensitive search (required by the project)
# Check https://github.com/soyHenry/PI_ML_OPS/tree/PT @ "Criterios de evaluación"
# na=False treats missing names as "no match" so the mask is strictly boolean


df_copy[df_copy['app_name&title'].str.contains('strike', case=False, na=False)]


Unnamed: 0,index,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,item_id,developer,tags&genres,app_name&title
747,89058,CI Games,[Action],Sniper: Ghost Warrior - Second Strike,Sniper: Ghost Warrior - Second Strike,http://store.steampowered.com/app/34...,2011-09-09,[Action],http://steamcommunity.com/app/34842/...,"[Single-player, Multi-player, Downlo...",1.99,0.0,34842,City Interactive,[Action],Sniper: Ghost Warrior - Second Strike
823,89134,SEGA,[Action],Renegade Ops - Coldstrike Campaign,Renegade Ops - Coldstrike Campaign,http://store.steampowered.com/app/20...,2011-12-19,[Action],http://steamcommunity.com/app/200610...,"[Single-player, Multi-player, Co-op,...",4.99,0.0,200610,Avalanche Studios,[Action],Renegade Ops - Coldstrike Campaign
887,89198,Deep Silver,[Simulation],Jane's Advanced Strike Fighters,Jane's Advanced Strike Fighters,http://store.steampowered.com/app/20...,2012-03-08,"[Simulation, Arcade, Flight]",http://steamcommunity.com/app/207790...,"[Single-player, Multi-player, Co-op]",9.99,0.0,207790,Trickstar Games,"[Simulation, Arcade, Flight]",Jane's Advanced Strike Fighters
1044,89355,Valve,[Action],Counter-Strike: Global Offensive,Counter-Strike: Global Offensive,http://store.steampowered.com/app/73...,2012-08-21,"[FPS, Multiplayer, Shooter, Action, ...",http://steamcommunity.com/app/730/re...,"[Multi-player, Steam Achievements, F...",14.99,0.0,730,Valve,"[FPS, Multiplayer, Shooter, Action, ...",Counter-Strike: Global Offensive
1260,89571,Born Ready Games Ltd.,"[Action, Indie, Simulation]",Strike Suit Zero Soundtrack,Strike Suit Zero Soundtrack,http://store.steampowered.com/app/20...,2013-01-23,"[Action, Indie, Simulation]",http://steamcommunity.com/app/209545...,"[Single-player, Downloadable Content...",9.99,0.0,209545,Born Ready Games Ltd.,"[Action, Indie, Simulation]",Strike Suit Zero Soundtrack
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31301,119614,Activision,[Action],Call of Duty®: Black Ops First Strik...,Call of Duty®: Black Ops First Strik...,http://store.steampowered.com/app/42...,2011-03-25,"[Action, Zombies]",http://steamcommunity.com/app/42716/...,"[Multi-player, Co-op, Downloadable C...",14.99,0.0,42716,Treyarch,"[Action, Zombies]",Call of Duty®: Black Ops First Strik...
32009,120322,Valve,[Action],Counter-Strike: Source,Counter-Strike: Source,http://store.steampowered.com/app/24...,2004-11-01,"[Action, FPS, Multiplayer, Shooter, ...",http://steamcommunity.com/app/240/re...,"[Multi-player, Cross-Platform Multip...",19.99,0.0,240,Valve,"[Action, FPS, Multiplayer, Shooter, ...",Counter-Strike: Source
32065,120378,,,LightStrike,,http://store.steampowered.com/app/57...,NaT,"[Simulation, Action, Indie, VR]",http://steamcommunity.com/app/579130...,"[Online Multi-Player, Steam Achievem...",3.99,0.0,579130,,"[Simulation, Action, Indie, VR]",LightStrike
32103,120416,Valve,[Action],Counter-Strike,Counter-Strike,http://store.steampowered.com/app/10...,2000-11-01,"[Action, FPS, Multiplayer, Shooter, ...",http://steamcommunity.com/app/10/rev...,"[Multi-player, Valve Anti-Cheat enab...",9.99,0.0,10,Valve,"[Action, FPS, Multiplayer, Shooter, ...",Counter-Strike


In [None]:
# Count missing values in 'item_id' and 'release_date'
df_copy[['item_id', 'release_date']].isna().sum()

item_id            0
release_date    2351
dtype: int64

In [None]:
# As shown above, 2351 'release_date' values are missing. We try to impute them with
# the date of the first review given by users.
# 'modified_date' is parsed as datetime so it can be combined with 'release_date'.

item_and_first_review = pd.read_csv(
    '../light_data/item_id&first_review_date.csv',
    parse_dates=['modified_date'],
)

In [None]:
# Inspect the first-review lookup table ('item_id', 'modified_date')
item_and_first_review

Unnamed: 0,item_id,modified_date
0,10,2011-05-18
1,10090,2012-10-10
2,10130,2014-09-15
3,10140,2011-07-10
4,10150,2010-12-07
...,...,...
3677,99400,2014-03-29
3678,99700,2011-12-13
3679,99810,2013-01-24
3680,99900,2011-06-26


In [None]:
# Left-join the first-review dates onto the games by 'item_id' (games without a match keep NaN)

merged_df = pd.merge(
    df_copy,
    item_and_first_review,
    how='left',
    on='item_id',
)

In [None]:
# Inspect the merge result ('modified_date' is the newly added column)
merged_df

Unnamed: 0,index,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,item_id,developer,tags&genres,app_name&title,modified_date
0,88310,Kotoshiro,"[Action, Casual, Indie, Simulation, ...",Lost Summoner Kitty,Lost Summoner Kitty,http://store.steampowered.com/app/76...,2018-01-04,"[Strategy, Action, Indie, Casual, Si...",http://steamcommunity.com/app/761140...,[Single-player],4.99,0.0,761140,Kotoshiro,"[Strategy, Action, Indie, Casual, Si...",Lost Summoner Kitty,
1,88311,"Making Fun, Inc.","[Free to Play, Indie, RPG, Strategy]",Ironbound,Ironbound,http://store.steampowered.com/app/64...,2018-01-04,"[Free to Play, Strategy, Indie, RPG,...",http://steamcommunity.com/app/643980...,"[Single-player, Multi-player, Online...",0,0.0,643980,Secret Level SRL,"[Free to Play, Strategy, Indie, RPG,...",Ironbound,
2,88312,Poolians.com,"[Casual, Free to Play, Indie, Simula...",Real Pool 3D - Poolians,Real Pool 3D - Poolians,http://store.steampowered.com/app/67...,2017-07-24,"[Free to Play, Simulation, Sports, C...",http://steamcommunity.com/app/670290...,"[Single-player, Multi-player, Online...",0,0.0,670290,Poolians.com,"[Free to Play, Simulation, Sports, C...",Real Pool 3D - Poolians,
3,88313,彼岸领域,"[Action, Adventure, Casual]",弹炸人2222,弹炸人2222,http://store.steampowered.com/app/76...,2017-12-07,"[Action, Adventure, Casual]",http://steamcommunity.com/app/767400...,[Single-player],0.99,0.0,767400,彼岸领域,"[Action, Adventure, Casual]",弹炸人2222,
4,88314,,,Log Challenge,,http://store.steampowered.com/app/77...,NaT,"[Action, Indie, Casual, Sports]",http://steamcommunity.com/app/773570...,"[Single-player, Full controller supp...",2.99,0.0,773570,,"[Action, Indie, Casual, Sports]",Log Challenge,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32127,120440,Ghost_RUS Games,"[Casual, Indie, Simulation, Strategy]",Colony On Mars,Colony On Mars,http://store.steampowered.com/app/77...,2018-01-04,"[Strategy, Indie, Casual, Simulation]",http://steamcommunity.com/app/773640...,"[Single-player, Steam Achievements]",1.99,0.0,773640,"Nikita ""Ghost_RUS""","[Strategy, Indie, Casual, Simulation]",Colony On Mars,
32128,120441,Sacada,"[Casual, Indie, Strategy]",LOGistICAL: South Africa,LOGistICAL: South Africa,http://store.steampowered.com/app/73...,2018-01-04,"[Strategy, Indie, Casual]",http://steamcommunity.com/app/733530...,"[Single-player, Steam Achievements, ...",4.99,0.0,733530,Sacada,"[Strategy, Indie, Casual]",LOGistICAL: South Africa,
32129,120442,Laush Studio,"[Indie, Racing, Simulation]",Russian Roads,Russian Roads,http://store.steampowered.com/app/61...,2018-01-04,"[Indie, Simulation, Racing]",http://steamcommunity.com/app/610660...,"[Single-player, Steam Achievements, ...",1.99,0.0,610660,Laush Dmitriy Sergeevich,"[Indie, Simulation, Racing]",Russian Roads,
32130,120443,SIXNAILS,"[Casual, Indie]",EXIT 2 - Directions,EXIT 2 - Directions,http://store.steampowered.com/app/65...,2017-09-02,"[Indie, Casual, Puzzle, Singleplayer...",http://steamcommunity.com/app/658870...,"[Single-player, Steam Achievements, ...",4.99,0.0,658870,"xropi,stev3ns","[Indie, Casual, Puzzle, Singleplayer...",EXIT 2 - Directions,


In [None]:
# Fill null values in "release_date" with corresponding values from "modified_date".
# The fill values are converted to datetime first so the column keeps a datetime dtype,
# and the result is assigned back: an in-place fillna on a column selection may only
# modify a temporary copy.
first_review_date = pd.to_datetime(merged_df['modified_date'], errors='coerce')
merged_df['release_date'] = merged_df['release_date'].fillna(first_review_date)

In [None]:
# Missing release dates went down from 2351 to 2263 (88 dates recovered)
merged_df[['item_id', 'release_date']].isna().sum()

item_id            0
release_date    2263
dtype: int64

In [None]:
# Summary statistics of 'release_date' (count, mean, min, quartiles, max)
merged_df['release_date'].describe()

count                            29869
mean     2015-04-23 20:58:43.706852864
min                1970-07-15 00:00:00
25%                2014-10-02 00:00:00
50%                2016-04-18 00:00:00
75%                2017-04-18 00:00:00
max                2021-12-31 00:00:00
Name: release_date, dtype: object

In [None]:
# The 'release_date' values that are still missing are imputed with a KNN model,
# using 'price' as the companion feature.

from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler

# Origin used to express dates as a day count. The feature is standardized before
# imputation, so the choice of origin does not influence the result.
REFERENCE_DATE = pd.Timestamp('1970-07-15')

# Select relevant columns for imputation
columns_for_imputation = ['release_date_numeric', 'price']

# Coerce defensively so the datetime arithmetic works even if the column is object dtype
release_date = pd.to_datetime(merged_df['release_date'], errors='coerce')

# Build the feature matrix separately instead of adding a temporary column to merged_df
features = pd.DataFrame(
    {
        'release_date_numeric': (release_date - REFERENCE_DATE).dt.days,
        'price': pd.to_numeric(merged_df['price'], errors='coerce'),
    },
    index=merged_df.index,
)

# Initialize KNNImputer
imputer = KNNImputer(n_neighbors=3)  # Adjust the number of neighbors as needed

# Standardize the data before imputation so both features weigh equally in the distance
scaler = StandardScaler()
df_scaled = scaler.fit_transform(features[columns_for_imputation])

# Impute missing values
df_imputed_scaled = imputer.fit_transform(df_scaled)

# Inverse transform to get back the original scale; reuse merged_df's index so the
# assignment below aligns row by row whatever that index looks like
df_imputed = pd.DataFrame(
    scaler.inverse_transform(df_imputed_scaled),
    columns=columns_for_imputation,
    index=merged_df.index,
)

# Convert the imputed day counts back to dates (rounded to whole days)
imputed_dates = REFERENCE_DATE + pd.to_timedelta(
    df_imputed['release_date_numeric'].round(), unit='D'
)

# Known dates are kept exactly as they are: sending them through the scale/unscale
# round trip could shift them by a day through floating-point error. Only the gaps
# are taken from the KNN estimate.
merged_df['release_date_imputed'] = release_date.fillna(imputed_dates).dt.date


In [None]:
# Check that no missing values remain in 'release_date_imputed'
merged_df[['item_id', 'release_date_imputed']].isna().sum()

item_id                 0
release_date_imputed    0
dtype: int64

In [None]:
# Inspect the frame with the new 'release_date_imputed' column
merged_df

Unnamed: 0,index,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,item_id,developer,tags&genres,app_name&title,modified_date,release_date_imputed
0,88310,Kotoshiro,"[Action, Casual, Indie, Simulation, ...",Lost Summoner Kitty,Lost Summoner Kitty,http://store.steampowered.com/app/76...,2018-01-04,"[Strategy, Action, Indie, Casual, Si...",http://steamcommunity.com/app/761140...,[Single-player],4.99,0.0,761140,Kotoshiro,"[Strategy, Action, Indie, Casual, Si...",Lost Summoner Kitty,,2018-01-04
1,88311,"Making Fun, Inc.","[Free to Play, Indie, RPG, Strategy]",Ironbound,Ironbound,http://store.steampowered.com/app/64...,2018-01-04,"[Free to Play, Strategy, Indie, RPG,...",http://steamcommunity.com/app/643980...,"[Single-player, Multi-player, Online...",0,0.0,643980,Secret Level SRL,"[Free to Play, Strategy, Indie, RPG,...",Ironbound,,2018-01-04
2,88312,Poolians.com,"[Casual, Free to Play, Indie, Simula...",Real Pool 3D - Poolians,Real Pool 3D - Poolians,http://store.steampowered.com/app/67...,2017-07-24,"[Free to Play, Simulation, Sports, C...",http://steamcommunity.com/app/670290...,"[Single-player, Multi-player, Online...",0,0.0,670290,Poolians.com,"[Free to Play, Simulation, Sports, C...",Real Pool 3D - Poolians,,2017-07-24
3,88313,彼岸领域,"[Action, Adventure, Casual]",弹炸人2222,弹炸人2222,http://store.steampowered.com/app/76...,2017-12-07,"[Action, Adventure, Casual]",http://steamcommunity.com/app/767400...,[Single-player],0.99,0.0,767400,彼岸领域,"[Action, Adventure, Casual]",弹炸人2222,,2017-12-07
4,88314,,,Log Challenge,,http://store.steampowered.com/app/77...,NaT,"[Action, Indie, Casual, Sports]",http://steamcommunity.com/app/773570...,"[Single-player, Full controller supp...",2.99,0.0,773570,,"[Action, Indie, Casual, Sports]",Log Challenge,,2017-12-26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32127,120440,Ghost_RUS Games,"[Casual, Indie, Simulation, Strategy]",Colony On Mars,Colony On Mars,http://store.steampowered.com/app/77...,2018-01-04,"[Strategy, Indie, Casual, Simulation]",http://steamcommunity.com/app/773640...,"[Single-player, Steam Achievements]",1.99,0.0,773640,"Nikita ""Ghost_RUS""","[Strategy, Indie, Casual, Simulation]",Colony On Mars,,2018-01-04
32128,120441,Sacada,"[Casual, Indie, Strategy]",LOGistICAL: South Africa,LOGistICAL: South Africa,http://store.steampowered.com/app/73...,2018-01-04,"[Strategy, Indie, Casual]",http://steamcommunity.com/app/733530...,"[Single-player, Steam Achievements, ...",4.99,0.0,733530,Sacada,"[Strategy, Indie, Casual]",LOGistICAL: South Africa,,2018-01-04
32129,120442,Laush Studio,"[Indie, Racing, Simulation]",Russian Roads,Russian Roads,http://store.steampowered.com/app/61...,2018-01-04,"[Indie, Simulation, Racing]",http://steamcommunity.com/app/610660...,"[Single-player, Steam Achievements, ...",1.99,0.0,610660,Laush Dmitriy Sergeevich,"[Indie, Simulation, Racing]",Russian Roads,,2018-01-04
32130,120443,SIXNAILS,"[Casual, Indie]",EXIT 2 - Directions,EXIT 2 - Directions,http://store.steampowered.com/app/65...,2017-09-02,"[Indie, Casual, Puzzle, Singleplayer...",http://steamcommunity.com/app/658870...,"[Single-player, Steam Achievements, ...",4.99,0.0,658870,"xropi,stev3ns","[Indie, Casual, Puzzle, Singleplayer...",EXIT 2 - Directions,,2017-09-02


In [None]:
# Export the columns needed to implement:
#
# def PlayTimeGenre( genero : str ): must return the release year with the most hours
# played for the given genre.
# Example return: {"Release year with the most hours played for Genre X" : 2013}

playtimegenre_view = merged_df[['item_id', 'release_date_imputed', 'tags&genres']]
playtimegenre_view.to_csv('../light_data/playtimegenre.csv', index=False)

# Export the columns needed to implement:
#
# def UsersRecommend( año : int ): returns the top 3 games MOST recommended by users
# for the given year (reviews.recommend = True and positive/neutral comments).
# Example return: [{"Rank 1" : X}, {"Rank 2" : Y}, {"Rank 3" : Z}]

item_name_view = merged_df[['item_id', 'app_name&title']]
item_name_view.to_csv('../light_data/item_id&name.csv', index=False)

In [None]:
# Final look at the merged frame after the exports
merged_df

Unnamed: 0,index,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,item_id,developer,tags&genres,app_name&title,modified_date,release_date_imputed
0,88310,Kotoshiro,"[Action, Casual, Indie, Simulation, ...",Lost Summoner Kitty,Lost Summoner Kitty,http://store.steampowered.com/app/76...,2018-01-04,"[Strategy, Action, Indie, Casual, Si...",http://steamcommunity.com/app/761140...,[Single-player],4.99,0.0,761140,Kotoshiro,"[Strategy, Action, Indie, Casual, Si...",Lost Summoner Kitty,,2018-01-04
1,88311,"Making Fun, Inc.","[Free to Play, Indie, RPG, Strategy]",Ironbound,Ironbound,http://store.steampowered.com/app/64...,2018-01-04,"[Free to Play, Strategy, Indie, RPG,...",http://steamcommunity.com/app/643980...,"[Single-player, Multi-player, Online...",0,0.0,643980,Secret Level SRL,"[Free to Play, Strategy, Indie, RPG,...",Ironbound,,2018-01-04
2,88312,Poolians.com,"[Casual, Free to Play, Indie, Simula...",Real Pool 3D - Poolians,Real Pool 3D - Poolians,http://store.steampowered.com/app/67...,2017-07-24,"[Free to Play, Simulation, Sports, C...",http://steamcommunity.com/app/670290...,"[Single-player, Multi-player, Online...",0,0.0,670290,Poolians.com,"[Free to Play, Simulation, Sports, C...",Real Pool 3D - Poolians,,2017-07-24
3,88313,彼岸领域,"[Action, Adventure, Casual]",弹炸人2222,弹炸人2222,http://store.steampowered.com/app/76...,2017-12-07,"[Action, Adventure, Casual]",http://steamcommunity.com/app/767400...,[Single-player],0.99,0.0,767400,彼岸领域,"[Action, Adventure, Casual]",弹炸人2222,,2017-12-07
4,88314,,,Log Challenge,,http://store.steampowered.com/app/77...,NaT,"[Action, Indie, Casual, Sports]",http://steamcommunity.com/app/773570...,"[Single-player, Full controller supp...",2.99,0.0,773570,,"[Action, Indie, Casual, Sports]",Log Challenge,,2017-12-26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32127,120440,Ghost_RUS Games,"[Casual, Indie, Simulation, Strategy]",Colony On Mars,Colony On Mars,http://store.steampowered.com/app/77...,2018-01-04,"[Strategy, Indie, Casual, Simulation]",http://steamcommunity.com/app/773640...,"[Single-player, Steam Achievements]",1.99,0.0,773640,"Nikita ""Ghost_RUS""","[Strategy, Indie, Casual, Simulation]",Colony On Mars,,2018-01-04
32128,120441,Sacada,"[Casual, Indie, Strategy]",LOGistICAL: South Africa,LOGistICAL: South Africa,http://store.steampowered.com/app/73...,2018-01-04,"[Strategy, Indie, Casual]",http://steamcommunity.com/app/733530...,"[Single-player, Steam Achievements, ...",4.99,0.0,733530,Sacada,"[Strategy, Indie, Casual]",LOGistICAL: South Africa,,2018-01-04
32129,120442,Laush Studio,"[Indie, Racing, Simulation]",Russian Roads,Russian Roads,http://store.steampowered.com/app/61...,2018-01-04,"[Indie, Simulation, Racing]",http://steamcommunity.com/app/610660...,"[Single-player, Steam Achievements, ...",1.99,0.0,610660,Laush Dmitriy Sergeevich,"[Indie, Simulation, Racing]",Russian Roads,,2018-01-04
32130,120443,SIXNAILS,"[Casual, Indie]",EXIT 2 - Directions,EXIT 2 - Directions,http://store.steampowered.com/app/65...,2017-09-02,"[Indie, Casual, Puzzle, Singleplayer...",http://steamcommunity.com/app/658870...,"[Single-player, Steam Achievements, ...",4.99,0.0,658870,"xropi,stev3ns","[Indie, Casual, Puzzle, Singleplayer...",EXIT 2 - Directions,,2017-09-02
