# ETL process

## We import the libraries

In [97]:
import pandas as pd
import numpy as np
import json
import ast
import re

## Then, we open the JSONs

In [98]:
df_games = pd.read_json(r'Datasets\output_steam_games.json' ,lines=True)

In [99]:
data_list = []       # List to save the JSON's dictionaries for every line

file_path = r"Datasets\australian_user_reviews.json"

# Open the archive and process every line
with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        try:
            # Use ast.literal_eval to transform the line to a dictionary
            json_data = ast.literal_eval(line)
            data_list.append(json_data)
        except ValueError as e:
            print(f"Error in line: {line}")
            continue

df_user_reviews = pd.DataFrame(data_list)

In [100]:
# same process here:
data_list = []   

file_path = r"Datasets\australian_users_items.json"

with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        try:
            json_data = ast.literal_eval(line)
            data_list.append(json_data)
        except ValueError as e:
            print(f"Error in line: {line}")
            continue

df_user_items = pd.DataFrame(data_list)

## Datasets visualization and first data explorations

In [101]:
df_games.head()

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer
0,,,,,,,,,,,,,
1,,,,,,,,,,,,,
2,,,,,,,,,,,,,
3,,,,,,,,,,,,,
4,,,,,,,,,,,,,


In [102]:
df_user_items.head()

Unnamed: 0,user_id,items_count,steam_id,user_url,items
0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
1,js41637,888,76561198035864385,http://steamcommunity.com/id/js41637,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
2,evcentric,137,76561198007712555,http://steamcommunity.com/id/evcentric,"[{'item_id': '1200', 'item_name': 'Red Orchest..."
3,Riot-Punch,328,76561197963445855,http://steamcommunity.com/id/Riot-Punch,"[{'item_id': '10', 'item_name': 'Counter-Strik..."
4,doctr,541,76561198002099482,http://steamcommunity.com/id/doctr,"[{'item_id': '300', 'item_name': 'Day of Defea..."


In [103]:
df_user_reviews.head()

Unnamed: 0,user_id,user_url,reviews
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,"[{'funny': '', 'posted': 'Posted November 5, 2..."
1,js41637,http://steamcommunity.com/id/js41637,"[{'funny': '', 'posted': 'Posted June 24, 2014..."
2,evcentric,http://steamcommunity.com/id/evcentric,"[{'funny': '', 'posted': 'Posted February 3.',..."
3,doctr,http://steamcommunity.com/id/doctr,"[{'funny': '', 'posted': 'Posted October 14, 2..."
4,maplemage,http://steamcommunity.com/id/maplemage,"[{'funny': '3 people found this review funny',..."


In [104]:
df_games.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 120445 entries, 0 to 120444
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   publisher     24083 non-null  object 
 1   genres        28852 non-null  object 
 2   app_name      32133 non-null  object 
 3   title         30085 non-null  object 
 4   url           32135 non-null  object 
 5   release_date  30068 non-null  object 
 6   tags          31972 non-null  object 
 7   reviews_url   32133 non-null  object 
 8   specs         31465 non-null  object 
 9   price         30758 non-null  object 
 10  early_access  32135 non-null  float64
 11  id            32133 non-null  float64
 12  developer     28836 non-null  object 
dtypes: float64(2), object(11)
memory usage: 11.9+ MB


In [105]:
df_user_items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 88310 entries, 0 to 88309
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   user_id      88310 non-null  object
 1   items_count  88310 non-null  int64 
 2   steam_id     88310 non-null  object
 3   user_url     88310 non-null  object
 4   items        88310 non-null  object
dtypes: int64(1), object(4)
memory usage: 3.4+ MB


In [106]:
df_user_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25799 entries, 0 to 25798
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   user_id   25799 non-null  object
 1   user_url  25799 non-null  object
 2   reviews   25799 non-null  object
dtypes: object(3)
memory usage: 604.8+ KB


We see that the dataframes have a lot of unnested data and completely empty rows, we are gonna work with that

# Now, we are going to do an specific ETL for every dataframe 

# ETL for df_games

drop the empty columns

In [107]:
df_games = df_games.dropna(how ="all").reset_index(drop=True)

We search for the items with the same id

In [108]:
duplicated_rows_games = df_games[df_games.duplicated(subset = "id", keep=False)]
duplicated_rows_games

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer
74,,,,,http://store.steampowered.com/,,,,,19.99,0.0,,
13894,Bethesda Softworks,[Action],Wolfenstein II: The New Colossus,Wolfenstein II: The New Colossus,http://store.steampowered.com/app/612880/,2017-10-26,"[Action, FPS, Gore, Violent, Alternate History...",http://steamcommunity.com/app/612880/reviews/?...,"[Single-player, Steam Achievements, Full contr...",59.99,0.0,612880.0,Machine Games
14573,Bethesda Softworks,[Action],Wolfenstein II: The New Colossus,Wolfenstein II: The New Colossus,http://store.steampowered.com/app/612880/Wolfe...,2017-10-26,"[Action, FPS, Gore, Violent, Alternate History...",http://steamcommunity.com/app/612880/reviews/?...,"[Single-player, Steam Achievements, Full contr...",59.99,0.0,612880.0,Machine Games
30961,"Warner Bros. Interactive Entertainment, Feral ...","[Action, Adventure]",Batman: Arkham City - Game of the Year Edition,Batman: Arkham City - Game of the Year Edition,http://store.steampowered.com/app/200260,2012-09-07,"[Action, Open World, Batman, Adventure, Stealt...",,"[Single-player, Steam Achievements, Steam Trad...",19.99,0.0,,"Rocksteady Studios,Feral Interactive (Mac)"


The first row is empty, so we have to drop it. The bethesda's rows are the same, so we are dropping one of them.
We have to look the Batman's game case

In [109]:
batman_mask = df_games["title"] == "Batman: Arkham City - Game of the Year Edition"
batman_game = df_games[batman_mask]
batman_game

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer
1068,"Warner Bros. Interactive Entertainment, Feral ...","[Action, Adventure]",Batman: Arkham City - Game of the Year Edition,Batman: Arkham City - Game of the Year Edition,http://store.steampowered.com/app/200260/Batma...,2012-09-07,"[Action, Open World, Batman, Adventure, Stealt...",http://steamcommunity.com/app/200260/reviews/?...,"[Single-player, Steam Achievements, Steam Trad...",19.99,0.0,200260.0,"Rocksteady Studios,Feral Interactive (Mac)"
30961,"Warner Bros. Interactive Entertainment, Feral ...","[Action, Adventure]",Batman: Arkham City - Game of the Year Edition,Batman: Arkham City - Game of the Year Edition,http://store.steampowered.com/app/200260,2012-09-07,"[Action, Open World, Batman, Adventure, Stealt...",,"[Single-player, Steam Achievements, Steam Trad...",19.99,0.0,,"Rocksteady Studios,Feral Interactive (Mac)"


The two rows have the same game, so we drop one, the one with no id

In [110]:
df_games = df_games.drop([30961,74,13894])

We look at the genres and tags columns, since are very similar

In [111]:
empty_rows_genres = df_games[df_games['genres'].isna()] 
empty_rows_genres

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer
4,,,Log Challenge,,http://store.steampowered.com/app/773570/Log_C...,,"[Action, Indie, Casual, Sports]",http://steamcommunity.com/app/773570/reviews/?...,"[Single-player, Full controller support, HTC V...",2.99,0.0,773570.0,
11,,,Icarus Six Sixty Six,,http://store.steampowered.com/app/724910/Icaru...,,[Casual],http://steamcommunity.com/app/724910/reviews/?...,"[Single-player, HTC Vive, Tracked Motion Contr...",Free,0.0,724910.0,
19,,,After Life VR,,http://store.steampowered.com/app/772590/After...,,"[Early Access, Indie, VR]",http://steamcommunity.com/app/772590/reviews/?...,"[Single-player, HTC Vive, Tracked Motion Contr...",4.99,1.0,772590.0,
20,,,Kitty Hawk,,http://store.steampowered.com/app/640250/Kitty...,,"[Early Access, Action, Adventure, Indie, Casual]",http://steamcommunity.com/app/640250/reviews/?...,"[Single-player, Steam Leaderboards, HTC Vive, ...",2.99,1.0,640250.0,
22,,,Mortars VR,,http://store.steampowered.com/app/711440/Morta...,,"[Early Access, Strategy, Action, Indie, Casual...",http://steamcommunity.com/app/711440/reviews/?...,"[Single-player, Multi-player, Online Multi-Pla...",0.99,1.0,711440.0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32118,,,Robotpencil Presents: Exercise: Brushwork,Robotpencil Presents: Exercise: Brushwork,http://store.steampowered.com/app/775640/Robot...,2018-01-03,"[Design & Illustration, Tutorial]",http://steamcommunity.com/app/775640/reviews/?...,,3.99,0.0,775640.0,
32119,,,Robotpencil Presents: Creative Composition,Robotpencil Presents: Creative Composition,http://store.steampowered.com/app/777930/Robot...,2018-01-03,"[Design & Illustration, Tutorial]",http://steamcommunity.com/app/777930/reviews/?...,,3.99,0.0,777930.0,
32120,,,The Gamble House,The Gamble House,http://store.steampowered.com/app/775370/The_G...,2016-11-19,[Movie],http://steamcommunity.com/app/775370/reviews/?...,[Captions available],4.99,0.0,775370.0,
32121,,,Kalen Chock Presents: 2017 Free Tutorial,Kalen Chock Presents: 2017 Free Tutorial,http://store.steampowered.com/app/777950/Kalen...,2018-01-03,"[Design & Illustration, Tutorial]",http://steamcommunity.com/app/777950/reviews/?...,,Free,0.0,777950.0,


In [112]:
empty_rows_tags = df_games[df_games['tags'].isna()] 
empty_rows_tags

Unnamed: 0,publisher,genres,app_name,title,url,release_date,tags,reviews_url,specs,price,early_access,id,developer
279,Her Interactive,[Adventure],Nancy Drew®: Ransom of the Seven Ships Demo,Nancy Drew®: Ransom of the Seven Ships Demo,http://store.steampowered.com/app/31990/Nancy_...,2009-07-21,,http://steamcommunity.com/app/31990/reviews/?b...,"[Single-player, Game demo]",,0.0,31990.0,Her Interactive
298,"SQUARE ENIX, Eidos Interactive","[Action, Adventure]",Mini Ninjas Demo,Mini Ninjas Demo,http://store.steampowered.com/app/35050/Mini_N...,2009-08-19,,http://steamcommunity.com/app/35050/reviews/?b...,"[Single-player, Game demo]",,0.0,35050.0,IO Interactive
309,Freeze Tag Inc.,[Casual],Freeze Tag Fun Pack #2,Freeze Tag Fun Pack #2,http://store.steampowered.com/app/39392/Freeze...,2009-09-23,,http://steamcommunity.com/app/39392/reviews/?b...,[Single-player],19.99,0.0,39392.0,"Freeze Tag Inc.,iQ212"
310,Freeze Tag Inc.,[Casual],Freeze Tag Fun Pack #1,Freeze Tag Fun Pack #1,http://store.steampowered.com/app/39391/Freeze...,1999-11-30,,http://steamcommunity.com/app/39391/reviews/?b...,[Single-player],19.99,0.0,39391.0,"Freeze Tag Inc.,Joju Games,Linksolutions Ltd.,..."
358,Paradox Interactive,,Europa Universalis III: Heir to the Throne,Europa Universalis III: Heir to the Throne,http://store.steampowered.com/app/25806/Europa...,2009-12-15,,http://steamcommunity.com/app/25806/reviews/?b...,"[Single-player, Multi-player, Downloadable Con...",9.99,0.0,25806.0,Paradox Interactive
...,...,...,...,...,...,...,...,...,...,...,...,...,...
31625,PlayFirst,[Casual],Zenerchi®,Zenerchi®,http://store.steampowered.com/app/37290/Zenerchi/,2009-08-12,,http://steamcommunity.com/app/37290/reviews/?b...,[Single-player],9.99,0.0,37290.0,Red Rocket Games
31627,PlayFirst,[Simulation],Wedding Dash® 2: Rings Around the World,Wedding Dash® 2: Rings Around the World,http://store.steampowered.com/app/37280/Weddin...,2009-08-12,,http://steamcommunity.com/app/37280/reviews/?b...,[Single-player],9.99,0.0,37280.0,Sarbakan
31639,Sandlot Games,,Super Granny Collection,Super Granny Collection,http://store.steampowered.com/app/36270/Super_...,2009-07-17,,http://steamcommunity.com/app/36270/reviews/?b...,[Single-player],14.99,0.0,36270.0,Sandlot Games
31652,Strategy First,[Indie],Bad Rats: the Rats' Revenge Demo,Bad Rats: the Rats' Revenge Demo,http://store.steampowered.com/app/34910/Bad_Ra...,2009-07-20,,http://steamcommunity.com/app/34910/reviews/?b...,"[Single-player, Game demo]",,0.0,34910.0,Invent4 Entertainment


The tags and genres columns have similar atributes (very similar). We are going to drop the tags column, since we are not going to use it in the api and the machine learning model. So, as not to waste the information, we are going to fill the empty values in genres with the values in tags that are not empty.

In [113]:
df_games['genre'] = df_games['genres'].combine_first(df_games['tags'])
df_games = df_games.drop(['tags', 'genres'], axis=1)
df_games


Unnamed: 0,publisher,app_name,title,url,release_date,reviews_url,specs,price,early_access,id,developer,genre
0,Kotoshiro,Lost Summoner Kitty,Lost Summoner Kitty,http://store.steampowered.com/app/761140/Lost_...,2018-01-04,http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,0.0,761140.0,Kotoshiro,"[Action, Casual, Indie, Simulation, Strategy]"
1,"Making Fun, Inc.",Ironbound,Ironbound,http://store.steampowered.com/app/643980/Ironb...,2018-01-04,http://steamcommunity.com/app/643980/reviews/?...,"[Single-player, Multi-player, Online Multi-Pla...",Free To Play,0.0,643980.0,Secret Level SRL,"[Free to Play, Indie, RPG, Strategy]"
2,Poolians.com,Real Pool 3D - Poolians,Real Pool 3D - Poolians,http://store.steampowered.com/app/670290/Real_...,2017-07-24,http://steamcommunity.com/app/670290/reviews/?...,"[Single-player, Multi-player, Online Multi-Pla...",Free to Play,0.0,670290.0,Poolians.com,"[Casual, Free to Play, Indie, Simulation, Sports]"
3,彼岸领域,弹炸人2222,弹炸人2222,http://store.steampowered.com/app/767400/2222/,2017-12-07,http://steamcommunity.com/app/767400/reviews/?...,[Single-player],0.99,0.0,767400.0,彼岸领域,"[Action, Adventure, Casual]"
4,,Log Challenge,,http://store.steampowered.com/app/773570/Log_C...,,http://steamcommunity.com/app/773570/reviews/?...,"[Single-player, Full controller support, HTC V...",2.99,0.0,773570.0,,"[Action, Indie, Casual, Sports]"
...,...,...,...,...,...,...,...,...,...,...,...,...
32130,Ghost_RUS Games,Colony On Mars,Colony On Mars,http://store.steampowered.com/app/773640/Colon...,2018-01-04,http://steamcommunity.com/app/773640/reviews/?...,"[Single-player, Steam Achievements]",1.99,0.0,773640.0,"Nikita ""Ghost_RUS""","[Casual, Indie, Simulation, Strategy]"
32131,Sacada,LOGistICAL: South Africa,LOGistICAL: South Africa,http://store.steampowered.com/app/733530/LOGis...,2018-01-04,http://steamcommunity.com/app/733530/reviews/?...,"[Single-player, Steam Achievements, Steam Clou...",4.99,0.0,733530.0,Sacada,"[Casual, Indie, Strategy]"
32132,Laush Studio,Russian Roads,Russian Roads,http://store.steampowered.com/app/610660/Russi...,2018-01-04,http://steamcommunity.com/app/610660/reviews/?...,"[Single-player, Steam Achievements, Steam Trad...",1.99,0.0,610660.0,Laush Dmitriy Sergeevich,"[Indie, Racing, Simulation]"
32133,SIXNAILS,EXIT 2 - Directions,EXIT 2 - Directions,http://store.steampowered.com/app/658870/EXIT_...,2017-09-02,http://steamcommunity.com/app/658870/reviews/?...,"[Single-player, Steam Achievements, Steam Cloud]",4.99,0.0,658870.0,"xropi,stev3ns","[Casual, Indie]"


The same goes with the appname and title, so we do the same

In [114]:
df_games['app_title'] = df_games['app_name'].combine_first(df_games['title'])
df_games = df_games.drop(['app_name', 'title'], axis=1)
df_games

Unnamed: 0,publisher,url,release_date,reviews_url,specs,price,early_access,id,developer,genre,app_title
0,Kotoshiro,http://store.steampowered.com/app/761140/Lost_...,2018-01-04,http://steamcommunity.com/app/761140/reviews/?...,[Single-player],4.99,0.0,761140.0,Kotoshiro,"[Action, Casual, Indie, Simulation, Strategy]",Lost Summoner Kitty
1,"Making Fun, Inc.",http://store.steampowered.com/app/643980/Ironb...,2018-01-04,http://steamcommunity.com/app/643980/reviews/?...,"[Single-player, Multi-player, Online Multi-Pla...",Free To Play,0.0,643980.0,Secret Level SRL,"[Free to Play, Indie, RPG, Strategy]",Ironbound
2,Poolians.com,http://store.steampowered.com/app/670290/Real_...,2017-07-24,http://steamcommunity.com/app/670290/reviews/?...,"[Single-player, Multi-player, Online Multi-Pla...",Free to Play,0.0,670290.0,Poolians.com,"[Casual, Free to Play, Indie, Simulation, Sports]",Real Pool 3D - Poolians
3,彼岸领域,http://store.steampowered.com/app/767400/2222/,2017-12-07,http://steamcommunity.com/app/767400/reviews/?...,[Single-player],0.99,0.0,767400.0,彼岸领域,"[Action, Adventure, Casual]",弹炸人2222
4,,http://store.steampowered.com/app/773570/Log_C...,,http://steamcommunity.com/app/773570/reviews/?...,"[Single-player, Full controller support, HTC V...",2.99,0.0,773570.0,,"[Action, Indie, Casual, Sports]",Log Challenge
...,...,...,...,...,...,...,...,...,...,...,...
32130,Ghost_RUS Games,http://store.steampowered.com/app/773640/Colon...,2018-01-04,http://steamcommunity.com/app/773640/reviews/?...,"[Single-player, Steam Achievements]",1.99,0.0,773640.0,"Nikita ""Ghost_RUS""","[Casual, Indie, Simulation, Strategy]",Colony On Mars
32131,Sacada,http://store.steampowered.com/app/733530/LOGis...,2018-01-04,http://steamcommunity.com/app/733530/reviews/?...,"[Single-player, Steam Achievements, Steam Clou...",4.99,0.0,733530.0,Sacada,"[Casual, Indie, Strategy]",LOGistICAL: South Africa
32132,Laush Studio,http://store.steampowered.com/app/610660/Russi...,2018-01-04,http://steamcommunity.com/app/610660/reviews/?...,"[Single-player, Steam Achievements, Steam Trad...",1.99,0.0,610660.0,Laush Dmitriy Sergeevich,"[Indie, Racing, Simulation]",Russian Roads
32133,SIXNAILS,http://store.steampowered.com/app/658870/EXIT_...,2017-09-02,http://steamcommunity.com/app/658870/reviews/?...,"[Single-player, Steam Achievements, Steam Cloud]",4.99,0.0,658870.0,"xropi,stev3ns","[Casual, Indie]",EXIT 2 - Directions


We explode the genre row, then we drop the url, reviews_urls and specs columns, since we are not going to use them

In [115]:
df_games = df_games.explode(["genre"]).reindex()
df_games = df_games.drop(['specs', 'url', 'reviews_url'], axis=1)

We look to the empty values

In [116]:
df_games.replace('', None, inplace=True)             #we replace the '' with None
df_games.info()

<class 'pandas.core.frame.DataFrame'>
Index: 85627 entries, 0 to 32134
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   publisher     59485 non-null  object 
 1   release_date  74929 non-null  object 
 2   price         81934 non-null  object 
 3   early_access  85627 non-null  float64
 4   id            85627 non-null  float64
 5   developer     71571 non-null  object 
 6   genre         85489 non-null  object 
 7   app_title     85625 non-null  object 
dtypes: float64(2), object(6)
memory usage: 5.9+ MB


We look first at app_name, two useless rows so we drop them

In [118]:
empty_rows_games = df_games[df_games["app_title"].isna()] 
empty_rows_games

Unnamed: 0,publisher,release_date,price,early_access,id,developer,genre,app_title
2580,,2014-08-26,,0.0,317160.0,,Action,
2580,,2014-08-26,,0.0,317160.0,,Indie,


In [119]:
df_games = df_games.dropna(subset= ['app_title'])

To the rest, we replace the NaN with "No Data"

In [120]:
df_games.fillna("No data", inplace=True)
df_games.info()

<class 'pandas.core.frame.DataFrame'>
Index: 85625 entries, 0 to 32134
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   publisher     85625 non-null  object 
 1   release_date  85625 non-null  object 
 2   price         85625 non-null  object 
 3   early_access  85625 non-null  float64
 4   id            85625 non-null  float64
 5   developer     85625 non-null  object 
 6   genre         85625 non-null  object 
 7   app_title     85625 non-null  object 
dtypes: float64(2), object(6)
memory usage: 5.9+ MB


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_games.fillna("No data", inplace=True)


Finally, we export the dataframe to a clean csv

In [121]:
df_games.to_csv('Datasets/steam_games_clean.csv', index=False, encoding='utf-8')
print('steam_games_clean.csv was saved')

steam_games_clean.csv was saved


# ETL for df_user_reviews

First, we unnest (explode) the data

In [122]:
df_user_reviews_dec = df_user_reviews.explode(['reviews']) #we decompound the reviews column
df_reviews_series = df_user_reviews_dec['reviews'].apply(pd.Series) #create a series from it
df_user_reviews = pd.concat([df_user_reviews_dec, df_reviews_series], axis=1) #we create the defitive dataframe
df_user_reviews.drop('reviews', axis=1, inplace = True) # and then, we delete the leftover column

Drop the empty rows and the empty column

In [123]:
df_user_reviews = df_user_reviews.dropna(how ="all").reset_index(drop=True)
df_user_reviews.drop(0, axis=1, inplace = True)       #drop the empty column

We look if there are duplicated columns

In [124]:
duplicated_rows_reviews = df_user_reviews[df_user_reviews.duplicated( keep=False)]
duplicated_rows_reviews

Unnamed: 0,user_id,user_url,funny,posted,last_edited,item_id,helpful,recommend,review
31,76561198156664158,http://steamcommunity.com/profiles/76561198156...,,Posted June 16.,,252950,0 of 1 people (0%) found this review helpful,True,love it
115,Rivtex,http://steamcommunity.com/id/Rivtex,,"Posted December 23, 2015.",,298630,0 of 1 people (0%) found this review helpful,True,Its so bosss
280,DieMadchenschanderin,http://steamcommunity.com/id/DieMadchenschanderin,,"Posted August 29, 2015.",,346110,0 of 1 people (0%) found this review helpful,True,"It's a good game, mechanics seem solid and the..."
281,DieMadchenschanderin,http://steamcommunity.com/id/DieMadchenschanderin,,"Posted October 5, 2015.","Last edited October 5, 2015.",380770,0 of 5 people (0%) found this review helpful,False,"Well, I don't have much to say about the game...."
282,DieMadchenschanderin,http://steamcommunity.com/id/DieMadchenschanderin,,Posted June 29.,,454890,No ratings yet,True,"It's a pretty good game, not bad for the 49 ce..."
...,...,...,...,...,...,...,...,...,...
44456,76561198092022514,http://steamcommunity.com/profiles/76561198092...,,Posted July 3.,,422400,No ratings yet,True,Muy entretenido y una coleccion de armas prome...
44457,76561198092022514,http://steamcommunity.com/profiles/76561198092...,,Posted June 1.,,218620,No ratings yet,True,"Tiene una jugabilidad y tematica muy buena :D,..."
44458,76561198092022514,http://steamcommunity.com/profiles/76561198092...,,"Posted August 17, 2014.",,261820,No ratings yet,True,"Buen juego, no importa el desarrrollo que tien..."
44459,76561198092022514,http://steamcommunity.com/profiles/76561198092...,,"Posted February 17, 2014.",,224260,No ratings yet,True,exelente aporte :D¡¡¡ es una buen mod basado e...


A lot of duplicated, we have to drop them

In [125]:
print("df_user_reviews's rows are", df_user_reviews.shape[0])

df_user_reviews's rows are 59333


In [126]:
df_user_reviews = df_user_reviews.drop_duplicates()
print("df_user_reviews's rows are", df_user_reviews.shape[0])

df_user_reviews's rows are 58459


We look for empty reviews

In [127]:
df_user_reviews.replace('', None, inplace=True)             #we replace the '' with None
df_user_reviews.head()

Unnamed: 0,user_id,user_url,funny,posted,last_edited,item_id,helpful,recommend,review
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...
1,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted July 15, 2011.",,22200,No ratings yet,True,It's unique and worth a playthrough.
2,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted April 21, 2011.",,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...
3,js41637,http://steamcommunity.com/id/js41637,,"Posted June 24, 2014.",,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...
4,js41637,http://steamcommunity.com/id/js41637,,"Posted September 8, 2013.",,227300,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...


In [128]:
empty_rows_reviews_funny = df_user_reviews[df_user_reviews["funny"].isna()] 
empty_rows_reviews_funny

Unnamed: 0,user_id,user_url,funny,posted,last_edited,item_id,helpful,recommend,review
0,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted November 5, 2011.",,1250,No ratings yet,True,Simple yet with great replayability. In my opi...
1,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted July 15, 2011.",,22200,No ratings yet,True,It's unique and worth a playthrough.
2,76561197970982479,http://steamcommunity.com/profiles/76561197970...,,"Posted April 21, 2011.",,43110,No ratings yet,True,Great atmosphere. The gunplay can be a bit chu...
3,js41637,http://steamcommunity.com/id/js41637,,"Posted June 24, 2014.",,251610,15 of 20 people (75%) found this review helpful,True,I know what you think when you see this title ...
4,js41637,http://steamcommunity.com/id/js41637,,"Posted September 8, 2013.",,227300,0 of 1 people (0%) found this review helpful,True,For a simple (it's actually not all that simpl...
...,...,...,...,...,...,...,...,...,...
59327,76561198312638244,http://steamcommunity.com/profiles/76561198312...,,Posted July 10.,,130,No ratings yet,True,if you liked Half life i would really recommen...
59328,76561198312638244,http://steamcommunity.com/profiles/76561198312...,,Posted July 10.,,70,No ratings yet,True,a must have classic from steam definitely wort...
59329,76561198312638244,http://steamcommunity.com/profiles/76561198312...,,Posted July 8.,,362890,No ratings yet,True,this game is a perfect remake of the original ...
59331,LydiaMorley,http://steamcommunity.com/id/LydiaMorley,,Posted July 20.,,730,No ratings yet,True,:D


In [129]:
empty_rows_reviews_review = df_user_reviews[df_user_reviews["review"].isna()] 
empty_rows_reviews_review

Unnamed: 0,user_id,user_url,funny,posted,last_edited,item_id,helpful,recommend,review
137,gdxsd,http://steamcommunity.com/id/gdxsd,,,,,,,
177,76561198094224872,http://steamcommunity.com/profiles/76561198094...,,,,,,,
2559,76561198021575394,http://steamcommunity.com/profiles/76561198021...,,,,,,,
3098,2ZESTY4ME,http://steamcommunity.com/id/2ZESTY4ME,,Posted March 11.,,550.0,No ratings yet,True,
4619,76561198093337643,http://steamcommunity.com/profiles/76561198093...,,"Posted September 19, 2014.",,550.0,No ratings yet,True,
10080,cmuir37,http://steamcommunity.com/id/cmuir37,,,,,,,
13767,Jaysteeny,http://steamcommunity.com/id/Jaysteeny,,,,,,,
15493,ML8989,http://steamcommunity.com/id/ML8989,,,,,,,
15981,terencemok,http://steamcommunity.com/id/terencemok,,"Posted December 30, 2014.",,218620.0,2 of 2 people (100%) found this review helpful,True,
19184,76561198079215291,http://steamcommunity.com/profiles/76561198079...,,,,,,,


There a lot of people who didnt do any review or nothing, so the data is useless. We can drop all this rows

In [130]:
df_user_reviews = df_user_reviews.dropna(subset=['funny'])

We save the clean data in a csv

In [131]:
archivo_limpio = 'Datasets/user_reviews_clean.csv'
df_user_reviews.to_csv(archivo_limpio, index=False, encoding='utf-8')
print('user_reviews_clean.csv was saved')

user_reviews_clean.csv was saved


# ETL for df_user_items

First, we unnest (explode) the data

In [132]:
#We make the same process that we do in the df_user_reviews dataframe but here we use json_normalize
#function, because with the other way the program got stucked forever
df_user_items_desc = df_user_items.explode(["items"])
df_user_items_desc2 = pd.json_normalize(df_user_items_desc['items']).set_index(df_user_items_desc['items'].index)
df_user_items= pd.concat([df_user_items_desc2, df_user_items_desc], axis=1)
df_user_items.drop('items', axis=1, inplace = True)

Drop the all empty rows

In [133]:
df_user_items = df_user_items.dropna(how ="all").reset_index(drop=True)

See duplicated rows

In [134]:
duplicated_rows_items = df_user_items[df_user_items.duplicated( keep=False)]
duplicated_rows_items

Unnamed: 0,item_id,item_name,playtime_forever,playtime_2weeks,user_id,items_count,steam_id,user_url
4346,4000,Garry's Mod,269.0,0.0,76561198156664158,59,76561198156664158,http://steamcommunity.com/profiles/76561198156...
4347,33910,Arma 2,162.0,0.0,76561198156664158,59,76561198156664158,http://steamcommunity.com/profiles/76561198156...
4348,33930,Arma 2: Operation Arrowhead,223.0,0.0,76561198156664158,59,76561198156664158,http://steamcommunity.com/profiles/76561198156...
4349,219540,Arma 2: Operation Arrowhead Beta (Obsolete),0.0,0.0,76561198156664158,59,76561198156664158,http://steamcommunity.com/profiles/76561198156...
4350,400,Portal,196.0,0.0,76561198156664158,59,76561198156664158,http://steamcommunity.com/profiles/76561198156...
...,...,...,...,...,...,...,...,...
4910940,261030,The Walking Dead: Season Two,253.0,0.0,76561198080057659,39,76561198080057659,http://steamcommunity.com/profiles/76561198080...
4910941,273110,Counter-Strike Nexon: Zombies,0.0,0.0,76561198080057659,39,76561198080057659,http://steamcommunity.com/profiles/76561198080...
4910942,730,Counter-Strike: Global Offensive,0.0,0.0,76561198080057659,39,76561198080057659,http://steamcommunity.com/profiles/76561198080...
4985582,,,,,farquadian,0,76561198086134170,http://steamcommunity.com/id/farquadian


Drop duplicated rows

In [135]:
print("df_user_items's rows are", df_user_items.shape[0])

df_user_items's rows are 5170015


In [136]:
df_user_items = df_user_items.drop_duplicates()
print("df_user_items's rows are", df_user_items.shape[0])

df_user_items's rows are 5110819


See the empty values

In [137]:
df_user_items.replace('', None, inplace=True)             #we replace the '' with None
df_user_items.head()

Unnamed: 0,item_id,item_name,playtime_forever,playtime_2weeks,user_id,items_count,steam_id,user_url
0,10,Counter-Strike,6.0,0.0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...
1,20,Team Fortress Classic,0.0,0.0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...
2,30,Day of Defeat,7.0,0.0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...
3,40,Deathmatch Classic,0.0,0.0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...
4,50,Half-Life: Opposing Force,0.0,0.0,76561197970982479,277,76561197970982479,http://steamcommunity.com/profiles/76561197970...


In [138]:
empty_rows_reviews_items = df_user_items[df_user_items["playtime_forever"].isna()] 
empty_rows_reviews_items

Unnamed: 0,item_id,item_name,playtime_forever,playtime_2weeks,user_id,items_count,steam_id,user_url
3733,,,,,Wackky,0,76561198039117046,http://steamcommunity.com/id/Wackky
3849,,,,,76561198079601835,0,76561198079601835,http://steamcommunity.com/profiles/76561198079...
6019,,,,,hellom8o,0,76561198117222320,http://steamcommunity.com/id/hellom8o
6523,,,,,starkillershadow553,0,76561198059648579,http://steamcommunity.com/id/starkillershadow553
7237,,,,,darkenkane,0,76561198058876001,http://steamcommunity.com/id/darkenkane
...,...,...,...,...,...,...,...,...
5169470,,,,,76561198316380182,0,76561198316380182,http://steamcommunity.com/profiles/76561198316...
5169471,,,,,76561198316970597,0,76561198316970597,http://steamcommunity.com/profiles/76561198316...
5169472,,,,,76561198318100691,0,76561198318100691,http://steamcommunity.com/profiles/76561198318...
5170006,,,,,XxLaughingJackClown77xX,0,76561198328759259,http://steamcommunity.com/id/XxLaughingJackClo...


In [139]:
empty_rows_reviews_items = df_user_items[df_user_items["item_id"].isna()] 
empty_rows_reviews_items

Unnamed: 0,item_id,item_name,playtime_forever,playtime_2weeks,user_id,items_count,steam_id,user_url
3733,,,,,Wackky,0,76561198039117046,http://steamcommunity.com/id/Wackky
3849,,,,,76561198079601835,0,76561198079601835,http://steamcommunity.com/profiles/76561198079...
6019,,,,,hellom8o,0,76561198117222320,http://steamcommunity.com/id/hellom8o
6523,,,,,starkillershadow553,0,76561198059648579,http://steamcommunity.com/id/starkillershadow553
7237,,,,,darkenkane,0,76561198058876001,http://steamcommunity.com/id/darkenkane
...,...,...,...,...,...,...,...,...
5169470,,,,,76561198316380182,0,76561198316380182,http://steamcommunity.com/profiles/76561198316...
5169471,,,,,76561198316970597,0,76561198316970597,http://steamcommunity.com/profiles/76561198316...
5169472,,,,,76561198318100691,0,76561198318100691,http://steamcommunity.com/profiles/76561198318...
5170006,,,,,XxLaughingJackClown77xX,0,76561198328759259,http://steamcommunity.com/id/XxLaughingJackClo...


Same case with the former dataframe, a lot of useless data with NaN values, so we have to drop them

In [140]:
df_user_items = df_user_items.dropna(subset = ['playtime_forever'])

In [141]:
df_user_items.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5094105 entries, 0 to 5170013
Data columns (total 8 columns):
 #   Column            Dtype  
---  ------            -----  
 0   item_id           object 
 1   item_name         object 
 2   playtime_forever  float64
 3   playtime_2weeks   float64
 4   user_id           object 
 5   items_count       int64  
 6   steam_id          object 
 7   user_url          object 
dtypes: float64(2), int64(1), object(5)
memory usage: 349.8+ MB


In [142]:
df_user_items.to_csv('Datasets/user_items_clean.csv', index=False, encoding='utf-8')
print('user_items_clean.csv was saved')

user_items_clean.csv was saved
