# Part 0: Raw Data & Environment Set-up

## Raw Data

The following code loads in and unzips the raw data. Raw data has been downloaded from Kaggle placed into public Google Cloud Storage for ease of access.

In [None]:
# Loads in steam-reviews.zip (game reviews)
url1 = ("https://storage.googleapis.com/dsc232r-group-project-data/steam-reviews.zip")
!wget "{url1}"

In [None]:
# Extracts steam-reviews.zip into specified directory and deletes .zip file
!unzip steam-reviews.zip -d /home/joneel/joneel/Group_Project/raw_data/steam-reviews && rm steam-reviews.zip

In [None]:
# Loads in games.csv (games metadata)
url2 = ("https://storage.googleapis.com/dsc232r-group-project-data/games.csv")
!wget "{url2}"

In [None]:
# Moves games.csv into specified directory
!mv games.csv /home/joneel/joneel/Group_Project/raw_data

## Environment Set-up

Set-up on the cluster included 30 cores with 60GB memory in order to load and process this dataset (total ~45GB).

In [1]:
# Import required libraries
import os, pickle, glob
from pyspark.sql import SparkSession
from pyspark.sql import functions as f
from pyspark import StorageLevel

In [2]:
#sc.stop() # To stop a currently running SparkSession, if needed for troubleshooting/development

In [3]:
# Establishes Spark Session
sc = SparkSession.builder \
    .config("spark.driver.memory", "2g") \
	.config("spark.executor.memory", "2g") \
    .config('spark.executor.instances', 29) \
	.appName("Review_Analysis") \
	.getOrCreate()

In [4]:
# Loads all_reviews.csv file into a spark dataframe
reviews_df = sc.read.csv("/home/joneel/joneel/Group_Project/raw_data/steam-reviews/all_reviews/all_reviews.csv", header=True, inferSchema=True)

In [5]:
# Displays reviews_df schema and counts total # of reviews
reviews_df.printSchema()
print(f"Number of reviews: {reviews_df.count()}")

root
 |-- recommendationid: string (nullable = true)
 |-- appid: string (nullable = true)
 |-- game: string (nullable = true)
 |-- author_steamid: string (nullable = true)
 |-- author_num_games_owned: string (nullable = true)
 |-- author_num_reviews: string (nullable = true)
 |-- author_playtime_forever: string (nullable = true)
 |-- author_playtime_last_two_weeks: string (nullable = true)
 |-- author_playtime_at_review: string (nullable = true)
 |-- author_last_played: string (nullable = true)
 |-- language: string (nullable = true)
 |-- review: string (nullable = true)
 |-- timestamp_created: string (nullable = true)
 |-- timestamp_updated: string (nullable = true)
 |-- voted_up: string (nullable = true)
 |-- votes_up: string (nullable = true)
 |-- votes_funny: string (nullable = true)
 |-- weighted_vote_score: string (nullable = true)
 |-- comment_count: string (nullable = true)
 |-- steam_purchase: string (nullable = true)
 |-- received_for_free: string (nullable = true)
 |-- writt

In [6]:
# Removes two columns related to Chinese gaming market (not relevant for this analysis)
reviews_df = reviews_df.drop("hidden_in_steam_china", "steam_china_location")
reviews_df.printSchema()

root
 |-- recommendationid: string (nullable = true)
 |-- appid: string (nullable = true)
 |-- game: string (nullable = true)
 |-- author_steamid: string (nullable = true)
 |-- author_num_games_owned: string (nullable = true)
 |-- author_num_reviews: string (nullable = true)
 |-- author_playtime_forever: string (nullable = true)
 |-- author_playtime_last_two_weeks: string (nullable = true)
 |-- author_playtime_at_review: string (nullable = true)
 |-- author_last_played: string (nullable = true)
 |-- language: string (nullable = true)
 |-- review: string (nullable = true)
 |-- timestamp_created: string (nullable = true)
 |-- timestamp_updated: string (nullable = true)
 |-- voted_up: string (nullable = true)
 |-- votes_up: string (nullable = true)
 |-- votes_funny: string (nullable = true)
 |-- weighted_vote_score: string (nullable = true)
 |-- comment_count: string (nullable = true)
 |-- steam_purchase: string (nullable = true)
 |-- received_for_free: string (nullable = true)
 |-- writt

In [7]:
# Filters dataframe to include only reviews in English & counts new number of reviews
reviews_df_processed = reviews_df.filter(reviews_df.language == 'english')
reviews_df_processed.select("language").distinct().show()
print(f"Number of reviews: {reviews_df_processed.count()}")

+--------+
|language|
+--------+
| english|
+--------+

Number of reviews: 51544179


In [8]:
# Drops rows that contain null values in the id or reviews column & drop duplicate recommendation ids
reviews_df_processed = reviews_df_processed.na.drop(subset=["recommendationid", "appid", "author_steamid", "review"])
print(f"Number of reviews: {reviews_df_processed.count()}")
reviews_df_processed = reviews_df_processed.dropDuplicates(subset=["recommendationid"])
print(f"Number of reviews: {reviews_df_processed.count()}")

Number of reviews: 51544179
Number of reviews: 51351970


In [18]:
reviews_df_processed = reviews_df_processed.withColumn("author_num_games_owned", f.col("author_num_games_owned").cast("integer"))
reviews_df_processed = reviews_df_processed.withColumn("author_num_reviews", f.col("author_num_reviews").cast("integer"))
reviews_df_processed = reviews_df_processed.withColumn("author_playtime_forever", f.col("author_playtime_forever").cast("integer"))
reviews_df_processed = reviews_df_processed.withColumn("author_playtime_last_two_weeks", f.col("author_playtime_last_two_weeks").cast("integer"))
reviews_df_processed = reviews_df_processed.withColumn("author_playtime_at_review", f.col("author_playtime_at_review").cast("integer"))
reviews_df_processed.printSchema()

root
 |-- recommendationid: string (nullable = true)
 |-- appid: string (nullable = true)
 |-- game: string (nullable = true)
 |-- author_steamid: string (nullable = true)
 |-- author_num_games_owned: integer (nullable = true)
 |-- author_num_reviews: integer (nullable = true)
 |-- author_playtime_forever: integer (nullable = true)
 |-- author_playtime_last_two_weeks: integer (nullable = true)
 |-- author_playtime_at_review: integer (nullable = true)
 |-- author_last_played: string (nullable = true)
 |-- language: string (nullable = true)
 |-- review: string (nullable = true)
 |-- timestamp_created: string (nullable = true)
 |-- timestamp_updated: string (nullable = true)
 |-- voted_up: string (nullable = true)
 |-- votes_up: string (nullable = true)
 |-- votes_funny: string (nullable = true)
 |-- weighted_vote_score: string (nullable = true)
 |-- comment_count: string (nullable = true)
 |-- steam_purchase: string (nullable = true)
 |-- received_for_free: string (nullable = true)
 |-- 

In [21]:
reviews_df_processed = reviews_df_processed.withColumn("author_last_played", f.from_unixtime(f.col("author_last_played")).cast("timestamp"))
reviews_df_processed.select("author_last_played").show(5)
reviews_df_processed.printSchema()

AnalysisException: [DATATYPE_MISMATCH.UNEXPECTED_INPUT_TYPE] Cannot resolve "from_unixtime(author_last_played, yyyy-MM-dd HH:mm:ss)" due to data type mismatch: Parameter 1 requires the "BIGINT" type, however "author_last_played" has the type "TIMESTAMP".;
'Project [recommendationid#17, appid#18, game#19, author_steamid#20, author_num_games_owned#781, author_num_reviews#804, author_playtime_forever#827, author_playtime_last_two_weeks#850, author_playtime_at_review#873, cast(from_unixtime(author_last_played#958, yyyy-MM-dd HH:mm:ss, Some(Etc/UTC)) as timestamp) AS author_last_played#981, language#27, review#28, timestamp_created#29, timestamp_updated#30, voted_up#31, votes_up#32, votes_funny#33, weighted_vote_score#34, comment_count#35, steam_purchase#36, received_for_free#37, written_during_early_access#38]
+- Project [recommendationid#17, appid#18, game#19, author_steamid#20, author_num_games_owned#781, author_num_reviews#804, author_playtime_forever#827, author_playtime_last_two_weeks#850, author_playtime_at_review#873, cast(from_unixtime(cast(author_last_played#26 as bigint), yyyy-MM-dd HH:mm:ss, Some(Etc/UTC)) as timestamp) AS author_last_played#958, language#27, review#28, timestamp_created#29, timestamp_updated#30, voted_up#31, votes_up#32, votes_funny#33, weighted_vote_score#34, comment_count#35, steam_purchase#36, received_for_free#37, written_during_early_access#38]
   +- Project [recommendationid#17, appid#18, game#19, author_steamid#20, author_num_games_owned#781, author_num_reviews#804, author_playtime_forever#827, author_playtime_last_two_weeks#850, cast(author_playtime_at_review#25 as int) AS author_playtime_at_review#873, author_last_played#26, language#27, review#28, timestamp_created#29, timestamp_updated#30, voted_up#31, votes_up#32, votes_funny#33, weighted_vote_score#34, comment_count#35, steam_purchase#36, received_for_free#37, written_during_early_access#38]
      +- Project [recommendationid#17, appid#18, game#19, author_steamid#20, author_num_games_owned#781, author_num_reviews#804, author_playtime_forever#827, cast(author_playtime_last_two_weeks#696 as int) AS author_playtime_last_two_weeks#850, author_playtime_at_review#25, author_last_played#26, language#27, review#28, timestamp_created#29, timestamp_updated#30, voted_up#31, votes_up#32, votes_funny#33, weighted_vote_score#34, comment_count#35, steam_purchase#36, received_for_free#37, written_during_early_access#38]
         +- Project [recommendationid#17, appid#18, game#19, author_steamid#20, author_num_games_owned#781, author_num_reviews#804, cast(author_playtime_forever#673 as int) AS author_playtime_forever#827, author_playtime_last_two_weeks#696, author_playtime_at_review#25, author_last_played#26, language#27, review#28, timestamp_created#29, timestamp_updated#30, voted_up#31, votes_up#32, votes_funny#33, weighted_vote_score#34, comment_count#35, steam_purchase#36, received_for_free#37, written_during_early_access#38]
            +- Project [recommendationid#17, appid#18, game#19, author_steamid#20, author_num_games_owned#781, cast(author_num_reviews#650 as int) AS author_num_reviews#804, author_playtime_forever#673, author_playtime_last_two_weeks#696, author_playtime_at_review#25, author_last_played#26, language#27, review#28, timestamp_created#29, timestamp_updated#30, voted_up#31, votes_up#32, votes_funny#33, weighted_vote_score#34, comment_count#35, steam_purchase#36, received_for_free#37, written_during_early_access#38]
               +- Project [recommendationid#17, appid#18, game#19, author_steamid#20, cast(author_num_games_owned#627 as int) AS author_num_games_owned#781, author_num_reviews#650, author_playtime_forever#673, author_playtime_last_two_weeks#696, author_playtime_at_review#25, author_last_played#26, language#27, review#28, timestamp_created#29, timestamp_updated#30, voted_up#31, votes_up#32, votes_funny#33, weighted_vote_score#34, comment_count#35, steam_purchase#36, received_for_free#37, written_during_early_access#38]
                  +- Project [recommendationid#17, appid#18, game#19, author_steamid#20, author_num_games_owned#627, author_num_reviews#650, author_playtime_forever#673, cast(author_playtime_last_two_weeks#24 as int) AS author_playtime_last_two_weeks#696, author_playtime_at_review#25, author_last_played#26, language#27, review#28, timestamp_created#29, timestamp_updated#30, voted_up#31, votes_up#32, votes_funny#33, weighted_vote_score#34, comment_count#35, steam_purchase#36, received_for_free#37, written_during_early_access#38]
                     +- Project [recommendationid#17, appid#18, game#19, author_steamid#20, author_num_games_owned#627, author_num_reviews#650, cast(author_playtime_forever#542 as int) AS author_playtime_forever#673, author_playtime_last_two_weeks#24, author_playtime_at_review#25, author_last_played#26, language#27, review#28, timestamp_created#29, timestamp_updated#30, voted_up#31, votes_up#32, votes_funny#33, weighted_vote_score#34, comment_count#35, steam_purchase#36, received_for_free#37, written_during_early_access#38]
                        +- Project [recommendationid#17, appid#18, game#19, author_steamid#20, author_num_games_owned#627, cast(author_num_reviews#519 as int) AS author_num_reviews#650, author_playtime_forever#542, author_playtime_last_two_weeks#24, author_playtime_at_review#25, author_last_played#26, language#27, review#28, timestamp_created#29, timestamp_updated#30, voted_up#31, votes_up#32, votes_funny#33, weighted_vote_score#34, comment_count#35, steam_purchase#36, received_for_free#37, written_during_early_access#38]
                           +- Project [recommendationid#17, appid#18, game#19, author_steamid#20, cast(author_num_games_owned#496 as int) AS author_num_games_owned#627, author_num_reviews#519, author_playtime_forever#542, author_playtime_last_two_weeks#24, author_playtime_at_review#25, author_last_played#26, language#27, review#28, timestamp_created#29, timestamp_updated#30, voted_up#31, votes_up#32, votes_funny#33, weighted_vote_score#34, comment_count#35, steam_purchase#36, received_for_free#37, written_during_early_access#38]
                              +- Project [recommendationid#17, appid#18, game#19, author_steamid#20, author_num_games_owned#496, author_num_reviews#519, cast(author_playtime_forever#23 as int) AS author_playtime_forever#542, author_playtime_last_two_weeks#24, author_playtime_at_review#25, author_last_played#26, language#27, review#28, timestamp_created#29, timestamp_updated#30, voted_up#31, votes_up#32, votes_funny#33, weighted_vote_score#34, comment_count#35, steam_purchase#36, received_for_free#37, written_during_early_access#38]
                                 +- Project [recommendationid#17, appid#18, game#19, author_steamid#20, author_num_games_owned#496, cast(author_num_reviews#411 as int) AS author_num_reviews#519, author_playtime_forever#23, author_playtime_last_two_weeks#24, author_playtime_at_review#25, author_last_played#26, language#27, review#28, timestamp_created#29, timestamp_updated#30, voted_up#31, votes_up#32, votes_funny#33, weighted_vote_score#34, comment_count#35, steam_purchase#36, received_for_free#37, written_during_early_access#38]
                                    +- Project [recommendationid#17, appid#18, game#19, author_steamid#20, cast(author_num_games_owned#388 as int) AS author_num_games_owned#496, author_num_reviews#411, author_playtime_forever#23, author_playtime_last_two_weeks#24, author_playtime_at_review#25, author_last_played#26, language#27, review#28, timestamp_created#29, timestamp_updated#30, voted_up#31, votes_up#32, votes_funny#33, weighted_vote_score#34, comment_count#35, steam_purchase#36, received_for_free#37, written_during_early_access#38]
                                       +- Project [recommendationid#17, appid#18, game#19, author_steamid#20, author_num_games_owned#388, cast(author_num_reviews#365 as int) AS author_num_reviews#411, author_playtime_forever#23, author_playtime_last_two_weeks#24, author_playtime_at_review#25, author_last_played#26, language#27, review#28, timestamp_created#29, timestamp_updated#30, voted_up#31, votes_up#32, votes_funny#33, weighted_vote_score#34, comment_count#35, steam_purchase#36, received_for_free#37, written_during_early_access#38]
                                          +- Project [recommendationid#17, appid#18, game#19, author_steamid#20, cast(author_num_games_owned#21 as int) AS author_num_games_owned#388, author_num_reviews#365, author_playtime_forever#23, author_playtime_last_two_weeks#24, author_playtime_at_review#25, author_last_played#26, language#27, review#28, timestamp_created#29, timestamp_updated#30, voted_up#31, votes_up#32, votes_funny#33, weighted_vote_score#34, comment_count#35, steam_purchase#36, received_for_free#37, written_during_early_access#38]
                                             +- Project [recommendationid#17, appid#18, game#19, author_steamid#20, author_num_games_owned#21, cast(author_num_reviews#342 as int) AS author_num_reviews#365, author_playtime_forever#23, author_playtime_last_two_weeks#24, author_playtime_at_review#25, author_last_played#26, language#27, review#28, timestamp_created#29, timestamp_updated#30, voted_up#31, votes_up#32, votes_funny#33, weighted_vote_score#34, comment_count#35, steam_purchase#36, received_for_free#37, written_during_early_access#38]
                                                +- Project [recommendationid#17, appid#18, game#19, author_steamid#20, author_num_games_owned#21, cast(author_num_reviews#22 as int) AS author_num_reviews#342, author_playtime_forever#23, author_playtime_last_two_weeks#24, author_playtime_at_review#25, author_last_played#26, language#27, review#28, timestamp_created#29, timestamp_updated#30, voted_up#31, votes_up#32, votes_funny#33, weighted_vote_score#34, comment_count#35, steam_purchase#36, received_for_free#37, written_during_early_access#38]
                                                   +- Deduplicate [recommendationid#17]
                                                      +- Filter atleastnnonnulls(4, recommendationid#17, appid#18, author_steamid#20, review#28)
                                                         +- Filter (language#27 = english)
                                                            +- Project [recommendationid#17, appid#18, game#19, author_steamid#20, author_num_games_owned#21, author_num_reviews#22, author_playtime_forever#23, author_playtime_last_two_weeks#24, author_playtime_at_review#25, author_last_played#26, language#27, review#28, timestamp_created#29, timestamp_updated#30, voted_up#31, votes_up#32, votes_funny#33, weighted_vote_score#34, comment_count#35, steam_purchase#36, received_for_free#37, written_during_early_access#38]
                                                               +- Relation [recommendationid#17,appid#18,game#19,author_steamid#20,author_num_games_owned#21,author_num_reviews#22,author_playtime_forever#23,author_playtime_last_two_weeks#24,author_playtime_at_review#25,author_last_played#26,language#27,review#28,timestamp_created#29,timestamp_updated#30,voted_up#31,votes_up#32,votes_funny#33,weighted_vote_score#34,comment_count#35,steam_purchase#36,received_for_free#37,written_during_early_access#38,hidden_in_steam_china#39,steam_china_location#40] csv


In [None]:
# Loads games.csv file into a spark dataframe
games_df = sc.read.csv("/home/joneel/joneel/Group_Project/raw_data/games.csv", header=True, inferSchema=True)

In [None]:
# Displays games_df schema and counts total # of games
games_df.printSchema()
print(f"Number of games: {games_df.count()}")

In [None]:
# Removes columns not relevant for this analysis
games_df_processed = games_df.drop("reviews", "header_image", "website", "support_url", "support_email", "full_audio_languages", "screenshots", "movies")
games_df_processed.printSchema()

In [None]:
# Drops rows that contain null values or duplicates in the appid column
print(f"Number of games: {games_df_processed.count()}")
games_df_processed = games_df_processed.na.drop(subset=["appid"])
print(f"Number of games: {games_df_processed.count()}")
games_df_processed = games_df_processed.dropDuplicates(subset=["appid"])
print(f"Number of games: {games_df_processed.count()}")

In [None]:
# Save processed dataframes as parquet files
reviews_df_processed.write.mode("overwrite").parquet("home/joneel/joneel/Group_Project/reviews_processed")
# games_df_processed.write.mode("overwrite").parquet("home/joneel/joneel/Group_Project/games_processed")

In [None]:
"Tokenize reviews, see # of reviews per author, break down tags and genres"
"Make some plots to see data distributions (# of reviews, reviews per author, dates, best and worst reviewed games?)"