This code was originally written in Scala in Apache Zeppelin; it is shown here only for viewing on GitHub and is not meant to be run directly as a notebook.

In [None]:
import org.apache.spark.sql.expressions._
import org.apache.spark.sql.functions._

// Share of positive reviews among games whose final price is at least 5000
// (units of `mat_final_price`; presumably cents -- TODO confirm) and whose
// release year is 2010 or later.

// Both CSV files have a header row; column types are inferred by Spark.
val csvOptions = Map("header" -> "true", "inferSchema" -> "true")

val reviews = spark.read.options(csvOptions).csv("/user/maria_dev/steam/reviews.csv")

val apps = spark.read.options(csvOptions).csv("/user/maria_dev/steam/applications.csv")

// One row per review, enriched with the columns of the reviewed application.
val df = apps.join(reviews, Seq("appid"), "inner")

// Typed helper columns; values that cannot be cast or parsed become null.
val releaseYear = year(to_date(col("release_date"), "yyyy-MM-dd"))
val df_clean = df
  .withColumn("final_price", col("mat_final_price").cast("int"))
  .withColumn("release_year", releaseYear)
  .withColumn("voted_up_real", col("voted_up").cast("boolean"))

val isGame = col("type") === "game"
val meetsPriceFloor = col("final_price") >= 5000
val releasedSince2010 = col("release_year") >= 2010
val df_filter = df_clean.filter(isGame && meetsPriceFloor && releasedSince2010)

// Reviews whose vote could not be read as a boolean are left out of the ratio.
val df_not_null = df_filter.filter(col("voted_up_real").isNotNull)

// Positive votes counted as 1, everything else as 0, divided by the row count.
val positiveVotes = sum(when(col("voted_up_real") === true, 1).otherwise(0))
val df_answer = df_not_null.agg((positiveVotes / count("*")).as("positive_review_ratio"))

df_answer.select("positive_review_ratio").show()
// Raw counts per vote value, printed alongside the ratio.
df_not_null.groupBy("voted_up_real").count().show()

In [None]:
import org.apache.spark.sql.expressions._
import org.apache.spark.sql.functions._

// Average review helpfulness per platform and achievement-count bucket.

// Reads a CSV file with a header row, letting Spark infer the column types.
def readCsv(path: String) = spark.read.options(Map("header" -> "true", "inferSchema" -> "true")).csv(path)

val reviews = readCsv("/user/maria_dev/steam/reviews.csv")

val apps = readCsv("/user/maria_dev/steam/applications.csv")

val plat = readCsv("/user/maria_dev/steam/platforms.csv")

val plat_help = readCsv("/user/maria_dev/steam/application_platforms.csv")

// Rename the generic `id` / `name` columns before they are joined with other tables.
val plat_clean = plat
  .withColumnRenamed("id", "platform_id")
  .withColumnRenamed("name", "platform_name")

// Link table joined with the platform table on `platform_id`.
val df_plat = plat_clean.join(plat_help, Seq("platform_id"))

val df = apps.join(reviews, Seq("appid"), "inner")

// Each review row is repeated once per matching row of `df_plat` for its app.
val df_with_plat = df.join(df_plat, Seq("appid"))

val df_clean = df_with_plat
  .withColumn("helpfulness_score", col("weighted_vote_score").cast("double"))
  .withColumn("steam_achievements", col("mat_achievement_count").cast("int"))

// Keep rows with a known achievement count and a score of at most 1
// (rows with a null score are dropped by the comparison as well).
val df_not_null = df_clean
  .filter(col("steam_achievements").isNotNull)
  .filter(col("helpfulness_score") <= 1)

// NOTE(review): the first bucket also receives counts of 0 or less although its
// label reads "1-15", and the last label reads "45+" while the condition is
// "> 45" -- confirm the intended ranges.
val achievements = col("steam_achievements")
val achievementBucket = when(achievements <= 15, "low (1-15)")
  .when(achievements <= 45, "medium (16-45)")
  .when(achievements > 45, "high (45+)")
val achievement_groups = df_not_null.withColumn("ach_group", achievementBucket)

val ach_plat_score = achievement_groups
  .groupBy("platform_name", "ach_group")
  .agg(avg("helpfulness_score").as("avg_helpfulness"))
  .orderBy("platform_name", "avg_helpfulness")

ach_plat_score.show(false)

In [None]:
import org.apache.spark.sql.expressions._
import org.apache.spark.sql.functions._

// Compares reviews of copies received for free with all other reviews:
// number of reviews, number of positive votes, average playtime at review time
// and the resulting positive ratio. Also ranks individual apps by the number
// of free-copy reviews.

val csvOptions = Map("header" -> "true", "inferSchema" -> "true")

val reviews = spark.read.options(csvOptions).csv("/user/maria_dev/steam/reviews.csv")

val apps = spark.read.options(csvOptions).csv("/user/maria_dev/steam/applications.csv")

val df = apps.join(reviews, Seq("appid"), "inner")

// Column expressions shared by the free and not-free pipelines.
val positiveCount = sum(when(col("voted_up_real") === true, 1).otherwise(0)).as("number_positive")
val meanPlaytime = avg("playtime_at_review").as("avg_playtime (minutes)")
val hasVoteAndPlaytime = col("voted_up_real").isNotNull && col("playtime_at_review").isNotNull

// Adds the typed vote and playtime columns; values that cannot be cast become null.
def withReviewColumns(frame: org.apache.spark.sql.DataFrame): org.apache.spark.sql.DataFrame =
  frame
    .withColumn("voted_up_real", col("voted_up").cast("boolean"))
    .withColumn("playtime_at_review", col("author_playtime_at_review").cast("int"))

// --- Reviews of copies received for free ---
val free = df.filter(col("received_for_free") === "True")

val free_clean = withReviewColumns(free)

val free_clean_no_null = free_clean.filter(hasVoteAndPlaytime)

// Per-app statistics over the free-copy reviews.
val games = free_clean_no_null
  .groupBy("appid", "name")
  .agg(count("*").as("total_free_reviews"), positiveCount, meanPlaytime)

val ratio = games
  .withColumn("positive_ratio", col("number_positive") / col("total_free_reviews"))
  .orderBy(desc("total_free_reviews"))

// The same statistics over all free-copy reviews at once.
val overall_free = free_clean_no_null.agg(count("*").as("total_free_reviews"), positiveCount, meanPlaytime)

val overall_free_ratio = overall_free
  .withColumn("overall_positive_ratio", col("number_positive") / col("total_free_reviews"))

// --- Reviews of copies not received for free ---
val not_free = df.filter(col("received_for_free") === "False")

val not_free_clean = withReviewColumns(not_free)

val not_free_clean_no_null = not_free_clean.filter(hasVoteAndPlaytime)

val not_overall_free = not_free_clean_no_null.agg(count("*").as("total_not_free_reviews"), positiveCount, meanPlaytime)

val not_overall_free_ratio = not_overall_free
  .withColumn("overall_positive_ratio", col("number_positive") / col("total_not_free_reviews"))

not_overall_free_ratio.show(false)

overall_free_ratio.show(false)

// Five apps with the most free-copy reviews.
ratio.show(5, false)

In [None]:
import org.apache.spark.sql.expressions._
import org.apache.spark.sql.functions._

// Average "up" and "funny" votes per review, broken down by the reviewer's
// playtime bucket and the release-year bucket of the game.

val reviews = spark.read.format("csv").option("header", "true").option("inferSchema", "true").load("/user/maria_dev/steam/reviews.csv")

val apps = spark.read.format("csv").option("header", "true").option("inferSchema", "true").load("/user/maria_dev/steam/applications.csv")

// Only applications of type "game" that have a release date.
val apps_clean = apps.filter(col("type") === "game").filter(col("release_date").isNotNull)

val df = apps_clean.join(reviews, Seq("appid"), "inner")

// Typed helper columns; values that cannot be cast or parsed become null.
val df_clean = df
  .withColumn("playtime", col("author_playtime_at_review").cast("int"))
  .withColumn("votes_up_real", col("votes_up").cast("int"))
  .withColumn("votes_funny_real", col("votes_funny").cast("int"))
  .withColumn("release_year", year(to_date(col("release_date"), "yyyy-MM-dd")))

// Drop rows with missing values and rows with 10000 or more votes
// (presumably outliers or corrupt rows -- TODO confirm).
val filtered = df_clean
  .filter(col("playtime").isNotNull)
  .filter(col("votes_up_real").isNotNull)
  .filter(col("votes_funny_real").isNotNull)
  .filter(col("votes_up_real") < 10000)
  .filter(col("votes_funny_real") < 10000)

// Bucket labels in display order. They must match the labels used in the
// `when` chains below; they drive the ordering of the final table.
val playtimeBuckets = Seq("0-1 hours", "1-5 hours", "5-10 hours", "10-20 hours", "over 20 hours")
val yearBuckets = Seq("Pre 2005", "2005-2015", "2016 and after")

// Playtime thresholds are 60, 300, 600 and 1200; the labels imply minutes.
val playtime_groups = filtered.withColumn(
  "playtime_group",
  when(col("playtime") < 60, "0-1 hours")
    .when(col("playtime") < 300, "1-5 hours")
    .when(col("playtime") < 600, "5-10 hours")
    .when(col("playtime") < 1200, "10-20 hours")
    .when(col("playtime") >= 1200, "over 20 hours"))

// A null release year (unparseable date) matches no branch and yields a null group.
val year_groups = playtime_groups.withColumn(
  "year_group",
  when(col("release_year") < 2005, "Pre 2005")
    .when(col("release_year") <= 2015, "2005-2015")
    .when(col("release_year") > 2015, "2016 and after"))

// Position of a bucket label within `labels`; null when the label is null or
// not listed. Sorting on the label strings themselves would be lexicographic
// ("10-20 hours" before "5-10 hours", "Pre 2005" after "2016 and after").
def bucketRank(columnName: String, labels: Seq[String]) =
  coalesce(labels.zipWithIndex.map { case (label, rank) => when(col(columnName) === label, rank) }: _*)

// Same columns and rows as before; only the row order follows the natural
// bucket order (year buckets first, then playtime buckets).
val df_agg = year_groups
  .groupBy("playtime_group", "year_group")
  .agg(
    round(avg("votes_up_real"), 2).as("avg_votes_up"),
    round(avg("votes_funny_real"), 2).as("avg_funny_votes"),
    count("*").as("num_reviews"))
  .orderBy(bucketRank("year_group", yearBuckets), bucketRank("playtime_group", playtimeBuckets))

df_agg.show(80, false)