In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, when, round as spark_round

# Create (or reuse) the Spark session for this lab, running in local mode.
spark = (
    SparkSession.builder
    .appName("Lab3-Spark")
    .master("local[*]")
    .getOrCreate()
)

# Reader settings for the Steam reviews dump:
#   - the first row is the header, and no schema inference is done, so every
#     column arrives as a string;
#   - a quoted field may span several physical lines (multiLine);
#   - the quote character doubles as its own escape character;
#   - PERMISSIVE mode keeps malformed records instead of failing the read.
reader_options = {
    "header": True,
    "inferSchema": False,
    "multiLine": True,
    "escape": '"',
    "quote": '"',
    "encoding": 'utf-8',
    "mode": 'PERMISSIVE',
}

df = spark.read.csv("../../data/steam_reviews.csv", **reader_options)

# Narrow the frame to the columns this analysis needs and add an integer view
# of the (string) helpful-vote counter; try_cast yields NULL for values that
# cannot be parsed instead of raising.
votes_as_int = col("votes_helpful").try_cast("integer")
df = (
    df
    .select("app_id", "app_name", "review_id", "votes_helpful")
    .withColumn("votes_helpful_int", votes_as_int)
)

# when() without otherwise() is NULL whenever the condition does not hold, and
# count() skips NULLs, so counting this marker counts only reviews that have
# at least 3 helpful votes.
helpful_marker = when(col("votes_helpful_int") >= 3, True)
result = (
    df
    .groupBy("app_id", "app_name")
    .agg(
        count("*").alias("total_reviews"),
        count(helpful_marker).alias("helpful_reviews_count"),
    )
)

# Share of helpful reviews per game, as a percentage rounded to 2 decimals.
helpful_ratio = col("helpful_reviews_count") / col("total_reviews")
result = result.withColumn(
    "helpful_percentage",
    spark_round(helpful_ratio * 100, 2),
)

# Final report: games ordered by their number of helpful reviews, largest first.
report_columns = [
    "app_id",
    "app_name",
    "helpful_reviews_count",
    "helpful_percentage",
]
final_result = (
    result
    .select(*report_columns)
    .orderBy(col("helpful_reviews_count").desc())
)

# Print the top 20 rows without truncating long game names.
# show() is the action that actually executes the lazily-built aggregation, so
# a failure in the Spark job is raised here. The finally clause guarantees the
# session is shut down in that case too, instead of being left running; the
# original exception still propagates unchanged.
try:
    final_result.show(20, truncate=False)
finally:
    spark.stop()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/12/14 23:58:23 WARN Utils: Your hostname, homelab, resolves to a loopback address: 127.0.1.1; using 192.168.1.111 instead (on interface wlp3s0)
25/12/14 23:58:23 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/14 23:58:35 WARN CSVHeaderChecker: CSV header does not conform to the schema.
 Header: , app_id, app_name, review_id, language, review, timestamp_created, timestamp_updated, recommended, votes_helpful, votes_funny, weighted_vote_score, comment_count, steam_purchase, received_for_free, written_during_early_access, author.steamid, author.num_games_owned, author.num_reviews, author.playtime_forever, author.playtime_last_two_weeks, author.playtime_at_review, author.l

+------+------------------------------+---------------------+------------------+
|app_id|app_name                      |helpful_reviews_count|helpful_percentage|
+------+------------------------------+---------------------+------------------+
|578080|PLAYERUNKNOWN'S BATTLEGROUNDS |75241                |4.58              |
|271590|Grand Theft Auto V            |55543                |5.45              |
|275850|No Man's Sky                  |42819                |23.52             |
|218620|PAYDAY 2                      |39084                |8.01              |
|377160|Fallout 4                     |37719                |16.47             |
|346110|ARK: Survival Evolved         |37594                |9.4               |
|359550|Tom Clancy's Rainbow Six Siege|35486                |4.21              |
|252490|Rust                          |28336                |5.16              |
|381210|Dead by Daylight              |27428                |6.55              |
|292030|The Witcher 3: Wild 