In [1]:
from pyspark.sql import SparkSession

# New API
spark_session = SparkSession.builder\
        .master("spark://192.168.2.133:7077") \
        .appName("PART_AB_DanielCeoca")\
        .config("spark.dynamicAllocation.enabled", True)\
        .config("spark.dynamicAllocation.shuffleTracking.enabled",True)\
        .config("spark.shuffle.service.enabled", False)\
        .config("spark.dynamicAllocation.executorIdleTimeout","30s")\
        .config("spark.executor.cores", 4)\
        .config("spark.driver.port",9999)\
        .config("spark.blockManager.port",10005)\
        .getOrCreate()

# Old API (RDD)
spark_context = spark_session.sparkContext

spark_context.setLogLevel("WARN")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/03/13 19:18:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
transcripts = spark_context.textFile('hdfs://192.168.2.133:9000/yasp-chunk-small-aa.json')

In [3]:
df = spark_session.read.json('hdfs://192.168.2.133:9000/yasp-chunk-small-aa.json')

24/03/13 19:18:53 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


In [5]:
from pyspark.sql.functions import col, count, when

# Calclulate the times of wins each team
radiant_wins = df.select(col("radiant_win")).where(col("radiant_win")).count()
dire_wins = df.select(col("radiant_win")).where(~col("radiant_win")).count()

# Victory percentage
total_games = df.count()
radiant_win_rate = radiant_wins / total_games
dire_win_rate = dire_wins / total_games


win_rate_diff = abs(radiant_win_rate - dire_win_rate) * 100


print(f"Radiant win rate: {radiant_win_rate:.2%}")
print(f"Dire win rate: {dire_win_rate:.2%}")
print(f"Winning percentage gap: {win_rate_diff:.2f}%")


Radiant win rate: 29.70%
Dire win rate: 19.80%
Winning percentage gap: 9.90%


In [15]:
from pyspark.sql.functions import col, when, size
#confirm
df = df.withColumn("has_10_minutes", size(col("radiant_gold_adv")) >= 10)

# Filter
df = df.filter(col("has_10_minutes"))

# Extract
df = df.withColumn("radiant_gold_adv_10min", col("radiant_gold_adv")[9]) \
       .withColumn("radiant_xp_adv_10min", col("radiant_xp_adv")[9])

radiant_gold_lead = df.filter(col("radiant_gold_adv_10min") > 0)
dire_gold_lead = df.filter(col("radiant_gold_adv_10min") < 0)

radiant_xp_lead = df.filter(col("radiant_xp_adv_10min") > 0)
dire_xp_lead = df.filter(col("radiant_xp_adv_10min") < 0)

# Calculate win rates
radiant_gold_lead_win_rate = radiant_gold_lead.filter(col("radiant_win") == True).count() / radiant_gold_lead.count()
dire_gold_lead_win_rate = dire_gold_lead.filter(col("radiant_win") == False).count() / dire_gold_lead.count()

radiant_xp_lead_win_rate = radiant_xp_lead.filter(col("radiant_win") == True).count() / radiant_xp_lead.count()
dire_xp_lead_win_rate = dire_xp_lead.filter(col("radiant_win") == False).count() / dire_xp_lead.count()

print(f"Radiant 10-minute gold lead win rate: {radiant_gold_lead_win_rate * 100:.2f}%")
print(f"Dire 10-minute gold lead win rate: {dire_gold_lead_win_rate * 100:.2f}%")
print(f"Radiant 10-minute XP lead win rate: {radiant_xp_lead_win_rate * 100:.2f}%")
print(f"Dire 10-minute XP lead win rate: {dire_xp_lead_win_rate * 100:.2f}%")


                                                                                

Radiant 10-minute gold lead win rate: 70.37%
Dire 10-minute gold lead win rate: 54.55%
Radiant 10-minute XP lead win rate: 72.41%
Dire 10-minute XP lead win rate: 60.00%


In [8]:
from pyspark.sql.functions import explode, col, when, avg

# Expand the players array and calculate the KDA for each player
df_players = df.withColumn("player", explode(col("players"))).\
    withColumn("KDA", 
               (col("player.kills") + col("player.assists")) / 
               when(col("player.deaths") == 0, 1).otherwise(col("player.deaths"))).\
    select(col("match_id"), col("player.account_id"), col("KDA"))

# Average KDA per game
df_avg_kda_per_match = df_players.groupBy("match_id").agg(avg("KDA").alias("avg_KDA"))


highest_avg_kda = df_avg_kda_per_match.orderBy(col("avg_KDA").desc()).first()
lowest_avg_kda = df_avg_kda_per_match.orderBy(col("avg_KDA").asc()).first()


print(f"The Match id with the highest KDA {highest_avg_kda['match_id']}, Average KDA: {highest_avg_kda['avg_KDA']}")
print(f"The Match id with the lowest KDA: {lowest_avg_kda['match_id']}, Average KDA: {lowest_avg_kda['avg_KDA']}")



The Match id with the highest KDA 2001390318, Average KDA: 16.275263157894738
The Match id with the lowest KDA: 2001375094, Average KDA: 2.539231601731602


                                                                                

In [9]:
from pyspark.sql.functions import explode, sum as spark_sum

df_exploded = df.withColumn("players", explode("players"))

df_kills = df_exploded.groupBy("match_id").agg(spark_sum("players.kills").alias("total_kills"))

# Find the match with the highest total kills
highest_kills_match = df_kills.orderBy("total_kills", ascending=False).first()

highest_kills_match_id = highest_kills_match["match_id"]
highest_kills = highest_kills_match["total_kills"]

print(f"Match with the highest total kills: {highest_kills_match_id}, Total kills: {highest_kills}")




Match with the highest total kills: 2001377504, Total kills: 89


                                                                                

In [12]:
from pyspark.sql.functions import col
longest_matches = df.orderBy(col("duration").desc()).select("match_id", "duration").limit(5)

shortest_matches = df.where(col("duration") > 0).orderBy("duration").select("match_id", "duration").limit(5)

print("Five Longest Matches:")
longest_matches.show()

print("Five Shortest Matches:")
shortest_matches.show()

Five Longest Matches:
+----------+--------+
|  match_id|duration|
+----------+--------+
|2001375031|    2495|
|2001374907|    2423|
|2001375993|    2385|
|2001379127|    2325|
|2001377295|    2324|
+----------+--------+

Five Shortest Matches:
+----------+--------+
|  match_id|duration|
+----------+--------+
|2001389000|     723|
|2001400694|     898|
|2001387742|    1063|
|2001389600|    1093|
|2001378954|    1169|
+----------+--------+



In [17]:
spark_context.stop()