In [0]:
"""
• Understanding the Problem
Let's break down the problem statement:
• Data Input: We have a dataset containing records of football matches. Each record holds a match id, the id of the winning team, the id of the losing team, and the goals credited to the winning team (`goals_won`). A separate lookup table maps each team id to its team name.
• Ranking Rules:
1. Teams are ranked by total points: each win earns 1 point and each loss costs 1 point, so total points = number of wins - number of losses.
2. If two teams tie on total points, the team with the higher total of `goals_won` across the matches it won is ranked higher.
"""


from pyspark.sql.types import *

# Match results, one tuple per match:
# (match_id, winning_team_id, losing_team_id, goals_won)
matches_data = [
    (1, 1001, 1007, 1),
    (2, 1007, 1001, 2),
    (3, 1006, 1003, 3),
    (4, 1001, 1003, 1),
    (5, 1007, 1001, 1),
    (6, 1006, 1003, 2),
    (7, 1006, 1001, 3),
    (8, 1007, 1003, 5),
    (9, 1001, 1003, 1),
    (10, 1007, 1006, 2),
    (11, 1006, 1003, 3),
    (12, 1001, 1003, 4),
    (13, 1001, 1006, 2),
    (14, 1007, 1001, 4),
    (15, 1006, 1007, 3),
    (16, 1001, 1003, 3),
    (17, 1001, 1007, 3),
    (18, 1006, 1007, 2),
    (19, 1003, 1001, 1),
    (20, 1001, 1007, 3),
    (21, 1001, 1003, 3),
]

# Every column is an integer, so the fields are generated from the column names.
matches_schema = StructType([
    StructField(column_name, IntegerType())
    for column_name in ("match_id", "winning_team_id", "losing_team_id", "goals_won")
])

matches_df = spark.createDataFrame(data=matches_data, schema=matches_schema)
display(matches_df)


# Lookup rows pairing each team id with its display name.
teaminfo_data = [
    (1001, 'Nickmiesters'),
    (1003, 'sunrisers'),
    (1006, 'Philipines prates'),
    (1007, 'Smashers'),
]

# Same two-field schema, assembled with the StructType builder API.
teaminfo_schema = (
    StructType()
    .add("team_id", IntegerType())
    .add("team_name", StringType())
)

teaminfo_df = spark.createDataFrame(data=teaminfo_data, schema=teaminfo_schema)
display(teaminfo_df)


match_id,winning_team_id,losing_team_id,goals_won
1,1001,1007,1
2,1007,1001,2
3,1006,1003,3
4,1001,1003,1
5,1007,1001,1
6,1006,1003,2
7,1006,1001,3
8,1007,1003,5
9,1001,1003,1
10,1007,1006,2


team_id,team_name
1001,Nickmiesters
1003,sunrisers
1006,Philipines prates
1007,Smashers


In [0]:
# Register both DataFrames as temp views so Spark SQL can reference them
# by table name; re-running the cell simply replaces the existing views.
teaminfo_df.createOrReplaceTempView("teaminfo")
matches_df.createOrReplaceTempView("matches")

### SPARK SQL

In [0]:
# Rank teams by net points, breaking ties on goals scored in won matches.
#   cte  : one row per team per match -- the winner gets +1 point and the
#          match's goals_won, the loser gets -1 point and 0 goals.
#   cte2 : per-team totals of points and goals_won.
# The final ORDER BY makes the displayed row order explicit; without it the
# order only reflects how the window happened to sort the rows and is not
# guaranteed by Spark.
spark.sql("""
          with cte as (
            select winning_team_id as team_id, 1 as points, goals_won from matches
            union all
            select losing_team_id as team_id, -1 as points, 0 as goals_won from matches
          ), cte2 as (
            select team_id, sum(points) as points, sum(goals_won) as goals_won from cte group by team_id
          )
          select team_name, rank() over(order by points desc, goals_won desc) as team_rank from cte2
          inner join teaminfo on cte2.team_id = teaminfo.team_id
          order by team_rank, team_name
          """).show(truncate=False)

+-----------------+---------+
|team_name        |team_rank|
+-----------------+---------+
|Nickmiesters     |1        |
|Philipines prates|2        |
|Smashers         |3        |
|sunrisers        |4        |
+-----------------+---------+



### DF API

In [0]:
from pyspark.sql import functions as F, Window
from pyspark.sql.functions import rank, col, desc

# DataFrame-API version of the ranking.
#
# Everything is reached through the `F` namespace on purpose: `lit` was never
# imported in this notebook, and a bare `sum` resolves to Python's builtin
# rather than the Spark aggregate, so the cell failed on a fresh kernel.

# One row per win: the team earns +1 point and keeps the match's goals_won.
winning_df = matches_df.select(
    F.col("winning_team_id").alias("team_id"),
    F.col("goals_won"),
    F.lit(1).alias("points"),
)

# One row per loss: the team is charged -1 point and contributes 0 goals.
losing_df = matches_df.select(
    F.col("losing_team_id").alias("team_id"),
    F.lit(0).alias("goals_won"),
    F.lit(-1).alias("points"),
)

(
    # unionByName matches columns by name, so the result does not depend on
    # the two frames listing their columns in the same order.
    winning_df.unionByName(losing_df)
    .groupBy("team_id")
    .agg(
        F.sum("points").alias("points"),
        F.sum("goals_won").alias("goals_won"),
    )
    .join(teaminfo_df, "team_id")
    # Higher points rank first; ties are broken by total goals in won matches.
    .withColumn(
        "team_rank",
        F.rank().over(Window.orderBy(F.desc("points"), F.desc("goals_won"))),
    )
    .select("team_name", "team_rank")
    # Explicit ordering so the printed table is deterministic.
    .orderBy("team_rank", "team_name")
    .show(truncate=False)
)


+-----------------+---------+
|team_name        |team_rank|
+-----------------+---------+
|Nickmiesters     |1        |
|Philipines prates|2        |
|Smashers         |3        |
|sunrisers        |4        |
+-----------------+---------+

