In [0]:
#Total number of Grand Slam titles won by each player

In [0]:
from pyspark.sql.functions import col,sum,count

#players lookup table: one (player_id, player_name) row per player
players_columns = ['player_id', 'player_name']
players_data = [
    (1, "Nadal"),
    (2, "Federer"),
    (3, "Novak"),
]
players_df = spark.createDataFrame(players_data, schema=players_columns)

#championships table: one row per year; each tournament column stores the
#player_id that later cells match against players.player_id
#(the 'wimbeldon' spelling is kept as-is because the cells below reference it)
championships_column = ['year', 'wimbeldon', 'fr_open', 'us_open', 'au_open']
championships_data = [
    (2017, 2, 1, 1, 2),
    (2018, 3, 1, 3, 2),
    (2019, 3, 1, 3, 1),
]
championships_df = spark.createDataFrame(championships_data, schema=championships_column)

#render both input tables
players_df.display()
championships_df.display()


player_id,player_name
1,Nadal
2,Federer
3,Novak


year,wimbeldon,fr_open,us_open,au_open
2017,2,1,1,2
2018,3,1,3,2
2019,3,1,3,1


In [0]:
#create cte1
#a plain column-name string has no alias() method, so wrap it in col() to get
#a Column before renaming it to "player"

#unpivot the four tournament columns into (year, player) rows, one select per
#tournament, stacked in the order wimbeldon, fr_open, us_open, au_open.
#Parentheses are used for the multi-line chain instead of trailing backslashes,
#so the expression no longer depends on a blank line following it.
cte1 = (
    championships_df.select("year", col("wimbeldon").alias("player"))
    .unionAll(championships_df.select("year", col("fr_open").alias("player")))
    .unionAll(championships_df.select("year", col("us_open").alias("player")))
    .unionAll(championships_df.select("year", col("au_open").alias("player")))
)

cte1.display()

#Note (kept as comments so the cell does not echo a string as its output):
#1. Python attribute names are case-sensitive, so unionAll (with a capital "A")
#   is not the same as unionall; calling the latter fails with
#   AttributeError: 'DataFrame' object has no attribute 'unionall'
#2. In PySpark, method names typically follow the camelCase convention
                       

year,player
2017,2
2018,3
2019,3
2017,1
2018,1
2019,1
2017,1
2018,3
2019,3
2017,2


Out[43]: '\n1.PySpark is case-sensitive when it comes to DataFrame operations and function names. This means that unionAll (with a capital "A") is not the same as unionall \n2.In PySpark, method names typically follow the camelCase convention\n'

In [0]:
#create cte2
#each cte1 row is one title, so counting rows per player id gives that
#player's number of titles
cte2 = (
    cte1
    .groupBy("player")
    .agg(count("*").alias("grand_slams_count"))
)

cte2.display()

player,grand_slams_count
2,3
3,4
1,5


In [0]:
#join cte2 with players dataframe
#cte2.player holds player ids, so match it to players_df.player_id to attach
#the player_name, then keep only the three reporting columns
result_df = (
    cte2
    .join(players_df, on=cte2["player"] == players_df["player_id"], how="inner")
    .select("player_id", "player_name", "grand_slams_count")
)

result_df.display()

player_id,player_name,grand_slams_count
1,Nadal,5
2,Federer,3
3,Novak,4


## SQL Approach:

In [0]:
#register both DataFrames as temp views so the %sql cells below can query
#them by name ("championships" and "players")
championships_df.createOrReplaceTempView("championships")
players_df.createOrReplaceTempView("players")

#### SQL Approach 1:

In [0]:
%sql

--cte1: split each tournament column into its own (year, player) rows
--final select: count the rows per player id, i.e. titles per player

WITH cte1 AS (
    SELECT year, wimbeldon AS player FROM championships
    UNION ALL
    SELECT year, fr_open AS player FROM championships
    UNION ALL
    SELECT year, us_open AS player FROM championships
    UNION ALL
    SELECT year, au_open AS player FROM championships
)
SELECT
    player,
    COUNT(1) AS grand_slams_count
FROM cte1
GROUP BY player


player,grand_slams_count
2,3
3,4
1,5


In [0]:
%sql

--cte1: split each tournament column into its own (year, player) rows

WITH cte1 AS (
    SELECT year, wimbeldon AS player FROM championships
    UNION ALL
    SELECT year, fr_open AS player FROM championships
    UNION ALL
    SELECT year, us_open AS player FROM championships
    UNION ALL
    SELECT year, au_open AS player FROM championships
),

--cte2: aggregate the title rows per player id

cte2 AS (
    SELECT
        player,
        COUNT(1) AS grand_slams_count
    FROM cte1
    GROUP BY player
)

--join back to players to get player_name in the final result

SELECT
    p.player_id,
    p.player_name,
    t.grand_slams_count
FROM players AS p
INNER JOIN cte2 AS t
    ON p.player_id = t.player



player_id,player_name,grand_slams_count
1,Nadal,5
2,Federer,3
3,Novak,4


#### SQL Approach 2:

In [0]:
%sql

--join each player to every championship row in which that player's id
--appears in at least one tournament column

SELECT *
FROM players AS p
INNER JOIN championships AS c
    ON c.wimbeldon = p.player_id
    OR c.fr_open = p.player_id
    OR c.us_open = p.player_id
    OR c.au_open = p.player_id
ORDER BY 1, 3


--Equivalent alternative:
--`value IN (col1, col2, col3)` is true when value equals any of col1, col2 or col3.
/*
SELECT *
FROM players p
JOIN championships c
ON p.player_id IN (c.wimbeldon, c.fr_open, c.us_open, c.au_open)
ORDER BY 1, 3;
*/

--ORDER BY 1, 3 sorts by the first and third columns of the select list
--(here player_id, then year).



player_id,player_name,year,wimbeldon,fr_open,us_open,au_open
1,Nadal,2017,2,1,1,2
1,Nadal,2018,3,1,3,2
1,Nadal,2019,3,1,3,1
2,Federer,2017,2,1,1,2
2,Federer,2018,3,1,3,2
3,Novak,2018,3,1,3,2
3,Novak,2019,3,1,3,1


Add up the number of titles each player won in each year.




In [0]:
%sql

--for every (player, year) pair where the player won at least one tournament,
--score 1 per tournament column that matches the player's id and add them up

SELECT
    p.player_id,
    p.player_name,
    c.year,
    (CASE WHEN c.wimbeldon = p.player_id THEN 1 ELSE 0 END)
    + (CASE WHEN c.fr_open = p.player_id THEN 1 ELSE 0 END)
    + (CASE WHEN c.us_open = p.player_id THEN 1 ELSE 0 END)
    + (CASE WHEN c.au_open = p.player_id THEN 1 ELSE 0 END) AS year_total
FROM players AS p
INNER JOIN championships AS c
    ON p.player_id IN (c.wimbeldon, c.fr_open, c.us_open, c.au_open)
ORDER BY 1, 3



player_id,player_name,year,year_total
1,Nadal,2017,2
1,Nadal,2018,1
1,Nadal,2019,2
2,Federer,2017,2
2,Federer,2018,1
3,Novak,2018,2
3,Novak,2019,2


Aggregate the per-year title counts into a total number of titles for each player.

In [0]:
%sql

--sum the per-year title scores (1 per tournament column matching the
--player's id) into one total per player, highest player_id first

SELECT
    p.player_id,
    p.player_name,
    SUM(
        (CASE WHEN c.wimbeldon = p.player_id THEN 1 ELSE 0 END)
        + (CASE WHEN c.fr_open = p.player_id THEN 1 ELSE 0 END)
        + (CASE WHEN c.us_open = p.player_id THEN 1 ELSE 0 END)
        + (CASE WHEN c.au_open = p.player_id THEN 1 ELSE 0 END)
    ) AS grand_slams_count
FROM players AS p
INNER JOIN championships AS c
    ON p.player_id IN (c.wimbeldon, c.fr_open, c.us_open, c.au_open)
GROUP BY p.player_id, p.player_name --same as the positional form GROUP BY 1, 2
ORDER BY p.player_id DESC



player_id,player_name,grand_slams_count
3,Novak,4
2,Federer,3
1,Nadal,5
