In [2]:
from functools import reduce
from pyspark import SparkContext
from pyspark.sql import SparkSession, Window, Row
from pyspark.sql.functions import row_number, rank, ntile, when, col, sum, round
from pyspark.sql.types import *

In [3]:
# Get the SparkSession for this notebook: reuse the active one if it exists,
# otherwise create a new session registered under the app name 'Ranking-Football'.
spark = (
    SparkSession.builder
    .appName('Ranking-Football')
    .getOrCreate()
)

In [4]:
# Load "matches.csv" into a DataFrame, taking the column names from the file's header row.
# No schema is supplied and none is inferred, so the columns are loaded as strings.
df_matches = spark.read.csv('./Data/soccer/matches.csv', header=True)

In [6]:
# Preview df_matches: take at most 10 rows (all columns) and collect them to a
# pandas DataFrame so the notebook renders them as a table.
df_matches.limit(10).toPandas()

Unnamed: 0,Match_ID,Div,Season,Date,HomeTeam,AwayTeam,FTHG,FTAG
0,1,D2,2009,04/04/2010,Oberhausen,Kaiserslautern,2,1
1,2,D2,2009,01/11/2009,Munich 1860,Kaiserslautern,0,1
2,3,D2,2009,04/10/2009,Frankfurt FSV,Kaiserslautern,1,1
3,4,D2,2009,21/02/2010,Frankfurt FSV,Karlsruhe,2,1
4,5,D2,2009,06/12/2009,Ahlen,Karlsruhe,1,3
5,6,D2,2009,03/04/2010,Union Berlin,Karlsruhe,1,1
6,7,D2,2009,14/08/2009,Paderborn,Karlsruhe,2,0
7,8,D2,2009,08/03/2010,Bielefeld,Karlsruhe,0,1
8,9,D2,2009,26/09/2009,Kaiserslautern,Karlsruhe,2,0
9,10,D2,2009,21/11/2009,Hansa Rostock,Karlsruhe,2,1


In [12]:
# Add three 0/1 indicator columns describing the result of each match:
#   "HomeTeamWin" is 1 if FTHG > FTAG, else 0
#   "AwayTeamWin" is 1 if FTHG < FTAG, else 0
#   "GameTie"     is 1 if FTHG = FTAG, else 0
# The CSV is loaded without a schema, so FTHG and FTAG are string columns. Comparing
# strings orders them lexicographically (e.g. '10' < '9'), which would mislabel any
# match with a double-digit score. The goal counts are therefore cast to integers
# inside the comparisons; the FTHG/FTAG columns themselves are left unchanged.
# A row whose score is missing (cast yields NULL) gets 0 in all three flags.
df_matches = (
    df_matches
    .withColumn("HomeTeamWin",
                when(col('FTHG').cast('int') > col('FTAG').cast('int'), 1).otherwise(0))
    .withColumn("AwayTeamWin",
                when(col('FTHG').cast('int') < col('FTAG').cast('int'), 1).otherwise(0))
    .withColumn("GameTie",
                when(col('FTHG').cast('int') == col('FTAG').cast('int'), 1).otherwise(0))
)

In [13]:
# Preview df_matches again (at most 10 rows, all columns) to check the newly added
# HomeTeamWin / AwayTeamWin / GameTie columns.
df_matches.limit(10).toPandas()

Unnamed: 0,Match_ID,Div,Season,Date,HomeTeam,AwayTeam,FTHG,FTAG,HomeTeamWin,AwayTeamWin,GameTie
0,1,D2,2009,04/04/2010,Oberhausen,Kaiserslautern,2,1,1,0,0
1,2,D2,2009,01/11/2009,Munich 1860,Kaiserslautern,0,1,0,1,0
2,3,D2,2009,04/10/2009,Frankfurt FSV,Kaiserslautern,1,1,0,0,1
3,4,D2,2009,21/02/2010,Frankfurt FSV,Karlsruhe,2,1,1,0,0
4,5,D2,2009,06/12/2009,Ahlen,Karlsruhe,1,3,0,1,0
5,6,D2,2009,03/04/2010,Union Berlin,Karlsruhe,1,1,0,0,1
6,7,D2,2009,14/08/2009,Paderborn,Karlsruhe,2,0,1,0,0
7,8,D2,2009,08/03/2010,Bielefeld,Karlsruhe,0,1,0,1,0
8,9,D2,2009,26/09/2009,Kaiserslautern,Karlsruhe,2,0,1,0,0
9,10,D2,2009,21/11/2009,Hansa Rostock,Karlsruhe,2,1,1,0,0


In [17]:
# Keep only division 'D1' matches played in seasons 2000 through 2010 (inclusive)
# and store them in a separate DataFrame.
bundesliga = (
    df_matches
    .filter(col('Div') == 'D1')
    .filter(col('Season') >= 2000)
    .filter(col('Season') <= 2010)
)

In [18]:
# Preview bundesliga: at most 10 rows (all columns), collected to pandas for display.
bundesliga.limit(10).toPandas()

Unnamed: 0,Match_ID,Div,Season,Date,HomeTeam,AwayTeam,FTHG,FTAG,HomeTeamWin,AwayTeamWin,GameTie
0,21,D1,2009,06/02/2010,Bochum,Leverkusen,1,1,0,0,1
1,22,D1,2009,22/11/2009,Bayern Munich,Leverkusen,1,1,0,0,1
2,23,D1,2009,08/05/2010,M'gladbach,Leverkusen,1,1,0,0,1
3,24,D1,2009,08/08/2009,Mainz,Leverkusen,2,2,0,0,1
4,25,D1,2009,17/10/2009,Hamburg,Leverkusen,0,0,0,0,1
5,26,D1,2009,17/04/2010,Stuttgart,Leverkusen,2,1,1,0,0
6,27,D1,2009,20/03/2010,Dortmund,Leverkusen,3,0,1,0,0
7,28,D1,2009,31/10/2009,Schalke 04,Leverkusen,2,2,0,0,1
8,29,D1,2009,22/08/2009,Freiburg,Leverkusen,0,5,0,1,0
9,30,D1,2009,21/02/2010,Werder Bremen,Leverkusen,2,2,0,0,1


In [42]:
# Home record per (Season, team): number of home wins, home losses and home ties,
# plus goals scored and goals conceded at home.
# From the home side's point of view an away-team win is a loss, and FTAG is goals against.
# Note: `sum` here is pyspark.sql.functions.sum (imported at the top), not the builtin.
home_matches = (
    bundesliga
    .groupby('Season', 'HomeTeam')
    .agg(
        sum('HomeTeamWin').alias('TotalHomeWin'),
        sum('AwayTeamWin').alias('TotalHomeLoss'),
        sum('GameTie').alias('TotalHomeTie'),
        sum('FTHG').alias('HomeScoredGoals'),
        sum('FTAG').alias('HomeAgainstGoals'),
    )
    .withColumnRenamed('HomeTeam', 'Team')  # common key name for the later join
)
home_matches.limit(10).toPandas()

Unnamed: 0,Season,Team,TotalHomeWin,TotalHomeLoss,TotalHomeTie,HomeScoredGoals,HomeAgainstGoals
0,2005,Kaiserslautern,5,7,5,26.0,33.0
1,2006,Cottbus,6,6,5,21.0,22.0
2,2001,St Pauli,4,9,4,19.0,28.0
3,2005,Mainz,6,4,7,31.0,23.0
4,2006,Hamburg,4,4,9,22.0,19.0
5,2003,Stuttgart,9,1,7,29.0,13.0
6,2003,Hansa Rostock,10,6,1,34.0,18.0
7,2007,Hansa Rostock,5,8,4,17.0,21.0
8,2001,M'gladbach,6,6,5,21.0,21.0
9,2002,M'gladbach,10,2,5,31.0,11.0


In [43]:
# Away record per (Season, team): number of away wins, away losses and away ties,
# plus goals scored and goals conceded away from home.
# From the away side's point of view a home-team win is a loss, and FTHG is goals against.
# Note: `sum` here is pyspark.sql.functions.sum (imported at the top), not the builtin.
away_matches = (
    bundesliga
    .groupby('Season', 'AwayTeam')
    .agg(
        sum('AwayTeamWin').alias('TotalAwayWin'),
        sum('HomeTeamWin').alias('TotalAwayLoss'),
        sum('GameTie').alias('TotalAwayTie'),
        sum('FTAG').alias('AwayScoredGoals'),
        sum('FTHG').alias('AwayAgainstGoals'),
    )
    .withColumnRenamed('AwayTeam', 'Team')  # common key name for the later join
)
away_matches.limit(10).toPandas()

Unnamed: 0,Season,Team,TotalAwayWin,TotalAwayLoss,TotalAwayTie,AwayScoredGoals,AwayAgainstGoals
0,2005,Kaiserslautern,3,10,4,21.0,38.0
1,2006,Cottbus,5,9,3,17.0,27.0
2,2001,St Pauli,0,11,6,18.0,42.0
3,2005,Mainz,3,10,4,15.0,24.0
4,2006,Hamburg,6,5,6,21.0,18.0
5,2003,Stuttgart,9,5,3,23.0,11.0
6,2003,Hansa Rostock,2,8,7,21.0,36.0
7,2007,Hansa Rostock,3,12,2,13.0,31.0
8,2001,M'gladbach,3,7,7,20.0,32.0
9,2002,M'gladbach,1,12,4,12.0,34.0


In [33]:
# Build one row of season totals per (Season, Team) by joining the home and away records.
# The inner join keeps only teams that appear in both home_matches and away_matches.
# Derived columns, in output order:
#   GoalsScored, GoalsAgainst  - home + away goal totals
#   GoalDifferentials          - GoalsScored - GoalsAgainst
#   Win, Loss, Tie             - home + away result counts
#   WinPct                     - Win as a percentage of all games, rounded to 2 decimals
# The intermediate home/away columns are dropped at the end.
# Note: `round` here is pyspark.sql.functions.round (imported at the top), not the builtin.
collect_seasons = (
    home_matches
    .join(away_matches, on=['Season', 'Team'], how='inner')
    .withColumn('GoalsScored', col('HomeScoredGoals') + col('AwayScoredGoals'))
    .withColumn('GoalsAgainst', col('HomeAgainstGoals') + col('AwayAgainstGoals'))
    .withColumn('GoalDifferentials', col('GoalsScored') - col('GoalsAgainst'))
    .withColumn('Win', col('TotalHomeWin') + col('TotalAwayWin'))
    .withColumn('Loss', col('TotalHomeLoss') + col('TotalAwayLoss'))
    .withColumn('Tie', col('TotalHomeTie') + col('TotalAwayTie'))
    .withColumn(
        'WinPct',
        round(col('Win') / (col('Win') + col('Loss') + col('Tie')) * 100, 2),
    )
    .drop(
        'TotalHomeWin', 'TotalHomeLoss', 'TotalHomeTie', 'HomeScoredGoals', 'HomeAgainstGoals',
        'TotalAwayWin', 'TotalAwayLoss', 'TotalAwayTie', 'AwayScoredGoals', 'AwayAgainstGoals',
    )
)

In [34]:
# Preview collect_seasons: at most 10 rows (all columns), collected to pandas for display.
collect_seasons.limit(10).toPandas()

Unnamed: 0,Season,Team,GoalsScored,GoalsAgainst,GoalDifferentials,Win,Loss,Tie,WinPct
0,2005,Kaiserslautern,47.0,71.0,-24.0,8,17,9,23.53
1,2006,Cottbus,38.0,49.0,-11.0,11,15,8,32.35
2,2001,St Pauli,37.0,70.0,-33.0,4,20,10,11.76
3,2005,Mainz,46.0,47.0,-1.0,9,14,11,26.47
4,2006,Hamburg,43.0,37.0,6.0,10,9,15,29.41
5,2003,Stuttgart,52.0,24.0,28.0,18,6,10,52.94
6,2003,Hansa Rostock,55.0,54.0,1.0,12,14,8,35.29
7,2007,Hansa Rostock,30.0,52.0,-22.0,8,20,6,23.53
8,2001,M'gladbach,41.0,53.0,-12.0,9,13,12,26.47
9,2002,M'gladbach,43.0,45.0,-2.0,11,14,9,32.35


In [37]:
# Window used to rank teams within each season: one partition per Season, ordered by
# best WinPct first, then best GoalDifferentials.
# GoalsScored (descending) and Team name are added as final tie-breakers so the ordering
# is total: without them, teams tied on WinPct and GoalDifferentials would receive
# arbitrary row_number() positions that can change from run to run.
window_season = Window.partitionBy('Season').orderBy(
    col('WinPct').desc(),
    col('GoalDifferentials').desc(),
    col('GoalsScored').desc(),
    col('Team'),
)

In [38]:
# Rank every team within its season: "Position" is the 1-based row number of the team
# inside the window_season ordering (1 = top of the table for that season).
ranking_table = collect_seasons.withColumn(
    "Position",
    row_number().over(window_season),
)
# Show at most 20 rows of the ranking table.
ranking_table.limit(20).toPandas()

Unnamed: 0,Season,Team,GoalsScored,GoalsAgainst,GoalDifferentials,Win,Loss,Tie,WinPct,Position
0,2000,Bayern Munich,62.0,37.0,25.0,19,9,6,55.88,1
1,2000,Schalke 04,65.0,35.0,30.0,18,8,8,52.94,2
2,2000,Hertha,58.0,52.0,6.0,18,14,2,52.94,3
3,2000,Leverkusen,54.0,40.0,14.0,17,11,6,50.0,4
4,2000,Dortmund,62.0,42.0,20.0,16,8,10,47.06,5
5,2000,Freiburg,54.0,37.0,17.0,15,9,10,44.12,6
6,2000,Werder Bremen,53.0,48.0,5.0,15,11,8,44.12,7
7,2000,Kaiserslautern,49.0,54.0,-5.0,15,14,5,44.12,8
8,2000,Wolfsburg,60.0,45.0,15.0,12,11,11,35.29,9
9,2000,FC Koln,59.0,52.0,7.0,12,12,10,35.29,10


In [41]:
# Keep only the team ranked first (Position 1) in each season of the D1 table
# and display the full result.
top_ranking_seasons = ranking_table.filter(ranking_table['Position'] == 1)
top_ranking_seasons.toPandas()

Unnamed: 0,Season,Team,GoalsScored,GoalsAgainst,GoalDifferentials,Win,Loss,Tie,WinPct,Position
0,2000,Bayern Munich,62.0,37.0,25.0,19,9,6,55.88,1
1,2001,Leverkusen,77.0,38.0,39.0,21,7,6,61.76,1
2,2002,Bayern Munich,70.0,25.0,45.0,23,5,6,67.65,1
3,2003,Werder Bremen,79.0,38.0,41.0,22,4,8,64.71,1
4,2004,Bayern Munich,75.0,33.0,42.0,24,5,5,70.59,1
5,2005,Bayern Munich,67.0,32.0,35.0,22,3,9,64.71,1
6,2006,Stuttgart,61.0,37.0,24.0,21,6,7,61.76,1
7,2007,Bayern Munich,68.0,21.0,47.0,22,2,10,64.71,1
8,2008,Wolfsburg,80.0,41.0,39.0,21,7,6,61.76,1
9,2009,Bayern Munich,72.0,31.0,41.0,20,4,10,58.82,1
