## Boilerplate Code

In [1]:
# Import PySpark
from pyspark.sql import SparkSession

# Build (or reuse, via getOrCreate) the notebook's SparkSession:
# local master, app name "cricket", 4g of driver memory.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("cricket")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

In [2]:
from pyspark.sql.functions import *
from pyspark.sql.window import Window

In [13]:
# Teams taking part in the tournament.
# Full nine-team variant, kept for reference:
# ['India','Pakisthan','Australia','England','SriLanka','Bangladesh','WestIndies','NewZealand','Afghanisthan']
lista = ["India", "Pakisthan", "Australia", "England", "SriLanka", "Bangladesh"]
colsa = ["Teams"]

# createDataFrame expects one tuple per row, so wrap each name in a 1-tuple.
data = list(zip(lista))
df1 = spark.createDataFrame(data, colsa)

## Cricket Tournament Problem - Match Fixtures


In [14]:
# Print the teams DataFrame to confirm it was built as expected.
df1.show()

+----------+
|     Teams|
+----------+
|     India|
| Pakisthan|
| Australia|
|   England|
|  SriLanka|
|Bangladesh|
+----------+



In [15]:
# Generate the matches: pair the team list with itself and keep only rows
# where the left name sorts strictly after the right name. That removes
# self-pairings and keeps exactly one ordering of every pair of teams.
left_teams = df1.alias("t1")
right_teams = df1.alias("t2")

df2 = (
    left_teams.crossJoin(right_teams)
    .filter(col("t1.Teams") > col("t2.Teams"))
    .select(
        col("t1.Teams").alias("Team A"),
        col("t2.Teams").alias("Team B"),
    )
)
df2.show()

+----------+----------+
|    Team A|    Team B|
+----------+----------+
|     India| Australia|
| Pakisthan|     India|
| Pakisthan| Australia|
|     India|   England|
|     India|Bangladesh|
| Pakisthan|   England|
| Pakisthan|Bangladesh|
|   England| Australia|
|  SriLanka|     India|
|  SriLanka| Pakisthan|
|  SriLanka| Australia|
|Bangladesh| Australia|
|   England|Bangladesh|
|  SriLanka|   England|
|  SriLanka|Bangladesh|
+----------+----------+



## Points Table with Winner

In [16]:

# Decide every fixture with a single uniform draw in [0, 1):
#   below 0.4 -> Team A wins, above 0.6 -> Team B wins, otherwise "Draw".
# The seed is fixed so that re-running the notebook reproduces the same
# winners (for the same partitioning of the fixtures); an unseeded rand()
# produces a different tournament on every run.
WINNER_SEED = 42

df2 = (
    df2.withColumn("random_num", rand(seed=WINNER_SEED))
    .withColumn(
        "Winner",
        when(col("random_num") < 0.4, col("Team A"))
        .when(col("random_num") > 0.6, col("Team B"))
        .otherwise("Draw"),
    )
    # random_num is only a helper; drop it from the result.
    .select("Team A", "Team B", "Winner")
)
df2.show()


+----------+----------+----------+
|    Team A|    Team B|    Winner|
+----------+----------+----------+
|     India| Australia| Australia|
| Pakisthan|     India| Pakisthan|
| Pakisthan| Australia|      Draw|
|     India|   England|      Draw|
|     India|Bangladesh|     India|
| Pakisthan|   England|   England|
| Pakisthan|Bangladesh|Bangladesh|
|   England| Australia| Australia|
|  SriLanka|     India|  SriLanka|
|  SriLanka| Pakisthan| Pakisthan|
|  SriLanka| Australia| Australia|
|Bangladesh| Australia| Australia|
|   England|Bangladesh|Bangladesh|
|  SriLanka|   England|   England|
|  SriLanka|Bangladesh|Bangladesh|
+----------+----------+----------+



In [17]:
def match_points(matches, side):
    """Return one (team, points) row per fixture for the given side.

    Args:
        matches: DataFrame with the columns "Team A", "Team B" and "Winner".
        side: name of the team column to score, "Team A" or "Team B".

    Returns:
        DataFrame with columns ``team`` and ``points``: 3 when the side's
        team is the winner, 1 when the fixture is a "Draw", 0 otherwise.
    """
    points = (
        when(col(side) == col("Winner"), 3)
        .when(col("Winner") == "Draw", 1)
        .otherwise(0)
    )
    # The column is named "points" everywhere, so nothing depends on Spark's
    # case-insensitive column resolution.
    return matches.select(col(side).alias("team"), points.alias("points"))


top_df = match_points(df2, "Team A")
bottom_df = match_points(df2, "Team B")

# Stack both perspectives: every fixture contributes one row per team.
points_df = top_df.unionAll(bottom_df)

points_df.show()

+----------+------+
|      team|points|
+----------+------+
|     India|     0|
| Pakisthan|     3|
| Pakisthan|     1|
|     India|     1|
|     India|     3|
| Pakisthan|     0|
| Pakisthan|     0|
|   England|     0|
|  SriLanka|     3|
|  SriLanka|     0|
|  SriLanka|     0|
|Bangladesh|     0|
|   England|     0|
|  SriLanka|     0|
|  SriLanka|     0|
| Australia|     3|
|     India|     0|
| Australia|     1|
|   England|     1|
|Bangladesh|     0|
+----------+------+
only showing top 20 rows



In [18]:
# Points table: total points and number of fixtures per team.
# `sum` and `count` are the pyspark.sql.functions versions brought in by the
# star import, not the Python builtins.
team_totals = points_df.groupBy("team").agg(
    sum("points").alias("total_points"),
    count("points").alias("matches_played"),
)

# Highest points first; ties are ordered by fewer matches played.
team_totals.orderBy(
    col("total_points").desc(),
    col("matches_played").asc(),
).show()

+----------+------------+--------------+
|      team|total_points|matches_played|
+----------+------------+--------------+
| Australia|          13|             5|
|Bangladesh|           9|             5|
| Pakisthan|           7|             5|
|   England|           7|             5|
|     India|           4|             5|
|  SriLanka|           3|             5|
+----------+------------+--------------+



## With Scores and Net Run Rate

In [19]:
# Give each side of every fixture a random score between 0 and 325.
# Seeds are fixed so a re-run reproduces the same scores (for the same
# partitioning). The two seeds MUST differ: rand() with the same seed yields
# the same value per row, which would make every fixture a tie.
TEAM_A_SCORE_SEED = 7
TEAM_B_SCORE_SEED = 11
MAX_SCORE = 325

# `round` is pyspark.sql.functions.round (star import), not the builtin.
df3 = df2.select(
    col("Team A").alias("team_a"),
    col("Team B").alias("team_b"),
    round(rand(seed=TEAM_A_SCORE_SEED) * MAX_SCORE).alias("team_a_score"),
    round(rand(seed=TEAM_B_SCORE_SEED) * MAX_SCORE).alias("team_b_score"),
)

In [20]:
# Label each fixture with the higher-scoring team; equal scores are "draw".
a_runs = col("team_a_score")
b_runs = col("team_b_score")

winner_expr = (
    when(a_runs > b_runs, col("team_a"))
    .when(a_runs < b_runs, col("team_b"))
    .otherwise("draw")
)
df3 = df3.withColumn("winner", winner_expr)

In [23]:
# Factor applied to a score difference to get run_diff: 1 / 50 == 0.02.
# NOTE(review): presumably 50 is the number of overs per innings - confirm.
RUN_DIFF_FACTOR = 1 / 50


def side_results(matches, team, own_score, opp_score):
    """Return per-fixture points and run difference for one side.

    Args:
        matches: DataFrame holding the team and score columns named below.
        team: name of the column with this side's team.
        own_score: name of the column with this side's score.
        opp_score: name of the column with the opponent's score.

    Returns:
        DataFrame with columns ``team_a`` (the side's team), ``points``
        (3 for the higher score, 0 for the lower, 1 otherwise) and
        ``run_diff`` (scaled score difference, positive when ahead).
    """
    own = col(own_score)
    opp = col(opp_score)
    points = when(own > opp, 3).when(own < opp, 0).otherwise(1)
    return matches.select(
        # The team column is called "team_a" for both sides because the
        # aggregation cell groups the combined table by `team_a`.
        col(team).alias("team_a"),
        points.alias("points"),
        (RUN_DIFF_FACTOR * (own - opp)).alias("run_diff"),
    )


top_df = side_results(df3, "team_a", "team_a_score", "team_b_score")
bottom_df = side_results(df3, "team_b", "team_b_score", "team_a_score")

# One row per team per fixture.
total_df = top_df.unionAll(bottom_df)

In [26]:
# Final standings: total points and average run_diff (rounded to 2 decimals)
# per team. `sum`, `avg` and `round` are the pyspark.sql.functions versions
# from the star import.
standings = total_df.groupBy("team_a").agg(
    sum("points").alias("total_points"),
    round(avg("run_diff"), 2).alias("net_run_rate"),
)

# Rank by points, breaking ties on the higher net run rate.
(
    standings
    .select("team_a", "total_points", "net_run_rate")
    .orderBy(col("total_points").desc(), col("net_run_rate").desc())
    .show()
)

+----------+------------+------------+
|    team_a|total_points|net_run_rate|
+----------+------------+------------+
|  SriLanka|          15|        1.45|
| Pakisthan|          12|        0.46|
| Australia|           9|        0.95|
|     India|           3|       -0.62|
|   England|           3|       -0.86|
|Bangladesh|           3|       -1.38|
+----------+------------+------------+



In [21]:
# Scratch check: 1/50 evaluates to 0.02, the factor applied to the score
# difference when computing run_diff.
1/50

0.02