In [29]:
import pyspark
from pyspark import SparkContext, SparkFiles
from pyspark.sql import Row
from pyspark.sql import SQLContext, SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType, FloatType, StringType
from pyspark.ml.feature import Bucketizer
from pyspark.sql.functions import udf
from pyspark.sql.types import *


In [108]:
# Reuse the running context when this cell is re-executed: constructing
# SparkContext(...) a second time in the same kernel raises
# "ValueError: Cannot run multiple SparkContexts at once".
# The master is still local mode with 8 worker threads.
sc = SparkContext.getOrCreate(pyspark.SparkConf().setMaster('local[8]'))

In [109]:
# Location of the player CSV (a raw GitHub gist).
url = "https://gist.githubusercontent.com/masterofpun/f415cbae73c01c82a45e3a44b7189520/raw/902fddea3208778c107e091fe98ae28a65e93e1e/player_data.csv"

# SQL entry point bound to the Spark context.
sqlContext = SQLContext(sc)

# Register the remote file with Spark so it can be resolved via SparkFiles.
sc.addFile(url)

# Cell output: the local path where Spark placed the downloaded file.
SparkFiles.get("player_data.csv")

'/tmp/spark-a4bf5263-e9a9-4a42-b88d-09addcc430b5/userFiles-592ba8fe-b8f1-4da3-a35c-d5ca0913665c/player_data.csv'

In [110]:
# Load the downloaded CSV: comma-separated, first row holds the column
# names, and column types are inferred from the data.
df = sqlContext.read.csv(
    path=SparkFiles.get("player_data.csv"),
    sep=",",
    header=True,
    inferSchema=True,
)


In [111]:
# Narrow the frame to the nine columns of interest.
df = df.select(
    [
        "nationality",
        "name",
        "height",
        "weight",
        "goals",
        "wins",
        "losses",
        "appearances",
        "aerial_lost",
    ]
)

+-------------+------------------+------+------+-----+----+------+-----------+-----------+
|  nationality|              name|height|weight|goals|wins|losses|appearances|aerial_lost|
+-------------+------------------+------+------+-----+----+------+-----------+-----------+
|      England|     Danny Welbeck| 185cm|  73kg| 0.22|  87|    35|        154|       null|
|      Germany|          Emre Can| 184cm|  82kg| 0.03|  29|    16|         60|         71|
|        Spain|    Álvaro Negredo| 186cm|  86kg| 0.26|  23|     8|         39|       null|
|      England|        Jordan Lee| 178cm|  80kg|  0.0|   0|     0|          0|          0|
|      Belgium|  Thibaut Courtois| 199cm|  91kg|  0.0|  32|    10|         62|       null|
|        Italy|    Angelo Ogbonna| 191cm|  86kg|  0.0|  13|     8|         32|         35|
|      Ireland|       John O'Shea| 191cm|  75kg| 13.0| 214|   105|        420|        266|
|      Germany|   Gerhard Tremmel| 190cm|  86kg|  0.0|   7|    15|         29|       null|

In [112]:
# Discard rows that have a null in any column, then turn the "185cm" /
# "73kg" strings into integer columns by removing the unit text and casting.
df = (
    df.dropna()
    .withColumn("height", F.regexp_replace("height", "cm", "").cast(IntegerType()))
    .withColumn("weight", F.regexp_replace("weight", "kg", "").cast(IntegerType()))
)

In [113]:
# BMI = weight / (height / 100) ** 2, stored as a float rounded to two
# decimals. Height is divided by 100 before squaring (cm -> m, given the
# "cm" text stripped from the raw column).
df = df.withColumn(
    "BMI",
    F.round(
        (F.col("weight") / ((F.col("height") / 100) ** 2)).cast(FloatType()),
        2,
    ),
)

In [124]:
# Making labels for lengths
def categorizer(height):
    """Map a height in centimetres to a coarse size label.

    Args:
        height: Height as a number, or None when the value is missing
            (for example a cell that could not be cast to an integer).

    Returns:
        "small" for heights below 175, "medium" for heights from 175 up to
        (but not including) 185, and "tall" otherwise. None is returned
        unchanged when ``height`` is None.
    """
    # A null column value reaches a Python UDF as None, and `None < 175`
    # raises TypeError on Python 3, so pass missing values through.
    if height is None:
        return None
    if height < 175:
        return "small"
    if height < 185:
        return "medium"
    return "tall"

In [125]:
# Wrap categorizer as a string-returning UDF and use it to derive the
# "height_factor" label column from "height".
bucket_udf = F.udf(categorizer, returnType=StringType())
df = df.withColumn("height_factor", bucket_udf(df["height"]))

In [188]:
# Sum of aerial_lost for each height_factor group.
aerial_lost_by_height = df.groupBy("height_factor").sum("aerial_lost")
aerial_lost_by_height.show()

+-------------+----------------+
|height_factor|sum(aerial_lost)|
+-------------+----------------+
|         tall|           12785|
|       medium|           13685|
|        small|            3164|
+-------------+----------------+



In [192]:
# Show the rows whose nationality is not "England".
df.where(F.col("nationality") != "England").show()

+-----------+------------------+------+------+-----+----+------+-----------+-----------+-----+-------------+
|nationality|              name|height|weight|goals|wins|losses|appearances|aerial_lost|  BMI|height_factor|
+-----------+------------------+------+------+-----+----+------+-----------+-----------+-----+-------------+
|    Germany|          Emre Can|   184|    82| 0.03|  29|    16|         60|         71|24.22|       medium|
|      Italy|    Angelo Ogbonna|   191|    86|  0.0|  13|     8|         32|         35|23.57|         tall|
|    Ireland|       John O'Shea|   191|    75| 13.0| 214|   105|        420|        266|20.56|         tall|
|   Scotland|       George Boyd|   186|    79|  0.1|  17|    37|         72|        105|22.84|         tall|
|   Cameroon|        Joel Matip|   195|    90|  0.0|   4|     0|          5|          5|23.67|         tall|
|    Senegal|   Papy Djilobodji|   193|    82|  0.0|   0|     4|          5|          5|22.01|         tall|
|     Serbia|Branis