In [7]:
from pyspark.sql import SparkSession

In [8]:
spark = SparkSession.builder\
                    .appName('Analyzing soccer players')\
                    .getOrCreate()

In [9]:
players = spark.read\
                .format('csv')\
                .option('header', 'true')\
                .load('./datasets/fifa_player/player.csv')

In [10]:
players.printSchema()

root
 |-- id: string (nullable = true)
 |-- player_api_id: string (nullable = true)
 |-- player_name: string (nullable = true)
 |-- player_fifa_api_id: string (nullable = true)
 |-- birthday: string (nullable = true)
 |-- height: string (nullable = true)
 |-- weight: string (nullable = true)



In [11]:
players.show(5)

+---+-------------+------------------+------------------+-------------------+------+------+
| id|player_api_id|       player_name|player_fifa_api_id|           birthday|height|weight|
+---+-------------+------------------+------------------+-------------------+------+------+
|  1|       505942|Aaron Appindangoye|            218353|1992-02-29 00:00:00|182.88|   187|
|  2|       155782|   Aaron Cresswell|            189615|1989-12-15 00:00:00|170.18|   146|
|  3|       162549|       Aaron Doran|            186170|1991-05-13 00:00:00|170.18|   163|
|  4|        30572|     Aaron Galindo|            140161|1982-05-08 00:00:00|182.88|   198|
|  5|        23780|      Aaron Hughes|             17725|1979-11-08 00:00:00|182.88|   154|
+---+-------------+------------------+------------------+-------------------+------+------+
only showing top 5 rows



In [12]:
player_attributes = spark.read\
                        .format('csv')\
                        .option('header', 'true')\
                        .load('./datasets/fifa_player/Player_Attributes.csv')

In [13]:
player_attributes.printSchema()

root
 |-- id: string (nullable = true)
 |-- player_fifa_api_id: string (nullable = true)
 |-- player_api_id: string (nullable = true)
 |-- date: string (nullable = true)
 |-- overall_rating: string (nullable = true)
 |-- potential: string (nullable = true)
 |-- preferred_foot: string (nullable = true)
 |-- attacking_work_rate: string (nullable = true)
 |-- defensive_work_rate: string (nullable = true)
 |-- crossing: string (nullable = true)
 |-- finishing: string (nullable = true)
 |-- heading_accuracy: string (nullable = true)
 |-- short_passing: string (nullable = true)
 |-- volleys: string (nullable = true)
 |-- dribbling: string (nullable = true)
 |-- curve: string (nullable = true)
 |-- free_kick_accuracy: string (nullable = true)
 |-- long_passing: string (nullable = true)
 |-- ball_control: string (nullable = true)
 |-- acceleration: string (nullable = true)
 |-- sprint_speed: string (nullable = true)
 |-- agility: string (nullable = true)
 |-- reactions: string (nullable = true

In [14]:
player_attributes.show(5)

+---+------------------+-------------+-------------------+--------------+---------+--------------+-------------------+-------------------+--------+---------+----------------+-------------+-------+---------+-----+------------------+------------+------------+------------+------------+-------+---------+-------+----------+-------+-------+--------+----------+----------+-------------+-----------+------+---------+-------+---------------+--------------+---------+-----------+----------+--------------+-----------+
| id|player_fifa_api_id|player_api_id|               date|overall_rating|potential|preferred_foot|attacking_work_rate|defensive_work_rate|crossing|finishing|heading_accuracy|short_passing|volleys|dribbling|curve|free_kick_accuracy|long_passing|ball_control|acceleration|sprint_speed|agility|reactions|balance|shot_power|jumping|stamina|strength|long_shots|aggression|interceptions|positioning|vision|penalties|marking|standing_tackle|sliding_tackle|gk_diving|gk_handling|gk_kicking|gk_posit

In [15]:
players.count(), player_attributes.count()

(11060, 183978)

In [16]:
player_attributes.select('player_api_id')\
                .distinct()\
                .count()

11060

In [17]:
players = players.drop('id', 'player_fifa_api_id')
players.columns

['player_api_id', 'player_name', 'birthday', 'height', 'weight']

In [20]:
player_attributes = player_attributes.drop(
    'id',
    'player_fifa_api_)id',
    'preferred_foot',
    'attacking_work_rate',
    'defensive_work_rate',
    'crossing',
    'jumping',
    'sprint_speed',
    'balance',
    'aggression',
    'short_passing',
    'potential')
player_attributes.columns

['player_fifa_api_id',
 'player_api_id',
 'date',
 'overall_rating',
 'finishing',
 'heading_accuracy',
 'volleys',
 'dribbling',
 'curve',
 'free_kick_accuracy',
 'long_passing',
 'ball_control',
 'acceleration',
 'agility',
 'reactions',
 'shot_power',
 'stamina',
 'strength',
 'long_shots',
 'interceptions',
 'positioning',
 'vision',
 'penalties',
 'marking',
 'standing_tackle',
 'sliding_tackle',
 'gk_diving',
 'gk_handling',
 'gk_kicking',
 'gk_positioning',
 'gk_reflexes']

In [21]:
player_attributes = player_attributes.dropna()
players = players.dropna()

In [22]:
players.count(), player_attributes.count()

(11060, 181265)

In [24]:
from pyspark.sql.functions import udf

In [26]:
year_extract_udf = udf(lambda data: data.split('-')[0])
player_attributes = player_attributes.withColumn(
    'year',
    year_extract_udf(player_attributes.date))

In [27]:
player_attributes = player_attributes.drop('date')

In [28]:
player_attributes.columns

['player_fifa_api_id',
 'player_api_id',
 'overall_rating',
 'finishing',
 'heading_accuracy',
 'volleys',
 'dribbling',
 'curve',
 'free_kick_accuracy',
 'long_passing',
 'ball_control',
 'acceleration',
 'agility',
 'reactions',
 'shot_power',
 'stamina',
 'strength',
 'long_shots',
 'interceptions',
 'positioning',
 'vision',
 'penalties',
 'marking',
 'standing_tackle',
 'sliding_tackle',
 'gk_diving',
 'gk_handling',
 'gk_kicking',
 'gk_positioning',
 'gk_reflexes',
 'year']

In [29]:
pa_2016 = player_attributes.filter(player_attributes.year==2016)

In [30]:
pa_2016.count()

14098

In [31]:
pa_2016.select(pa_2016.player_api_id)\
    .distinct()\
    .count()

5586

In [32]:
pa_striker_2016 = pa_2016.groupBy('player_api_id')\
    .agg({
        'finishing': 'avg',
        'shot_power': 'avg',
        'acceleration': 'avg',
    })

In [33]:
pa_striker_2016.count()

5586

In [34]:
pa_striker_2016.show(5)

+-------------+-----------------+-----------------+---------------+
|player_api_id|   avg(finishing)|avg(acceleration)|avg(shot_power)|
+-------------+-----------------+-----------------+---------------+
|       309726|75.44444444444444|74.11111111111111|           76.0|
|        26112|             53.0|             51.0|           76.0|
|        38433|            68.25|             74.0|           74.0|
|       295060|             25.0|             62.0|           40.0|
|       161396|             29.0|             72.0|           69.0|
+-------------+-----------------+-----------------+---------------+
only showing top 5 rows



In [40]:
pa_striker_2016 = pa_striker_2016.withColumnRenamed('avg(finishing)', 'finishing')\
                                .withColumnRenamed('avg(shot_power)', 'shot_power')\
                                .withColumnRenamed('avg(acceleration)', 'acceleration')

In [41]:
weight_finishing = 1
weight_shot_power = 2
weight_acceleration = 1

total_weight = weight_finishing + weight_shot_power + weight_acceleration

In [42]:
strikers = pa_striker_2016.withColumn('striker_grade',
                                        (pa_striker_2016.finishing * weight_finishing + \
                                         pa_striker_2016.shot_power * weight_shot_power + \
                                        pa_striker_2016.acceleration * weight_acceleration)/ \
                                         total_weight)

In [43]:
strikers = strikers.drop('finishing', 'acceleration', 'shot_power')

In [44]:
strikers = strikers.filter(strikers.striker_grade > 70)\
                    .sort(strikers.striker_grade.desc())

In [45]:
strikers.show(20)

+-------------+-----------------+
|player_api_id|    striker_grade|
+-------------+-----------------+
|        20276|            89.25|
|        37412|             89.0|
|        38817|            88.75|
|        32118|            88.25|
|        31921|             87.0|
|        30834|            86.75|
|       303824|85.10714285714286|
|       129944|             85.0|
|       158263|            84.75|
|       150565|            84.75|
|        25759|84.66666666666667|
|       156726|             84.5|
|       169193|          84.4375|
|       286119|84.42857142857143|
|        30348|           84.375|
|        93447|            84.25|
|        50047|            84.25|
|        46509|            84.25|
|       178812|             84.0|
|       181276|             84.0|
+-------------+-----------------+
only showing top 20 rows



In [54]:
strikers.count(), players.count()

(1609, 11060)

In [55]:
striker_details = players.join(strikers, players.player_api_id == strikers.player_api_id)

In [58]:
striker_details.columns

['player_api_id',
 'player_name',
 'birthday',
 'height',
 'weight',
 'player_api_id',
 'striker_grade']

In [59]:
striker_details.show(10)

+-------------+--------------------+-------------------+------+------+-------------+-----------------+
|player_api_id|         player_name|           birthday|height|weight|player_api_id|    striker_grade|
+-------------+--------------------+-------------------+------+------+-------------+-----------------+
|        20276|                Hulk|1986-07-25 00:00:00|180.34|   187|        20276|            89.25|
|        37412|       Sergio Aguero|1988-06-02 00:00:00|172.72|   163|        37412|             89.0|
|        38817|        Carlos Tevez|1984-02-05 00:00:00|172.72|   157|        38817|            88.75|
|        32118|      Lukas Podolski|1985-06-04 00:00:00|182.88|   183|        32118|            88.25|
|        31921|         Gareth Bale|1989-07-16 00:00:00|182.88|   163|        31921|             87.0|
|        30834|        Arjen Robben|1984-01-23 00:00:00|180.34|   176|        30834|            86.75|
|       303824|       Memphis Depay|1994-02-13 00:00:00|175.26|   172|   

In [60]:
striker_details = players.join(strikers, ['player_api_id'])

In [61]:
striker_details.columns

['player_api_id',
 'player_name',
 'birthday',
 'height',
 'weight',
 'striker_grade']

In [62]:
striker_details.select('player_name', 'striker_grade')\
                .coalesce(1)\
                .write\
                .option('header', 'true')\
                .csv('striker_grade.csv')

In [66]:
striker_details.select('player_name', 'striker_grade')\
                .write\
                .json('striker_grade.json')

In [68]:
valuesA = [('John', 100000), ('James', 150000), ('Emily', 65000), ('Nina', 200000)]
tableA = spark.createDataFrame(valuesA, ['name', 'salary'])

In [69]:
tableA.show()

+-----+------+
| name|salary|
+-----+------+
| John|100000|
|James|150000|
|Emily| 65000|
| Nina|200000|
+-----+------+



In [72]:
valuesB = [('James', 2), ('Emily', 3), ('Darth Vader', 5), ('Princess Leia', 6)]
tableB = spark.createDataFrame(valuesB, ['name', 'employee_id'])

In [73]:
tableB.show()

+-------------+-----------+
|         name|employee_id|
+-------------+-----------+
|        James|          2|
|        Emily|          3|
|  Darth Vader|          5|
|Princess Leia|          6|
+-------------+-----------+



In [74]:
inner_join =tableA.join(tableB, tableA.name == tableB.name)
inner_join.show()

+-----+------+-----+-----------+
| name|salary| name|employee_id|
+-----+------+-----+-----------+
|James|150000|James|          2|
|Emily| 65000|Emily|          3|
+-----+------+-----+-----------+



In [75]:
left_join =tableA.join(tableB, tableA.name == tableB.name, how = 'left')
left_join.show()

+-----+------+-----+-----------+
| name|salary| name|employee_id|
+-----+------+-----+-----------+
|James|150000|James|          2|
| John|100000| null|       null|
|Emily| 65000|Emily|          3|
| Nina|200000| null|       null|
+-----+------+-----+-----------+



In [76]:
left_join =tableA.join(tableB, tableA.name == tableB.name, how = 'right')
left_join.show()

+-----+------+-------------+-----------+
| name|salary|         name|employee_id|
+-----+------+-------------+-----------+
|James|150000|        James|          2|
| null|  null|Princess Leia|          6|
|Emily| 65000|        Emily|          3|
| null|  null|  Darth Vader|          5|
+-----+------+-------------+-----------+



In [78]:
left_join =tableA.join(tableB, tableA.name == tableB.name, how = 'outer')
left_join.show()

+-----+------+-------------+-----------+
| name|salary|         name|employee_id|
+-----+------+-------------+-----------+
|James|150000|        James|          2|
| John|100000|         null|       null|
| null|  null|Princess Leia|          6|
|Emily| 65000|        Emily|          3|
| Nina|200000|         null|       null|
| null|  null|  Darth Vader|          5|
+-----+------+-------------+-----------+



In [79]:
left_join =tableA.join(tableB, tableA.name == tableB.name, how = 'full')
left_join.show()

+-----+------+-------------+-----------+
| name|salary|         name|employee_id|
+-----+------+-------------+-----------+
|James|150000|        James|          2|
| John|100000|         null|       null|
| null|  null|Princess Leia|          6|
|Emily| 65000|        Emily|          3|
| Nina|200000|         null|       null|
| null|  null|  Darth Vader|          5|
+-----+------+-------------+-----------+

