In [1]:
import pyspark
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder \
    .master("local[*]") \
    .appName('fifa-transformation') \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


24/04/12 12:53:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
df = spark.read.option("basePath", "dataset/").parquet('dataset/*/*')
df.columns

                                                                                

['sofifa_id',
 'short_name',
 'overall',
 'age',
 'dob',
 'height_cm',
 'weight_kg',
 'nationality_name',
 'club_name',
 'league_name',
 'league_level',
 'potential',
 'value_eur',
 'wage_eur',
 'player_positions',
 'gender',
 'unique_id',
 'version']

In [4]:
print((df.count(), len(df.columns)))
df.show()

(144323, 18)
+---------+-------------+-------+---+-------------------+---------+---------+----------------+---------+-----------+------------+---------+---------+--------+----------------+------+-----------+-------+
|sofifa_id|   short_name|overall|age|                dob|height_cm|weight_kg|nationality_name|club_name|league_name|league_level|potential|value_eur|wage_eur|player_positions|gender|  unique_id|version|
+---------+-------------+-------+---+-------------------+---------+---------+----------------+---------+-----------+------------+---------+---------+--------+----------------+------+-----------+-------+
|   227246|    L. Bronze|     92| 29|1991-10-28 00:00:00|      171|       67|         England|  unknown|       null|        null|       92|     null|    null|          RB, CM|female|25769820906|     22|
|   227316|    W. Renard|     92| 30|1990-07-20 00:00:00|      187|       70|          France|  unknown|       null|        null|       92|     null|    null|              CB|

In [5]:
# TOTAL PLAYERS IN EACH FIFA BY GENDER

df.groupby('version', 'gender').count().orderBy('version', 'gender').show() 

+-------+------+-----+
|version|gender|count|
+-------+------+-----+
|     15|  male|16155|
|     16|female|  248|
|     16|  male|15623|
|     17|female|  299|
|     17|  male|17596|
|     18|female|  317|
|     18|  male|17954|
|     19|female|  299|
|     19|  male|18085|
|     20|female|  345|
|     20|  male|18483|
|     21|female|  345|
|     21|  male|18944|
|     22|female|  391|
|     22|  male|19239|
+-------+------+-----+



In [7]:
# Display the players with overall score > 90 for each fifa

df.groupby('version', 'gender', 'overall').count().filter("overall > 90").show()

+-------+------+-------+-----+
|version|gender|overall|count|
+-------+------+-------+-----+
|     18|  male|     91|    1|
|     20|female|     93|    1|
|     22|female|     92|    3|
|     20|female|     92|    2|
|     18|female|     91|    1|
|     20|  male|     94|    1|
|     22|  male|     92|    1|
|     22|female|     91|    1|
|     20|  male|     92|    1|
|     20|female|     91|    3|
|     18|female|     92|    1|
|     20|  male|     93|    1|
|     18|  male|     94|    1|
|     22|  male|     91|    5|
|     22|  male|     93|    1|
|     18|  male|     93|    1|
|     18|  male|     92|    3|
|     20|  male|     91|    3|
|     21|  male|     91|    4|
|     21|  male|     93|    1|
+-------+------+-------+-----+
only showing top 20 rows



In [8]:
# Create temp views for queries
df.createOrReplaceTempView("fifa")

In [9]:
# TOP PLAYER IN EACH FIFA

df_top_players = spark.sql("""

SELECT version, short_name, overall
FROM fifa WHERE (version,overall) IN
( SELECT version, MAX(overall)
  FROM fifa
  GROUP BY version
)
ORDER BY version

""")


df_top_players.show()

+-------+-----------------+-------+
|version|       short_name|overall|
+-------+-----------------+-------+
|     15|         L. Messi|     93|
|     16|         L. Messi|     94|
|     17|Cristiano Ronaldo|     94|
|     18|Cristiano Ronaldo|     94|
|     19|Cristiano Ronaldo|     94|
|     19|         L. Messi|     94|
|     20|         L. Messi|     94|
|     21|       M. Rapinoe|     93|
|     21|         L. Messi|     93|
|     22|         L. Messi|     93|
+-------+-----------------+-------+



In [37]:
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, col, row_number

# Transformation to get, for a specific version and gender, sum of the top 11 players for each team
df_fifa_n = df.where((col('version') == 17) & (col('gender') == 'male'))

n = 11
df_best11_per_team = df_fifa_n.select(col('*'), row_number().over(window).alias('club_num')) \
    .where(col('club_num') <= n)

df_best11_per_team = df_best11_per_team.groupBy('version', 'club_name').sum('overall').orderBy('sum(overall)', ascending=False).show(10)

# df_best5_per_teams.select(['club_name', 'league_name', 'short_name', 'overall', 'gender', 'version']).show(500)

+-------+-------------------+------------+
|version|          club_name|sum(overall)|
+-------+-------------------+------------+
|     17|     Real Madrid CF|         967|
|     17|  FC Bayern München|         965|
|     17|       FC Barcelona|         964|
|     17|           Juventus|         945|
|     17|    Manchester City|         936|
|     17|            Chelsea|         935|
|     17|  Manchester United|         934|
|     17|            Arsenal|         933|
|     17|Paris Saint-Germain|         933|
|     17| Atlético de Madrid|         930|
+-------+-------------------+------------+
only showing top 10 rows



In [39]:
# Top wage year by year

df_top_players = spark.sql("""

SELECT version, short_name, wage_eur
FROM fifa WHERE (version, wage_eur) IN
( SELECT version, MAX(wage_eur)
  FROM fifa
  GROUP BY version
)
ORDER BY version

""")


df_top_players.show()

+-------+-----------------+--------+
|version|       short_name|wage_eur|
+-------+-----------------+--------+
|     15|         L. Messi|  550000|
|     16|         L. Messi|  550000|
|     17|         L. Messi|  575000|
|     17|Cristiano Ronaldo|  575000|
|     18|         L. Messi|  575000|
|     18|Cristiano Ronaldo|  575000|
|     19|         L. Messi|  575000|
|     20|         L. Messi|  560000|
|     21|         L. Messi|  560000|
|     22|     K. De Bruyne|  350000|
|     22|       K. Benzema|  350000|
+-------+-----------------+--------+



In [58]:
# Top n wages for a certain year

df_top_players = spark.sql("""

SELECT version, wage_eur, short_name
FROM fifa 
WHERE version = 17
ORDER BY wage_eur DESC
LIMIT 10;
 """)


df_top_players.show()

+-------+--------+-----------------+
|version|wage_eur|       short_name|
+-------+--------+-----------------+
|     17|  575000|Cristiano Ronaldo|
|     17|  575000|         L. Messi|
|     17|  525000|        L. Suárez|
|     17|  450000|           Neymar|
|     17|  375000|          G. Bale|
|     17|  325000|          Iniesta|
|     17|  325000|        S. Agüero|
|     17|  325000|        L. Modrić|
|     17|  325000|         T. Kroos|
|     17|  300000|   Z. Ibrahimović|
+-------+--------+-----------------+



In [61]:
# Top n wages for a certain year

df_top_players_by_year = spark.sql("""

SELECT version, wage_eur, short_name
FROM fifa 
WHERE version = 17
ORDER BY overall DESC
LIMIT 10;
 """).show()

df.createOrReplaceTempView("top_players_year")

spark.sql("""
SELECT SUM
FROM top_players_year 
WHERE version = 17
ORDER BY overall DESC
LIMIT 10;
 """)

df_top_players.show()

+-------+--------+-----------------+
|version|wage_eur|       short_name|
+-------+--------+-----------------+
|     17|  575000|Cristiano Ronaldo|
|     17|  575000|         L. Messi|
|     17|  225000|         M. Neuer|
|     17|  525000|        L. Suárez|
|     17|  450000|           Neymar|
|     17|    null|         C. Lloyd|
|     17|    null|       M. Rapinoe|
|     17|  300000|   Z. Ibrahimović|
|     17|  250000|   R. Lewandowski|
|     17|  200000|       J. Boateng|
+-------+--------+-----------------+



In [None]:
spark.stop()