In [1]:
from pyspark.sql import SparkSession
from pyspark import SparkConf

sparkConf = SparkConf()
sparkConf.setMaster("spark://spark-master:7077")
sparkConf.setAppName("Exercise_1_lab8_DE2")
sparkConf.set("spark.driver.memory", "2g")
sparkConf.set("spark.executor.cores", "1")
sparkConf.set("spark.driver.cores", "1")
# create the spark session, which is the entry point to Spark SQL engine.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()
# Load data from BigQuery.

conf = spark.sparkContext._jsc.hadoopConfiguration()
conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")

gsc_file_path = 'gs://data_de2/transfers.csv'  #  use your gcp bucket name. Also upload sales.csv first

df_transfers = spark.read.format("csv").option("header", "true").option("delimiter", ",") \
       .load(gsc_file_path)
df_transfers.printSchema()
df_transfers.show(5)

df_players = spark.read \
  .format("bigquery") \
  .load("de2024-436414.transfers_kaggle.players")    # project_id.datatset.tablename. Use your project id
df_players.printSchema()
df_players.show(4)



root
 |-- player_id: string (nullable = true)
 |-- transfer_date: string (nullable = true)
 |-- transfer_season: string (nullable = true)
 |-- from_club_id: string (nullable = true)
 |-- to_club_id: string (nullable = true)
 |-- from_club_name: string (nullable = true)
 |-- to_club_name: string (nullable = true)
 |-- transfer_fee: string (nullable = true)
 |-- market_value_in_eur: string (nullable = true)
 |-- player_name: string (nullable = true)

+---------+-------------+---------------+------------+----------+--------------+---------------+------------+-------------------+---------------+
|player_id|transfer_date|transfer_season|from_club_id|to_club_id|from_club_name|   to_club_name|transfer_fee|market_value_in_eur|    player_name|
+---------+-------------+---------------+------------+----------+--------------+---------------+------------+-------------------+---------------+
|   195778|   2026-06-30|          25/26|          79|        27| VfB Stuttgart|  Bayern Munich|       0.000|

In [17]:
df_transfers.columns

['player_id',
 'transfer_date',
 'transfer_season',
 'from_club_id',
 'to_club_id',
 'from_club_name',
 'to_club_name',
 'transfer_fee',
 'market_value_in_eur',
 'player_name']

**Which players hold the highest and second-highest transfer fees for each club, and what are their details?**

In [16]:
# first, only select the columns from df_players that are not in df_transfers and insightful
df_players_new = df_players.select("player_id", "player_code", "country_of_birth", "country_of_citizenship", 
                                   "date_of_birth", "sub_position", "position", "foot", "height_in_cm", 
                                   "agent_name", "image_url") 

# Turn date_of_birth into an object without timestamp
df_players_new = df_players_new.withColumn("date_of_birth", to_date(col("date_of_birth")))

df_players_new.show(15)



+---------+--------------------+----------------+----------------------+-------------+------------+--------+-----+------------+----------------+--------------------+
|player_id|         player_code|country_of_birth|country_of_citizenship|date_of_birth|sub_position|position| foot|height_in_cm|      agent_name|           image_url|
+---------+--------------------+----------------+----------------------+-------------+------------+--------+-----+------------+----------------+--------------------+
|    56607|         anco-jansen|     Netherlands|           Netherlands|   1989-03-09|        NULL| Missing|right|         186|            NULL|https://img.a.tra...|
|   231165|      fotis-doumanis|          Greece|                Greece|   1993-10-28|        NULL| Missing|right|         173|            NULL|https://img.a.tra...|
|   727111|anastasios-papado...|            NULL|                Greece|   2002-03-15|        NULL| Missing|right|         180|            NULL|https://img.a.tra...|
|   

In [32]:
from pyspark.sql.functions import *
from pyspark.sql import Window

# Join the players and transfers datasets using a left join
join_expression = df_players_new["player_id"] == df_transfers["player_id"]
players_merged = df_transfers.join(df_players_new, join_expression, "left").drop("rid")

# We found some NAN values in the transfer fee section, we will remove these rows because they are not important
players_merged_clean = players_merged.filter(col("transfer_fee").isNotNull())


players_merged_clean.printSchema()


root
 |-- player_id: string (nullable = true)
 |-- transfer_date: string (nullable = true)
 |-- transfer_season: string (nullable = true)
 |-- from_club_id: string (nullable = true)
 |-- to_club_id: string (nullable = true)
 |-- from_club_name: string (nullable = true)
 |-- to_club_name: string (nullable = true)
 |-- transfer_fee: string (nullable = true)
 |-- market_value_in_eur: string (nullable = true)
 |-- player_name: string (nullable = true)
 |-- player_id: long (nullable = true)
 |-- player_code: string (nullable = true)
 |-- country_of_birth: string (nullable = true)
 |-- country_of_citizenship: string (nullable = true)
 |-- date_of_birth: date (nullable = true)
 |-- sub_position: string (nullable = true)
 |-- position: string (nullable = true)
 |-- foot: string (nullable = true)
 |-- height_in_cm: long (nullable = true)
 |-- agent_name: string (nullable = true)
 |-- image_url: string (nullable = true)



We can see in the shema above that a lot of these variables that should be integers are strings. Let's fix that.

In [33]:
# Convert transfer_fee, market_value_in_eur, and height_in_cm to integer type
players_merged_clean = players_merged_clean.withColumn("transfer_fee", col("transfer_fee").cast("int")) \
                                           .withColumn("market_value_in_eur", col("market_value_in_eur").cast("int")) \
                                           .withColumn("height_in_cm", col("height_in_cm").cast("int"))

players_merged_clean.printSchema()


root
 |-- player_id: string (nullable = true)
 |-- transfer_date: string (nullable = true)
 |-- transfer_season: string (nullable = true)
 |-- from_club_id: string (nullable = true)
 |-- to_club_id: string (nullable = true)
 |-- from_club_name: string (nullable = true)
 |-- to_club_name: string (nullable = true)
 |-- transfer_fee: integer (nullable = true)
 |-- market_value_in_eur: integer (nullable = true)
 |-- player_name: string (nullable = true)
 |-- player_id: long (nullable = true)
 |-- player_code: string (nullable = true)
 |-- country_of_birth: string (nullable = true)
 |-- country_of_citizenship: string (nullable = true)
 |-- date_of_birth: date (nullable = true)
 |-- sub_position: string (nullable = true)
 |-- position: string (nullable = true)
 |-- foot: string (nullable = true)
 |-- height_in_cm: integer (nullable = true)
 |-- agent_name: string (nullable = true)
 |-- image_url: string (nullable = true)



In [34]:
# Calculate the average fee per club for comparison
avg_fee_club = players_merged_clean.groupBy("to_club_name").agg(avg("transfer_fee").alias("avg_fee_club"))

# We exclude all players that went on a free transfer (transer_fee == 0), as this is not an expenditure for the club.
players_merged_clean = players_merged_clean.filter(col("transfer_fee") > 0)

# Exclude players who went on a free transfer (transfer_fee == 0)
players_merged_clean = players_merged_clean.filter(col("transfer_fee") > 0)

# Add the average fee to the players_merged DataFrame
players_with_avg_fee = players_merged_clean.join(avg_fee_club, on="to_club_name", how="left")


# Define a window to rank players by transfer_fee within each club, ordering by transfer_fee in descending order
highest_fee_club = Window.partitionBy("to_club_name").orderBy(col("transfer_fee").desc(), col("player_name"))

# Add a rank column to determine the order of players by transfer_fee
players_merged_ranked = players_with_avg_fee.withColumn("rank_fee", row_number().over(highest_fee_club))


top_two_expenditures_table = players_merged_ranked.where((col("rank_fee") == 1) | (col("rank_fee") == 2) ).select("*")
# top_two_expenditures_table.show(20)

top_two_expenditures_table2 = top_two_expenditures_table.withColumn("Record", when(col("rank_fee") == 1, "Most Expensive")
                                                          .otherwise("2nd Most expensive")).select("player_name", "to_club_name", "country_of_birth","date_of_birth" , "position", "foot", "height_in_cm",  "Record", "transfer_fee", "avg_fee_club")

top_two_expenditures_table2 = top_two_expenditures_table2.orderBy(col("to_club_name"), col("transfer_fee").desc())
top_two_expenditures_table2.show(20)


+--------------------+--------------+----------------+-------------+----------+-----+------------+------------------+------------+------------------+
|         player_name|  to_club_name|country_of_birth|date_of_birth|  position| foot|height_in_cm|            Record|transfer_fee|      avg_fee_club|
+--------------------+--------------+----------------+-------------+----------+-----+------------+------------------+------------+------------------+
|     Philipp Hofmann|1.FC K'lautern|         Germany|   1993-03-30|    Attack| left|         195|    Most Expensive|     1000000|          110000.0|
|        Simon Zoller|1.FC K'lautern|         Germany|   1991-06-26|    Attack|right|         179|2nd Most expensive|      630000|          110000.0|
|        Jhon Córdoba|     1.FC Köln|        Colombia|   1993-05-11|    Attack|right|         188|    Most Expensive|    17000000|1216901.4084507043|
|  Sebastiaan Bornauw|     1.FC Köln|         Belgium|   1999-03-22|  Defender|right|         191|2n

In [35]:
# Use the Cloud Storage bucket for temporary BigQuery export data the connector uses.
bucket = "temp_de2"
spark.conf.set('temporaryGcsBucket', bucket)
# Saving the data to BigQuery
top_two_expenditures_table2.write.format('bigquery') \
  .option('table', 'de2024-436414.transfers_kaggle.record_expenditure_clubs') \
  .mode("overwrite") \
  .save()

In [42]:
# Stop the spark context
spark.stop()