#Silver Layer: Data Transformation and Cleaning

The Silver Layer focuses on transforming and cleaning the raw data collected from the previous Bronze Layer. This phase involves processing the data to make it more structured and suitable for analysis. The goal of this layer is to prepare the data for deeper analysis and insights extraction in the subsequent layers of the project.

In [0]:
import pandas as pd
from pyspark.sql import SparkSession

In [0]:
# Read a Parquet file
df_spark_loaded = spark.read.format("parquet").load("/FileStore/tables/messi_raw_data_parquet")

In the cell above, we converted the data saved in Parquet format into a Spark DataFrame. During the testing phases of developing this MVP, I noticed that some data were duplicated. To address this issue from the beginning, I returned here in the begining of the data to remove duplicates. At the same time, I took the opportunity to rename the column 'id' as it is a reserved word in SQL.

In [0]:
df_spark_loaded = df_spark_loaded.withColumnRenamed("id", "id_event")

In [0]:
# Remove duplicates
df_spark_loaded = df_spark_loaded.dropDuplicates(["id_event"])

I created some temporary tables for testing purposes and have kept them registered here.

In [0]:
# Create a temporary table called "messi_temp_silver"
df_spark_loaded.createOrReplaceTempView("messi_temp_silver")

In [0]:
spark.sql("SHOW TABLES").show()

+--------+--------------------+-----------+
|database|           tableName|isTemporary|
+--------+--------------------+-----------+
| default|          messi_data|      false|
|        |messi_cleaned_silver|       true|
|        |   messi_temp_silver|       true|
|        |  messi_temp_silver2|       true|
|        |  messi_temp_silver3|       true|
+--------+--------------------+-----------+



In [0]:
%sql
SELECT * FROM messi_temp_silver WHERE player_name = 'Lionel Andrés Messi Cuccittini' LIMIT 5

id_event,index,period,timestamp,minute,second,possession,duration,type_id,type_name,possession_team_id,possession_team_name,play_pattern_id,play_pattern_name,team_id,team_name,tactics_formation,location,player_id,player_name,position_id,position_name,pass_recipient_id,pass_recipient_name,pass_length,pass_angle,pass_height_id,pass_height_name,pass_end_location,pass_body_part_id,pass_body_part_name,pass_type_id,pass_type_name,carry_end_location,under_pressure,pass_outcome_id,pass_outcome_name,ball_receipt_outcome_id,ball_receipt_outcome_name,counterpress,dribble_outcome_id,dribble_outcome_name,foul_committed_advantage,foul_won_advantage,duel_type_id,duel_type_name,duel_outcome_id,duel_outcome_name,pass_aerial_won,pass_switch,pass_technique_id,pass_technique_name,pass_through_ball,clearance_body_part_id,clearance_body_part_name,clearance_left_foot,out,off_camera,pass_deflected,pass_cross,clearance_head,pass_outswinging,interception_outcome_id,interception_outcome_name,pass_assisted_shot_id,pass_shot_assist,shot_one_on_one,shot_statsbomb_xg,shot_end_location,shot_key_pass_id,shot_type_id,shot_type_name,shot_outcome_id,shot_outcome_name,shot_technique_id,shot_technique_name,shot_body_part_id,shot_body_part_name,goalkeeper_end_location,goalkeeper_position_id,goalkeeper_position_name,goalkeeper_type_id,goalkeeper_type_name,foul_won_defensive,foul_committed_card_id,foul_committed_card_name,shot_first_time,goalkeeper_body_part_id,goalkeeper_body_part_name,goalkeeper_outcome_id,goalkeeper_outcome_name,goalkeeper_technique_id,goalkeeper_technique_name,dribble_overrun,ball_recovery_offensive,pass_no_touch,clearance_right_foot,ball_recovery_recovery_failure,dribble_nutmeg,pass_cut_back,miscontrol_aerial_won,shot_open_goal,shot_aerial_won,clearance_aerial_won,bad_behaviour_card_id,bad_behaviour_card_name,substitution_outcome_id,substitution_outcome_name,substitution_replacement_id,substitution_replacement_name,pass_inswinging,foul_committed_offensive,foul_committed_type_id,foul_committed_type_name,50_50_outcome_id,50_50_outcome_name,pass_straight,pass_goal_assist,block_offensive,clearance_other,block_deflection,shot_deflected,injury_stoppage_in_chain,pass_miscommunication,foul_committed_penalty,foul_won_penalty,goalkeeper_punched_out,shot_saved_off_target,goalkeeper_shot_saved_off_target,shot_saved_to_post,goalkeeper_shot_saved_to_post,block_save_block,dribble_no_touch,shot_redirect,shot_follows_dribble,goalkeeper_success_in_play,goalkeeper_lost_in_play,half_start_late_video_start,player_off_permanent,goalkeeper_lost_out,half_end_early_video_end,goalkeeper_success_out,goalkeeper_saved_to_post
0000b8bf-8a71-423e-b260-6a681e9f549d,653,1,00:14:09.711,14,9,39,0.5391,43,Carry,217,Barcelona,1,Regular Play,217,Barcelona,,"List(57.1, 78.2)",5503.0,Lionel Andrés Messi Cuccittini,17.0,Right Wing,,,,,,,,,,,,"List(56.7, 77.6)",True,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
000f9ca1-c0fe-4178-8809-64aea478c73d,3319,2,00:36:58.023,81,58,140,1.186096,30,Pass,217,Barcelona,3,From Free Kick,217,Barcelona,,"List(74.6, 30.2)",5503.0,Lionel Andrés Messi Cuccittini,22.0,Right Center Forward,4447.0,Martin Braithwaite Christensen,13.869751,-0.60600066,1.0,Ground Pass,"List(86.0, 22.3)",38.0,Left Foot,,,,,9.0,Incomplete,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
00112783-b357-4515-8311-5a891fa4e21e,822,1,00:18:42.115,18,42,47,0.431666,30,Pass,217,Barcelona,7,From Goal Kick,217,Barcelona,,"List(78.7, 77.3)",5503.0,Lionel Andrés Messi Cuccittini,17.0,Right Wing,4447.0,Martin Braithwaite Christensen,7.752419,-0.5271459,3.0,High Pass,"List(85.4, 73.4)",38.0,Left Foot,,,,,9.0,Incomplete,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
001175d7-9486-4ccc-85e1-426a4fca7fff,2603,2,00:16:29.754,61,29,129,,42,Ball Receipt*,217,Barcelona,6,From Counter,217,Barcelona,,"List(58.4, 77.3)",5503.0,Lionel Andrés Messi Cuccittini,23.0,Center Forward,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
0011df67-c605-4df2-8dc8-5247d8abde37,2948,2,00:25:32.605,70,32,181,,42,Ball Receipt*,217,Barcelona,1,Regular Play,217,Barcelona,,"List(69.2, 68.7)",5503.0,Lionel Andrés Messi Cuccittini,17.0,Right Wing,,,,,,,,,,,,,True,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [0]:
spark.table("messi_temp_silver").printSchema()


root
 |-- id_event: string (nullable = true)
 |-- index: long (nullable = true)
 |-- period: long (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- minute: long (nullable = true)
 |-- second: long (nullable = true)
 |-- possession: long (nullable = true)
 |-- duration: double (nullable = true)
 |-- type_id: long (nullable = true)
 |-- type_name: string (nullable = true)
 |-- possession_team_id: long (nullable = true)
 |-- possession_team_name: string (nullable = true)
 |-- play_pattern_id: long (nullable = true)
 |-- play_pattern_name: string (nullable = true)
 |-- team_id: long (nullable = true)
 |-- team_name: string (nullable = true)
 |-- tactics_formation: double (nullable = true)
 |-- location: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- player_id: double (nullable = true)
 |-- player_name: string (nullable = true)
 |-- position_id: double (nullable = true)
 |-- position_name: string (nullable = true)
 |-- pass_recipient_id: double (n

I replaced **.describe().show()** and **.summary().show()** by saving the result into a DataFrames and presenting it in this format because the table has many columns, making it difficult to interpret the information visually.

In [0]:
# Loading data from a temporary table
df_table_columns = spark.sql("SHOW COLUMNS IN messi_temp_silver")

#shape
num_columns = df_table_columns.count()
num_rows = spark.sql("SELECT COUNT(*) AS count FROM messi_temp_silver").first()["count"]

print(f"Columns: {num_columns} x Rows: {num_rows}")

Columns: 143 x Rows: 1976777


In [0]:
# Visualizing all columns of this df
pd.set_option('display.max_columns', None)

In [0]:
# Statistics description
df_description = spark.table("messi_temp_silver").describe()
df_description = df_description.toPandas()

df_description


Unnamed: 0,summary,id_event,index,period,timestamp,minute,second,possession,duration,type_id,type_name,possession_team_id,possession_team_name,play_pattern_id,play_pattern_name,team_id,team_name,tactics_formation,player_id,player_name,position_id,position_name,pass_recipient_id,pass_recipient_name,pass_length,pass_angle,pass_height_id,pass_height_name,pass_body_part_id,pass_body_part_name,pass_type_id,pass_type_name,pass_outcome_id,pass_outcome_name,ball_receipt_outcome_id,ball_receipt_outcome_name,dribble_outcome_id,dribble_outcome_name,duel_type_id,duel_type_name,duel_outcome_id,duel_outcome_name,pass_technique_id,pass_technique_name,clearance_body_part_id,clearance_body_part_name,interception_outcome_id,interception_outcome_name,pass_assisted_shot_id,shot_statsbomb_xg,shot_key_pass_id,shot_type_id,shot_type_name,shot_outcome_id,shot_outcome_name,shot_technique_id,shot_technique_name,shot_body_part_id,shot_body_part_name,goalkeeper_position_id,goalkeeper_position_name,goalkeeper_type_id,goalkeeper_type_name,foul_committed_card_id,foul_committed_card_name,goalkeeper_body_part_id,goalkeeper_body_part_name,goalkeeper_outcome_id,goalkeeper_outcome_name,goalkeeper_technique_id,goalkeeper_technique_name,bad_behaviour_card_id,bad_behaviour_card_name,substitution_outcome_id,substitution_outcome_name,substitution_replacement_id,substitution_replacement_name,foul_committed_type_id,foul_committed_type_name,50_50_outcome_id,50_50_outcome_name
0,count,1976777,1976777.0,1976777.0,1976777,1976777.0,1976777.0,1976777.0,1454132.0,1976777.0,1976777,1976777.0,1976777,1976777.0,1976777,1976777.0,1976777,2016.0,1970092.0,1970092,1970092.0,1970092,522301.0,522301,552987.0,552987.0,552987.0,552987,525920.0,525920,95287.0,95287,98516.0,98516,67815.0,67815,23290.0,23290,32785.0,32785,20556.0,20556,8125.0,8125,21878.0,21878,10845.0,10845,9295,12986.0,9295,12986.0,12986,12986.0,12986,12986.0,12986,12986.0,12986,13020.0,13020,15289.0,15289,2121.0,2121,3996.0,3996,7548.0,7548,5334.0,5334,524.0,524,3114.0,3114,3114.0,3114,1083.0,1083,708.0,708
1,mean,,1897.436162500879,1.4960756827907244,,44.52193090065293,29.324156442532463,93.46939336101138,1.2315029634025856,32.83605485090124,,263.01778197540744,,2.8617249188957583,,270.46033012322584,,3109.504464285714,11781.592706330466,,11.261167498776707,,11331.91978954664,,20.430384714027717,0.008293696834005,1.483423660954055,,39.8842561606328,,64.93133376011419,,19.076718502578263,,9.0,,8.387591240875912,,10.626994052158,,12.042031523642732,,106.57255384615384,,38.215056220861136,,10.292392807745506,,,0.1152503571537887,,85.12236254427846,,98.17272447250886,,92.88965039273064,,38.94378561527799,,43.84831029185868,,31.310550068676825,,6.91984912776992,,36.72697697697698,,42.47191308956015,,45.58136482939633,,6.944656488549619,,102.93770070648684,,13688.49421965318,,23.23822714681441,,2.038135593220339,
2,stddev,,1111.383650187602,0.499984725961907,,26.889945442483597,17.3833533494916,55.838345877925256,2.0130726331521758,11.814214968586986,,179.95726340309403,,2.212915593662428,,193.2129927971178,,7827.833301654504,8721.188664210928,,7.09286246521311,,8535.8468276135,,13.976019099757082,1.5685432985668513,0.7780432611922886,,4.564964609506957,,1.9941188133784848,,23.849013756428494,,0.0,,0.4872109019630775,,0.4836110466644008,,4.892598298512687,,1.7178673926101922,,2.606169175478441,,5.26942684876814,,,0.1523738257014054,,6.614918588580633,,2.1051229115450227,,0.8486946198608467,,1.9522794256081537,,0.507824394218875,,7.055210934205515,,0.3544690100892118,,2.329284209071266,,19.68649558951844,,0.4933815978484019,,0.307319995560021,,0.2417371659283777,,9837.689224801545,,1.297850275792675,,1.18451246781008,
3,min,00000e28-2f19-491b-88b9-1d9077baf034,1.0,1.0,00:00:00.000,0.0,0.0,1.0,-1767.1079,2.0,50/50,205.0,Albacete,1.0,From Corner,205.0,Albacete,343.0,2948.0,Aarón Escandell Banacloche,0.0,Center Attacking Midfield,2948.0,Aarón Escandell Banacloche,0.0,-3.1386428,1.0,Ground Pass,37.0,Drop Kick,61.0,Corner,9.0,Incomplete,9.0,Incomplete,8.0,Complete,10.0,Aerial Lost,4.0,Lost In Play,104.0,Inswinging,37.0,Head,4.0,Lost In Play,00007fd2-738e-4b27-bb68-7bdeffc77ee7,0.00018,0026542a-afae-4d55-9219-9f7a640b2306,61.0,Corner,96.0,Blocked,89.0,Backheel,37.0,Head,42.0,Moving,25.0,Collected,5.0,Red Card,35.0,Both Hands,1.0,Claim,45.0,Diving,5.0,Red Card,102.0,Injury,3023.0,Abdallahi Mohamed Mahmoud,19.0,6 Seconds,1.0,Lost
4,max,fffffd42-a5ff-48dd-a264-1f5e08ef04ef,4806.0,2.0,00:53:40.441,98.0,59.0,273.0,88.8957,43.0,Tactical Shift,1230.0,Xerez,9.0,Regular Play,1230.0,Xerez,42211.0,117025.0,Šime Vrsaljko,25.0,Substitute,105762.0,Šime Vrsaljko,119.19941,3.1415927,3.0,Low Pass,106.0,Right Foot,67.0,Throw-in,77.0,Unknown,9.0,Incomplete,9.0,Incomplete,11.0,Tackle,17.0,Won,108.0,Through Ball,70.0,Right Foot,17.0,Won,fffd9039-256b-4a7e-895d-8c071f2ca0ed,0.99281865,ffff5a06-d8ba-4a56-9e64-a894aa5a229d,88.0,Penalty,116.0,Wayward,95.0,Volley,70.0,Right Foot,44.0,Set,114.0,Smother,7.0,Yellow Card,41.0,Right Hand,117.0,Won,46.0,Standing,7.0,Yellow Card,103.0,Tactical,117025.0,Óscar de Marcos Arana,24.0,Handball,4.0,Won


In [0]:
# Statistics summary
df_summary = spark.table("messi_temp_silver").summary()
df_summary = df_summary.toPandas()

df_summary

Unnamed: 0,summary,id_event,index,period,timestamp,minute,second,possession,duration,type_id,type_name,possession_team_id,possession_team_name,play_pattern_id,play_pattern_name,team_id,team_name,tactics_formation,player_id,player_name,position_id,position_name,pass_recipient_id,pass_recipient_name,pass_length,pass_angle,pass_height_id,pass_height_name,pass_body_part_id,pass_body_part_name,pass_type_id,pass_type_name,pass_outcome_id,pass_outcome_name,ball_receipt_outcome_id,ball_receipt_outcome_name,dribble_outcome_id,dribble_outcome_name,duel_type_id,duel_type_name,duel_outcome_id,duel_outcome_name,pass_technique_id,pass_technique_name,clearance_body_part_id,clearance_body_part_name,interception_outcome_id,interception_outcome_name,pass_assisted_shot_id,shot_statsbomb_xg,shot_key_pass_id,shot_type_id,shot_type_name,shot_outcome_id,shot_outcome_name,shot_technique_id,shot_technique_name,shot_body_part_id,shot_body_part_name,goalkeeper_position_id,goalkeeper_position_name,goalkeeper_type_id,goalkeeper_type_name,foul_committed_card_id,foul_committed_card_name,goalkeeper_body_part_id,goalkeeper_body_part_name,goalkeeper_outcome_id,goalkeeper_outcome_name,goalkeeper_technique_id,goalkeeper_technique_name,bad_behaviour_card_id,bad_behaviour_card_name,substitution_outcome_id,substitution_outcome_name,substitution_replacement_id,substitution_replacement_name,foul_committed_type_id,foul_committed_type_name,50_50_outcome_id,50_50_outcome_name
0,count,1976777,1976777.0,1976777.0,1976777,1976777.0,1976777.0,1976777.0,1454132.0,1976777.0,1976777,1976777.0,1976777,1976777.0,1976777,1976777.0,1976777,2016.0,1970092.0,1970092,1970092.0,1970092,522301.0,522301,552987.0,552987.0,552987.0,552987,525920.0,525920,95287.0,95287,98516.0,98516,67815.0,67815,23290.0,23290,32785.0,32785,20556.0,20556,8125.0,8125,21878.0,21878,10845.0,10845,9295,12986.0,9295,12986.0,12986,12986.0,12986,12986.0,12986,12986.0,12986,13020.0,13020,15289.0,15289,2121.0,2121,3996.0,3996,7548.0,7548,5334.0,5334,524.0,524,3114.0,3114,3114.0,3114,1083.0,1083,708.0,708
1,mean,,1897.436162500879,1.4960756827907244,,44.52193090065293,29.324156442532463,93.46939336101138,1.2315029634025856,32.83605485090124,,263.01778197540744,,2.8617249188957583,,270.46033012322584,,3109.504464285714,11781.592706330466,,11.261167498776707,,11331.91978954664,,20.430384714027717,0.008293696834005,1.483423660954055,,39.8842561606328,,64.93133376011419,,19.076718502578263,,9.0,,8.387591240875912,,10.626994052158,,12.042031523642732,,106.57255384615384,,38.215056220861136,,10.292392807745506,,,0.1152503571537887,,85.12236254427846,,98.17272447250886,,92.88965039273064,,38.94378561527799,,43.84831029185868,,31.310550068676825,,6.91984912776992,,36.72697697697698,,42.47191308956015,,45.58136482939633,,6.944656488549619,,102.93770070648684,,13688.49421965318,,23.23822714681441,,2.038135593220339,
2,stddev,,1111.383650187602,0.499984725961907,,26.889945442483597,17.3833533494916,55.838345877925256,2.0130726331521758,11.814214968586986,,179.95726340309403,,2.212915593662428,,193.2129927971178,,7827.833301654504,8721.188664210928,,7.09286246521311,,8535.8468276135,,13.976019099757082,1.5685432985668513,0.7780432611922886,,4.564964609506957,,1.9941188133784848,,23.849013756428494,,0.0,,0.4872109019630775,,0.4836110466644008,,4.892598298512687,,1.7178673926101922,,2.606169175478441,,5.26942684876814,,,0.1523738257014054,,6.614918588580633,,2.1051229115450227,,0.8486946198608467,,1.9522794256081537,,0.507824394218875,,7.055210934205515,,0.3544690100892118,,2.329284209071266,,19.68649558951844,,0.4933815978484019,,0.307319995560021,,0.2417371659283777,,9837.689224801545,,1.297850275792675,,1.18451246781008,
3,min,00000e28-2f19-491b-88b9-1d9077baf034,1.0,1.0,00:00:00.000,0.0,0.0,1.0,-1767.1079,2.0,50/50,205.0,Albacete,1.0,From Corner,205.0,Albacete,343.0,2948.0,Aarón Escandell Banacloche,0.0,Center Attacking Midfield,2948.0,Aarón Escandell Banacloche,0.0,-3.1386428,1.0,Ground Pass,37.0,Drop Kick,61.0,Corner,9.0,Incomplete,9.0,Incomplete,8.0,Complete,10.0,Aerial Lost,4.0,Lost In Play,104.0,Inswinging,37.0,Head,4.0,Lost In Play,00007fd2-738e-4b27-bb68-7bdeffc77ee7,0.00018,0026542a-afae-4d55-9219-9f7a640b2306,61.0,Corner,96.0,Blocked,89.0,Backheel,37.0,Head,42.0,Moving,25.0,Collected,5.0,Red Card,35.0,Both Hands,1.0,Claim,45.0,Diving,5.0,Red Card,102.0,Injury,3023.0,Abdallahi Mohamed Mahmoud,19.0,6 Seconds,1.0,Lost
4,25%,,942.0,1.0,,21.0,14.0,45.0,0.3545,30.0,,217.0,,1.0,,217.0,,433.0,5216.0,,5.0,,5216.0,,11.031773,-1.2260236,1.0,,38.0,,63.0,,9.0,,9.0,,8.0,,10.0,,4.0,,105.0,,37.0,,4.0,,,0.03036632,,87.0,,97.0,,93.0,,38.0,,44.0,,30.0,,7.0,,35.0,,15.0,,45.0,,7.0,,103.0,,5558.0,,23.0,,1.0,
5,50%,,1883.0,1.0,,45.0,29.0,92.0,1.032884,39.0,,217.0,,3.0,,217.0,,442.0,6647.0,,11.0,,6400.0,,16.519686,-0.0033112462,1.0,,40.0,,66.0,,9.0,,9.0,,8.0,,11.0,,14.0,,108.0,,37.0,,13.0,,,0.062358446,,87.0,,98.0,,93.0,,38.0,,44.0,,32.0,,7.0,,35.0,,52.0,,46.0,,7.0,,103.0,,6998.0,,24.0,,1.0,
6,75%,,2825.0,2.0,,67.0,44.0,139.0,1.682078,42.0,,217.0,,4.0,,217.0,,4231.0,20131.0,,17.0,,20125.0,,25.402363,1.2554973,2.0,,40.0,,66.0,,9.0,,9.0,,9.0,,11.0,,16.0,,108.0,,40.0,,16.0,,,0.12549576,,87.0,,100.0,,93.0,,40.0,,44.0,,32.0,,7.0,,39.0,,55.0,,46.0,,7.0,,103.0,,24166.0,,24.0,,3.0,
7,max,fffffd42-a5ff-48dd-a264-1f5e08ef04ef,4806.0,2.0,00:53:40.441,98.0,59.0,273.0,88.8957,43.0,Tactical Shift,1230.0,Xerez,9.0,Regular Play,1230.0,Xerez,42211.0,117025.0,Šime Vrsaljko,25.0,Substitute,105762.0,Šime Vrsaljko,119.19941,3.1415927,3.0,Low Pass,106.0,Right Foot,67.0,Throw-in,77.0,Unknown,9.0,Incomplete,9.0,Incomplete,11.0,Tackle,17.0,Won,108.0,Through Ball,70.0,Right Foot,17.0,Won,fffd9039-256b-4a7e-895d-8c071f2ca0ed,0.99281865,ffff5a06-d8ba-4a56-9e64-a894aa5a229d,88.0,Penalty,116.0,Wayward,95.0,Volley,70.0,Right Foot,44.0,Set,114.0,Smother,7.0,Yellow Card,41.0,Right Hand,117.0,Won,46.0,Standing,7.0,Yellow Card,103.0,Tactical,117025.0,Óscar de Marcos Arana,24.0,Handball,4.0,Won


We have many attributes and data that are irrelevant for this context, so it is necessary to clean up the data by removing irrelevant records and attributes. Initially, I thought it would be a good idea to eliminate some rows and apply a filter focusing on Lionel Messi's actions to improve performance and reduce processing time. After all, my goal was to analyze only Messi's actions. However, I realized this approach was naive as I progressed with the analysis. In the Gold Layer, I explain why this was important. But I maintained the decision to eliminate some columns.

In [0]:
columns_to_keep = [
    "player_name", "pass_recipient_name", "pass_length", "pass_deflected", "pass_miscommunication", 
    "pass_cross", "pass_type_name", "pass_outcome_name", "pass_technique_name", "type_name", 
    "shot_outcome_name", "pass_shot_assist", "pass_goal_assist", "shot_body_part_name", 
    "counterpress", "shot_end_location", "shot_aerial_won", "shot_follows_dribble", 
    "shot_first_time", "shot_open_goal", "shot_statsbomb_xg", "shot_deflected", 
    "shot_technique_name", "shot_type_name", "position_name", "under_pressure", 
    "tactics_formation", "50_50_outcome_name", "bad_behaviour_card_name", "play_pattern_name",
    "shot_one_on_one", "id_event","index", "period", "minute", "possession", "possession_team_name", 
    "team_name", "dribble_outcome_name", "dribble_overrun", "dribble_nutmeg", "dribble_no_touch"
]

# Create a DataFrame with the selected columns
df_cleaned = df_spark_loaded.select(columns_to_keep)

# Rename more 2 columns
df_cleaned = df_cleaned.withColumnRenamed("index", "index_match")
df_cleaned = df_cleaned.withColumnRenamed("minute", "minute_match")


In [0]:
df_cleaned.write.mode("overwrite").parquet("/FileStore/tables/messi_silver_done.parquet")

In [0]:
# Save in a temporary table to validate
df_cleaned.createOrReplaceTempView("messi_temp_silver_done")

In [0]:
# Show the schema of the new dataframe
df_cleaned.printSchema()

root
 |-- player_name: string (nullable = true)
 |-- pass_recipient_name: string (nullable = true)
 |-- pass_length: double (nullable = true)
 |-- pass_deflected: boolean (nullable = true)
 |-- pass_miscommunication: boolean (nullable = true)
 |-- pass_cross: boolean (nullable = true)
 |-- pass_type_name: string (nullable = true)
 |-- pass_outcome_name: string (nullable = true)
 |-- pass_technique_name: string (nullable = true)
 |-- type_name: string (nullable = true)
 |-- shot_outcome_name: string (nullable = true)
 |-- pass_shot_assist: boolean (nullable = true)
 |-- pass_goal_assist: boolean (nullable = true)
 |-- shot_body_part_name: string (nullable = true)
 |-- counterpress: boolean (nullable = true)
 |-- shot_end_location: array (nullable = true)
 |    |-- element: double (containsNull = true)
 |-- shot_aerial_won: boolean (nullable = true)
 |-- shot_follows_dribble: boolean (nullable = true)
 |-- shot_first_time: boolean (nullable = true)
 |-- shot_open_goal: boolean (nullable 

In [0]:
# Count columns and rows
num_columns_cleaned = len(df_cleaned.columns)
num_rows_cleaned = df_cleaned.count()

print(f"Columns: {num_columns} x Rows: {num_rows}")

Columns: 143 x Rows: 1976777


In [0]:
# Statistics description
df_description_cleaned = spark.table("messi_temp_silver_done").describe()
df_description_cleaned = df_description_cleaned.toPandas()

df_description_cleaned

Unnamed: 0,summary,player_name,pass_recipient_name,pass_length,pass_type_name,pass_outcome_name,pass_technique_name,type_name,shot_outcome_name,shot_body_part_name,shot_statsbomb_xg,shot_technique_name,shot_type_name,position_name,tactics_formation,50_50_outcome_name,bad_behaviour_card_name,play_pattern_name,id_event,index_match,period,minute_match,possession,possession_team_name,team_name,dribble_outcome_name
0,count,1970092,522301,552987.0,95287,98516,8125,1976777,12986,12986,12986.0,12986,12986,1970092,2016.0,708,524,1976777,1976777,1976777.0,1976777.0,1976777.0,1976777.0,1976777,1976777,23290
1,mean,,,20.430384714027717,,,,,,,0.1152503571537887,,,,3109.504464285714,,,,,1897.436162500879,1.4960756827907244,44.52193090065293,93.46939336101138,,,
2,stddev,,,13.976019099757082,,,,,,,0.1523738257014054,,,,7827.833301654504,,,,,1111.383650187602,0.499984725961907,26.889945442483597,55.838345877925256,,,
3,min,Aarón Escandell Banacloche,Aarón Escandell Banacloche,0.0,Corner,Incomplete,Inswinging,50/50,Blocked,Head,0.00018,Backheel,Corner,Center Attacking Midfield,343.0,Lost,Red Card,From Corner,00000e28-2f19-491b-88b9-1d9077baf034,1.0,1.0,0.0,1.0,Albacete,Albacete,Complete
4,max,Šime Vrsaljko,Šime Vrsaljko,119.19941,Throw-in,Unknown,Through Ball,Tactical Shift,Wayward,Right Foot,0.99281865,Volley,Penalty,Substitute,42211.0,Won,Yellow Card,Regular Play,fffffd42-a5ff-48dd-a264-1f5e08ef04ef,4806.0,2.0,98.0,273.0,Xerez,Xerez,Incomplete


In [0]:
# Statistics Summary
df_summary_cleaned = spark.table("messi_temp_silver_done").summary()
df_summary_cleaned = df_summary_cleaned.toPandas()

df_summary_cleaned

Unnamed: 0,summary,player_name,pass_recipient_name,pass_length,pass_type_name,pass_outcome_name,pass_technique_name,type_name,shot_outcome_name,shot_body_part_name,shot_statsbomb_xg,shot_technique_name,shot_type_name,position_name,tactics_formation,50_50_outcome_name,bad_behaviour_card_name,play_pattern_name,id_event,index_match,period,minute_match,possession,possession_team_name,team_name,dribble_outcome_name
0,count,1970092,522301,552987.0,95287,98516,8125,1976777,12986,12986,12986.0,12986,12986,1970092,2016.0,708,524,1976777,1976777,1976777.0,1976777.0,1976777.0,1976777.0,1976777,1976777,23290
1,mean,,,20.430384714027717,,,,,,,0.1152503571537887,,,,3109.504464285714,,,,,1897.436162500879,1.4960756827907244,44.52193090065293,93.46939336101138,,,
2,stddev,,,13.976019099757082,,,,,,,0.1523738257014054,,,,7827.833301654504,,,,,1111.383650187602,0.499984725961907,26.889945442483597,55.838345877925256,,,
3,min,Aarón Escandell Banacloche,Aarón Escandell Banacloche,0.0,Corner,Incomplete,Inswinging,50/50,Blocked,Head,0.00018,Backheel,Corner,Center Attacking Midfield,343.0,Lost,Red Card,From Corner,00000e28-2f19-491b-88b9-1d9077baf034,1.0,1.0,0.0,1.0,Albacete,Albacete,Complete
4,25%,,,11.031773,,,,,,,0.03036632,,,,433.0,,,,,942.0,1.0,21.0,45.0,,,
5,50%,,,16.519686,,,,,,,0.062358446,,,,442.0,,,,,1883.0,1.0,45.0,92.0,,,
6,75%,,,25.402363,,,,,,,0.12549576,,,,4231.0,,,,,2825.0,2.0,67.0,139.0,,,
7,max,Šime Vrsaljko,Šime Vrsaljko,119.19941,Throw-in,Unknown,Through Ball,Tactical Shift,Wayward,Right Foot,0.99281865,Volley,Penalty,Substitute,42211.0,Won,Yellow Card,Regular Play,fffffd42-a5ff-48dd-a264-1f5e08ef04ef,4806.0,2.0,98.0,273.0,Xerez,Xerez,Incomplete
