## Imports / Spark Initialization

In [64]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, when

import findspark
import pandas as pd

findspark.init()


# Initialize a spark session.
def init_spark():
    """Return the SparkSession used by this notebook.

    The session is named "SOEN 471 Project". ``getOrCreate`` returns an
    already-running session when one exists, otherwise it starts a new one.
    """
    builder = SparkSession.builder.appName("SOEN 471 Project")
    # Placeholder option kept exactly as in the original configuration.
    builder = builder.config("spark.some.config.option", "some-value")
    return builder.getOrCreate()

## Sample of Initial Data

The initial dataset contains **10,003,590** rows.

In [65]:
from IPython.display import display

spark = init_spark()
players = spark.read.csv("notebook_data.csv", header=True)

# Preview the first five rows. `limit(...).toPandas()` carries the column
# names over by itself and returns an empty frame when there are no rows,
# whereas indexing `sample[0]` on an empty `take()` result raises IndexError.
sample = players.limit(5).toPandas()
# Hide the first column of the file (presumably an exported row index -- confirm).
sample = sample.drop(sample.columns[0], axis=1)

display(sample)


Unnamed: 0,player_id,player_url,fifa_version,fifa_update,fifa_update_date,short_name,long_name,player_positions,overall,potential,...,cdm,rdm,rwb,lb,lcb,cb,rcb,rb,gk,player_face_url
0,158023,/player/158023/lionel-messi/230009,23,9,2023-01-13,L. Messi,Lionel Andrés Messi Cuccittini,RW,91,91,...,63+3,63+3,64+3,59+3,50+3,50+3,50+3,59+3,19+3,https://cdn.sofifa.net/players/158/023/23_120.png
1,165153,/player/165153/karim-benzema/230009,23,9,2023-01-13,K. Benzema,Karim Benzema,"CF, ST",91,91,...,64+3,64+3,64+3,60+3,55+3,55+3,55+3,60+3,18+3,https://cdn.sofifa.net/players/165/153/23_120.png
2,188545,/player/188545/robert-lewandowski/230009,23,9,2023-01-13,R. Lewandowski,Robert Lewandowski,ST,91,91,...,66+3,66+3,64+3,61+3,60+3,60+3,60+3,61+3,19+3,https://cdn.sofifa.net/players/188/545/23_120.png
3,192985,/player/192985/kevin-de-bruyne/230009,23,9,2023-01-13,K. De Bruyne,Kevin De Bruyne,"CM, CAM",91,91,...,79+3,79+3,78+3,74+3,68+3,68+3,68+3,74+3,21+3,https://cdn.sofifa.net/players/192/985/23_120.png
4,231747,/player/231747/kylian-mbappe/230009,23,9,2023-01-13,K. Mbappé,Kylian Mbappé Lottin,"ST, LW",91,95,...,63+3,63+3,67+3,63+3,54+3,54+3,54+3,63+3,18+3,https://cdn.sofifa.net/players/231/747/23_120.png


## Feature Selection

Selects all field-related features relevant to our model and removes unnecessary characteristics such as player name, height, age, net worth, etc.

In [66]:
players = spark.read.csv("notebook_data.csv", header=True)

# Columns kept for modelling: the position list plus the attribute columns.
feature_columns = [
    "player_positions",
    "overall",
    "skill_moves",
    "pace",
    "shooting",
    "passing",
    "dribbling",
    "defending",
    "physic",
    "attacking_crossing",
    "attacking_finishing",
    "attacking_heading_accuracy",
    "attacking_short_passing",
    "attacking_volleys",
    "skill_dribbling",
    "skill_curve",
    "skill_fk_accuracy",
    "skill_long_passing",
    "skill_ball_control",
    "movement_acceleration",
    "movement_sprint_speed",
    "movement_agility",
    "movement_reactions",
    "movement_balance",
    "power_shot_power",
    "power_jumping",
    "power_stamina",
    "power_strength",
    "power_long_shots",
    "mentality_aggression",
    "mentality_interceptions",
    "mentality_positioning",
    "mentality_vision",
    "mentality_penalties",
    "mentality_composure",
    "defending_marking_awareness",
    "defending_standing_tackle",
    "defending_sliding_tackle",
    "goalkeeping_diving",
    "goalkeeping_handling",
    "goalkeeping_kicking",
    "goalkeeping_positioning",
    "goalkeeping_reflexes",
    "goalkeeping_speed",
]
players = players.select(*feature_columns)

# Preview via `limit(...).toPandas()`: it keeps the column names and does not
# fail on an empty result the way `sample[0].__fields__` does.
sample = players.limit(10).toPandas()
display(sample)

Unnamed: 0,player_positions,overall,skill_moves,pace,shooting,passing,dribbling,defending,physic,attacking_crossing,...,mentality_composure,defending_marking_awareness,defending_standing_tackle,defending_sliding_tackle,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes,goalkeeping_speed
0,RW,91,4,81.0,89.0,90.0,94.0,34.0,64.0,84,...,96,20,35,24,6,11,15,14,8,
1,"CF, ST",91,4,80.0,88.0,83.0,87.0,39.0,78.0,75,...,90,43,24,18,13,11,5,5,7,
2,ST,91,4,75.0,91.0,79.0,86.0,44.0,83.0,71,...,88,35,42,19,15,6,12,8,10,
3,"CM, CAM",91,4,74.0,88.0,93.0,87.0,63.0,77.0,94,...,89,65,65,53,15,13,5,10,13,
4,"ST, LW",91,5,97.0,89.0,80.0,92.0,36.0,76.0,78,...,88,26,34,32,13,5,7,11,6,
5,GK,90,1,,,,,,,14,...,66,20,18,16,84,89,75,89,90,46.0
6,RW,90,4,90.0,89.0,82.0,90.0,45.0,75.0,80,...,92,38,43,41,14,14,9,11,14,
7,GK,89,1,,,,,,,15,...,70,17,10,11,87,88,90,88,87,49.0
8,LW,89,5,87.0,83.0,85.0,93.0,37.0,61.0,83,...,93,35,32,29,9,9,15,15,11,
9,CDM,89,2,63.0,73.0,78.0,73.0,88.0,89.0,62,...,84,90,88,87,13,14,16,12,12,


As seen above, most players have more than one preferred position, listed from most to least preferred. We therefore keep only the first (most preferred) position in a new column and drop the original column.

In [67]:
# Keep only the first entry of the comma-separated position list.
players = players.withColumn(
    "player_position", split(players["player_positions"], ",")[0]
).drop("player_positions")

# Show the derived column by name rather than by position, and build the
# preview with `limit(...).toPandas()` so an empty result cannot raise
# IndexError on `sample[0]`.
sample = players.select("player_position").limit(10).toPandas()
display(sample)

Unnamed: 0,player_position
0,RW
1,CF
2,ST
3,CM
4,ST
5,GK
6,RW
7,GK
8,LW
9,CDM


For goalkeepers, important attribute fields such as shooting, passing, dribbling, and defending are empty. We therefore remove all goalkeeping-related attributes, as well as the goalkeepers themselves, from our data selection.

Reduces our row count from **10,003,590** to **8,882,644**.

In [68]:
# Goalkeeper-only attribute columns, removed together with the GK rows.
goalkeeping_columns = [
    "goalkeeping_diving",
    "goalkeeping_handling",
    "goalkeeping_kicking",
    "goalkeeping_positioning",
    "goalkeeping_reflexes",
    "goalkeeping_speed",
]
players = players.drop(*goalkeeping_columns).filter(
    players["player_position"] != "GK"
)

# Preview via `limit(...).toPandas()`: it keeps the column names and does not
# fail on an empty result the way `sample[0].__fields__` does.
sample = players.limit(10).toPandas()
display(sample)

Unnamed: 0,overall,skill_moves,pace,shooting,passing,dribbling,defending,physic,attacking_crossing,attacking_finishing,...,mentality_aggression,mentality_interceptions,mentality_positioning,mentality_vision,mentality_penalties,mentality_composure,defending_marking_awareness,defending_standing_tackle,defending_sliding_tackle,player_position
0,91,4,81.0,89.0,90.0,94.0,34.0,64.0,84,90,...,44,40,93,94,75,96,20,35,24,RW
1,91,4,80.0,88.0,83.0,87.0,39.0,78.0,75,92,...,63,39,92,89,84,90,43,24,18,CF
2,91,4,75.0,91.0,79.0,86.0,44.0,83.0,71,94,...,81,49,94,81,90,88,35,42,19,ST
3,91,4,74.0,88.0,93.0,87.0,63.0,77.0,94,85,...,75,64,88,94,83,89,65,65,53,CM
4,91,5,97.0,89.0,80.0,92.0,36.0,76.0,78,93,...,64,38,92,83,80,88,26,34,32,ST
5,90,4,90.0,89.0,82.0,90.0,45.0,75.0,80,93,...,63,55,92,85,86,92,38,43,41,RW
6,89,5,87.0,83.0,85.0,93.0,37.0,61.0,83,83,...,63,37,86,90,91,93,35,32,29,LW
7,89,2,63.0,73.0,78.0,73.0,88.0,89.0,62,64,...,91,88,75,79,66,84,90,88,87,CDM
8,89,3,69.0,91.0,83.0,83.0,47.0,82.0,80,93,...,80,44,94,87,92,92,50,36,38,ST
9,89,2,81.0,60.0,71.0,72.0,90.0,86.0,53,52,...,85,89,47,65,62,90,91,91,86,CB


We then drop features that do not discriminate between the labels, i.e. features for which players of every class can easily have the same values.

In [69]:
# Attributes judged not to separate the position labels from one another.
non_discriminative_columns = [
    "overall",
    "skill_moves",
    "physic",
    "skill_curve",
    "movement_reactions",
    "power_jumping",
    "power_strength",
    "mentality_aggression",
    "mentality_vision",
    "mentality_composure",
]
players = players.drop(*non_discriminative_columns)

# Preview via `limit(...).toPandas()`: it keeps the column names and does not
# fail on an empty result the way `sample[0].__fields__` does.
sample = players.limit(5).toPandas()
display(sample)

Unnamed: 0,pace,shooting,passing,dribbling,defending,attacking_crossing,attacking_finishing,attacking_heading_accuracy,attacking_short_passing,attacking_volleys,...,power_shot_power,power_stamina,power_long_shots,mentality_interceptions,mentality_positioning,mentality_penalties,defending_marking_awareness,defending_standing_tackle,defending_sliding_tackle,player_position
0,81.0,89.0,90.0,94.0,34.0,84,90,70,91,88,...,86,70,91,40,93,75,20,35,24,RW
1,80.0,88.0,83.0,87.0,39.0,75,92,90,89,88,...,87,82,80,39,92,84,43,24,18,CF
2,75.0,91.0,79.0,86.0,44.0,71,94,91,84,89,...,91,76,84,49,94,90,35,42,19,ST
3,74.0,88.0,93.0,87.0,63.0,94,85,55,93,83,...,92,89,91,64,88,83,65,65,53,CM
4,97.0,89.0,80.0,92.0,36.0,78,93,72,85,83,...,88,87,82,38,92,80,26,34,32,ST


## Label Conversion

Converts FIFA-defined positions to one of our three labels:

In [70]:
def label_conversion(player_position):
    """Build a column expression mapping a position code to a role label.

    Args:
        player_position: Spark column holding a single position code
            (for example "CB", "CM" or "ST").

    Returns:
        A column expression that evaluates to "Defender", "Midfielder" or
        "Forward", or to "Undefined" when the code is in none of the groups.
    """
    defender_codes = ["CB", "RB", "LB", "RWB", "LWB"]
    midfielder_codes = ["CM", "CDM", "CAM", "RM", "LM"]
    forward_codes = ["ST", "CF", "RF", "LF", "RW", "LW"]

    is_defender = player_position.isin(defender_codes)
    is_midfielder = player_position.isin(midfielder_codes)
    is_forward = player_position.isin(forward_codes)

    # Branches are tested in this order; the groups do not overlap.
    label = when(is_defender, "Defender")
    label = label.when(is_midfielder, "Midfielder")
    label = label.when(is_forward, "Forward")
    return label.otherwise("Undefined")

Converts specific positions such as RW, CDM, LB to a more general label (defender, midfielder or forward):

In [71]:
# Replace the fine-grained position code with its coarse role label.
players = players.withColumn(
    "label_position", label_conversion(players["player_position"])
).drop("player_position")

# Show the label column by name rather than by position, and build the
# preview with `limit(...).toPandas()` so an empty result cannot raise
# IndexError on `sample[0]`.
sample = players.select("label_position").limit(10).toPandas()
display(sample)

Unnamed: 0,label_position
0,Forward
1,Forward
2,Forward
3,Midfielder
4,Forward
5,Forward
6,Forward
7,Midfielder
8,Forward
9,Defender


We complete the data preparation phase by dropping rows that contain null values or have a label position of "Undefined". No rows had an undefined position, but removing nulls slightly reduces our row count from **8,882,644** to **8,882,438**.

In [72]:
# Keep only rows with a recognised label, then drop rows containing any null.
# `DataFrame.filter` returns a new DataFrame, so its result must be assigned:
# a bare `players.filter(...).count()` leaves `players` unchanged and only
# runs a full count whose value is never shown.
players = players.filter(players["label_position"] != "Undefined").dropna()

Final sample of features (total counts **3,266,866** Defenders, **3,712,577** Midfielders, and **1,902,995** Forwards):

In [73]:
# Preview via `limit(...).toPandas()`: it keeps the column names and returns
# an empty frame, instead of raising IndexError on `sample[0]`, if no rows
# survived the cleaning steps.
sample = players.limit(10).toPandas()
display(sample)

Unnamed: 0,pace,shooting,passing,dribbling,defending,attacking_crossing,attacking_finishing,attacking_heading_accuracy,attacking_short_passing,attacking_volleys,...,power_shot_power,power_stamina,power_long_shots,mentality_interceptions,mentality_positioning,mentality_penalties,defending_marking_awareness,defending_standing_tackle,defending_sliding_tackle,label_position
0,81.0,89.0,90.0,94.0,34.0,84,90,70,91,88,...,86,70,91,40,93,75,20,35,24,Forward
1,80.0,88.0,83.0,87.0,39.0,75,92,90,89,88,...,87,82,80,39,92,84,43,24,18,Forward
2,75.0,91.0,79.0,86.0,44.0,71,94,91,84,89,...,91,76,84,49,94,90,35,42,19,Forward
3,74.0,88.0,93.0,87.0,63.0,94,85,55,93,83,...,92,89,91,64,88,83,65,65,53,Midfielder
4,97.0,89.0,80.0,92.0,36.0,78,93,72,85,83,...,88,87,82,38,92,80,26,34,32,Forward
5,90.0,89.0,82.0,90.0,45.0,80,93,59,84,84,...,83,87,85,55,92,86,38,43,41,Forward
6,87.0,83.0,85.0,93.0,37.0,83,83,63,85,86,...,79,79,81,37,86,91,35,32,29,Forward
7,63.0,73.0,78.0,73.0,88.0,62,64,81,84,65,...,88,88,81,88,75,66,90,88,87,Midfielder
8,69.0,91.0,83.0,83.0,47.0,80,93,85,84,87,...,92,83,86,44,94,92,50,36,38,Forward
9,81.0,60.0,71.0,72.0,90.0,53,52,86,79,45,...,81,74,64,89,47,62,91,91,86,Defender
