## Imports / Spark Initialization

In [10]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import split, when

import findspark
import pandas as pd

findspark.init()


# Initialize a spark session.
def init_spark():
    """Build (or fetch) the SparkSession used throughout the notebook.

    The session is configured with the application name "SOEN 471 Project"
    and one config option, then obtained through ``getOrCreate``.

    Returns:
        The SparkSession produced by the builder.
    """
    builder = SparkSession.builder.appName("SOEN 471 Project")
    builder = builder.config("spark.some.config.option", "some-value")
    return builder.getOrCreate()

## Sample of Initial Data

The initial row count is **10,003,590**.

In [11]:
from IPython.display import display

spark = init_spark()

# Preview the first three rows of the raw CSV as a pandas frame.
# limit(3).toPandas() takes its column names from the Spark schema, so it
# still works when the file has a header but no data rows (indexing the
# first collected Row for its field names would raise IndexError there).
players = (
    spark.read.csv("notebook_data.csv", header=True)
    .limit(3)
    .toPandas()
)

# Drop the first column of the file.
# NOTE(review): presumably the row-index column written when the CSV was
# exported — confirm against the data source.
players = players.drop(columns=players.columns[0])

display(players)


Unnamed: 0,player_id,player_url,fifa_version,fifa_update,fifa_update_date,short_name,long_name,player_positions,overall,potential,...,cdm,rdm,rwb,lb,lcb,cb,rcb,rb,gk,player_face_url
0,158023,/player/158023/lionel-messi/230009,23,9,2023-01-13,L. Messi,Lionel Andrés Messi Cuccittini,RW,91,91,...,63+3,63+3,64+3,59+3,50+3,50+3,50+3,59+3,19+3,https://cdn.sofifa.net/players/158/023/23_120.png
1,165153,/player/165153/karim-benzema/230009,23,9,2023-01-13,K. Benzema,Karim Benzema,"CF, ST",91,91,...,64+3,64+3,64+3,60+3,55+3,55+3,55+3,60+3,18+3,https://cdn.sofifa.net/players/165/153/23_120.png
2,188545,/player/188545/robert-lewandowski/230009,23,9,2023-01-13,R. Lewandowski,Robert Lewandowski,ST,91,91,...,66+3,66+3,64+3,61+3,60+3,60+3,60+3,61+3,19+3,https://cdn.sofifa.net/players/188/545/23_120.png


## Feature Selection

This step selects all field-related features relevant to our model and drops unnecessary characteristics such as player name, height, age, and net worth.

In [12]:
# Columns kept by the feature-selection step; every other column of the
# CSV is discarded by the select() below.
FEATURE_COLUMNS = [
    "player_positions",
    "overall",
    "skill_moves",
    "pace",
    "shooting",
    "passing",
    "dribbling",
    "defending",
    "physic",
    "attacking_crossing",
    "attacking_finishing",
    "attacking_heading_accuracy",
    "attacking_short_passing",
    "attacking_volleys",
    "skill_dribbling",
    "skill_curve",
    "skill_fk_accuracy",
    "skill_long_passing",
    "skill_ball_control",
    "movement_acceleration",
    "movement_sprint_speed",
    "movement_agility",
    "movement_reactions",
    "movement_balance",
    "power_shot_power",
    "power_jumping",
    "power_stamina",
    "power_strength",
    "power_long_shots",
    "mentality_aggression",
    "mentality_interceptions",
    "mentality_positioning",
    "mentality_vision",
    "mentality_penalties",
    "mentality_composure",
    "defending_marking_awareness",
    "defending_standing_tackle",
    "defending_sliding_tackle",
    "goalkeeping_diving",
    "goalkeeping_handling",
    "goalkeeping_kicking",
    "goalkeeping_positioning",
    "goalkeeping_reflexes",
    "goalkeeping_speed",
]

# Re-read the CSV, keep only the selected columns, and preview three rows.
# limit(3).toPandas() takes its column names from the Spark schema, so the
# preview does not fail with IndexError when the file has no data rows.
players = (
    spark.read.csv("notebook_data.csv", header=True)
    .select(*FEATURE_COLUMNS)
    .limit(3)
    .toPandas()
)
display(players)

Unnamed: 0,player_positions,overall,skill_moves,pace,shooting,passing,dribbling,defending,physic,attacking_crossing,...,mentality_composure,defending_marking_awareness,defending_standing_tackle,defending_sliding_tackle,goalkeeping_diving,goalkeeping_handling,goalkeeping_kicking,goalkeeping_positioning,goalkeeping_reflexes,goalkeeping_speed
0,RW,91,4,81.0,89.0,90.0,94.0,34.0,64.0,84,...,96,20,35,24,6,11,15,14,8,
1,"CF, ST",91,4,80.0,88.0,83.0,87.0,39.0,78.0,75,...,90,43,24,18,13,11,5,5,7,
2,ST,91,4,75.0,91.0,79.0,86.0,44.0,83.0,71,...,88,35,42,19,15,6,12,8,10,
