In [1]:
from pyspark.sql import SparkSession   
from pyspark.sql.functions import col, sum, when, count_distinct, expr, desc, avg, asc
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier, LinearSVC
from pyspark.ml import Pipeline
from pyspark.ml.evaluation import BinaryClassificationEvaluator

spark = SparkSession.builder.getOrCreate()


24/06/02 17:41:55 WARN Utils: Your hostname, Justins-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.0.120 instead (on interface en0)
24/06/02 17:41:55 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/02 17:41:56 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
df = spark.read.format("csv")\
    .option("header", "true")\
    .option("inferSchema", "true")\
    .load("/Users/justinwyrley/Downloads/archive-3/players_20.csv")

df.printSchema()
df.createOrReplaceTempView("dfTable") 

                                                                                

root
 |-- sofifa_id: integer (nullable = true)
 |-- player_url: string (nullable = true)
 |-- short_name: string (nullable = true)
 |-- long_name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- dob: date (nullable = true)
 |-- height_cm: integer (nullable = true)
 |-- weight_kg: integer (nullable = true)
 |-- nationality: string (nullable = true)
 |-- club: string (nullable = true)
 |-- overall: integer (nullable = true)
 |-- potential: integer (nullable = true)
 |-- value_eur: integer (nullable = true)
 |-- wage_eur: integer (nullable = true)
 |-- player_positions: string (nullable = true)
 |-- preferred_foot: string (nullable = true)
 |-- international_reputation: integer (nullable = true)
 |-- weak_foot: integer (nullable = true)
 |-- skill_moves: integer (nullable = true)
 |-- work_rate: string (nullable = true)
 |-- body_type: string (nullable = true)
 |-- real_face: string (nullable = true)
 |-- release_clause_eur: integer (nullable = true)
 |-- player_tags: st

24/06/02 17:42:06 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


As there are many columns (104) in this dataset I am going to pick specific attributes I want to work with 

In [3]:
desired_columns = ['short_name','age','nationality','weight_kg','overall','potential','value_eur','preferred_foot','skill_moves','weak_foot'] 
df = df.select(desired_columns) # These are values I believe we can use for prediction and analysis

In [4]:
df.describe().show()

[Stage 4:>                                                          (0 + 1) / 1]

+-------+-----------+------------------+-----------+-----------------+-----------------+----------------+-----------------+--------------+------------------+------------------+
|summary| short_name|               age|nationality|        weight_kg|          overall|       potential|        value_eur|preferred_foot|       skill_moves|         weak_foot|
+-------+-----------+------------------+-----------+-----------------+-----------------+----------------+-----------------+--------------+------------------+------------------+
|  count|      18278|             18278|      18278|            18278|            18278|           18278|            18278|         18278|             18278|             18278|
|   mean|       NULL|25.283291388554545|       NULL| 75.2763431447642|66.24499398183609|71.5468869679396|2484037.640879746|          NULL|2.3680380785643944|2.9442499179341284|
| stddev|       NULL| 4.656964497822068|       NULL|7.047743570355303|6.949953254963414| 6.1396690749562|5585481.06

                                                                                

As this data set uses fifa ratings potential and overall can be a maximum of 100 and skill moves and weak foot range from 1 to 5

In [5]:
null_counts = df.select([sum(col(c).isNull().cast("int")).alias(c) for c in df.columns]) #col(c) checks if the column is null, then casts it to an integer
null_counts.show()

+----------+---+-----------+---------+-------+---------+---------+--------------+-----------+---------+
|short_name|age|nationality|weight_kg|overall|potential|value_eur|preferred_foot|skill_moves|weak_foot|
+----------+---+-----------+---------+-------+---------+---------+--------------+-----------+---------+
|         0|  0|          0|        0|      0|        0|        0|             0|          0|        0|
+----------+---+-----------+---------+-------+---------+---------+--------------+-----------+---------+



This shows there are no null values in the dataset 

# (i)

In [6]:
df.select(count_distinct('nationality')).show() # Show how many countries are represented by players in the dataset

+---------------------------+
|count(DISTINCT nationality)|
+---------------------------+
|                        162|
+---------------------------+



In [7]:
result_df = df.groupBy("nationality").agg(expr("count(*) as count")).orderBy(expr("count desc"))
result_df.show() # I could not seem to get this to work using the sql query so i used the df api instead

+-------------------+-----+
|        nationality|count|
+-------------------+-----+
|        Puerto Rico|    1|
|            Eritrea|    1|
|             Jordan|    1|
|               Chad|    1|
|     Chinese Taipei|    1|
|             Malawi|    1|
|            Liberia|    1|
|            Bahrain|    1|
|São Tomé & Príncipe|    1|
|          Mauritius|    1|
|              Macau|    1|
|           St Lucia|    1|
|           Barbados|    1|
|          Hong Kong|    1|
|          Gibraltar|    1|
|          Indonesia|    1|
|               Guam|    1|
|           Ethiopia|    1|
|        South Sudan|    1|
|             Belize|    1|
+-------------------+-----+
only showing top 20 rows



In [8]:
df.groupBy("nationality").count().orderBy(desc("count")).show() 

+-------------------+-----+
|        nationality|count|
+-------------------+-----+
|            England| 1667|
|            Germany| 1216|
|              Spain| 1035|
|             France|  984|
|          Argentina|  886|
|             Brazil|  824|
|              Italy|  732|
|           Colombia|  591|
|              Japan|  453|
|        Netherlands|  416|
|           China PR|  373|
|              Chile|  370|
|             Sweden|  358|
|             Norway|  350|
|Republic of Ireland|  348|
|      United States|  347|
|            Denmark|  345|
|           Portugal|  344|
|             Mexico|  340|
|             Poland|  324|
+-------------------+-----+
only showing top 20 rows



In [9]:
df.groupBy("nationality").agg(avg("overall").alias("average_overall")).orderBy(desc("average_overall")).show()
# Interesting results as originally I thought a country with more players may be higher 
# But then it seems very obvious that with less players a few higher rated ones would skew the average

+--------------------+-----------------+
|         nationality|  average_overall|
+--------------------+-----------------+
|          Mozambique|            73.25|
|                Chad|             73.0|
|             Bahrain|             72.0|
|             Uruguay|71.64634146341463|
|             Algeria|            71.38|
|              Brazil| 71.1614077669903|
|              Israel|           71.125|
|             Eritrea|             71.0|
|               Libya|             71.0|
|               Gabon|          70.6875|
|            Portugal|70.51453488372093|
|Central African Rep.|             70.5|
|          Cape Verde|             70.5|
|             Ecuador|70.49056603773585|
|      Czech Republic|70.31372549019608|
|          Madagascar|            70.25|
|               Syria|            70.25|
|               Egypt|             70.1|
|         Puerto Rico|             70.0|
| São Tomé & Príncipe|             70.0|
+--------------------+-----------------+
only showing top

Also interesting to see Brazil is the 6th highest rated country on average even though they are 6th most populated in the dataset

In [10]:
df.filter(df["nationality"] == "Mozambique").show()
# Showing what I previously said about a few higher rated players skewing the average


+----------+---+-----------+---------+-------+---------+---------+--------------+-----------+---------+
|short_name|age|nationality|weight_kg|overall|potential|value_eur|preferred_foot|skill_moves|weak_foot|
+----------+---+-----------+---------+-------+---------+---------+--------------+-----------+---------+
|     Mexer| 30| Mozambique|       79|     76|       76|  6000000|         Right|          2|        3|
| Zainadine| 31| Mozambique|       70|     75|       75|  4900000|         Right|          2|        3|
|Simao Mate| 30| Mozambique|       76|     72|       72|  2600000|         Right|          2|        2|
|  Reinildo| 25| Mozambique|       74|     70|       75|  2100000|          Left|          3|        3|
+----------+---+-----------+---------+-------+---------+---------+--------------+-----------+---------+



In [11]:
df.groupBy("nationality").agg(avg("overall").alias("average_overall")).orderBy(asc("average_overall")).show()

+-------------------+------------------+
|        nationality|   average_overall|
+-------------------+------------------+
|          Indonesia|              56.0|
|              Niger|              57.0|
|        South Sudan|              59.0|
|  Antigua & Barbuda| 59.42857142857143|
|           China PR| 59.48525469168901|
|              India|              60.0|
|              Malta|              60.0|
|            Grenada|              60.5|
|       Saudi Arabia| 60.92903225806452|
|              Macau|              61.0|
|Republic of Ireland| 61.00574712643678|
|      Faroe Islands|              61.4|
|             Latvia|61.666666666666664|
|             Cyprus| 61.72727272727273|
|             Malawi|              62.0|
|        Afghanistan|              62.0|
|      Liechtenstein|              62.0|
|          Palestine|              62.0|
|          Gibraltar|              62.0|
|            Vietnam|              62.0|
+-------------------+------------------+
only showing top

In [12]:
df.filter(df["nationality"] == "Indonesia").show()
# Only entry setting the average

+----------------+---+-----------+---------+-------+---------+---------+--------------+-----------+---------+
|      short_name|age|nationality|weight_kg|overall|potential|value_eur|preferred_foot|skill_moves|weak_foot|
+----------------+---+-----------+---------+-------+---------+---------+--------------+-----------+---------+
|E. Maulana Vikri| 18|  Indonesia|       65|     56|       74|   190000|          Left|          3|        3|
+----------------+---+-----------+---------+-------+---------+---------+--------------+-----------+---------+



# (ii)

I want to see what good predictors may be 

In [13]:
df.corr("overall", "potential") # Shows a strong correlation between overall and potential

0.6466500272615198

In [14]:
df = df.withColumn('future_potential', col('potential') - col('overall')) # To see how different the potential is from the overall

In [15]:
df.corr("age", "future_potential") # This shows that as age increases the future_potential decreases

-0.8720936145366164

In [16]:
indexer = StringIndexer(inputCol="preferred_foot", outputCol="left_foot_preferred")
df = indexer.fit(df).transform(df)
# I change left footed players to 1 and right footed to 0 so I can measure how preferred foot affects other variables


                                                                                

In [17]:
df.corr('left_foot_preferred', 'weak_foot') # This shows that right footed players have a slightly stronger weak foot

-0.07543675895427816

# (iii)

I am going to choose to work with preferred foot 

In [18]:
feature_cols = ["skill_moves", "weak_foot"] 
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

train_data, test_data = df.randomSplit([0.8, 0.2], seed=4)

logreg = LogisticRegression(featuresCol="features", labelCol="left_foot_preferred")
pipeline = Pipeline(stages=[assembler, logreg])
model = pipeline.fit(train_data)

predictions = model.transform(test_data)
evaluator = BinaryClassificationEvaluator(labelCol="left_foot_preferred")
accuracy = evaluator.evaluate(predictions)

print("Using the given features the models accuracy was", accuracy)

24/06/02 17:42:25 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS


Using the given features the models accuracy was 0.5807532534767708


Starting with fewer predictors I will try more 

In [19]:
feature_cols = ["weight_kg", "overall", "potential", "value_eur", "skill_moves", "weak_foot"] # Only using numerical values for prediction
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")

train_data, test_data = df.randomSplit([0.8, 0.2], seed=4)

logreg = LogisticRegression(featuresCol="features", labelCol="left_foot_preferred")
pipeline = Pipeline(stages=[assembler, logreg])
model = pipeline.fit(train_data)

predictions = model.transform(test_data)
evaluator = BinaryClassificationEvaluator(labelCol="left_foot_preferred")
accuracy = evaluator.evaluate(predictions)

print("Using the given features the models accuracy was", accuracy)

Using the given features the models accuracy was 0.590004449340374


This did not improve the prediction much I will try a different model 

In [20]:
rf = RandomForestClassifier(featuresCol="features", labelCol="left_foot_preferred", numTrees=1000)

pipeline = Pipeline(stages=[assembler, rf])

model = pipeline.fit(train_data)
predictions = model.transform(test_data)

evaluator = BinaryClassificationEvaluator(labelCol="left_foot_preferred")
accuracy = evaluator.evaluate(predictions)

print("Using the given features, the Random Forest model's accuracy was:", accuracy)


24/06/02 17:42:37 WARN DAGScheduler: Broadcasting large task binary with size 1027.3 KiB
24/06/02 17:42:40 WARN DAGScheduler: Broadcasting large task binary with size 2.1 MiB
24/06/02 17:42:44 WARN DAGScheduler: Broadcasting large task binary with size 4.1 MiB
24/06/02 17:42:49 WARN DAGScheduler: Broadcasting large task binary with size 1282.9 KiB
24/06/02 17:42:51 WARN DAGScheduler: Broadcasting large task binary with size 3.7 MiB
                                                                                

Using the given features, the Random Forest model's accuracy was: 0.5931607518558215


slight improvement 

In [21]:
feature_cols = ["weight_kg", "overall", "potential", "value_eur", "skill_moves"]
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
df_with_features = assembler.transform(df)

train_data, test_data = df_with_features.randomSplit([0.8, 0.2], seed=4)


svm = LinearSVC(featuresCol="features", labelCol="left_foot_preferred", maxIter=10, regParam=0.1)


svm_model = svm.fit(train_data)


predictions = svm_model.transform(test_data)


evaluator = BinaryClassificationEvaluator(labelCol="left_foot_preferred")
accuracy = evaluator.evaluate(predictions)

print("Using the given features, the SVM model's accuracy was:", accuracy)

predictions.select("left_foot_preferred", "prediction").show()


Using the given features, the SVM model's accuracy was: 0.5294394575443612
+-------------------+----------+
|left_foot_preferred|prediction|
+-------------------+----------+
|                1.0|       0.0|
|                0.0|       0.0|
|                0.0|       0.0|
|                0.0|       0.0|
|                0.0|       0.0|
|                0.0|       0.0|
|                1.0|       0.0|
|                1.0|       0.0|
|                0.0|       0.0|
|                0.0|       0.0|
|                0.0|       0.0|
|                0.0|       0.0|
|                0.0|       0.0|
|                0.0|       0.0|
|                1.0|       0.0|
|                1.0|       0.0|
|                0.0|       0.0|
|                1.0|       0.0|
|                0.0|       0.0|
|                0.0|       0.0|
+-------------------+----------+
only showing top 20 rows



It seems like it consistenly guessed the players were right footed which is why it was correct almost 50% of the time. Overall it does not seem like there are any predictors that can accurately predict if a players prefered foot is their left foot.