**Fifa World Cup 2018 Players Exploratory Data Analysis with PySpark**

# Imports

In [1]:
import pyspark
from pyspark.sql           import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types     import *
from pyspark.sql           import SQLContext


import seaborn as sns
import pandas  as pd

from IPython.core.display    import HTML
from IPython.display         import Image

## Helper Functions/Variables

In [2]:
# function to show dataframe dimensions
def sparkShape(dataFrame):
    return (dataFrame.count(), len(dataFrame.columns))

# functions to change jupyter html for better reading and plots
def jupyter_settings():
    %matplotlib inline
    %pylab inline
    
    plt.style.use( 'bmh' )
    plt.rcParams['figure.figsize'] = [25, 12]
    plt.rcParams['font.size'] = 24
    
    display( HTML( '<style>.container { width:100% !important; }</style>') )
    pd.options.display.max_columns = None
    pd.options.display.max_rows = None
    pd.set_option( 'display.expand_frame_repr', False )
    
    sns.set()

In [3]:
# Creating/Initiating Spark Session
# SpakSession: it's useed to work with Spark
spSession = (SparkSession.builder.master('local')
                                .appName('fifa_analytics_01')
                                .getOrCreate()
            )

In [4]:
# home path
home_path = '/home/marxcerqueira/repos/fifawc-2018-players-EDA'

In [5]:
spSession

In [7]:
jupyter_settings()

%pylab is deprecated, use %matplotlib inline and import the required libraries.
Populating the interactive namespace from numpy and matplotlib


# Loading Data

**Context**
The 2018 World Cup is an international football tournament.

**Data**
Confirmed list of FIFA 2018 World Cup players by country

**Acknowledgements**
Source: https://img.fifa.com/image/upload/hzfqyndmnqazczvc5xdb.pdf

Banner Image: https://unsplash.com/photos/ChI4eUGTpeY

Inspiration
Who will the best team be in the 2018 World Cup?

**Notes:**
- With Spark 2.0 a new class org.apache.spark.sql.SparkSession has been introduced which is a combined class for all different contexts we used to have prior to 2.0 release hence SparkSession will be used in replace with SQLContext, HiveContext.

In [8]:
#loading trainig data and creating a RDD in memory with Spark
df = spSession.read.csv(home_path + '/data/wc2018-players.csv', header=True, inferSchema=True)

In [9]:
#creating Spark SQL Context to work with SQL
sqlContext = SQLContext(sc)
teste = sqlContext.read.csv(home_path + '/data/wc2018-players.csv', header=True, inferSchema=True)

In [10]:
type(teste)

pyspark.sql.dataframe.DataFrame

In [11]:
#check df type
type(df)

pyspark.sql.dataframe.DataFrame

In [12]:
# display 5 firsts rows of the dataframe
df.show(5)

+---------+---+----+------------------+----------+----------+--------------------+------+------+
|     Team|  #|Pos.| FIFA Popular Name|Birth Date|Shirt Name|                Club|Height|Weight|
+---------+---+----+------------------+----------+----------+--------------------+------+------+
|Argentina|  3|  DF|TAGLIAFICO Nicolas|31.08.1992|TAGLIAFICO|      AFC Ajax (NED)|   169|    65|
|Argentina| 22|  MF|    PAVON Cristian|21.01.1996|     PAVÓN|CA Boca Juniors (...|   169|    65|
|Argentina| 15|  MF|    LANZINI Manuel|15.02.1993|   LANZINI|West Ham United F...|   167|    66|
|Argentina| 18|  DF|    SALVIO Eduardo|13.07.1990|    SALVIO|    SL Benfica (POR)|   167|    69|
|Argentina| 10|  FW|      MESSI Lionel|24.06.1987|     MESSI|  FC Barcelona (ESP)|   170|    72|
+---------+---+----+------------------+----------+----------+--------------------+------+------+
only showing top 5 rows



In [13]:
teste.show(5)

+---------+---+----+------------------+----------+----------+--------------------+------+------+
|     Team|  #|Pos.| FIFA Popular Name|Birth Date|Shirt Name|                Club|Height|Weight|
+---------+---+----+------------------+----------+----------+--------------------+------+------+
|Argentina|  3|  DF|TAGLIAFICO Nicolas|31.08.1992|TAGLIAFICO|      AFC Ajax (NED)|   169|    65|
|Argentina| 22|  MF|    PAVON Cristian|21.01.1996|     PAVÓN|CA Boca Juniors (...|   169|    65|
|Argentina| 15|  MF|    LANZINI Manuel|15.02.1993|   LANZINI|West Ham United F...|   167|    66|
|Argentina| 18|  DF|    SALVIO Eduardo|13.07.1990|    SALVIO|    SL Benfica (POR)|   167|    69|
|Argentina| 10|  FW|      MESSI Lionel|24.06.1987|     MESSI|  FC Barcelona (ESP)|   170|    72|
+---------+---+----+------------------+----------+----------+--------------------+------+------+
only showing top 5 rows



# Data Description

## Rename Columns

In [14]:
# rename columns
df = df.withColumnRenamed('Team', 'team')\
.withColumnRenamed('#', 'jersey_number')\
.withColumnRenamed('Pos.', 'position')\
.withColumnRenamed('FIFA Popular Name', 'players_name')\
.withColumnRenamed('Birth Date', 'birth_date')\
.withColumnRenamed('Shirt Name', 'jersey_name')\
.withColumnRenamed('Club', 'club')\
.withColumnRenamed('Height', 'height')\
.withColumnRenamed('Weight', 'weight')

## Data Dimension

In [15]:
# dataframe dimensions
pyspark.sql.dataframe.DataFrame.shape = sparkShape
df.shape()

(736, 9)

In [16]:
sparkShape(df)

(736, 9)

## Data Types

In [17]:
# check columns types
df.printSchema()

root
 |-- team: string (nullable = true)
 |-- jersey_number: integer (nullable = true)
 |-- position: string (nullable = true)
 |-- players_name: string (nullable = true)
 |-- birth_date: string (nullable = true)
 |-- jersey_name: string (nullable = true)
 |-- club: string (nullable = true)
 |-- height: integer (nullable = true)
 |-- weight: integer (nullable = true)



In [18]:
# other way to check it
df.dtypes

[('team', 'string'),
 ('jersey_number', 'int'),
 ('position', 'string'),
 ('players_name', 'string'),
 ('birth_date', 'string'),
 ('jersey_name', 'string'),
 ('club', 'string'),
 ('height', 'int'),
 ('weight', 'int')]

## Check NA Values

In [19]:
#check na using pandas
df.toPandas().isna().sum()

team             0
jersey_number    0
position         0
players_name     0
birth_date       0
jersey_name      0
club             0
height           0
weight           0
dtype: int64

In [20]:
#using pyspark filter
for column in df.columns:
    print(column, df.filter(df[column].isNull()).count())

team 0
jersey_number 0
position 0
players_name 0
birth_date 0
jersey_name 0
club 0
height 0
weight 0


## Fillout NAs

## Change Dtypes

In [90]:
# replace . to - in birth_date column (str)
df1 = df.withColumn('birth_date', translate('birth_date', '.', '-'))

In [91]:
# change birth date dtype to datetime
df1 = df1.withColumn('birth_date', to_date('birth_date', 'dd-MM-yyyy'))

In [69]:
# checking dtypes after conversion
df1.dtypes

[('team', 'string'),
 ('jersey_number', 'int'),
 ('position', 'string'),
 ('players_name', 'string'),
 ('birth_date', 'date'),
 ('jersey_name', 'string'),
 ('club', 'string'),
 ('height', 'int'),
 ('weight', 'int')]

In [70]:
df1.show(5)

+---------+-------------+--------+------------------+----------+-----------+--------------------+------+------+
|     team|jersey_number|position|      players_name|birth_date|jersey_name|                club|height|weight|
+---------+-------------+--------+------------------+----------+-----------+--------------------+------+------+
|Argentina|            3|      DF|TAGLIAFICO Nicolas|1992-08-31| TAGLIAFICO|      AFC Ajax (NED)|   169|    65|
|Argentina|           22|      MF|    PAVON Cristian|1996-01-21|      PAVÓN|CA Boca Juniors (...|   169|    65|
|Argentina|           15|      MF|    LANZINI Manuel|1993-02-15|    LANZINI|West Ham United F...|   167|    66|
|Argentina|           18|      DF|    SALVIO Eduardo|1990-07-13|     SALVIO|    SL Benfica (POR)|   167|    69|
|Argentina|           10|      FW|      MESSI Lionel|1987-06-24|      MESSI|  FC Barcelona (ESP)|   170|    72|
+---------+-------------+--------+------------------+----------+-----------+--------------------+------+

## Descriptive Statistics

In [71]:
# numeric features
num_attributes_list = [item[0] for item in df.dtypes if item[1].startswith('int')]
df.describe(num_attributes_list).show()

+-------+-----------------+-----------------+-----------------+
|summary|    jersey_number|           height|           weight|
+-------+-----------------+-----------------+-----------------+
|  count|              736|              736|              736|
|   mean|             12.0|182.4076086956522|77.18885869565217|
| stddev|6.637760461599851|6.930924233929302|7.233778346883639|
|    min|                1|              165|               59|
|    max|               23|              201|               99|
+-------+-----------------+-----------------+-----------------+



In [43]:
# other way
df.select(num_attributes_list).describe().show()

+-------+-----------------+-----------------+-----------------+
|summary|    jersey_number|           height|           weight|
+-------+-----------------+-----------------+-----------------+
|  count|              736|              736|              736|
|   mean|             12.0|182.4076086956522|77.18885869565217|
| stddev|6.637760461599851|6.930924233929302|7.233778346883639|
|    min|                1|              165|               59|
|    max|               23|              201|               99|
+-------+-----------------+-----------------+-----------------+



In [44]:
# categorical features
cat_attributes_list = [item[0] for item in df.dtypes if item[1].startswith('str')]
df.describe(cat_attributes_list).show()

+-------+---------+--------+------------+----------+-----------+--------------------+
|summary|     team|position|players_name|birth_date|jersey_name|                club|
+-------+---------+--------+------------+----------+-----------+--------------------+
|  count|      736|     736|         736|       736|        736|                 736|
|   mean|     null|    null|        null|      null|       null|                null|
| stddev|     null|    null|        null|      null|       null|                null|
|    min|Argentina|      DF|ABDALLA SAID|01.01.1984|  A. ASHRAF|    1. FC Köln (GER)|
|    max|  Uruguay|      MF|ZUBER Steven|31.10.1997|   ŽIVKOVIĆ|Étoile du Sahel (...|
+-------+---------+--------+------------+----------+-----------+--------------------+



# Data Filtering

In [45]:
df.columns

['team',
 'jersey_number',
 'position',
 'players_name',
 'birth_date',
 'jersey_name',
 'club',
 'height',
 'weight']

## ways of selecting specific columns for analysis

In [46]:
# using select, and we can organize by order of the column
df.select('players_name', 'team', 'club').show(5)

+------------------+---------+--------------------+
|      players_name|     team|                club|
+------------------+---------+--------------------+
|TAGLIAFICO Nicolas|Argentina|      AFC Ajax (NED)|
|    PAVON Cristian|Argentina|CA Boca Juniors (...|
|    LANZINI Manuel|Argentina|West Ham United F...|
|    SALVIO Eduardo|Argentina|    SL Benfica (POR)|
|      MESSI Lionel|Argentina|  FC Barcelona (ESP)|
+------------------+---------+--------------------+
only showing top 5 rows



In [47]:
df.select(col('players_name'), col('team')).show(5)

+------------------+---------+
|      players_name|     team|
+------------------+---------+
|TAGLIAFICO Nicolas|Argentina|
|    PAVON Cristian|Argentina|
|    LANZINI Manuel|Argentina|
|    SALVIO Eduardo|Argentina|
|      MESSI Lionel|Argentina|
+------------------+---------+
only showing top 5 rows



In [48]:
df.select(df['players_name'], df['team']).show(5)

+------------------+---------+
|      players_name|     team|
+------------------+---------+
|TAGLIAFICO Nicolas|Argentina|
|    PAVON Cristian|Argentina|
|    LANZINI Manuel|Argentina|
|    SALVIO Eduardo|Argentina|
|      MESSI Lionel|Argentina|
+------------------+---------+
only showing top 5 rows



In [49]:
# calling a column with alias
df.select(df['team'].alias('time')).show(5)

+---------+
|     time|
+---------+
|Argentina|
|Argentina|
|Argentina|
|Argentina|
|Argentina|
+---------+
only showing top 5 rows



In [50]:
#checkin country names in the dataset
df.select('team').distinct().orderBy('team').show(25)

+--------------+
|          team|
+--------------+
|     Argentina|
|     Australia|
|       Belgium|
|        Brazil|
|      Colombia|
|    Costa Rica|
|       Croatia|
|       Denmark|
|         Egypt|
|       England|
|        France|
|       Germany|
|       IR Iran|
|       Iceland|
|         Japan|
|Korea Republic|
|        Mexico|
|       Morocco|
|       Nigeria|
|        Panama|
|          Peru|
|        Poland|
|      Portugal|
|        Russia|
|  Saudi Arabia|
+--------------+
only showing top 25 rows



## Filtering dataframe

In [51]:
# get only brazilian team
df.filter('team = "Brazil"').show(10)

+------+-------------+--------+-----------------+----------+-----------+--------------------+------+------+
|  team|jersey_number|position|     players_name|birth_date|jersey_name|                club|height|weight|
+------+-------------+--------+-----------------+----------+-----------+--------------------+------+------+
|Brazil|           18|      MF|             FRED|05.03.1993|       FRED|FC Shakhtar Donet...|   169|    64|
|Brazil|           21|      FW|           TAISON|13.01.1988|     TAISON|FC Shakhtar Donet...|   172|    64|
|Brazil|           17|      MF|      FERNANDINHO|04.05.1985|FERNANDINHO|Manchester City F...|   179|    67|
|Brazil|           22|      DF|           FAGNER|11.06.1989|     FAGNER|SC Corinthians (BRA)|   168|    67|
|Brazil|           10|      FW|           NEYMAR|05.02.1992|  NEYMAR JR|Paris Saint-Germa...|   175|    68|
|Brazil|           11|      MF|PHILIPPE COUTINHO|12.06.1992|P. COUTINHO|  FC Barcelona (ESP)|   172|    68|
|Brazil|            7|      

In [52]:
# number of brazilian players
df.filter('team = "Brazil"').count()

23

In [53]:
# filtering with 2 conditions
# get argentinians taller than 180 (& condition)
df.filter((df['team']=='Argentina') & (df['height'] > 180)).orderBy('height', ascending = False).show()

+---------+-------------+--------+------------------+----------+-----------+--------------------+------+------+
|     team|jersey_number|position|      players_name|birth_date|jersey_name|                club|height|weight|
+---------+-------------+--------+------------------+----------+-----------+--------------------+------+------+
|Argentina|            6|      DF|    FAZIO Federico|17.03.1987|      FAZIO|       AS Roma (ITA)|   199|    85|
|Argentina|            1|      GK|     GUZMAN Nahuel|10.02.1986|     GUZMÁN|   Tigres UANL (MEX)|   192|    90|
|Argentina|           12|      GK|     ARMANI Franco|16.10.1986|     ARMANI|CA River Plate (ARG)|   189|    85|
|Argentina|           16|      DF|       ROJO Marcos|20.03.1990|       ROJO|Manchester United...|   189|    82|
|Argentina|           23|      GK|CABALLERO Wilfredo|28.09.1981|  CABALLERO|    Chelsea FC (ENG)|   186|    80|
|Argentina|            9|      FW|   HIGUAIN Gonzalo|10.12.1987|    HIGUAÍN|   Juventus FC (ITA)|   184|

In [54]:
# other way of filtering with 2 conditions
df.filter(df['team'] == 'Brazil').filter(df['jersey_number'] < 10).show()

+------+-------------+--------+--------------+----------+-----------+--------------------+------+------+
|  team|jersey_number|position|  players_name|birth_date|jersey_name|                club|height|weight|
+------+-------------+--------+--------------+----------+-----------+--------------------+------+------+
|Brazil|            7|      FW| DOUGLAS COSTA|14.09.1990|   D. COSTA|   Juventus FC (ITA)|   182|    70|
|Brazil|            6|      DF|   FILIPE LUIS|09.08.1985|FILIPE LUIS|Atletico Madrid (...|   182|    73|
|Brazil|            9|      FW| GABRIEL JESUS|03.04.1997|   G. JESUS|Manchester City F...|   175|    73|
|Brazil|            3|      DF|       MIRANDA|07.09.1984|    MIRANDA|FC Internazionale...|   186|    78|
|Brazil|            2|      DF|  THIAGO SILVA|22.09.1984|   T. SILVA|Paris Saint-Germa...|   183|    79|
|Brazil|            4|      DF| PEDRO GEROMEL|21.09.1985|    GEROMEL|   Grêmio FBPA (BRA)|   190|    84|
|Brazil|            5|      MF|      CASEMIRO|23.02.199

In [55]:
# using OR condition
df.filter((df['players_name'] == 'MESSI Lionel') | (df['players_name'] == 'SALVIO Eduardo') | (df['height'] >= 199)).show()

+---------+-------------+--------+------------------+----------+-----------+--------------------+------+------+
|     team|jersey_number|position|      players_name|birth_date|jersey_name|                club|height|weight|
+---------+-------------+--------+------------------+----------+-----------+--------------------+------+------+
|Argentina|           18|      DF|    SALVIO Eduardo|13.07.1990|     SALVIO|    SL Benfica (POR)|   167|    69|
|Argentina|           10|      FW|      MESSI Lionel|24.06.1987|      MESSI|  FC Barcelona (ESP)|   170|    72|
|Argentina|            6|      DF|    FAZIO Federico|17.03.1987|      FAZIO|       AS Roma (ITA)|   199|    85|
|  Belgium|            1|      GK|  COURTOIS Thibaut|11.05.1992|   COURTOIS|    Chelsea FC (ENG)|   199|    91|
|  Croatia|           12|      GK|     KALINIC Lovre|03.04.1990| L. KALINIĆ|      KAA Gent (BEL)|   201|    96|
|  Denmark|            3|      DF|VESTERGAARD Jannik|03.08.1992|VESTERGAARD|VfL Borussia Mönc...|   200|

In [56]:
# using & e |
# getting all defensive brazilian players or belgium player with 199 of height
df.filter((df['position'] == 'DF') & (df['team']== 'Brazil') | (df['height'] == 199) & (df['team'] == 'Belgium')).show()

+-------+-------------+--------+----------------+----------+-----------+--------------------+------+------+
|   team|jersey_number|position|    players_name|birth_date|jersey_name|                club|height|weight|
+-------+-------------+--------+----------------+----------+-----------+--------------------+------+------+
|Belgium|            1|      GK|COURTOIS Thibaut|11.05.1992|   COURTOIS|    Chelsea FC (ENG)|   199|    91|
| Brazil|           22|      DF|          FAGNER|11.06.1989|     FAGNER|SC Corinthians (BRA)|   168|    67|
| Brazil|            6|      DF|     FILIPE LUIS|09.08.1985|FILIPE LUIS|Atletico Madrid (...|   182|    73|
| Brazil|           13|      DF|      MARQUINHOS|14.05.1994| MARQUINHOS|Paris Saint-Germa...|   183|    75|
| Brazil|            3|      DF|         MIRANDA|07.09.1984|    MIRANDA|FC Internazionale...|   186|    78|
| Brazil|           14|      DF|          DANILO|15.07.1991|     DANILO|Manchester City F...|   184|    78|
| Brazil|            2|     

## Feature Engineering

In [96]:
# feature engineering

# creating new columns due business coinstraints
df2 = df1.withColumn('world_cup', lit(2018))

# IMC
df2 = df2.withColumn('IMC', lit(df2['weight']/((df2['height']/100)*(df2['height']/100))))

#birth_year
df2 = df2.withColumn('birth_year', year(df2['birth_date']))
# df1 = df_student.withColumn('birth_month',month(df_student.birthday))

# birth month
df2 = df2.withColumn('birth_month', month(df2['birth_date']))

# day of year
df2 = df2.withColumn("birth_month",to_timestamp(col("birth_month"))).withColumn("D_O_Y", date_format(col("birth_month"), "D"))

# week of year
# df2 = df2.withColumn('week_of_year',weekofyear(df2.birth_month))


In [97]:
df2.show(50)

+---------+-------------+--------+------------------+----------+-----------+--------------------+------+------+---------+------------------+----------+-------------------+-----+------------+
|     team|jersey_number|position|      players_name|birth_date|jersey_name|                club|height|weight|world_cup|               IMC|birth_year|        birth_month|D_O_Y|week_of_year|
+---------+-------------+--------+------------------+----------+-----------+--------------------+------+------+---------+------------------+----------+-------------------+-----+------------+
|Argentina|            3|      DF|TAGLIAFICO Nicolas|1992-08-31| TAGLIAFICO|      AFC Ajax (NED)|   169|    65|     2018|22.758306781975424|      1992|1969-12-31 21:00:08|  365|           1|
|Argentina|           22|      MF|    PAVON Cristian|1996-01-21|      PAVÓN|CA Boca Juniors (...|   169|    65|     2018|22.758306781975424|      1996|1969-12-31 21:00:01|  365|           1|
|Argentina|           15|      MF|    LANZINI

In [None]:
# create columns with condition (using function substring)
df.withColumn('sub', substring(df['team'], 1, 3)).show(5)

In [None]:
# create columns 'year'
df.withColumn('year', substring(df['birth_date'], -4, 4)).show(5)

In [None]:
# creating columns with concat
df.withColumn('concat', concat(df['team'], df['jersey_name'])).show(3)

In [None]:
# creating columns with concat_ws (with separator)
df.withColumn('concat', concat_ws(' - ',df['team'], df['jersey_name'])).show(3)