In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import plotly.graph_objects as go
import plotly.express as px
import plotly
import os

In [6]:
# Load the player attributes table into the working DataFrame.
# NOTE(review): the frame is named df_20 but the file is players_21.csv — confirm which season this covers.
players_csv_path = "/content/players_21.csv"
df_20 = pd.read_csv(players_csv_path)

In [None]:
Data Preprocessing & Feature Engineering

In [7]:
# Columns discarded before feature engineering; none of them are referenced
# by the later cells visible in this notebook.
columns_to_drop = [
    'sofifa_id',
    'player_url',
    'long_name',
    'body_type',
    'real_face',
    'nation_position',
    'nation_jersey_number',
]
df_20 = df_20.drop(columns=columns_to_drop)

In [None]:
1: Position Columns

In [8]:
# Per-position rating columns.
stats = [
    'ls', 'st', 'rs', 'lw', 'lf', 'cf', 'rf', 'rw', 'lam', 'cam', 'ram',
    'lm', 'lcm', 'cm', 'rcm', 'rm', 'lwb', 'ldm', 'cdm', 'rdm', 'rwb', 'lb',
    'lcb', 'cb', 'rcb', 'rb',
]

# For every rating column: keep only the text before the first "+",
# replace missing entries with 0, then cast the whole column to int.
# presumably the raw values look like "89+2" — verify against the CSV
df_20[stats] = (
    df_20[stats]
    .apply(lambda ratings: ratings.str.split("+", n=1).str[0])
    .fillna(0)
    .astype(int)
)

In [None]:
2: Player's Work Rate

In [9]:
# One-hot encode work_rate: one indicator column per distinct value,
# named after the value itself (no prefix).
work_rate_dummies = pd.get_dummies(df_20['work_rate'])
# Append the indicator columns, then remove the original text column.
df_20 = pd.concat([df_20, work_rate_dummies], axis=1).drop(columns=['work_rate'])

In [None]:
3: Player's DOB

In [10]:
# birth_month is the second "-"-separated field of dob, cast to int.
# assumes dob is a "YYYY-MM-DD"-style string — TODO confirm against the CSV
dob_fields = df_20["dob"].str.split("-", n=2)
df_20["birth_month"] = dob_fields.str[1].astype(int)

In [None]:
4: Player's Position

In [11]:
# Multi-hot encode player_positions: the column holds ", "-separated labels,
# and each distinct label becomes a 0/1 column prefixed with "Position_".
position_flags = (
    df_20['player_positions']
    .str.get_dummies(sep=', ')
    .add_prefix('Position_')
)
# Append the indicator columns, then remove the original player_positions column.
df_20 = pd.concat([df_20, position_flags], axis=1).drop(columns=['player_positions'])

In [None]:
BMI: New feature creation

In [12]:
# Body-mass index: weight (kg) divided by the square of height in metres.
height_m = df_20['height_cm'] / 100
df_20['bmi'] = df_20['weight_kg'] / height_m ** 2

In [None]:
 Missing Value Estimation

In [14]:
# Count missing values per column (largest first) and lay the result out as a
# two-column table: "Columns" holds the column name, "Value" the null count.
missing_data = (
    df_20.isnull()
    .sum()
    .sort_values(ascending=False)
    .rename_axis("Columns")
    .reset_index(name="Value")
)
# Null count expressed as a percentage of the total number of rows.
missing_data['Proportion'] = (missing_data['Value'] / len(df_20)) * 100
import plotly.express as px

# Assuming missing_data is your DataFrame with columns 'Columns' and 'Proportion'

# Keep only the columns where more than 10% of the rows are missing.
sample = missing_data.query('Proportion > 10')

# One bar per column, coloured by its missing percentage.
fig = px.bar(
    sample,
    x='Columns',
    y='Proportion',
    color='Proportion',
    color_continuous_scale=px.colors.sequential.Viridis_r,
    title='Percentage of Missing values in Columns',
    labels={'Proportion': 'Percentage'},
)

# Shared look used across the notebook's figures: near-white backgrounds, black Cambria text.
fig.update_layout(
    paper_bgcolor='rgba(255,255,255,0.8)',
    plot_bgcolor='rgba(255,255,255,0.8)',
    font={'family': 'Cambria, monospace', 'size': 12, 'color': '#000000'},
)

fig.show()


In [None]:
Fill Missing Values

In [15]:
# Skill-group columns whose gaps are filled with that column's own median.
cols = ["dribbling", "defending", "physic", "passing", "shooting", "pace"]
# fillna with a Series keyed by column name fills each column with its matching median.
df_20[cols] = df_20[cols].fillna(df_20[cols].median())
# Every remaining missing value, in any column, becomes 0.
df_20 = df_20.fillna(0)

In [None]:
Exploratory Data Analysis

In [None]:
Scatter Plot (colored by Age) year 2020 - Overall Rating vs Value in Euros

In [16]:
import plotly.express as px

# Overall rating against market value; age drives both marker colour and marker size,
# and the player's short name is added to the hover tooltip.
fig = px.scatter(
    df_20,
    x='overall',
    y='value_eur',
    color='age',
    size='age',
    color_continuous_scale='Plasma',
    title='Styled Scatter Plot (colored by Age) year 2020 - Overall Rating vs Value in Euros',
    labels={'overall': 'Overall Rating', 'value_eur': 'Value in Euros'},
    hover_data=['short_name'],
)

# Near-white backgrounds with black Cambria text.
fig.update_layout(
    paper_bgcolor='rgba(255,255,255,0.8)',
    plot_bgcolor='rgba(255,255,255,0.8)',
    font={'family': 'Cambria, monospace', 'size': 12, 'color': '#000000'},
)

fig.show()


In [None]:
Bar Chart - Nationality vs Overall

In [20]:
import plotly.graph_objs as go
import plotly.offline as pyo

# Bar chart of each player's overall rating, placed on the x-axis by nationality.
# Rows are ordered by nationality first so that bars for the same country sit together.
sample = df_20.sort_values(by='nationality')

fig = go.Figure()

fig.add_trace(go.Bar(
    x=sample['nationality'],
    y=sample['overall'],
    # Bar colour encodes the same overall rating, with a colour bar shown alongside.
    marker=dict(
        color=sample['overall'],
        colorscale='Viridis',
        showscale=True
    ),
    # Player short name attached to each bar as its text.
    text=sample['short_name']
))

fig.update_layout(title='Styled Bar Chart - Nationality vs Overall',
                  xaxis_title='Nationality',
                  yaxis_title='Overall Rating',
                  paper_bgcolor='rgba(255,255,255,0.8)',
                  plot_bgcolor='rgba(255,255,255,0.8)',
                  font=dict(family='Cambria, monospace', size=12, color='#000000'))

# Render with fig.show(), like the other figures in this notebook, instead of the
# legacy plotly.offline.iplot call (its `filename` argument is only used for image export).
fig.show()


In [None]:
Box Plot (with All Points) - Overall Rating vs BMI

In [21]:
import plotly.graph_objs as go
import plotly.offline as pyo

# Box plot of BMI grouped by overall rating, with every individual player drawn as a point.
sample = df_20.sort_values(by='overall')

fig = go.Figure()

fig.add_trace(go.Box(
    x=sample['overall'],
    y=sample['bmi'],
    name="Suspected Outliers",
    boxpoints='all',  # show all points
    jitter=0.3,  # add jitter for better visibility of points
    pointpos=-1.8,  # adjust position of points
    # NOTE(review): the outliercolor/outlierwidth settings below are documented by Plotly
    # alongside boxpoints='suspectedoutliers'; with boxpoints='all' they may have no
    # visible effect — confirm which mode is intended.
    marker=dict(
        size=8,
        color='rgba(31, 119, 180, 0.7)',
        outliercolor='rgba(255, 0, 0, 0.6)',
        line=dict(
            outliercolor='rgba(255, 0, 0, 0.6)',
            outlierwidth=2)),
    # NOTE(review): the box outline is white at 30% opacity on a near-white background,
    # so the boxes themselves are hard to see — confirm this is deliberate.
    line_color='rgba(255, 255, 255, 0.3)',
    text=sample['short_name']
))

fig.update_layout(title='Styled Box Plot (with All Points) - Overall Rating vs BMI',
                  xaxis_title='Overall Rating',
                  yaxis_title='BMI',
                  paper_bgcolor='rgba(255,255,255,0.8)',
                  plot_bgcolor='rgba(255,255,255,0.8)',
                  font=dict(family='Cambria, monospace', size=12, color='#000000'),
                  xaxis_rangeslider_visible=True)

# Render with fig.show(), like the other figures in this notebook, instead of the
# legacy plotly.offline.iplot call (its `filename` argument is only used for image export).
fig.show()


In [None]:
Proportion of Players per Position

In [22]:
# team_position labels treated as attacking roles.
attack = ['RW', 'LW', 'ST', 'CF', 'LS', 'RS', 'RF', 'LF']
# Players whose team_position is one of those roles.
sample = df_20[df_20['team_position'].isin(attack)]

# Pie chart: each slice is the share of attackers holding a given team_position.
fig = px.pie(
    sample,
    names='team_position',
    color_discrete_sequence=px.colors.sequential.Plasma_r,
    title='Percentage of players in Attacker Role',
)
# Put the percentage and the position label inside each slice.
fig.update_traces(textposition='inside', textinfo='percent+label')
# Transparent backgrounds with black Cambria text.
fig.update_layout(
    paper_bgcolor='rgba(0,0,0,0)',
    plot_bgcolor='rgba(0,0,0,0)',
    font={'family': 'Cambria, monospace', 'size': 12, 'color': '#000000'},
)
fig.show()