In [1]:
# Importing all the relevant libraries 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, median_absolute_error
from sklearn.ensemble import RandomForestRegressor

In [2]:
# load the dataset
fifa19_player_df = pd.read_csv(r'C:\Users\JAINMR1\Desktop\data.csv')
fifa19_player_df.head()

Unnamed: 0.1,Unnamed: 0,ID,Name,Age,Photo,Nationality,Flag,Overall,Potential,Club,...,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Release Clause
0,0,158023,L. Messi,31,https://cdn.sofifa.org/players/4/19/158023.png,Argentina,https://cdn.sofifa.org/flags/52.png,94,94,FC Barcelona,...,96.0,33.0,28.0,26.0,6.0,11.0,15.0,14.0,8.0,€226.5M
1,1,20801,Cristiano Ronaldo,33,https://cdn.sofifa.org/players/4/19/20801.png,Portugal,https://cdn.sofifa.org/flags/38.png,94,94,Juventus,...,95.0,28.0,31.0,23.0,7.0,11.0,15.0,14.0,11.0,€127.1M
2,2,190871,Neymar Jr,26,https://cdn.sofifa.org/players/4/19/190871.png,Brazil,https://cdn.sofifa.org/flags/54.png,92,93,Paris Saint-Germain,...,94.0,27.0,24.0,33.0,9.0,9.0,15.0,15.0,11.0,€228.1M
3,3,193080,De Gea,27,https://cdn.sofifa.org/players/4/19/193080.png,Spain,https://cdn.sofifa.org/flags/45.png,91,93,Manchester United,...,68.0,15.0,21.0,13.0,90.0,85.0,87.0,88.0,94.0,€138.6M
4,4,192985,K. De Bruyne,27,https://cdn.sofifa.org/players/4/19/192985.png,Belgium,https://cdn.sofifa.org/flags/7.png,91,92,Manchester City,...,88.0,68.0,58.0,51.0,15.0,13.0,5.0,10.0,13.0,€196.4M


In [3]:
# Identifying the rows and columns
fifa19_player_df.shape[0]

18207

#### The 3 business questions I am trying to answer are as follows: - 
Q1: Which clubs spend the maximum $ per player?

Q2: What is the relationship between age and performance?

Q3: What’s the age distribution like? How is it related to the player’s overall rating?

Q4: What are the parameters driving the potential of a player?

In [4]:
# Datatype of each column
fifa19_player_df.info()number


SyntaxError: invalid syntax (<ipython-input-4-11afd21e66c9>, line 2)

In [None]:
# What kind information is provied?
fifa19_player_df.columns

In [None]:
# Descriptive Statistics. Identifying the number of missing values
fifa19_player_df.isnull().sum()

In [None]:
# Drop unused columns which should not be used for the analysis
columns_to_drop = ['Unnamed: 0', 'ID', 'Photo', 'Flag','Club Logo', 'Preferred Foot', 
                   'Body Type', 'Real Face', 'Jersey Number', 'Joined', 'Loaned From',
                   'Contract Valid Until', 'Height', 'Weight','LS', 'ST', 'RS', 'LW', 'LF', 'CF', 'RF', 'RW',
                   'LAM', 'CAM', 'RAM', 'LM', 'LCM', 'CM', 'RCM', 'RM', 'LWB', 'LDM',
                   'CDM', 'RDM', 'RWB', 'LB', 'LCB', 'CB', 'RCB', 'RB', 'Release Clause']

fifa19_player_df.drop(columns_to_drop, axis=1, inplace=True)

In [None]:
# Converting euro amounts in M and K (string) into numbers
"""
Description: This function is used to convert euro values present as string(in thousands and millions) into numbers.


Arguments:
   amount: Euro  values present in thousands and millions

Returns:
    Euros values in numbers
"""
def str2number(amount):
    if amount[-1] == 'M':
        return float(amount[1:-1])*1000000
    elif amount[-1] == 'K':
        return float(amount[1:-1])*1000
    else:
        return float(amount[1:])

In [None]:
# Convert value, wage to number
# First convert value, wage string to actual amount, then divide by 1 million and 1 k respectively
fifa19_player_df['Value_M'] = fifa19_player_df['Value'].apply(lambda x: str2number(x) / 1000000)
fifa19_player_df['Wage_K'] = fifa19_player_df['Wage'].apply(lambda x: str2number(x) / 1000)

# Drop original value & wage column
fifa19_player_df.drop(['Value', 'Wage'], axis=1, inplace=True)

In [None]:
# From below we can see that highest player value is 118.8m and wage is 565k
# Which are Neymar Jr and L. Messi respectively.
# Exploratory data analysis: Finding the mean, count and standard deviation of the raw data before moving into the modeling phase
fifa19_player_df.describe()

In [None]:
club_wages = fifa19_player_df.groupby('Club').sum()

#### Q1: Which clubs spend the maximum euros per player?

In [None]:
club_player_count = fifa19_player_df.groupby('Club').count().sort_values('Age', ascending=False)
c = club_player_count.merge(club_wages, how = "inner", on = "Club")
c["cost_per_player"] = c["Wage_K_y"]/c["Age_x"]
c = c.sort_values("cost_per_player", ascending = False)
c.head(10)


#### Real Madrid, FC Barcalona spend the maximum $ payer

In [None]:
# Number of clubs and average number of players in each club
print('Number of clubs is {}'.format(club_player_count.shape[0]))
print('Average number players in each club is {}'.format(round(club_player_count['Age'].mean(),2)))
club_player_count.head(10)

In [None]:
# Calculate average overall rating
age_mean = fifa19_player_df.groupby('Age').mean()

In [None]:
# Plot age distribution and overall rating together
age_count_list = age_count.values.tolist()
age_overall_rating_list = age_mean['Overall'].values.tolist()

In [None]:

ages = age_count.index.values.tolist()
fig = plt.figure()
ax1 = fig.add_subplot(111)
ax1.plot(ages,age_overall_rating_list, color = 'red', label='Average Rating')
ax1.legend(loc=1)
ax1.set_ylabel('Average Rating')

ax2 = ax1.twinx()
plt.bar(ages, age_count_list, label='Age Count')
ax2.legend(loc=2)
ax2.set_ylabel('Age Count')
plt.show()

#### Q2: What is the relationship between age and performance?

In [None]:
# Drop the 48 players whose skill set is missing.
fifa19_player_df.dropna(axis=0, how='any', inplace=True)

In [None]:
# Work Rate is in the format of attack work rate/defence work rate
# so need to create two new columns here.
fifa19_player_df['Work Rate Attack'] = fifa19_player_df['Work Rate'].map(lambda x: x.split('/')[0])
fifa19_player_df['Work Rate Defence'] = fifa19_player_df['Work Rate'].map(lambda x: x.split('/')[1])
fifa19_player_df.drop('Work Rate', axis=1, inplace=True)

In [None]:
fifa19_player_df.head()

#### Performance increases till a certain age and then starts detoriating. For few older players, we see higher performance as well.

In [None]:
# One Hot Encoding for Position, Work Rate Attack, Work Rate Defence
one_hot_columns = ['Position', 'Work Rate Attack', 'Work Rate Defence']
fifa19_player_df = pd.get_dummies(fifa19_player_df, columns=one_hot_columns, prefix = one_hot_columns)

#### 3. What are the parameters driving the potential of a player?

In [None]:
fifa19_player_df.shape

In [None]:
#DEfining the X and Y variables
y = fifa19_player_df['Potential']
X = fifa19_player_df.drop(['Value_M', 'Wage_K', 'Potential', 'Overall'], axis=1)

In [None]:
# Spliting the datasets into test and training sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=42)

In [None]:
# Running Random Forest for regression and calucating key metrics for evaluation
ForestRegressor = RandomForestRegressor(n_estimators=500)
ForestRegressor.fit(X_train, y_train)
y_test_preds = ForestRegressor.predict(X_test)
print(r2_score(y_test, y_test_preds))
print(mean_squared_error(y_test, y_test_preds))

In [None]:
#Idenitfying the variable importance
coefs_df = pd.DataFrame()

coefs_df['Features'] = X_train.columns
coefs_df['Coefs'] = ForestRegressor.feature_importances_
coefs_df.sort_values('Coefs', ascending=False).head(10)

In [None]:
# Ploting the variable of importance
coefs_df.set_index('Features', inplace=True)
coefs_df.sort_values('Coefs', ascending=False).head(5).plot(kind='bar', color='green')

#### The key parameters driving potential are Ball control, reaction and age.