# Fifa Player analysis

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as mno
import datetime
import ipywidgets as widgets
from ipywidgets import interact
from ipywidgets import interact_manual
import missingno as msno

plt.style.use('bmh')

In [None]:
# read data, check execution time and display shape of data
%time data = pd.read_csv('fifa_data.csv')
print(data.shape)

In [None]:
# display column headers
data.columns

# Checking for missing values

In [None]:
#check for missing values 
def checking_m(df):
    """Summarise the missing values of every column in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with two columns:
        ``Missing_Number`` (count of null cells) and ``Missing_Percent``
        (null cells divided by the number of rows, i.e. a 0-1 fraction),
        ordered from most to least missing.
    """
    # Work on the argument; the previous body read the notebook-global
    # `data`, so the function ignored whatever frame it was given.
    null_mask = df.isnull()
    null_v = null_mask.sum().sort_values(ascending=False)
    null_percent = (null_mask.sum() / null_mask.count()).sort_values(ascending=False)
    return pd.concat([null_v, null_percent], axis=1, keys=['Missing_Number', 'Missing_Percent'])

checking_m(data)

In [None]:
# Lets plot missing values by using missingno
msno.matrix(data,color=(0.100, 0.100, 0.100))
# we are having lots of missing values

#
Some rows are almost entirely empty (only a few fields are populated). Let's find their index labels and remove them. To locate them we use the Height and Weight columns, because these two columns contain no missing values except in those rows.

In [None]:
# Index labels of the rows whose Height / Weight value is missing.
miss_height = data.index[data['Height'].isnull()].tolist()
miss_weight = data.index[data['Weight'].isnull()].tolist()

# Check whether the dataset contains (almost) completely empty rows: the same
# rows must be missing both values. The list must also be non-empty, because
# two empty lists compare equal and would wrongly report that such rows exist.
if miss_height and miss_height == miss_weight:
    print('Yes,empty rows exist')
else:
    print('Not found')

In [None]:
# The rows collected in `miss_height` are (almost) completely empty, so remove them.
# `miss_height` holds index *labels*; drop by label directly instead of feeding
# the labels through the positional lookup `data.index[...]`, which only gives
# the same rows while the index is the default 0..n-1 range.
data.drop(index=miss_height, inplace=True)

In [None]:
# lets check if we have any duplicate rows?
#check for dublicated rows
print("Numbers of duplicated rows :",data.duplicated().sum())

# Data Cleansing 
- Several major features need to be cleaned properly.
- Some categorical features need to be converted to numerical values, among other fixes.

In [None]:
# lets remove the unwanted columns in advance
data.drop(['Unnamed: 0',"ID",'Photo','Flag','Club Logo','Real Face','Jersey Number','Loaned From','Release Clause'],
          axis=1,inplace=True)

In [None]:
#cleaning the value and wage col
def m_and_k(Value):
    """Turn a money string such as ``'€110.5M'`` or ``'€565K'`` into a float.

    The euro sign is removed, an ``M`` suffix multiplies by 1,000,000 and a
    ``K`` suffix by 1,000; a string without either suffix is converted as is.
    Anything that is not a string yields ``None``.
    """
    if not isinstance(Value, str):
        return None

    amount = Value.replace('€', '')
    if 'M' in amount:
        return float(float(amount.replace('M', '')) * 1000000)
    if 'K' in Value:
        return float(float(amount.replace('K', '')) * 1000)
    return float(amount)
    
data["Value"]=data["Value"].apply(lambda x:m_and_k(x))
data["Wage"]=data["Wage"].apply(lambda x:m_and_k(x))

In [None]:
data[["Value","Wage"]].head(10).T

In [None]:
# Clean the Joined column: parse the date, keep only the year,
# replace missing years with 0 and store the result as text.
joined_year = pd.to_datetime(data["Joined"]).dt.year
data["Joined"] = joined_year.replace(np.nan, 0).astype(str)

In [None]:
data["Joined"].head()

In [None]:
# Clean the Contract Valid Until column: parse the date, keep only the year,
# replace missing years with 0 and store the result as text.
contract_year = pd.to_datetime(data["Contract Valid Until"]).dt.year
data["Contract Valid Until"] = contract_year.replace(np.nan, 0).astype(str)

In [None]:
#clean and process the height columns 
def clean_height(val):
    """Convert a feet'inches height string (e.g. ``"5'7"``) to decimal feet.

    Parameters
    ----------
    val : object
        Height written as ``<feet>'<inches>``; anything that is not a string
        is treated as missing.

    Returns
    -------
    float
        ``(feet * 12 + inches) / 12``, or NaN for non-string input so that the
        column stays numeric and the value is recognised by ``isnull``/``fillna``.
    """
    if not isinstance(val, str):
        return float("nan")
    feet, _, inches = val.partition("'")
    # Convert to int *before* multiplying: `int(feet * 12)` on the raw string
    # repeats the text twelve times ("5" -> 555555555555) instead of scaling it.
    return (int(feet) * 12 + int(inches or 0)) / 12
data["Height"]=data["Height"].apply(clean_height) 

In [None]:
#clean and process the Weight columns 
def clean_weight(val):
    """Strip the ``lbs`` unit from a weight string; non-strings give ``None``."""
    if not isinstance(val, str):
        return None
    return val.replace("lbs", "")
data["Weight"]=data["Weight"].apply(clean_weight) 
data["Weight"]=data["Weight"].astype(float)
data.head()

# Visualization

In [None]:
# Top 30 countries with the highest number of players
top_countries = data.Nationality.value_counts().head(30)  # count once, reuse for both axes
y = top_countries.index
x = top_countries.values

# Horizontal bar plot: one bar per country, bar length = number of players
plt.rcParams['figure.dpi'] = 600

fig = plt.figure(figsize=(2,3), facecolor='#f6f5f5')
gs = fig.add_gridspec(1, 1)
gs.update(wspace=1.5, hspace=0.05)

background_color = "#f6f5f5"
sns.set_palette(['#ff355d']*1200)
ax0 = fig.add_subplot(gs[0, 0])
for s in ["right", "top"]:
    ax0.spines[s].set_visible(False)

ax0.set_facecolor(background_color)

# x and y are already aggregated vectors (30 entries), so the full frame is not
# passed as `data=`; seaborn can reject vectors whose length differs from `data`.
ax0_sns = sns.barplot(x=x, y=y, zorder=2)
ax0_sns.set_xlabel('No of Players',fontsize=4, weight='bold')
ax0_sns.set_ylabel('Countries',fontsize=4, weight='bold')

ax0_sns.grid(which='major', axis='x', zorder=0, color='#EEEEEE', linewidth=0.4)
ax0_sns.grid(which='major', axis='y', zorder=0, color='#EEEEEE', linewidth=0.4)

ax0_sns.tick_params(labelsize=3, width=0.5, length=1.5)

# Write the player count next to the end of each bar.
# Separate names are used so the plotted x / y vectors are not overwritten.
for p in ax0_sns.patches:
    value = f'{p.get_width():.0f}'
    label_x = p.get_x() + p.get_width() + 20
    label_y = p.get_y() + p.get_height() / 2
    ax0.text(label_x, label_y, value, ha='left', va='center', fontsize=3,
             bbox=dict(facecolor='none', edgecolor='black', boxstyle='round', linewidth=0.3))

plt.show()

In [None]:
# display first five rows of the soccer data for reference
# Use the fully qualified option name: the bare 'max_columns' pattern is ambiguous
# on pandas versions that also define 'styler.render.max_columns' and then raises.
pd.set_option('display.max_columns', 89)
data.head()

In [None]:
# retrieve statistical information for each computable column
# Use the fully qualified option name: the bare 'max_columns' pattern is ambiguous
# on pandas versions that also define 'styler.render.max_columns' and then raises.
pd.set_option('display.max_columns', 89)
data.iloc[:, 2:].describe().style.background_gradient(cmap = 'cividis')


In [None]:
# calculate descriptive statistics
data.iloc[:, 13:].describe(include = 'object')

In [None]:
# checking and visualizing data for null values for first half
mno.bar(data.iloc[:, :40],
        color = 'red',
        sort = 'ascending')
plt.title('Checking Null Values for First Half', fontsize = 15)
plt.show()

In [None]:
# checking and visualizing data for null values for second half
# (all columns from position 40 onward)
mno.bar(data.iloc[:, 40:],
        color = 'blue',
        sort = 'ascending')
plt.title('Checking Null Values for Second Half', fontsize = 15)
plt.show()

In [None]:
# fill in null values

# Skill ratings: impute with the column mean.
# FKAccuracy was previously filled with itself (missing `.mean()`), which is a no-op.
mean_imputed = ['ShortPassing', 'Volleys', 'Dribbling', 'Curve', 'FKAccuracy',
                'LongPassing', 'BallControl', 'HeadingAccuracy', 'Finishing', 'Crossing']
fill_values = {column: data[column].mean() for column in mean_imputed}

# Fixed defaults for the remaining columns. Weight, Height and Wage were already
# converted to numbers by the cleaning cells, so their defaults are numeric too:
# filling them with '200lbs', "5'11" or '€200K' would turn the columns back into
# mixed object columns and break the later numeric plots and idxmax lookups.
fill_values.update({
    'Weight': 200.0,                    # lbs
    'Height': 5 + 11 / 12,              # 5'11 expressed in decimal feet
    'Wage': 200000.0,                   # €200K
    # The two date columns were already turned into strings with missing values
    # replaced by 0 in the cleaning cells, so these two fills no longer change anything.
    'Contract Valid Until': 2019,
    'Joined': 'Jul 1, 2018',
    'Body Type': 'Normal',
    'Position': 'ST',
    'Club': 'No Club',
    'Work Rate': 'Medium/ Medium',
    'Skill Moves': data['Skill Moves'].median(),
    'Weak Foot': 3,
    'Preferred Foot': 'Right',
    'International Reputation': 1,
})
# One frame-level fill instead of `data[col].fillna(..., inplace=True)`: the chained
# form acts on a temporary column object and is not guaranteed to update `data`.
# ('Loaned From' and 'Jersey Number' were dropped earlier, so they need no default.)
data.fillna(fill_values, inplace = True)
data.fillna(0, inplace = True)

# check for null values
data.isnull().sum().sum()

In [None]:
# Best player for each individual skill attribute
best_players=['Crossing', 'Finishing', 'HeadingAccuracy', 'ShortPassing', 'Volleys',
       'Dribbling', 'Curve', 'FKAccuracy', 'LongPassing', 'BallControl',
       'Acceleration', 'SprintSpeed', 'Agility', 'Reactions', 'Balance',
       'ShotPower', 'Jumping', 'Stamina', 'Strength', 'LongShots',
       'Aggression', 'Interceptions', 'Positioning', 'Vision', 'Penalties',
       'Composure', 'Marking', 'StandingTackle', 'SlidingTackle', 'GKDiving',
       'GKHandling', 'GKKicking', 'GKPositioning', 'GKReflexes']
print("Best in others aspect :")
print("\n<----------------------------------------------------------------->\n")
for i in best_players:
    # Row of the player with the highest value for this attribute.
    top_row = data.loc[data[i].idxmax()]
    # `.iloc[0]` reads the first column explicitly; plain `[0]` on a row whose index
    # holds column names relies on a deprecated positional fallback.
    # NOTE(review): the first column is presumably the player name - confirm.
    print('Best {0}:{1} '.format(i, top_row.iloc[0]))
print("\n<----------------------------------------------------------------->\n")

In [None]:
#Most valuable and highest earners players
# `.iloc[0]` reads the first column of the selected row explicitly; plain `[0]` on a
# row indexed by column names relies on a deprecated positional fallback.
# NOTE(review): the first column is presumably the player name - confirm.
print('Most valued player : '+str(data.loc[data['Value'].idxmax()].iloc[0]))
print('Highest earner : '+str(data.loc[data['Wage'].idxmax()].iloc[0]))

In [None]:
# create features by aggregating the data

def _mean_rating(data, columns):
    """Return the mean of ``columns`` in ``data``, rounded to the nearest int.

    ``data`` may be a single row (Series) or a whole DataFrame; for a frame the
    result is the mean of the per-column means.
    """
    return int(round(data[columns].mean().mean()))


def defense(data):
    """Aggregate defensive score: Marking, StandingTackle, SlidingTackle."""
    return _mean_rating(data, ['Marking', 'StandingTackle', 'SlidingTackle'])


def general(data):
    """Aggregate general score: HeadingAccuracy, Dribbling, Curve, BallControl."""
    return _mean_rating(data, ['HeadingAccuracy', 'Dribbling', 'Curve', 'BallControl'])


def mental(data):
    """Aggregate mental score: Aggression, Interceptions, Positioning, Vision, Composure."""
    return _mean_rating(data, ['Aggression', 'Interceptions', 'Positioning',
                               'Vision', 'Composure'])


def passing(data):
    """Aggregate passing score: Crossing, ShortPassing, LongPassing."""
    return _mean_rating(data, ['Crossing', 'ShortPassing', 'LongPassing'])


def mobility(data):
    """Aggregate mobility score: Acceleration, SprintSpeed, Agility, Reactions."""
    return _mean_rating(data, ['Acceleration', 'SprintSpeed', 'Agility', 'Reactions'])


def power(data):
    """Aggregate power score: Balance, Jumping, Stamina, Strength."""
    return _mean_rating(data, ['Balance', 'Jumping', 'Stamina', 'Strength'])


def rating(data):
    """Aggregate rating: mean of Potential and Overall."""
    return _mean_rating(data, ['Potential', 'Overall'])


def shooting(data):
    """Aggregate shooting score: Finishing, Volleys, FKAccuracy, ShotPower, LongShots, Penalties."""
    return _mean_rating(data, ['Finishing', 'Volleys', 'FKAccuracy',
                               'ShotPower', 'LongShots', 'Penalties'])


In [None]:
# create new categories to the data set

data['Defense'] = data.apply(defense, axis = 1)
data['General'] = data.apply(general, axis = 1)
data['Mental'] = data.apply(mental, axis = 1)
data['Passing'] = data.apply(passing, axis = 1)
data['Mobility'] = data.apply(mobility, axis = 1)
data['Power'] = data.apply(power, axis = 1)
data['Rating'] = data.apply(rating, axis = 1)
data['Shooting'] = data.apply(shooting, axis = 1)

In [None]:
# check if categories were added
data.columns

In [None]:
# generate distributiions of abilities


# plt.rcParams['figure.figsize'] = (25, 10)
# plt.subplot(2, 4, 1)
# sns.distplot(data['defense'], color = 'red')
# plt.grid()

# plt.subplot(2, 4, 2)
# sns.distplot(data['general'], color = 'black')
# plt.grid()

# plt.subplot(2, 4, 3)
# sns.distplot(data['mental'], color = 'red')
# plt.grid()

# plt.subplot(2, 4, 4)
# sns.distplot(data['passing'], color = 'black')
# plt.grid()

# plt.subplot(2, 4, 5)
# sns.distplot(data['mobility'], color = 'red')
# plt.grid()

# plt.subplot(2, 4, 6)
# sns.distplot(data['power'], color = 'black')
# plt.grid()

# plt.subplot(2, 4, 7)
# sns.distplot(data['shooting'], color = 'red')
# plt.grid()

# plt.subplot(2, 4, 8)
# sns.distplot(data['rating'], color = 'black')
# plt.grid()

# plt.suptitle('Distribution Score for Various Abilities')
# plt.show()

In [None]:
# display dominant foot comparison

plt.rcParams['figure.figsize'] = (8, 3)
# Pass the column as `x=` explicitly: newer seaborn releases accept only `data`
# as a positional argument, so a positional vector is no longer read as x.
sns.countplot(x = data['Preferred Foot'], palette = 'pink')
plt.title('Dominant Foot', fontsize = 20)
plt.show()

In [None]:
# display international reputation ratings

# Order the counts by rating and build each label from its own rating, so a wedge
# can never be paired with the wrong star level (value_counts alone orders by
# frequency, which only matches a hard-coded 1..5 label list by coincidence).
sizes = data['International Reputation'].value_counts().sort_index()
labels = [f'{int(level)} Star' for level in sizes.index]
colors = plt.cm.copper(np.linspace(0, 1, len(sizes)))
# Same offsets as before for five levels; trimmed or padded if the number of levels differs.
explode = ([0.1, 0.1, 0.2, 0.5, 0.9] + [0.9] * len(sizes))[:len(sizes)]

plt.rcParams['figure.figsize'] = (9, 9)
plt.pie(sizes, labels = labels, colors = colors, explode = explode, shadow = True,)
plt.title('International Reputation Rating Chart', fontsize = 20)
plt.legend()
plt.show()

In [None]:
# Players with a 5 star rating

data[data['International Reputation'] == 5][['Name','Nationality',
                            'Overall']].sort_values(by = 'Overall',
                                        ascending = False).style.background_gradient(cmap = 'magma')

In [None]:
# 

# Distribution of weak-foot ratings.
# value_counts() orders by frequency, not by rating, so a fixed '5 Star'..'1 Star'
# label list mislabels the wedges. Sort by rating (highest first, matching the
# original label order) and derive each label from its own rating.
size = data['Weak Foot'].value_counts().sort_index(ascending = False)
labels = [f'{int(level)} Star' for level in size.index]
colors = plt.cm.Wistia(np.linspace(0, 1, len(size)))
# Pull out only the last wedge (the lowest rating), as before.
explode = [0] * len(size)
if explode:
    explode[-1] = 0.1

plt.pie(size, labels = labels, colors = colors, explode = explode, shadow = True, startangle = 90)
plt.title('Distribution of Players Ability to Use Non-Dominant Foot', fontsize = 25)
plt.legend()
plt.show()

In [None]:
# generate chart of positions of players

plt.figure(figsize = (13, 15))
plt.style.use('fivethirtyeight')
ax = sns.countplot(y = 'Position', data = data, palette = 'bone')
ax.set_xlabel(xlabel = 'Count', fontsize = 16)
ax.set_ylabel(ylabel = 'Position', fontsize = 16)
ax.set_title(label = 'Positions and Players', fontsize = 20)
plt.show()

In [None]:
# # defining a function for cleaning the Weight data

def extract_value_from(value):
    """Return the number in a weight string such as ``'159lbs'`` as a float."""
    return float(value.replace('lbs', ''))

# Weight Distribution of Players
sns.distplot(data['Weight'], color = 'black')
plt.title("Distribution of Players Weight (lbs)", fontsize = 15)
plt.show()

In [None]:
# data['Value'] = data['Value'].str.replace('€','').str.replace('M',' 1000000').str.replace('K',' 1000')
# data['Value'] = data['Value'].str.split(' ', expand=True)[0].astype(float) * data['Value'].str.split(' ', expand=True)[1].astype(float)
# data['Value'] = data['Value'].fillna(0).astype(np.float32)

In [None]:
data.head()

In [None]:
# function to clean data for wages

# def extract_value_from(column):
#     out = column.replace('€', '')
#     if 'M' in out:
#         out = float(out.replace('M', ''))*1000000
#     elif 'K' in column:
#         out = float(out.replace('K', ''))*1000
#     return float(out)

# generate data
# plt.rcParams['figure.figsize'] = (16, 5)
# plt.subplot(1, 2, 1)
# sns.distplot(data['Value'], color = 'violet')
# plt.title('Distribution of Value of the Players (€)', fontsize = 15)

# plt.subplot(1, 2, 2)
# sns.distplot(data['Wage'], color = 'purple')
# plt.title('Distribution of Wages of the Players (€)', fontsize = 15)
# plt.show()

In [None]:
# amount of skill moves for players

plt.figure(figsize = (10, 6))
ax = sns.countplot(x = 'Skill Moves', data = data, palette = 'pastel')
ax.set_title(label = 'Count for Number of Skill Moves for Player', fontsize = 20)
ax.set_xlabel(xlabel = '# of Skill Moves', fontsize = 16)
ax.set_ylabel(ylabel = 'Count', fontsize = 16)
plt.show()

In [None]:
# generate work rates (offense/defense)

plt.figure(figsize = (15, 5))
plt.style.use('fivethirtyeight')

sns.countplot(x = 'Work Rate', data = data, palette = 'hls')
plt.title('Player Work Rates (Offense/Defense)', fontsize = 20)
plt.xlabel('Work Rate', fontsize = 16)
plt.ylabel('Players', fontsize = 16)
plt.xticks(rotation = 90)
plt.show()

In [None]:
# Overall and Potential Scores of Players

plt.figure(figsize=(16, 4))
# Newer matplotlib releases ship the seaborn style sheets under 'seaborn-v0_8-*'
# names; fall back to the new name when the old one is not available.
paper_style = 'seaborn-paper'
if paper_style not in plt.style.available:
    paper_style = 'seaborn-v0_8-paper'
plt.style.use(paper_style)

# Left panel: histogram of Potential scores
plt.subplot(1, 2, 1)
x = data.Potential
ax = sns.distplot(x, bins = 58, kde = False, color = 'y')
ax.set_xlabel(xlabel = "Potential Scores", fontsize = 10)
ax.set_ylabel(ylabel = 'Number of players', fontsize = 10)
ax.set_title(label = 'Potential Score of Players', fontsize = 15)

# Right panel: histogram of Overall scores
plt.subplot(1, 2, 2)
y = data.Overall
ax = sns.distplot(y, bins = 58, kde = False, color = 'y')
ax.set_xlabel(xlabel = "Overall Scores", fontsize = 10)
ax.set_ylabel(ylabel = 'Number of players', fontsize = 10)
ax.set_title(label = 'Overall Score of Players', fontsize = 15)
plt.show()

In [None]:
# Box plot of Overall scores in relation to age

plt.rcParams['figure.figsize'] = (20, 7)
# Newer matplotlib releases ship the seaborn style sheets under 'seaborn-v0_8-*'
# names; fall back to the new name when the old one is not available.
dark_style = 'seaborn-dark-palette'
if dark_style not in plt.style.available:
    dark_style = 'seaborn-v0_8-dark-palette'
plt.style.use(dark_style)

# Pass x / y by keyword: newer seaborn releases accept only `data` positionally.
# The axis assignment (Overall on x, Age on y) is unchanged.
sns.boxplot(x = data['Overall'], y = data['Age'])
plt.title('Comparison of Overall Scores in Relation to Age', fontsize = 20)
plt.show()

In [None]:
# countries with the most players

data['Nationality'].value_counts().head(10).plot(kind = 'pie', cmap = 'inferno',
                                        startangle = 90, explode = [0, 0, 0, 0, 0, 0, 0, 0, 0.1, 0])
plt.title('Countries with the most players', fontsize = 15)
plt.axis('off')
plt.show()

In [None]:
# Mean Overall score of players from a selection of countries.
# NOTE(review): 'Columbia' was corrected to 'Colombia' - confirm against the
# spelling actually used in data['Nationality'].
some_countries = ('England', 'Germany', 'Spain', 'Argentina', 'France', 'Brazil', 'Italy', 'Colombia', 'Japan', 'Netherlands')
# Filter on nationality only. The former `& data['Overall']` combined the boolean
# mask with the integer rating bit by bit, which kept only players whose Overall
# is an odd number.
data_countries = data.loc[data['Nationality'].isin(some_countries)]

plt.rcParams['figure.figsize'] = (15, 7)
ax = sns.barplot(x = data_countries['Nationality'], y = data_countries['Overall'], palette = 'spring')
ax.set_xlabel(xlabel = 'Countries', fontsize = 9)
ax.set_ylabel(ylabel = 'Overall Scores', fontsize = 9)
ax.set_title(label = 'Overall Score of Players from Countries With the Highest Amount of Players', fontsize = 20)
plt.show()

In [None]:
# overall scores of the most popular clubs

# NOTE(review): 'Manchestar City' was corrected to 'Manchester City' - confirm
# against the spelling actually used in data['Club'].
clubs = ('CD Leganés', 'Southampton', 'RC Celta', 'Empoli', 'Fortuna Düsseldorf', 'Manchester City',
             'Tottenham Hotspur', 'FC Barcelona', 'Valencia CF', 'Chelsea', 'Real Madrid')

# Filter on club only. The former `& data['Overall']` combined the boolean mask
# with the integer rating bit by bit, which kept only players whose Overall is
# an odd number.
data_clubs = data.loc[data['Club'].isin(clubs)]

plt.rcParams['figure.figsize'] = (15, 8)
ax = sns.boxplot(x = data_clubs['Club'], y = data_clubs['Overall'], palette = 'inferno')
ax.set_xlabel(xlabel = 'Popular Clubs', fontsize = 9)
ax.set_ylabel(ylabel = 'Overall Score', fontsize = 9)
ax.set_title(label = 'Overall Score of Popular Clubs', fontsize = 20)
plt.xticks(rotation = 90)
plt.grid()
plt.show()

In [None]:
# best players showing position, age, club, nationality based on overall scores

# idxmax() returns index *labels*, so the rows must be selected with .loc.
# With .iloc the labels are read as positions, which points at the wrong rows
# (or past the end) once rows have been dropped from the frame.
data.loc[data.groupby(data['Position'])['Overall'].idxmax()][['Position', 'Name', 'Age', 'Club',
                             'Nationality','Overall']].sort_values(by = 'Overall',
                                        ascending = False).style.background_gradient(cmap = 'pink')

In [None]:
# Top players for each type of skill
data1 = data.copy()
@interact
def skill(skills = [ 'General', 'Mental', 'Passing', 
                       'Mobility', 'Power', 'Rating','Shooting'], score = 75):
    return data1[data1[skills] > score][['Name', 'Nationality', 'Club', 'Overall', skills]].sort_values(by = skills,
                                    ascending = False).head(20).style.background_gradient(cmap = 'Blues')

In [None]:
# Top 15 players from each country

@interact
def country(country = list(data1['Nationality'].value_counts().index)):
    return data1[data1['Nationality'] == country][['Name','Position','Overall',
                    'Potential']].sort_values(by = 'Overall',
                            ascending = False).head(15).style.background_gradient(cmap = 'magma')

In [None]:
# top 15 players from each club

@interact
def club(club = list(data1['Club'].value_counts().index[1:])):
    return data1[data1['Club'] == club][['Name','Position','Overall','Nationality','Age']].sort_values(by = 'Overall',
                                ascending  = False).head(15).style.background_gradient(cmap = 'inferno')

In [None]:
# youngest players

youngest = data1[data1['Age'] == 16][['Name', 'Age', 'Club', 'Nationality', 'Overall']]
youngest.sort_values(by = 'Overall', ascending = False).head().style.background_gradient(cmap = 'magma')

In [None]:
# Oldest players

data.sort_values('Age', ascending = False)[['Name', 'Age', 'Club',
                              'Nationality', 'Overall']].head(15).style.background_gradient(cmap = 'Wistia')

In [None]:
# Top 3 player features (highest group mean) for each position

player_features = ('Acceleration', 'Aggression', 'Agility', 
                   'Balance', 'BallControl', 'Composure', 
                   'Crossing', 'Dribbling', 'FKAccuracy', 
                   'Finishing', 'GKDiving', 'GKHandling', 
                   'GKKicking', 'GKPositioning', 'GKReflexes', 
                   'HeadingAccuracy', 'Interceptions', 'Jumping', 
                   'LongPassing', 'LongShots', 'Marking', 'Penalties')

# Select the columns with a list: indexing a groupby with a tuple is treated as a
# single (tuple-valued) key by current pandas instead of a set of columns.
for i, val in data1.groupby(data1['Position'])[list(player_features)].mean().iterrows():
    print('Position {}: {}, {}, {}'.format(i, *tuple(val.nlargest(3).index)))



In [None]:
# performance comparison between left footed and right footed players regarding ball control vs dribbling

sns.lmplot(x = 'BallControl', y = 'Dribbling', data = data1, col = 'Preferred Foot')
plt.show()

In [None]:
# Clubsides with the Highest PLayers out on loan
# release_clause = data.loc[: ,['Name','Release Clause','Overall']]
# release_clause['Release Clause'] = release_clause['Release Clause'].str.strip('€')
# release_clause['Release Clause'] = release_clause['Release Clause'].str.strip('M')
# release_clause = release_clause[:100]
# release_clause['Release Clause'] = release_clause['Release Clause'].astype(float)*1000000 #Since its in millions
# release_clause = pd.DataFrame(release_clause.sort_values(by='Release Clause',ascending=False)[:10])
# release_clause = release_clause.set_index('Name')
# release_clause

In [None]:
# Overall performance by age, split by preferred foot
#plt.rcParams['figure.figsize'] = (20, 7)
plt.figure(figsize=(20,12))
# Pass x / y by keyword: newer seaborn releases accept only `data` positionally.
sns.boxenplot(x=data1["Age"], y=data1["Overall"], hue=data1["Preferred Foot"])
plt.title("Players age vs overall performance wrt preferred foot",fontsize=40)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.xlabel("Players age",fontsize=20)
plt.ylabel("Overall",fontsize=20)
plt.show()

In [None]:
cols = ['Age','Overall','Potential','Acceleration','SprintSpeed',"Agility","Stamina",'Strength','Preferred Foot']
df_small = data[cols]
df_small.head()

In [None]:
#analysis these col wrt preferred foot
sns.pairplot(df_small,hue="Preferred Foot")

# Feature Engineering

Feature engineering is the process of using domain knowledge to extract features from raw data. The motivation is to use these extra features to improve the quality of results from a machine learning process, compared with supplying only the raw data to the machine learning process.

In [None]:
#dropping the unnecceary columns 
data=data.drop(data.columns[20:46],axis=1)
col=["Name","Club","Contract Valid Until","Preferred Foot"]
data=data.drop(col,axis=1)
data.shape

In [None]:
# Impute the missing values: Position is the only column expected to have any.
# Assign the result back instead of calling fillna(inplace=True) on the selected
# column, which acts on a temporary object and is not guaranteed to update `data`.
data["Position"] = data["Position"].fillna("Unknown")

In [None]:
df = data.copy()
df.head()

In [None]:
#get a count of nationalities in the dataset,make of list on those with over 250 players .
nationalities_count=df["Nationality"].value_counts()
nat_list=nationalities_count[nationalities_count>250].index.tolist()

def majorNation(data):
    """Return 1 when the row's Nationality is listed in ``nat_list``, otherwise 0."""
    is_major = data["Nationality"] in nat_list
    return 1 if is_major else 0
df["major_nation"]=df.apply(majorNation,axis=1)

#encode preferred foot col into numerical.
# df=pd.get_dummies(df,columns=["Preferred Foot"], drop_first=True)

#Create a simplified position varaible to account for all player positions.
def simple_position(df):
    """Map a row's detailed Position to a coarse group.

    Groups are GK, DF (defenders), DM (defensive midfield), MF (midfield),
    AM (attacking midfield / wingers) and ST (forwards). A position that is in
    none of the groups is returned unchanged.
    """
    position_groups = (
        ('GK', ('GK',)),
        ('DF', ('RB', 'LB', 'CB', 'LCB', 'RCB', 'RWB', 'LWB')),
        ('DM', ('LDM', 'CDM', 'RDM')),
        ('MF', ('LM', 'LCM', 'CM', 'RCM', 'RM')),
        ('AM', ('LAM', 'CAM', 'RAM', 'LW', 'RW')),
        ('ST', ('RS', 'ST', 'LS', 'CF', 'LF', 'RF')),
    )
    position = df['Position']
    for group, members in position_groups:
        if position in members:
            return group
    return df.Position
    
df['Simple_Position'] = df.apply(simple_position,axis = 1)

#encode simple position col into numerical.
df=pd.get_dummies(df,columns=["Simple_Position"], drop_first=True)

In [None]:
#Split the Work Rate Column in two
tempwork = df["Work Rate"].str.split("/ ",expand=True) 

#Create new columns for first and secoend work rate
df["WorkRate1"]= tempwork[0]   
df["WorkRate2"]= tempwork[1]
#encode workrate1 and workRate2 columns into numerical.
df=pd.get_dummies(df,columns=["WorkRate1"], drop_first=True)
df=pd.get_dummies(df,columns=["WorkRate2"], drop_first=True)

df=df.drop(["Nationality",'Body Type','Position',"Work Rate"],axis=1)

In [None]:
df.head()

In [None]:
#get the features and terget columns 
y=df.Value
X=df.drop(["Value"],axis=1)

# Model Building
### importing important Libraries

In [None]:
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,OrdinalEncoder,StandardScaler,MinMaxScaler,PowerTransformer,FunctionTransformer
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import AdaBoostRegressor,GradientBoostingRegressor,RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from xgboost import XGBRegressor
from mlxtend.feature_selection import ExhaustiveFeatureSelector,SequentialFeatureSelector
from sklearn.feature_selection import mutual_info_regression,SelectKBest

#### splitting up the data into test and train sets

In [None]:
#train test split  
x_train,x_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)
print("X_train shape: ",x_train.shape)
print("X_test shape: ",x_test.shape)
print()
print("Y_train shape: ",y_train.shape)
print("Y_test shape: ",y_test.shape)

In [None]:
x_train

#### Data Scaling

In [None]:
# Using standard scaler
sc=StandardScaler()
x_train_scaled=sc.fit_transform(x_train)
x_test_scaled=sc.transform(x_test)

x_train = pd.DataFrame(x_train_scaled, columns = x_train.columns[:])
x_test = pd.DataFrame(x_test_scaled, columns = x_test.columns[:])
x_train.head()

#### Features Selection

In [None]:
#Mutual information Gain
mutual=SelectKBest(mutual_info_regression,k=15).fit(x_train,y_train)
mutual

In [None]:
#convert into dataframe and check the top 15 features
fetures=pd.DataFrame({"features":list(x_train.columns),"Score":mutual.scores_})
new=fetures.sort_values("Score",ascending=False)
new.head(15)

In [None]:
#get the top features using the mutual information gain.
xtrain=mutual.transform(x_train)
xtest=mutual.transform(x_test)
xtrain[0:3]

# Creating model
- We will train the following regression models to predict the player's `Value`:
- Gradient Boosting Regressor
- Random Forest Regressor
- XGBoost Regressor

In [None]:
#this function used to evalute the models with features and terget.
def models_score(models, x_train, x_test, y_train, y_test):
    """Fit every model, report its test metrics and rank the models by R2.

    Parameters
    ----------
    models : dict
        Maps a display name to an estimator exposing ``fit`` and ``predict``.
    x_train, x_test, y_train, y_test
        Train / test features and targets.

    Returns
    -------
    pandas.DataFrame
        One row per model name with a single ``Score`` column (R2 on the test
        set), sorted from best to worst.
    """
    scores = {}
    for name, model in models.items():
        model.fit(x_train, y_train)
        y_pred = model.predict(x_test)
        r2 = r2_score(y_test, y_pred)
        scores[name] = r2

        # Report the fitted estimator, its predictions and the test metrics.
        print("Model name: ",model)
        print("Model Predictions: ",y_pred)
        print("R2 score :--->>",r2)
        print("MSE",mean_squared_error(y_test,y_pred))
        print("MAE",mean_absolute_error(y_test,y_pred))
        print("\n<<<<------------------------------------------------------------->>>>\n")

    ranking = pd.DataFrame(scores, index=['Score']).T
    return ranking.sort_values('Score', ascending=False)

In [None]:
#initialize the models 
models = {"GradiantBoost":GradientBoostingRegressor(random_state=42),
         "RandomForest":RandomForestRegressor(random_state=42),
         "XGBboost":XGBRegressor()}

# Accuracy of the models
- The R² score, MSE and MAE on the test set are shown below for:
- Gradient Boosting Regressor
- Random Forest Regressor
- XGBoost Regressor

In [None]:
#Calling the function
model_scores = models_score(models, x_train, x_test, y_train, y_test)

In [None]:
#printing the model score
model_scores

#### Comparison of accuracies by visualizing 

In [None]:
model_scores = model_scores.reset_index().rename({"index":"Algorithms"}, axis = 1)

model_scores.style.bar()

In [None]:
import plotly.express as px
import plotly.graph_objs as go
#pie plot 
label = model_scores['Algorithms']
value = model_scores['Score']

fig = go.Figure(data=[go.Pie(labels = label, values = value, rotation = 90)])

fig.update_traces(textposition='inside',
                  textinfo='percent+label',
                  marker=dict(line=dict(color='#000000', width = 1.5)))

fig.update_layout(title_x=0.5,
                  title_font=dict(size=20),
                  uniformtext_minsize=15)

fig.show()

In [None]:
fig = px.bar(data_frame = model_scores,
             x="Algorithms",
             y="Score",
             color="Algorithms", title = "<b>Models Score</b>", template = 'plotly_dark')

fig.update_layout(bargap=0.2)

fig.show()

# END